1
1

Compute and pass the local_rank and local number of procs (in that proc's job) on the node.

To be precise, given this hypothetical launching pattern:

host1: vpids 0, 2, 4, 6
host2: vpids 1, 3, 5, 7

The local_rank for these procs would be:

host1: vpids 0->local_rank 0, v2->lr1, v4->lr2, v6->lr3
host2: vpids 1->local_rank 0, v3->lr1, v5->lr2, v7->lr3

and the number of local procs on each node would be four. If vpid=0 then does a comm_spawn of one process on host1, the values of the parent job would remain unchanged. The local_rank of the child process would be 0 and its num_local_procs would be 1 since it is in a separate jobid.

I have verified this functionality for the rsh case - need to verify that slurm and other cases also get the right values. Some consolidation of common code is probably going to occur in the SDS components to make this simpler and more maintainable in the future.

This commit was SVN r14706.
Этот коммит содержится в:
Ralph Castain 2007-05-21 14:30:10 +00:00
родитель 5ceaa605d7
Коммит d9acc93efa
28 изменённых файлов: 391 добавлений и 81 удалений

Просмотреть файл

@ -52,6 +52,8 @@ orte_odls_base_module_t orte_odls;
static void orte_odls_child_constructor(orte_odls_child_t *ptr)
{
ptr->name = NULL;
ptr->local_rank = ORTE_VPID_INVALID;
ptr->num_procs = 0;
ptr->pid = 0;
ptr->app_idx = -1;
ptr->alive = false;

Просмотреть файл

@ -51,6 +51,8 @@ extern "C" {
typedef struct orte_odls_child_t {
opal_list_item_t super; /* required to place this on a list */
orte_process_name_t *name; /* the OpenRTE name of the proc */
orte_vpid_t local_rank; /* local rank of the proc on this node */
orte_std_cntr_t num_procs; /* number of procs from this job on this node */
pid_t pid; /* local pid of the proc */
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
bool alive; /* is this proc alive? */

Просмотреть файл

@ -138,11 +138,13 @@ int orte_odls_default_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb
int num_glob_keys = 4;
char* keys[] = {
ORTE_PROC_NAME_KEY,
ORTE_PROC_LOCAL_RANK_KEY,
ORTE_PROC_APP_CONTEXT_KEY,
ORTE_NODE_NAME_KEY,
ORTE_NODE_NUM_PROCS_KEY,
ORTE_NODE_OVERSUBSCRIBED_KEY
};
int num_keys = 4;
int num_keys = 6;
int i, rc;
/* get the job segment name */
@ -312,7 +314,7 @@ int orte_odls_default_get_add_procs_data(orte_gpr_notify_data_t **data,
proc = (orte_mapped_proc_t*)item;
/* must not have any tokens so that launch_procs can process it correctly */
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, 0, segment, 3, 0))) {
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, 0, segment, 5, 0))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
@ -346,6 +348,24 @@ int orte_odls_default_get_add_procs_data(orte_gpr_notify_data_t **data,
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[3]),
ORTE_PROC_LOCAL_RANK_KEY,
ORTE_VPID, &proc->local_rank))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[4]),
ORTE_NODE_NUM_PROCS_KEY,
ORTE_STD_CNTR, &node->num_procs))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&cnt, ndat->values, value))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
@ -884,8 +904,9 @@ static int odls_default_fork_local_proc(
opal_setenv(param, orte_system_info.nodename, true, &environ_copy);
free(param);
/* push name into environment */
/* push data into environment */
orte_ns_nds_env_put(child->name, vpid_start, vpid_range,
child->local_rank, child->num_procs,
&environ_copy);
@ -1135,6 +1156,22 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **ba
child->app_idx = *sptr; /* save the index into the app_context objects */
continue;
}
if(strcmp(kval->key, ORTE_PROC_LOCAL_RANK_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, kval->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
child->local_rank = *vptr; /* save the local_rank */
continue;
}
if(strcmp(kval->key, ORTE_NODE_NUM_PROCS_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
child->num_procs = *sptr; /* save the number of procs from this job on this node */
continue;
}
if(strcmp(kval->key, ORTE_NODE_OVERSUBSCRIBED_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -113,6 +113,7 @@ int orte_rmaps_base_copy_mapped_proc(orte_mapped_proc_t **dest, orte_mapped_proc
(*dest)->name = src->name;
(*dest)->rank = src->rank;
(*dest)->local_rank = src->local_rank;
(*dest)->app_idx = src->app_idx;
@ -166,6 +167,7 @@ int orte_rmaps_base_copy_mapped_node(orte_mapped_node_t **dest, orte_mapped_node
(*dest)->oversubscribed = src->oversubscribed;
(*dest)->num_procs = src->num_procs;
for (item = opal_list_get_first(&(src->procs));
item != opal_list_get_end(&(src->procs));
item = opal_list_get_next(item)) {

Просмотреть файл

@ -129,7 +129,13 @@ int orte_rmaps_base_pack_mapped_proc(orte_buffer_t *buffer, void *src,
}
/* pack the rank */
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(procs[i]->rank), 1, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(procs[i]->rank), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the local rank */
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(procs[i]->local_rank), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -126,8 +126,8 @@ int orte_rmaps_base_print_mapped_proc(char **output, char *prefix, orte_mapped_p
free(tmp3);
return rc;
}
asprintf(&tmp, "%s\n%s\n%sProc Rank: %ld\tProc PID: %ld\tApp_context index: %ld\n", tmp3, tmp2, pfx,
(long)src->rank, (long)src->pid, (long)src->app_idx);
asprintf(&tmp, "%s\n%s\n%sProc Rank: %ld\tLocal Rank: %ld\tProc PID: %ld\tApp_context index: %ld\n", tmp3, tmp2, pfx,
(long)src->rank, (long)src->local_rank, (long)src->pid, (long)src->app_idx);
free(tmp2);
free(tmp3);
@ -173,7 +173,7 @@ int orte_rmaps_base_print_mapped_node(char **output, char *prefix, orte_mapped_n
return rc;
}
asprintf(&tmp3, "%s\n\t%s\n%sOversubscribed: %s\tNum elements in procs list: %ld", tmp, tmp2, pfx,
asprintf(&tmp3, "%s\n\t%s\n%sOversubscribed: %s\tNum procs from this job on node: %ld", tmp, tmp2, pfx,
(src->oversubscribed ? "True" : "False"), (long)src->num_procs);
free(tmp);
free(tmp2);

Просмотреть файл

@ -159,7 +159,15 @@ int orte_rmaps_base_unpack_mapped_proc(orte_buffer_t *buffer, void *dest,
/* unpack the rank */
n = 1;
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,
&(procs[i]->rank), &n, ORTE_STD_CNTR))) {
&(procs[i]->rank), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the local rank */
n = 1;
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,
&(procs[i]->local_rank), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -43,8 +43,6 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
{
orte_job_map_t *mapping;
orte_mapped_proc_t *proc;
orte_mapped_node_t *mnode;
opal_list_item_t *item;
orte_cellid_t *cellptr, cell=ORTE_CELLID_INVALID;
orte_vpid_t *vptr;
orte_std_cntr_t *sptr;
@ -176,11 +174,11 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
keyval = value->keyvals[kv];
if(strcmp(keyval->key, ORTE_PROC_RANK_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, keyval->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
proc->rank = *sptr;
proc->rank = *vptr;
continue;
}
if(strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) {
@ -287,14 +285,12 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
}
/* compute and save convenience values */
mapping->num_nodes = opal_list_get_size(&mapping->nodes);
for (item = opal_list_get_first(&mapping->nodes);
item != opal_list_get_end(&mapping->nodes);
item = opal_list_get_next(item)) {
mnode = (orte_mapped_node_t*)item;
mnode->num_procs = opal_list_get_size(&mnode->procs);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(mapping, mapping->vpid_range))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* all done */
*map = mapping;
rc = ORTE_SUCCESS;
@ -438,9 +434,9 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]),
ORTE_GPR_OVERWRITE|ORTE_GPR_TOKENS_AND,
#if OPAL_ENABLE_FT == 1
segment, 12,
segment, 14,
#else
segment, 9,
segment, 11,
#endif
0))) {
ORTE_ERROR_LOG(rc);
@ -467,7 +463,7 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
value = values[index++];
/* initialize keyvals */
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_PROC_RANK_KEY, ORTE_STD_CNTR, &(proc->rank)))) {
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_PROC_RANK_KEY, ORTE_VPID, &(proc->rank)))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
@ -512,6 +508,16 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[9]), ORTE_PROC_LOCAL_RANK_KEY, ORTE_VPID, &(proc->local_rank)))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[10]), ORTE_NODE_NUM_PROCS_KEY, ORTE_STD_CNTR, &(node->num_procs)))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#if OPAL_ENABLE_FT == 1
/*
* Checkpoint tokens
@ -521,21 +527,21 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
if( NULL == proc->ckpt_snapshot_loc)
proc->ckpt_snapshot_loc = strdup("");
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[9]),
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[11]),
ORTE_PROC_CKPT_STATE_KEY, ORTE_SIZE,
&(proc->ckpt_state)))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[10]),
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[12]),
ORTE_PROC_CKPT_SNAPSHOT_REF_KEY, ORTE_STRING,
proc->ckpt_snapshot_ref))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[11]),
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[13]),
ORTE_PROC_CKPT_SNAPSHOT_LOC_KEY, ORTE_STRING,
proc->ckpt_snapshot_loc))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -446,3 +446,59 @@ cleanup:
return rc;
}
int orte_rmaps_base_compute_usage(orte_job_map_t *map, orte_std_cntr_t num_procs)
{
opal_list_item_t *item, *item2;
orte_mapped_node_t *mnode;
orte_mapped_proc_t *mproc, *psave;
orte_vpid_t minv, local_rank;
/* set the vpid range for the job */
map->vpid_range = num_procs;
/* set the number of total nodes involved */
map->num_nodes = opal_list_get_size(&map->nodes);
/* for each node being used by this job... */
for (item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
mnode = (orte_mapped_node_t*)item;
/* set the number of procs for this job on that node */
mnode->num_procs = opal_list_get_size(&mnode->procs);
/* cycle through the list of procs, looking for the minimum
* vpid one and setting that local rank, until we have
* done so for all procs on the node
*/
/* init search values */
local_rank = 0;
while (local_rank < mnode->num_procs) {
minv = ORTE_VPID_MAX;
/* find the minimum vpid proc */
for (item2 = opal_list_get_first(&mnode->procs);
item2 != opal_list_get_end(&mnode->procs);
item2 = opal_list_get_next(item2)) {
mproc = (orte_mapped_proc_t*)item2;
if (ORTE_VPID_INVALID != mproc->local_rank) {
/* already done this one */
continue;
}
if (mproc->rank < minv) {
minv = mproc->rank;
psave = mproc;
}
}
psave->local_rank = local_rank;
++local_rank;
}
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -53,8 +53,9 @@ static void orte_rmaps_mapped_proc_construct(orte_mapped_proc_t* proc)
proc->name.cellid = ORTE_CELLID_INVALID;
proc->name.jobid = ORTE_JOBID_INVALID;
proc->name.vpid = ORTE_VPID_INVALID;
proc->rank = 0;
proc->app_idx = 0;
proc->rank = ORTE_VPID_INVALID;
proc->local_rank = ORTE_VPID_INVALID;
proc->app_idx = -1;
proc->pid = 0;
#if OPAL_ENABLE_FT == 1
proc->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
@ -68,8 +69,9 @@ static void orte_rmaps_mapped_proc_destruct(orte_mapped_proc_t* proc)
proc->name.cellid = ORTE_CELLID_INVALID;
proc->name.jobid = ORTE_JOBID_INVALID;
proc->name.vpid = ORTE_VPID_INVALID;
proc->rank = 0;
proc->app_idx = 0;
proc->rank = ORTE_VPID_INVALID;
proc->local_rank = ORTE_VPID_INVALID;
proc->app_idx = -1;
proc->pid = 0;
#if OPAL_ENABLE_FT == 1
proc->ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;

Просмотреть файл

@ -179,6 +179,9 @@ ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_map_t *map,
ORTE_DECLSPEC int orte_rmaps_base_proxy_map_job(orte_jobid_t job, opal_list_t *attributes);
ORTE_DECLSPEC int orte_rmaps_base_compute_usage(orte_job_map_t *map, orte_std_cntr_t num_procs);
/** Local data type functions */
void orte_rmaps_base_std_obj_release(orte_data_value_t *value);

Просмотреть файл

@ -52,7 +52,8 @@ extern "C" {
struct orte_mapped_proc_t {
opal_list_item_t super;
orte_process_name_t name; /* process name */
orte_std_cntr_t rank; /* process rank */
orte_vpid_t rank; /* process rank */
orte_vpid_t local_rank; /* local rank on the node */
orte_std_cntr_t app_idx; /* index of app_context for this process */
pid_t pid;
#if OPAL_ENABLE_FT == 1

Просмотреть файл

@ -357,7 +357,6 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
opal_list_t master_node_list, mapped_node_list, max_used_nodes, *working_node_list;
opal_list_item_t *item, *item2;
orte_ras_node_t *node, *node2;
orte_mapped_node_t *mnode;
char *save_bookmark;
orte_vpid_t vpid_start;
orte_std_cntr_t num_procs = 0, total_num_slots, mapped_num_slots, num_nodes, num_slots;
@ -704,17 +703,14 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
}
/* compute and save convenience values */
map->vpid_range = num_procs;
map->num_nodes = opal_list_get_size(&map->nodes);
for (item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
mnode = (orte_mapped_node_t*)item;
mnode->num_procs = opal_list_get_size(&mnode->procs);
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(map, num_procs))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* save mapping to the registry */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_put_job_map(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}

Просмотреть файл

@ -76,6 +76,7 @@
#define ORTE_NODE_BOOTPROXY_KEY "orte-node-bootproxy"
#define ORTE_NODE_USERNAME_KEY "orte-node-username"
#define ORTE_NODE_OVERSUBSCRIBED_KEY "orte-node-oversubscribed"
#define ORTE_NODE_NUM_PROCS_KEY "orte-node-num-procs"
/* JOB specific keys */
#define ORTE_JOB_APP_CONTEXT_KEY "orte-job-app-context"
@ -91,6 +92,7 @@
/* PROCESS specific keys */
#define ORTE_PROC_NAME_KEY "orte-proc-name"
#define ORTE_PROC_RANK_KEY "orte-proc-rank"
#define ORTE_PROC_LOCAL_RANK_KEY "orte-proc-local-rank"
#define ORTE_PROC_PID_KEY "orte-proc-pid"
#define ORTE_PROC_LOCAL_PID_KEY "orte-proc-local-pid"
#define ORTE_PROC_STATE_KEY "orte-proc-state"

Просмотреть файл

@ -72,21 +72,31 @@ extern "C" {
*/
ORTE_DECLSPEC int orte_ns_nds_env_put(const orte_process_name_t* proc,
orte_vpid_t vpid_start,
size_t num_procs,
orte_std_cntr_t num_procs,
orte_vpid_t local_rank,
orte_std_cntr_t num_local_procs,
char ***environ);
ORTE_DECLSPEC int orte_ns_nds_pipe_put(const orte_process_name_t* proc,
orte_vpid_t vpid_start,
size_t num_procs,
orte_std_cntr_t num_procs,
orte_vpid_t local_rank,
orte_std_cntr_t num_local_procs,
int fd);
ORTE_DECLSPEC int orte_ns_nds_bproc_put(orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid_start,
orte_vpid_t global_vpid_start,
int num_procs, char ***env);
orte_std_cntr_t num_procs,
orte_vpid_t local_rank,
orte_std_cntr_t num_local_procs,
char ***env);
ORTE_DECLSPEC int orte_ns_nds_xcpu_put(orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid_start,
int num_procs, char ***env);
orte_std_cntr_t num_procs,
orte_vpid_t local_rank,
orte_std_cntr_t num_local_procs,
char ***env);
ORTE_DECLSPEC extern opal_list_t orte_sds_base_components_available;

Просмотреть файл

@ -34,7 +34,9 @@
#include "orte/mca/errmgr/base/base.h"
int orte_ns_nds_env_put(const orte_process_name_t* name,
orte_vpid_t vpid_start, size_t num_procs,
orte_vpid_t vpid_start, orte_std_cntr_t num_procs,
orte_vpid_t local_rank,
orte_std_cntr_t num_local_procs,
char ***env)
{
char* param;
@ -125,6 +127,25 @@ int orte_ns_nds_env_put(const orte_process_name_t* name,
opal_setenv(param, value, true, env);
free(param);
free(value);
asprintf(&value, "%lu", (unsigned long) local_rank);
if(NULL == (param = mca_base_param_environ_variable("ns","nds","local_rank"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
asprintf(&value, "%lu", (unsigned long) num_local_procs);
if(NULL == (param = mca_base_param_environ_variable("ns","nds","num_local_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
return ORTE_SUCCESS;
}
@ -142,7 +163,11 @@ int orte_ns_nds_env_put(const orte_process_name_t* name,
*/
int orte_ns_nds_bproc_put(orte_cellid_t cell, orte_jobid_t job,
orte_vpid_t vpid_start, orte_vpid_t global_vpid_start,
int num_procs, char ***env) {
orte_std_cntr_t num_procs,
orte_vpid_t local_rank,
orte_std_cntr_t num_local_procs,
char ***env)
{
char* param;
char* value;
int rc;
@ -224,7 +249,7 @@ int orte_ns_nds_bproc_put(orte_cellid_t cell, orte_jobid_t job,
free(param);
free(value);
asprintf(&value, "%d", num_procs);
asprintf(&value, "%d", (int)num_procs);
if(NULL == (param = mca_base_param_environ_variable("ns","nds","num_procs")))
{
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
@ -234,6 +259,24 @@ int orte_ns_nds_bproc_put(orte_cellid_t cell, orte_jobid_t job,
free(param);
free(value);
asprintf(&value, "%lu", (unsigned long) local_rank);
if(NULL == (param = mca_base_param_environ_variable("ns","nds","local_rank"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
asprintf(&value, "%lu", (unsigned long) num_local_procs);
if(NULL == (param = mca_base_param_environ_variable("ns","nds","num_local_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
/* we have to set this environmental variable so bproc will give us our rank
* after the launch */
@ -256,7 +299,9 @@ int orte_ns_nds_bproc_put(orte_cellid_t cell, orte_jobid_t job,
* @retval error
*/
int orte_ns_nds_xcpu_put(orte_cellid_t cell, orte_jobid_t job,
orte_vpid_t vpid_start, int num_procs,
orte_vpid_t vpid_start, orte_std_cntr_t num_procs,
orte_vpid_t local_rank,
orte_std_cntr_t num_local_procs,
char ***env)
{
char* param;
@ -319,7 +364,7 @@ int orte_ns_nds_xcpu_put(orte_cellid_t cell, orte_jobid_t job,
free(param);
free(value);
asprintf(&value, "%d", num_procs);
asprintf(&value, "%d", (int)num_procs);
if(NULL == (param = mca_base_param_environ_variable("ns","nds","num_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
@ -328,10 +373,33 @@ int orte_ns_nds_xcpu_put(orte_cellid_t cell, orte_jobid_t job,
free(param);
free(value);
asprintf(&value, "%lu", (unsigned long) local_rank);
if(NULL == (param = mca_base_param_environ_variable("ns","nds","local_rank"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
asprintf(&value, "%lu", (unsigned long) num_local_procs);
if(NULL == (param = mca_base_param_environ_variable("ns","nds","num_local_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
return ORTE_SUCCESS;
}
int orte_ns_nds_pipe_put(const orte_process_name_t* name, orte_vpid_t vpid_start, size_t num_procs, int fd)
int orte_ns_nds_pipe_put(const orte_process_name_t* name,
orte_vpid_t vpid_start,
orte_std_cntr_t num_procs,
orte_vpid_t local_rank,
orte_std_cntr_t num_local_procs,
int fd)
{
int rc;
@ -353,6 +421,18 @@ int orte_ns_nds_pipe_put(const orte_process_name_t* name, orte_vpid_t vpid_start
return ORTE_ERR_NOT_FOUND;
}
rc = write(fd,&local_rank, sizeof(local_rank));
if(rc != sizeof(local_rank)) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_NOT_FOUND;
}
rc = write(fd,&num_local_procs, sizeof(num_local_procs));
if(rc != sizeof(num_local_procs)) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_NOT_FOUND;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -58,6 +58,8 @@ int orte_sds_bproc_set_name(void)
char *session_dir;
char *uri_file;
FILE *fp;
int local_rank;
int num_local_procs;
id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL);
mca_base_param_lookup_string(id, &name_string);
@ -170,7 +172,7 @@ int orte_sds_bproc_set_name(void)
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
orte_process_info.num_procs = (size_t)num_procs;
orte_process_info.num_procs = (orte_std_cntr_t)num_procs;
id = mca_base_param_register_string("ns", "nds", "global_vpid_start", NULL, NULL);
mca_base_param_lookup_string(id, &vpid_string);
@ -196,6 +198,22 @@ int orte_sds_bproc_set_name(void)
cleanup_vpid_string = true;
}
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to an invalid value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "local_rank", NULL, ORTE_VPID_INVALID);
mca_base_param_lookup_int(id, &local_rank);
orte_process_info.local_rank = (orte_vpid_t)local_rank;
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to a value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "num_local_procs", NULL, 0);
mca_base_param_lookup_int(id, &num_local_procs);
orte_process_info.num_local_procs = (orte_std_cntr_t)num_local_procs;
/* if we are NOT a daemon, then lookup our local daemon's contact info
* and setup that link
*/

Просмотреть файл

@ -87,7 +87,7 @@ orte_sds_cnos_set_name(void)
}
orte_process_info.vpid_start = (orte_vpid_t) 0;
orte_process_info.num_procs = (size_t) cnos_get_size();
orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size();
return ORTE_SUCCESS;
}

29
orte/mca/sds/env/sds_env_module.c поставляемый
Просмотреть файл

@ -20,16 +20,19 @@
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/sds/sds.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/sds/env/sds_env.h"
#include "orte/util/proc_info.h"
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/sds/sds.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/sds/env/sds_env.h"
orte_sds_base_module_t orte_sds_env_module = {
orte_sds_base_basic_contact_universe,
@ -45,6 +48,8 @@ orte_sds_env_set_name(void)
int id;
int vpid_start;
int num_procs;
int local_rank;
int num_local_procs;
char* name_string = NULL;
char *local_daemon_uri = NULL;
@ -130,6 +135,22 @@ orte_sds_env_set_name(void)
}
orte_process_info.num_procs = (orte_std_cntr_t)num_procs;
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to an invalid value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "local_rank", NULL, ORTE_VPID_INVALID);
mca_base_param_lookup_int(id, &local_rank);
orte_process_info.local_rank = (orte_vpid_t)local_rank;
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to a value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "num_local_procs", NULL, 0);
mca_base_param_lookup_int(id, &num_local_procs);
orte_process_info.num_local_procs = (orte_std_cntr_t)num_local_procs;
id = mca_base_param_register_string("orte", "local_daemon", "uri", NULL, NULL);
mca_base_param_lookup_string(id, &local_daemon_uri);
if (NULL != local_daemon_uri) {

Просмотреть файл

@ -50,7 +50,6 @@ orte_sds_pipe_set_name(void)
{
int rc, fd, id;
orte_process_name_t name;
size_t num_procs;
/* lookup the fd to use */
id = mca_base_param_register_int("nds","pipe","fd", NULL, 3);
@ -73,13 +72,26 @@ orte_sds_pipe_set_name(void)
return ORTE_ERR_NOT_FOUND;
}
rc = read(fd,&num_procs, sizeof(num_procs));
if(rc != sizeof(num_procs)) {
rc = read(fd,&orte_process_info.num_procs, sizeof(orte_process_info.num_procs));
if(rc != sizeof(orte_process_info.num_procs)) {
opal_output(0, "orte_ns_nds_pipe_get: read returned %d, errno=%d\n", rc, errno);
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
rc = read(fd,&orte_process_info.local_rank, sizeof(orte_process_info.local_rank));
if(rc != sizeof(orte_process_info.local_rank)) {
opal_output(0, "orte_ns_nds_pipe_get: read returned %d, errno=%d\n", rc, errno);
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
rc = read(fd,&orte_process_info.num_local_procs, sizeof(orte_process_info.num_local_procs));
if(rc != sizeof(orte_process_info.num_local_procs)) {
opal_output(0, "orte_ns_nds_pipe_get: read returned %d, errno=%d\n", rc, errno);
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
orte_process_info.num_procs = (orte_std_cntr_t)num_procs;
close(fd);
return ORTE_SUCCESS;

Просмотреть файл

@ -58,6 +58,11 @@ orte_sds_singleton_set_name(void)
mca_base_param_lookup_int(id, &flag);
if (!flag) {
orte_process_info.singleton = true;
/* since we are a singleton, then we must have a local_rank of 0
* and only 1 local process
*/
orte_process_info.local_rank = 0;
orte_process_info.num_local_procs = 1;
}
return ORTE_SUCCESS;

Просмотреть файл

@ -56,6 +56,8 @@ orte_sds_slurm_set_name(void)
int id;
int vpid_start;
int num_procs;
int local_rank;
int num_local_procs;
char* name_string = NULL;
int slurm_nodeid;
char *local_daemon_uri = NULL;
@ -151,6 +153,22 @@ orte_sds_slurm_set_name(void)
}
orte_process_info.num_procs = (orte_std_cntr_t)num_procs;
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to an invalid value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "local_rank", NULL, ORTE_VPID_INVALID);
mca_base_param_lookup_int(id, &local_rank);
orte_process_info.local_rank = (orte_vpid_t)local_rank;
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to a value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "num_local_procs", NULL, 0);
mca_base_param_lookup_int(id, &num_local_procs);
orte_process_info.num_local_procs = (orte_std_cntr_t)num_local_procs;
id = mca_base_param_register_string("orte", "local_daemon", "uri", NULL, NULL);
mca_base_param_lookup_string(id, &local_daemon_uri);
if (NULL != local_daemon_uri) {

Просмотреть файл

@ -70,7 +70,7 @@ int orte_sds_xcpu_set_name(void)
char* cellid_string;
char* jobid_string;
char* vpid_string;
int num_procs;
int num_procs, local_rank, num_local_procs;
char *xcpu_rank_string;
int xcpu_rank;
int stride;
@ -140,7 +140,23 @@ int orte_sds_xcpu_set_name(void)
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
orte_process_info.num_procs = (size_t)num_procs;
orte_process_info.num_procs = (orte_std_cntr_t)num_procs;
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to an invalid value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "local_rank", NULL, ORTE_VPID_INVALID);
mca_base_param_lookup_int(id, &local_rank);
orte_process_info.local_rank = (orte_vpid_t)local_rank;
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to a value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "num_local_procs", NULL, 0);
mca_base_param_lookup_int(id, &num_local_procs);
orte_process_info.num_local_procs = (orte_std_cntr_t)num_local_procs;
#if 0
id = mca_base_param_register_string("ns", "nds", "global_vpid_start", NULL, NULL);

Просмотреть файл

@ -23,7 +23,9 @@ int main(int argc, char* argv[])
gethostname(hostname, 512);
pid = getpid();
printf("orte_nodename: Node %s Name [%lu,%lu,%lu] Pid %ld\n", hostname, ORTE_NAME_ARGS(orte_process_info.my_name), (long)pid);
printf("orte_nodename: Node %s Name [%lu,%lu,%lu] Pid %ld Local Rank: %ld Num_local_procs %ld\n",
hostname, ORTE_NAME_ARGS(orte_process_info.my_name), (long)pid,
(long)orte_process_info.local_rank, (long)orte_process_info.num_local_procs);
orte_finalize();
return 0;

Просмотреть файл

@ -87,7 +87,8 @@ int main(int argc, char* argv[])
* when it completes
*/
fprintf(stderr, "Parent: spawning children!\n");
fprintf(stderr, "Parent: My local rank is %ld with %ld num_local_procs - spawning children!\n",
(long)orte_process_info.local_rank, (long)orte_process_info.num_local_procs);
cb_states = ORTE_PROC_STATE_TERMINATED;
spawned = true;
if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(&app, 1, &job, 0, NULL, job_state_callback, cb_states, &attributes))) {

Просмотреть файл

@ -642,7 +642,7 @@ static void dump_aborted_procs(orte_jobid_t jobid, orte_app_context_t **apps, or
continue;
}
if(strcmp(keyval->key, ORTE_PROC_RANK_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
continue;
}

Просмотреть файл

@ -39,6 +39,8 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .singleton = */ false,
/* .vpid_start = */ 0,
/* .num_procs = */ 1,
/* .local_rank = */ ORTE_VPID_INVALID,
/* .num_local_procs = */ 0,
/* .pid = */ 0,
/* .seed = */ false,
/* .daemon = */ false,

Просмотреть файл

@ -52,6 +52,8 @@ struct orte_proc_info_t {
bool singleton; /**< I am a singleton */
orte_vpid_t vpid_start; /**< starting vpid for this job */
orte_std_cntr_t num_procs; /**< number of processes in this job */
orte_vpid_t local_rank; /**< local rank on this node */
orte_std_cntr_t num_local_procs; /**< total number of procs on this node */
pid_t pid; /**< Local process ID for this process */
bool seed; /**< Indicate whether or not this is seed daemon */
bool daemon; /**< Indicate whether or not I am a daemon */