From 3b064a624e5bd9dedad7bfb86861ad4dbb880b21 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 12 Dec 2006 16:07:23 +0000 Subject: [PATCH] For convenience, revise the orte_job_map_t object so it includes the vpid start/range values, the number of nodes, and the number of processes on each node. These values are all used in various places in the code base - we currently re-compute them multiple times. Since these values do not change and are already being computed by the RMAPS framework, we might as well just save them for re-use. This commit was SVN r12829. --- .../rmaps_data_type_copy_fns.c | 7 + .../rmaps_data_type_packing_fns.c | 32 ++- .../rmaps_data_type_print_fns.c | 13 +- .../rmaps_data_type_unpacking_fns.c | 48 +++-- orte/mca/rmaps/base/rmaps_base_registry_fns.c | 191 +++++++++++------- orte/mca/rmaps/base/rmaps_class_instances.h | 7 + orte/mca/rmaps/rmaps_types.h | 11 +- orte/mca/rmaps/round_robin/rmaps_rr.c | 21 +- 8 files changed, 221 insertions(+), 109 deletions(-) diff --git a/orte/mca/rmaps/base/data_type_support/rmaps_data_type_copy_fns.c b/orte/mca/rmaps/base/data_type_support/rmaps_data_type_copy_fns.c index 7ff142484c..52f17f35c5 100644 --- a/orte/mca/rmaps/base/data_type_support/rmaps_data_type_copy_fns.c +++ b/orte/mca/rmaps/base/data_type_support/rmaps_data_type_copy_fns.c @@ -55,6 +55,11 @@ int orte_rmaps_base_copy_map(orte_job_map_t **dest, orte_job_map_t *src, orte_da /* copy data into it */ (*dest)->job = src->job; + if (NULL != src->mapping_mode) { + (*dest)->mapping_mode = strdup(src->mapping_mode); + } + (*dest)->vpid_start = src->vpid_start; + (*dest)->vpid_range = src->vpid_range; (*dest)->num_apps = src->num_apps; (*dest)->apps = (orte_app_context_t**)malloc(src->num_apps * sizeof(orte_app_context_t*)); @@ -71,6 +76,7 @@ int orte_rmaps_base_copy_map(orte_job_map_t **dest, orte_job_map_t *src, orte_da } } + (*dest)->num_nodes = src->num_nodes; for (item = opal_list_get_first(&(src->nodes)); item != 
opal_list_get_end(&(src->nodes)); item = opal_list_get_next(item)) { @@ -157,6 +163,7 @@ int orte_rmaps_base_copy_mapped_node(orte_mapped_node_t **dest, orte_mapped_node (*dest)->oversubscribed = src->oversubscribed; + (*dest)->num_procs = src->num_procs; for (item = opal_list_get_first(&(src->procs)); item != opal_list_get_end(&(src->procs)); item = opal_list_get_next(item)) { diff --git a/orte/mca/rmaps/base/data_type_support/rmaps_data_type_packing_fns.c b/orte/mca/rmaps/base/data_type_support/rmaps_data_type_packing_fns.c index 106fa0985f..8c1d83f923 100644 --- a/orte/mca/rmaps/base/data_type_support/rmaps_data_type_packing_fns.c +++ b/orte/mca/rmaps/base/data_type_support/rmaps_data_type_packing_fns.c @@ -37,7 +37,7 @@ int orte_rmaps_base_pack_map(orte_buffer_t *buffer, void *src, orte_std_cntr_t num_vals, orte_data_type_t type) { int rc; - orte_std_cntr_t i, num_nodes; + orte_std_cntr_t i; orte_job_map_t **maps; opal_list_item_t *item; orte_mapped_node_t *srcnode; @@ -52,6 +52,24 @@ int orte_rmaps_base_pack_map(orte_buffer_t *buffer, void *src, return rc; } + /* pack the mapping mode used to generate it */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(maps[i]->mapping_mode), 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the starting vpid */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(maps[i]->vpid_start), 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* pack the range */ + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(maps[i]->vpid_range), 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the number of app_contexts */ if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(maps[i]->num_apps), 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); @@ -65,14 +83,13 @@ int orte_rmaps_base_pack_map(orte_buffer_t *buffer, void *src, } /* pack the number of nodes */ - num_nodes = (orte_std_cntr_t)opal_list_get_size(&(maps[i]->nodes)); - if (ORTE_SUCCESS != (rc = 
orte_dss_pack_buffer(buffer, &num_nodes, 1, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(maps[i]->num_nodes), 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); return rc; } /* pack the nodes list */ - if (0 < num_nodes) { + if (0 < maps[i]->num_nodes) { for (item = opal_list_get_first(&(maps[i]->nodes)); item != opal_list_get_end(&(maps[i]->nodes)); item = opal_list_get_next(item)) { @@ -141,7 +158,7 @@ int orte_rmaps_base_pack_mapped_node(orte_buffer_t *buffer, void *src, orte_std_cntr_t num_vals, orte_data_type_t type) { int rc; - orte_std_cntr_t i, num_procs; + orte_std_cntr_t i; orte_mapped_node_t **nodes; opal_list_item_t *item; orte_mapped_proc_t *srcproc; @@ -181,14 +198,13 @@ int orte_rmaps_base_pack_mapped_node(orte_buffer_t *buffer, void *src, } /* pack the number of procs */ - num_procs = (orte_std_cntr_t)opal_list_get_size(&(nodes[i]->procs)); - if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &num_procs, 1, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(nodes[i]->num_procs), 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); return rc; } /* pack the procs list */ - if (0 < num_procs) { + if (0 < nodes[i]->num_procs) { for (item = opal_list_get_first(&(nodes[i]->procs)); item != opal_list_get_end(&(nodes[i]->procs)); item = opal_list_get_next(item)) { diff --git a/orte/mca/rmaps/base/data_type_support/rmaps_data_type_print_fns.c b/orte/mca/rmaps/base/data_type_support/rmaps_data_type_print_fns.c index bb3869f2a3..6b9374923f 100644 --- a/orte/mca/rmaps/base/data_type_support/rmaps_data_type_print_fns.c +++ b/orte/mca/rmaps/base/data_type_support/rmaps_data_type_print_fns.c @@ -38,7 +38,7 @@ int orte_rmaps_base_print_map(char **output, char *prefix, orte_job_map_t *src, { char *tmp, *tmp2, *tmp3, *pfx, *pfx2; orte_mapped_node_t *srcnode; - orte_std_cntr_t i, num_nodes; + orte_std_cntr_t i; opal_list_item_t *item; int rc; @@ -52,7 +52,9 @@ int orte_rmaps_base_print_map(char **output, char *prefix, 
orte_job_map_t *src, asprintf(&pfx2, "%s", prefix); } - asprintf(&tmp, "%sMap for job: %ld\tNum app_contexts: %ld", pfx2, (long)src->job, (long)src->num_apps); + asprintf(&tmp, "%sMap for job: %ld\tGenerated by mapping mode: %s\n%s\tStarting vpid: %ld\tVpid range: %ld\tNum app_contexts: %ld", + pfx2, (long)src->job, (NULL == src->mapping_mode) ? "NULL" : src->mapping_mode, + pfx2, (long)src->vpid_start, (long)src->vpid_range, (long)src->num_apps); asprintf(&pfx, "%s\t", pfx2); free(pfx2); @@ -70,8 +72,7 @@ int orte_rmaps_base_print_map(char **output, char *prefix, orte_job_map_t *src, tmp = tmp3; } - num_nodes = (orte_std_cntr_t)opal_list_get_size(&(src->nodes)); - asprintf(&tmp, "%s\n%sNum elements in nodes list: %ld", tmp3, pfx, (long)num_nodes); + asprintf(&tmp, "%s\n%sNum elements in nodes list: %ld", tmp3, pfx, (long)src->num_nodes); for (item = opal_list_get_first(&(src->nodes)); item != opal_list_get_end(&(src->nodes)); @@ -145,7 +146,6 @@ int orte_rmaps_base_print_mapped_node(char **output, char *prefix, orte_mapped_n { int rc; char *tmp, *tmp2, *tmp3, *pfx, *pfx2; - orte_std_cntr_t num_procs; opal_list_item_t *item; orte_mapped_proc_t *srcproc; @@ -173,9 +173,8 @@ int orte_rmaps_base_print_mapped_node(char **output, char *prefix, orte_mapped_n return rc; } - num_procs = (orte_std_cntr_t)opal_list_get_size(&(src->procs)); asprintf(&tmp3, "%s\n\t%s\n%sOversubscribed: %s\tNum elements in procs list: %ld", tmp, tmp2, pfx, - (src->oversubscribed ? "True" : "False"), (long)num_procs); + (src->oversubscribed ? 
"True" : "False"), (long)src->num_procs); free(tmp); free(tmp2); diff --git a/orte/mca/rmaps/base/data_type_support/rmaps_data_type_unpacking_fns.c b/orte/mca/rmaps/base/data_type_support/rmaps_data_type_unpacking_fns.c index 1c63931d97..7f9f21434c 100644 --- a/orte/mca/rmaps/base/data_type_support/rmaps_data_type_unpacking_fns.c +++ b/orte/mca/rmaps/base/data_type_support/rmaps_data_type_unpacking_fns.c @@ -39,7 +39,7 @@ int orte_rmaps_base_unpack_map(orte_buffer_t *buffer, void *dest, orte_std_cntr_t *num_vals, orte_data_type_t type) { int rc; - orte_std_cntr_t i, j, n, num_nodes; + orte_std_cntr_t i, j, n; orte_job_map_t **maps; orte_mapped_node_t *node; @@ -62,6 +62,30 @@ int orte_rmaps_base_unpack_map(orte_buffer_t *buffer, void *dest, return rc; } + /* unpack the mapping mode */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + &(maps[i]->mapping_mode), &n, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the starting vpid */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + &(maps[i]->vpid_start), &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* unpack the vpid range */ + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, + &(maps[i]->vpid_range), &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* unpack the number of app_contexts */ n = 1; if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, @@ -85,12 +109,12 @@ int orte_rmaps_base_unpack_map(orte_buffer_t *buffer, void *dest, /* unpack the number of nodes */ n = 1; - if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &num_nodes, &n, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &(maps[i]->num_nodes), &n, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); return rc; } - for (j=0; j < num_nodes; j++) { + for (j=0; j < maps[i]->num_nodes; j++) { n = 1; if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &node, &n, ORTE_MAPPED_NODE))) { ORTE_ERROR_LOG(rc); @@ -167,7 +191,7 
@@ int orte_rmaps_base_unpack_mapped_node(orte_buffer_t *buffer, void *dest, orte_std_cntr_t *num_vals, orte_data_type_t type) { int rc; - orte_std_cntr_t i, j, n, num_procs; + orte_std_cntr_t i, j, n; orte_mapped_node_t **nodes; orte_mapped_proc_t *srcproc; @@ -224,21 +248,19 @@ int orte_rmaps_base_unpack_mapped_node(orte_buffer_t *buffer, void *dest, /* unpack the number of procs */ n = 1; - if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &num_procs, &n, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &(nodes[i]->num_procs), &n, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); return rc; } /* if we have some, unpack them */ - if (0 < num_procs) { - for (j=0; j < num_procs; j++) { - n = 1; - if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &srcproc, &n, ORTE_MAPPED_PROC))) { - ORTE_ERROR_LOG(rc); - return rc; - } - opal_list_append(&(nodes[i]->procs), &srcproc->super); + for (j=0; j < nodes[i]->num_procs; j++) { + n = 1; + if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &srcproc, &n, ORTE_MAPPED_PROC))) { + ORTE_ERROR_LOG(rc); + return rc; } + opal_list_append(&(nodes[i]->procs), &srcproc->super); } } diff --git a/orte/mca/rmaps/base/rmaps_base_registry_fns.c b/orte/mca/rmaps/base/rmaps_base_registry_fns.c index 1b3d7696c3..0d27522af1 100644 --- a/orte/mca/rmaps/base/rmaps_base_registry_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_registry_fns.c @@ -43,7 +43,10 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid) { orte_job_map_t *mapping; orte_mapped_proc_t *proc; + orte_mapped_node_t *mnode; + opal_list_item_t *item; orte_cellid_t *cellptr, cell=ORTE_CELLID_INVALID; + orte_vpid_t *vptr; orte_std_cntr_t *sptr; bool *bptr, oversub=false; pid_t *pidptr; @@ -64,6 +67,8 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid) ORTE_NODE_NAME_KEY, ORTE_NODE_USERNAME_KEY, ORTE_NODE_OVERSUBSCRIBED_KEY, + ORTE_JOB_VPID_START_KEY, + ORTE_JOB_VPID_RANGE_KEY, NULL }; @@ -79,7 +84,7 @@ int 
orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid) return ORTE_ERR_OUT_OF_RESOURCE; } - /* store the jobid */ + /* set the jobid */ mapping->job = jobid; /* get the job segment name */ @@ -119,89 +124,123 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid) value = values[v]; node_name = NULL; - proc = OBJ_NEW(orte_mapped_proc_t); - if(NULL == proc) { - rc = ORTE_ERR_OUT_OF_RESOURCE; - ORTE_ERROR_LOG(rc); - goto cleanup; + if (0 == strcmp(value->tokens[0], ORTE_JOB_GLOBALS)) { + /* this came from the job_globals container, so look for the related values */ + for (kv=0; kv < value->cnt; kv++) { + if(strcmp(value->keyvals[kv]->key, ORTE_JOB_VPID_START_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, value->keyvals[kv]->value, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + mapping->vpid_start = *vptr; + continue; + } + if(strcmp(value->keyvals[kv]->key, ORTE_JOB_VPID_RANGE_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, value->keyvals[kv]->value, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + mapping->vpid_range = *vptr; + continue; + } + } } - - for(kv = 0; kv<value->cnt; kv++) { - keyval = value->keyvals[kv]; + + else { + /* this came from a process container */ + proc = OBJ_NEW(orte_mapped_proc_t); + if(NULL == proc) { + rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); + goto cleanup; + } - if(strcmp(keyval->key, ORTE_PROC_RANK_KEY) == 0) { - if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + for(kv = 0; kv<value->cnt; kv++) { + keyval = value->keyvals[kv]; + + if(strcmp(keyval->key, ORTE_PROC_RANK_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + proc->rank = *sptr; + continue; } - proc->rank = *sptr; - continue; - } - if(strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) { - if (ORTE_SUCCESS != (rc = 
orte_dss.get((void**)&pptr, keyval->value, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if(strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pptr, keyval->value, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + proc->name = *pptr; + continue; } - proc->name = *pptr; - continue; - } - if(strcmp(keyval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) { - if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if(strcmp(keyval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + proc->app_idx = *sptr; + continue; } - proc->app_idx = *sptr; - continue; - } - if(strcmp(keyval->key, ORTE_PROC_LOCAL_PID_KEY) == 0) { - if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pidptr, keyval->value, ORTE_PID))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if(strcmp(keyval->key, ORTE_PROC_LOCAL_PID_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pidptr, keyval->value, ORTE_PID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + proc->pid = *pidptr; + continue; } - proc->pid = *pidptr; - continue; - } - if(strcmp(keyval->key, ORTE_CELLID_KEY) == 0) { - if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cellptr, keyval->value, ORTE_CELLID))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if(strcmp(keyval->key, ORTE_CELLID_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cellptr, keyval->value, ORTE_CELLID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + cell = *cellptr; + continue; } - cell = *cellptr; - continue; - } - if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) { - /* use the dss.copy function here to protect us against zero-length strings */ - if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&node_name, keyval->value->data, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if(strcmp(keyval->key, 
ORTE_NODE_NAME_KEY) == 0) { + /* use the dss.copy function here to protect us against zero-length strings */ + if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&node_name, keyval->value->data, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + continue; } - continue; - } - if(strcmp(keyval->key, ORTE_NODE_USERNAME_KEY) == 0) { - /* use the dss.copy function here to protect us against zero-length strings */ - if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&username, keyval->value->data, ORTE_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if(strcmp(keyval->key, ORTE_NODE_USERNAME_KEY) == 0) { + /* use the dss.copy function here to protect us against zero-length strings */ + if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&username, keyval->value->data, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + continue; } - continue; - } - if(strcmp(keyval->key, ORTE_NODE_OVERSUBSCRIBED_KEY) == 0) { - if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, keyval->value, ORTE_BOOL))) { - ORTE_ERROR_LOG(rc); - goto cleanup; + if(strcmp(keyval->key, ORTE_NODE_OVERSUBSCRIBED_KEY) == 0) { + if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, keyval->value, ORTE_BOOL))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + oversub = *bptr; + continue; } - oversub = *bptr; - continue; } + /* store this process in the map */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(mapping, cell, node_name, username, oversub, proc))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (NULL != node_name) free(node_name); } - /* store this process in the map */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_add_proc_to_map(mapping, cell, node_name, username, oversub, proc))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (NULL != node_name) free(node_name); } + /* compute and save convenience values */ + mapping->num_nodes = opal_list_get_size(&mapping->nodes); + for (item = opal_list_get_first(&mapping->nodes); + item != opal_list_get_end(&mapping->nodes); + item = 
opal_list_get_next(item)) { + mnode = (orte_mapped_node_t*)item; + mnode->num_procs = opal_list_get_size(&mnode->procs); + } + /* all done */ *map = mapping; return ORTE_SUCCESS; @@ -303,10 +342,10 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map) return rc; } - /** setup the last value in the array to update the INIT counter */ + /** setup the last value in the array to store the vpid start/range and update the INIT counter */ if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[num_procs]), ORTE_GPR_OVERWRITE|ORTE_GPR_TOKENS_AND, - segment, 1, 1))) { + segment, 3, 1))) { ORTE_ERROR_LOG(rc); free(values); free(segment); @@ -316,6 +355,14 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map) ORTE_ERROR_LOG(rc); goto cleanup; } + if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[num_procs]->keyvals[1]), ORTE_JOB_VPID_START_KEY, ORTE_VPID, &map->vpid_start))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[num_procs]->keyvals[2]), ORTE_JOB_VPID_RANGE_KEY, ORTE_VPID, &map->vpid_range))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } values[num_procs]->tokens[0] = strdup(ORTE_JOB_GLOBALS); /* counter is in the job's globals container */ diff --git a/orte/mca/rmaps/base/rmaps_class_instances.h b/orte/mca/rmaps/base/rmaps_class_instances.h index 003b0c1822..ed8a33e1b2 100644 --- a/orte/mca/rmaps/base/rmaps_class_instances.h +++ b/orte/mca/rmaps/base/rmaps_class_instances.h @@ -66,6 +66,7 @@ static void orte_rmaps_mapped_node_construct(orte_mapped_node_t* node) node->username = NULL; node->daemon = NULL; node->oversubscribed = false; + node->num_procs = 0; OBJ_CONSTRUCT(&node->procs, opal_list_t); } @@ -103,8 +104,12 @@ OBJ_CLASS_INSTANCE(orte_mapped_node_t, static void orte_rmaps_job_map_construct(orte_job_map_t* map) { map->job = ORTE_JOBID_INVALID; + map->mapping_mode = NULL; + map->vpid_start = ORTE_VPID_INVALID; + map->vpid_range = 0; map->num_apps = 0; map->apps = NULL; + map->num_nodes = 0; 
OBJ_CONSTRUCT(&map->nodes, opal_list_t); } @@ -113,6 +118,8 @@ static void orte_rmaps_job_map_destruct(orte_job_map_t* map) orte_std_cntr_t i=0; opal_list_item_t* item; + if (NULL != map->mapping_mode) free(map->mapping_mode); + for(i=0; i < map->num_apps; i++) { if (NULL != map->apps[i]) OBJ_RELEASE(map->apps[i]); } diff --git a/orte/mca/rmaps/rmaps_types.h b/orte/mca/rmaps/rmaps_types.h index a51a106245..6325bfa998 100644 --- a/orte/mca/rmaps/rmaps_types.h +++ b/orte/mca/rmaps/rmaps_types.h @@ -69,7 +69,10 @@ struct orte_mapped_node_t { orte_process_name_t *daemon; /* name of the daemon on this node * NULL => daemon not assigned yet */ - bool oversubscribed; /* whether or not the #procs > #processors */ + bool oversubscribed; /* whether or not the #procs > #process slots on this node */ + orte_std_cntr_t num_procs; /* #procs on this node - just the length of the procs list, but + * stored here so we don't have to keep recomputing it elsewhere + */ opal_list_t procs; /* list of mapped_proc objects on this node */ }; typedef struct orte_mapped_node_t orte_mapped_node_t; @@ -82,8 +85,14 @@ OBJ_CLASS_DECLARATION(orte_mapped_node_t); struct orte_job_map_t { opal_object_t super; orte_jobid_t job; + char *mapping_mode; + orte_vpid_t vpid_start; + orte_vpid_t vpid_range; orte_std_cntr_t num_apps; /* number of app_contexts */ orte_app_context_t **apps; /* the array of app_contexts for this job */ + orte_std_cntr_t num_nodes; /* #nodes in this map - just the length of the nodes list, but + * stored here so we don't have to keep recomputing it elsewhere + */ opal_list_t nodes; /* list of mapped_node_t */ }; typedef struct orte_job_map_t orte_job_map_t; diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 9f2c63c795..914d16e0b2 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -330,8 +330,9 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes) opal_list_t 
master_node_list, mapped_node_list, max_used_nodes, *working_node_list; opal_list_item_t *item, *item2; orte_ras_node_t *node, *node2; + orte_mapped_node_t *mnode; char *save_bookmark; - orte_vpid_t vpid_start, job_vpid_start=0; + orte_vpid_t vpid_start; orte_std_cntr_t num_procs = 0, total_num_slots, mapped_num_slots, num_nodes, num_slots; int rc; bool modify_app_context = false; @@ -509,7 +510,7 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes) /** save the initial starting vpid for later */ if (0 == i) { - job_vpid_start = vpid_start; + map->vpid_start = vpid_start; } /** track the total number of processes we mapped */ @@ -646,17 +647,21 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes) } + /* compute and save convenience values */ + map->vpid_range = num_procs; + map->num_nodes = opal_list_get_size(&map->nodes); + for (item = opal_list_get_first(&map->nodes); + item != opal_list_get_end(&map->nodes); + item = opal_list_get_next(item)) { + mnode = (orte_mapped_node_t*)item; + mnode->num_procs = opal_list_get_size(&mnode->procs); + } + /* save mapping to the registry */ if(ORTE_SUCCESS != (rc = orte_rmaps_base_put_job_map(map))) { goto cleanup; } - /* save vpid start/range on the job segment */ - if (ORTE_SUCCESS != (rc = orte_rmgr.set_vpid_range(jobid, job_vpid_start, num_procs))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /** join the master_node_list and fully_used_list so that all info gets updated */ opal_list_join(&master_node_list, opal_list_get_end(&master_node_list), &fully_used_nodes);