1
1
openmpi/orte/mca/rmaps/base/rmaps_base_node.c
Ralph Castain febc143d8c Per LANL's stated need, add functionality that runs a.out across ALL available process slots if no num_proc is specified on the command line. However, please note the following limitation: we ONLY allow ONE application to be specified on the command line when this feature is invoked. If multiple apps are specified, the user MUST also specify the number to be launched for each and every one of them.
Update the help text to report errors when not following that rule.

Also updated the RMAPS help text to reflect the reorganization of some of the round-robin code into the base.

The new functionality has been tested under Mac OS-X and on Odin using an MPI program. Both byslot and bynode mapping have been checked and verified. Operational support for other systems needs to be verified - I respectfully request people's help in doing so.

This commit was SVN r10708.
2006-07-10 21:25:33 +00:00

390 строки
13 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <string.h>
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/if.h"
#include "opal/util/show_help.h"
#include "orte/util/sys_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/ras/base/ras_base_node.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rmaps/base/rmaps_base_node.h"
/*
* A sanity check to ensure that all of the requested nodes are actually
* allocated to this application.
*/
static bool are_all_mapped_valid(char **mapping,
int num_mapped,
opal_list_t* nodes)
{
opal_list_item_t *item;
int i;
bool matched;
for (i = 0; i < num_mapped; ++i) {
matched = false;
for(item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item) ) {
if( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i]) ) {
matched = true;
break;
}
}
/* If we find one requested resource that is not allocated,
* then return an error */
if(!matched) {
return false;
}
}
return true;
}
/*
* If the node in question is in the current mapping.
*/
static bool is_mapped(opal_list_item_t *item,
char **mapping,
int num_mapped)
{
int i;
for ( i = 0; i < num_mapped; ++i) {
if ( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i])){
return true;
}
}
return false;
}
/*
* Query the registry for all nodes allocated to a specified job
*/
int orte_rmaps_base_get_target_nodes(opal_list_t* nodes, orte_jobid_t jobid, size_t *total_num_slots)
{
opal_list_item_t *item, *next;
orte_ras_node_t *node;
int id, rc, nolocal;
size_t num_slots=0;
/** set default answer */
*total_num_slots = 0;
if(ORTE_SUCCESS != (rc = orte_ras_base_node_query_alloc(nodes, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* If the "no local" option was set, then remove the local node
from the list */
id = mca_base_param_find("rmaps", NULL, "base_schedule_local");
mca_base_param_lookup_int(id, &nolocal);
if (0 == nolocal) {
for (item = opal_list_get_first(nodes);
item != opal_list_get_end(nodes);
item = opal_list_get_next(item) ) {
if (0 == strcmp(((orte_ras_node_t *) item)->node_name,
orte_system_info.nodename) ||
opal_ifislocal(((orte_ras_node_t *) item)->node_name)) {
opal_list_remove_item(nodes, item);
break;
}
}
}
/** remove all nodes that are already at max usage */
item = opal_list_get_first(nodes);
while (item != opal_list_get_end(nodes)) {
/** save the next pointer in case we remove this node */
next = opal_list_get_next(item);
/** check to see if this node is fully used - remove if so */
node = (orte_ras_node_t*)item;
if (0 != node->node_slots_max && node->node_slots_inuse > node->node_slots_max) {
opal_list_remove_item(nodes, item);
} else { /** otherwise, add its slots to the total */
num_slots += node->node_slots;
}
/** go on to next item */
item = next;
}
/* Sanity check to make sure we have been allocated nodes */
if (0 == opal_list_get_size(nodes)) {
ORTE_ERROR_LOG(ORTE_ERR_TEMP_OUT_OF_RESOURCE);
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
*total_num_slots = num_slots;
return ORTE_SUCCESS;
}
/*
* Create a sub-list of nodes to be used for user-specified mappings
*/
int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_list,
orte_app_context_t *app,
opal_list_t *master_node_list,
size_t *total_num_slots)
{
orte_app_context_map_t** loc_map = app->map_data;
opal_list_item_t *item;
orte_ras_node_t *node, *new_node;
char **mapped_nodes = NULL;
int num_mapped_nodes = 0;
size_t j, k, num_slots=0;
int rc;
/** set default answer */
*total_num_slots = 0;
/* Accumulate all of the host name mappings */
for(k = 0; k < app->num_map; ++k) {
if ( ORTE_APP_CONTEXT_MAP_HOSTNAME == loc_map[k]->map_type ) {
if(mapped_nodes == NULL) {
mapped_nodes = opal_argv_split(loc_map[k]->map_data, ',');
num_mapped_nodes = opal_argv_count(mapped_nodes);
}
else { /* Append to the existing mapping */
char ** mini_map = opal_argv_split(loc_map[k]->map_data, ',');
size_t mini_num_map = opal_argv_count(mini_map);
for (j = 0; j < mini_num_map; ++j) {
rc = opal_argv_append(&num_mapped_nodes, &mapped_nodes, mini_map[j]);
if (OPAL_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
opal_argv_free(mini_map);
}
}
}
/** check to see that all the nodes in the specified mapping have been allocated
* for our use - if not, then that's an error
*/
if( !are_all_mapped_valid(mapped_nodes, num_mapped_nodes, master_node_list) ) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:not-all-mapped-alloc",
true, app->app, opal_argv_join(mapped_nodes, ','));
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/** setup the working node list to include only those nodes that were specified
* in this mapping. We don't need to worry about nodes being fully used or not
* since the master list only includes nodes that aren't.
*/
for (item = opal_list_get_first(master_node_list);
item != opal_list_get_end(master_node_list);
item = opal_list_get_next(item) ) {
node = (orte_ras_node_t*)item;
if( is_mapped(item, mapped_nodes, num_mapped_nodes) ) {
/** we can't just add this item to the mapped_node_list as it cannot be
* on two lists at the same time, so we need to copy it first
*/
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&new_node, node, ORTE_RAS_NODE))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_list_append(mapped_node_list, &new_node->super);
num_slots += new_node->node_slots_alloc;
}
}
/** check that anything is left! */
if (0 == opal_list_get_size(mapped_node_list)) {
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->num_procs, app->app);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
*total_num_slots = num_slots;
return ORTE_SUCCESS;
}
/*
* Claim a slot for a specified job on a node
*/
int orte_rmaps_base_claim_slot(orte_rmaps_base_map_t *map,
orte_ras_node_t *current_node,
orte_jobid_t jobid, orte_vpid_t vpid,
int proc_index,
opal_list_t *nodes,
opal_list_t *fully_used_nodes)
{
orte_rmaps_base_proc_t *proc;
orte_process_name_t *proc_name;
orte_rmaps_base_node_t *rmaps_node;
int rc;
/* create objects */
rmaps_node = OBJ_NEW(orte_rmaps_base_node_t);
if (NULL == rmaps_node) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
OBJ_RETAIN(current_node);
rmaps_node->node = current_node;
proc = OBJ_NEW(orte_rmaps_base_proc_t);
if (NULL == proc) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(rmaps_node);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* create the process name as an offset from the vpid-start */
rc = orte_ns.create_process_name(&proc_name, current_node->node_cellid,
jobid, vpid);
if (rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(proc);
OBJ_RELEASE(rmaps_node);
return rc;
}
proc->proc_node = rmaps_node;
proc->proc_name = *proc_name;
proc->proc_rank = vpid;
orte_ns.free_name(&proc_name);
OBJ_RETAIN(proc); /* bump reference count for the node */
opal_list_append(&rmaps_node->node_procs, &proc->super);
map->procs[proc_index] = proc;
/* Save this node on the map */
opal_list_append(&map->nodes, &rmaps_node->super);
/* Be sure to demarcate this slot as claimed for the node */
current_node->node_slots_inuse++;
/* Remove this node if it has reached its max number of allocatable slots OR it has
* reached the soft limit AND we are in a "no oversubscribe" state
*/
if ((0 != current_node->node_slots_max &&
current_node->node_slots_inuse >= current_node->node_slots_max) ||
(!orte_rmaps_base.oversubscribe &&
current_node->node_slots_inuse >= current_node->node_slots)) {
opal_list_remove_item(nodes, (opal_list_item_t*)current_node);
/* add it to the list of fully used nodes */
opal_list_append(fully_used_nodes, &current_node->super);
/** now return the proper code so the caller knows we removed the node! */
return ORTE_ERR_NODE_FULLY_USED;
}
return ORTE_SUCCESS;
}
/*
* Update the node allocations stored in the registry
*/
int orte_rmaps_base_update_node_usage(opal_list_t *nodes)
{
opal_list_item_t* item;
orte_gpr_value_t **values;
int rc;
size_t num_values, i, j;
orte_ras_node_t* node;
num_values = opal_list_get_size(nodes);
if (0 >= num_values) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}
values = (orte_gpr_value_t**)malloc(num_values * sizeof(orte_gpr_value_t*));
if (NULL == values) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
for (i=0; i < num_values; i++) {
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]),
ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND,
ORTE_NODE_SEGMENT, 1, 0))) {
ORTE_ERROR_LOG(rc);
for (j=0; j < i; j++) {
OBJ_RELEASE(values[j]);
}
free(values);
return rc;
}
}
for(i=0, item = opal_list_get_first(nodes);
i < num_values && item != opal_list_get_end(nodes);
i++, item = opal_list_get_next(item)) {
node = (orte_ras_node_t*)item;
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[0]), ORTE_NODE_SLOTS_IN_USE_KEY,
ORTE_SIZE, &(node->node_slots_inuse)))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup index/keys for this node */
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_cellid, node->node_name);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
/* try the insert */
if (ORTE_SUCCESS != (rc = orte_gpr.put(num_values, values))) {
ORTE_ERROR_LOG(rc);
}
cleanup:
for (j=0; j < num_values; j++) {
OBJ_RELEASE(values[j]);
}
if (NULL != values) free(values);
return rc;
}