febc143d8c
Update the help text to report errors when not following that rule. Also updated the RMAPS help text to reflect the reorganization of some of the round-robin code into the base. The new functionality has been tested under Mac OS-X and on Odin using an MPI program. Both byslot and bynode mapping have been checked and verified. Operational support for other systems needs to be verified - I respectfully request people's help in doing so. This commit was SVN r10708.
390 строки
13 KiB
C
390 строки
13 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/orte_constants.h"
|
|
|
|
#include <string.h>
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "opal/util/if.h"
|
|
#include "opal/util/show_help.h"
|
|
|
|
#include "orte/util/sys_info.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/soh/soh_types.h"
|
|
#include "orte/mca/ras/base/ras_base_node.h"
|
|
#include "orte/mca/gpr/gpr.h"
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/rmaps/base/base.h"
|
|
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
|
#include "orte/mca/rmaps/base/rmaps_base_node.h"
|
|
|
|
|
|
/*
|
|
* A sanity check to ensure that all of the requested nodes are actually
|
|
* allocated to this application.
|
|
*/
|
|
static bool are_all_mapped_valid(char **mapping,
|
|
int num_mapped,
|
|
opal_list_t* nodes)
|
|
{
|
|
opal_list_item_t *item;
|
|
int i;
|
|
bool matched;
|
|
|
|
for (i = 0; i < num_mapped; ++i) {
|
|
matched = false;
|
|
|
|
for(item = opal_list_get_first(nodes);
|
|
item != opal_list_get_end(nodes);
|
|
item = opal_list_get_next(item) ) {
|
|
if( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i]) ) {
|
|
matched = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* If we find one requested resource that is not allocated,
|
|
* then return an error */
|
|
if(!matched) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* If the node in question is in the current mapping.
|
|
*/
|
|
static bool is_mapped(opal_list_item_t *item,
|
|
char **mapping,
|
|
int num_mapped)
|
|
{
|
|
int i;
|
|
|
|
for ( i = 0; i < num_mapped; ++i) {
|
|
if ( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i])){
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Query the registry for all nodes allocated to a specified job
|
|
*/
|
|
int orte_rmaps_base_get_target_nodes(opal_list_t* nodes, orte_jobid_t jobid, size_t *total_num_slots)
|
|
{
|
|
opal_list_item_t *item, *next;
|
|
orte_ras_node_t *node;
|
|
int id, rc, nolocal;
|
|
size_t num_slots=0;
|
|
|
|
/** set default answer */
|
|
*total_num_slots = 0;
|
|
|
|
if(ORTE_SUCCESS != (rc = orte_ras_base_node_query_alloc(nodes, jobid))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
|
|
/* If the "no local" option was set, then remove the local node
|
|
from the list */
|
|
|
|
id = mca_base_param_find("rmaps", NULL, "base_schedule_local");
|
|
mca_base_param_lookup_int(id, &nolocal);
|
|
if (0 == nolocal) {
|
|
for (item = opal_list_get_first(nodes);
|
|
item != opal_list_get_end(nodes);
|
|
item = opal_list_get_next(item) ) {
|
|
if (0 == strcmp(((orte_ras_node_t *) item)->node_name,
|
|
orte_system_info.nodename) ||
|
|
opal_ifislocal(((orte_ras_node_t *) item)->node_name)) {
|
|
opal_list_remove_item(nodes, item);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/** remove all nodes that are already at max usage */
|
|
item = opal_list_get_first(nodes);
|
|
while (item != opal_list_get_end(nodes)) {
|
|
|
|
/** save the next pointer in case we remove this node */
|
|
next = opal_list_get_next(item);
|
|
|
|
/** check to see if this node is fully used - remove if so */
|
|
node = (orte_ras_node_t*)item;
|
|
if (0 != node->node_slots_max && node->node_slots_inuse > node->node_slots_max) {
|
|
opal_list_remove_item(nodes, item);
|
|
} else { /** otherwise, add its slots to the total */
|
|
num_slots += node->node_slots;
|
|
}
|
|
|
|
/** go on to next item */
|
|
item = next;
|
|
}
|
|
|
|
/* Sanity check to make sure we have been allocated nodes */
|
|
if (0 == opal_list_get_size(nodes)) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_TEMP_OUT_OF_RESOURCE);
|
|
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
*total_num_slots = num_slots;
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* Create a sub-list of nodes to be used for user-specified mappings
|
|
*/
|
|
int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_list,
|
|
orte_app_context_t *app,
|
|
opal_list_t *master_node_list,
|
|
size_t *total_num_slots)
|
|
{
|
|
orte_app_context_map_t** loc_map = app->map_data;
|
|
opal_list_item_t *item;
|
|
orte_ras_node_t *node, *new_node;
|
|
char **mapped_nodes = NULL;
|
|
int num_mapped_nodes = 0;
|
|
size_t j, k, num_slots=0;
|
|
int rc;
|
|
|
|
/** set default answer */
|
|
*total_num_slots = 0;
|
|
|
|
/* Accumulate all of the host name mappings */
|
|
for(k = 0; k < app->num_map; ++k) {
|
|
if ( ORTE_APP_CONTEXT_MAP_HOSTNAME == loc_map[k]->map_type ) {
|
|
if(mapped_nodes == NULL) {
|
|
mapped_nodes = opal_argv_split(loc_map[k]->map_data, ',');
|
|
num_mapped_nodes = opal_argv_count(mapped_nodes);
|
|
}
|
|
else { /* Append to the existing mapping */
|
|
char ** mini_map = opal_argv_split(loc_map[k]->map_data, ',');
|
|
size_t mini_num_map = opal_argv_count(mini_map);
|
|
for (j = 0; j < mini_num_map; ++j) {
|
|
rc = opal_argv_append(&num_mapped_nodes, &mapped_nodes, mini_map[j]);
|
|
if (OPAL_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
}
|
|
opal_argv_free(mini_map);
|
|
}
|
|
}
|
|
}
|
|
|
|
/** check to see that all the nodes in the specified mapping have been allocated
|
|
* for our use - if not, then that's an error
|
|
*/
|
|
if( !are_all_mapped_valid(mapped_nodes, num_mapped_nodes, master_node_list) ) {
|
|
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:not-all-mapped-alloc",
|
|
true, app->app, opal_argv_join(mapped_nodes, ','));
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
/** setup the working node list to include only those nodes that were specified
|
|
* in this mapping. We don't need to worry about nodes being fully used or not
|
|
* since the master list only includes nodes that aren't.
|
|
*/
|
|
for (item = opal_list_get_first(master_node_list);
|
|
item != opal_list_get_end(master_node_list);
|
|
item = opal_list_get_next(item) ) {
|
|
node = (orte_ras_node_t*)item;
|
|
|
|
if( is_mapped(item, mapped_nodes, num_mapped_nodes) ) {
|
|
/** we can't just add this item to the mapped_node_list as it cannot be
|
|
* on two lists at the same time, so we need to copy it first
|
|
*/
|
|
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&new_node, node, ORTE_RAS_NODE))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
opal_list_append(mapped_node_list, &new_node->super);
|
|
num_slots += new_node->node_slots_alloc;
|
|
}
|
|
}
|
|
|
|
/** check that anything is left! */
|
|
if (0 == opal_list_get_size(mapped_node_list)) {
|
|
opal_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
|
|
true, app->num_procs, app->app);
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
*total_num_slots = num_slots;
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* Claim a slot for a specified job on a node
|
|
*/
|
|
int orte_rmaps_base_claim_slot(orte_rmaps_base_map_t *map,
|
|
orte_ras_node_t *current_node,
|
|
orte_jobid_t jobid, orte_vpid_t vpid,
|
|
int proc_index,
|
|
opal_list_t *nodes,
|
|
opal_list_t *fully_used_nodes)
|
|
{
|
|
orte_rmaps_base_proc_t *proc;
|
|
orte_process_name_t *proc_name;
|
|
orte_rmaps_base_node_t *rmaps_node;
|
|
int rc;
|
|
|
|
/* create objects */
|
|
rmaps_node = OBJ_NEW(orte_rmaps_base_node_t);
|
|
if (NULL == rmaps_node) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
OBJ_RETAIN(current_node);
|
|
rmaps_node->node = current_node;
|
|
proc = OBJ_NEW(orte_rmaps_base_proc_t);
|
|
if (NULL == proc) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
OBJ_RELEASE(rmaps_node);
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
/* create the process name as an offset from the vpid-start */
|
|
rc = orte_ns.create_process_name(&proc_name, current_node->node_cellid,
|
|
jobid, vpid);
|
|
if (rc != ORTE_SUCCESS) {
|
|
ORTE_ERROR_LOG(rc);
|
|
OBJ_RELEASE(proc);
|
|
OBJ_RELEASE(rmaps_node);
|
|
return rc;
|
|
}
|
|
proc->proc_node = rmaps_node;
|
|
proc->proc_name = *proc_name;
|
|
proc->proc_rank = vpid;
|
|
orte_ns.free_name(&proc_name);
|
|
OBJ_RETAIN(proc); /* bump reference count for the node */
|
|
opal_list_append(&rmaps_node->node_procs, &proc->super);
|
|
map->procs[proc_index] = proc;
|
|
|
|
/* Save this node on the map */
|
|
opal_list_append(&map->nodes, &rmaps_node->super);
|
|
|
|
/* Be sure to demarcate this slot as claimed for the node */
|
|
current_node->node_slots_inuse++;
|
|
|
|
/* Remove this node if it has reached its max number of allocatable slots OR it has
|
|
* reached the soft limit AND we are in a "no oversubscribe" state
|
|
*/
|
|
if ((0 != current_node->node_slots_max &&
|
|
current_node->node_slots_inuse >= current_node->node_slots_max) ||
|
|
(!orte_rmaps_base.oversubscribe &&
|
|
current_node->node_slots_inuse >= current_node->node_slots)) {
|
|
opal_list_remove_item(nodes, (opal_list_item_t*)current_node);
|
|
/* add it to the list of fully used nodes */
|
|
opal_list_append(fully_used_nodes, ¤t_node->super);
|
|
/** now return the proper code so the caller knows we removed the node! */
|
|
return ORTE_ERR_NODE_FULLY_USED;
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* Update the node allocations stored in the registry
|
|
*/
|
|
int orte_rmaps_base_update_node_usage(opal_list_t *nodes)
|
|
{
|
|
opal_list_item_t* item;
|
|
orte_gpr_value_t **values;
|
|
int rc;
|
|
size_t num_values, i, j;
|
|
orte_ras_node_t* node;
|
|
|
|
num_values = opal_list_get_size(nodes);
|
|
if (0 >= num_values) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
return ORTE_ERR_BAD_PARAM;
|
|
}
|
|
|
|
values = (orte_gpr_value_t**)malloc(num_values * sizeof(orte_gpr_value_t*));
|
|
if (NULL == values) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
for (i=0; i < num_values; i++) {
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]),
|
|
ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND,
|
|
ORTE_NODE_SEGMENT, 1, 0))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
for (j=0; j < i; j++) {
|
|
OBJ_RELEASE(values[j]);
|
|
}
|
|
free(values);
|
|
return rc;
|
|
}
|
|
}
|
|
|
|
for(i=0, item = opal_list_get_first(nodes);
|
|
i < num_values && item != opal_list_get_end(nodes);
|
|
i++, item = opal_list_get_next(item)) {
|
|
node = (orte_ras_node_t*)item;
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[0]), ORTE_NODE_SLOTS_IN_USE_KEY,
|
|
ORTE_SIZE, &(node->node_slots_inuse)))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
goto cleanup;
|
|
}
|
|
|
|
/* setup index/keys for this node */
|
|
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_cellid, node->node_name);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
/* try the insert */
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.put(num_values, values))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
}
|
|
|
|
cleanup:
|
|
for (j=0; j < num_values; j++) {
|
|
OBJ_RELEASE(values[j]);
|
|
}
|
|
if (NULL != values) free(values);
|
|
|
|
return rc;
|
|
}
|