/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */

#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/if.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "orte/util/sys_info.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/ras/base/ras_base_node.h"
#include "rmaps_rr.h"


/*
 * Local variable
 */
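/* cur_node_item records where the round-robin traversal last stopped, so
 * successive app contexts handled within one call to orte_rmaps_rr_map()
 * continue from that point rather than restarting at the head of the
 * node list. */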
static opal_list_item_t *cur_node_item = NULL;

/*
 * A sanity check to ensure that all of the requested nodes are actually
 * allocated to this application.
 */
static bool are_all_mapped_valid(char **mapping,
                                 int num_mapped,
                                 opal_list_t* nodes)
{
    opal_list_item_t *item;
    int i;
    bool matched;

    for (i = 0; i < num_mapped; ++i) {
        matched = false;

        for(item = opal_list_get_first(nodes);
            item != opal_list_get_end(nodes);
            item = opal_list_get_next(item) ) {
            if( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i]) ) {
                matched = true;
                break;
            }
        }

        /* If we find one requested resource that is not allocated,
         * then return an error */
        if(!matched) {
            return false;
        }
    }

    return true;
}

/*
 * Check whether the node in question is in the current mapping.
 */
static bool is_mapped(opal_list_item_t *item,
                      char **mapping,
                      int num_mapped,
                      opal_list_t* nodes)
{
    int i;

    for ( i = 0; i < num_mapped; ++i) {
        if ( 0 == strcmp( ((orte_ras_node_t*) item)->node_name, mapping[i])){
            return true;
        }
    }

    return false;
}

/*
 * Return a pointer to the next allocated node that is included in the mapping.
 */
static opal_list_item_t* get_next_mapped(opal_list_item_t *node_item,
                                         char **mapping,
                                         int num_mapped,
                                         opal_list_t* nodes)
{
    opal_list_item_t *item, *initial_item = NULL;

    /* Wrap around to beginning if we are at the end of the list */
    if (opal_list_get_end(nodes) == opal_list_get_next(node_item)) {
        item = opal_list_get_first(nodes);
    }
    else {
        item = opal_list_get_next(node_item);
    }

    do {
        /* See if the current node is in the mapping and contains slots */
        if( is_mapped(item, mapping, num_mapped, nodes) ) {
            return item;
        }

        /*
         * We just rechecked the node we started from and concluded that
         * it isn't in the mapping, thus the list contains no matches
         * for this mapping.  Return an error.
         */
        if(node_item == item){
            return NULL;
        }

        /* Save the node we started with */
        if( NULL == initial_item ) {
            initial_item = item;
        }

        /* Access the next item in round-robin manner */
        if (opal_list_get_end(nodes) == opal_list_get_next(item)) {
            item = opal_list_get_first(nodes);
        }
        else {
            item = opal_list_get_next(item);
        }

        /* Check to make sure we didn't loop back around without
         * finding a node in the mapping */
        if( initial_item == item) {
            return NULL;
        }

    } while( true );
}

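/*
 * Claim one slot on the given node for a single process: create the proc
 * object, give it the supplied jobid/vpid, link it to the node, store it at
 * proc_index in the map, and increment the node's node_slots_inuse count.
 */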
static int claim_slot(orte_rmaps_base_map_t *map,
                      orte_ras_node_t *current_node,
                      orte_jobid_t jobid,
                      orte_vpid_t vpid,
                      int proc_index)
{
    orte_rmaps_base_proc_t *proc;
    orte_process_name_t *proc_name;
    orte_rmaps_base_node_t *rmaps_node;
    int rc;

    /* create objects */
    rmaps_node = OBJ_NEW(orte_rmaps_base_node_t);
    if (NULL == rmaps_node) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    OBJ_RETAIN(current_node);
    rmaps_node->node = current_node;
    proc = OBJ_NEW(orte_rmaps_base_proc_t);
    if (NULL == proc) {
        OBJ_RELEASE(rmaps_node);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* create the process name as an offset from the vpid-start */
    rc = orte_ns.create_process_name(&proc_name, current_node->node_cellid,
                                     jobid, vpid);
    if (rc != ORTE_SUCCESS) {
        OBJ_RELEASE(proc);
        OBJ_RELEASE(rmaps_node);
        return rc;
    }
    proc->proc_node = rmaps_node;
    proc->proc_name = *proc_name;
    proc->proc_rank = vpid;
    orte_ns.free_name(&proc_name);
    OBJ_RETAIN(proc); /* bump reference count - the proc is also held on the node's proc list */
    opal_list_append(&rmaps_node->node_procs, &proc->super);
    map->procs[proc_index] = proc;

    /* Save this node on the map */
    opal_list_append(&map->nodes, &rmaps_node->super);

    /* Be sure to demarcate this slot claim for the node */
    current_node->node_slots_inuse++;

    return ORTE_SUCCESS;
}


/*
 * Create a default mapping for the application, scheduling round
 * robin by node.
 */
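/*
 * For example (assuming an allocation of two nodes n0 and n1 with no
 * node_slots_max limit being hit): an app requesting 4 processes is mapped
 * by node as ranks 0 and 2 on n0 and ranks 1 and 3 on n1 -- one process is
 * claimed per node on each pass around the node list.
 */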
static int map_app_by_node(
    orte_app_context_t* app,
    orte_rmaps_base_map_t* map,
    orte_jobid_t jobid,
    orte_vpid_t vpid_start,
    int rank,
    opal_list_t* nodes,
    char **mapped_nodes,
    int num_mapped_nodes)
{
    int rc = ORTE_SUCCESS;
    size_t num_alloc = 0;
    size_t proc_index = 0;
    opal_list_item_t *next;
    orte_ras_node_t *node;

    /* This loop continues until all procs have been mapped or we run
       out of resources.  We determine that we have "run out of
       resources" when all nodes have node_slots_max processes mapped to them,
       thus there are no free slots for a process to be mapped.
       If we still have processes that haven't been mapped yet, then it's an
       "out of resources" error. */
    while (num_alloc < app->num_procs) {
        node = (orte_ras_node_t*) cur_node_item;

        /* Find the next node we can use before claiming slots, since
         * we may need to prune the nodes list removing overused nodes */
        if ( 0 < app->num_map ) {
            next = get_next_mapped(cur_node_item, mapped_nodes, num_mapped_nodes, nodes);
            if (NULL == next ) {
                /* Not allocated anything */
                opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-mapped-node",
                               true, app->app, opal_argv_join(mapped_nodes, ','));
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
        }
        else {
            if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) {
                next = opal_list_get_first(nodes);
            }
            else {
                next = opal_list_get_next(cur_node_item);
            }
        }

        /* Remove this node if it has reached its max number of allocatable slots */
        if( 0 != node->node_slots_max &&
            node->node_slots_inuse > node->node_slots_max) {
            opal_list_remove_item(nodes, (opal_list_item_t*)node);
            if(0 >= opal_list_get_size(nodes) ) {
                /* No more nodes to allocate :( */
                opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
                               true, app->num_procs, app->app);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
        }
        /* Allocate this node */
        else {
            rc = claim_slot(map, node, jobid, vpid_start + rank, proc_index);
            if (ORTE_SUCCESS != rc) {
                goto cleanup;
            }

            ++rank;
            ++proc_index;
            ++num_alloc;
        }

        cur_node_item = next;
    }

    map->num_procs = num_alloc;

cleanup:

    return rc;
}


/*
 * Create a default mapping for the application, scheduling round
 * robin by slot.
 */
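/*
 * For example (assuming the same two nodes n0 and n1, each advertising
 * node_slots = 2 and no node_slots_max limit): an app requesting 4
 * processes is mapped by slot as ranks 0 and 1 on n0 and ranks 2 and 3 on
 * n1 -- each node is filled up to its slot count before moving on to the
 * next node.
 */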
static int map_app_by_slot(
    orte_app_context_t* app,
    orte_rmaps_base_map_t* map,
    orte_jobid_t jobid,
    orte_vpid_t vpid_start,
    int rank,
    opal_list_t* nodes,
    char **mapped_nodes,
    int num_mapped_nodes)
{
    int rc = ORTE_SUCCESS;
    size_t i;
    size_t num_alloc = 0;
    size_t proc_index = 0;
    orte_ras_node_t *node;
    opal_list_item_t *start, *next;
    bool oversubscribe;


    /* This loop continues until all procs have been mapped or we run
       out of resources.  We determine that we have "run out of
       resources" when all nodes have node_slots_max processes mapped to them,
       thus there are no free slots for a process to be mapped.
       If we still have processes that haven't been mapped yet, then it's an
       "out of resources" error. */
    num_alloc = 0;
    start = cur_node_item;
    oversubscribe = false;
    while ( num_alloc < app->num_procs) {
        node = (orte_ras_node_t*) cur_node_item;

        /* Find the next node we can use before claiming slots, since
         * we may need to prune the nodes list removing overused nodes */
        if ( 0 < app->num_map ) {
            next = get_next_mapped(cur_node_item, mapped_nodes, num_mapped_nodes, nodes);
            if (NULL == next ) {
                /* Not allocated anything */
                opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-mapped-node",
                               true, app->app, opal_argv_join(mapped_nodes, ','));
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
        }
        else {
            if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) {
                next = opal_list_get_first(nodes);
            }
            else {
                next = opal_list_get_next(cur_node_item);
            }
        }

        /* If we have available slots on this node, claim all of them.
         * If node_slots == 0, assume 1 slot for that node.
         * JJH - is this assumption fully justified? */
        for( i = 0; i < ((node->node_slots == 0) ? 1 : node->node_slots); ++i) {
            /* If we are not oversubscribing, and this node is full, skip it. */
            if( !oversubscribe &&
                0 != node->node_slots &&
                node->node_slots_inuse > node->node_slots) {
                break;
            }
            /* If this node has reached its max number of slots,
             * take it out of the list, and skip it */
            else if( 0 != node->node_slots_max &&
                     node->node_slots_inuse > node->node_slots_max){
                opal_list_remove_item(nodes, (opal_list_item_t*)node);
                if( 0 >= opal_list_get_size(nodes) ) {
                    /* No more nodes to allocate */
                    opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
                                   true, app->num_procs, app->app);
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    goto cleanup;
                }
                break;
            }

            rc = claim_slot(map, node, jobid, vpid_start + rank, proc_index);
            if (ORTE_SUCCESS != rc) {
                goto cleanup;
            }

            /* Increase the number of procs allocated */
            ++num_alloc;
            ++rank;
            ++proc_index;
            if(num_alloc >= app->num_procs) {
                break;
            }
        }

        cur_node_item = next;

        /* Since we have now looped back around, go ahead and oversubscribe nodes */
        if(start == cur_node_item) {
            oversubscribe = true;
        }
    }

    map->num_procs = num_alloc;

cleanup:

    return rc;
}


/*
 * Create a default mapping for the job.
 */

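/* Overall flow: fetch the app contexts and the list of allocated nodes,
 * optionally drop the local node when the "no local" scheduling option is
 * set, reserve a vpid range covering all requested processes, map each app
 * context round robin (by node or by slot, per the component's
 * schedule_policy), then store the resulting map in the registry and record
 * the vpid range on the job segment. */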
static int orte_rmaps_rr_map(orte_jobid_t jobid)
{
    orte_app_context_t** context, *app;
    orte_rmaps_base_map_t* map;
    size_t i, j, k, num_context;
    opal_list_t nodes;
    opal_list_t mapping;
    opal_list_item_t* item;
    orte_vpid_t vpid_start;
    size_t num_procs = 0;
    int rank = 0;
    int rc = ORTE_SUCCESS;
    bool bynode = true;
    char **mapped_nodes = NULL;
    int num_mapped_nodes = 0;
    int id, value;

    /* query for the application context and allocated nodes */
    if(ORTE_SUCCESS != (rc = orte_rmgr_base_get_app_context(jobid, &context, &num_context))) {
        return rc;
    }

    /* which policy should we use? */
    if (0 == strcmp(mca_rmaps_round_robin_component.schedule_policy, "node")) {
        bynode = true;
    } else {
        bynode = false;
    }

    /* query for all nodes allocated to this job */
    OBJ_CONSTRUCT(&nodes, opal_list_t);
    if(ORTE_SUCCESS != (rc = orte_ras_base_node_query_alloc(&nodes, jobid))) {
        OBJ_DESTRUCT(&nodes);
        return rc;
    }

    /* If the "no local" option was set, then remove the local node
       from the list */

    id = mca_base_param_find("rmaps", NULL, "base_schedule_local");
    mca_base_param_lookup_int(id, &value);
    if (0 == value) {
        for (item = opal_list_get_first(&nodes);
             item != opal_list_get_end(&nodes);
             item = opal_list_get_next(item) ) {
            if (0 == strcmp(((orte_ras_node_t *) item)->node_name,
                            orte_system_info.nodename) ||
                opal_ifislocal(((orte_ras_node_t *) item)->node_name)) {
                opal_list_remove_item(&nodes, item);
                break;
            }
        }
    }

    /* Sanity check to make sure we have been allocated nodes */
    if (0 == opal_list_get_size(&nodes)) {
        OBJ_DESTRUCT(&nodes);
        return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
    }

    /* Total number of procs required
     * DEVEL NOTE: Need to extend this when implementing C/N notation.
     * Will need to set app->num_procs appropriately before this,
     * since we will have the allocated node information at this point.
     */
    for(i=0; i<num_context; i++) {
        app = context[i];
        num_procs += app->num_procs;
    }

    /* allocate a vpid range for the job */
    if(ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, num_procs, &vpid_start))) {
        return rc;
    }


    /* construct a default mapping by application */
    OBJ_CONSTRUCT(&mapping, opal_list_t);
    cur_node_item = opal_list_get_first(&nodes);
    for(i=0; i<num_context; i++) {
        app = context[i];

        map = OBJ_NEW(orte_rmaps_base_map_t);
        if(NULL == map) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        opal_list_append(&mapping, &map->super);

        map->app = app;
        map->procs = malloc(sizeof(orte_rmaps_base_proc_t*) * app->num_procs);
        if(NULL == map->procs) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }

        /* Extract the requested mapping for this application */

        /* Note that cur_node_item already points to the right place in
           the node list to start looking (i.e., if this is the first time
           through, it'll point to the first item.  If this is not the
           first time through -- i.e., we have multiple app contexts --
           it'll point to where we left off last time).  If we're at the
           end, bounce back to the front (as would happen in the loop
           below).

           But do a bozo check to ensure that we don't have an empty
           node list. */
        if (0 == opal_list_get_size(&nodes)) {
            rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
            goto cleanup;
        } else if (opal_list_get_end(&nodes) == cur_node_item) {
            cur_node_item = opal_list_get_first(&nodes);
        }

        /* If this application has a mapping, then
         * - if the current node is in the mapping, use it
         * - otherwise, get the next node that is in the mapping.
         */
        if ( 0 < app->num_map ) {
            orte_app_context_map_t** loc_map = app->map_data;

            /* Accumulate all of the host name mappings */
            for(k = 0; k < app->num_map; ++k) {
                if ( ORTE_APP_CONTEXT_MAP_HOSTNAME == loc_map[k]->map_type ) {
                    if(mapped_nodes == NULL) {
                        mapped_nodes = opal_argv_split(loc_map[k]->map_data, ',');
                        num_mapped_nodes = opal_argv_count(mapped_nodes);
                    }
                    else { /* Append to the existing mapping */
                        char ** mini_map = opal_argv_split(loc_map[k]->map_data, ',');
                        size_t mini_num_map = opal_argv_count(mini_map);
                        for (j = 0; j < mini_num_map; ++j) {
                            rc = opal_argv_append(&num_mapped_nodes, &mapped_nodes, mini_map[j]);
                            if (OPAL_SUCCESS != rc) {
                                goto cleanup;
                            }
                        }
                        opal_argv_free(mini_map);
                    }
                }
            }

            if( !are_all_mapped_valid(mapped_nodes, num_mapped_nodes, &nodes) ) {
                opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:not-all-mapped-alloc",
                               true, app->app, opal_argv_join(mapped_nodes, ','));
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }

            /* If the current node is not in the current mapping,
             * then get the next node that is in the mapping */
            if( !is_mapped(cur_node_item, mapped_nodes, num_mapped_nodes, &nodes) ) {
                cur_node_item = get_next_mapped(cur_node_item, mapped_nodes, num_mapped_nodes, &nodes);
                if( NULL == cur_node_item) {
                    opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-mapped-node",
                                   true, app->app, opal_argv_join(mapped_nodes, ','));
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    goto cleanup;
                }
            }
        }

        /* Make assignments */
        if (bynode) {
            rc = map_app_by_node(app, map, jobid, vpid_start, rank, &nodes, mapped_nodes, num_mapped_nodes);
        } else {
            rc = map_app_by_slot(app, map, jobid, vpid_start, rank, &nodes, mapped_nodes, num_mapped_nodes);
        }
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }

        rank += app->num_procs;
        opal_argv_free(mapped_nodes);
        mapped_nodes = NULL;
    }

    /* save mapping to the registry */
    if(ORTE_SUCCESS != (rc = orte_rmaps_base_set_map(jobid, &mapping))) {
        goto cleanup;
    }

    /* save vpid start/range on the job segment */
    rc = orte_rmaps_base_set_vpid_range(jobid, vpid_start, num_procs);

cleanup:
    while(NULL != (item = opal_list_remove_first(&nodes))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes);

    while(NULL != (item = opal_list_remove_first(&mapping))) {
        OBJ_RELEASE(item);
    }

    OBJ_DESTRUCT(&mapping);

    if( NULL != mapped_nodes ) {
        opal_argv_free(mapped_nodes);
    }

    return rc;
}


static int orte_rmaps_rr_finalize(void)
{
    return ORTE_SUCCESS;
}


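/* Module interface exposed to the rmaps framework: the mapping entry point
 * and the finalize function. */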
orte_rmaps_base_module_t orte_rmaps_round_robin_module = {
    orte_rmaps_rr_map,
    orte_rmaps_rr_finalize
};