
(copied from a mail that has a lengthy description of this commit)

I spoke with Tim about this the other day -- he gave me the green
light to go ahead with this, but it turned into a bigger job than I
thought it would be.  I revamped how the default RAS scheduling and
round_robin RMAPS mapping occur.  The previous algorithms were pretty
brain dead, and ignored the "slots" and "max_slots" tokens in
hostfiles.  I considered this a big enough problem to fix it for the
beta (because there is currently no way to control where processes are
launched on SMPs).

There are still some more bells and whistles that I'd like to implement,
but there's no hurry, and they can go on the trunk at any time.  My
patches below are for what I considered "essential", and do the
following:

- honor the "slots" and "max-slots" tokens in the hostfile (and all
  their synonyms), meaning that we allocate/map until we fill slots,
  and if there are still more processes to allocate/map, we keep going
  until we fill max-slots (i.e., only oversubscribe a node if we have
  to).

- offer two different algorithms, currently supported by two new
  options to orterun.  Remember that there are two parts here -- slot
  allocation and process mapping.  Slot allocation controls how many
  processes we'll be running on a node.  After that decision has been
  made, process mapping effectively controls where the ranks of
  MPI_COMM_WORLD (MCW) are placed.  Some of the examples given below
  don't make sense unless you remember that there is a difference
  between the two steps (the results make total sense, but you have
  to think about them in terms of both allocation and mapping):

1. "-bynode": allocates/maps one process per node in a round-robin
fashion until every node's slots are taken.  If we still have more
processes after all slots are taken, then keep going until all
max-slots are taken.  Examples:

- The hostfile:

  eddie slots=2 max-slots=4
  vogon slots=4 max-slots=8

- orterun -bynode -np 6 -hostfile hostfile a.out

  eddie: MCW ranks 0, 2
  vogon: MCW ranks 1, 3, 4, 5

- orterun -bynode -np 8 -hostfile hostfile a.out

  eddie: MCW ranks 0, 2, 4
  vogon: MCW ranks 1, 3, 5, 6, 7
  -> the algorithm oversubscribes all nodes "equally" (until each
  node's max_slots is hit, of course)

- orterun -bynode -np 12 -hostfile hostfile a.out

  eddie: MCW ranks 0, 2, 4, 6
  vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11
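
To make the -bynode behavior concrete, here is a small, self-contained
C sketch.  It is a hypothetical illustration only -- it is not the code
in this commit -- and the node names and slot counts simply mirror the
hostfile above.  It runs the two phases separately: round-robin-by-node
slot allocation first, then round-robin-by-node mapping of MCW ranks
over that allocation:

  /* Hypothetical sketch of "-bynode" allocation + mapping; this is
     not the actual ORTE code, just an illustration of the policy. */
  #include <stdio.h>

  struct node { const char *name; int slots; int max_slots; int alloc; };

  int main(void)
  {
      struct node n[] = { { "eddie", 2, 4, 0 }, { "vogon", 4, 8, 0 } };
      const int nn = 2, np = 8;           /* try np = 6, 8, or 12 */
      int total = 0, rank = 0, oversub = 0;

      /* Phase 1: slot allocation -- take one slot per node, round
         robin, honoring "slots" first and "max-slots" only if we
         still have processes left over. */
      while (total < np) {
          int placed = 0;
          for (int i = 0; i < nn && total < np; ++i) {
              int limit = oversub ? n[i].max_slots : n[i].slots;
              if (n[i].alloc < limit) {
                  ++n[i].alloc;
                  ++total;
                  placed = 1;
              }
          }
          if (!placed) {
              if (oversub) {
                  fprintf(stderr, "out of resources\n");
                  return 1;
              }
              oversub = 1;     /* start honoring max-slots instead */
          }
      }

      /* Phase 2: process mapping -- hand out MCW ranks round-robin
         by node, skipping nodes whose allocation is used up. */
      while (rank < np) {
          for (int i = 0; i < nn && rank < np; ++i) {
              if (n[i].alloc > 0) {
                  printf("%s: MCW rank %d\n", n[i].name, rank);
                  ++rank;
                  --n[i].alloc;
              }
          }
      }
      return 0;
  }

With np set to 8, this prints the placement from the middle example
above: eddie gets MCW ranks 0, 2, and 4, and vogon gets 1, 3, 5, 6,
and 7.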

2. "-byslot" (this is the default if you don't specify -bynode):
greedily takes all available slots on a node for a job before moving
on to the next node.  If we still have processes to allocate/schedule,
then oversubscribe all nodes equally (i.e., go round robin on all
nodes until each node's max_slots is hit).  Examples:

- The hostfile:

  eddie slots=2 max-slots=4
  vogon slots=4 max-slots=8

- orterun -np 6 -hostfile hostfile a.out

  eddie: MCW ranks 0, 1
  vogon: MCW ranks 2, 3, 4, 5

- orterun -np 8 -hostfile hostfile a.out

  eddie: MCW ranks 0, 1, 2
  vogon: MCW ranks 3, 4, 5, 6, 7
  -> the algorithm oversubscribes all nodes "equally" (until max_slots
  is hit)

- orterun -np 12 -hostfile hostfile a.out

  eddie: MCW ranks 0, 1, 2, 3
  vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11
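
Here is the matching sketch for the default -byslot policy (again a
hypothetical illustration, not the code in this commit): allocation
greedily takes each node's slots before moving on, falling back to
round-robin oversubscription only if needed, and mapping then fills
each node's allocation completely before moving to the next node:

  /* Hypothetical sketch of "-byslot" allocation + mapping; again not
     the actual ORTE code, just an illustration of the policy. */
  #include <stdio.h>

  struct node { const char *name; int slots; int max_slots; int alloc; };

  int main(void)
  {
      struct node n[] = { { "eddie", 2, 4, 0 }, { "vogon", 4, 8, 0 } };
      const int nn = 2, np = 8;           /* try np = 6, 8, or 12 */
      int total = 0, rank = 0;

      /* Phase 1a: greedily take up to "slots" processes per node. */
      for (int i = 0; i < nn && total < np; ++i) {
          int take = n[i].slots;
          if (take > np - total) {
              take = np - total;
          }
          n[i].alloc = take;
          total += take;
      }

      /* Phase 1b: if processes remain, oversubscribe round-robin by
         node until each node hits "max-slots". */
      while (total < np) {
          int placed = 0;
          for (int i = 0; i < nn && total < np; ++i) {
              if (n[i].alloc < n[i].max_slots) {
                  ++n[i].alloc;
                  ++total;
                  placed = 1;
              }
          }
          if (!placed) {
              fprintf(stderr, "out of resources\n");
              return 1;
          }
      }

      /* Phase 2: process mapping -- consume each node's allocation
         completely before moving on to the next node. */
      for (int i = 0; i < nn; ++i) {
          while (n[i].alloc > 0) {
              printf("%s: MCW rank %d\n", n[i].name, rank);
              ++rank;
              --n[i].alloc;
          }
      }
      return 0;
  }

With np set to 8, this prints eddie: 0, 1, 2 and vogon: 3, 4, 5, 6, 7,
matching the middle example above.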

The above examples are fairly contrived, and they don't make it clear
that the two policies can also produce different slot allocations, not
just different mappings (the mapping differences are obvious).
Consider the following allocation example:

- The hostfile:

  eddie count=4
  vogon count=4
  earth count=4
  deep-thought count=4

- orterun -np 8 -hostfile hostfile a.out

  eddie: 4 slots will be allocated
  vogon: 4 slots will be allocated
  earth: no slots allocated
  deep-thought: no slots allocated

- orterun -bynode -np 8 -hostfile hostfile a.out

  eddie: 2 slots will be allocated
  vogon: 2 slots will be allocated
  earth: 2 slots will be allocated
  deep-thought: 2 slots will be allocated
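
A stripped-down, allocation-only sketch (hypothetical, not the code in
this commit) shows where that difference comes from; it skips the
max-slots/oversubscription handling from the sketches above because 8
processes fit within the 16 available slots:

  /* Hypothetical allocation-only sketch; "count" is treated as a
     synonym for "slots", and oversubscription is omitted because np
     fits within the 16 available slots. */
  #include <stdio.h>
  #include <string.h>

  int main(int argc, char *argv[])
  {
      const char *names[] = { "eddie", "vogon", "earth", "deep-thought" };
      int alloc[4] = { 0, 0, 0, 0 };
      const int nn = 4, slots = 4, np = 8;
      int total = 0;
      int bynode = (argc > 1 && 0 == strcmp(argv[1], "-bynode"));

      if (bynode) {
          /* one slot per node, round-robin, until np are allocated */
          while (total < np) {
              for (int i = 0; i < nn && total < np; ++i) {
                  if (alloc[i] < slots) {
                      ++alloc[i];
                      ++total;
                  }
              }
          }
      } else {
          /* default (-byslot): fill each node's slots, then move on */
          for (int i = 0; i < nn && total < np; ++i) {
              alloc[i] = (np - total < slots) ? np - total : slots;
              total += alloc[i];
          }
      }
      for (int i = 0; i < nn; ++i) {
          printf("%s: %d slots allocated\n", names[i], alloc[i]);
      }
      return 0;
  }

Run with no argument it reports 4/4/0/0 slots; run with a -bynode
argument it reports 2/2/2/2, matching the two orterun examples above.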

This commit was SVN r5894.
Jeff Squyres 2005-05-31 16:36:53 +00:00
parent 497580441d
commit c80f54052e
9 changed files with 468 additions and 98 deletions


@@ -61,7 +61,10 @@ ORTE_DECLSPEC int orte_ras_base_close(void);
ORTE_DECLSPEC orte_ras_base_module_t* orte_ras_base_select(const char*);
ORTE_DECLSPEC int orte_ras_base_allocate(orte_jobid_t job);
ORTE_DECLSPEC int orte_ras_base_deallocate(orte_jobid_t job);
ORTE_DECLSPEC int orte_ras_base_allocate_nodes(orte_jobid_t jobid, ompi_list_t* nodes);
ORTE_DECLSPEC int orte_ras_base_allocate_nodes_by_node(orte_jobid_t jobid,
ompi_list_t* nodes);
ORTE_DECLSPEC int orte_ras_base_allocate_nodes_by_slot(orte_jobid_t jobid,
ompi_list_t* nodes);
/*
* globals that might be needed


@@ -26,73 +26,187 @@
#include "mca/errmgr/errmgr.h"
/**
*
/*
* Allocate one process per node on a round-robin basis, looping back
* around to the beginning as necessary
*/
static int orte_ras_base_node_compare(orte_ras_base_node_t** n1,
orte_ras_base_node_t** n2)
{
if((*n1)->node_slots_inuse < (*n2)->node_slots_inuse) {
return -1;
} else if((*n1)->node_slots_inuse > (*n2)->node_slots_inuse) {
return 1;
}
return 0;
}
/**
*
*/
int orte_ras_base_allocate_nodes(orte_jobid_t jobid, ompi_list_t* nodes)
int orte_ras_base_allocate_nodes_by_node(orte_jobid_t jobid,
ompi_list_t* nodes)
{
ompi_list_t allocated;
ompi_list_item_t* item;
size_t num_requested = 0;
size_t num_allocated = 0;
size_t num_constrained = 0;
size_t slots;
bool oversubscribe = false;
int rc;
/* query for the number of process slots required */
if(ORTE_SUCCESS != (rc = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
if (ORTE_SUCCESS !=
(rc = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
return rc;
}
/* sort the node list by proc slots inuse - lowest to highest */
ompi_list_sort(nodes, (ompi_list_item_compare_fn_t)orte_ras_base_node_compare);
OBJ_CONSTRUCT(&allocated, ompi_list_t);
num_allocated = 0;
/* iterate through nodes until request is satisfied or all are oversubscribed */
while(num_allocated < num_requested) {
/* This loop continues until all procs have been allocated or we run
out of resources. There are two definitions of "run out of
resources":
1. All nodes have node_slots processes allocated to them
2. All nodes have node_slots_max processes allocated to them
We first map until condition #1 is met. If there are still
processes that haven't been allocated yet, then we continue
until condition #2 is met. If we still have processes that
haven't been allocated yet, then it's an "out of resources"
error. */
while (num_allocated < num_requested) {
num_constrained = 0;
for(item = ompi_list_get_first(nodes);
item != ompi_list_get_end(nodes) && num_allocated < num_requested;
item = ompi_list_get_next(item)) {
/* loop over all nodes until either all processes are
allocated or they all become constrained */
for (item = ompi_list_get_first(nodes);
item != ompi_list_get_end(nodes) && num_allocated < num_requested;
item = ompi_list_get_next(item)) {
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
/* are any slots available */
if (node->node_slots_inuse >= node->node_slots) {
/* if there is a constraint on the max number of slots - skip this node */
if(node->node_slots_max && node->node_slots_inuse >= node->node_slots_max) {
num_constrained++;
continue;
}
/* are any slots available? */
slots = (oversubscribe ? node->node_slots_max : node->node_slots);
if (node->node_slots_inuse < slots ||
(oversubscribe && 0 == slots)) {
++num_allocated;
++node->node_slots_inuse; /* running total */
++node->node_slots_alloc; /* this job */
} else {
++num_constrained;
}
/* otherwise take one slot on this node */
num_allocated++;
node->node_slots_inuse++; /* running total */
node->node_slots_alloc++; /* this job */
}
if(num_constrained == ompi_list_get_size(nodes)) {
/* if all nodes are constrained:
- if this is the first time through the loop, then set
"oversubscribe" to true, and we'll now start obeying
node_slots_max instead of node_slots
- if this is the second time through the loop, then all
nodes are full to the max, and therefore we can't do
anything more -- we're out of resources */
if (ompi_list_get_size(nodes) == num_constrained) {
if (oversubscribe) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
} else {
oversubscribe = true;
}
}
}
/* move all nodes w/ allocations to the allocated list */
item = ompi_list_get_first(nodes);
while(item != ompi_list_get_end(nodes)) {
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
ompi_list_item_t* next = ompi_list_get_next(item);
if(node->node_slots_alloc) {
ompi_list_remove_item(nodes, item);
ompi_list_append(&allocated, item);
}
item = next;
}
rc = orte_ras_base_node_assign(&allocated, jobid);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
cleanup:
while(NULL != (item = ompi_list_remove_first(&allocated)))
ompi_list_append(nodes, item);
OBJ_DESTRUCT(&allocated);
return rc;
}
/*
* Allocate processes to nodes, using all available slots on a node.
*/
int orte_ras_base_allocate_nodes_by_slot(orte_jobid_t jobid,
ompi_list_t* nodes)
{
ompi_list_t allocated;
ompi_list_item_t* item;
size_t num_requested = 0;
size_t num_allocated = 0;
size_t num_constrained = 0;
size_t available;
int rc;
/* query for the number of process slots required */
if (ORTE_SUCCESS !=
(rc = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
return rc;
}
OBJ_CONSTRUCT(&allocated, ompi_list_t);
num_allocated = 0;
/* In the first pass, just grab all available slots (i.e., stay <=
node_slots) greedily off each node */
for (item = ompi_list_get_first(nodes);
item != ompi_list_get_end(nodes) && num_allocated < num_requested;
item = ompi_list_get_next(item)) {
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
/* are any slots available? */
if (node->node_slots_inuse < node->node_slots) {
available = node->node_slots - node->node_slots_inuse;
if (num_requested - num_allocated < available) {
node->node_slots_inuse +=
(num_requested - num_allocated); /* running total */
node->node_slots_alloc +=
(num_requested - num_allocated); /* this job */
num_allocated = num_requested;
} else {
num_allocated += available;
node->node_slots_inuse += available; /* running total */
node->node_slots_alloc += available; /* this job */
}
}
}
/* If we're not done, then we're in an oversubscribing situation.
Switch to a round-robin-by-node policy -- take one slot from
each node until we hit node_slots_max or we have no more
resources; whichever occurs first. */
while (num_allocated < num_requested) {
num_constrained = 0;
/* loop over all nodes until either all processes are
allocated or they all become constrained */
for (item = ompi_list_get_first(nodes);
item != ompi_list_get_end(nodes) && num_allocated < num_requested;
item = ompi_list_get_next(item)) {
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
/* are any slots available? */
if (node->node_slots_inuse < node->node_slots_max ||
0 == node->node_slots_max) {
++num_allocated;
++node->node_slots_inuse; /* running total */
++node->node_slots_alloc; /* this job */
} else {
++num_constrained;
}
}
/* if all nodes are constrained, then we're out of resources
-- thanks for playing */
if (ompi_list_get_size(nodes) == num_constrained) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
}
/* move all nodes w/ allocations to the allocated list */
item = ompi_list_get_first(nodes);
while(item != ompi_list_get_end(nodes)) {
@@ -117,4 +231,3 @@ cleanup:
OBJ_DESTRUCT(&allocated);
return rc;
}


@@ -39,8 +39,17 @@ static int orte_ras_host_allocate(orte_jobid_t jobid)
if(ORTE_SUCCESS != (rc = orte_ras_base_node_query(&nodes))) {
goto cleanup;
}
if(ORTE_SUCCESS != (rc = orte_ras_base_allocate_nodes(jobid, &nodes))) {
goto cleanup;
if (0 == strcmp(mca_ras_host_component.schedule_policy, "node")) {
if (ORTE_SUCCESS !=
(rc = orte_ras_base_allocate_nodes_by_node(jobid, &nodes))) {
goto cleanup;
}
} else {
if (ORTE_SUCCESS !=
(rc = orte_ras_base_allocate_nodes_by_slot(jobid, &nodes))) {
goto cleanup;
}
}
cleanup:


@@ -34,6 +34,7 @@ struct orte_ras_host_component_t {
orte_ras_base_component_t super;
int debug;
int priority;
char *schedule_policy;
};
typedef struct orte_ras_host_component_t orte_ras_host_component_t;


@@ -76,13 +76,29 @@ static int orte_ras_host_param_register_int(
}
static char *orte_rmaps_round_robin_param_register_string(
const char* param_name,
char *default_value)
{
int id = mca_base_param_register_string("ras", "host",
param_name, NULL, default_value);
char *param_value = default_value;
mca_base_param_lookup_string(id,&param_value);
return param_value;
}
/**
* component open/close/init function
*/
static int orte_ras_host_open(void)
{
mca_ras_host_component.debug = orte_ras_host_param_register_int("debug",1);
mca_ras_host_component.priority = orte_ras_host_param_register_int("priority",1);
mca_ras_host_component.debug =
orte_ras_host_param_register_int("debug", 1);
mca_ras_host_component.priority =
orte_ras_host_param_register_int("priority", 1);
mca_ras_host_component.schedule_policy =
orte_rmaps_round_robin_param_register_string("policy", "slot");
return ORTE_SUCCESS;
}


@@ -31,10 +31,75 @@
/*
* Create a default mapping for the application.
* Local variable
*/
static ompi_list_item_t *cur_node_item = NULL;
static int orte_rmaps_rr_map_app(
static int claim_slot(orte_rmaps_base_map_t *map,
ompi_list_t *nodes,
orte_ras_base_node_t *current_node,
orte_jobid_t jobid, orte_vpid_t vpid, int proc_index)
{
orte_rmaps_base_proc_t *proc;
orte_process_name_t *proc_name;
orte_rmaps_base_node_t *rmaps_node;
int rc;
/* create objects */
rmaps_node = OBJ_NEW(orte_rmaps_base_node_t);
if (NULL == rmaps_node) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
rmaps_node->node_name = strdup(current_node->node_name);
proc = OBJ_NEW(orte_rmaps_base_proc_t);
if (NULL == proc) {
OBJ_RELEASE(rmaps_node);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* create the process name as an offset from the vpid-start */
rc = orte_ns.create_process_name(&proc_name, current_node->node_cellid,
jobid, vpid);
if (rc != ORTE_SUCCESS) {
OBJ_RELEASE(proc);
OBJ_RELEASE(rmaps_node);
return rc;
}
proc->proc_node = rmaps_node;
proc->proc_name = *proc_name;
proc->proc_rank = vpid;
orte_ns.free_name(&proc_name);
OBJ_RETAIN(proc); /* bump reference count for the node */
ompi_list_append(&rmaps_node->node_procs, &proc->super);
map->procs[proc_index] = proc;
/* Save this node on the map */
ompi_list_append(&map->nodes, &rmaps_node->super);
/* Decrease the number of slots available for allocation
on this node */
--current_node->node_slots_alloc;
if (current_node->node_slots_alloc == 0) {
ompi_list_remove_item(nodes, (ompi_list_item_t*) current_node);
OBJ_RELEASE(current_node);
}
return ORTE_SUCCESS;
}
/*
* Create a default mapping for the application, scheduling round
* robin by node.
*
* NOTE: This function assumes that the allocator has already setup
* the list of nodes such that the sum of the node_slots_alloc fields
* from all entries will be the total number of processes in all the
* apps.
*/
static int map_app_by_node(
orte_app_context_t* app,
orte_rmaps_base_map_t* map,
orte_jobid_t jobid,
@@ -42,61 +107,164 @@ static int orte_rmaps_rr_map_app(
int rank,
ompi_list_t* nodes)
{
/* build a nodelist and assign process slots in order */
int rc;
size_t num_alloc = 0;
size_t proc_index = 0;
ompi_list_item_t* item;
ompi_list_item_t *start, *next;
orte_ras_base_node_t *node;
bool did_alloc;
item = ompi_list_get_first(nodes);
while(item != ompi_list_get_end(nodes)) {
ompi_list_item_t* next = ompi_list_get_next(item);
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
orte_rmaps_base_node_t* rmaps_node = OBJ_NEW(orte_rmaps_base_node_t);
size_t i, num_procs;
if(NULL == rmaps_node) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
rmaps_node->node_name = strdup(node->node_name);
/* Note that cur_node_item already points to the Right place in
the node list to start looking (i.e., if this is the first time
through, it'll point to the first item. If this is not the
first time through -- i.e., we have multiple app contexts --
it'll point to where we left off last time.).
if(num_alloc + node->node_slots_alloc >= (size_t)app->num_procs) {
num_procs = app->num_procs - num_alloc;
} else {
num_procs = node->node_slots_alloc;
}
But do a bozo check to ensure that we don't have a empty node
list. */
if (ompi_list_get_end(nodes) == cur_node_item) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
start = cur_node_item;
/* assign the next num_procs to this node */
for(i=0; i<num_procs; i++) {
orte_rmaps_base_proc_t* proc = OBJ_NEW(orte_rmaps_base_proc_t);
orte_process_name_t* proc_name;
int rc;
/* This loop continues until all procs have been mapped or we run
out of resources. There are two definitions of "run out of
resources":
/* create the process name as an offset from the vpid-start */
rc = orte_ns.create_process_name(&proc_name, node->node_cellid, jobid, vpid_start+rank);
if(rc != ORTE_SUCCESS) {
OBJ_RELEASE(proc);
1. All nodes have node_slots processes mapped to them
2. All nodes have node_slots_max processes mapped to them
We first map until condition #1 is met. If there are still
processes that haven't been mapped yet, then we continue until
condition #2 is met. If we still have processes that haven't
been mapped yet, then it's an "out of resources" error. */
did_alloc = false;
while (num_alloc < app->num_procs) {
node = (orte_ras_base_node_t*) cur_node_item;
next = ompi_list_get_next(cur_node_item);
/* If we have an available slot on this node, claim it */
if (node->node_slots_alloc > 0) {
fflush(stdout);
rc = claim_slot(map, nodes, node, jobid, vpid_start + rank,
proc_index);
if (ORTE_SUCCESS != rc) {
return rc;
}
proc->proc_node = rmaps_node;
proc->proc_name = *proc_name;
proc->proc_rank = rank;
rank++;
orte_ns.free_name(&proc_name);
OBJ_RETAIN(proc); /* bump reference count for the node */
ompi_list_append(&rmaps_node->node_procs, &proc->super);
map->procs[proc_index++] = proc;
++rank;
++proc_index;
/* Save the fact that we successfully allocated a process
to a node in this round */
did_alloc = true;
/* Increase the number of procs allocated and see if we're
done */
++num_alloc;
}
node->node_slots_alloc -= num_procs;
if(node->node_slots_alloc == 0) {
ompi_list_remove_item(nodes,item);
OBJ_RELEASE(item);
/* Move on to the next node */
cur_node_item = next;
if (ompi_list_get_end(nodes) == cur_node_item) {
cur_node_item = ompi_list_get_first(nodes);
}
num_alloc += num_procs;
if(num_alloc == (size_t)app->num_procs)
/* Are we done? */
if (num_alloc == app->num_procs) {
break;
item = next;
ompi_list_append(&map->nodes, &rmaps_node->super);
}
/* Double check that the list is not empty */
if (ompi_list_get_end(nodes) == cur_node_item) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
/* If we looped around without allocating any new processes,
then we're full */
if (start == cur_node_item) {
if (!did_alloc) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
}
}
map->num_procs = num_alloc;
return ORTE_SUCCESS;
}
/*
* Create a default mapping for the application, scheduling one round
* robin by slot.
*
* NOTE: This function assumes that the allocator has already setup
* the list of nodes such that the sum of the node_slots_alloc fields
* from all entries will be the total number of processes in all the
* apps.
*/
static int map_app_by_slot(
orte_app_context_t* app,
orte_rmaps_base_map_t* map,
orte_jobid_t jobid,
orte_vpid_t vpid_start,
int rank,
ompi_list_t* nodes)
{
int rc;
size_t num_alloc = 0;
size_t proc_index = 0;
ompi_list_item_t *next;
orte_ras_base_node_t *node;
/* Note that cur_node_item already points to the Right place in
the node list to start looking (i.e., if this is the first time
through, it'll point to the first item. If this is not the
first time through -- i.e., we have multiple app contexts --
it'll point to where we left off last time.).
But do a bozo check to ensure that we don't have a empty node
list. */
if (ompi_list_get_end(nodes) == cur_node_item) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
/* Go through all nodes and take up to node_slots_alloc slots and
map it to this job */
while (ompi_list_get_end(nodes) != cur_node_item &&
num_alloc < app->num_procs) {
node = (orte_ras_base_node_t*) cur_node_item;
next = ompi_list_get_next(cur_node_item);
/* If we have available slots on this node, claim it */
while (node->node_slots_alloc > 0 &&
num_alloc < app->num_procs) {
fflush(stdout);
rc = claim_slot(map, nodes, node, jobid, vpid_start + rank,
proc_index);
if (ORTE_SUCCESS != rc) {
return rc;
}
++rank;
++proc_index;
/* Increase the number of procs allocated and see if we're
done */
++num_alloc;
}
/* Move on to the next node */
cur_node_item = next;
}
/* Did we allocate everything? */
if (num_alloc < app->num_procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
map->num_procs = num_alloc;
return ORTE_SUCCESS;
}
@@ -117,12 +285,20 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
size_t num_procs = 0;
int rank = 0;
int rc = ORTE_SUCCESS;
bool bynode = true;
/* query for the application context and allocated nodes */
if(ORTE_SUCCESS != (rc = orte_rmgr_base_get_app_context(jobid, &context, &num_context))) {
return rc;
}
/* which policy should we use? */
if (0 == strcmp(mca_rmaps_round_robin_component.schedule_policy, "node")) {
bynode = true;
} else {
bynode = false;
}
/* total number of procs required */
for(i=0; i<num_context; i++) {
orte_app_context_t* app = context[i];
@@ -143,6 +319,7 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
/* construct a default mapping */
OBJ_CONSTRUCT(&mapping, ompi_list_t);
cur_node_item = ompi_list_get_first(&nodes);
for(i=0; i<num_context; i++) {
orte_app_context_t* app = context[i];
orte_rmaps_base_map_t* map = OBJ_NEW(orte_rmaps_base_map_t);
@@ -153,12 +330,17 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
ompi_list_append(&mapping, &map->super);
map->app = app;
map->procs = malloc(sizeof(orte_rmaps_base_proc_t*)*app->num_procs);
map->procs = malloc(sizeof(orte_rmaps_base_proc_t*) * app->num_procs);
if(NULL == map->procs) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
if(ORTE_SUCCESS != (rc = orte_rmaps_rr_map_app(app,map,jobid,vpid_start,rank,&nodes))) {
if (bynode) {
rc = map_app_by_node(app, map, jobid, vpid_start, rank, &nodes);
} else {
rc = map_app_by_slot(app, map, jobid, vpid_start, rank, &nodes);
}
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
rank += app->num_procs;


@@ -34,6 +34,7 @@ struct orte_rmaps_round_robin_component_t {
orte_rmaps_base_component_t super;
int debug;
int priority;
char *schedule_policy;
};
typedef struct orte_rmaps_round_robin_component_t orte_rmaps_round_robin_component_t;


@@ -69,20 +69,37 @@ static int orte_rmaps_round_robin_param_register_int(
const char* param_name,
int default_value)
{
int id = mca_base_param_register_int("rmaps","round_robin",param_name,NULL,default_value);
int id = mca_base_param_register_int("rmaps","round_robin",
param_name,NULL,default_value);
int param_value = default_value;
mca_base_param_lookup_int(id,&param_value);
return param_value;
}
static char *orte_rmaps_round_robin_param_register_string(
const char* param_name,
char *default_value)
{
int id = mca_base_param_register_string("rmaps","round_robin",
param_name,NULL,default_value);
char *param_value = default_value;
mca_base_param_lookup_string(id,&param_value);
return param_value;
}
/**
* component open/close/init function
*/
static int orte_rmaps_round_robin_open(void)
{
mca_rmaps_round_robin_component.debug = orte_rmaps_round_robin_param_register_int("debug",1);
mca_rmaps_round_robin_component.priority = orte_rmaps_round_robin_param_register_int("priority",1);
mca_rmaps_round_robin_component.debug =
orte_rmaps_round_robin_param_register_int("debug", 1);
mca_rmaps_round_robin_component.priority =
orte_rmaps_round_robin_param_register_int("priority", 1);
mca_rmaps_round_robin_component.schedule_policy =
orte_rmaps_round_robin_param_register_string("policy", "slot");
return ORTE_SUCCESS;
}


@@ -86,6 +86,8 @@ struct globals_t {
bool verbose;
bool exit;
bool no_wait_for_job_completion;
bool by_node;
bool by_slot;
size_t num_procs;
int exit_status;
char *hostfile;
@@ -159,6 +161,12 @@ ompi_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, '\0', NULL, "map", 1,
NULL, OMPI_CMD_LINE_TYPE_STRING,
"Mapping of processes to nodes / CPUs" },
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
&orterun_globals.by_node, OMPI_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by node" },
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
&orterun_globals.by_slot, OMPI_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by slot (the default)" },
/* mpiexec-like arguments */
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
@@ -576,6 +584,8 @@ static int init_globals(void)
false,
false,
false,
false,
true,
0,
0,
NULL,
@@ -606,6 +616,7 @@ static int init_globals(void)
static int parse_globals(int argc, char* argv[])
{
ompi_cmd_line_t cmd_line;
int ras, rmaps;
/* Setup and parse the command line */
@@ -633,6 +644,23 @@ static int parse_globals(int argc, char* argv[])
exit(0);
}
/* Allocate and map by node or by slot? Shortcut for setting 2
MCA params. */
ras = mca_base_param_register_string("ras", "host", "policy", NULL,
"slot");
rmaps = mca_base_param_register_string("rmaps", "round_robin", "policy",
NULL, "slot");
if (orterun_globals.by_node) {
orterun_globals.by_slot = false;
mca_base_param_set_string(ras, "node");
mca_base_param_set_string(rmaps, "node");
} else {
orterun_globals.by_slot = true;
mca_base_param_set_string(ras, "slot");
mca_base_param_set_string(rmaps, "slot");
}
/* If we don't want to wait, we don't want to wait */
if (orterun_globals.no_wait_for_job_completion) {