(copied from a mail that has a lengthy description of this commit)
I spoke with Tim about this the other day -- he gave me the green light to go ahead with this, but it turned into a bigger job than I thought it would be.  I revamped how the default RAS scheduling and round_robin RMAPS mapping occurs.  The previous algorithms were pretty brain dead, and ignored the "slots" and "max_slots" tokens in hostfiles.  I considered this a big enough problem to fix it for the beta (because there is currently no way to control where processes are launched on SMPs).  There are still some more bells and whistles that I'd like to implement, but there's no hurry, and they can go on the trunk at any time.

My patches below are for what I considered "essential", and do the following:

- Honor the "slots" and "max-slots" tokens in the hostfile (and all their
  synonyms), meaning that we allocate/map until we fill slots, and if there
  are still more processes to allocate/map, we keep going until we fill
  max-slots (i.e., only oversubscribe a node if we have to).

- Offer two different algorithms, currently supported by two new options to
  orterun.  Remember that there are two parts here -- slot allocation and
  process mapping.  Slot allocation controls how many processes we'll be
  running on a node.  After that decision has been made, process mapping
  effectively controls where the ranks of MPI_COMM_WORLD (MCW) are placed.
  Some of the examples given below don't make sense unless you remember that
  there is a difference between the two (which makes total sense, but you
  have to think about it in terms of both things):

  1. "-bynode": allocates/maps one process per node in a round-robin fashion
     until all slots on the node are taken.  If we still have more processes
     after all slots are taken, then keep going until all max-slots are
     taken.  Examples:

     - The hostfile:

         eddie slots=2 max-slots=4
         vogon slots=4 max-slots=8

     - "orterun -bynode -np 6 -hostfile hostfile a.out":

         eddie: MCW ranks 0, 2
         vogon: MCW ranks 1, 3, 4, 5

     - "orterun -bynode -np 8 -hostfile hostfile a.out":

         eddie: MCW ranks 0, 2, 4
         vogon: MCW ranks 1, 3, 5, 6, 7

         -> the algorithm oversubscribes all nodes "equally" (until each
            node's max_slots is hit, of course)

     - "orterun -bynode -np 12 -hostfile hostfile a.out":

         eddie: MCW ranks 0, 2, 4, 6
         vogon: MCW ranks 1, 3, 5, 7, 8, 9, 10, 11

  2. "-byslot" (this is the default if you don't specify -bynode): greedily
     takes all available slots on a node for a job before moving on to the
     next node.  If we still have processes to allocate/schedule, then
     oversubscribe all nodes equally (i.e., go round robin on all nodes
     until each node's max_slots is hit).  Examples:

     - The hostfile:

         eddie slots=2 max-slots=4
         vogon slots=4 max-slots=8

     - "orterun -np 6 -hostfile hostfile a.out":

         eddie: MCW ranks 0, 1
         vogon: MCW ranks 2, 3, 4, 5

     - "orterun -np 8 -hostfile hostfile a.out":

         eddie: MCW ranks 0, 1, 2
         vogon: MCW ranks 3, 4, 5, 6, 7

         -> the algorithm oversubscribes all nodes "equally" (until
            max_slots is hit)

     - "orterun -np 12 -hostfile hostfile a.out":

         eddie: MCW ranks 0, 1, 2, 3
         vogon: MCW ranks 4, 5, 6, 7, 8, 9, 10, 11

The above examples are fairly contrived, and it's not clear from them that you can get different allocation answers in all cases (the mapping differences are obvious).
Consider the following allocation example:

- The hostfile:

    eddie count=4
    vogon count=4
    earth count=4
    deep-thought count=4

- "orterun -np 8 -hostfile hostfile a.out":

    eddie: 4 slots will be allocated
    vogon: 4 slots will be allocated
    earth: no slots allocated
    deep-thought: no slots allocated

- "orterun -bynode -np 8 -hostfile hostfile a.out":

    eddie: 2 slots will be allocated
    vogon: 2 slots will be allocated
    earth: 2 slots will be allocated
    deep-thought: 2 slots will be allocated

This commit was SVN r5894.
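Since it helps to see the two allocation policies side by side, here is a minimal standalone C sketch of the allocation phase only (the mapping of MCW ranks onto the allocated slots is not modeled).  This is not the ORTE code from the patches below: the struct, the node names, and the helpers round_robin(), allocate_by_node(), and allocate_by_slot() are hypothetical illustrations.  Only the policy logic follows the description above, including treating max-slots = 0 as "unlimited", which is what the new allocators in the patch appear to do.

#include <stdio.h>

struct node {
    const char *name;
    int slots;       /* "slots" token from the hostfile */
    int max_slots;   /* "max-slots" token; 0 means unlimited */
    int alloc;       /* processes allocated to this node so far */
};

/* Round-robin sweeps over the nodes against a per-node limit (either
   "slots" or "max_slots"); returns the number of processes still left
   unallocated once every node has hit its limit. */
static int round_robin(struct node *n, int count, int np, int use_max)
{
    while (np > 0) {
        int progressed = 0;
        for (int i = 0; i < count && np > 0; ++i) {
            int limit = use_max ? n[i].max_slots : n[i].slots;
            if ((use_max && 0 == limit) || n[i].alloc < limit) {
                ++n[i].alloc;
                --np;
                progressed = 1;
            }
        }
        if (!progressed) {
            break;   /* every node is at its limit */
        }
    }
    return np;
}

/* "-bynode": round-robin one process per node until slots are full,
   then keep round-robining until max_slots is hit. */
static int allocate_by_node(struct node *n, int count, int np)
{
    np = round_robin(n, count, np, 0);
    return round_robin(n, count, np, 1);
}

/* "-byslot" (the default): greedily fill each node's slots in order,
   then oversubscribe round-robin up to max_slots. */
static int allocate_by_slot(struct node *n, int count, int np)
{
    for (int i = 0; i < count && np > 0; ++i) {
        int take = n[i].slots < np ? n[i].slots : np;
        n[i].alloc += take;
        np -= take;
    }
    return round_robin(n, count, np, 1);
}

int main(void)
{
    struct node byslot[] = { {"eddie", 4, 0, 0}, {"vogon", 4, 0, 0},
                             {"earth", 4, 0, 0}, {"deep-thought", 4, 0, 0} };
    struct node bynode[] = { {"eddie", 4, 0, 0}, {"vogon", 4, 0, 0},
                             {"earth", 4, 0, 0}, {"deep-thought", 4, 0, 0} };

    allocate_by_slot(byslot, 4, 8);   /* expect 4, 4, 0, 0 */
    allocate_by_node(bynode, 4, 8);   /* expect 2, 2, 2, 2 */
    for (int i = 0; i < 4; ++i) {
        printf("%-12s  byslot: %d  bynode: %d\n",
               byslot[i].name, byslot[i].alloc, bynode[i].alloc);
    }
    return 0;
}

Run against the four-node "count=4" hostfile from the allocation example above with 8 processes, this prints 4/4/0/0 for the by-slot policy and 2/2/2/2 for the by-node policy, matching the allocations listed above.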
Parent: 497580441d
Commit: c80f54052e
@@ -61,7 +61,10 @@ ORTE_DECLSPEC int orte_ras_base_close(void);
ORTE_DECLSPEC orte_ras_base_module_t* orte_ras_base_select(const char*);
ORTE_DECLSPEC int orte_ras_base_allocate(orte_jobid_t job);
ORTE_DECLSPEC int orte_ras_base_deallocate(orte_jobid_t job);
ORTE_DECLSPEC int orte_ras_base_allocate_nodes(orte_jobid_t jobid, ompi_list_t* nodes);
ORTE_DECLSPEC int orte_ras_base_allocate_nodes_by_node(orte_jobid_t jobid,
ompi_list_t* nodes);
ORTE_DECLSPEC int orte_ras_base_allocate_nodes_by_slot(orte_jobid_t jobid,
ompi_list_t* nodes);

/*
 * globals that might be needed
@@ -26,73 +26,187 @@
#include "mca/errmgr/errmgr.h"


/**
 *
/*
 * Allocate one process per node on a round-robin basis, looping back
 * around to the beginning as necessary
 */

static int orte_ras_base_node_compare(orte_ras_base_node_t** n1,
orte_ras_base_node_t** n2)
{
if((*n1)->node_slots_inuse < (*n2)->node_slots_inuse) {
return -1;
} else if((*n1)->node_slots_inuse > (*n2)->node_slots_inuse) {
return 1;
}
return 0;
}

/**
 *
 */

int orte_ras_base_allocate_nodes(orte_jobid_t jobid, ompi_list_t* nodes)
int orte_ras_base_allocate_nodes_by_node(orte_jobid_t jobid,
ompi_list_t* nodes)
{
ompi_list_t allocated;
ompi_list_item_t* item;
size_t num_requested = 0;
size_t num_allocated = 0;
size_t num_constrained = 0;
size_t slots;
bool oversubscribe = false;
int rc;

/* query for the number of process slots required */
if(ORTE_SUCCESS != (rc = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
if (ORTE_SUCCESS !=
(rc = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
return rc;
}
/* sort the node list by proc slots inuse - lowest to highest */
ompi_list_sort(nodes, (ompi_list_item_compare_fn_t)orte_ras_base_node_compare);

OBJ_CONSTRUCT(&allocated, ompi_list_t);
num_allocated = 0;

/* iterate through nodes until request is satisfied or all are oversubscribed */
while(num_allocated < num_requested) {
/* This loop continues until all procs have been allocated or we run
out of resources. There are two definitions of "run out of
resources":

1. All nodes have node_slots processes allocated to them
2. All nodes have node_slots_max processes allocated to them

We first map until condition #1 is met. If there are still
processes that haven't been allocated yet, then we continue
until condition #2 is met. If we still have processes that
haven't been allocated yet, then it's an "out of resources"
error. */
while (num_allocated < num_requested) {
num_constrained = 0;
for(item = ompi_list_get_first(nodes);
item != ompi_list_get_end(nodes) && num_allocated < num_requested;
item = ompi_list_get_next(item)) {

/* loop over all nodes until either all processes are
allocated or they all become constrained */
for (item = ompi_list_get_first(nodes);
item != ompi_list_get_end(nodes) && num_allocated < num_requested;
item = ompi_list_get_next(item)) {
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;

/* are any slots available */
if (node->node_slots_inuse >= node->node_slots) {

/* if there is a constraint on the max number of slots - skip this node */
if(node->node_slots_max && node->node_slots_inuse >= node->node_slots_max) {
num_constrained++;
continue;
}

/* are any slots available? */
slots = (oversubscribe ? node->node_slots_max : node->node_slots);
if (node->node_slots_inuse < slots ||
(oversubscribe && 0 == slots)) {
++num_allocated;
++node->node_slots_inuse; /* running total */
++node->node_slots_alloc; /* this job */
} else {
++num_constrained;
}

/* otherwise take one slot on this node */
num_allocated++;
node->node_slots_inuse++; /* running total */
node->node_slots_alloc++; /* this job */
}
if(num_constrained == ompi_list_get_size(nodes)) {

/* if all nodes are constrained:
- if this is the first time through the loop, then set
"oversubscribe" to true, and we'll now start obeying
node_slots_max instead of node_slots
- if this is the second time through the loop, then all
nodes are full to the max, and therefore we can't do
anything more -- we're out of resources */
if (ompi_list_get_size(nodes) == num_constrained) {
if (oversubscribe) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
} else {
oversubscribe = true;
}
}
}

/* move all nodes w/ allocations to the allocated list */
item = ompi_list_get_first(nodes);
while(item != ompi_list_get_end(nodes)) {
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
ompi_list_item_t* next = ompi_list_get_next(item);
if(node->node_slots_alloc) {
ompi_list_remove_item(nodes, item);
ompi_list_append(&allocated, item);
}
item = next;
}

rc = orte_ras_base_node_assign(&allocated, jobid);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
cleanup:

while(NULL != (item = ompi_list_remove_first(&allocated)))
ompi_list_append(nodes, item);
OBJ_DESTRUCT(&allocated);
return rc;
}


/*
 * Allocate processes to nodes, using all available slots on a node.
 */
int orte_ras_base_allocate_nodes_by_slot(orte_jobid_t jobid,
ompi_list_t* nodes)
{
ompi_list_t allocated;
ompi_list_item_t* item;
size_t num_requested = 0;
size_t num_allocated = 0;
size_t num_constrained = 0;
size_t available;
int rc;

/* query for the number of process slots required */
if (ORTE_SUCCESS !=
(rc = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
return rc;
}

OBJ_CONSTRUCT(&allocated, ompi_list_t);
num_allocated = 0;

/* In the first pass, just grab all available slots (i.e., stay <=
node_slots) greedily off each node */
for (item = ompi_list_get_first(nodes);
item != ompi_list_get_end(nodes) && num_allocated < num_requested;
item = ompi_list_get_next(item)) {
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;

/* are any slots available? */
if (node->node_slots_inuse < node->node_slots) {
available = node->node_slots - node->node_slots_inuse;
if (num_requested - num_allocated < available) {
node->node_slots_inuse +=
(num_requested - num_allocated); /* running total */
node->node_slots_alloc +=
(num_requested - num_allocated); /* this job */
num_allocated = num_requested;
} else {
num_allocated += available;
node->node_slots_inuse += available; /* running total */
node->node_slots_alloc += available; /* this job */
}
}
}

/* If we're not done, then we're in an oversubscribing situation.
Switch to a round-robin-by-node policy -- take one slot from
each node until we hit node_slots_max or we have no more
resources; whichever occurs first. */
while (num_allocated < num_requested) {
num_constrained = 0;

/* loop over all nodes until either all processes are
allocated or they all become constrained */
for (item = ompi_list_get_first(nodes);
item != ompi_list_get_end(nodes) && num_allocated < num_requested;
item = ompi_list_get_next(item)) {
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;

/* are any slots available? */
if (node->node_slots_inuse < node->node_slots_max ||
0 == node->node_slots_max) {
++num_allocated;
++node->node_slots_inuse; /* running total */
++node->node_slots_alloc; /* this job */
} else {
++num_constrained;
}
}

/* if all nodes are constrained, then we're out of resources
-- thanks for playing */
if (ompi_list_get_size(nodes) == num_constrained) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
}


/* move all nodes w/ allocations to the allocated list */
item = ompi_list_get_first(nodes);
while(item != ompi_list_get_end(nodes)) {
@@ -117,4 +231,3 @@ cleanup:
OBJ_DESTRUCT(&allocated);
return rc;
}

@@ -39,8 +39,17 @@ static int orte_ras_host_allocate(orte_jobid_t jobid)
if(ORTE_SUCCESS != (rc = orte_ras_base_node_query(&nodes))) {
goto cleanup;
}
if(ORTE_SUCCESS != (rc = orte_ras_base_allocate_nodes(jobid, &nodes))) {
goto cleanup;

if (0 == strcmp(mca_ras_host_component.schedule_policy, "node")) {
if (ORTE_SUCCESS !=
(rc = orte_ras_base_allocate_nodes_by_node(jobid, &nodes))) {
goto cleanup;
}
} else {
if (ORTE_SUCCESS !=
(rc = orte_ras_base_allocate_nodes_by_slot(jobid, &nodes))) {
goto cleanup;
}
}

cleanup:
@@ -34,6 +34,7 @@ struct orte_ras_host_component_t {
orte_ras_base_component_t super;
int debug;
int priority;
char *schedule_policy;
};
typedef struct orte_ras_host_component_t orte_ras_host_component_t;

@@ -76,13 +76,29 @@ static int orte_ras_host_param_register_int(
}


static char *orte_rmaps_round_robin_param_register_string(
const char* param_name,
char *default_value)
{
int id = mca_base_param_register_string("ras", "host",
param_name, NULL, default_value);
char *param_value = default_value;
mca_base_param_lookup_string(id,&param_value);
return param_value;
}


/**
 * component open/close/init function
 */
static int orte_ras_host_open(void)
{
mca_ras_host_component.debug = orte_ras_host_param_register_int("debug",1);
mca_ras_host_component.priority = orte_ras_host_param_register_int("priority",1);
mca_ras_host_component.debug =
orte_ras_host_param_register_int("debug", 1);
mca_ras_host_component.priority =
orte_ras_host_param_register_int("priority", 1);
mca_ras_host_component.schedule_policy =
orte_rmaps_round_robin_param_register_string("policy", "slot");
return ORTE_SUCCESS;
}

@@ -31,10 +31,75 @@


/*
 * Create a default mapping for the application.
 * Local variable
 */
static ompi_list_item_t *cur_node_item = NULL;

static int orte_rmaps_rr_map_app(

static int claim_slot(orte_rmaps_base_map_t *map,
ompi_list_t *nodes,
orte_ras_base_node_t *current_node,
orte_jobid_t jobid, orte_vpid_t vpid, int proc_index)
{
orte_rmaps_base_proc_t *proc;
orte_process_name_t *proc_name;
orte_rmaps_base_node_t *rmaps_node;
int rc;

/* create objects */
rmaps_node = OBJ_NEW(orte_rmaps_base_node_t);
if (NULL == rmaps_node) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
rmaps_node->node_name = strdup(current_node->node_name);

proc = OBJ_NEW(orte_rmaps_base_proc_t);
if (NULL == proc) {
OBJ_RELEASE(rmaps_node);
return ORTE_ERR_OUT_OF_RESOURCE;
}

/* create the process name as an offset from the vpid-start */
rc = orte_ns.create_process_name(&proc_name, current_node->node_cellid,
jobid, vpid);
if (rc != ORTE_SUCCESS) {
OBJ_RELEASE(proc);
OBJ_RELEASE(rmaps_node);
return rc;
}
proc->proc_node = rmaps_node;
proc->proc_name = *proc_name;
proc->proc_rank = vpid;
orte_ns.free_name(&proc_name);
OBJ_RETAIN(proc); /* bump reference count for the node */
ompi_list_append(&rmaps_node->node_procs, &proc->super);
map->procs[proc_index] = proc;

/* Save this node on the map */
ompi_list_append(&map->nodes, &rmaps_node->super);

/* Decrease the number of slots available for allocation
on this node */
--current_node->node_slots_alloc;
if (current_node->node_slots_alloc == 0) {
ompi_list_remove_item(nodes, (ompi_list_item_t*) current_node);
OBJ_RELEASE(current_node);
}

return ORTE_SUCCESS;
}


/*
 * Create a default mapping for the application, scheduling round
 * robin by node.
 *
 * NOTE: This function assumes that the allocator has already setup
 * the list of nodes such that the sum of the node_slots_alloc fields
 * from all entries will be the total number of processes in all the
 * apps.
 */
static int map_app_by_node(
orte_app_context_t* app,
orte_rmaps_base_map_t* map,
orte_jobid_t jobid,
@@ -42,61 +107,164 @@ static int orte_rmaps_rr_map_app(
int rank,
ompi_list_t* nodes)
{
/* build a nodelist and assign process slots in order */
int rc;
size_t num_alloc = 0;
size_t proc_index = 0;
ompi_list_item_t* item;
ompi_list_item_t *start, *next;
orte_ras_base_node_t *node;
bool did_alloc;

item = ompi_list_get_first(nodes);
while(item != ompi_list_get_end(nodes)) {
ompi_list_item_t* next = ompi_list_get_next(item);
orte_ras_base_node_t* node = (orte_ras_base_node_t*)item;
orte_rmaps_base_node_t* rmaps_node = OBJ_NEW(orte_rmaps_base_node_t);
size_t i, num_procs;
if(NULL == rmaps_node) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
rmaps_node->node_name = strdup(node->node_name);
/* Note that cur_node_item already points to the Right place in
the node list to start looking (i.e., if this is the first time
through, it'll point to the first item. If this is not the
first time through -- i.e., we have multiple app contexts --
it'll point to where we left off last time.).

if(num_alloc + node->node_slots_alloc >= (size_t)app->num_procs) {
num_procs = app->num_procs - num_alloc;
} else {
num_procs = node->node_slots_alloc;
}
But do a bozo check to ensure that we don't have a empty node
list. */
if (ompi_list_get_end(nodes) == cur_node_item) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
start = cur_node_item;

/* assign the next num_procs to this node */
for(i=0; i<num_procs; i++) {
orte_rmaps_base_proc_t* proc = OBJ_NEW(orte_rmaps_base_proc_t);
orte_process_name_t* proc_name;
int rc;
/* This loop continues until all procs have been mapped or we run
out of resources. There are two definitions of "run out of
resources":

/* create the process name as an offset from the vpid-start */
rc = orte_ns.create_process_name(&proc_name, node->node_cellid, jobid, vpid_start+rank);
if(rc != ORTE_SUCCESS) {
OBJ_RELEASE(proc);
1. All nodes have node_slots processes mapped to them
2. All nodes have node_slots_max processes mapped to them

We first map until condition #1 is met. If there are still
processes that haven't been mapped yet, then we continue until
condition #2 is met. If we still have processes that haven't
been mapped yet, then it's an "out of resources" error. */
did_alloc = false;
while (num_alloc < app->num_procs) {
node = (orte_ras_base_node_t*) cur_node_item;
next = ompi_list_get_next(cur_node_item);

/* If we have an available slot on this node, claim it */
if (node->node_slots_alloc > 0) {
fflush(stdout);
rc = claim_slot(map, nodes, node, jobid, vpid_start + rank,
proc_index);
if (ORTE_SUCCESS != rc) {
return rc;
}
proc->proc_node = rmaps_node;
proc->proc_name = *proc_name;
proc->proc_rank = rank;
rank++;
orte_ns.free_name(&proc_name);
OBJ_RETAIN(proc); /* bump reference count for the node */
ompi_list_append(&rmaps_node->node_procs, &proc->super);
map->procs[proc_index++] = proc;
++rank;
++proc_index;

/* Save the fact that we successfully allocated a process
to a node in this round */
did_alloc = true;

/* Increase the number of procs allocated and see if we're
done */
++num_alloc;
}

node->node_slots_alloc -= num_procs;
if(node->node_slots_alloc == 0) {
ompi_list_remove_item(nodes,item);
OBJ_RELEASE(item);
/* Move on to the next node */

cur_node_item = next;
if (ompi_list_get_end(nodes) == cur_node_item) {
cur_node_item = ompi_list_get_first(nodes);
}
num_alloc += num_procs;
if(num_alloc == (size_t)app->num_procs)

/* Are we done? */
if (num_alloc == app->num_procs) {
break;
item = next;
ompi_list_append(&map->nodes, &rmaps_node->super);
}

/* Double check that the list is not empty */
if (ompi_list_get_end(nodes) == cur_node_item) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}

/* If we looped around without allocating any new processes,
then we're full */
if (start == cur_node_item) {
if (!did_alloc) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}
}
}

map->num_procs = num_alloc;
return ORTE_SUCCESS;
}


/*
 * Create a default mapping for the application, scheduling one round
 * robin by slot.
 *
 * NOTE: This function assumes that the allocator has already setup
 * the list of nodes such that the sum of the node_slots_alloc fields
 * from all entries will be the total number of processes in all the
 * apps.
 */
static int map_app_by_slot(
orte_app_context_t* app,
orte_rmaps_base_map_t* map,
orte_jobid_t jobid,
orte_vpid_t vpid_start,
int rank,
ompi_list_t* nodes)
{
int rc;
size_t num_alloc = 0;
size_t proc_index = 0;
ompi_list_item_t *next;
orte_ras_base_node_t *node;

/* Note that cur_node_item already points to the Right place in
the node list to start looking (i.e., if this is the first time
through, it'll point to the first item. If this is not the
first time through -- i.e., we have multiple app contexts --
it'll point to where we left off last time.).

But do a bozo check to ensure that we don't have a empty node
list. */
if (ompi_list_get_end(nodes) == cur_node_item) {
return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
}

/* Go through all nodes and take up to node_slots_alloc slots and
map it to this job */

while (ompi_list_get_end(nodes) != cur_node_item &&
num_alloc < app->num_procs) {
node = (orte_ras_base_node_t*) cur_node_item;
next = ompi_list_get_next(cur_node_item);

/* If we have available slots on this node, claim it */
while (node->node_slots_alloc > 0 &&
num_alloc < app->num_procs) {
fflush(stdout);
rc = claim_slot(map, nodes, node, jobid, vpid_start + rank,
proc_index);
if (ORTE_SUCCESS != rc) {
return rc;
}
++rank;
++proc_index;

/* Increase the number of procs allocated and see if we're
done */
++num_alloc;
}

/* Move on to the next node */

cur_node_item = next;
}

/* Did we allocate everything? */

if (num_alloc < app->num_procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}

map->num_procs = num_alloc;
return ORTE_SUCCESS;
}
@@ -117,12 +285,20 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
size_t num_procs = 0;
int rank = 0;
int rc = ORTE_SUCCESS;
bool bynode = true;

/* query for the application context and allocated nodes */
if(ORTE_SUCCESS != (rc = orte_rmgr_base_get_app_context(jobid, &context, &num_context))) {
return rc;
}

/* which policy should we use? */
if (0 == strcmp(mca_rmaps_round_robin_component.schedule_policy, "node")) {
bynode = true;
} else {
bynode = false;
}

/* total number of procs required */
for(i=0; i<num_context; i++) {
orte_app_context_t* app = context[i];
@@ -143,6 +319,7 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)

/* construct a default mapping */
OBJ_CONSTRUCT(&mapping, ompi_list_t);
cur_node_item = ompi_list_get_first(&nodes);
for(i=0; i<num_context; i++) {
orte_app_context_t* app = context[i];
orte_rmaps_base_map_t* map = OBJ_NEW(orte_rmaps_base_map_t);
@@ -153,12 +330,17 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
ompi_list_append(&mapping, &map->super);

map->app = app;
map->procs = malloc(sizeof(orte_rmaps_base_proc_t*)*app->num_procs);
map->procs = malloc(sizeof(orte_rmaps_base_proc_t*) * app->num_procs);
if(NULL == map->procs) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
if(ORTE_SUCCESS != (rc = orte_rmaps_rr_map_app(app,map,jobid,vpid_start,rank,&nodes))) {
if (bynode) {
rc = map_app_by_node(app, map, jobid, vpid_start, rank, &nodes);
} else {
rc = map_app_by_slot(app, map, jobid, vpid_start, rank, &nodes);
}
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
rank += app->num_procs;
@@ -34,6 +34,7 @@ struct orte_rmaps_round_robin_component_t {
orte_rmaps_base_component_t super;
int debug;
int priority;
char *schedule_policy;
};
typedef struct orte_rmaps_round_robin_component_t orte_rmaps_round_robin_component_t;

@@ -69,20 +69,37 @@ static int orte_rmaps_round_robin_param_register_int(
const char* param_name,
int default_value)
{
int id = mca_base_param_register_int("rmaps","round_robin",param_name,NULL,default_value);
int id = mca_base_param_register_int("rmaps","round_robin",
param_name,NULL,default_value);
int param_value = default_value;
mca_base_param_lookup_int(id,&param_value);
return param_value;
}


static char *orte_rmaps_round_robin_param_register_string(
const char* param_name,
char *default_value)
{
int id = mca_base_param_register_string("rmaps","round_robin",
param_name,NULL,default_value);
char *param_value = default_value;
mca_base_param_lookup_string(id,&param_value);
return param_value;
}


/**
 * component open/close/init function
 */
static int orte_rmaps_round_robin_open(void)
{
mca_rmaps_round_robin_component.debug = orte_rmaps_round_robin_param_register_int("debug",1);
mca_rmaps_round_robin_component.priority = orte_rmaps_round_robin_param_register_int("priority",1);
mca_rmaps_round_robin_component.debug =
orte_rmaps_round_robin_param_register_int("debug", 1);
mca_rmaps_round_robin_component.priority =
orte_rmaps_round_robin_param_register_int("priority", 1);
mca_rmaps_round_robin_component.schedule_policy =
orte_rmaps_round_robin_param_register_string("policy", "slot");
return ORTE_SUCCESS;
}

@@ -86,6 +86,8 @@ struct globals_t {
bool verbose;
bool exit;
bool no_wait_for_job_completion;
bool by_node;
bool by_slot;
size_t num_procs;
int exit_status;
char *hostfile;
@@ -159,6 +161,12 @@ ompi_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, '\0', NULL, "map", 1,
NULL, OMPI_CMD_LINE_TYPE_STRING,
"Mapping of processes to nodes / CPUs" },
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
&orterun_globals.by_node, OMPI_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by node" },
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
&orterun_globals.by_slot, OMPI_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by slot (the default)" },

/* mpiexec-like arguments */
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
@@ -576,6 +584,8 @@ static int init_globals(void)
false,
false,
false,
false,
true,
0,
0,
NULL,
@ -606,6 +616,7 @@ static int init_globals(void)
static int parse_globals(int argc, char* argv[])
{
ompi_cmd_line_t cmd_line;
int ras, rmaps;

/* Setup and parse the command line */

@@ -633,6 +644,23 @@ static int parse_globals(int argc, char* argv[])
exit(0);
}

/* Allocate and map by node or by slot? Shortcut for setting 2
MCA params. */

ras = mca_base_param_register_string("ras", "host", "policy", NULL,
"slot");
rmaps = mca_base_param_register_string("rmaps", "round_robin", "policy",
NULL, "slot");
if (orterun_globals.by_node) {
orterun_globals.by_slot = false;
mca_base_param_set_string(ras, "node");
mca_base_param_set_string(rmaps, "node");
} else {
orterun_globals.by_slot = true;
mca_base_param_set_string(ras, "slot");
mca_base_param_set_string(rmaps, "slot");
}

/* If we don't want to wait, we don't want to wait */

if (orterun_globals.no_wait_for_job_completion) {