
This confusion has been around for a while. It was caused by a long-ago decision to track the slots allocated to a specific job separately from the slots allocated to the overall mpirun instance. We eliminated that behavior quite a while ago, but never consolidated the "slots_alloc" and "slots" fields in orte_node_t. As a result, confusion has grown in the code base as to which field to look at and/or update.

So (finally) consolidate these two fields into a single "slots" field. Also add a "gang_launched" field to orte_job_t to indicate when all the procs for a job will be launched together, so that staged operations can know when MPI operations are allowed.

This commit was SVN r27239.
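
The practical effect for the mappers, visible throughout the diffs below, is that free capacity on a node is now always derived from the single "slots" field, and MPI support is gated on the new per-job flag. A minimal sketch of those two conventions, assuming the orte_node_t/orte_job_t definitions come from orte/runtime/orte_globals.h; the helper functions themselves are hypothetical and only illustrate the pattern:

#include <stdbool.h>
#include "orte/runtime/orte_globals.h"   /* orte_node_t, orte_job_t, orte_std_cntr_t */

/* With slots_alloc gone, available capacity is always slots - slots_inuse. */
static orte_std_cntr_t node_slots_available(const orte_node_t *node)
{
    if (node->slots <= node->slots_inuse) {
        return 0;   /* node is full (or oversubscribed) */
    }
    return node->slots - node->slots_inuse;
}

/* MPI procs are only acceptable when the whole job was launched together. */
static bool job_allows_mpi(const orte_job_t *jdata)
{
    return jdata->gang_launched;
}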
Ralph Castain 2012-09-05 01:30:39 +00:00
parent bae5dab916
commit fde83a44ab
12 changed files with 70 additions and 60 deletions

@@ -347,7 +347,6 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
*/
node->name = strdup(orte_process_info.nodename);
node->state = ORTE_NODE_STATE_UP;
node->slots_alloc = 1;
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;

@@ -118,10 +118,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
hnp_node->slots = node->slots;
hnp_node->slots_max = node->slots_max;
hnp_node->launch_id = node->launch_id;
/* default allocate all the slots - may be modified later
* as a result of filtering actions in mapper
*/
hnp_node->slots_alloc = node->slots;
hnp_node->slots = node->slots;
/* use the local name for our node - don't trust what
* we got from an RM. If requested, store the resolved
* nodename info
@@ -149,10 +146,6 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
"%s ras:base:node_insert node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == node->name) ? "NULL" : node->name));
/* default allocate all the slots - may be modified later
* as a result of filtering actions in mapper
*/
node->slots_alloc = node->slots;
/* insert it into the array */
node->index = opal_pointer_array_add(orte_node_pool, (void*)node);
if (ORTE_SUCCESS > (rc = node->index)) {

@@ -471,7 +471,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
node->name));
opal_list_remove_item(allocated_nodes, item);
OBJ_RELEASE(item); /* "un-retain" it */
} else if (node->slots_alloc <= node->slots_inuse &&
} else if (node->slots <= node->slots_inuse &&
(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
/* remove the node as fully used */
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
@@ -481,9 +481,9 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
opal_list_remove_item(allocated_nodes, item);
OBJ_RELEASE(item); /* "un-retain" it */
} else {
if (node->slots_alloc > node->slots_inuse) {
if (node->slots > node->slots_inuse) {
/* add the available slots */
num_slots += node->slots_alloc - node->slots_inuse;
num_slots += node->slots - node->slots_inuse;
} else {
/* always allocate at least one */
num_slots++;
@@ -542,7 +542,7 @@ orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
proc->node = node;
proc->nodename = node->name;
node->num_procs++;
if (node->slots_inuse < node->slots_alloc) {
if (node->slots_inuse < node->slots) {
node->slots_inuse++;
}
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
@@ -600,8 +600,8 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
*/
node = (orte_node_t*)cur_node_item;
ndmin = node;
overload = ndmin->slots_inuse - ndmin->slots_alloc;
if (node->slots_inuse >= node->slots_alloc) {
overload = ndmin->slots_inuse - ndmin->slots;
if (node->slots_inuse >= node->slots) {
/* work down the list - is there another node that
* would not be oversubscribed?
*/
@@ -613,7 +613,7 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
nd1 = NULL;
while (item != cur_node_item) {
nd1 = (orte_node_t*)item;
if (nd1->slots_inuse < nd1->slots_alloc) {
if (nd1->slots_inuse < nd1->slots) {
/* this node is not oversubscribed! use it! */
cur_node_item = item;
goto process;
@@ -623,9 +623,9 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
* find anyone who isn't fully utilized, we will
* start with the least used node
*/
if (overload >= (nd1->slots_inuse - nd1->slots_alloc)) {
if (overload >= (nd1->slots_inuse - nd1->slots)) {
ndmin = nd1;
overload = ndmin->slots_inuse - ndmin->slots_alloc;
overload = ndmin->slots_inuse - ndmin->slots;
}
if (item == opal_list_get_last(node_list)) {
item = opal_list_get_first(node_list);
@@ -639,7 +639,7 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
* what we already have
*/
if (NULL != nd1 &&
(nd1->slots_inuse - nd1->slots_alloc) < (node->slots_inuse - node->slots_alloc)) {
(nd1->slots_inuse - nd1->slots) < (node->slots_inuse - node->slots)) {
cur_node_item = (opal_list_item_t*)ndmin;
}
}

@@ -83,13 +83,13 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
obj = hwloc_get_root_obj(node->topology);
}
#endif
if (node->slots_alloc <= node->slots_inuse) {
if (node->slots <= node->slots_inuse) {
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:slot working node %s is full - skipping",
node->name);
continue;
}
num_procs_to_assign = node->slots_alloc - node->slots_inuse;
num_procs_to_assign = node->slots - node->slots_inuse;
for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
/* add this node to the map - do it only once */
if (!node->mapped) {
@@ -170,7 +170,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
--nxtra_nodes;
}
}
num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:slot adding up to %d procs to node %s",
num_procs_to_assign, node->name);
@@ -186,7 +186,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
@@ -306,7 +306,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
* have to track how many procs to "shift" elsewhere
* to make up the difference
*/
if (node->slots_alloc <= node->slots_inuse) {
if (node->slots <= node->slots_inuse) {
/* if there are no extras to take, then we can
* ignore this node
*/
@@ -320,16 +320,16 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
/* update how many we are lagging behind */
lag += navg;
} else {
/* if slots_alloc < avg, then take all */
if ((node->slots_alloc - node->slots_inuse) < navg) {
num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
/* if slots < avg, then take all */
if ((node->slots - node->slots_inuse) < navg) {
num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
/* update how many we are lagging behind */
lag += navg - (node->slots_alloc - node->slots_inuse);
lag += navg - (node->slots - node->slots_inuse);
} else {
/* take the avg plus as much of the "lag" as we can */
delta = 0;
if (0 < lag) {
delta = (node->slots_alloc - node->slots_inuse) - navg;
delta = (node->slots - node->slots_inuse) - navg;
if (lag < delta) {
delta = lag;
}
@@ -351,7 +351,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
@@ -381,7 +381,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
@@ -514,11 +514,11 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
--nxtra_nodes;
}
}
if (node->slots_alloc <= node->slots_inuse) {
if (node->slots <= node->slots_inuse) {
/* everybody takes at least the extras */
num_procs_to_assign = extra_procs_to_assign;
} else {
num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
}
/* get the number of objects of this type on this node */
@@ -570,7 +570,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/
@@ -699,7 +699,7 @@ static int byobj_span(orte_job_t *jdata,
* have to track how many procs to "shift" elsewhere
* to make up the difference
*/
if (node->slots_alloc <= node->slots_inuse) {
if (node->slots <= node->slots_inuse) {
/* if there are no extras to take, then we can
* safely remove this node as we don't need it
*/
@@ -716,16 +716,16 @@ static int byobj_span(orte_job_t *jdata,
/* update how many we are lagging behind */
lag += navg;
} else {
/* if slots_alloc < avg, then take all */
if ((node->slots_alloc - node->slots_inuse) < navg) {
num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
/* if slots < avg, then take all */
if ((node->slots - node->slots_inuse) < navg) {
num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
/* update how many we are lagging behind */
lag += navg - (node->slots_alloc - node->slots_inuse);
lag += navg - (node->slots - node->slots_inuse);
} else {
/* take the avg plus as much of the "lag" as we can */
delta = 0;
if (0 < lag) {
delta = (node->slots_alloc - node->slots_inuse) - navg;
delta = (node->slots - node->slots_inuse) - navg;
if (lag < delta) {
delta = lag;
}
@@ -777,7 +777,7 @@ static int byobj_span(orte_job_t *jdata,
/* not all nodes are equal, so only set oversubscribed for
* this node if it is in that state
*/
if (node->slots_alloc < (int)node->num_procs) {
if (node->slots < (int)node->num_procs) {
/* flag the node as oversubscribed so that sched-yield gets
* properly set
*/

@@ -47,7 +47,7 @@ static int staged_mapper(orte_job_t *jdata)
orte_std_cntr_t num_slots;
orte_proc_t *proc;
orte_node_t *node;
bool work_to_do = false;
bool work_to_do = false, first_pass = false;
opal_list_item_t *item;
/* only use this mapper if it was specified */
@@ -72,6 +72,13 @@ static int staged_mapper(orte_job_t *jdata)
}
jdata->map->last_mapper = strdup(c->mca_component_name);
/* if there are no nodes in the map, then this is our first
* pass thru this job
*/
if (0 == jdata->map->num_nodes) {
first_pass = true;
}
/* we assume that the app_contexts are in priority order,
* with the highest priority being the first entry in the
* job's app_context array. Loop across the app_contexts
@@ -165,7 +172,7 @@ static int staged_mapper(orte_job_t *jdata)
/* track number of procs on node and number of slots used */
node->num_procs++;
node->slots_inuse++;
if (node->slots_inuse == node->slots_alloc) {
if (node->slots_inuse == node->slots) {
opal_list_remove_item(&node_list, &node->super);
OBJ_RELEASE(node);
}
@@ -216,5 +223,16 @@ static int staged_mapper(orte_job_t *jdata)
*/
jdata->updated = true;
/* if we successfully mapped ALL procs in the first pass,
* then this job is capable of supporting MPI procs
*/
if (first_pass && jdata->num_mapped == jdata->num_procs) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"%s mca:rmaps:staged: job %s is MPI-capable",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid));
jdata->gang_launched = true;
}
return ORTE_SUCCESS;
}

@@ -282,9 +282,9 @@ static void cleanup_node(orte_proc_t *proc)
}
}
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
"%s state:staged:track_procs node %s has %d slots alloc, %d slots inuse",
"%s state:staged:track_procs node %s has %d slots, %d slots inuse",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
(int)node->slots_alloc, (int)node->slots_inuse));
(int)node->slots, (int)node->slots_inuse));
}
static void track_procs(int fd, short args, void *cbdata)
@@ -314,7 +314,7 @@ static void track_procs(int fd, short args, void *cbdata)
* inside MPI_Init - if it did, that is not acceptable
*/
if (ORTE_PROC_STATE_REGISTERED == state) {
if (pdata->mpi_proc) {
if (pdata->mpi_proc && !jdata->gang_launched) {
/* we can't support this - issue an error and abort */
orte_show_help("help-state-staged.txt", "mpi-procs-not-supported", true);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);

@@ -379,7 +379,7 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,
/* pack the number of slots allocated */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(nodes[i]->slots_alloc)), 1, ORTE_STD_CNTR))) {
(void*)(&(nodes[i]->slots)), 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}

@@ -385,7 +385,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2,
(long)src->slots_alloc, (long)src->slots_max);
(long)src->slots, (long)src->slots_max);
free(tmp);
tmp = tmp2;

@@ -405,7 +405,7 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
/* unpack the number of slots allocated */
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(&(nodes[i]->slots_alloc)), &n, ORTE_STD_CNTR))) {
(&(nodes[i]->slots)), &n, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}

@@ -700,6 +700,7 @@ static void orte_job_construct(orte_job_t* job)
2);
job->num_apps = 0;
job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT;
job->gang_launched = true;
job->stdin_target = ORTE_VPID_INVALID;
job->stdout_target = ORTE_JOBID_INVALID;
job->total_slots_alloc = 0;
@@ -831,7 +832,6 @@ static void orte_node_construct(orte_node_t* node)
node->slots = 0;
node->slots_given = false;
node->slots_inuse = 0;
node->slots_alloc = 0;
node->slots_max = 0;
node->username = NULL;

@@ -340,11 +340,6 @@ typedef struct {
/** How many processes have already been launched, used by one or
more jobs on this node. */
orte_std_cntr_t slots_inuse;
/** This represents the number of slots we (the allocator) are
attempting to allocate to the current job - or the number of
slots allocated to a specific job on a query for the jobs
allocations */
orte_std_cntr_t slots_alloc;
/** A "hard" limit (if set -- a value of 0 implies no hard limit)
on the number of slots that can be allocated on a given
node. This is for some environments (e.g. grid) there may be
@@ -384,6 +379,11 @@ typedef struct {
* for description of supported flags
*/
orte_job_controls_t controls;
/* flag to indicate that MPI is allowed on this job - i.e.,
* that all members of the job are being simultaneously
* launched
*/
bool gang_launched;
/* rank desiring stdin - for now, either one rank, all ranks
* (wildcard), or none (invalid)
*/

@@ -736,7 +736,7 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
* to subdivide an allocation
*/
if (node_from_file->slots < node_from_list->slots) {
node_from_list->slots_alloc = node_from_file->slots;
node_from_list->slots = node_from_file->slots;
}
if (remove) {
/* remove the node from the list */
@@ -860,9 +860,9 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
* to subdivide an allocation
*/
if (node->slots < node_from_pool->slots) {
newnode->slots_alloc = node->slots;
newnode->slots = node->slots;
} else {
newnode->slots_alloc = node_from_pool->slots;
newnode->slots = node_from_pool->slots;
}
opal_list_insert_pos(nodes, item1, &newnode->super);
/* track number added */
@@ -913,9 +913,9 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
* to subdivide an allocation
*/
if (node->slots < node_from_pool->slots) {
newnode->slots_alloc = node->slots;
newnode->slots = node->slots;
} else {
newnode->slots_alloc = node_from_pool->slots;
newnode->slots = node_from_pool->slots;
}
/* insert it before item1 */
opal_list_insert_pos(nodes, item1, &newnode->super);