This confusion has been around for a while, caused by a long-ago decision to track slots allocated to a specific job rather than to the overall mpirun instance. We eliminated that behavior quite a while ago, but never consolidated the "slots_alloc" and "slots" fields in orte_node_t. As a result, confusion has grown in the code base over which field to look at and/or update.

So (finally) consolidate these two fields into a single "slots" field. Also add a field to orte_job_t to indicate when all the procs for a job will be launched together, so that staged operations can know when MPI operations are allowed.

This commit was SVN r27239.
parent bae5dab916
commit fde83a44ab
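To illustrate the intent of the change, here is a minimal, self-contained C sketch (hypothetical names and simplified structs, not the real orte_node_t/orte_job_t API): a node keeps a single "slots" field that the mapper compares against "slots_inuse", and a "gang_launched" flag on the job records whether every proc was mapped in the first pass, which is what later permits MPI operations under the staged mapper.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for orte_node_t / orte_job_t: only the fields
 * relevant to this change are modeled here. */
typedef struct {
    const char *name;
    int slots;        /* slots available on the node (single field after the consolidation) */
    int slots_inuse;  /* slots currently occupied by mapped procs */
} node_t;

typedef struct {
    int num_procs;      /* procs requested by the job */
    int num_mapped;     /* procs mapped so far */
    bool gang_launched; /* true if all procs launch together, so MPI is allowed */
} job_t;

/* Map as many procs as fit on one node without oversubscribing it. */
static void map_on_node(job_t *job, node_t *node)
{
    while (job->num_mapped < job->num_procs &&
           node->slots_inuse < node->slots) {   /* compare against the one "slots" field */
        node->slots_inuse++;
        job->num_mapped++;
    }
}

int main(void)
{
    node_t nodes[] = { { "n0", 2, 0 }, { "n1", 2, 0 } };
    job_t job = { .num_procs = 4, .num_mapped = 0, .gang_launched = false };

    /* first pass over the allocation */
    for (size_t i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++) {
        map_on_node(&job, &nodes[i]);
    }

    /* if everything mapped in the first pass, the job is "gang launched"
     * and MPI procs can be supported; otherwise they must be rejected */
    job.gang_launched = (job.num_mapped == job.num_procs);
    printf("mapped %d/%d procs, MPI allowed: %s\n",
           job.num_mapped, job.num_procs,
           job.gang_launched ? "yes" : "no");
    return 0;
}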
@@ -347,7 +347,6 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
 */
 node->name = strdup(orte_process_info.nodename);
 node->state = ORTE_NODE_STATE_UP;
-node->slots_alloc = 1;
 node->slots_inuse = 0;
 node->slots_max = 0;
 node->slots = 1;
@@ -118,10 +118,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
 hnp_node->slots = node->slots;
 hnp_node->slots_max = node->slots_max;
 hnp_node->launch_id = node->launch_id;
-/* default allocate all the slots - may be modified later
- * as a result of filtering actions in mapper
- */
-hnp_node->slots_alloc = node->slots;
+hnp_node->slots = node->slots;
 /* use the local name for our node - don't trust what
  * we got from an RM. If requested, store the resolved
  * nodename info
@@ -149,10 +146,6 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
 "%s ras:base:node_insert node %s",
 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 (NULL == node->name) ? "NULL" : node->name));
-/* default allocate all the slots - may be modified later
- * as a result of filtering actions in mapper
- */
-node->slots_alloc = node->slots;
 /* insert it into the array */
 node->index = opal_pointer_array_add(orte_node_pool, (void*)node);
 if (ORTE_SUCCESS > (rc = node->index)) {
@@ -471,7 +471,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
 node->name));
 opal_list_remove_item(allocated_nodes, item);
 OBJ_RELEASE(item); /* "un-retain" it */
-} else if (node->slots_alloc <= node->slots_inuse &&
+} else if (node->slots <= node->slots_inuse &&
 (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
 /* remove the node as fully used */
 OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
@@ -481,9 +481,9 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
 opal_list_remove_item(allocated_nodes, item);
 OBJ_RELEASE(item); /* "un-retain" it */
 } else {
-if (node->slots_alloc > node->slots_inuse) {
+if (node->slots > node->slots_inuse) {
 /* add the available slots */
-num_slots += node->slots_alloc - node->slots_inuse;
+num_slots += node->slots - node->slots_inuse;
 } else {
 /* always allocate at least one */
 num_slots++;
@@ -542,7 +542,7 @@ orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
 proc->node = node;
 proc->nodename = node->name;
 node->num_procs++;
-if (node->slots_inuse < node->slots_alloc) {
+if (node->slots_inuse < node->slots) {
 node->slots_inuse++;
 }
 if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
@@ -600,8 +600,8 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
 */
 node = (orte_node_t*)cur_node_item;
 ndmin = node;
-overload = ndmin->slots_inuse - ndmin->slots_alloc;
-if (node->slots_inuse >= node->slots_alloc) {
+overload = ndmin->slots_inuse - ndmin->slots;
+if (node->slots_inuse >= node->slots) {
 /* work down the list - is there another node that
  * would not be oversubscribed?
  */
@@ -613,7 +613,7 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
 nd1 = NULL;
 while (item != cur_node_item) {
 nd1 = (orte_node_t*)item;
-if (nd1->slots_inuse < nd1->slots_alloc) {
+if (nd1->slots_inuse < nd1->slots) {
 /* this node is not oversubscribed! use it! */
 cur_node_item = item;
 goto process;
@@ -623,9 +623,9 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
 * find anyone who isn't fully utilized, we will
 * start with the least used node
 */
-if (overload >= (nd1->slots_inuse - nd1->slots_alloc)) {
+if (overload >= (nd1->slots_inuse - nd1->slots)) {
 ndmin = nd1;
-overload = ndmin->slots_inuse - ndmin->slots_alloc;
+overload = ndmin->slots_inuse - ndmin->slots;
 }
 if (item == opal_list_get_last(node_list)) {
 item = opal_list_get_first(node_list);
@@ -639,7 +639,7 @@ orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
 * what we already have
 */
 if (NULL != nd1 &&
-(nd1->slots_inuse - nd1->slots_alloc) < (node->slots_inuse - node->slots_alloc)) {
+(nd1->slots_inuse - nd1->slots) < (node->slots_inuse - node->slots)) {
 cur_node_item = (opal_list_item_t*)ndmin;
 }
 }
@@ -83,13 +83,13 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
 obj = hwloc_get_root_obj(node->topology);
 }
 #endif
-if (node->slots_alloc <= node->slots_inuse) {
+if (node->slots <= node->slots_inuse) {
 opal_output_verbose(2, orte_rmaps_base.rmaps_output,
 "mca:rmaps:rr:slot working node %s is full - skipping",
 node->name);
 continue;
 }
-num_procs_to_assign = node->slots_alloc - node->slots_inuse;
+num_procs_to_assign = node->slots - node->slots_inuse;
 for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
 /* add this node to the map - do it only once */
 if (!node->mapped) {
@@ -170,7 +170,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
 --nxtra_nodes;
 }
 }
-num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
+num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
 opal_output_verbose(2, orte_rmaps_base.rmaps_output,
 "mca:rmaps:rr:slot adding up to %d procs to node %s",
 num_procs_to_assign, node->name);
@@ -186,7 +186,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
 /* not all nodes are equal, so only set oversubscribed for
  * this node if it is in that state
  */
-if (node->slots_alloc < (int)node->num_procs) {
+if (node->slots < (int)node->num_procs) {
 /* flag the node as oversubscribed so that sched-yield gets
  * properly set
  */
@@ -306,7 +306,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
 * have to track how many procs to "shift" elsewhere
 * to make up the difference
 */
-if (node->slots_alloc <= node->slots_inuse) {
+if (node->slots <= node->slots_inuse) {
 /* if there are no extras to take, then we can
  * ignore this node
  */
@@ -320,16 +320,16 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
 /* update how many we are lagging behind */
 lag += navg;
 } else {
-/* if slots_alloc < avg, then take all */
-if ((node->slots_alloc - node->slots_inuse) < navg) {
-num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
+/* if slots < avg, then take all */
+if ((node->slots - node->slots_inuse) < navg) {
+num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
 /* update how many we are lagging behind */
-lag += navg - (node->slots_alloc - node->slots_inuse);
+lag += navg - (node->slots - node->slots_inuse);
 } else {
 /* take the avg plus as much of the "lag" as we can */
 delta = 0;
 if (0 < lag) {
-delta = (node->slots_alloc - node->slots_inuse) - navg;
+delta = (node->slots - node->slots_inuse) - navg;
 if (lag < delta) {
 delta = lag;
 }
@@ -351,7 +351,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
 /* not all nodes are equal, so only set oversubscribed for
  * this node if it is in that state
  */
-if (node->slots_alloc < (int)node->num_procs) {
+if (node->slots < (int)node->num_procs) {
 /* flag the node as oversubscribed so that sched-yield gets
  * properly set
  */
@@ -381,7 +381,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
 /* not all nodes are equal, so only set oversubscribed for
  * this node if it is in that state
  */
-if (node->slots_alloc < (int)node->num_procs) {
+if (node->slots < (int)node->num_procs) {
 /* flag the node as oversubscribed so that sched-yield gets
  * properly set
  */
@@ -514,11 +514,11 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
 --nxtra_nodes;
 }
 }
-if (node->slots_alloc <= node->slots_inuse) {
+if (node->slots <= node->slots_inuse) {
 /* everybody takes at least the extras */
 num_procs_to_assign = extra_procs_to_assign;
 } else {
-num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
+num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
 }

 /* get the number of objects of this type on this node */
@@ -570,7 +570,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
 /* not all nodes are equal, so only set oversubscribed for
  * this node if it is in that state
  */
-if (node->slots_alloc < (int)node->num_procs) {
+if (node->slots < (int)node->num_procs) {
 /* flag the node as oversubscribed so that sched-yield gets
  * properly set
  */
@@ -699,7 +699,7 @@ static int byobj_span(orte_job_t *jdata,
 * have to track how many procs to "shift" elsewhere
 * to make up the difference
 */
-if (node->slots_alloc <= node->slots_inuse) {
+if (node->slots <= node->slots_inuse) {
 /* if there are no extras to take, then we can
  * safely remove this node as we don't need it
  */
@@ -716,16 +716,16 @@ static int byobj_span(orte_job_t *jdata,
 /* update how many we are lagging behind */
 lag += navg;
 } else {
-/* if slots_alloc < avg, then take all */
-if ((node->slots_alloc - node->slots_inuse) < navg) {
-num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
+/* if slots < avg, then take all */
+if ((node->slots - node->slots_inuse) < navg) {
+num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign;
 /* update how many we are lagging behind */
-lag += navg - (node->slots_alloc - node->slots_inuse);
+lag += navg - (node->slots - node->slots_inuse);
 } else {
 /* take the avg plus as much of the "lag" as we can */
 delta = 0;
 if (0 < lag) {
-delta = (node->slots_alloc - node->slots_inuse) - navg;
+delta = (node->slots - node->slots_inuse) - navg;
 if (lag < delta) {
 delta = lag;
 }
@@ -777,7 +777,7 @@ static int byobj_span(orte_job_t *jdata,
 /* not all nodes are equal, so only set oversubscribed for
  * this node if it is in that state
  */
-if (node->slots_alloc < (int)node->num_procs) {
+if (node->slots < (int)node->num_procs) {
 /* flag the node as oversubscribed so that sched-yield gets
  * properly set
  */
@@ -47,7 +47,7 @@ static int staged_mapper(orte_job_t *jdata)
 orte_std_cntr_t num_slots;
 orte_proc_t *proc;
 orte_node_t *node;
-bool work_to_do = false;
+bool work_to_do = false, first_pass = false;
 opal_list_item_t *item;

 /* only use this mapper if it was specified */
@@ -72,6 +72,13 @@ static int staged_mapper(orte_job_t *jdata)
 }
 jdata->map->last_mapper = strdup(c->mca_component_name);

+/* if there are no nodes in the map, then this is our first
+ * pass thru this job
+ */
+if (0 == jdata->map->num_nodes) {
+first_pass = true;
+}
+
 /* we assume that the app_contexts are in priority order,
  * with the highest priority being the first entry in the
  * job's app_context array. Loop across the app_contexts
@@ -165,7 +172,7 @@ static int staged_mapper(orte_job_t *jdata)
 /* track number of procs on node and number of slots used */
 node->num_procs++;
 node->slots_inuse++;
-if (node->slots_inuse == node->slots_alloc) {
+if (node->slots_inuse == node->slots) {
 opal_list_remove_item(&node_list, &node->super);
 OBJ_RELEASE(node);
 }
@@ -216,5 +223,16 @@ static int staged_mapper(orte_job_t *jdata)
 */
 jdata->updated = true;

+/* if we successfully mapped ALL procs in the first pass,
+ * then this job is capable of supporting MPI procs
+ */
+if (first_pass && jdata->num_mapped == jdata->num_procs) {
+opal_output_verbose(5, orte_rmaps_base.rmaps_output,
+"%s mca:rmaps:staged: job %s is MPI-capable",
+ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+ORTE_JOBID_PRINT(jdata->jobid));
+jdata->gang_launched = true;
+}
+
 return ORTE_SUCCESS;
 }
@@ -282,9 +282,9 @@ static void cleanup_node(orte_proc_t *proc)
 }
 }
 OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
-"%s state:staged:track_procs node %s has %d slots alloc, %d slots inuse",
+"%s state:staged:track_procs node %s has %d slots, %d slots inuse",
 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
-(int)node->slots_alloc, (int)node->slots_inuse));
+(int)node->slots, (int)node->slots_inuse));
 }

 static void track_procs(int fd, short args, void *cbdata)
@@ -314,7 +314,7 @@ static void track_procs(int fd, short args, void *cbdata)
 * inside MPI_Init - if it did, that is not acceptable
 */
 if (ORTE_PROC_STATE_REGISTERED == state) {
-if (pdata->mpi_proc) {
+if (pdata->mpi_proc && !jdata->gang_launched) {
 /* we can't support this - issue an error and abort */
 orte_show_help("help-state-staged.txt", "mpi-procs-not-supported", true);
 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
@@ -379,7 +379,7 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,

 /* pack the number of slots allocated */
 if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
-(void*)(&(nodes[i]->slots_alloc)), 1, ORTE_STD_CNTR))) {
+(void*)(&(nodes[i]->slots)), 1, ORTE_STD_CNTR))) {
 ORTE_ERROR_LOG(rc);
 return rc;
 }
@@ -385,7 +385,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
 tmp = tmp2;

 asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2,
-(long)src->slots_alloc, (long)src->slots_max);
+(long)src->slots, (long)src->slots_max);
 free(tmp);
 tmp = tmp2;
@@ -405,7 +405,7 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,

 /* unpack the number of slots allocated */
 if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
-(&(nodes[i]->slots_alloc)), &n, ORTE_STD_CNTR))) {
+(&(nodes[i]->slots)), &n, ORTE_STD_CNTR))) {
 ORTE_ERROR_LOG(rc);
 return rc;
 }
@@ -700,6 +700,7 @@ static void orte_job_construct(orte_job_t* job)
 2);
 job->num_apps = 0;
 job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT;
+job->gang_launched = true;
 job->stdin_target = ORTE_VPID_INVALID;
 job->stdout_target = ORTE_JOBID_INVALID;
 job->total_slots_alloc = 0;
@@ -831,7 +832,6 @@ static void orte_node_construct(orte_node_t* node)
 node->slots = 0;
 node->slots_given = false;
 node->slots_inuse = 0;
-node->slots_alloc = 0;
 node->slots_max = 0;

 node->username = NULL;
@@ -340,11 +340,6 @@ typedef struct {
 /** How many processes have already been launched, used by one or
 more jobs on this node. */
 orte_std_cntr_t slots_inuse;
-/** This represents the number of slots we (the allocator) are
-attempting to allocate to the current job - or the number of
-slots allocated to a specific job on a query for the jobs
-allocations */
-orte_std_cntr_t slots_alloc;
 /** A "hard" limit (if set -- a value of 0 implies no hard limit)
 on the number of slots that can be allocated on a given
 node. This is for some environments (e.g. grid) there may be
@@ -384,6 +379,11 @@ typedef struct {
 * for description of supported flags
 */
 orte_job_controls_t controls;
+/* flag to indicate that MPI is allowed on this job - i.e.,
+ * that all members of the job are being simultaneously
+ * launched
+ */
+bool gang_launched;
 /* rank desiring stdin - for now, either one rank, all ranks
  * (wildcard), or none (invalid)
  */
@@ -736,7 +736,7 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
 * to subdivide an allocation
 */
 if (node_from_file->slots < node_from_list->slots) {
-node_from_list->slots_alloc = node_from_file->slots;
+node_from_list->slots = node_from_file->slots;
 }
 if (remove) {
 /* remove the node from the list */
@@ -860,9 +860,9 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
 * to subdivide an allocation
 */
 if (node->slots < node_from_pool->slots) {
-newnode->slots_alloc = node->slots;
+newnode->slots = node->slots;
 } else {
-newnode->slots_alloc = node_from_pool->slots;
+newnode->slots = node_from_pool->slots;
 }
 opal_list_insert_pos(nodes, item1, &newnode->super);
 /* track number added */
@@ -913,9 +913,9 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
 * to subdivide an allocation
 */
 if (node->slots < node_from_pool->slots) {
-newnode->slots_alloc = node->slots;
+newnode->slots = node->slots;
 } else {
-newnode->slots_alloc = node_from_pool->slots;
+newnode->slots = node_from_pool->slots;
 }
 /* insert it before item1 */
 opal_list_insert_pos(nodes, item1, &newnode->super);