Some cleanup associated with multiple app_contexts. Ensure nodes only get entered into the map once. Correctly handle bookmarks. Clean up tracking of slots_inuse and correct the detection of oversubscription.
Still need to resolve the ranking issue so it starts at the bookmark, but that will come next. This commit was SVN r25574.
Parent: 0b7c51fae2
Commit: df2f594aa8
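The recurring change across the mappers below is a guard that enters a node into the map only once per job, no matter how many app_contexts end up mapped onto it. A minimal sketch of that pattern, simplified from the diff (it assumes the node->mapped flag introduced by this change and an ORTE-style error path; it is not a verbatim excerpt):

    /* enter the node into the map only once, even when several
     * app_contexts place procs on it (sketch of the shared guard) */
    if (!node->mapped) {
        if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        node->mapped = true;
        OBJ_RETAIN(node);        /* maintain accounting on the node object */
        jdata->map->num_nodes++;
    }

Oversubscription is then detected against the number of procs actually placed, e.g. (node->slots < (int)node->num_procs), rather than against slots_inuse.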
@@ -120,13 +120,18 @@ static int plm_alps_init(void)
        ORTE_ERROR_LOG(rc);
    }

    /* we do NOT assign daemons to nodes at launch - we will
     * determine that mapping when the daemon
     * calls back. This is required because alps does
     * its own mapping of proc-to-node, and we cannot know
     * in advance which daemon will wind up on which node
     */
    orte_plm_globals.daemon_nodes_assigned_at_launch = false;
    if (orte_do_not_launch) {
        /* must map daemons since we won't be launching them */
        orte_plm_globals.daemon_nodes_assigned_at_launch = true;
    } else {
        /* we do NOT assign daemons to nodes at launch - we will
         * determine that mapping when the daemon
         * calls back. This is required because alps does
         * its own mapping of proc-to-node, and we cannot know
         * in advance which daemon will wind up on which node
         */
        orte_plm_globals.daemon_nodes_assigned_at_launch = false;
    }

    return rc;
}
@@ -174,6 +179,13 @@ static int plm_alps_launch_job(orte_job_t *jdata)
    /* indicate the state of the launch */
    failed_launch = true;

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
@@ -182,13 +194,6 @@ static int plm_alps_launch_job(orte_job_t *jdata)
        goto launch_apps;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:alps: launching vm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -109,13 +109,18 @@ int plm_lsf_init(void)
        ORTE_ERROR_LOG(rc);
    }

    /* we do NOT assign daemons to nodes at launch - we will
     * determine that mapping when the daemon
     * calls back. This is required because lsf does
     * its own mapping of proc-to-node, and we cannot know
     * in advance which daemon will wind up on which node
     */
    orte_plm_globals.daemon_nodes_assigned_at_launch = false;
    if (orte_do_not_launch) {
        /* must assign daemons as won't be launching them */
        orte_plm_globals.daemon_nodes_assigned_at_launch = true;
    } else {
        /* we do NOT assign daemons to nodes at launch - we will
         * determine that mapping when the daemon
         * calls back. This is required because lsf does
         * its own mapping of proc-to-node, and we cannot know
         * in advance which daemon will wind up on which node
         */
        orte_plm_globals.daemon_nodes_assigned_at_launch = false;
    }

    return rc;
}
@@ -158,6 +163,13 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
        }
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
@@ -166,13 +178,6 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
        goto launch_apps;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:slurm: launching vm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -835,6 +835,13 @@ static int rsh_launch(orte_job_t *jdata)
        joblaunchstart = orte_plm_globals.daemonlaunchstart;
    }

    /* setup the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
@@ -843,13 +850,6 @@ static int rsh_launch(orte_job_t *jdata)
        goto launch_apps;
    }

    /* start by launching the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:rsh: launching vm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -123,13 +123,21 @@ static int plm_slurm_init(void)
        ORTE_ERROR_LOG(rc);
    }

    /* we do NOT assign daemons to nodes at launch - we will
     * determine that mapping when the daemon
     * calls back. This is required because slurm does
     * its own mapping of proc-to-node, and we cannot know
     * in advance which daemon will wind up on which node
    /* if we don't want to launch (e.g., someone just wants
     * to test the mappers), then we assign vpids at "launch"
     * so the mapper has something to work with
     */
    orte_plm_globals.daemon_nodes_assigned_at_launch = false;
    if (orte_do_not_launch) {
        orte_plm_globals.daemon_nodes_assigned_at_launch = true;
    } else {
        /* we do NOT assign daemons to nodes at launch - we will
         * determine that mapping when the daemon
         * calls back. This is required because slurm does
         * its own mapping of proc-to-node, and we cannot know
         * in advance which daemon will wind up on which node
         */
        orte_plm_globals.daemon_nodes_assigned_at_launch = false;
    }

    return rc;
}
@@ -167,6 +167,13 @@ static int plm_tm_launch_job(orte_job_t *jdata)
    /* default to declaring the daemons as failed */
    failed_job = ORTE_PROC_MY_NAME->jobid;

    /* start by launching the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
@@ -175,13 +182,6 @@ static int plm_tm_launch_job(orte_job_t *jdata)
        goto launch_apps;
    }

    /* start by launching the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm: launching vm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -53,7 +53,8 @@
 * Query the registry for all nodes allocated to a specified app_context
 */
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
                                     orte_app_context_t *app, orte_mapping_policy_t policy)
                                     orte_app_context_t *app, orte_mapping_policy_t policy,
                                     bool initial_map)
{
    opal_list_item_t *item, *next;
    orte_node_t *node, *nd;
@@ -72,7 +73,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
                node->state = ORTE_NODE_STATE_UP;
            } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) {
                OBJ_RETAIN(node);
                node->mapped = false;
                if (initial_map) {
                    /* if this is the first app_context we
                     * are getting for an initial map of a job,
                     * then mark all nodes as unmapped
                     */
                    node->mapped = false;
                }
                opal_list_append(allocated_nodes, &node->super);
            }
        }
@@ -114,7 +121,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
             * destructed along the way
             */
            OBJ_RETAIN(node);
            node->mapped = false;
            if (initial_map) {
                /* if this is the first app_context we
                 * are getting for an initial map of a job,
                 * then mark all nodes as unmapped
                 */
                node->mapped = false;
            }
            /* quick sanity check */
            if (NULL == node->daemon) {
                orte_show_help("help-orte-rmaps-base.txt",
@@ -382,3 +395,140 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr

    return ORTE_SUCCESS;
}

orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
                                        orte_node_t *node,
                                        orte_app_idx_t idx)
{
    orte_proc_t *proc;
    int rc;

    proc = OBJ_NEW(orte_proc_t);
    /* set the jobid */
    proc->name.jobid = jdata->jobid;
    /* we do not set the vpid here - this will be done
     * during a second phase, but we do set the epoch here
     * since they all start with the same value.
     */
    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
    /* flag the proc as ready for launch */
    proc->state = ORTE_PROC_STATE_INIT;
    proc->app_idx = idx;

    OBJ_RETAIN(node); /* maintain accounting on object */
    proc->node = node;
    proc->nodename = node->name;
    node->num_procs++;
    if (node->slots_inuse < node->slots_alloc) {
        node->slots_inuse++;
    }
    if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return NULL;
    }
    /* retain the proc struct so that we correctly track its release */
    OBJ_RETAIN(proc);

    return proc;
}

/*
 * determine the proper starting point for the next mapping operation
 */
orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
                                                orte_job_t *jdata)
{
    opal_list_item_t *item, *cur_node_item;
    orte_node_t *node, *nd1, *ndmin;
    int overload;

    /* if a bookmark exists from some prior mapping, set us to start there */
    if (NULL != jdata->bookmark) {
        cur_node_item = NULL;
        /* find this node on the list */
        for (item = opal_list_get_first(node_list);
             item != opal_list_get_end(node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;

            if (node->index == jdata->bookmark->index) {
                cur_node_item = item;
                break;
            }
        }
        /* see if we found it - if not, just start at the beginning */
        if (NULL == cur_node_item) {
            cur_node_item = opal_list_get_first(node_list);
        }
    } else {
        /* if no bookmark, then just start at the beginning of the list */
        cur_node_item = opal_list_get_first(node_list);
    }

    opal_output(0, "INITIAL STARTING PT: %s", ((orte_node_t*)cur_node_item)->name);

    /* is this node fully subscribed? If so, then the first
     * proc we assign will oversubscribe it, so let's look
     * for another candidate
     */
    node = (orte_node_t*)cur_node_item;
    ndmin = node;
    overload = ndmin->slots_inuse - ndmin->slots_alloc;
    if (node->slots_inuse >= node->slots_alloc) {
        opal_output(0, "NODE %s IS FULL", node->name);
        /* work down the list - is there another node that
         * would not be oversubscribed?
         */
        if (cur_node_item != opal_list_get_last(node_list)) {
            item = opal_list_get_next(cur_node_item);
        } else {
            item = opal_list_get_first(node_list);
        }
        while (item != cur_node_item) {
            nd1 = (orte_node_t*)item;
            if (nd1->slots_inuse < nd1->slots_alloc) {
                /* this node is not oversubscribed! use it! */
                cur_node_item = item;
                goto process;
            }
            /* this one was also oversubscribed, keep track of the
             * node that has the least usage - if we can't
             * find anyone who isn't fully utilized, we will
             * start with the least used node
             */
            if (overload >= (nd1->slots_inuse - nd1->slots_alloc)) {
                ndmin = nd1;
                overload = ndmin->slots_inuse - ndmin->slots_alloc;
            }
            if (item == opal_list_get_last(node_list)) {
                item = opal_list_get_first(node_list);
            } else {
                item= opal_list_get_next(item);
            }
        }
        /* if we get here, then we cycled all the way around the
         * list without finding a better answer - just use the node
         * that is minimally overloaded
         */
        cur_node_item = (opal_list_item_t*)ndmin;
    }

 process:
    node = (orte_node_t*)cur_node_item;
    opal_dss.dump(0, node, ORTE_NODE);

    /* make life easier - put the bookmark at the top of the list,
     * shifting everything above it to the end of the list while
     * preserving order
     */
    while (cur_node_item != (item = opal_list_get_first(node_list))) {
        opal_output(0, "ROTATING NODE:");
        node = (orte_node_t*)item;
        opal_dss.dump(0, node, ORTE_NODE);
        opal_list_remove_item(node_list, item);
        opal_list_append(node_list, item);
    }

    return (orte_node_t*)cur_node_item;
}
@@ -52,14 +52,15 @@ ORTE_DECLSPEC orte_job_map_t* orte_rmaps_base_get_job_map(orte_jobid_t job);
ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
                                                   orte_std_cntr_t *total_num_slots,
                                                   orte_app_context_t *app,
                                                   orte_mapping_policy_t policy);
ORTE_DECLSPEC int orte_rmaps_base_get_target_procs(opal_list_t *procs);
                                                   orte_mapping_policy_t policy,
                                                   bool initial_map);

ORTE_DECLSPEC int orte_rmaps_base_update_node_usage(opal_list_t *nodes);
ORTE_DECLSPEC int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_list,
                                                     orte_app_context_t *app,
                                                     opal_list_t *master_node_list,
                                                     orte_std_cntr_t *total_num_slots);
ORTE_DECLSPEC orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
                                                      orte_node_t *node,
                                                      orte_app_idx_t idx);

ORTE_DECLSPEC orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list,
                                                              orte_job_t *jdata);

ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
@@ -35,10 +35,6 @@ orte_rmaps_base_module_t orte_rmaps_ppr_module = {
    ppr_mapper
};

static orte_proc_t* setup_proc(orte_job_t *jdata,
                               orte_node_t *node,
                               orte_app_idx_t idx);

#if OPAL_HAVE_HWLOC
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
@@ -71,6 +67,7 @@ static int ppr_mapper(orte_job_t *jdata)
    char **ppr_req, **ck;
    size_t len;
    bool pruning_reqd = false;
    bool initial_map=true;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
@@ -233,11 +230,16 @@ static int ppr_mapper(orte_job_t *jdata)
        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping))) {
                                                                  jdata->map->mapping, initial_map))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* cycle across the nodes */
        nprocs_mapped = 0;
        while (NULL != (node = (orte_node_t*)opal_list_remove_first(&node_list))) {
@@ -250,13 +252,16 @@ static int ppr_mapper(orte_job_t *jdata)
                goto error;
            }
#endif
            /* add the node to the map */
            if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            /* add the node to the map, if needed */
            if (!node->mapped) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
                node->mapped = true;
                OBJ_RETAIN(node); /* maintain accounting on object */
                jdata->map->num_nodes++;
            }
            OBJ_RETAIN(node); /* maintain accounting on object */
            jdata->map->num_nodes++;
            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
@@ -265,7 +270,7 @@ static int ppr_mapper(orte_job_t *jdata)
                obj = hwloc_get_root_obj(node->topology);
#endif
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = setup_proc(jdata, node, idx))) {
                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
@@ -289,7 +294,7 @@ static int ppr_mapper(orte_job_t *jdata)
                                               lowest, cache_level,
                                               i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = setup_proc(jdata, node, idx))) {
                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
@@ -309,17 +314,19 @@ static int ppr_mapper(orte_job_t *jdata)
#endif
            }

            /* set the total slots used to the number of procs placed
             * on this node
             */
            node->slots_inuse = node->num_procs;
            /* set the total slots used */
            if ((int)node->num_procs <= node->slots) {
                node->slots_inuse = (int)node->num_procs;
            } else {
                node->slots_inuse = node->slots;
            }

            /* if no-oversubscribe was specified, check to see if
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it!
             */
            if ((node->slots < node->slots_inuse) ||
                (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
@@ -332,9 +339,8 @@ static int ppr_mapper(orte_job_t *jdata)
                node->oversubscribed = true;
            }

            /* update the number of procs in the job and the app */
            /* update the number of procs in the job */
            jdata->num_procs += node->num_procs;
            app->num_procs = node->num_procs;

            /* if we haven't mapped all the procs, continue on to the
             * next node
@@ -343,6 +349,9 @@ static int ppr_mapper(orte_job_t *jdata)
                break;
            }
        }
        if (0 == app->num_procs) {
            app->num_procs = nprocs_mapped;
        }
        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
@@ -541,6 +550,10 @@ static void prune(orte_jobid_t jobid,
                       idxmax);
        opal_pointer_array_set_item(node->procs, idxmax, NULL);
        node->num_procs--;
        node->slots_inuse--;
        if (node->slots_inuse < 0) {
            node->slots_inuse = 0;
        }
        nprocs--;
        *nmapped -= 1;
        OBJ_RELEASE(procmax);
@@ -558,36 +571,3 @@ static void prune(orte_jobid_t jobid,
    opal_output(0, "INFINITE LOOP");
}
#endif

static orte_proc_t* setup_proc(orte_job_t *jdata,
                               orte_node_t *node,
                               orte_app_idx_t idx)
{
    orte_proc_t *proc;
    int rc;

    proc = OBJ_NEW(orte_proc_t);
    /* set the jobid */
    proc->name.jobid = jdata->jobid;
    /* we do not set the vpid here - this will be done
     * during a second phase, but we do set the epoch here
       since they all start with the same value. */
    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
    /* flag the proc as ready for launch */
    proc->state = ORTE_PROC_STATE_INIT;
    proc->app_idx = idx;

    OBJ_RETAIN(node); /* maintain accounting on object */
    proc->node = node;
    proc->nodename = node->name;
    node->num_procs++;
    if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return NULL;
    }
    /* retain the proc struct so that we correctly track its release */
    OBJ_RETAIN(proc);

    return proc;
}
@@ -79,6 +79,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
    orte_proc_t *proc;
    mca_base_component_t *c = &mca_rmaps_rank_file_component.super.base_version;
    char *slots;
    bool initial_map=true;

    /* only handle initial launch of rf job */
    if (ORTE_JOB_STATE_INIT != jdata->state) {
@@ -158,12 +159,14 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
     * option
     */
    if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                              map->mapping))) {
                                                              map->mapping, initial_map))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }
    num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);

    /* flag that all subsequent requests should not reset the node->mapped flag */
    initial_map = false;

    /* we already checked for sanity, so it's okay to just do here */
    if (0 == app->num_procs) {
        /** set the num_procs to equal the number of slots on these mapped nodes */
@@ -240,21 +243,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
            opal_pointer_array_add(map->nodes, node);
            node->mapped = true;
        }
        proc = OBJ_NEW(orte_proc_t);
        /* set the jobid */
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = rank;
        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
        /* flag the proc as ready for launch */
        proc->state = ORTE_PROC_STATE_INIT;
        proc->app_idx = i;

        OBJ_RETAIN(node); /* maintain accounting on object */
        proc->node = node;
        proc->nodename = node->name;
        node->num_procs++;
        if ((node->slots < node->slots_inuse) ||
            (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
        proc = orte_rmaps_base_setup_proc(jdata, node, i);
        if ((node->slots < (int)node->num_procs) ||
            (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
            if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                               true, node->num_procs, app->app);
@@ -266,13 +257,8 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
             */
            node->oversubscribed = true;
        }
        if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(proc);
            return rc;
        }
        /* retain the proc struct so that we correctly track its release */
        OBJ_RETAIN(proc);
        /* set the vpid */
        proc->name.vpid = rank;

#if OPAL_HAVE_HWLOC
        if (NULL != slots) {
@@ -173,7 +173,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
                                                                   &num_slots,
                                                                   app,
                                                                   jdata->map->mapping))) {
                                                                   jdata->map->mapping,
                                                                   false))) {
            ORTE_ERROR_LOG(rc);
            while (NULL != (item = opal_list_remove_first(&node_list))) {
                OBJ_RELEASE(item);
@@ -476,7 +477,8 @@ static int get_new_node(orte_proc_t *proc,
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
                                                               &num_slots,
                                                               app,
                                                               map->mapping))) {
                                                               map->mapping,
                                                               false))) {
        ORTE_ERROR_LOG(rc);
        goto release;
    }
@@ -687,6 +689,7 @@ static int map_to_ftgrps(orte_job_t *jdata)
    orte_rmaps_res_ftgrp_t *ftgrp, *target = NULL;
    orte_vpid_t totprocs, num_assigned;
    orte_proc_t *proc;
    bool initial_map=true;

    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
                         "%s rmaps:resilient: creating initial map for job %s",
@@ -718,10 +721,13 @@ static int map_to_ftgrps(orte_job_t *jdata)
         */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                   map->mapping))) {
                                                                   map->mapping, initial_map))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* remove all nodes that are not "up" or do not have a running daemon on them */
        item = opal_list_get_first(&node_list);
        while (item != opal_list_get_end(&node_list)) {
@@ -821,16 +827,9 @@ static int map_to_ftgrps(orte_job_t *jdata)
                opal_pointer_array_add(map->nodes, nd);
                nd->mapped = true;
            }
            proc = OBJ_NEW(orte_proc_t);
            /* set the jobid */
            proc->name.jobid = jdata->jobid;
            proc->app_idx = app->idx;
            OBJ_RETAIN(node); /* maintain accounting on object */
            proc->node = nd;
            proc->nodename = nd->name;
            nd->num_procs++;
            if ((nd->slots < nd->slots_inuse) ||
                (0 < nd->slots_max && nd->slots_max < nd->slots_inuse)) {
            proc = orte_rmaps_base_setup_proc(jdata, node, app->idx);
            if ((nd->slots < (int)nd->num_procs) ||
                (0 < nd->slots_max && nd->slots_max < (int)nd->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, nd->num_procs, app->app);
@@ -841,12 +840,6 @@ static int map_to_ftgrps(orte_job_t *jdata)
                 */
                nd->oversubscribed = true;
            }
            opal_pointer_array_add(nd->procs, (void*)proc);
            /* retain the proc struct so that we correctly track its release */
            OBJ_RETAIN(proc);

            /* flag the proc as ready for launch */
            proc->state = ORTE_PROC_STATE_INIT;

            /* track number of procs mapped */
            num_assigned++;
@@ -39,8 +39,6 @@
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_rr.h"

static orte_node_t* get_starting_point(opal_list_t *node_list, orte_job_t *jdata);

/*
 * Create a round-robin mapping for the job.
 */
@@ -53,6 +51,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
    orte_std_cntr_t num_nodes, num_slots;
    int rc;
    mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version;
    bool initial_map=true;

    /* this mapper can only handle initial launch
     * when rr mapping is desired - allow
@@ -119,14 +118,16 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping))) {
                                                                  jdata->map->mapping, initial_map))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = get_starting_point(&node_list, jdata);
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        if (0 == app->num_procs) {
            /* set the num_procs to equal the number of slots on these mapped nodes */
@@ -208,85 +209,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
    return rc;
}

/*
 * determine the proper starting point for the next mapping operation
 */
static orte_node_t* get_starting_point(opal_list_t *node_list, orte_job_t *jdata)
{
    opal_list_item_t *item, *cur_node_item;
    orte_node_t *node, *nd1, *ndmin;
    int overload;

    /* if a bookmark exists from some prior mapping, set us to start there */
    if (NULL != jdata->bookmark) {
        cur_node_item = NULL;
        /* find this node on the list */
        for (item = opal_list_get_first(node_list);
             item != opal_list_get_end(node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;

            if (node->index == jdata->bookmark->index) {
                cur_node_item = item;
                break;
            }
        }
        /* see if we found it - if not, just start at the beginning */
        if (NULL == cur_node_item) {
            cur_node_item = opal_list_get_first(node_list);
        }
    } else {
        /* if no bookmark, then just start at the beginning of the list */
        cur_node_item = opal_list_get_first(node_list);
    }

    /* is this node fully subscribed? If so, then the first
     * proc we assign will oversubscribe it, so let's look
     * for another candidate
     */
    node = (orte_node_t*)cur_node_item;
    ndmin = node;
    overload = ndmin->slots_inuse - ndmin->slots_alloc;
    if (node->slots_inuse >= node->slots_alloc) {
        /* work down the list - is there another node that
         * would not be oversubscribed?
         */
        if (cur_node_item != opal_list_get_last(node_list)) {
            item = opal_list_get_next(cur_node_item);
        } else {
            item = opal_list_get_first(node_list);
        }
        while (item != cur_node_item) {
            nd1 = (orte_node_t*)item;
            if (nd1->slots_inuse < nd1->slots_alloc) {
                /* this node is not oversubscribed! use it! */
                return (orte_node_t*)item;
            }
            /* this one was also oversubscribed, keep track of the
             * node that has the least usage - if we can't
             * find anyone who isn't fully utilized, we will
             * start with the least used node
             */
            if (overload >= (nd1->slots_inuse - nd1->slots_alloc)) {
                ndmin = nd1;
                overload = ndmin->slots_inuse - ndmin->slots_alloc;
            }
            if (item == opal_list_get_last(node_list)) {
                item = opal_list_get_first(node_list);
            } else {
                item= opal_list_get_next(item);
            }
        }
        /* if we get here, then we cycled all the way around the
         * list without finding a better answer - just use the node
         * that is minimally overloaded
         */
        cur_node_item = (opal_list_item_t*)ndmin;
    }

    return (orte_node_t*)cur_node_item;
}

orte_rmaps_base_module_t orte_rmaps_round_robin_module = {
    orte_rmaps_rr_map
};
@@ -34,10 +34,6 @@
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_rr.h"

static orte_proc_t* setup_proc(orte_job_t *jdata,
                               orte_node_t *node,
                               orte_app_idx_t idx);

int orte_rmaps_rr_byslot(orte_job_t *jdata,
                         orte_app_context_t *app,
                         opal_list_t *node_list,
@@ -89,6 +85,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
    nprocs_mapped = 0;
    while (NULL != (item = opal_list_remove_first(node_list))) {
        node = (orte_node_t*)item;
        opal_output(0, "MAPPING TO %s", node->name);
#if OPAL_HAVE_HWLOC
        /* get the root object as we are not assigning
         * locale except at the node level
@@ -111,30 +108,34 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
             */
            node->oversubscribed = true;
        }
        if (0 == node->slots_alloc) {
        if (0 == (node->slots_alloc - node->slots_inuse)) {
            num_procs_to_assign = 1 + extra_procs_to_assign;
        } else {
            num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
            num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
        }
        for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
            if (0 == i) {
                /* add this node to the map - do it only once */
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                if (!node->mapped) {
                    if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    node->mapped = true;
                    OBJ_RETAIN(node); /* maintain accounting on object */
                    ++(jdata->map->num_nodes);
                }
                OBJ_RETAIN(node); /* maintain accounting on object */
                ++(jdata->map->num_nodes);
            }
            if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
            if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            nprocs_mapped++;
#if OPAL_HAVE_HWLOC
            proc->locale = obj;
#endif
        }
        jdata->bookmark = node;
        /* keep track of the node we last used */
        jdata->bookmark = node;
    }
    /* release the node - the object will persist */
    OBJ_RELEASE(node);
}
@@ -221,13 +222,16 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
            obj = hwloc_get_root_obj(node->topology);
        }
#endif
        /* add this node to the map */
        if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
            ORTE_ERROR_LOG(idx);
            return idx;
        /* add this node to the map, but only do so once */
        if (!node->mapped) {
            if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                ORTE_ERROR_LOG(idx);
                return idx;
            }
            node->mapped = true;
            OBJ_RETAIN(node); /* maintain accounting on object */
            ++(jdata->map->num_nodes);
        }
        OBJ_RETAIN(node); /* maintain accounting on object */
        ++(jdata->map->num_nodes);
        /* compute the number of procs to go on this node */
        if (add_one) {
            if (0 == nxtra_nodes) {
@@ -251,7 +255,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
         * have to track how many procs to "shift" elsewhere
         * to make up the difference
         */
        if (0 == node->slots_alloc) {
        if (0 == (node->slots_alloc - node->slots_inuse)) {
            /* if there are no extras to take, then we can
             * safely remove this node as we don't need it
             */
@@ -269,15 +273,15 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
            lag += navg;
        } else {
            /* if slots_alloc < avg, then take all */
            if (node->slots_alloc < navg) {
                num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
            if ((node->slots_alloc - node->slots_inuse) < navg) {
                num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
                /* update how many we are lagging behind */
                lag += navg - node->slots_alloc;
                lag += navg - (node->slots_alloc - node->slots_inuse);
            } else {
                /* take the avg plus as much of the "lag" as we can */
                delta = 0;
                if (0 < lag) {
                    delta = node->slots_alloc - navg;
                    delta = (node->slots_alloc - node->slots_inuse) - navg;
                    if (lag < delta) {
                        delta = lag;
                    }
@@ -288,15 +292,16 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
            }
        }
        for (j=0; j < num_procs_to_assign && nprocs_mapped < app->num_procs; j++) {
            if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
            if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            nprocs_mapped++;
#if OPAL_HAVE_HWLOC
            proc->locale = obj;
#endif
            /* keep track of the node we last used */
            jdata->bookmark = node;
        }
        jdata->bookmark = node;
        /* maintain acctg */
        OBJ_RELEASE(node);
        if (nprocs_mapped == app->num_procs) {
@@ -427,11 +432,11 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
                --nxtra_nodes;
            }
        }
        if (0 == node->slots_alloc) {
        if (0 == (node->slots_alloc - node->slots_inuse)) {
            /* everybody takes at least the extras */
            num_procs_to_assign = extra_procs_to_assign;
        } else {
            num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
            num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
        }

        /* get the number of objects of this type on this node */
@@ -463,14 +468,15 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
                nprocs = nperobj;
            }
            for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
                if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
                if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                    return ORTE_ERR_OUT_OF_RESOURCE;
                }
                nprocs_mapped++;
                proc->locale = obj;
            }
            /* keep track of the node we last used */
            jdata->bookmark = node;
        }
        jdata->bookmark = node;
        /* maintain acctg */
        OBJ_RELEASE(node);
        if (nprocs_mapped == app->num_procs) {
@@ -591,7 +597,7 @@ static int byobj_span(orte_job_t *jdata,
         * have to track how many procs to "shift" elsewhere
         * to make up the difference
         */
        if (0 == node->slots_alloc) {
        if (0 == (node->slots_alloc - node->slots_inuse)) {
            /* if there are no extras to take, then we can
             * safely remove this node as we don't need it
             */
@@ -609,15 +615,15 @@ static int byobj_span(orte_job_t *jdata,
            lag += navg;
        } else {
            /* if slots_alloc < avg, then take all */
            if (node->slots_alloc < navg) {
                num_procs_to_assign = node->slots_alloc + extra_procs_to_assign;
            if ((node->slots_alloc - node->slots_inuse) < navg) {
                num_procs_to_assign = (node->slots_alloc - node->slots_inuse) + extra_procs_to_assign;
                /* update how many we are lagging behind */
                lag += navg - node->slots_alloc;
                lag += navg - (node->slots_alloc - node->slots_inuse);
            } else {
                /* take the avg plus as much of the "lag" as we can */
                delta = 0;
                if (0 < lag) {
                    delta = node->slots_alloc - navg;
                    delta = (node->slots_alloc - node->slots_inuse) - navg;
                    if (lag < delta) {
                        delta = lag;
                    }
@@ -657,14 +663,15 @@ static int byobj_span(orte_job_t *jdata,
                nprocs = nperobj;
            }
            for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
                if (NULL == (proc = setup_proc(jdata, node, app->idx))) {
                if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                    return ORTE_ERR_OUT_OF_RESOURCE;
                }
                nprocs_mapped++;
                proc->locale = obj;
            }
            /* keep track of the node we last used */
            jdata->bookmark = node;
        }
        jdata->bookmark = node;
        /* maintain acctg */
        OBJ_RELEASE(node);
        if (nprocs_mapped == app->num_procs) {
@@ -677,36 +684,3 @@ static int byobj_span(orte_job_t *jdata,
    }
}
#endif

static orte_proc_t* setup_proc(orte_job_t *jdata,
                               orte_node_t *node,
                               orte_app_idx_t idx)
{
    orte_proc_t *proc;
    int rc;

    proc = OBJ_NEW(orte_proc_t);
    /* set the jobid */
    proc->name.jobid = jdata->jobid;
    /* we do not set the vpid here - this will be done
     * during a second phase, but we do set the epoch here
     * since they all start with the same value.
     */
    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
    /* flag the proc as ready for launch */
    proc->state = ORTE_PROC_STATE_INIT;
    proc->app_idx = idx;

    OBJ_RETAIN(node); /* maintain accounting on object */
    proc->node = node;
    proc->nodename = node->name;
    node->num_procs++;
    if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return NULL;
    }
    /* retain the proc struct so that we correctly track its release */
    OBJ_RETAIN(proc);

    return proc;
}
@@ -222,21 +222,9 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
                opal_pointer_array_add(map->nodes, node);
                node->mapped = true;
            }
            proc = OBJ_NEW(orte_proc_t);
            /* set the jobid */
            proc->name.jobid = jdata->jobid;
            proc->name.vpid = vpid++;
            ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
            /* flag the proc as ready for launch */
            proc->state = ORTE_PROC_STATE_INIT;
            proc->app_idx = i;

            OBJ_RETAIN(node); /* maintain accounting on object */
            proc->node = node;
            proc->nodename = node->name;
            node->num_procs++;
            if ((node->slots < node->slots_inuse) ||
                (0 < node->slots_max && node->slots_max < node->slots_inuse)) {
            proc = orte_rmaps_base_setup_proc(jdata, node, i);
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
@@ -248,13 +236,8 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
                 */
                node->oversubscribed = true;
            }
            if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(proc);
                return rc;
            }
            /* retain the proc struct so that we correctly track its release */
            OBJ_RETAIN(proc);
            /* assign the vpid */
            proc->name.vpid = vpid++;

#if OPAL_HAVE_HWLOC
            /* assign the locale - okay for the topo to be null as
@@ -494,13 +494,13 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
    if (!orte_devel_level_output) {
        /* just print a very simple output for users */
#if ORTE_ENABLE_EPOCH
        asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: %s", pfx2,
                 ORTE_JOBID_PRINT(src->name.jobid),
        asprintf(&tmp, "\n%sProcess OMPI jobid: %s App: %ld Process rank: %s Epoch: %s", pfx2,
                 ORTE_JOBID_PRINT(src->name.jobid), (long)src->app_idx,
                 ORTE_VPID_PRINT(src->name.vpid),
                 ORTE_EPOCH_PRINT(src->name.epoch));
#else
        asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s", pfx2,
                 ORTE_JOBID_PRINT(src->name.jobid),
        asprintf(&tmp, "\n%sProcess OMPI jobid: %s App: %ld Process rank: %s", pfx2,
                 ORTE_JOBID_PRINT(src->name.jobid), (long)src->app_idx,
                 ORTE_VPID_PRINT(src->name.vpid));
#endif