Ensure that the mapper accurately reports the oversubscribed condition of each node, and that the result is communicated to the backend orteds and used when setting sched_yield on local procs. This restores prior behavior that was lost somewhere along the way.
Includes a patch from Damien Guinier to fix vpid assignments when cpus-per-task is specified. This commit was SVN r24126.
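For context, the flag matters because the orted consults it when building each local proc's environment: on an oversubscribed node the MPI layer should yield the processor when idle rather than spin aggressively. A minimal sketch of that use, assuming the usual yield-when-idle MCA parameter (the helper name is hypothetical, not part of this commit):

#include "opal/util/opal_environ.h"

/* Hypothetical helper: push the yield policy into a child's
 * environment before fork, based on the node's oversubscribed
 * state as reported by the mapper. */
static void set_yield_policy(char ***env, bool oversubscribed)
{
    opal_setenv("OMPI_MCA_mpi_yield_when_idle",
                oversubscribed ? "1" : "0", true, env);
}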
parent 828404bdad
commit 30c37ea536
@@ -608,7 +608,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
     char **slot_str=NULL;
     orte_jobid_t debugger;
     bool add_child;

     OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
                          "%s odls:constructing child list",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -1414,7 +1414,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
     int inm;
     opal_event_t *delay;
     int num_procs_alive;
+    orte_nid_t *nid;
+    orte_node_t *node;

     /* protect operations involving the global list of children */
     OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
@@ -1453,7 +1455,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
     if (NULL == jobdat) {
         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
         rc = ORTE_ERR_NOT_FOUND;
-        goto CLEANUP;
+        goto GETOUT;
     }

     /* do we have any local procs to launch? */
@@ -1466,6 +1468,29 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
     apps = jobdat->apps;
     num_apps = jobdat->num_apps;

+    /* see if the mapper thinks we are oversubscribed */
+    oversubscribed = false;
+    if (ORTE_PROC_IS_HNP) {
+        /* just fake it - we don't keep a local nidmap */
+        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
+            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+            rc = ORTE_ERR_NOT_FOUND;
+            goto CLEANUP;
+        }
+        if (node->oversubscribed) {
+            oversubscribed = true;
+        }
+    } else {
+        if (NULL == (nid = orte_util_lookup_nid(ORTE_PROC_MY_NAME))) {
+            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+            rc = ORTE_ERR_NOT_FOUND;
+            goto CLEANUP;
+        }
+        if (nid->oversubscribed) {
+            oversubscribed = true;
+        }
+    }
+
 #if OPAL_ENABLE_FT_CR == 1
     /*
      * Notify the local SnapC component regarding new job
@@ -1496,30 +1521,41 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
     orte_sstore.wait_all_deps();
 #endif

-    if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
-        /* if we cannot find the number of local processors, we have no choice
-         * but to default to conservative settings
-         */
-        oversubscribed = true;
-    } else {
-        if (num_procs_alive > num_processors) {
-            /* if the #procs > #processors, declare us oversubscribed. This
-             * covers the case where the user didn't tell us anything about the
-             * number of available slots, so we defaulted to a value of 1
-             */
-            oversubscribed = true;
-        } else {
-            /* otherwise, declare us to not be oversubscribed so we can be aggressive */
-            oversubscribed = false;
-        }
-    }
-
-    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
-                         "%s odls:launch found %d processors for %d children and set oversubscribed to %s",
-                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                         (ORTE_SUCCESS == rc) ? num_processors : -1, (int)opal_list_get_size(&orte_local_children),
-                         oversubscribed ? "true" : "false"));
+    /* if the mapper says we are oversubscribed, then we trust it */
+    if (oversubscribed) {
+        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                             "%s odls:launch mapper declares this node oversubscribed",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+    } else {
+        /* if the mapper thinks we are not oversubscribed, then we
+         * do a final smoke test by checking against the #processors. This
+         * is done solely in case the mapper had incorrect knowledge of
+         * the #local processors
+         */
+        if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
+            /* if we cannot find the number of local processors, we have no choice
+             * but to default to conservative settings
+             */
+            oversubscribed = true;
+        } else {
+            if (num_procs_alive > num_processors) {
+                /* if the #procs > #processors, declare us oversubscribed. This
+                 * covers the case where the user didn't tell us anything about the
+                 * number of available slots, so we defaulted to a value of 1
+                 */
+                oversubscribed = true;
+            } else {
+                /* otherwise, declare us to not be oversubscribed so we can be aggressive */
+                oversubscribed = false;
+            }
+        }
+        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                             "%s odls:launch found %d processors for %d children and set oversubscribed to %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             (ORTE_SUCCESS == rc) ? num_processors : -1, (int)opal_list_get_size(&orte_local_children),
+                             oversubscribed ? "true" : "false"));
+    }

     /* setup to report the proc state to the HNP */
     OBJ_CONSTRUCT(&alert, opal_buffer_t);
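In short, the paffinity check is now only a fallback: the mapper's verdict wins outright, and the processor-count comparison runs only when the mapper reported no oversubscription. A standalone sketch of the resulting policy (hypothetical helper, not code from the commit):

/* Illustrative summary of the new decision. mapper_flag comes from the
 * nodemap; paffinity_ok indicates whether the local processor count
 * could be determined. */
static bool decide_oversubscribed(bool mapper_flag, bool paffinity_ok,
                                  int num_procs_alive, int num_processors)
{
    if (mapper_flag) {
        return true;                          /* trust the mapper */
    }
    if (!paffinity_ok) {
        return true;                          /* unknown #cpus: be conservative */
    }
    return num_procs_alive > num_processors;  /* final smoke test */
}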
@@ -1975,6 +2011,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
         }
     }

+ GETOUT:
     opal_condition_signal(&orte_odls_globals.cond);
     OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
     return rc;
@@ -192,7 +192,8 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
         if (0 == node->slots_alloc) {
             num_procs_to_assign = 1;
         } else {
-            num_possible_procs = node->slots_alloc;
+            /* 'num_possible_procs' defines the number of ranks */
+            num_possible_procs = node->slots_alloc / jdata->map->cpus_per_rank;
             if (0 == num_possible_procs) {
                 num_procs_to_assign = 1;
             } else {
@@ -200,7 +201,11 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
             }
         }
     } else {
-        num_possible_procs = (node->slots_alloc - node->slots_inuse);
+        /* 'num_possible_procs' defines the number of ranks on the node. Each
+         * rank occupies one slot. Each slot may represent more than one
+         * cpu, depending on the cpus-per-task setting
+         */
+        num_possible_procs = (node->slots_alloc - node->slots_inuse) / jdata->map->cpus_per_rank;
         if (0 == num_possible_procs) {
             num_procs_to_assign = 1;
         } else {
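The effect of Damien's fix is easiest to see with numbers: dividing the free slots by cpus_per_rank keeps the byslot mapper from over-assigning ranks when each rank consumes several cpus. A worked example with illustrative values (not taken from the commit):

/* A node with 8 allocated slots, 2 already in use, and
 * cpus-per-task = 2 now yields 3 more ranks instead of 6. */
int32_t slots_alloc = 8, slots_inuse = 2;
int16_t cpus_per_rank = 2;
int num_possible_procs = (slots_alloc - slots_inuse) / cpus_per_rank;  /* = 3 */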
@@ -364,6 +364,9 @@ PROCESS:
     /* retain the proc struct so that we correctly track its release */
     OBJ_RETAIN(proc);
     ++node->num_procs;
+
+    /* update the oversubscribed state of the node */
+    node->oversubscribed = oversubscribed;

     return ORTE_SUCCESS;
 }
@@ -423,7 +426,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
                          ORTE_JOBID_PRINT(jdata->jobid), current_node->name));

     /* Be sure to demarcate the slots for this proc as claimed from the node */
-    current_node->slots_inuse += 1;
+    current_node->slots_inuse += cpus_per_rank;

     /* see if this node is oversubscribed now */
     if (current_node->slots_inuse > current_node->slots) {
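Charging cpus_per_rank slots per claimed proc also makes the oversubscription test above trip at the right point. A sketch under assumed values (illustrative, not code from the commit):

/* With 4 slots and cpus-per-task = 2, the node becomes oversubscribed
 * on the third claimed rank (3 * 2 = 6 > 4), not the fifth. */
int slots = 4, slots_inuse = 0, cpus_per_rank = 2;
for (int rank = 1; rank <= 3; rank++) {
    slots_inuse += cpus_per_rank;
    if (slots_inuse > slots) {
        /* rank 3 triggers the oversubscribed flag */
    }
}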
@@ -46,6 +46,7 @@ struct orte_job_map_t {
     int npersocket;
     int16_t cpus_per_rank;
     int16_t stride;
+    /* are we allowed to oversubscribe the nodes in this job */
     bool oversubscribe;
     bool display_map;
     bool cpu_lists;
@@ -384,8 +384,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
     free(tmp);
     tmp = tmp2;

-    asprintf(&tmp2, "%s\n%s\tNum slots: %ld\tSlots in use: %ld", tmp, pfx2,
-             (long)src->slots, (long)src->slots_inuse);
+    asprintf(&tmp2, "%s\n%s\tNum slots: %ld\tSlots in use: %ld\tOversubscribed: %s", tmp, pfx2,
+             (long)src->slots, (long)src->slots_inuse,
+             (src->oversubscribed) ? "TRUE" : "FALSE");
     free(tmp);
     tmp = tmp2;
@@ -516,6 +516,8 @@ typedef struct {
     char *name;
     /* vpid of this job family's daemon on this node */
     orte_vpid_t daemon;
+    /* whether or not this node is oversubscribed */
+    bool oversubscribed;
     /* list of interface attributes */
     opal_list_t attrs;
     /* list of system info */
@@ -299,6 +299,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
     char *nodename;
     opal_buffer_t buf;
     char *ptr;
+    uint8_t *oversub=NULL;

     /* setup a buffer for tmp use */
     OBJ_CONSTRUCT(&buf, opal_buffer_t);
@@ -357,8 +358,9 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
      * match their node array index
      */

-    /* allocate space for the daemon vpids */
+    /* allocate space for the daemon vpids and oversubscribed flags */
     vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
+    oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
     for (i=0; i < orte_node_pool->size; i++) {
         if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
             continue;
@@ -369,12 +371,18 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
             continue;
         }
         vpids[i] = node->daemon->name.vpid;
+        oversub[i] = node->oversubscribed;
     }
     if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
     free(vpids);
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, oversub, num_nodes, OPAL_UINT8))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    free(oversub);

     /* check if we are to send the profile file data */
     if (orte_send_profile) {
@@ -426,6 +434,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
     opal_buffer_t buf;
     opal_byte_object_t *boptr;
     int rc;
+    uint8_t *oversub;

     OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                          "%s decode:nidmap decoding nodemap",
@@ -490,6 +499,15 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
         ORTE_ERROR_LOG(rc);
         return rc;
     }
+
+    /* unpack the oversubscribed flags */
+    oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
+    n=num_nodes;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, oversub, &n, OPAL_UINT8))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
     /* transfer the data to the nidmap, counting the number of
      * daemons in the system
      */
@@ -497,13 +515,19 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
     for (i=0; i < num_nodes; i++) {
         if (NULL != (ndptr = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
             ndptr->daemon = vpids[i];
+            if (0 == oversub[i]) {
+                ndptr->oversubscribed = false;
+            } else {
+                ndptr->oversubscribed = true;
+            }
             if (ORTE_VPID_INVALID != vpids[i]) {
                 ++num_daemons;
             }
         }
     }
     free(vpids);
+    free(oversub);

     /* if we are a daemon or the HNP, update our num_procs */
     if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
         orte_process_info.num_procs = num_daemons;
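The encode and decode sides must stay symmetric: the flags travel as a parallel uint8_t array packed immediately after the daemon vpids, one byte per node, so they must be unpacked in the same order. A minimal round-trip sketch (assumes a populated orte_node_pool; illustrative usage only, not code from the commit):

/* Encode the nodemap on the HNP, then decode it as a daemon would,
 * which now also fills in each nid's oversubscribed flag. */
opal_byte_object_t bo;
int rc;
if (ORTE_SUCCESS == (rc = orte_util_encode_nodemap(&bo))) {
    rc = orte_util_decode_nodemap(&bo);   /* populates orte_nidmap */
    free(bo.bytes);
}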