
Ensure that the oversubscribed condition of nodes is accurately reported by the mapper, and that the results are communicated to the backend orteds and used when setting sched_yield on local procs. Restores prior behavior that was somehow lost along the way.

Includes a patch from Damien Guinier to fix vpid assignments when cpus-per-task is specified.

This commit was SVN r24126.
Ralph Castain 2010-12-01 12:51:39 +00:00
parent 828404bdad
commit 30c37ea536
7 changed files with 100 additions and 27 deletions
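The flag whose plumbing this commit restores ultimately decides whether local procs run with sched_yield enabled. As a minimal sketch of how the backend orted would typically hand that decision to a child, assuming the conventional OMPI_MCA_<param> environment mapping (the parameter name below is an assumption, not shown in this commit):

#include <stdbool.h>
#include <stdlib.h>

/* Sketch only: export the yield decision into a child's environment.
 * The MCA parameter name is assumed, not taken from this diff. */
static void set_yield_for_child(bool oversubscribed)
{
    /* on an oversubscribed node, idle progress loops should yield */
    setenv("OMPI_MCA_mpi_yield_when_idle",
           oversubscribed ? "1" : "0", /* overwrite = */ 1);
}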

View File

@@ -608,7 +608,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
char **slot_str=NULL;
orte_jobid_t debugger;
bool add_child;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:constructing child list",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -1414,7 +1414,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
int inm;
opal_event_t *delay;
int num_procs_alive;
orte_nid_t *nid;
orte_node_t *node;
/* protect operations involving the global list of children */
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
@@ -1453,7 +1455,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
if (NULL == jobdat) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto CLEANUP;
goto GETOUT;
}
/* do we have any local procs to launch? */
@@ -1466,6 +1468,29 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
apps = jobdat->apps;
num_apps = jobdat->num_apps;
/* see if the mapper thinks we are oversubscribed */
oversubscribed = false;
if (ORTE_PROC_IS_HNP) {
/* just fake it - we don't keep a local nidmap */
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto CLEANUP;
}
if (node->oversubscribed) {
oversubscribed = true;
}
} else {
if (NULL == (nid = orte_util_lookup_nid(ORTE_PROC_MY_NAME))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto CLEANUP;
}
if (nid->oversubscribed) {
oversubscribed = true;
}
}
#if OPAL_ENABLE_FT_CR == 1
/*
* Notify the local SnapC component regarding new job
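The two branches in the hunk above read the same fact from different stores: the HNP consults the global node pool directly (it keeps no local nidmap, hence the "just fake it" comment), while everyone else goes through the nidmap decoded in orte_util_decode_nodemap below. One possible consolidation, a sketch using only the calls visible in this hunk:

/* Sketch: unify the two lookups behind a single helper. */
static int local_node_is_oversubscribed(bool *oversubscribed)
{
    if (ORTE_PROC_IS_HNP) {
        orte_node_t *node =
            (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        if (NULL == node) {
            return ORTE_ERR_NOT_FOUND;
        }
        *oversubscribed = node->oversubscribed;
    } else {
        orte_nid_t *nid = orte_util_lookup_nid(ORTE_PROC_MY_NAME);
        if (NULL == nid) {
            return ORTE_ERR_NOT_FOUND;
        }
        *oversubscribed = nid->oversubscribed;
    }
    return ORTE_SUCCESS;
}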
@@ -1496,30 +1521,41 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
orte_sstore.wait_all_deps();
#endif
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
/* if we cannot find the number of local processors, we have no choice
* but to default to conservative settings
*/
oversubscribed = true;
/* if the mapper says we are oversubscribed, then we trust it */
if (oversubscribed) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch mapper declares this node oversubscribed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else {
if (num_procs_alive > num_processors) {
/* if the #procs > #processors, declare us oversubscribed. This
* covers the case where the user didn't tell us anything about the
* number of available slots, so we defaulted to a value of 1
/* if the mapper thinks we are not oversubscribed, then we
* do a final smoke test by checking against the #processors. This
* is done solely in case the mapper had incorrect knowledge of
* the #local processors
*/
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
/* if we cannot find the number of local processors, we have no choice
* but to default to conservative settings
*/
oversubscribed = true;
} else {
/* otherwise, declare us to not be oversubscribed so we can be aggressive */
oversubscribed = false;
if (num_procs_alive > num_processors) {
/* if the #procs > #processors, declare us oversubscribed. This
* covers the case where the user didn't tell us anything about the
* number of available slots, so we defaulted to a value of 1
*/
oversubscribed = true;
} else {
/* otherwise, declare us to not be oversubscribed so we can be aggressive */
oversubscribed = false;
}
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch found %d processors for %d children and set oversubscribed to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(ORTE_SUCCESS == rc) ? num_processors : -1, (int)opal_list_get_size(&orte_local_children),
oversubscribed ? "true" : "false"));
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch found %d processors for %d children and set oversubscribed to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(ORTE_SUCCESS == rc) ? num_processors: -1, (int)opal_list_get_size(&orte_local_children),
oversubscribed ? "true" : "false"));
/* setup to report the proc state to the HNP */
OBJ_CONSTRUCT(&alert, opal_buffer_t);
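The precedence implemented above: a mapper-declared oversubscription is always trusted; otherwise the daemon smoke-tests the live proc count against the local processor count, falling back to the conservative answer when that count is unavailable. Restated as pure logic (a sketch, detached from the launch machinery):

#include <stdbool.h>

/* Sketch: the decision above in isolation. procinfo_ok mirrors
 * the return status of opal_paffinity_base_get_processor_info(). */
static bool decide_oversubscribed(bool mapper_flag, bool procinfo_ok,
                                  int procs_alive, int processors)
{
    if (mapper_flag) {
        return true;                  /* trust the mapper */
    }
    if (!procinfo_ok) {
        return true;                  /* unknown processor count: be conservative */
    }
    return procs_alive > processors;  /* final smoke test */
}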
@@ -1975,6 +2011,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
}
}
GETOUT:
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
return rc;

View File

@@ -192,7 +192,8 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
if (0 == node->slots_alloc) {
num_procs_to_assign = 1;
} else {
num_possible_procs = node->slots_alloc / jdata->map->cpus_per_rank;
/* 'num_possible_procs' defines the number of ranks */
num_possible_procs = node->slots_alloc;
if (0 == num_possible_procs) {
num_procs_to_assign = 1;
} else {
@@ -200,7 +201,11 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
}
}
} else {
num_possible_procs = (node->slots_alloc - node->slots_inuse) / jdata->map->cpus_per_rank;
/* 'num_possible_procs' defines the number of ranks on the node. Each
* rank occupies one slot. Each slot may represent more than one
* cpu, depending on the cpus-per-task setting
*/
num_possible_procs = (node->slots_alloc - node->slots_inuse);
if (0 == num_possible_procs) {
num_procs_to_assign = 1;
} else {

View File

@@ -364,6 +364,9 @@ PROCESS:
/* retain the proc struct so that we correctly track its release */
OBJ_RETAIN(proc);
++node->num_procs;
/* update the oversubscribed state of the node */
node->oversubscribed = oversubscribed;
return ORTE_SUCCESS;
}
@@ -423,7 +426,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
/* Be sure to demarcate the slots for this proc as claimed from the node */
current_node->slots_inuse += cpus_per_rank;
current_node->slots_inuse += 1;
/* see if this node is oversubscribed now */
if (current_node->slots_inuse > current_node->slots) {
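The byslot hunk in the previous file and the claim_slot change above are two halves of the Guinier fix: the mapper no longer divides the slot count by cpus_per_rank, and each claim now charges one slot instead of cpus_per_rank, so a slot counts exactly one rank (as the new comment in the byslot hunk states). With hypothetical numbers, 8 allocated slots and cpus-per-task = 2:

#include <stdio.h>

int main(void)
{
    int slots_alloc = 8;    /* hypothetical allocation    */
    int cpus_per_rank = 2;  /* hypothetical cpus-per-task */

    /* before: ranks = slots / cpus_per_rank, and each claim
     * consumed cpus_per_rank slots */
    int old_ranks = slots_alloc / cpus_per_rank;   /* 4 ranks */

    /* after: one rank per slot, one slot per claim */
    int new_ranks = slots_alloc;                   /* 8 ranks */

    printf("old: %d ranks, new: %d ranks\n", old_ranks, new_ranks);
    return 0;
}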

View File

@@ -46,6 +46,7 @@ struct orte_job_map_t {
int npersocket;
int16_t cpus_per_rank;
int16_t stride;
/* are we allowed to oversubscribe the nodes in this job */
bool oversubscribe;
bool display_map;
bool cpu_lists;
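Worth noting the naming distinction this header keeps: oversubscribe on the job map is a policy input (is the mapper allowed to overload nodes?), while the oversubscribed field added to orte_node_t and orte_nid_t elsewhere in this commit records the observed state. A compressed sketch of the two roles (the struct names here are illustrative only):

#include <stdbool.h>

/* Sketch: policy input vs. observed state. */
struct map_policy { bool oversubscribe;  /* may the mapper overload nodes? */ };
struct node_state { bool oversubscribed; /* is this node overloaded now?   */ };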

View File

@@ -384,8 +384,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
free(tmp);
tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tNum slots: %ld\tSlots in use: %ld", tmp, pfx2,
(long)src->slots, (long)src->slots_inuse);
asprintf(&tmp2, "%s\n%s\tNum slots: %ld\tSlots in use: %ld\tOversubscribed: %s", tmp, pfx2,
(long)src->slots, (long)src->slots_inuse,
(src->oversubscribed) ? "TRUE" : "FALSE");
free(tmp);
tmp = tmp2;

View File

@@ -516,6 +516,8 @@ typedef struct {
char *name;
/* vpid of this job family's daemon on this node */
orte_vpid_t daemon;
/* whether or not this node is oversubscribed */
bool oversubscribed;
/* list of interface attributes */
opal_list_t attrs;
/* list of system info */

View File

@@ -299,6 +299,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
char *nodename;
opal_buffer_t buf;
char *ptr;
uint8_t *oversub=NULL;
/* setup a buffer for tmp use */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
@@ -357,8 +358,9 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
* match their node array index
*/
/* allocate space for the daemon vpids */
/* allocate space for the daemon vpids and oversubscribed flags */
vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
@@ -369,12 +371,18 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
continue;
}
vpids[i] = node->daemon->name.vpid;
oversub[i] = node->oversubscribed;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(vpids);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, oversub, num_nodes, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
free(oversub);
/* check if we are to send the profile file data */
if (orte_send_profile) {
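One caveat in the hunk above: if either pack fails, the function returns without releasing the freshly malloc'ed arrays. A sketch of the same packing with the error paths freeing both buffers (an assumed cleanup, not part of the commit):

/* Sketch: identical packing, plus frees on the error paths. */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID))) {
    ORTE_ERROR_LOG(rc);
    free(vpids);
    free(oversub);
    return rc;
}
free(vpids);
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, oversub, num_nodes, OPAL_UINT8))) {
    ORTE_ERROR_LOG(rc);
    free(oversub);
    return rc;
}
free(oversub);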
@@ -426,6 +434,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
opal_buffer_t buf;
opal_byte_object_t *boptr;
int rc;
uint8_t *oversub;
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s decode:nidmap decoding nodemap",
@@ -490,6 +499,15 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the oversubscribed flags */
oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
n=num_nodes;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, oversub, &n, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* transfer the data to the nidmap, counting the number of
* daemons in the system
*/
@@ -497,13 +515,19 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
for (i=0; i < num_nodes; i++) {
if (NULL != (ndptr = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
ndptr->daemon = vpids[i];
if (0 == oversub[i]) {
ndptr->oversubscribed = false;
} else {
ndptr->oversubscribed = true;
}
if (ORTE_VPID_INVALID != vpids[i]) {
++num_daemons;
}
}
}
free(vpids);
free(oversub);
/* if we are a daemon or the HNP, update our num_procs */
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
orte_process_info.num_procs = num_daemons;
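The decode side has the mirror-image caveat: a failed unpack of the flags returns with both vpids and oversub still allocated. A sketch of the unpack with cleanup on failure (again an assumed fix, not part of this commit):

/* Sketch: unpack the flags, releasing both arrays on failure. */
oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t));
n = num_nodes;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, oversub, &n, OPAL_UINT8))) {
    ORTE_ERROR_LOG(rc);
    free(vpids);
    free(oversub);
    return rc;
}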