1
1

One more time, into the breach!

Restore the use of override_oversubscribe to indicate that the data source for resources on the backend nodes used in mapping is unreliable. In this situation (e.g., data came from hostfile, or we are just using localhost because nothing was provided), we don't trust the oversubscribe condition passed by the mapper. Instead, we check locally to ensure we set sched_yield correctly.

This commit was SVN r24130.
Этот коммит содержится в:
Ralph Castain 2010-12-01 15:15:26 +00:00
родитель eba65e97f3
Коммит ad814f26cd

Просмотреть файл

@ -81,6 +81,8 @@
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
static bool override_oversubscribed = false;
/* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN
* THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW
*/
@ -346,6 +348,12 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc;
}
/* pack the oversubscribe override flag */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->oversubscribe_override, 1, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the map & binding policy for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->policy, 1, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc);
@ -804,6 +812,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the override oversubscribed flag */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &override_oversubscribed, &cnt, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the mapping policy for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) {
@ -1426,20 +1440,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
*/
getcwd(basedir, sizeof(basedir));
/* compute the number of local procs alive or about to be launched
* as part of this job
*/
num_procs_alive = 0;
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->alive ||
OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
num_procs_alive++;
}
}
/* find the jobdat for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
@ -1521,8 +1521,10 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
orte_sstore.wait_all_deps();
#endif
/* if the mapper says we are oversubscribed, then we trust it */
if (oversubscribed) {
/* if the mapper says we are oversubscribed, then we trust it - unless
* it told us -not- to!
*/
if (oversubscribed && !override_oversubscribed) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch mapper declares this node oversubscribed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -1532,6 +1534,20 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
* is done solely in case the mapper had incorrect knowledge of
* the #local processors
*/
/* compute the number of local procs alive or about to be launched
* as part of this job
*/
num_procs_alive = 0;
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->alive ||
OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
num_procs_alive++;
}
}
/* get the number of local processors */
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
/* if we cannot find the number of local processors, we have no choice
* but to default to conservative settings
@ -1550,7 +1566,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
}
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch found %d processors for %d children and set oversubscribed to %s",
"%s odls:launch found %d processors for %d children and locally set oversubscribed to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(ORTE_SUCCESS == rc) ? num_processors : -1, (int)opal_list_get_size(&orte_local_children),
oversubscribed ? "true" : "false"));