diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index becb83250c..405d5fc3aa 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -81,6 +81,8 @@ #include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/odls_private.h" +static bool override_oversubscribed = false; + /* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN * THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW */ @@ -346,6 +348,12 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, return rc; } + /* pack the oversubscribe override flag */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->oversubscribe_override, 1, OPAL_BOOL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the map & binding policy for this job */ if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->policy, 1, ORTE_MAPPING_POLICY))) { ORTE_ERROR_LOG(rc); @@ -804,6 +812,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } + /* unpack the override oversubscribed flag */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &override_oversubscribed, &cnt, OPAL_BOOL))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } /* unpack the mapping policy for the job */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) { @@ -1426,20 +1440,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, */ getcwd(basedir, sizeof(basedir)); - /* compute the number of local procs alive or about to be launched - * as part of this job - */ - num_procs_alive = 0; - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->alive || - OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) { - num_procs_alive++; - } - } - /* find the jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); @@ -1521,8 +1521,10 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, orte_sstore.wait_all_deps(); #endif - /* if the mapper says we are oversubscribed, then we trust it */ - if (oversubscribed) { + /* if the mapper says we are oversubscribed, then we trust it - unless + * it told us -not- to! + */ + if (oversubscribed && !override_oversubscribed) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:launch mapper declares this node oversubscribed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -1532,6 +1534,20 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, * is done solely in case the mapper had incorrect knowledge of * the #local processors */ + /* compute the number of local procs alive or about to be launched + * as part of this job + */ + num_procs_alive = 0; + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (child->alive || + OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) { + num_procs_alive++; + } + } + /* get the number of local processors */ if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) { /* if we cannot find the number of local processors, we have no choice * but to default to conservative settings @@ -1550,7 +1566,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } } OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:launch found %d processors for %d children and set oversubscribed to %s", + "%s odls:launch found %d processors for %d children and locally set oversubscribed to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (ORTE_SUCCESS == rc) ? num_processors : -1, (int)opal_list_get_size(&orte_local_children), oversubscribed ? "true" : "false"));