1
1

Merge pull request #5301 from rhc54/topic/nodis

Fix the no-disconnect test
Этот коммит содержится в:
Ralph Castain 2018-06-19 14:56:12 -07:00 коммит произвёл GitHub
родитель 66bc86a25b 98b4ed9a3a
Коммит 30ffdc9efc
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файла: 20 добавлений и 3 удаления

Просмотреть файл

@@ -65,3 +65,4 @@ orte_abort_timeout = 10
hwloc_base_mem_bind_failure_action = silent
btl_tcp_if_include=10.10.10.0/24
oob=^ud
btl=self,vader,tcp

Просмотреть файл

@@ -390,12 +390,16 @@ static int bind_in_place(orte_job_t *jdata,
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_SILENT;
}
data = (opal_hwloc_obj_data_t*)locale->userdata;
/* get the number of cpus under this location */
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology->topo, locale))) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
return ORTE_ERR_SILENT;
}
data = (opal_hwloc_obj_data_t*)locale->userdata;
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
locale->userdata = data;
}
/* if we don't have enough cpus to support this additional proc, try
* shifting the location to a cousin that can support it - the important
* thing is that we maintain the same level in the topology */
@@ -406,8 +410,12 @@ static int bind_in_place(orte_job_t *jdata,
sib = locale;
found = false;
while (NULL != (sib = sib->next_cousin)) {
data = (opal_hwloc_obj_data_t*)sib->userdata;
ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
data = (opal_hwloc_obj_data_t*)sib->userdata;
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
sib->userdata = data;
}
if (data->num_bound < ncpus) {
found = true;
locale = sib;
@@ -421,8 +429,12 @@ static int bind_in_place(orte_job_t *jdata,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
sib = locale;
while (NULL != (sib = sib->prev_cousin)) {
data = (opal_hwloc_obj_data_t*)sib->userdata;
ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
data = (opal_hwloc_obj_data_t*)sib->userdata;
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
sib->userdata = data;
}
if (data->num_bound < ncpus) {
found = true;
locale = sib;
@@ -453,6 +465,10 @@ static int bind_in_place(orte_job_t *jdata,
}
/* track the number bound */
data = (opal_hwloc_obj_data_t*)locale->userdata; // just in case it changed
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
locale->userdata = data;
}
data->num_bound++;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"BINDING PROC %s TO %s NUMBER %u",