1
1
A race condition exists based on whether or not the userdata object attached to a hwloc_obj_t has been initialized. These objects are set up whenever we scan for resources under that location. You therefore must not set a variable to the pointer to the userdata object and then call a function that will initialize the data in it. Instead, set the variable after the function call, and protect against a NULL pointer.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2018-06-19 13:52:34 -07:00
родитель 66bc86a25b
Коммит 98b4ed9a3a
2 изменённых файлов: 20 добавлений и 3 удалений

Просмотреть файл

@ -65,3 +65,4 @@ orte_abort_timeout = 10
hwloc_base_mem_bind_failure_action = silent
btl_tcp_if_include=10.10.10.0/24
oob=^ud
btl=self,vader,tcp

Просмотреть файл

@ -390,12 +390,16 @@ static int bind_in_place(orte_job_t *jdata,
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_SILENT;
}
data = (opal_hwloc_obj_data_t*)locale->userdata;
/* get the number of cpus under this location */
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology->topo, locale))) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
return ORTE_ERR_SILENT;
}
data = (opal_hwloc_obj_data_t*)locale->userdata;
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
locale->userdata = data;
}
/* if we don't have enough cpus to support this additional proc, try
* shifting the location to a cousin that can support it - the important
* thing is that we maintain the same level in the topology */
@ -406,8 +410,12 @@ static int bind_in_place(orte_job_t *jdata,
sib = locale;
found = false;
while (NULL != (sib = sib->next_cousin)) {
data = (opal_hwloc_obj_data_t*)sib->userdata;
ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
data = (opal_hwloc_obj_data_t*)sib->userdata;
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
sib->userdata = data;
}
if (data->num_bound < ncpus) {
found = true;
locale = sib;
@ -421,8 +429,12 @@ static int bind_in_place(orte_job_t *jdata,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
sib = locale;
while (NULL != (sib = sib->prev_cousin)) {
data = (opal_hwloc_obj_data_t*)sib->userdata;
ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
data = (opal_hwloc_obj_data_t*)sib->userdata;
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
sib->userdata = data;
}
if (data->num_bound < ncpus) {
found = true;
locale = sib;
@ -453,6 +465,10 @@ static int bind_in_place(orte_job_t *jdata,
}
/* track the number bound */
data = (opal_hwloc_obj_data_t*)locale->userdata; // just in case it changed
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
locale->userdata = data;
}
data->num_bound++;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"BINDING PROC %s TO %s NUMBER %u",