Fix the no-disconnect test
A race condition exists based on whether or not the userdata object attached to a hwloc_obj_t has been initialized. These objects are set up whenever we scan for resources under that location. You therefore must not set a variable to the pointer to the userdata object and then call a function that will initialize the data in it - you need to set the variable after the function call, and protect against a NULL pointer.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
66bc86a25b
Коммит
98b4ed9a3a
@ -65,3 +65,4 @@ orte_abort_timeout = 10
|
|||||||
hwloc_base_mem_bind_failure_action = silent
|
hwloc_base_mem_bind_failure_action = silent
|
||||||
btl_tcp_if_include=10.10.10.0/24
|
btl_tcp_if_include=10.10.10.0/24
|
||||||
oob=^ud
|
oob=^ud
|
||||||
|
btl=self,vader,tcp
|
||||||
|
@ -390,12 +390,16 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||||
return ORTE_ERR_SILENT;
|
return ORTE_ERR_SILENT;
|
||||||
}
|
}
|
||||||
data = (opal_hwloc_obj_data_t*)locale->userdata;
|
|
||||||
/* get the number of cpus under this location */
|
/* get the number of cpus under this location */
|
||||||
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology->topo, locale))) {
|
if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology->topo, locale))) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
|
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
|
||||||
return ORTE_ERR_SILENT;
|
return ORTE_ERR_SILENT;
|
||||||
}
|
}
|
||||||
|
data = (opal_hwloc_obj_data_t*)locale->userdata;
|
||||||
|
if (NULL == data) {
|
||||||
|
data = OBJ_NEW(opal_hwloc_obj_data_t);
|
||||||
|
locale->userdata = data;
|
||||||
|
}
|
||||||
/* if we don't have enough cpus to support this additional proc, try
|
/* if we don't have enough cpus to support this additional proc, try
|
||||||
* shifting the location to a cousin that can support it - the important
|
* shifting the location to a cousin that can support it - the important
|
||||||
* thing is that we maintain the same level in the topology */
|
* thing is that we maintain the same level in the topology */
|
||||||
@ -406,8 +410,12 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
sib = locale;
|
sib = locale;
|
||||||
found = false;
|
found = false;
|
||||||
while (NULL != (sib = sib->next_cousin)) {
|
while (NULL != (sib = sib->next_cousin)) {
|
||||||
data = (opal_hwloc_obj_data_t*)sib->userdata;
|
|
||||||
ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
|
ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
|
||||||
|
data = (opal_hwloc_obj_data_t*)sib->userdata;
|
||||||
|
if (NULL == data) {
|
||||||
|
data = OBJ_NEW(opal_hwloc_obj_data_t);
|
||||||
|
sib->userdata = data;
|
||||||
|
}
|
||||||
if (data->num_bound < ncpus) {
|
if (data->num_bound < ncpus) {
|
||||||
found = true;
|
found = true;
|
||||||
locale = sib;
|
locale = sib;
|
||||||
@ -421,8 +429,12 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
sib = locale;
|
sib = locale;
|
||||||
while (NULL != (sib = sib->prev_cousin)) {
|
while (NULL != (sib = sib->prev_cousin)) {
|
||||||
data = (opal_hwloc_obj_data_t*)sib->userdata;
|
|
||||||
ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
|
ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
|
||||||
|
data = (opal_hwloc_obj_data_t*)sib->userdata;
|
||||||
|
if (NULL == data) {
|
||||||
|
data = OBJ_NEW(opal_hwloc_obj_data_t);
|
||||||
|
sib->userdata = data;
|
||||||
|
}
|
||||||
if (data->num_bound < ncpus) {
|
if (data->num_bound < ncpus) {
|
||||||
found = true;
|
found = true;
|
||||||
locale = sib;
|
locale = sib;
|
||||||
@ -453,6 +465,10 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
}
|
}
|
||||||
/* track the number bound */
|
/* track the number bound */
|
||||||
data = (opal_hwloc_obj_data_t*)locale->userdata; // just in case it changed
|
data = (opal_hwloc_obj_data_t*)locale->userdata; // just in case it changed
|
||||||
|
if (NULL == data) {
|
||||||
|
data = OBJ_NEW(opal_hwloc_obj_data_t);
|
||||||
|
locale->userdata = data;
|
||||||
|
}
|
||||||
data->num_bound++;
|
data->num_bound++;
|
||||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||||
"BINDING PROC %s TO %s NUMBER %u",
|
"BINDING PROC %s TO %s NUMBER %u",
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user