From 98b4ed9a3a6b92a03b59c0e10e74262bb1d37902 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 19 Jun 2018 13:52:34 -0700 Subject: [PATCH] Fix the no-disconnect test A race condition exists based on whether or not the userdata object attached to a hwloc_obj_t has been initialized. These objects are set up whenever we scan for resources under that location. You therefore must not set a variable to the pointer to the userdata object and then call a function that will initialize the data in it - you need to set the variable after the function call, and protect against a NULL pointer. Signed-off-by: Ralph Castain --- contrib/platform/intel/bend/linux.conf | 1 + orte/mca/rmaps/base/rmaps_base_binding.c | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/contrib/platform/intel/bend/linux.conf b/contrib/platform/intel/bend/linux.conf index 48482907c6..0ae921b800 100644 --- a/contrib/platform/intel/bend/linux.conf +++ b/contrib/platform/intel/bend/linux.conf @@ -65,3 +65,4 @@ orte_abort_timeout = 10 hwloc_base_mem_bind_failure_action = silent btl_tcp_if_include=10.10.10.0/24 oob=^ud +btl=self,vader,tcp diff --git a/orte/mca/rmaps/base/rmaps_base_binding.c b/orte/mca/rmaps/base/rmaps_base_binding.c index 3c280c4d3b..43e0916564 100644 --- a/orte/mca/rmaps/base/rmaps_base_binding.c +++ b/orte/mca/rmaps/base/rmaps_base_binding.c @@ -390,12 +390,16 @@ static int bind_in_place(orte_job_t *jdata, ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_SILENT; } - data = (opal_hwloc_obj_data_t*)locale->userdata; /* get the number of cpus under this location */ if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology->topo, locale))) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); return ORTE_ERR_SILENT; } + data = (opal_hwloc_obj_data_t*)locale->userdata; + if (NULL == data) { + data = OBJ_NEW(opal_hwloc_obj_data_t); + locale->userdata = data; + } /* if we don't have enough cpus to support this
additional proc, try * shifting the location to a cousin that can support it - the important * thing is that we maintain the same level in the topology */ @@ -406,8 +410,12 @@ static int bind_in_place(orte_job_t *jdata, sib = locale; found = false; while (NULL != (sib = sib->next_cousin)) { - data = (opal_hwloc_obj_data_t*)sib->userdata; ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib); + data = (opal_hwloc_obj_data_t*)sib->userdata; + if (NULL == data) { + data = OBJ_NEW(opal_hwloc_obj_data_t); + sib->userdata = data; + } if (data->num_bound < ncpus) { found = true; locale = sib; @@ -421,8 +429,12 @@ static int bind_in_place(orte_job_t *jdata, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); sib = locale; while (NULL != (sib = sib->prev_cousin)) { - data = (opal_hwloc_obj_data_t*)sib->userdata; ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib); + data = (opal_hwloc_obj_data_t*)sib->userdata; + if (NULL == data) { + data = OBJ_NEW(opal_hwloc_obj_data_t); + sib->userdata = data; + } if (data->num_bound < ncpus) { found = true; locale = sib; @@ -453,6 +465,10 @@ static int bind_in_place(orte_job_t *jdata, } /* track the number bound */ data = (opal_hwloc_obj_data_t*)locale->userdata; // just in case it changed + if (NULL == data) { + data = OBJ_NEW(opal_hwloc_obj_data_t); + locale->userdata = data; + } data->num_bound++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "BINDING PROC %s TO %s NUMBER %u",