From 70a106fa71bc265f15b93a7d59c912050d265b6a Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 8 May 2012 03:52:39 +0000 Subject: [PATCH] Fix binding on remote nodes - need to pass the binding bitmap! This commit was SVN r26403. --- orte/mca/ess/base/ess_base_fns.c | 31 +++++++++++++++++++++- orte/mca/odls/base/odls_base_default_fns.c | 28 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/orte/mca/ess/base/ess_base_fns.c b/orte/mca/ess/base/ess_base_fns.c index b795b1d091..4cea983f7f 100644 --- a/orte/mca/ess/base/ess_base_fns.c +++ b/orte/mca/ess/base/ess_base_fns.c @@ -290,6 +290,9 @@ int orte_ess_base_proc_binding(void) /* see if we were bound when launched */ if (!orte_proc_is_bound) { + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Not bound at launch", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* we were not bound at launch */ if (NULL != opal_hwloc_topology) { support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology); @@ -303,6 +306,9 @@ int orte_ess_base_proc_binding(void) * environment does not support it */ hwloc_bitmap_free(cpus); + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Binding not supported", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* we are bound if the two cpusets are not equal, @@ -316,6 +322,9 @@ int orte_ess_base_proc_binding(void) */ orte_proc_is_bound = true; hwloc_bitmap_free(cpus); + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Process was externally bound", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (support->cpubind->set_thisproc_cpubind && OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { @@ -340,6 +349,9 @@ int orte_ess_base_proc_binding(void) /* cleanup */ hwloc_bitmap_free(cpus); orte_proc_is_bound = true; + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Process bound according to slot_list", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* cleanup */ hwloc_bitmap_free(cpus); @@ -349,6 +361,9 @@ int orte_ess_base_proc_binding(void) * direct launched - so just ignore and leave * us unbound */ + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Process not bound - no node rank available", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* if the binding policy is hwthread, then we bind to the nrank-th @@ -367,8 +382,11 @@ int orte_ess_base_proc_binding(void) error = "Setting processor affinity failed"; goto error; } - orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL; + orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL; orte_process_info.bind_idx = nrank; + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Process bound to hwthread", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { /* if the binding policy is core, then we bind to the nrank-th * core on this node @@ -387,6 +405,9 @@ int orte_ess_base_proc_binding(void) } orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL; orte_process_info.bind_idx = nrank; + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Process bound to core", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* for all higher binding policies, we bind to the specified * object that the nrank-th core belongs to @@ -435,6 +456,10 @@ int orte_ess_base_proc_binding(void) orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology, obj, OPAL_HWLOC_LOGICAL); orte_proc_is_bound = true; + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Process bound to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + opal_hwloc_base_print_level(orte_process_info.bind_level))); break; } } @@ -447,6 +472,10 @@ int orte_ess_base_proc_binding(void) } } } + } else { + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s Process bound at launch", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } MOVEON: diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 3ba8ad5102..847d5a50a1 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -96,6 +96,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, { int rc; orte_job_t *jdata=NULL; + orte_proc_t *proc; orte_job_map_t *map=NULL; opal_buffer_t *wireup; opal_byte_object_t bo, *boptr; @@ -307,6 +308,18 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, /* release the data since it has now been copied into our buffer */ free(bo.bytes); + /* pack the binding bitmaps */ + for (j=0; j < jdata->procs->size; j++) { + if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) { + continue; + } + /* okay to pack NULL strings */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->cpu_bitmap, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* pack the collective ids */ if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) { ORTE_ERROR_LOG(rc); @@ -383,6 +396,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, orte_vpid_t j; orte_std_cntr_t cnt; orte_job_t *jdata=NULL; + orte_proc_t *proc; opal_byte_object_t *bo; int8_t flag; orte_jobid_t debugger; @@ -625,6 +639,20 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, goto REPORT_ERROR; } + /* unpack the binding bitmaps */ + for (j=0; j < jdata->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + rc = ORTE_ERR_NOT_FOUND; + goto REPORT_ERROR; + } + cnt = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc->cpu_bitmap, &cnt, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + } + /* unpack the collective ids */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {