1
1

Fix binding on remote nodes - need to pass the binding bitmap!

This commit was SVN r26403.
Этот коммит содержится в:
Ralph Castain 2012-05-08 03:52:39 +00:00
родитель 903f9fac09
Коммит 70a106fa71
2 изменённых файла: 58 добавлений и 1 удаление

Просмотреть файл

@ -290,6 +290,9 @@ int orte_ess_base_proc_binding(void)
/* see if we were bound when launched */
if (!orte_proc_is_bound) {
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Not bound at launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* we were not bound at launch */
if (NULL != opal_hwloc_topology) {
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
@ -303,6 +306,9 @@ int orte_ess_base_proc_binding(void)
* environment does not support it
*/
hwloc_bitmap_free(cpus);
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Binding not supported",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto MOVEON;
}
/* we are bound if the two cpusets are not equal,
@ -316,6 +322,9 @@ int orte_ess_base_proc_binding(void)
*/
orte_proc_is_bound = true;
hwloc_bitmap_free(cpus);
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process was externally bound",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else if (support->cpubind->set_thisproc_cpubind &&
OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
@ -340,6 +349,9 @@ int orte_ess_base_proc_binding(void)
/* cleanup */
hwloc_bitmap_free(cpus);
orte_proc_is_bound = true;
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound according to slot_list",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else {
/* cleanup */
hwloc_bitmap_free(cpus);
@ -349,6 +361,9 @@ int orte_ess_base_proc_binding(void)
* direct launched - so just ignore and leave
* us unbound
*/
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process not bound - no node rank available",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto MOVEON;
}
/* if the binding policy is hwthread, then we bind to the nrank-th
@ -367,8 +382,11 @@ int orte_ess_base_proc_binding(void)
error = "Setting processor affinity failed";
goto error;
}
-orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
+orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
orte_process_info.bind_idx = nrank;
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound to hwthread",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
/* if the binding policy is core, then we bind to the nrank-th
* core on this node
@ -387,6 +405,9 @@ int orte_ess_base_proc_binding(void)
}
orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
orte_process_info.bind_idx = nrank;
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound to core",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else {
/* for all higher binding policies, we bind to the specified
* object that the nrank-th core belongs to
@ -435,6 +456,10 @@ int orte_ess_base_proc_binding(void)
orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
obj, OPAL_HWLOC_LOGICAL);
orte_proc_is_bound = true;
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
opal_hwloc_base_print_level(orte_process_info.bind_level)));
break;
}
}
@ -447,6 +472,10 @@ int orte_ess_base_proc_binding(void)
}
}
}
} else {
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound at launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
MOVEON:

Просмотреть файл

@ -96,6 +96,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
{
int rc;
orte_job_t *jdata=NULL;
orte_proc_t *proc;
orte_job_map_t *map=NULL;
opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr;
@ -307,6 +308,18 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
/* release the data since it has now been copied into our buffer */
free(bo.bytes);
/* pack the binding bitmaps */
for (j=0; j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
/* okay to pack NULL strings */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->cpu_bitmap, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* pack the collective ids */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc);
@ -383,6 +396,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_vpid_t j;
orte_std_cntr_t cnt;
orte_job_t *jdata=NULL;
orte_proc_t *proc;
opal_byte_object_t *bo;
int8_t flag;
orte_jobid_t debugger;
@ -625,6 +639,20 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
goto REPORT_ERROR;
}
/* unpack the binding bitmaps */
for (j=0; j < jdata->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto REPORT_ERROR;
}
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc->cpu_bitmap, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
}
/* unpack the collective ids */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {