1
1

Fix binding on remote nodes - need to pass the binding bitmap!

This commit was SVN r26403.
Этот коммит содержится в:
Ralph Castain 2012-05-08 03:52:39 +00:00
родитель 903f9fac09
Коммит 70a106fa71
2 изменённых файла: 58 добавлений и 1 удаление

Просмотреть файл

@ -290,6 +290,9 @@ int orte_ess_base_proc_binding(void)
/* see if we were bound when launched */ /* see if we were bound when launched */
if (!orte_proc_is_bound) { if (!orte_proc_is_bound) {
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Not bound at launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* we were not bound at launch */ /* we were not bound at launch */
if (NULL != opal_hwloc_topology) { if (NULL != opal_hwloc_topology) {
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology); support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
@ -303,6 +306,9 @@ int orte_ess_base_proc_binding(void)
* environment does not support it * environment does not support it
*/ */
hwloc_bitmap_free(cpus); hwloc_bitmap_free(cpus);
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Binding not supported",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto MOVEON; goto MOVEON;
} }
/* we are bound if the two cpusets are not equal, /* we are bound if the two cpusets are not equal,
@ -316,6 +322,9 @@ int orte_ess_base_proc_binding(void)
*/ */
orte_proc_is_bound = true; orte_proc_is_bound = true;
hwloc_bitmap_free(cpus); hwloc_bitmap_free(cpus);
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process was externally bound",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else if (support->cpubind->set_thisproc_cpubind && } else if (support->cpubind->set_thisproc_cpubind &&
OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) && OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
@ -340,6 +349,9 @@ int orte_ess_base_proc_binding(void)
/* cleanup */ /* cleanup */
hwloc_bitmap_free(cpus); hwloc_bitmap_free(cpus);
orte_proc_is_bound = true; orte_proc_is_bound = true;
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound according to slot_list",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else { } else {
/* cleanup */ /* cleanup */
hwloc_bitmap_free(cpus); hwloc_bitmap_free(cpus);
@ -349,6 +361,9 @@ int orte_ess_base_proc_binding(void)
* direct launched - so just ignore and leave * direct launched - so just ignore and leave
* us unbound * us unbound
*/ */
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process not bound - no node rank available",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto MOVEON; goto MOVEON;
} }
/* if the binding policy is hwthread, then we bind to the nrank-th /* if the binding policy is hwthread, then we bind to the nrank-th
@ -367,8 +382,11 @@ int orte_ess_base_proc_binding(void)
error = "Setting processor affinity failed"; error = "Setting processor affinity failed";
goto error; goto error;
} }
orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL; orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
orte_process_info.bind_idx = nrank; orte_process_info.bind_idx = nrank;
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound to hwthread",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
/* if the binding policy is core, then we bind to the nrank-th /* if the binding policy is core, then we bind to the nrank-th
* core on this node * core on this node
@ -387,6 +405,9 @@ int orte_ess_base_proc_binding(void)
} }
orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL; orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
orte_process_info.bind_idx = nrank; orte_process_info.bind_idx = nrank;
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound to core",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else { } else {
/* for all higher binding policies, we bind to the specified /* for all higher binding policies, we bind to the specified
* object that the nrank-th core belongs to * object that the nrank-th core belongs to
@ -435,6 +456,10 @@ int orte_ess_base_proc_binding(void)
orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology, orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
obj, OPAL_HWLOC_LOGICAL); obj, OPAL_HWLOC_LOGICAL);
orte_proc_is_bound = true; orte_proc_is_bound = true;
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
opal_hwloc_base_print_level(orte_process_info.bind_level)));
break; break;
} }
} }
@ -447,6 +472,10 @@ int orte_ess_base_proc_binding(void)
} }
} }
} }
} else {
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
"%s Process bound at launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} }
MOVEON: MOVEON:

Просмотреть файл

@ -96,6 +96,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
{ {
int rc; int rc;
orte_job_t *jdata=NULL; orte_job_t *jdata=NULL;
orte_proc_t *proc;
orte_job_map_t *map=NULL; orte_job_map_t *map=NULL;
opal_buffer_t *wireup; opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr; opal_byte_object_t bo, *boptr;
@ -307,6 +308,18 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
/* release the data since it has now been copied into our buffer */ /* release the data since it has now been copied into our buffer */
free(bo.bytes); free(bo.bytes);
/* pack the binding bitmaps */
for (j=0; j < jdata->procs->size; j++) {
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
/* okay to pack NULL strings */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->cpu_bitmap, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* pack the collective ids */ /* pack the collective ids */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -383,6 +396,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_vpid_t j; orte_vpid_t j;
orte_std_cntr_t cnt; orte_std_cntr_t cnt;
orte_job_t *jdata=NULL; orte_job_t *jdata=NULL;
orte_proc_t *proc;
opal_byte_object_t *bo; opal_byte_object_t *bo;
int8_t flag; int8_t flag;
orte_jobid_t debugger; orte_jobid_t debugger;
@ -625,6 +639,20 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
goto REPORT_ERROR; goto REPORT_ERROR;
} }
/* unpack the binding bitmaps */
for (j=0; j < jdata->num_procs; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto REPORT_ERROR;
}
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc->cpu_bitmap, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
}
/* unpack the collective ids */ /* unpack the collective ids */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {