Fix binding on remote nodes: each process's binding bitmap must be packed into the launch message and unpacked on the receiving side.
This commit was SVN r26403.
Этот коммит содержится в:
родитель
903f9fac09
Коммит
70a106fa71
@ -290,6 +290,9 @@ int orte_ess_base_proc_binding(void)
|
||||
|
||||
/* see if we were bound when launched */
|
||||
if (!orte_proc_is_bound) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Not bound at launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* we were not bound at launch */
|
||||
if (NULL != opal_hwloc_topology) {
|
||||
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
|
||||
@ -303,6 +306,9 @@ int orte_ess_base_proc_binding(void)
|
||||
* environment does not support it
|
||||
*/
|
||||
hwloc_bitmap_free(cpus);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Binding not supported",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto MOVEON;
|
||||
}
|
||||
/* we are bound if the two cpusets are not equal,
|
||||
@ -316,6 +322,9 @@ int orte_ess_base_proc_binding(void)
|
||||
*/
|
||||
orte_proc_is_bound = true;
|
||||
hwloc_bitmap_free(cpus);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Process was externally bound",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
} else if (support->cpubind->set_thisproc_cpubind &&
|
||||
OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
|
||||
OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
||||
@ -340,6 +349,9 @@ int orte_ess_base_proc_binding(void)
|
||||
/* cleanup */
|
||||
hwloc_bitmap_free(cpus);
|
||||
orte_proc_is_bound = true;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Process bound according to slot_list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
} else {
|
||||
/* cleanup */
|
||||
hwloc_bitmap_free(cpus);
|
||||
@ -349,6 +361,9 @@ int orte_ess_base_proc_binding(void)
|
||||
* direct launched - so just ignore and leave
|
||||
* us unbound
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Process not bound - no node rank available",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto MOVEON;
|
||||
}
|
||||
/* if the binding policy is hwthread, then we bind to the nrank-th
|
||||
@ -367,8 +382,11 @@ int orte_ess_base_proc_binding(void)
|
||||
error = "Setting processor affinity failed";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
|
||||
orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
|
||||
orte_process_info.bind_idx = nrank;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Process bound to hwthread",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
||||
/* if the binding policy is core, then we bind to the nrank-th
|
||||
* core on this node
|
||||
@ -387,6 +405,9 @@ int orte_ess_base_proc_binding(void)
|
||||
}
|
||||
orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
|
||||
orte_process_info.bind_idx = nrank;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Process bound to core",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
} else {
|
||||
/* for all higher binding policies, we bind to the specified
|
||||
* object that the nrank-th core belongs to
|
||||
@ -435,6 +456,10 @@ int orte_ess_base_proc_binding(void)
|
||||
orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
|
||||
obj, OPAL_HWLOC_LOGICAL);
|
||||
orte_proc_is_bound = true;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Process bound to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
opal_hwloc_base_print_level(orte_process_info.bind_level)));
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -447,6 +472,10 @@ int orte_ess_base_proc_binding(void)
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||
"%s Process bound at launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
}
|
||||
|
||||
MOVEON:
|
||||
|
@ -96,6 +96,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata=NULL;
|
||||
orte_proc_t *proc;
|
||||
orte_job_map_t *map=NULL;
|
||||
opal_buffer_t *wireup;
|
||||
opal_byte_object_t bo, *boptr;
|
||||
@ -307,6 +308,18 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
free(bo.bytes);
|
||||
|
||||
/* pack the binding bitmaps */
|
||||
for (j=0; j < jdata->procs->size; j++) {
|
||||
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
/* okay to pack NULL strings */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->cpu_bitmap, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* pack the collective ids */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -383,6 +396,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
orte_vpid_t j;
|
||||
orte_std_cntr_t cnt;
|
||||
orte_job_t *jdata=NULL;
|
||||
orte_proc_t *proc;
|
||||
opal_byte_object_t *bo;
|
||||
int8_t flag;
|
||||
orte_jobid_t debugger;
|
||||
@ -625,6 +639,20 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
|
||||
/* unpack the binding bitmaps */
|
||||
for (j=0; j < jdata->num_procs; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc->cpu_bitmap, &cnt, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* unpack the collective ids */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user