Fix binding on remote nodes - need to pass the binding bitmap!
This commit was SVN r26403.
This commit is contained in:
parent
903f9fac09
commit
70a106fa71
@@ -290,6 +290,9 @@ int orte_ess_base_proc_binding(void)
|
|||||||
|
|
||||||
/* see if we were bound when launched */
|
/* see if we were bound when launched */
|
||||||
if (!orte_proc_is_bound) {
|
if (!orte_proc_is_bound) {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Not bound at launch",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
/* we were not bound at launch */
|
/* we were not bound at launch */
|
||||||
if (NULL != opal_hwloc_topology) {
|
if (NULL != opal_hwloc_topology) {
|
||||||
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
|
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
|
||||||
@@ -303,6 +306,9 @@ int orte_ess_base_proc_binding(void)
|
|||||||
* environment does not support it
|
* environment does not support it
|
||||||
*/
|
*/
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Binding not supported",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
goto MOVEON;
|
goto MOVEON;
|
||||||
}
|
}
|
||||||
/* we are bound if the two cpusets are not equal,
|
/* we are bound if the two cpusets are not equal,
|
||||||
@@ -316,6 +322,9 @@ int orte_ess_base_proc_binding(void)
|
|||||||
*/
|
*/
|
||||||
orte_proc_is_bound = true;
|
orte_proc_is_bound = true;
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Process was externally bound",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
} else if (support->cpubind->set_thisproc_cpubind &&
|
} else if (support->cpubind->set_thisproc_cpubind &&
|
||||||
OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
|
OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
|
||||||
OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
||||||
@@ -340,6 +349,9 @@ int orte_ess_base_proc_binding(void)
|
|||||||
/* cleanup */
|
/* cleanup */
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
orte_proc_is_bound = true;
|
orte_proc_is_bound = true;
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Process bound according to slot_list",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
} else {
|
} else {
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
@@ -349,6 +361,9 @@ int orte_ess_base_proc_binding(void)
|
|||||||
* direct launched - so just ignore and leave
|
* direct launched - so just ignore and leave
|
||||||
* us unbound
|
* us unbound
|
||||||
*/
|
*/
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Process not bound - no node rank available",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
goto MOVEON;
|
goto MOVEON;
|
||||||
}
|
}
|
||||||
/* if the binding policy is hwthread, then we bind to the nrank-th
|
/* if the binding policy is hwthread, then we bind to the nrank-th
|
||||||
@@ -367,8 +382,11 @@ int orte_ess_base_proc_binding(void)
|
|||||||
error = "Setting processor affinity failed";
|
error = "Setting processor affinity failed";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
|
orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
|
||||||
orte_process_info.bind_idx = nrank;
|
orte_process_info.bind_idx = nrank;
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Process bound to hwthread",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
||||||
/* if the binding policy is core, then we bind to the nrank-th
|
/* if the binding policy is core, then we bind to the nrank-th
|
||||||
* core on this node
|
* core on this node
|
||||||
@@ -387,6 +405,9 @@ int orte_ess_base_proc_binding(void)
|
|||||||
}
|
}
|
||||||
orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
|
orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
|
||||||
orte_process_info.bind_idx = nrank;
|
orte_process_info.bind_idx = nrank;
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Process bound to core",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
} else {
|
} else {
|
||||||
/* for all higher binding policies, we bind to the specified
|
/* for all higher binding policies, we bind to the specified
|
||||||
* object that the nrank-th core belongs to
|
* object that the nrank-th core belongs to
|
||||||
@@ -435,6 +456,10 @@ int orte_ess_base_proc_binding(void)
|
|||||||
orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
|
orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
|
||||||
obj, OPAL_HWLOC_LOGICAL);
|
obj, OPAL_HWLOC_LOGICAL);
|
||||||
orte_proc_is_bound = true;
|
orte_proc_is_bound = true;
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Process bound to %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
opal_hwloc_base_print_level(orte_process_info.bind_level)));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -447,6 +472,10 @@ int orte_ess_base_proc_binding(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
|
||||||
|
"%s Process bound at launch",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
}
|
}
|
||||||
|
|
||||||
MOVEON:
|
MOVEON:
|
||||||
|
@@ -96,6 +96,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_job_t *jdata=NULL;
|
orte_job_t *jdata=NULL;
|
||||||
|
orte_proc_t *proc;
|
||||||
orte_job_map_t *map=NULL;
|
orte_job_map_t *map=NULL;
|
||||||
opal_buffer_t *wireup;
|
opal_buffer_t *wireup;
|
||||||
opal_byte_object_t bo, *boptr;
|
opal_byte_object_t bo, *boptr;
|
||||||
@@ -307,6 +308,18 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
|||||||
/* release the data since it has now been copied into our buffer */
|
/* release the data since it has now been copied into our buffer */
|
||||||
free(bo.bytes);
|
free(bo.bytes);
|
||||||
|
|
||||||
|
/* pack the binding bitmaps */
|
||||||
|
for (j=0; j < jdata->procs->size; j++) {
|
||||||
|
if (NULL == (proc = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* okay to pack NULL strings */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &proc->cpu_bitmap, 1, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* pack the collective ids */
|
/* pack the collective ids */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->peer_modex, 1, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@@ -383,6 +396,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
|||||||
orte_vpid_t j;
|
orte_vpid_t j;
|
||||||
orte_std_cntr_t cnt;
|
orte_std_cntr_t cnt;
|
||||||
orte_job_t *jdata=NULL;
|
orte_job_t *jdata=NULL;
|
||||||
|
orte_proc_t *proc;
|
||||||
opal_byte_object_t *bo;
|
opal_byte_object_t *bo;
|
||||||
int8_t flag;
|
int8_t flag;
|
||||||
orte_jobid_t debugger;
|
orte_jobid_t debugger;
|
||||||
@@ -625,6 +639,20 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
|||||||
goto REPORT_ERROR;
|
goto REPORT_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* unpack the binding bitmaps */
|
||||||
|
for (j=0; j < jdata->num_procs; j++) {
|
||||||
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
rc = ORTE_ERR_NOT_FOUND;
|
||||||
|
goto REPORT_ERROR;
|
||||||
|
}
|
||||||
|
cnt = 1;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc->cpu_bitmap, &cnt, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto REPORT_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* unpack the collective ids */
|
/* unpack the collective ids */
|
||||||
cnt=1;
|
cnt=1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->peer_modex, &cnt, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||||
|
Loading…
x
Reference in new issue
Block a user