ras/lsf: Fix affinity for MPMD jobs running under LSF
Этот коммит содержится в:
родитель
c2c59db49c
Коммит
29b49351af
@ -11,6 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2016 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -38,6 +39,7 @@
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "ras_lsf.h"
|
||||
|
||||
|
||||
@ -98,6 +100,8 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
if (NULL != node && 0 == strcmp(nodelist[i], node->name)) {
|
||||
/* it is a repeat - just bump the slot count */
|
||||
++node->slots;
|
||||
opal_output_verbose(10, orte_ras_base_framework.framework_output,
|
||||
"ras/lsf: +++ Node (%s) [slots=%d]", node->name, node->slots);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -109,6 +113,9 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
node->slots = 1;
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
opal_list_append(nodes, &node->super);
|
||||
|
||||
opal_output_verbose(10, orte_ras_base_framework.framework_output,
|
||||
"ras/lsf: New Node (%s) [slots=%d]", node->name, node->slots);
|
||||
}
|
||||
|
||||
/* release the nodelist from lsf */
|
||||
@ -142,14 +149,20 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
|
||||
}
|
||||
/* get the apps and set the hostfile attribute in each to point to
|
||||
* the hostfile */
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
|
||||
/*
|
||||
* Do not set the hostfile attribute on each app_context since that
|
||||
* would confuse the sequential mapper when it tries to assign bindings
|
||||
* when running an MPMD job.
|
||||
* Instead just overwrite the orte_default_hostfile so it will be
|
||||
* general for all of the app_contexts.
|
||||
*/
|
||||
if( NULL != orte_default_hostfile ) {
|
||||
free(orte_default_hostfile);
|
||||
orte_default_hostfile = NULL;
|
||||
}
|
||||
orte_default_hostfile = strdup(affinity_file);
|
||||
opal_output_verbose(10, orte_ras_base_framework.framework_output,
|
||||
"ras/lsf: Set default_hostfile to %s",orte_default_hostfile);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user