1
1

ras/lsf: Fix affinity for MPMD jobs running under LSF

Этот коммит содержится в:
Joshua Hursey 2016-04-21 15:37:58 -05:00
родитель c2c59db49c
Коммит 29b49351af

Просмотреть файл

@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -38,6 +39,7 @@
#include "orte/util/show_help.h"
#include "orte/mca/ras/base/ras_private.h"
#include "orte/mca/ras/base/base.h"
#include "ras_lsf.h"
@ -98,6 +100,8 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
if (NULL != node && 0 == strcmp(nodelist[i], node->name)) {
/* it is a repeat - just bump the slot count */
++node->slots;
opal_output_verbose(10, orte_ras_base_framework.framework_output,
"ras/lsf: +++ Node (%s) [slots=%d]", node->name, node->slots);
continue;
}
@ -109,6 +113,9 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
node->slots = 1;
node->state = ORTE_NODE_STATE_UP;
opal_list_append(nodes, &node->super);
opal_output_verbose(10, orte_ras_base_framework.framework_output,
"ras/lsf: New Node (%s) [slots=%d]", node->name, node->slots);
}
/* release the nodelist from lsf */
@ -142,14 +149,20 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
}
/* get the apps and set the hostfile attribute in each to point to
* the hostfile */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
/*
* Do not set the hostfile attribute on each app_context since that
* would confuse the sequential mapper when it tries to assign bindings
* when running an MPMD job.
* Instead just overwrite the orte_default_hostfile so it will be
* general for all of the app_contexts.
*/
if( NULL != orte_default_hostfile ) {
free(orte_default_hostfile);
orte_default_hostfile = NULL;
}
orte_default_hostfile = strdup(affinity_file);
opal_output_verbose(10, orte_ras_base_framework.framework_output,
"ras/lsf: Set default_hostfile to %s",orte_default_hostfile);
return ORTE_SUCCESS;
}