Merge pull request #1572 from jjhursey/fix/lsf-1570
ras/lsf: Fix affinity for MPMD jobs running under LSF
Commit b213c58e71
@@ -11,6 +11,7 @@
  * All rights reserved.
  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2014 Intel, Inc. All rights reserved
+ * Copyright (c) 2016 IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -38,6 +39,7 @@
 #include "orte/util/show_help.h"
 
 #include "orte/mca/ras/base/ras_private.h"
+#include "orte/mca/ras/base/base.h"
 #include "ras_lsf.h"
 
 
@@ -98,6 +100,8 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
         if (NULL != node && 0 == strcmp(nodelist[i], node->name)) {
             /* it is a repeat - just bump the slot count */
             ++node->slots;
+            opal_output_verbose(10, orte_ras_base_framework.framework_output,
+                                "ras/lsf: +++ Node (%s) [slots=%d]", node->name, node->slots);
             continue;
         }
 
@@ -109,6 +113,9 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
         node->slots = 1;
         node->state = ORTE_NODE_STATE_UP;
         opal_list_append(nodes, &node->super);
+
+        opal_output_verbose(10, orte_ras_base_framework.framework_output,
+                            "ras/lsf: New Node (%s) [slots=%d]", node->name, node->slots);
     }
 
     /* release the nodelist from lsf */
@@ -142,14 +149,20 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
     if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
         OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
     }
-    /* get the apps and set the hostfile attribute in each to point to
-     * the hostfile */
-    for (i=0; i < jdata->apps->size; i++) {
-        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
-            continue;
-        }
-        orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
+    /*
+     * Do not set the hostfile attribute on each app_context since that
+     * would confuse the sequential mapper when it tries to assign bindings
+     * when running an MPMD job.
+     * Instead just overwrite the orte_default_hostfile so it will be
+     * general for all of the app_contexts.
+     */
+    if( NULL != orte_default_hostfile ) {
+        free(orte_default_hostfile);
+        orte_default_hostfile = NULL;
     }
+    orte_default_hostfile = strdup(affinity_file);
+    opal_output_verbose(10, orte_ras_base_framework.framework_output,
+                        "ras/lsf: Set default_hostfile to %s",orte_default_hostfile);
 
     return ORTE_SUCCESS;
 }