1
1

Further cleanups on the LSF integration - the affinity file is apparently always present, but simply empty if affinity wasn't set.

Этот коммит содержится в:
Ralph Castain 2014-12-04 08:58:30 -08:00
родитель dc311e3a4b
Коммит c4002a8485
3 изменённых файлов: 48 добавлений и 61 удалений

Просмотреть файл

@ -299,14 +299,14 @@ static void launch_daemons(int fd, short args, void *cbdata)
the LSF plm) */ the LSF plm) */
cur_prefix = NULL; cur_prefix = NULL;
for (i=0; i < jdata->apps->size; i++) { for (i=0; i < jdata->apps->size; i++) {
char *app_prefix_dir; char *app_prefix_dir=NULL;
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue; continue;
} }
orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING); if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING) &&
NULL != app_prefix_dir) {
/* Check for already set cur_prefix_dir -- if different, /* Check for already set cur_prefix_dir -- if different,
complain */ complain */
if (NULL != app_prefix_dir) {
if (NULL != cur_prefix && if (NULL != cur_prefix &&
0 != strcmp (cur_prefix, app_prefix_dir)) { 0 != strcmp (cur_prefix, app_prefix_dir)) {
orte_show_help("help-plm-lsf.txt", "multiple-prefixes", orte_show_help("help-plm-lsf.txt", "multiple-prefixes",

Просмотреть файл

@ -27,4 +27,10 @@ LSF or your cluster.
While trying to determine what resources are available, LSF returned While trying to determine what resources are available, LSF returned
a list of available nodes from which we were unable to extract anything a list of available nodes from which we were unable to extract anything
usable. This may indicate a problem with LSF or your cluster. usable. This may indicate a problem with LSF or your cluster.
[affinity-file-not-found]
The affinity file provided in LSB_AFFINITY_HOSTFILE could not be found:
File: %s
We cannot continue.

Просмотреть файл

@ -23,6 +23,8 @@
#include <errno.h> #include <errno.h>
#include <unistd.h> #include <unistd.h>
#include <string.h> #include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#define SR1_PJOBS #define SR1_PJOBS
#include <lsf/lsbatch.h> #include <lsf/lsbatch.h>
@ -78,64 +80,9 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
int i, num_nodes; int i, num_nodes;
char *affinity_file, *hstname; char *affinity_file, *hstname;
bool found; bool found;
FILE *fp; struct stat buf;
orte_app_context_t *app; orte_app_context_t *app;
/* check for an affinity file */
if (NULL != (affinity_file = getenv("LSB_AFFINITY_HOSTFILE"))) {
/* the affinity file sequentially lists rank locations, with
* cpusets given as physical cpu-ids. Setup the job object
* so it knows to process this accordingly */
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
}
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_SEQ);
jdata->map->req_mapper = strdup("seq"); // need sequential mapper
/* tell the sequential mapper that all cpusets are to be treated as "physical" */
orte_set_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, true, NULL, OPAL_BOOL);
/* get the apps and set the hostfile attribute in each to point to
* the hostfile */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)&affinity_file, OPAL_STRING);
}
/* read the specified file to get the allocation */
fp = fopen(affinity_file, "r");
if (NULL == fp) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
while (NULL != (hstname = orte_getline(fp))) {
if (0 == strlen(hstname)) {
/* blank line - ignore */
continue;
}
/* see if we already have this node */
found = false;
OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
if (0 == strcmp(node->name, hstname)) {
/* just increment the slots */
++node->slots;
found = true;
break;
}
}
if (!found) {
node = OBJ_NEW(orte_node_t);
node->name = strdup(hstname);
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;
opal_list_append(nodes, &node->super);
}
}
fclose(fp);
return ORTE_SUCCESS;
}
/* get the list of allocated nodes */ /* get the list of allocated nodes */
if ((num_nodes = lsb_getalloc(&nodelist)) < 0) { if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
orte_show_help("help-ras-lsf.txt", "nodelist-failed", true); orte_show_help("help-ras-lsf.txt", "nodelist-failed", true);
@ -165,6 +112,40 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
/* release the nodelist from lsf */ /* release the nodelist from lsf */
opal_argv_free(nodelist); opal_argv_free(nodelist);
/* check for an affinity file */
if (NULL != (affinity_file = getenv("LSB_AFFINITY_HOSTFILE"))) {
/* check to see if the file is empty - if it is,
* then affinity wasn't actually set for this job */
if (0 != stat(affinity_file, &buf)) {
orte_show_help("help-ras-lsf.txt", "affinity-file-not-found", true, affinity_file);
return ORTE_ERR_SILENT;
}
if (0 == buf.st_size) {
/* no affinity, so just return */
return ORTE_SUCCESS;
}
/* the affinity file sequentially lists rank locations, with
* cpusets given as physical cpu-ids. Setup the job object
* so it knows to process this accordingly */
if (NULL == jdata->map) {
jdata->map = OBJ_NEW(orte_job_map_t);
}
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_SEQ);
jdata->map->req_mapper = strdup("seq"); // need sequential mapper
/* tell the sequential mapper that all cpusets are to be treated as "physical" */
orte_set_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, true, NULL, OPAL_BOOL);
/* get the apps and set the hostfile attribute in each to point to
* the hostfile */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
}
return ORTE_SUCCESS;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }