Further cleanups on the LSF integration - the affinity file is apparently always present, but simply empty if affinity wasn't set.
Этот коммит содержится в:
родитель
dc311e3a4b
Коммит
c4002a8485
@ -299,14 +299,14 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
the LSF plm) */
|
||||
cur_prefix = NULL;
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
char *app_prefix_dir;
|
||||
char *app_prefix_dir=NULL;
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
complain */
|
||||
if (NULL != app_prefix_dir) {
|
||||
if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING) &&
|
||||
NULL != app_prefix_dir) {
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
complain */
|
||||
if (NULL != cur_prefix &&
|
||||
0 != strcmp (cur_prefix, app_prefix_dir)) {
|
||||
orte_show_help("help-plm-lsf.txt", "multiple-prefixes",
|
||||
|
@ -27,4 +27,10 @@ LSF or your cluster.
|
||||
While trying to determine what resources are available, LSF returned
|
||||
a list of available nodes from which we were unable to extract anything
|
||||
usable. This may indicate a problem with LSF or your cluster.
|
||||
[affinity-file-not-found]
|
||||
The affinity file provided in LSB_AFFINITY_HOSTFILE could not be found:
|
||||
|
||||
File: %s
|
||||
|
||||
We cannot continue.
|
||||
|
||||
|
@ -23,6 +23,8 @@
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#define SR1_PJOBS
|
||||
#include <lsf/lsbatch.h>
|
||||
@ -78,64 +80,9 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
int i, num_nodes;
|
||||
char *affinity_file, *hstname;
|
||||
bool found;
|
||||
FILE *fp;
|
||||
struct stat buf;
|
||||
orte_app_context_t *app;
|
||||
|
||||
/* check for an affinity file */
|
||||
if (NULL != (affinity_file = getenv("LSB_AFFINITY_HOSTFILE"))) {
|
||||
/* the affinity file sequentially lists rank locations, with
|
||||
* cpusets given as physical cpu-ids. Setup the job object
|
||||
* so it knows to process this accordingly */
|
||||
if (NULL == jdata->map) {
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
}
|
||||
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_SEQ);
|
||||
jdata->map->req_mapper = strdup("seq"); // need sequential mapper
|
||||
/* tell the sequential mapper that all cpusets are to be treated as "physical" */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, true, NULL, OPAL_BOOL);
|
||||
/* get the apps and set the hostfile attribute in each to point to
|
||||
* the hostfile */
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)&affinity_file, OPAL_STRING);
|
||||
}
|
||||
/* read the specified file to get the allocation */
|
||||
fp = fopen(affinity_file, "r");
|
||||
if (NULL == fp) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
while (NULL != (hstname = orte_getline(fp))) {
|
||||
if (0 == strlen(hstname)) {
|
||||
/* blank line - ignore */
|
||||
continue;
|
||||
}
|
||||
/* see if we already have this node */
|
||||
found = false;
|
||||
OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
|
||||
if (0 == strcmp(node->name, hstname)) {
|
||||
/* just increment the slots */
|
||||
++node->slots;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = strdup(hstname);
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = 0;
|
||||
node->slots = 1;
|
||||
opal_list_append(nodes, &node->super);
|
||||
}
|
||||
}
|
||||
fclose(fp);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* get the list of allocated nodes */
|
||||
if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
|
||||
orte_show_help("help-ras-lsf.txt", "nodelist-failed", true);
|
||||
@ -165,6 +112,40 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
/* release the nodelist from lsf */
|
||||
opal_argv_free(nodelist);
|
||||
|
||||
/* check for an affinity file */
|
||||
if (NULL != (affinity_file = getenv("LSB_AFFINITY_HOSTFILE"))) {
|
||||
/* check to see if the file is empty - if it is,
|
||||
* then affinity wasn't actually set for this job */
|
||||
if (0 != stat(affinity_file, &buf))
|
||||
orte_show_help("help-ras-lsf.txt", "affinity-file-not-found", true, affinity_file);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
if (0 == buf.st_size) {
|
||||
/* no affinity, so just return */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* the affinity file sequentially lists rank locations, with
|
||||
* cpusets given as physical cpu-ids. Setup the job object
|
||||
* so it knows to process this accordingly */
|
||||
if (NULL == jdata->map) {
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
}
|
||||
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_SEQ);
|
||||
jdata->map->req_mapper = strdup("seq"); // need sequential mapper
|
||||
/* tell the sequential mapper that all cpusets are to be treated as "physical" */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, true, NULL, OPAL_BOOL);
|
||||
/* get the apps and set the hostfile attribute in each to point to
|
||||
* the hostfile */
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user