diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c
index 269a62c686..b910a1d56d 100644
--- a/orte/mca/plm/lsf/plm_lsf_module.c
+++ b/orte/mca/plm/lsf/plm_lsf_module.c
@@ -299,14 +299,14 @@ static void launch_daemons(int fd, short args, void *cbdata)
        the LSF plm) */
     cur_prefix = NULL;
     for (i=0; i < jdata->apps->size; i++) {
-        char *app_prefix_dir;
+        char *app_prefix_dir=NULL;
         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
             continue;
         }
-        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
-        /* Check for already set cur_prefix_dir -- if different,
-           complain */
-        if (NULL != app_prefix_dir) {
+        if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING) &&
+            NULL != app_prefix_dir) {
+            /* Check for already set cur_prefix -- if different,
+               complain */
             if (NULL != cur_prefix &&
                 0 != strcmp (cur_prefix, app_prefix_dir)) {
                 orte_show_help("help-plm-lsf.txt", "multiple-prefixes",
diff --git a/orte/mca/ras/lsf/help-ras-lsf.txt b/orte/mca/ras/lsf/help-ras-lsf.txt
index 8f3dbe2680..c846b74269 100644
--- a/orte/mca/ras/lsf/help-ras-lsf.txt
+++ b/orte/mca/ras/lsf/help-ras-lsf.txt
@@ -27,4 +27,10 @@ LSF or your cluster.
 While trying to determine what resources are available, LSF returned a
 list of available nodes from which we were unable to extract anything
 usable. This may indicate a problem with LSF or your cluster.
+[affinity-file-not-found]
+The affinity file provided in LSB_AFFINITY_HOSTFILE could not be found:
+
+  File: %s
+
+We cannot continue.
 
diff --git a/orte/mca/ras/lsf/ras_lsf_module.c b/orte/mca/ras/lsf/ras_lsf_module.c
index 34b85aa21b..fce34bcf77 100644
--- a/orte/mca/ras/lsf/ras_lsf_module.c
+++ b/orte/mca/ras/lsf/ras_lsf_module.c
@@ -23,6 +23,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #define SR1_PJOBS
 #include
@@ -78,64 +80,9 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
     int i, num_nodes;
     char *affinity_file, *hstname;
     bool found;
-    FILE *fp;
+    struct stat buf;
     orte_app_context_t *app;
 
-    /* check for an affinity file */
-    if (NULL != (affinity_file = getenv("LSB_AFFINITY_HOSTFILE"))) {
-        /* the affinity file sequentially lists rank locations, with
-         * cpusets given as physical cpu-ids. Setup the job object
-         * so it knows to process this accordingly */
-        if (NULL == jdata->map) {
-            jdata->map = OBJ_NEW(orte_job_map_t);
-        }
-        ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_SEQ);
-        jdata->map->req_mapper = strdup("seq"); // need sequential mapper
-        /* tell the sequential mapper that all cpusets are to be treated as "physical" */
-        orte_set_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, true, NULL, OPAL_BOOL);
-        /* get the apps and set the hostfile attribute in each to point to
-         * the hostfile */
-        for (i=0; i < jdata->apps->size; i++) {
-            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
-                continue;
-            }
-            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)&affinity_file, OPAL_STRING);
-        }
-        /* read the specified file to get the allocation */
-        fp = fopen(affinity_file, "r");
-        if (NULL == fp) {
-            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
-            return ORTE_ERR_NOT_FOUND;
-        }
-        while (NULL != (hstname = orte_getline(fp))) {
-            if (0 == strlen(hstname)) {
-                /* blank line - ignore */
-                continue;
-            }
-            /* see if we already have this node */
-            found = false;
-            OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
-                if (0 == strcmp(node->name, hstname)) {
-                    /* just increment the slots */
-                    ++node->slots;
-                    found = true;
-                    break;
-                }
-            }
-            if (!found) {
-                node = OBJ_NEW(orte_node_t);
-                node->name = strdup(hstname);
-                node->slots_inuse = 0;
-                node->slots_max = 0;
-                node->slots = 1;
-                opal_list_append(nodes, &node->super);
-            }
-        }
-        fclose(fp);
-
-        return ORTE_SUCCESS;
-    }
-
     /* get the list of allocated nodes */
     if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
         orte_show_help("help-ras-lsf.txt", "nodelist-failed", true);
@@ -165,6 +112,40 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
     /* release the nodelist from lsf */
     opal_argv_free(nodelist);
 
+    /* check for an affinity file */
+    if (NULL != (affinity_file = getenv("LSB_AFFINITY_HOSTFILE"))) {
+        /* the file must exist; if it is empty, then
+         * affinity wasn't actually set for this job */
+        if (0 != stat(affinity_file, &buf)) {
+            orte_show_help("help-ras-lsf.txt", "affinity-file-not-found", true, affinity_file);
+            return ORTE_ERR_SILENT;
+        }
+        if (0 == buf.st_size) {
+            /* no affinity, so just return */
+            return ORTE_SUCCESS;
+        }
+        /* the affinity file sequentially lists rank locations, with
+         * cpusets given as physical cpu-ids. Setup the job object
+         * so it knows to process this accordingly */
+        if (NULL == jdata->map) {
+            jdata->map = OBJ_NEW(orte_job_map_t);
+        }
+        ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_SEQ);
+        jdata->map->req_mapper = strdup("seq"); // need sequential mapper
+        /* tell the sequential mapper that all cpusets are to be treated as "physical" */
+        orte_set_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, true, NULL, OPAL_BOOL);
+        /* get the apps and set the hostfile attribute in each to point to
+         * the hostfile */
+        for (i=0; i < jdata->apps->size; i++) {
+            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
+                continue;
+            }
+            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)affinity_file, OPAL_STRING);
+        }
+
+        return ORTE_SUCCESS;
+    }
+
     return ORTE_SUCCESS;
 }
 