1
1

LSF support is now working. W00t! May be subject to a further tweak or two.

 * checking lsb_init() is not sufficient to know whether you're in an
   LSF job or not; you also need to check for environment variable
   markers (e.g., LSB_JOBID)
 * remove lots of debugging output
 * no need for the sds lsf to call lsb_init()
 * remove some slurm-like dead code and a copy-n-paste error in the
   sds lsf

This commit was SVN r15644.
Этот коммит содержится в:
Jeff Squyres 2007-07-26 18:49:29 +00:00
родитель 8e9c71282d
Коммит 75192de1fc
6 изменённых файлов: 3 добавлений и 86 удалений

Просмотреть файл

@ -142,7 +142,7 @@ static orte_pls_base_module_t *pls_lsf_init(int *priority)
}
/* check if lsf is running here */
if (lsb_init("ORTE launcher") < 0) {
if (NULL == getenv("LSB_JOBID") || lsb_init("ORTE launcher") < 0) {
/* nope, not here */
opal_output_verbose(10, orte_pls_base.pls_output,
"pls:lsf: NOT available for selection");

Просмотреть файл

@ -133,7 +133,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
int proc_name_index = 0;
bool failed_launch = true;
opal_output(0, "pls lsf being used to launch!\n");
if (mca_pls_lsf_component.timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) {
opal_output(0, "pls_lsf: could not obtain job start time");
@ -179,7 +178,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
#endif
num_nodes = map->num_new_daemons;
opal_output(0, "pls lsf num new daemons: %d!\n", num_nodes);
if (num_nodes == 0) {
/* no new daemons required - just launch apps */
goto launch_apps;
@ -231,7 +229,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
/* force orted to use the lsf sds */
opal_argv_append(&argc, &argv, "--ns-nds");
opal_argv_append(&argc, &argv, "lsf");
opal_argv_append(&argc, &argv, "env");
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end
@ -306,16 +304,12 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
* orterun can do the rest of its stuff. Instead, we'll catch any
* failures and deal with them elsewhere
*/
opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' '));
opal_output(0, "launching: %s", opal_argv_join(argv, ' '));
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
opal_output(0, "lsb_launch failed: %d", rc);
rc = ORTE_ERR_FAILED_TO_START;
goto cleanup;
}
opal_output(0, "lsb_launch launched ok; waiting for %d daemons\n",
map->num_new_daemons);
/* wait for daemons to callback */
if (ORTE_SUCCESS !=
@ -325,12 +319,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
}
launch_apps:
{
int i = 0;
opal_output(0, "waiting for attach");
while (i == 0) sleep(5);
}
opal_output(0, "laounching apps using lsf");
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -338,7 +326,6 @@ launch_apps:
/* declare the launch a success */
failed_launch = false;
opal_output(0, "launched apps with lsf ok");
if (mca_pls_lsf_component.timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
@ -382,7 +369,6 @@ cleanup:
}
}
opal_output(0, "lsf pls returning: %d\n", rc);
return rc;
}

Просмотреть файл

@ -97,7 +97,7 @@ static orte_ras_base_module_t *orte_ras_lsf_init(int* priority)
}
/* check if lsf is running here */
if (lsb_init("ORTE launcher") < 0) {
if (NULL == getenv("LSB_JOBID") || lsb_init("ORTE launcher") < 0) {
/* nope, not here */
return NULL;
}

Просмотреть файл

@ -77,24 +77,16 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
/* step through the list */
for (count = i = 0; i < num_nodes; i++) {
opal_output(0, "lsf got node: %s", nodelist[i]);
/* is this a repeat of the current node? */
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
/* it is a repeat - just bump the slot count */
++node->node_slots;
opal_output(0, "lsf ras repeat -- slot count now %d",
node->node_slots);
continue;
}
opal_output(0, "lsf ras new node");
/* not a repeat - create a node entry for it */
node = OBJ_NEW(orte_ras_node_t);
node->node_name = strdup(nodelist[i]);
/* RHC: LSF does not use launch_id, so leave it invalid */
#if 0
node->launch_id = count++;
#endif
node->node_slots_inuse = 0;
node->node_slots_max = 0;
node->node_slots = 1;

Просмотреть файл

@ -71,12 +71,6 @@ orte_sds_base_module_t *orte_sds_lsf_component_init(int *priority)
int id;
char *mode;
/* check if lsf is running here */
if (lsb_init("ORTE launcher") < 0) {
/* nope, not here */
return NULL;
}
id = mca_base_param_register_string("ns", "nds", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &mode);

Просмотреть файл

@ -49,15 +49,12 @@ orte_sds_base_module_t orte_sds_lsf_module = {
orte_sds_lsf_finalize,
};
static char *get_lsf_nodename(int nodeid);
int orte_sds_lsf_set_name(void)
{
int rc;
int id;
char* name_string = NULL;
int lsf_nodeid;
/* start by getting our jobid, and vpid (which is the
starting vpid for the list of daemons) */
@ -76,17 +73,9 @@ int orte_sds_lsf_set_name(void)
} else {
orte_jobid_t jobid;
orte_vpid_t vpid;
char* cellid_string;
char* jobid_string;
char* vpid_string;
id = mca_base_param_register_string("ns", "nds", "cellid", NULL, NULL);
mca_base_param_lookup_string(id, &cellid_string);
if (NULL == cellid_string) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
mca_base_param_lookup_string(id, &jobid_string);
if (NULL == jobid_string) {
@ -119,18 +108,6 @@ int orte_sds_lsf_set_name(void)
}
}
/* fix up the base name and make it the "real" name */
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));
orte_process_info.my_name->vpid += lsf_nodeid;
#if 0
/* fix up the system info nodename to match exactly what lsf returned */
if (NULL != orte_system_info.nodename) {
free(orte_system_info.nodename);
}
orte_system_info.nodename = get_lsf_nodename(lsf_nodeid);
#endif
/* get the non-name common environmental variables */
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {
ORTE_ERROR_LOG(rc);
@ -145,35 +122,3 @@ int orte_sds_lsf_finalize(void)
{
return ORTE_SUCCESS;
}
/**
 * Look up the name of the node at a given index in the LSF node list.
 *
 * The list is read from the OMPI_MCA_orte_lsf_nodelist environment
 * variable and split on commas.
 *
 * @param nodeid  Zero-based index into the comma-separated node list.
 *
 * @return A newly strdup()'ed copy of the node name (the caller must
 *         free() it), or NULL if the environment variable is not set,
 *         the list could not be split, nodeid is out of range, or the
 *         copy could not be made.
 */
static char *get_lsf_nodename(int nodeid)
{
    char **names = NULL;
    char *lsf_nodelist;
    char *ret = NULL;

    lsf_nodelist = getenv("OMPI_MCA_orte_lsf_nodelist");
    if (NULL == lsf_nodelist) {
        return NULL;
    }

    /* split the node list into an argv array */
    names = opal_argv_split(lsf_nodelist, ',');
    if (NULL == names) { /* got an error */
        return NULL;
    }

    /* Valid indices are 0 .. count-1.  The previous check used '>' and
     * so let nodeid == count through, handing the slot just past the
     * last name to strdup(); negative values were not rejected either. */
    if (nodeid >= 0 && nodeid < opal_argv_count(names)) {
        ret = strdup(names[nodeid]);
    }

    /* release the split list on every path, including out-of-range
     * requests, which previously leaked it */
    opal_argv_free(names);

    /* All done */
    return ret;
}