LSF support is now working. W00t! May be subject to a further tweak
or two.

* Checking lsb_init() alone is not sufficient to know whether you're in an LSF job or not; you also need to check for environment variable markers (e.g. LSB_JOBID).
* Remove lots of debugging output.
* No need for the sds lsf component to call lsb_init().
* Remove some slurm-like dead code and a copy-n-paste error in the sds lsf module.

This commit was SVN r15644.
Этот коммит содержится в:
родитель
8e9c71282d
Коммит
75192de1fc
@ -142,7 +142,7 @@ static orte_pls_base_module_t *pls_lsf_init(int *priority)
|
||||
}
|
||||
|
||||
/* check if lsf is running here */
|
||||
if (lsb_init("ORTE launcher") < 0) {
|
||||
if (NULL == getenv("LSB_JOBID") || lsb_init("ORTE launcher") < 0) {
|
||||
/* nope, not here */
|
||||
opal_output_verbose(10, orte_pls_base.pls_output,
|
||||
"pls:lsf: NOT available for selection");
|
||||
|
@ -133,7 +133,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
int proc_name_index = 0;
|
||||
bool failed_launch = true;
|
||||
|
||||
opal_output(0, "pls lsf being used to launch!\n");
|
||||
if (mca_pls_lsf_component.timing) {
|
||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||
opal_output(0, "pls_lsf: could not obtain job start time");
|
||||
@ -179,7 +178,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
#endif
|
||||
|
||||
num_nodes = map->num_new_daemons;
|
||||
opal_output(0, "pls lsf num new daemons: %d!\n", num_nodes);
|
||||
if (num_nodes == 0) {
|
||||
/* no new daemons required - just launch apps */
|
||||
goto launch_apps;
|
||||
@ -231,7 +229,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
|
||||
/* force orted to use the lsf sds */
|
||||
opal_argv_append(&argc, &argv, "--ns-nds");
|
||||
opal_argv_append(&argc, &argv, "lsf");
|
||||
opal_argv_append(&argc, &argv, "env");
|
||||
|
||||
/* tell the new daemons the base of the name list so they can compute
|
||||
* their own name on the other end
|
||||
@ -306,16 +304,12 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
* orterun can do the rest of its stuff. Instead, we'll catch any
|
||||
* failures and deal with them elsewhere
|
||||
*/
|
||||
opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' '));
|
||||
opal_output(0, "launching: %s", opal_argv_join(argv, ' '));
|
||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||
opal_output(0, "lsb_launch failed: %d", rc);
|
||||
rc = ORTE_ERR_FAILED_TO_START;
|
||||
goto cleanup;
|
||||
}
|
||||
opal_output(0, "lsb_launch launched ok; waiting for %d daemons\n",
|
||||
map->num_new_daemons);
|
||||
|
||||
/* wait for daemons to callback */
|
||||
if (ORTE_SUCCESS !=
|
||||
@ -325,12 +319,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
}
|
||||
|
||||
launch_apps:
|
||||
{
|
||||
int i = 0;
|
||||
opal_output(0, "waiting for attach");
|
||||
while (i == 0) sleep(5);
|
||||
}
|
||||
opal_output(0, "laounching apps using lsf");
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -338,7 +326,6 @@ launch_apps:
|
||||
|
||||
/* declare the launch a success */
|
||||
failed_launch = false;
|
||||
opal_output(0, "launched apps with lsf ok");
|
||||
|
||||
if (mca_pls_lsf_component.timing) {
|
||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||
@ -382,7 +369,6 @@ cleanup:
|
||||
}
|
||||
}
|
||||
|
||||
opal_output(0, "lsf pls returning: %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -97,7 +97,7 @@ static orte_ras_base_module_t *orte_ras_lsf_init(int* priority)
|
||||
}
|
||||
|
||||
/* check if lsf is running here */
|
||||
if (lsb_init("ORTE launcher") < 0) {
|
||||
if (NULL == getenv("LSB_JOBID") || lsb_init("ORTE launcher") < 0) {
|
||||
/* nope, not here */
|
||||
return NULL;
|
||||
}
|
||||
|
@ -77,24 +77,16 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
|
||||
/* step through the list */
|
||||
for (count = i = 0; i < num_nodes; i++) {
|
||||
opal_output(0, "lsf got node: %s", nodelist[i]);
|
||||
/* is this a repeat of the current node? */
|
||||
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
|
||||
/* it is a repeat - just bump the slot count */
|
||||
++node->node_slots;
|
||||
opal_output(0, "lsf ras repeat -- slot count now %d",
|
||||
node->node_slots);
|
||||
continue;
|
||||
}
|
||||
|
||||
opal_output(0, "lsf ras new node");
|
||||
/* not a repeat - create a node entry for it */
|
||||
node = OBJ_NEW(orte_ras_node_t);
|
||||
node->node_name = strdup(nodelist[i]);
|
||||
/* RHC: LSF does not use launch_id, so leave it invalid */
|
||||
#if 0
|
||||
node->launch_id = count++;
|
||||
#endif
|
||||
node->node_slots_inuse = 0;
|
||||
node->node_slots_max = 0;
|
||||
node->node_slots = 1;
|
||||
|
@ -71,12 +71,6 @@ orte_sds_base_module_t *orte_sds_lsf_component_init(int *priority)
|
||||
int id;
|
||||
char *mode;
|
||||
|
||||
/* check if lsf is running here */
|
||||
if (lsb_init("ORTE launcher") < 0) {
|
||||
/* nope, not here */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
id = mca_base_param_register_string("ns", "nds", NULL, NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &mode);
|
||||
|
||||
|
@ -49,15 +49,12 @@ orte_sds_base_module_t orte_sds_lsf_module = {
|
||||
orte_sds_lsf_finalize,
|
||||
};
|
||||
|
||||
static char *get_lsf_nodename(int nodeid);
|
||||
|
||||
|
||||
int orte_sds_lsf_set_name(void)
|
||||
{
|
||||
int rc;
|
||||
int id;
|
||||
char* name_string = NULL;
|
||||
int lsf_nodeid;
|
||||
|
||||
/* start by getting our jobid, and vpid (which is the
|
||||
starting vpid for the list of daemons) */
|
||||
@ -76,17 +73,9 @@ int orte_sds_lsf_set_name(void)
|
||||
} else {
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
char* cellid_string;
|
||||
char* jobid_string;
|
||||
char* vpid_string;
|
||||
|
||||
id = mca_base_param_register_string("ns", "nds", "cellid", NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &cellid_string);
|
||||
if (NULL == cellid_string) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &jobid_string);
|
||||
if (NULL == jobid_string) {
|
||||
@ -119,18 +108,6 @@ int orte_sds_lsf_set_name(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* fix up the base name and make it the "real" name */
|
||||
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));
|
||||
orte_process_info.my_name->vpid += lsf_nodeid;
|
||||
|
||||
#if 0
|
||||
/* fix up the system info nodename to match exactly what lsf returned */
|
||||
if (NULL != orte_system_info.nodename) {
|
||||
free(orte_system_info.nodename);
|
||||
}
|
||||
orte_system_info.nodename = get_lsf_nodename(lsf_nodeid);
|
||||
#endif
|
||||
|
||||
/* get the non-name common environmental variables */
|
||||
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -145,35 +122,3 @@ int orte_sds_lsf_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Look up the name of the node at a given position in the LSF node list.
 *
 * The list is read from the OMPI_MCA_orte_lsf_nodelist environment
 * variable and is split on commas.
 *
 * @param nodeid  Zero-based index into the comma-separated node list.
 *
 * @return A strdup()'d copy of the selected node name (the caller is
 *         responsible for free()ing it), or NULL when the environment
 *         variable is not set, the list cannot be split, nodeid is out
 *         of range, or the copy cannot be allocated.
 */
static char *get_lsf_nodename(int nodeid)
{
    char **names = NULL;
    char *lsf_nodelist;
    char *ret = NULL;

    lsf_nodelist = getenv("OMPI_MCA_orte_lsf_nodelist");
    if (NULL == lsf_nodelist) {
        return NULL;
    }

    /* split the node list into an argv array */
    names = opal_argv_split(lsf_nodelist, ',');
    if (NULL == names) {  /* got an error */
        return NULL;
    }

    /* Valid indices are 0 .. count-1.  The previous check used '>' and so
     * let nodeid == count through, which indexes the slot just past the
     * last name (presumably the argv NULL terminator) and hands it to
     * strdup(); it also accepted negative indices. */
    if (nodeid >= 0 && nodeid < opal_argv_count(names)) {
        ret = strdup(names[nodeid]);
    }

    /* release the split array on every path, including out-of-range */
    opal_argv_free(names);

    /* All done */
    return ret;
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user