LSF support is now working. W00t! May be subject to a further tweak
or two.

* Checking lsb_init() is not sufficient to know whether you're in an LSF job or not; you also need to check for environment variable markers (this commit tests for LSB_JOBID).
* Remove lots of debugging output.
* No need for the sds lsf to call lsb_init().
* Remove some slurm-like dead code and a copy-n-paste error in the sds lsf.

This commit was SVN r15644.
Этот коммит содержится в:
родитель
8e9c71282d
Коммит
75192de1fc
@ -142,7 +142,7 @@ static orte_pls_base_module_t *pls_lsf_init(int *priority)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* check if lsf is running here */
|
/* check if lsf is running here */
|
||||||
if (lsb_init("ORTE launcher") < 0) {
|
if (NULL == getenv("LSB_JOBID") || lsb_init("ORTE launcher") < 0) {
|
||||||
/* nope, not here */
|
/* nope, not here */
|
||||||
opal_output_verbose(10, orte_pls_base.pls_output,
|
opal_output_verbose(10, orte_pls_base.pls_output,
|
||||||
"pls:lsf: NOT available for selection");
|
"pls:lsf: NOT available for selection");
|
||||||
|
@ -133,7 +133,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
|||||||
int proc_name_index = 0;
|
int proc_name_index = 0;
|
||||||
bool failed_launch = true;
|
bool failed_launch = true;
|
||||||
|
|
||||||
opal_output(0, "pls lsf being used to launch!\n");
|
|
||||||
if (mca_pls_lsf_component.timing) {
|
if (mca_pls_lsf_component.timing) {
|
||||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||||
opal_output(0, "pls_lsf: could not obtain job start time");
|
opal_output(0, "pls_lsf: could not obtain job start time");
|
||||||
@ -179,7 +178,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
num_nodes = map->num_new_daemons;
|
num_nodes = map->num_new_daemons;
|
||||||
opal_output(0, "pls lsf num new daemons: %d!\n", num_nodes);
|
|
||||||
if (num_nodes == 0) {
|
if (num_nodes == 0) {
|
||||||
/* no new daemons required - just launch apps */
|
/* no new daemons required - just launch apps */
|
||||||
goto launch_apps;
|
goto launch_apps;
|
||||||
@ -231,7 +229,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
|||||||
|
|
||||||
/* force orted to use the lsf sds */
|
/* force orted to use the lsf sds */
|
||||||
opal_argv_append(&argc, &argv, "--ns-nds");
|
opal_argv_append(&argc, &argv, "--ns-nds");
|
||||||
opal_argv_append(&argc, &argv, "lsf");
|
opal_argv_append(&argc, &argv, "env");
|
||||||
|
|
||||||
/* tell the new daemons the base of the name list so they can compute
|
/* tell the new daemons the base of the name list so they can compute
|
||||||
* their own name on the other end
|
* their own name on the other end
|
||||||
@ -306,16 +304,12 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
|||||||
* orterun can do the rest of its stuff. Instead, we'll catch any
|
* orterun can do the rest of its stuff. Instead, we'll catch any
|
||||||
* failures and deal with them elsewhere
|
* failures and deal with them elsewhere
|
||||||
*/
|
*/
|
||||||
opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' '));
|
|
||||||
opal_output(0, "launching: %s", opal_argv_join(argv, ' '));
|
|
||||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||||
opal_output(0, "lsb_launch failed: %d", rc);
|
opal_output(0, "lsb_launch failed: %d", rc);
|
||||||
rc = ORTE_ERR_FAILED_TO_START;
|
rc = ORTE_ERR_FAILED_TO_START;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
opal_output(0, "lsb_launch launched ok; waiting for %d daemons\n",
|
|
||||||
map->num_new_daemons);
|
|
||||||
|
|
||||||
/* wait for daemons to callback */
|
/* wait for daemons to callback */
|
||||||
if (ORTE_SUCCESS !=
|
if (ORTE_SUCCESS !=
|
||||||
@ -325,12 +319,6 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
|||||||
}
|
}
|
||||||
|
|
||||||
launch_apps:
|
launch_apps:
|
||||||
{
|
|
||||||
int i = 0;
|
|
||||||
opal_output(0, "waiting for attach");
|
|
||||||
while (i == 0) sleep(5);
|
|
||||||
}
|
|
||||||
opal_output(0, "laounching apps using lsf");
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
|
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
@ -338,7 +326,6 @@ launch_apps:
|
|||||||
|
|
||||||
/* declare the launch a success */
|
/* declare the launch a success */
|
||||||
failed_launch = false;
|
failed_launch = false;
|
||||||
opal_output(0, "launched apps with lsf ok");
|
|
||||||
|
|
||||||
if (mca_pls_lsf_component.timing) {
|
if (mca_pls_lsf_component.timing) {
|
||||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||||
@ -382,7 +369,6 @@ cleanup:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_output(0, "lsf pls returning: %d\n", rc);
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,7 +97,7 @@ static orte_ras_base_module_t *orte_ras_lsf_init(int* priority)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* check if lsf is running here */
|
/* check if lsf is running here */
|
||||||
if (lsb_init("ORTE launcher") < 0) {
|
if (NULL == getenv("LSB_JOBID") || lsb_init("ORTE launcher") < 0) {
|
||||||
/* nope, not here */
|
/* nope, not here */
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -77,24 +77,16 @@ static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
|||||||
|
|
||||||
/* step through the list */
|
/* step through the list */
|
||||||
for (count = i = 0; i < num_nodes; i++) {
|
for (count = i = 0; i < num_nodes; i++) {
|
||||||
opal_output(0, "lsf got node: %s", nodelist[i]);
|
|
||||||
/* is this a repeat of the current node? */
|
/* is this a repeat of the current node? */
|
||||||
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
|
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
|
||||||
/* it is a repeat - just bump the slot count */
|
/* it is a repeat - just bump the slot count */
|
||||||
++node->node_slots;
|
++node->node_slots;
|
||||||
opal_output(0, "lsf ras repeat -- slot count now %d",
|
|
||||||
node->node_slots);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_output(0, "lsf ras new node");
|
|
||||||
/* not a repeat - create a node entry for it */
|
/* not a repeat - create a node entry for it */
|
||||||
node = OBJ_NEW(orte_ras_node_t);
|
node = OBJ_NEW(orte_ras_node_t);
|
||||||
node->node_name = strdup(nodelist[i]);
|
node->node_name = strdup(nodelist[i]);
|
||||||
/* RHC: LSF does not use launch_id, so leave it invalid */
|
|
||||||
#if 0
|
|
||||||
node->launch_id = count++;
|
|
||||||
#endif
|
|
||||||
node->node_slots_inuse = 0;
|
node->node_slots_inuse = 0;
|
||||||
node->node_slots_max = 0;
|
node->node_slots_max = 0;
|
||||||
node->node_slots = 1;
|
node->node_slots = 1;
|
||||||
|
@ -71,12 +71,6 @@ orte_sds_base_module_t *orte_sds_lsf_component_init(int *priority)
|
|||||||
int id;
|
int id;
|
||||||
char *mode;
|
char *mode;
|
||||||
|
|
||||||
/* check if lsf is running here */
|
|
||||||
if (lsb_init("ORTE launcher") < 0) {
|
|
||||||
/* nope, not here */
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
id = mca_base_param_register_string("ns", "nds", NULL, NULL, NULL);
|
id = mca_base_param_register_string("ns", "nds", NULL, NULL, NULL);
|
||||||
mca_base_param_lookup_string(id, &mode);
|
mca_base_param_lookup_string(id, &mode);
|
||||||
|
|
||||||
|
@ -49,15 +49,12 @@ orte_sds_base_module_t orte_sds_lsf_module = {
|
|||||||
orte_sds_lsf_finalize,
|
orte_sds_lsf_finalize,
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *get_lsf_nodename(int nodeid);
|
|
||||||
|
|
||||||
|
|
||||||
int orte_sds_lsf_set_name(void)
|
int orte_sds_lsf_set_name(void)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
int id;
|
int id;
|
||||||
char* name_string = NULL;
|
char* name_string = NULL;
|
||||||
int lsf_nodeid;
|
|
||||||
|
|
||||||
/* start by getting our jobid, and vpid (which is the
|
/* start by getting our jobid, and vpid (which is the
|
||||||
starting vpid for the list of daemons) */
|
starting vpid for the list of daemons) */
|
||||||
@ -76,17 +73,9 @@ int orte_sds_lsf_set_name(void)
|
|||||||
} else {
|
} else {
|
||||||
orte_jobid_t jobid;
|
orte_jobid_t jobid;
|
||||||
orte_vpid_t vpid;
|
orte_vpid_t vpid;
|
||||||
char* cellid_string;
|
|
||||||
char* jobid_string;
|
char* jobid_string;
|
||||||
char* vpid_string;
|
char* vpid_string;
|
||||||
|
|
||||||
id = mca_base_param_register_string("ns", "nds", "cellid", NULL, NULL);
|
|
||||||
mca_base_param_lookup_string(id, &cellid_string);
|
|
||||||
if (NULL == cellid_string) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
||||||
return ORTE_ERR_NOT_FOUND;
|
|
||||||
}
|
|
||||||
|
|
||||||
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
|
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
|
||||||
mca_base_param_lookup_string(id, &jobid_string);
|
mca_base_param_lookup_string(id, &jobid_string);
|
||||||
if (NULL == jobid_string) {
|
if (NULL == jobid_string) {
|
||||||
@ -119,18 +108,6 @@ int orte_sds_lsf_set_name(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* fix up the base name and make it the "real" name */
|
|
||||||
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));
|
|
||||||
orte_process_info.my_name->vpid += lsf_nodeid;
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
/* fix up the system info nodename to match exactly what lsf returned */
|
|
||||||
if (NULL != orte_system_info.nodename) {
|
|
||||||
free(orte_system_info.nodename);
|
|
||||||
}
|
|
||||||
orte_system_info.nodename = get_lsf_nodename(lsf_nodeid);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* get the non-name common environmental variables */
|
/* get the non-name common environmental variables */
|
||||||
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {
|
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -145,35 +122,3 @@ int orte_sds_lsf_finalize(void)
|
|||||||
{
|
{
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static char *get_lsf_nodename(int nodeid)
|
|
||||||
{
|
|
||||||
char **names = NULL;
|
|
||||||
char *lsf_nodelist;
|
|
||||||
char *ret;
|
|
||||||
|
|
||||||
lsf_nodelist = getenv("OMPI_MCA_orte_lsf_nodelist");
|
|
||||||
|
|
||||||
if (NULL == lsf_nodelist) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* split the node list into an argv array */
|
|
||||||
names = opal_argv_split(lsf_nodelist, ',');
|
|
||||||
if (NULL == names) { /* got an error */
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* check to see if there are enough entries */
|
|
||||||
if (nodeid > opal_argv_count(names)) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = strdup(names[nodeid]);
|
|
||||||
|
|
||||||
opal_argv_free(names);
|
|
||||||
|
|
||||||
/* All done */
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user