Compiles to the new API, but doesn't quite work yet...
This commit was SVN r15537.
Этот коммит содержится в:
родитель
7445a11f61
Коммит
2baa866026
@ -133,7 +133,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
int proc_name_index = 0;
|
||||
bool failed_launch = true;
|
||||
|
||||
printf("pls lsf being used to launch!\n");
|
||||
opal_output(0, "pls lsf being used to launch!\n");
|
||||
if (mca_pls_lsf_component.timing) {
|
||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||
opal_output(0, "pls_lsf: could not obtain job start time");
|
||||
@ -155,18 +155,28 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* account for any reuse of daemons */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Iterate through each of the nodes and check to see if we have
|
||||
* a valid launch_id (must be > 0). If not, then error out as
|
||||
* we cannot do anything
|
||||
*/
|
||||
for (item = opal_list_get_first(&map->nodes);
|
||||
item != opal_list_get_end(&map->nodes);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_mapped_node_t* node = (orte_mapped_node_t*)item;
|
||||
|
||||
if (node->launch_id < 0) {
|
||||
/* JMS fix me */
|
||||
opal_show_help("help-pls-lsf.txt", "lsf-bad-launchid",
|
||||
true, node->nodename, node->launch_id);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
num_nodes = map->num_new_daemons;
|
||||
opal_output(0, "pls lsf num new daemons: %d!\n", num_nodes);
|
||||
if (num_nodes == 0) {
|
||||
/* nothing to do - just return */
|
||||
failed_launch = false;
|
||||
rc = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
/* no new daemons required - just launch apps */
|
||||
goto launch_apps;
|
||||
}
|
||||
|
||||
/* create nodelist */
|
||||
@ -290,29 +300,39 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
* orterun can do the rest of its stuff. Instead, we'll catch any
|
||||
* failures and deal with them elsewhere
|
||||
*/
|
||||
argv = NULL;
|
||||
argc = 0;
|
||||
opal_argv_append(&argc, &argv, "env");
|
||||
opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' '));
|
||||
opal_output(0, "launching: %s", opal_argv_join(argv, ' '));
|
||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||
opal_output(0, "got nonzero: %d", rc);
|
||||
opal_output(0, "lsb_launch failed: %d", rc);
|
||||
rc = ORTE_ERR_FAILED_TO_START;
|
||||
goto cleanup;
|
||||
}
|
||||
opal_output(0, "launched ok");
|
||||
sleep(5);
|
||||
exit(0);
|
||||
opal_output(0, "lsb_launch launched ok; waiting for %d daemons\n",
|
||||
map->num_new_daemons);
|
||||
|
||||
/* wait for daemons to callback */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_pls_base_daemon_callback(map->num_new_daemons))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||
rc = ORTE_ERR_FAILED_TO_START;
|
||||
launch_apps:
|
||||
{
|
||||
int i = 0;
|
||||
opal_output(0, "waiting for attach");
|
||||
while (i == 0) sleep(5);
|
||||
}
|
||||
opal_output(0, "laounching apps using lsf");
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* declare the launch a success */
|
||||
failed_launch = false;
|
||||
opal_output(0, "launched apps with lsf ok");
|
||||
|
||||
if (mca_pls_lsf_component.timing) {
|
||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||
@ -355,7 +375,8 @@ cleanup:
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
opal_output(0, "lsf pls returning: %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -36,8 +37,8 @@ endif
|
||||
AM_CPPFLAGS= $(ras_lsf_CPPFLAGS)
|
||||
|
||||
proxy_SOURCES = \
|
||||
ras_lsf.c \
|
||||
ras_lsf.h \
|
||||
ras_lsf_module.c \
|
||||
ras_lsf_component.c
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
|
@ -35,13 +35,36 @@
|
||||
#include "ras_lsf.h"
|
||||
|
||||
|
||||
static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int allocate(orte_jobid_t jobid, opal_list_t *attributes);
|
||||
static int deallocate(orte_jobid_t jobid);
|
||||
static int finalize(void);
|
||||
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
*/
|
||||
orte_ras_base_module_t orte_ras_lsf_module = {
|
||||
allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_base_proc_query_alloc,
|
||||
deallocate,
|
||||
finalize
|
||||
};
|
||||
|
||||
|
||||
static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
char **nodelist;
|
||||
opal_list_t nodes;
|
||||
opal_list_item_t *item;
|
||||
orte_ras_node_t *node;
|
||||
int i, rc, num_nodes;
|
||||
int i, count, rc, num_nodes;
|
||||
|
||||
/* get the list of allocated nodes */
|
||||
if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
|
||||
@ -53,18 +76,24 @@ static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
node = NULL;
|
||||
|
||||
/* step through the list */
|
||||
for (i=0; i < num_nodes; i++) {
|
||||
printf("lsf got node: %s\n", nodelist[i]);
|
||||
for (count = i = 0; i < num_nodes; i++) {
|
||||
opal_output(0, "lsf got node: %s", nodelist[i]);
|
||||
/* is this a repeat of the current node? */
|
||||
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
|
||||
/* it is a repeat - just bump the slot count */
|
||||
++node->node_slots;
|
||||
opal_output(0, "lsf ras repeat -- slot count now %d",
|
||||
node->node_slots);
|
||||
continue;
|
||||
}
|
||||
|
||||
opal_output(0, "lsf ras new node");
|
||||
/* not a repeat - create a node entry for it */
|
||||
node = OBJ_NEW(orte_ras_node_t);
|
||||
node->node_name = strdup(nodelist[i]);
|
||||
node->launch_id = count++;
|
||||
node->node_slots_inuse = 0;
|
||||
node->node_slots_max = 0;
|
||||
node->node_slots = 1;
|
||||
opal_list_append(&nodes, &node->super);
|
||||
}
|
||||
@ -101,25 +130,13 @@ cleanup:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int orte_ras_lsf_deallocate(orte_jobid_t jobid)
|
||||
static int deallocate(orte_jobid_t jobid)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int orte_ras_lsf_finalize(void)
|
||||
static int finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_ras_base_module_t orte_ras_lsf_module = {
|
||||
orte_ras_lsf_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_lsf_deallocate,
|
||||
orte_ras_lsf_finalize
|
||||
};
|
||||
|
@ -59,7 +59,7 @@ int orte_sds_lsf_set_name(void)
|
||||
char* name_string = NULL;
|
||||
int lsf_nodeid;
|
||||
|
||||
/* start by getting our cellid, jobid, and vpid (which is the
|
||||
/* start by getting our jobid, and vpid (which is the
|
||||
starting vpid for the list of daemons) */
|
||||
id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &name_string);
|
||||
@ -74,7 +74,6 @@ int orte_sds_lsf_set_name(void)
|
||||
}
|
||||
free(name_string);
|
||||
} else {
|
||||
orte_cellid_t cellid;
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
char* cellid_string;
|
||||
@ -87,11 +86,6 @@ int orte_sds_lsf_set_name(void)
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_ns.convert_string_to_cellid(&cellid, cellid_string))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return(rc);
|
||||
}
|
||||
|
||||
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &jobid_string);
|
||||
@ -119,21 +113,23 @@ int orte_sds_lsf_set_name(void)
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_ns.create_process_name(&(orte_process_info.my_name),
|
||||
cellid, jobid, vpid))) {
|
||||
jobid, vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* fix up the base name and make it the "real" name */
|
||||
lsf_nodeid = atoi(getenv("LSB_JOBINDEX"));
|
||||
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));
|
||||
orte_process_info.my_name->vpid += lsf_nodeid;
|
||||
|
||||
#if 0
|
||||
/* fix up the system info nodename to match exactly what lsf returned */
|
||||
if (NULL != orte_system_info.nodename) {
|
||||
free(orte_system_info.nodename);
|
||||
}
|
||||
orte_system_info.nodename = get_lsf_nodename(lsf_nodeid);
|
||||
#endif
|
||||
|
||||
/* get the non-name common environmental variables */
|
||||
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user