1
1

Compiles to the new API, but doesn't quite work yet...

This commit was SVN r15537.
Этот коммит содержится в:
Jeff Squyres 2007-07-20 19:49:27 +00:00
родитель 7445a11f61
Коммит 2baa866026
4 изменённых файлов: 85 добавлений и 50 удалений

Просмотреть файл

@ -133,7 +133,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
int proc_name_index = 0;
bool failed_launch = true;
printf("pls lsf being used to launch!\n");
opal_output(0, "pls lsf being used to launch!\n");
if (mca_pls_lsf_component.timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) {
opal_output(0, "pls_lsf: could not obtain job start time");
@ -155,18 +155,28 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
goto cleanup;
}
/* account for any reuse of daemons */
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* Iterate through each of the nodes and check to see if we have
* a valid launch_id (must be >= 0). If not, then error out as
* we cannot do anything.
*/
for (item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
orte_mapped_node_t* node = (orte_mapped_node_t*)item;
if (node->launch_id < 0) {
/* JMS fix me */
opal_show_help("help-pls-lsf.txt", "lsf-bad-launchid",
true, node->nodename, node->launch_id);
goto cleanup;
}
}
num_nodes = map->num_new_daemons;
opal_output(0, "pls lsf num new daemons: %d!\n", num_nodes);
if (num_nodes == 0) {
/* nothing to do - just return */
failed_launch = false;
rc = ORTE_SUCCESS;
goto cleanup;
/* no new daemons required - just launch apps */
goto launch_apps;
}
/* create nodelist */
@ -290,29 +300,39 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
* orterun can do the rest of its stuff. Instead, we'll catch any
* failures and deal with them elsewhere
*/
argv = NULL;
argc = 0;
opal_argv_append(&argc, &argv, "env");
opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' '));
opal_output(0, "launching: %s", opal_argv_join(argv, ' '));
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
opal_output(0, "got nonzero: %d", rc);
opal_output(0, "lsb_launch failed: %d", rc);
rc = ORTE_ERR_FAILED_TO_START;
goto cleanup;
}
opal_output(0, "launched ok");
sleep(5);
exit(0);
opal_output(0, "lsb_launch launched ok; waiting for %d daemons\n",
map->num_new_daemons);
/* wait for daemons to callback */
if (ORTE_SUCCESS !=
(rc = orte_pls_base_daemon_callback(map->num_new_daemons))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
rc = ORTE_ERR_FAILED_TO_START;
launch_apps:
{
int i = 0;
opal_output(0, "waiting for attach");
while (i == 0) sleep(5);
}
opal_output(0, "laounching apps using lsf");
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* declare the launch a success */
failed_launch = false;
opal_output(0, "launched apps with lsf ok");
if (mca_pls_lsf_component.timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
@ -355,7 +375,8 @@ cleanup:
ORTE_ERROR_LOG(rc);
}
}
opal_output(0, "lsf pls returning: %d\n", rc);
return rc;
}

Просмотреть файл

@ -9,6 +9,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Cisco, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -36,8 +37,8 @@ endif
AM_CPPFLAGS= $(ras_lsf_CPPFLAGS)
proxy_SOURCES = \
ras_lsf.c \
ras_lsf.h \
ras_lsf_module.c \
ras_lsf_component.c
mcacomponentdir = $(pkglibdir)

Просмотреть файл

@ -35,13 +35,36 @@
#include "ras_lsf.h"
static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
/*
* Local functions
*/
static int allocate(orte_jobid_t jobid, opal_list_t *attributes);
static int deallocate(orte_jobid_t jobid);
static int finalize(void);
/*
* Global variable
*/
/*
 * Function table for the LSF RAS module.  The allocate, deallocate and
 * finalize entries are the static functions declared above; every other
 * entry is one of the shared orte_ras_base_* functions.  The entries are
 * positional, so their order must match the field order of
 * orte_ras_base_module_t (declared elsewhere).
 */
orte_ras_base_module_t orte_ras_lsf_module = {
allocate,
orte_ras_base_node_insert,
orte_ras_base_node_query,
orte_ras_base_node_query_alloc,
orte_ras_base_node_lookup,
orte_ras_base_proc_query_alloc,
deallocate,
finalize
};
static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
char **nodelist;
opal_list_t nodes;
opal_list_item_t *item;
orte_ras_node_t *node;
int i, rc, num_nodes;
int i, count, rc, num_nodes;
/* get the list of allocated nodes */
if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
@ -53,18 +76,24 @@ static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
node = NULL;
/* step through the list */
for (i=0; i < num_nodes; i++) {
printf("lsf got node: %s\n", nodelist[i]);
for (count = i = 0; i < num_nodes; i++) {
opal_output(0, "lsf got node: %s", nodelist[i]);
/* is this a repeat of the current node? */
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
/* it is a repeat - just bump the slot count */
++node->node_slots;
opal_output(0, "lsf ras repeat -- slot count now %d",
node->node_slots);
continue;
}
opal_output(0, "lsf ras new node");
/* not a repeat - create a node entry for it */
node = OBJ_NEW(orte_ras_node_t);
node->node_name = strdup(nodelist[i]);
node->launch_id = count++;
node->node_slots_inuse = 0;
node->node_slots_max = 0;
node->node_slots = 1;
opal_list_append(&nodes, &node->super);
}
@ -101,25 +130,13 @@ cleanup:
return rc;
}
static int orte_ras_lsf_deallocate(orte_jobid_t jobid)
/*
 * Deallocate entry of the LSF RAS module.  Currently a no-op: jobid is
 * ignored and ORTE_SUCCESS is always returned.
 */
static int deallocate(orte_jobid_t jobid)
{
return ORTE_SUCCESS;
}
static int orte_ras_lsf_finalize(void)
/*
 * Finalize entry of the LSF RAS module.  Currently a no-op that always
 * returns ORTE_SUCCESS.
 */
static int finalize(void)
{
return ORTE_SUCCESS;
}
orte_ras_base_module_t orte_ras_lsf_module = {
orte_ras_lsf_allocate,
orte_ras_base_node_insert,
orte_ras_base_node_query,
orte_ras_base_node_query_alloc,
orte_ras_base_node_lookup,
orte_ras_lsf_deallocate,
orte_ras_lsf_finalize
};

Просмотреть файл

@ -59,7 +59,7 @@ int orte_sds_lsf_set_name(void)
char* name_string = NULL;
int lsf_nodeid;
/* start by getting our cellid, jobid, and vpid (which is the
/* start by getting our jobid, and vpid (which is the
starting vpid for the list of daemons) */
id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL);
mca_base_param_lookup_string(id, &name_string);
@ -74,7 +74,6 @@ int orte_sds_lsf_set_name(void)
}
free(name_string);
} else {
orte_cellid_t cellid;
orte_jobid_t jobid;
orte_vpid_t vpid;
char* cellid_string;
@ -87,11 +86,6 @@ int orte_sds_lsf_set_name(void)
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS !=
(rc = orte_ns.convert_string_to_cellid(&cellid, cellid_string))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
mca_base_param_lookup_string(id, &jobid_string);
@ -119,21 +113,23 @@ int orte_sds_lsf_set_name(void)
if (ORTE_SUCCESS !=
(rc = orte_ns.create_process_name(&(orte_process_info.my_name),
cellid, jobid, vpid))) {
jobid, vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* fix up the base name and make it the "real" name */
lsf_nodeid = atoi(getenv("LSB_JOBINDEX"));
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));
orte_process_info.my_name->vpid += lsf_nodeid;
#if 0
/* fix up the system info nodename to match exactly what lsf returned */
if (NULL != orte_system_info.nodename) {
free(orte_system_info.nodename);
}
orte_system_info.nodename = get_lsf_nodename(lsf_nodeid);
#endif
/* get the non-name common environmental variables */
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {