Compiles to the new API, but doesn't quite work yet...
This commit was SVN r15537.
Этот коммит содержится в:
родитель
7445a11f61
Коммит
2baa866026
@ -133,7 +133,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
|||||||
int proc_name_index = 0;
|
int proc_name_index = 0;
|
||||||
bool failed_launch = true;
|
bool failed_launch = true;
|
||||||
|
|
||||||
printf("pls lsf being used to launch!\n");
|
opal_output(0, "pls lsf being used to launch!\n");
|
||||||
if (mca_pls_lsf_component.timing) {
|
if (mca_pls_lsf_component.timing) {
|
||||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||||
opal_output(0, "pls_lsf: could not obtain job start time");
|
opal_output(0, "pls_lsf: could not obtain job start time");
|
||||||
@ -155,18 +155,28 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* account for any reuse of daemons */
|
/* Iterate through each of the nodes and check to see if we have
|
||||||
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
|
* a valid launch_id (must be >= 0).  If not, then error out as
|
||||||
ORTE_ERROR_LOG(rc);
|
* we cannot do anything
|
||||||
goto cleanup;
|
*/
|
||||||
}
|
for (item = opal_list_get_first(&map->nodes);
|
||||||
|
item != opal_list_get_end(&map->nodes);
|
||||||
|
item = opal_list_get_next(item)) {
|
||||||
|
orte_mapped_node_t* node = (orte_mapped_node_t*)item;
|
||||||
|
|
||||||
|
if (node->launch_id < 0) {
|
||||||
|
/* JMS fix me */
|
||||||
|
opal_show_help("help-pls-lsf.txt", "lsf-bad-launchid",
|
||||||
|
true, node->nodename, node->launch_id);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
num_nodes = map->num_new_daemons;
|
num_nodes = map->num_new_daemons;
|
||||||
|
opal_output(0, "pls lsf num new daemons: %d!\n", num_nodes);
|
||||||
if (num_nodes == 0) {
|
if (num_nodes == 0) {
|
||||||
/* nothing to do - just return */
|
/* no new daemons required - just launch apps */
|
||||||
failed_launch = false;
|
goto launch_apps;
|
||||||
rc = ORTE_SUCCESS;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* create nodelist */
|
/* create nodelist */
|
||||||
@ -290,29 +300,39 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
|||||||
* orterun can do the rest of its stuff. Instead, we'll catch any
|
* orterun can do the rest of its stuff. Instead, we'll catch any
|
||||||
* failures and deal with them elsewhere
|
* failures and deal with them elsewhere
|
||||||
*/
|
*/
|
||||||
argv = NULL;
|
|
||||||
argc = 0;
|
|
||||||
opal_argv_append(&argc, &argv, "env");
|
|
||||||
opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' '));
|
opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' '));
|
||||||
opal_output(0, "launching: %s", opal_argv_join(argv, ' '));
|
opal_output(0, "launching: %s", opal_argv_join(argv, ' '));
|
||||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||||
opal_output(0, "got nonzero: %d", rc);
|
opal_output(0, "lsb_launch failed: %d", rc);
|
||||||
rc = ORTE_ERR_FAILED_TO_START;
|
rc = ORTE_ERR_FAILED_TO_START;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
opal_output(0, "launched ok");
|
opal_output(0, "lsb_launch launched ok; waiting for %d daemons\n",
|
||||||
sleep(5);
|
map->num_new_daemons);
|
||||||
exit(0);
|
|
||||||
|
/* wait for daemons to callback */
|
||||||
|
if (ORTE_SUCCESS !=
|
||||||
|
(rc = orte_pls_base_daemon_callback(map->num_new_daemons))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) {
|
launch_apps:
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
{
|
||||||
rc = ORTE_ERR_FAILED_TO_START;
|
int i = 0;
|
||||||
|
opal_output(0, "waiting for attach");
|
||||||
|
while (i == 0) sleep(5);
|
||||||
|
}
|
||||||
|
opal_output(0, "laounching apps using lsf");
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* declare the launch a success */
|
/* declare the launch a success */
|
||||||
failed_launch = false;
|
failed_launch = false;
|
||||||
|
opal_output(0, "launched apps with lsf ok");
|
||||||
|
|
||||||
if (mca_pls_lsf_component.timing) {
|
if (mca_pls_lsf_component.timing) {
|
||||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||||
@ -355,7 +375,8 @@ cleanup:
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opal_output(0, "lsf pls returning: %d\n", rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -36,8 +37,8 @@ endif
|
|||||||
AM_CPPFLAGS= $(ras_lsf_CPPFLAGS)
|
AM_CPPFLAGS= $(ras_lsf_CPPFLAGS)
|
||||||
|
|
||||||
proxy_SOURCES = \
|
proxy_SOURCES = \
|
||||||
ras_lsf.c \
|
|
||||||
ras_lsf.h \
|
ras_lsf.h \
|
||||||
|
ras_lsf_module.c \
|
||||||
ras_lsf_component.c
|
ras_lsf_component.c
|
||||||
|
|
||||||
mcacomponentdir = $(pkglibdir)
|
mcacomponentdir = $(pkglibdir)
|
||||||
|
@ -35,13 +35,36 @@
|
|||||||
#include "ras_lsf.h"
|
#include "ras_lsf.h"
|
||||||
|
|
||||||
|
|
||||||
static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
/*
|
||||||
|
* Local functions
|
||||||
|
*/
|
||||||
|
static int allocate(orte_jobid_t jobid, opal_list_t *attributes);
|
||||||
|
static int deallocate(orte_jobid_t jobid);
|
||||||
|
static int finalize(void);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Global variable
|
||||||
|
*/
|
||||||
|
orte_ras_base_module_t orte_ras_lsf_module = {
|
||||||
|
allocate,
|
||||||
|
orte_ras_base_node_insert,
|
||||||
|
orte_ras_base_node_query,
|
||||||
|
orte_ras_base_node_query_alloc,
|
||||||
|
orte_ras_base_node_lookup,
|
||||||
|
orte_ras_base_proc_query_alloc,
|
||||||
|
deallocate,
|
||||||
|
finalize
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
static int allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||||
{
|
{
|
||||||
char **nodelist;
|
char **nodelist;
|
||||||
opal_list_t nodes;
|
opal_list_t nodes;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
orte_ras_node_t *node;
|
orte_ras_node_t *node;
|
||||||
int i, rc, num_nodes;
|
int i, count, rc, num_nodes;
|
||||||
|
|
||||||
/* get the list of allocated nodes */
|
/* get the list of allocated nodes */
|
||||||
if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
|
if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
|
||||||
@ -53,18 +76,24 @@ static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
|||||||
node = NULL;
|
node = NULL;
|
||||||
|
|
||||||
/* step through the list */
|
/* step through the list */
|
||||||
for (i=0; i < num_nodes; i++) {
|
for (count = i = 0; i < num_nodes; i++) {
|
||||||
printf("lsf got node: %s\n", nodelist[i]);
|
opal_output(0, "lsf got node: %s", nodelist[i]);
|
||||||
/* is this a repeat of the current node? */
|
/* is this a repeat of the current node? */
|
||||||
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
|
if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) {
|
||||||
/* it is a repeat - just bump the slot count */
|
/* it is a repeat - just bump the slot count */
|
||||||
++node->node_slots;
|
++node->node_slots;
|
||||||
|
opal_output(0, "lsf ras repeat -- slot count now %d",
|
||||||
|
node->node_slots);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opal_output(0, "lsf ras new node");
|
||||||
/* not a repeat - create a node entry for it */
|
/* not a repeat - create a node entry for it */
|
||||||
node = OBJ_NEW(orte_ras_node_t);
|
node = OBJ_NEW(orte_ras_node_t);
|
||||||
node->node_name = strdup(nodelist[i]);
|
node->node_name = strdup(nodelist[i]);
|
||||||
|
node->launch_id = count++;
|
||||||
|
node->node_slots_inuse = 0;
|
||||||
|
node->node_slots_max = 0;
|
||||||
node->node_slots = 1;
|
node->node_slots = 1;
|
||||||
opal_list_append(&nodes, &node->super);
|
opal_list_append(&nodes, &node->super);
|
||||||
}
|
}
|
||||||
@ -101,25 +130,13 @@ cleanup:
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int orte_ras_lsf_deallocate(orte_jobid_t jobid)
|
static int deallocate(orte_jobid_t jobid)
|
||||||
{
|
{
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int orte_ras_lsf_finalize(void)
|
static int finalize(void)
|
||||||
{
|
{
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
orte_ras_base_module_t orte_ras_lsf_module = {
|
|
||||||
orte_ras_lsf_allocate,
|
|
||||||
orte_ras_base_node_insert,
|
|
||||||
orte_ras_base_node_query,
|
|
||||||
orte_ras_base_node_query_alloc,
|
|
||||||
orte_ras_base_node_lookup,
|
|
||||||
orte_ras_lsf_deallocate,
|
|
||||||
orte_ras_lsf_finalize
|
|
||||||
};
|
|
||||||
|
|
@ -59,7 +59,7 @@ int orte_sds_lsf_set_name(void)
|
|||||||
char* name_string = NULL;
|
char* name_string = NULL;
|
||||||
int lsf_nodeid;
|
int lsf_nodeid;
|
||||||
|
|
||||||
/* start by getting our cellid, jobid, and vpid (which is the
|
/* start by getting our jobid and vpid (which is the
|
||||||
starting vpid for the list of daemons) */
|
starting vpid for the list of daemons) */
|
||||||
id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL);
|
id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL);
|
||||||
mca_base_param_lookup_string(id, &name_string);
|
mca_base_param_lookup_string(id, &name_string);
|
||||||
@ -74,7 +74,6 @@ int orte_sds_lsf_set_name(void)
|
|||||||
}
|
}
|
||||||
free(name_string);
|
free(name_string);
|
||||||
} else {
|
} else {
|
||||||
orte_cellid_t cellid;
|
|
||||||
orte_jobid_t jobid;
|
orte_jobid_t jobid;
|
||||||
orte_vpid_t vpid;
|
orte_vpid_t vpid;
|
||||||
char* cellid_string;
|
char* cellid_string;
|
||||||
@ -87,11 +86,6 @@ int orte_sds_lsf_set_name(void)
|
|||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
return ORTE_ERR_NOT_FOUND;
|
return ORTE_ERR_NOT_FOUND;
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS !=
|
|
||||||
(rc = orte_ns.convert_string_to_cellid(&cellid, cellid_string))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return(rc);
|
|
||||||
}
|
|
||||||
|
|
||||||
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
|
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
|
||||||
mca_base_param_lookup_string(id, &jobid_string);
|
mca_base_param_lookup_string(id, &jobid_string);
|
||||||
@ -119,21 +113,23 @@ int orte_sds_lsf_set_name(void)
|
|||||||
|
|
||||||
if (ORTE_SUCCESS !=
|
if (ORTE_SUCCESS !=
|
||||||
(rc = orte_ns.create_process_name(&(orte_process_info.my_name),
|
(rc = orte_ns.create_process_name(&(orte_process_info.my_name),
|
||||||
cellid, jobid, vpid))) {
|
jobid, vpid))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* fix up the base name and make it the "real" name */
|
/* fix up the base name and make it the "real" name */
|
||||||
lsf_nodeid = atoi(getenv("LSB_JOBINDEX"));
|
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));
|
||||||
orte_process_info.my_name->vpid += lsf_nodeid;
|
orte_process_info.my_name->vpid += lsf_nodeid;
|
||||||
|
|
||||||
|
#if 0
|
||||||
/* fix up the system info nodename to match exactly what lsf returned */
|
/* fix up the system info nodename to match exactly what lsf returned */
|
||||||
if (NULL != orte_system_info.nodename) {
|
if (NULL != orte_system_info.nodename) {
|
||||||
free(orte_system_info.nodename);
|
free(orte_system_info.nodename);
|
||||||
}
|
}
|
||||||
orte_system_info.nodename = get_lsf_nodename(lsf_nodeid);
|
orte_system_info.nodename = get_lsf_nodename(lsf_nodeid);
|
||||||
|
#endif
|
||||||
|
|
||||||
/* get the non-name common environmental variables */
|
/* get the non-name common environmental variables */
|
||||||
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {
|
if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user