From 2baa8660268380af7649e013e9bbdeb99418516d Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 20 Jul 2007 19:49:27 +0000 Subject: [PATCH] Compiles to the new API, but doesn't quite work yet... This commit was SVN r15537. --- orte/mca/pls/lsf/pls_lsf_module.c | 65 ++++++++++++------- orte/mca/ras/lsf/Makefile.am | 3 +- .../ras/lsf/{ras_lsf.c => ras_lsf_module.c} | 53 ++++++++++----- orte/mca/sds/lsf/sds_lsf_module.c | 14 ++-- 4 files changed, 85 insertions(+), 50 deletions(-) rename orte/mca/ras/lsf/{ras_lsf.c => ras_lsf_module.c} (79%) diff --git a/orte/mca/pls/lsf/pls_lsf_module.c b/orte/mca/pls/lsf/pls_lsf_module.c index 260c61e2c8..552247d9af 100644 --- a/orte/mca/pls/lsf/pls_lsf_module.c +++ b/orte/mca/pls/lsf/pls_lsf_module.c @@ -133,7 +133,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid) int proc_name_index = 0; bool failed_launch = true; - printf("pls lsf being used to launch!\n"); + opal_output(0, "pls lsf being used to launch!\n"); if (mca_pls_lsf_component.timing) { if (0 != gettimeofday(&joblaunchstart, NULL)) { opal_output(0, "pls_lsf: could not obtain job start time"); @@ -155,18 +155,28 @@ static int pls_lsf_launch_job(orte_jobid_t jobid) goto cleanup; } - /* account for any reuse of daemons */ - if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - + /* Iterate through each of the nodes and check to see if we have + * a valid launch_id (must be > 0). If not, then error out as + * we cannot do anything + */ + for (item = opal_list_get_first(&map->nodes); + item != opal_list_get_end(&map->nodes); + item = opal_list_get_next(item)) { + orte_mapped_node_t* node = (orte_mapped_node_t*)item; + + if (node->launch_id < 0) { + /* JMS fix me */ + opal_show_help("help-pls-lsf.txt", "lsf-bad-launchid", + true, node->nodename, node->launch_id); + goto cleanup; + } + } + num_nodes = map->num_new_daemons; + opal_output(0, "pls lsf num new daemons: %d!\n", num_nodes); if (num_nodes == 0) { - /* nothing to do - just return */ - failed_launch = false; - rc = ORTE_SUCCESS; - goto cleanup; + /* no new daemons required - just launch apps */ + goto launch_apps; } /* create nodelist */ @@ -290,29 +300,39 @@ static int pls_lsf_launch_job(orte_jobid_t jobid) * orterun can do the rest of its stuff. Instead, we'll catch any * failures and deal with them elsewhere */ - argv = NULL; - argc = 0; - opal_argv_append(&argc, &argv, "env"); opal_output(0, "launching on: %s", opal_argv_join(nodelist_argv, ' ')); opal_output(0, "launching: %s", opal_argv_join(argv, ' ')); if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) { ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START); - opal_output(0, "got nonzero: %d", rc); + opal_output(0, "lsb_launch failed: %d", rc); rc = ORTE_ERR_FAILED_TO_START; goto cleanup; } - opal_output(0, "launched ok"); - sleep(5); - exit(0); + opal_output(0, "lsb_launch launched ok; waiting for %d daemons\n", + map->num_new_daemons); + + /* wait for daemons to callback */ + if (ORTE_SUCCESS != + (rc = orte_pls_base_daemon_callback(map->num_new_daemons))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - if (lsb_launch(nodelist_argv, argv, LSF_DJOB_NOWAIT, env) < 0) { - ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START); - rc = ORTE_ERR_FAILED_TO_START; +launch_apps: + { + int i = 0; + opal_output(0, "waiting for attach"); + while (i == 0) sleep(5); + } + opal_output(0, "laounching apps using lsf"); + if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) { + ORTE_ERROR_LOG(rc); goto cleanup; } /* declare the launch a success */ failed_launch = false; + opal_output(0, "launched apps with lsf ok"); if (mca_pls_lsf_component.timing) { if (0 != gettimeofday(&launchstop, NULL)) { @@ -355,7 +375,8 @@ cleanup: ORTE_ERROR_LOG(rc); } } - + + opal_output(0, "lsf pls returning: %d\n", rc); return rc; } diff --git a/orte/mca/ras/lsf/Makefile.am b/orte/mca/ras/lsf/Makefile.am index 956d381bf5..66e612f963 100644 --- a/orte/mca/ras/lsf/Makefile.am +++ b/orte/mca/ras/lsf/Makefile.am @@ -9,6 +9,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2007 Cisco, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -36,8 +37,8 @@ endif AM_CPPFLAGS= $(ras_lsf_CPPFLAGS) proxy_SOURCES = \ - ras_lsf.c \ ras_lsf.h \ + ras_lsf_module.c \ ras_lsf_component.c mcacomponentdir = $(pkglibdir) diff --git a/orte/mca/ras/lsf/ras_lsf.c b/orte/mca/ras/lsf/ras_lsf_module.c similarity index 79% rename from orte/mca/ras/lsf/ras_lsf.c rename to orte/mca/ras/lsf/ras_lsf_module.c index de4d36edae..5088178536 100644 --- a/orte/mca/ras/lsf/ras_lsf.c +++ b/orte/mca/ras/lsf/ras_lsf_module.c @@ -35,13 +35,36 @@ #include "ras_lsf.h" -static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes) +/* + * Local functions + */ +static int allocate(orte_jobid_t jobid, opal_list_t *attributes); +static int deallocate(orte_jobid_t jobid); +static int finalize(void); + + +/* + * Global variable + */ +orte_ras_base_module_t orte_ras_lsf_module = { + allocate, + orte_ras_base_node_insert, + orte_ras_base_node_query, + orte_ras_base_node_query_alloc, + orte_ras_base_node_lookup, + orte_ras_base_proc_query_alloc, + deallocate, + finalize +}; + + +static int allocate(orte_jobid_t jobid, opal_list_t *attributes) { char **nodelist; opal_list_t nodes; opal_list_item_t *item; orte_ras_node_t *node; - int i, rc, num_nodes; + int i, count, rc, num_nodes; /* get the list of allocated nodes */ if ((num_nodes = lsb_getalloc(&nodelist)) < 0) { @@ -53,18 +76,24 @@ static int orte_ras_lsf_allocate(orte_jobid_t jobid, opal_list_t *attributes) node = NULL; /* step through the list */ - for (i=0; i < num_nodes; i++) { - printf("lsf got node: %s\n", nodelist[i]); + for (count = i = 0; i < num_nodes; i++) { + opal_output(0, "lsf got node: %s", nodelist[i]); /* is this a repeat of the current node? */ if (NULL != node && 0 == strcmp(nodelist[i], node->node_name)) { /* it is a repeat - just bump the slot count */ ++node->node_slots; + opal_output(0, "lsf ras repeat -- slot count now %d", + node->node_slots); continue; } + opal_output(0, "lsf ras new node"); /* not a repeat - create a node entry for it */ node = OBJ_NEW(orte_ras_node_t); node->node_name = strdup(nodelist[i]); + node->launch_id = count++; + node->node_slots_inuse = 0; + node->node_slots_max = 0; node->node_slots = 1; opal_list_append(&nodes, &node->super); } @@ -101,25 +130,13 @@ cleanup: return rc; } -static int orte_ras_lsf_deallocate(orte_jobid_t jobid) +static int deallocate(orte_jobid_t jobid) { return ORTE_SUCCESS; } -static int orte_ras_lsf_finalize(void) +static int finalize(void) { return ORTE_SUCCESS; } - - -orte_ras_base_module_t orte_ras_lsf_module = { - orte_ras_lsf_allocate, - orte_ras_base_node_insert, - orte_ras_base_node_query, - orte_ras_base_node_query_alloc, - orte_ras_base_node_lookup, - orte_ras_lsf_deallocate, - orte_ras_lsf_finalize -}; - diff --git a/orte/mca/sds/lsf/sds_lsf_module.c b/orte/mca/sds/lsf/sds_lsf_module.c index eed246a2d7..356893b086 100644 --- a/orte/mca/sds/lsf/sds_lsf_module.c +++ b/orte/mca/sds/lsf/sds_lsf_module.c @@ -59,7 +59,7 @@ int orte_sds_lsf_set_name(void) char* name_string = NULL; int lsf_nodeid; - /* start by getting our cellid, jobid, and vpid (which is the + /* start by getting our jobid, and vpid (which is the starting vpid for the list of daemons) */ id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL); mca_base_param_lookup_string(id, &name_string); @@ -74,7 +74,6 @@ int orte_sds_lsf_set_name(void) } free(name_string); } else { - orte_cellid_t cellid; orte_jobid_t jobid; orte_vpid_t vpid; char* cellid_string; @@ -87,11 +86,6 @@ int orte_sds_lsf_set_name(void) ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (ORTE_SUCCESS != - (rc = orte_ns.convert_string_to_cellid(&cellid, cellid_string))) { - ORTE_ERROR_LOG(rc); - return(rc); - } id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL); mca_base_param_lookup_string(id, &jobid_string); @@ -119,21 +113,23 @@ int orte_sds_lsf_set_name(void) if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(orte_process_info.my_name), - cellid, jobid, vpid))) { + jobid, vpid))) { ORTE_ERROR_LOG(rc); return rc; } } /* fix up the base name and make it the "real" name */ - lsf_nodeid = atoi(getenv("LSB_JOBINDEX")); + lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); orte_process_info.my_name->vpid += lsf_nodeid; +#if 0 /* fix up the system info nodename to match exactly what lsf returned */ if (NULL != orte_system_info.nodename) { free(orte_system_info.nodename); } orte_system_info.nodename = get_lsf_nodename(lsf_nodeid); +#endif /* get the non-name common environmental variables */ if (ORTE_SUCCESS != (rc = orte_sds_env_get())) {