1
1

Improve LoadLeveler integration with Open MPI. Add support for LL native rsh agent - llspawn

This commit was SVN r24579.
Этот коммит содержится в:
Nysal Jan 2011-03-29 07:46:59 +00:00
родитель f40edd6b4f
Коммит c8c6b0edab
6 изменённых файлов: 167 добавлений и 354 удалений

Просмотреть файл

@ -11,6 +11,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -18,35 +19,35 @@
# $HEADER$
#
# 1. if --with-loadleveler is given, always build
# 2. if --without-loadleveler is given, never build
# 3. if neither is given, build if-and-only-if the OS is Linux or AIX
# ORTE_CHECK_LOADLEVELER(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
AC_DEFUN([ORTE_CHECK_LOADLEVELER],[
AC_ARG_WITH([loadleveler],
[AC_HELP_STRING([--with-loadleveler(=DIR)],
[Build LoadLeveler support, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])])
OMPI_CHECK_WITHDIR([loadleveler], [$with_loadleveler], [include/llapi.h])
[AC_HELP_STRING([--with-loadleveler],
[Build LoadLeveler scheduler component (default: yes)])])
AS_IF([test "$with_loadleveler" = "no"],
[orte_check_loadleveler_happy="no"],
[orte_check_loadleveler_happy="yes"
AS_IF([test ! -z "$with_loadleveler" -a "$with_loadleveler" != "yes"],
[orte_check_loadleveler_dir="$with_loadleveler"],
[orte_check_loadleveler_dir=""])])
if test "$with_loadleveler" = "no" ; then
orte_check_loadleveler_happy="no"
elif test "$with_loadleveler" = "" ; then
# unless user asked, only build LoadLeveler component on Linux
# and AIX (these are the platforms that LoadLeveler supports)
case $host in
*-linux*|*-aix*)
orte_check_loadleveler_happy="yes"
;;
*)
orte_check_loadleveler_happy="no"
;;
esac
else
orte_check_loadleveler_happy="yes"
fi
AS_IF([test "$orte_check_loadleveler_happy" = "yes"],
[OMPI_CHECK_PACKAGE([$1],
[llapi.h],
[llapi],
[ll_query],
[],
[$orte_check_loadleveler_dir],
[],
[orte_check_loadleveler_happy="yes"],
[orte_check_loadleveler_happy="no"])])
AS_IF([test "$orte_check_loadleveler_happy" = "yes"],
[$2],
[AS_IF([test ! -z "$with_loadleveler" -a "$with_loadleveler" != "no"],
[AC_MSG_ERROR([LOADLEVELER support requested but not found. Aborting])])
$3])
AS_IF([test "$orte_check_loadleveler_happy" = "yes"],
[$2],
[$3])
])

Просмотреть файл

@ -114,7 +114,7 @@ int orte_plm_poe_component_open(void)
false, false, 0, &mca_plm_poe_component.debug);
mca_base_param_reg_int(c, "priority",
"Priority of the poe plm component",
false , false, 100, &mca_plm_poe_component.priority);
false , false, 0, &mca_plm_poe_component.priority);
mca_base_param_reg_string(c, "class",
"class (interactive or batch)",
true, false, "interactive", &mca_plm_poe_component.class);

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -66,6 +67,9 @@ struct orte_plm_rsh_component_t {
bool disable_qrsh;
bool using_qrsh;
bool daemonize_qrsh;
bool disable_llspawn;
bool using_llspawn;
bool daemonize_llspawn;
int delay;
int priority;
bool tree_spawn;

Просмотреть файл

@ -15,6 +15,7 @@
* Copyright (c) 2010 Oracle and/or its affiliates. All rights
* reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -103,6 +104,7 @@ int orte_plm_rsh_component_open(void)
mca_plm_rsh_component.num_children = 0;
OBJ_CONSTRUCT(&mca_plm_rsh_component.children, opal_list_t);
mca_plm_rsh_component.using_qrsh = false;
mca_plm_rsh_component.using_llspawn = false;
/* lookup parameters */
mca_base_param_reg_int(c, "num_concurrent",
@ -129,6 +131,16 @@ int orte_plm_rsh_component_open(void)
false, false, false, &tmp);
mca_plm_rsh_component.daemonize_qrsh = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(c, "disable_llspawn",
"Disable the use of llspawn when under the LoadLeveler environment",
false, false, false, &tmp);
mca_plm_rsh_component.disable_llspawn = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(c, "daemonize_llspawn",
"Daemonize the orted when under the LoadLeveler environment",
false, false, false, &tmp);
mca_plm_rsh_component.daemonize_llspawn = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_int(c, "priority",
"Priority of the rsh plm component",
false, false, 10,
@ -170,12 +182,26 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
}
free(tmp);
mca_plm_rsh_component.using_qrsh = true;
*priority = mca_plm_rsh_component.priority;
*module = (mca_base_module_t *) &orte_plm_rsh_module;
return ORTE_SUCCESS;
goto success;
} else if (!mca_plm_rsh_component.disable_llspawn &&
NULL != getenv("LOADL_STEP_ID")) {
/* We are running as a LOADLEVELER job.
Search for llspawn in the users PATH */
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup("llspawn", NULL)) {
opal_output_verbose(1, orte_plm_globals.output,
"%s plm:rsh: unable to be used: LoadLeveler "
"indicated but cannot find path or execution "
"permissions not set for launching agent llspawn",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
*module = NULL;
return ORTE_ERROR;
}
mca_plm_rsh_component.using_llspawn = true;
goto success;
}
/* if this isn't an Grid Engine environment, see if MCA-specified agent (default: ssh:rsh) is available */
/* if this isn't an Grid Engine or LoadLeveler environment,
see if MCA-specified agent (default: ssh:rsh) is available */
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) {
/* this isn't an error - we just cannot be selected */
@ -187,7 +213,7 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
*module = NULL;
return ORTE_ERROR;
}
success:
/* we are good - make ourselves available */
*priority = mca_plm_rsh_component.priority;
*module = (mca_base_module_t *) &orte_plm_rsh_module;

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -177,8 +178,18 @@ int orte_plm_rsh_init(void)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
free(tmp);
}
} else if(mca_plm_rsh_component.using_llspawn) {
/* perform base setup for llspawn */
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup("llspawn", NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_output_verbose(1, orte_plm_globals.output,
"%s plm:rsh: using \"%s\" for launching\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_plm_globals.rsh_agent_path);
} else {
/* not using qrsh - use MCA-specified agent */
/* not using qrsh or llspawn - use MCA-specified agent */
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
@ -684,7 +695,9 @@ static int setup_launch(int *argcptr, char ***argvptr,
/* Daemonize when not using qrsh. Or, if using qrsh, only
* daemonize if told to by user with daemonize_qrsh flag. */
((!mca_plm_rsh_component.using_qrsh) ||
(mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh))) {
(mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) &&
((!mca_plm_rsh_component.using_llspawn) ||
(mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) {
opal_argv_append(&argc, &argv, "--daemonize");
}

Просмотреть файл

@ -10,22 +10,13 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2010-2011 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/* Much of the code in this file is taken from the file ll_get_machine_list.c,
* which is provided by IBM as part of their sample programs for LoadLeveler
* in the samples/llmpich directory.
*
* IBM has approved the release of the sample code for Loadleveler under the
* BSD license. Consequently, a more restrictive licensing clause that was
* originally associated with the sample code and replicated here has been
* removed.
*/
#include "orte_config.h"
@ -33,8 +24,6 @@
#include <unistd.h>
#include <string.h>
#include <llapi.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
@ -51,8 +40,11 @@
*/
static int orte_ras_loadleveler_allocate(opal_list_t *nodes);
static int orte_ras_loadleveler_finalize(void);
static int orte_ras_loadleveler_get_hostlist(int * num_hosts, char*** hostlist);
static int orte_ras_loadleveler_discover(opal_list_t *nodelist);
static int ll_getline(FILE *fp, char *input);
#define LL_FILE_MAX_LINE_LENGTH 512
/*
* Global variable
@ -69,50 +61,23 @@ orte_ras_base_module_t orte_ras_loadleveler_module = {
*/
static int orte_ras_loadleveler_allocate(opal_list_t *nodes)
{
int i, ret=ORTE_SUCCESS;
opal_list_item_t* item;
orte_node_t* node;
char ** hostlist = NULL;
int num_hosts = 0;
int ret = ORTE_SUCCESS;
ret = orte_ras_loadleveler_get_hostlist(&num_hosts, &hostlist);
if(ORTE_SUCCESS != ret) {
if (ORTE_SUCCESS != (ret = orte_ras_loadleveler_discover(nodes))) {
ORTE_ERROR_LOG(ret);
return ret;
}
for (i = 0; i < num_hosts; i++) {
/* check for duplicated nodes */
for (item = opal_list_get_first(nodes);
opal_list_get_end(nodes) != item;
item = opal_list_get_next(item)) {
node = (orte_node_t*) item;
if (0 == strcmp(node->name, hostlist[i])) {
++node->slots;
break;
}
}
if(opal_list_get_end(nodes) == item) {
/* we did not find a duplicate, so add a new item to the list */
node = OBJ_NEW(orte_node_t);
if (NULL == node) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
ret = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
node->name = strdup(hostlist[i]);
node->state = ORTE_NODE_STATE_UP;
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;
opal_list_append(nodes, &node->super);
}
/* If we didn't find anything, then this
* is an unrecoverable error - report it
*/
if (opal_list_is_empty(nodes)) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:allocate: No nodes were found in the LOADL_HOSTFILE - %s",
getenv("LOADL_HOSTFILE"));
return ORTE_ERR_NOT_FOUND;
}
cleanup:
opal_argv_free(hostlist);
return ret;
}
@ -121,292 +86,96 @@ cleanup:
*/
/*
 * Finalize routine for the LoadLeveler RAS code. It releases nothing: it
 * only reports "success (nothing to do)" on orte_ras_base.ras_output and
 * returns ORTE_SUCCESS.
 *
 * NOTE(review): the opal_output() call and the OPAL_OUTPUT_VERBOSE() call
 * below carry the same message. They look like the removed and the added
 * form of one changed line in this diff view rather than two intended
 * calls -- confirm against the committed file.
 */
static int orte_ras_loadleveler_finalize(void)
{
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:finalize: success (nothing to do)");
OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
"ras:loadleveler:finalize: success (nothing to do)"));
return ORTE_SUCCESS;
}
/*
* get the hostlist from LoadLeveler
* *hostlist should either by NULL or a valid argv and *num_hosts
* should be 0 or the number of elements in the hostlist argv
/**
* Discover the available resources. Obtain directly from LoadLeveler (and
* therefore have no need to validate) -- ignore hostfile or any other
* user-specified parameters.
*/
static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
static int orte_ras_loadleveler_discover(opal_list_t* nodelist)
{
LL_element *queryObject = NULL, *job = NULL, *step = NULL;
LL_element *node = NULL, *task = NULL, *task_instance = NULL;
int rc, obj_count, err_code, ll_master_task, job_step_count;
char *ll_step_id= NULL, *job_step_list[2], *task_machine_name = NULL;
char *schedd_host_name = NULL;
int step_mode;
/* Get the step ID from LOADL_STEP_ID environment variable. */
if(NULL == (ll_step_id = getenv("LOADL_STEP_ID"))) {
orte_node_t *node;
opal_list_item_t* item;
FILE *fp;
char *hostname;
char *filename;
char input[LL_FILE_MAX_LINE_LENGTH];
/* Ignore anything that the user already specified -- we're
getting nodes only from LoadLeveler. */
filename = getenv("LOADL_HOSTFILE");
if(NULL == filename) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: could not get LOADL_STEP_ID "
"from environment!");
"ras:loadleveler:allocate:discover: LOADL_HOSTFILE not set. "
"Unable to discover allocated nodes.");
return ORTE_ERROR;
}
job_step_list[0] = ll_step_id;
job_step_list[1] = NULL;
/* STEP 1: Get Job object from Central Manager to find out the name of the
* Schedd daemon that handles this job. In a Multicluster environment we
* can not get the schedd name from the job step id. */
/* Initialize the LL API. Specify that query type is JOBS. */
if(NULL == (queryObject = ll_query(JOBS))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: 1 ll_query faild on JOBS!");
return ORTE_ERROR;
fp = fopen(filename, "r");
if (NULL == fp) {
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
return ORTE_ERR_FILE_OPEN_FAILURE;
}
/* Specify that this is a QUERY_STEPID type of query. */
rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);
if(0 > rc) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: 1 ll_set_request failed: "
"error %d!", rc);
return ORTE_ERROR;
}
/* Iterate through all the nodes and make an entry for each */
while (0 != ll_getline(fp, input)) {
hostname = strdup(input);
OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
"%s ras:loadleveler:allocate:discover: got hostname %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));
/* Get a Job object from LoadL_schedd that contains the relevant job step */
job = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code);
if(NULL == job) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_objs LL_CM "
"failed: err_code=%d", err_code);
return ORTE_ERROR;
}
/* Remember that LoadLeveler may list the same node more than once.
So we have to check for duplicates. */
for (item = opal_list_get_first(nodelist);
opal_list_get_end(nodelist) != item;
item = opal_list_get_next(item)) {
node = (orte_node_t*) item;
if (0 == strcmp(node->name, hostname)) {
++node->slots;
if (obj_count != 1) { /* Only 1 Job object is expected. */
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_objs LL_CM "
"expected one job to match, got %d!", obj_count);
return ORTE_ERROR;
}
if(0 != (rc = ll_get_data(job, LL_JobSchedd, &schedd_host_name))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data LL_JobSchedd"
" failure, RC= %d!", rc);
return ORTE_ERROR;
}
if (schedd_host_name != NULL) {
job_step_list[0] = ll_step_id;
job_step_list[1] = NULL;
} else {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data() Error: Could "
"not determine managing schedd for job %s.\n",
job_step_list[0]);
return ORTE_ERROR;
}
ll_free_objs(queryObject);
ll_deallocate(queryObject);
/* STEP 2: Get Job object from Schedd that manages this job step. */
/* Only schedd query gives us all the relevant task instance info. */
/* Initialize the LL API. Specify that query type is JOBS. */
if(NULL == (queryObject = ll_query(JOBS))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: 2 ll_query faild on JOBS!");
return ORTE_ERROR;
}
/* Specify that this is a QUERY_STEPID type of query. */
rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);
if(0 != rc) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: 2 ll_set_request failed: "
"error %d!", rc);
return ORTE_ERROR;
}
/* Get a Job object from LoadL_schedd that contains the relevant job step */
job = ll_get_objs(queryObject, LL_SCHEDD, schedd_host_name, &obj_count,
&err_code);
if(NULL == job) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_objs LL_SCHEDD "
"failed: err_code=%d", err_code);
return ORTE_ERROR;
}
if (obj_count != 1) { /* Only 1 Job object is expected. */
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_objs LL_SCHEDD "
"expected one job to match, got %d!", obj_count);
return ORTE_ERROR;
}
if(0 != (rc = ll_get_data(job, LL_JobStepCount, &job_step_count))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data LL_JobStepCount"
" failure, RC= %d!", rc);
return ORTE_ERROR;
}
if (job_step_count != 1) { /* Only 1 Job Step object is expected. */
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data LL_JobStepCount"
" expected one jobstep to match, got %d!", job_step_count);
return ORTE_ERROR;
}
step = NULL;
if(0 != (rc = ll_get_data(job, LL_JobGetFirstStep, &step))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: 3 ll_get_data: failure on "
"LL_JobGetFirstStep. RC= %d!", rc);
return ORTE_ERROR;
}
if(NULL == step) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: 3 ll_get_data: Error: "
"Unable to obtain Job Step information.\n");
return ORTE_ERROR;
}
step_mode = -1;
if(0 != (rc = ll_get_data(step, LL_StepParallelMode, &step_mode))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: 4 ll_get_data: failure on "
"LL_StepParallelMode. RC= %d!", rc);
return ORTE_ERROR;
}
/* Serial job step: step_mode==0; Parallel: step_mode==1; Others:2,3,4. */
if ((step_mode != 0) && (step_mode != 1)) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: We support only Serial and "
"Parallel LoadLeveler job types. PVM, NQS, and Blue Gene"
"jobs are not supported by the LoadLeveler RAS!");
return ORTE_ERROR;
}
if(step_mode == 0) { /* serial job */
node = NULL;
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: failure "
"on serial LL_StepGetFirstNode. RC= %d!", rc);
return ORTE_ERROR;
}
task = NULL;
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: failure "
"on serial LL_NodeGetFirstTask. RC= %d!", rc);
return ORTE_ERROR;
}
task_instance = NULL;
rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance);
if(0 != rc) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: failure "
"on serial LL_TaskGetFirstInstance. RC= %d!", rc);
return ORTE_ERROR;
}
task_machine_name = NULL;
if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName,
&task_machine_name))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: failure "
"on serial LL_TaskInstanceMachineName. RC= %d!", rc);
return ORTE_ERROR;
}
opal_argv_append(num_hosts, hostlist, task_machine_name);
} else { /* parallel job */
node = NULL;
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: failure "
"on LL_StepGetFirstNode. RC= %d!", rc);
return ORTE_ERROR;
}
while(NULL != node) { /* Loop through the "Node" objects. */
task = NULL;
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: failure "
"on LL_NodeGetFirstTask. RC= %d!", rc);
return ORTE_ERROR;
}
while(task) { /* Loop through the "Task" objects. */
ll_master_task = 0;
rc = ll_get_data(task, LL_TaskIsMaster, &ll_master_task);
if(0 != rc) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: "
"failure on LL_TaskIsMaster. RC= %d!", rc);
return ORTE_ERROR;
}
/* The "master task" Task object is a LoadLeveler abstraction
* and is not relevant here. Look at only Task objects that
* are not "master".*/
if (!ll_master_task) {
task_instance = NULL;
if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance,
&task_instance))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data:"
" failure on LL_TaskGetFirstTaskInstance."
" RC= %d!", rc);
return ORTE_ERROR;
}
/* Loop through the "Task Instance" objects. */
while (task_instance) {
task_machine_name = NULL;
rc = ll_get_data(task_instance,
LL_TaskInstanceMachineName,
&task_machine_name);
if(0 != rc) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data:"
" failure on LL_TaskInstanceMachineName"
"RC= %d!", rc);
return ORTE_ERROR;
}
opal_argv_append(num_hosts, hostlist, task_machine_name);
task_instance = NULL;
rc = ll_get_data(task, LL_TaskGetNextTaskInstance,
&task_instance);
if(0 != rc) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data:"
" failure on LL_TaskGetNextTaskInstance. "
"RC= %d!", rc);
return ORTE_ERROR;
}
}
}
task = NULL;
if(0 != (rc = ll_get_data(node, LL_NodeGetNextTask, &task))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: "
"failure on LL_NodeGetNextTask. RC= %d!", rc);
return ORTE_ERROR;
}
}
node = NULL;
if(0 != (rc = ll_get_data(step, LL_StepGetNextNode, &node))) {
opal_output(orte_ras_base.ras_output,
"ras:loadleveler:get:hostlist: ll_get_data: "
"failure on LL_StepGetNextNode. RC= %d!", rc);
return ORTE_ERROR;
OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
"%s ras:loadleveler:allocate:discover: found -- bumped slots to %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots));
break;
}
}
/* Did we find it? */
if (opal_list_get_end(nodelist) == item) {
/* Nope -- didn't find it, so add a new item to the list */
OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
"%s ras:loadleveler:allocate:discover: not found -- added to list",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
node = OBJ_NEW(orte_node_t);
node->name = hostname;
node->state = ORTE_NODE_STATE_UP;
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;
opal_list_append(nodelist, &node->super);
} else {
/* Yes, so we need to free the hostname that came back */
free(hostname);
}
}
ll_free_objs(queryObject);
ll_deallocate(queryObject);
fclose(fp);
return ORTE_SUCCESS;
}
/**
 * Read the next line of a stream into a caller-supplied buffer and strip
 * its trailing newline, if it has one.
 *
 * @param fp     Stream to read from.
 * @param input  Destination buffer; must hold at least
 *               LL_FILE_MAX_LINE_LENGTH bytes.
 *
 * @return 1 when a line (possibly empty) was stored in input,
 *         0 when fgets() reported end-of-file or a read error.
 */
static int ll_getline(FILE *fp, char *input)
{
    if (NULL == fgets(input, LL_FILE_MAX_LINE_LENGTH, fp)) {
        return 0;
    }

    /* fgets() keeps the '\n' only when the line was terminated and fit in
     * the buffer. An unterminated last line, or a chunk of an over-long
     * line, has no newline, so blindly clearing the final byte would drop
     * a real character (and would index input[-1] for an empty string).
     * strcspn() yields the offset of the first '\n', or the string length
     * when there is none, so this write is always in bounds and only ever
     * overwrites the newline or the existing terminator. */
    input[strcspn(input, "\n")] = '\0';
    return 1;
}