Improve LoadLeveler integration with Open MPI. Add support for LL native rsh agent - llspawn
This commit was SVN r24579.
Этот коммит содержится в:
родитель
f40edd6b4f
Коммит
c8c6b0edab
@ -11,6 +11,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011 IBM Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -18,35 +19,35 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# 1. if --with-loadleveler is given, always build
|
||||
# 2. if --without-loadleveler is given, never build
|
||||
# 3. if neither is given, build if-and-only-if the OS is Linux or AIX
|
||||
|
||||
# ORTE_CHECK_LOADLEVELER(prefix, [action-if-found], [action-if-not-found])
|
||||
# --------------------------------------------------------
|
||||
AC_DEFUN([ORTE_CHECK_LOADLEVELER],[
|
||||
AC_ARG_WITH([loadleveler],
|
||||
[AC_HELP_STRING([--with-loadleveler(=DIR)],
|
||||
[Build LoadLeveler support, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])])
|
||||
OMPI_CHECK_WITHDIR([loadleveler], [$with_loadleveler], [include/llapi.h])
|
||||
[AC_HELP_STRING([--with-loadleveler],
|
||||
[Build LoadLeveler scheduler component (default: yes)])])
|
||||
|
||||
AS_IF([test "$with_loadleveler" = "no"],
|
||||
[orte_check_loadleveler_happy="no"],
|
||||
[orte_check_loadleveler_happy="yes"
|
||||
AS_IF([test ! -z "$with_loadleveler" -a "$with_loadleveler" != "yes"],
|
||||
[orte_check_loadleveler_dir="$with_loadleveler"],
|
||||
[orte_check_loadleveler_dir=""])])
|
||||
if test "$with_loadleveler" = "no" ; then
|
||||
orte_check_loadleveler_happy="no"
|
||||
elif test "$with_loadleveler" = "" ; then
|
||||
# unless user asked, only build LoadLeveler component on Linux
|
||||
# and AIX (these are the platforms that LoadLeveler supports)
|
||||
case $host in
|
||||
*-linux*|*-aix*)
|
||||
orte_check_loadleveler_happy="yes"
|
||||
;;
|
||||
*)
|
||||
orte_check_loadleveler_happy="no"
|
||||
;;
|
||||
esac
|
||||
else
|
||||
orte_check_loadleveler_happy="yes"
|
||||
fi
|
||||
|
||||
AS_IF([test "$orte_check_loadleveler_happy" = "yes"],
|
||||
[OMPI_CHECK_PACKAGE([$1],
|
||||
[llapi.h],
|
||||
[llapi],
|
||||
[ll_query],
|
||||
[],
|
||||
[$orte_check_loadleveler_dir],
|
||||
[],
|
||||
[orte_check_loadleveler_happy="yes"],
|
||||
[orte_check_loadleveler_happy="no"])])
|
||||
|
||||
AS_IF([test "$orte_check_loadleveler_happy" = "yes"],
|
||||
[$2],
|
||||
[AS_IF([test ! -z "$with_loadleveler" -a "$with_loadleveler" != "no"],
|
||||
[AC_MSG_ERROR([LOADLEVELER support requested but not found. Aborting])])
|
||||
$3])
|
||||
AS_IF([test "$orte_check_loadleveler_happy" = "yes"],
|
||||
[$2],
|
||||
[$3])
|
||||
])
|
||||
|
@ -114,7 +114,7 @@ int orte_plm_poe_component_open(void)
|
||||
false, false, 0, &mca_plm_poe_component.debug);
|
||||
mca_base_param_reg_int(c, "priority",
|
||||
"Priority of the poe plm component",
|
||||
false , false, 100, &mca_plm_poe_component.priority);
|
||||
false , false, 0, &mca_plm_poe_component.priority);
|
||||
mca_base_param_reg_string(c, "class",
|
||||
"class (interactive or batch)",
|
||||
true, false, "interactive", &mca_plm_poe_component.class);
|
||||
|
@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -66,6 +67,9 @@ struct orte_plm_rsh_component_t {
|
||||
bool disable_qrsh;
|
||||
bool using_qrsh;
|
||||
bool daemonize_qrsh;
|
||||
bool disable_llspawn;
|
||||
bool using_llspawn;
|
||||
bool daemonize_llspawn;
|
||||
int delay;
|
||||
int priority;
|
||||
bool tree_spawn;
|
||||
|
@ -15,6 +15,7 @@
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -103,6 +104,7 @@ int orte_plm_rsh_component_open(void)
|
||||
mca_plm_rsh_component.num_children = 0;
|
||||
OBJ_CONSTRUCT(&mca_plm_rsh_component.children, opal_list_t);
|
||||
mca_plm_rsh_component.using_qrsh = false;
|
||||
mca_plm_rsh_component.using_llspawn = false;
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(c, "num_concurrent",
|
||||
@ -129,6 +131,16 @@ int orte_plm_rsh_component_open(void)
|
||||
false, false, false, &tmp);
|
||||
mca_plm_rsh_component.daemonize_qrsh = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_int(c, "disable_llspawn",
|
||||
"Disable the use of llspawn when under the LoadLeveler environment",
|
||||
false, false, false, &tmp);
|
||||
mca_plm_rsh_component.disable_llspawn = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_int(c, "daemonize_llspawn",
|
||||
"Daemonize the orted when under the LoadLeveler environment",
|
||||
false, false, false, &tmp);
|
||||
mca_plm_rsh_component.daemonize_llspawn = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_int(c, "priority",
|
||||
"Priority of the rsh plm component",
|
||||
false, false, 10,
|
||||
@ -170,12 +182,26 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
|
||||
}
|
||||
free(tmp);
|
||||
mca_plm_rsh_component.using_qrsh = true;
|
||||
*priority = mca_plm_rsh_component.priority;
|
||||
*module = (mca_base_module_t *) &orte_plm_rsh_module;
|
||||
return ORTE_SUCCESS;
|
||||
goto success;
|
||||
} else if (!mca_plm_rsh_component.disable_llspawn &&
|
||||
NULL != getenv("LOADL_STEP_ID")) {
|
||||
/* We are running as a LOADLEVELER job.
|
||||
Search for llspawn in the user's PATH */
|
||||
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup("llspawn", NULL)) {
|
||||
opal_output_verbose(1, orte_plm_globals.output,
|
||||
"%s plm:rsh: unable to be used: LoadLeveler "
|
||||
"indicated but cannot find path or execution "
|
||||
"permissions not set for launching agent llspawn",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
mca_plm_rsh_component.using_llspawn = true;
|
||||
goto success;
|
||||
}
|
||||
|
||||
/* if this isn't a Grid Engine environment, see if MCA-specified agent (default: ssh:rsh) is available */
|
||||
/* if this isn't a Grid Engine or LoadLeveler environment,
|
||||
see if MCA-specified agent (default: ssh:rsh) is available */
|
||||
|
||||
if (ORTE_SUCCESS != orte_plm_base_rsh_launch_agent_lookup(NULL, NULL)) {
|
||||
/* this isn't an error - we just cannot be selected */
|
||||
@ -187,7 +213,7 @@ int orte_plm_rsh_component_query(mca_base_module_t **module, int *priority)
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
success:
|
||||
/* we are good - make ourselves available */
|
||||
*priority = mca_plm_rsh_component.priority;
|
||||
*module = (mca_base_module_t *) &orte_plm_rsh_module;
|
||||
|
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -177,8 +178,18 @@ int orte_plm_rsh_init(void)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
|
||||
free(tmp);
|
||||
}
|
||||
} else if(mca_plm_rsh_component.using_llspawn) {
|
||||
/* perform base setup for llspawn */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup("llspawn", NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
opal_output_verbose(1, orte_plm_globals.output,
|
||||
"%s plm:rsh: using \"%s\" for launching\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_plm_globals.rsh_agent_path);
|
||||
} else {
|
||||
/* not using qrsh - use MCA-specified agent */
|
||||
/* not using qrsh or llspawn - use MCA-specified agent */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
@ -684,7 +695,9 @@ static int setup_launch(int *argcptr, char ***argvptr,
|
||||
/* Daemonize when not using qrsh. Or, if using qrsh, only
|
||||
* daemonize if told to by user with daemonize_qrsh flag. */
|
||||
((!mca_plm_rsh_component.using_qrsh) ||
|
||||
(mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh))) {
|
||||
(mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) &&
|
||||
((!mca_plm_rsh_component.using_llspawn) ||
|
||||
(mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) {
|
||||
opal_argv_append(&argc, &argv, "--daemonize");
|
||||
}
|
||||
|
||||
|
@ -10,22 +10,13 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2010-2011 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/* Much of the code in this file is taken from the file ll_get_machine_list.c,
|
||||
* which is provided by IBM as part of their sample programs for LoadLeveler
|
||||
* in the samples/llmpich directory.
|
||||
*
|
||||
* IBM has approved the release of the sample code for Loadleveler under the
|
||||
* BSD license. Consequently, a more restrictive licensing clause that was
|
||||
* originally associated with the sample code and replicated here has been
|
||||
* removed.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
@ -33,8 +24,6 @@
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <llapi.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
@ -51,8 +40,11 @@
|
||||
*/
|
||||
static int orte_ras_loadleveler_allocate(opal_list_t *nodes);
|
||||
static int orte_ras_loadleveler_finalize(void);
|
||||
static int orte_ras_loadleveler_get_hostlist(int * num_hosts, char*** hostlist);
|
||||
|
||||
static int orte_ras_loadleveler_discover(opal_list_t *nodelist);
|
||||
static int ll_getline(FILE *fp, char *input);
|
||||
|
||||
#define LL_FILE_MAX_LINE_LENGTH 512
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
@ -69,50 +61,23 @@ orte_ras_base_module_t orte_ras_loadleveler_module = {
|
||||
*/
|
||||
static int orte_ras_loadleveler_allocate(opal_list_t *nodes)
|
||||
{
|
||||
int i, ret=ORTE_SUCCESS;
|
||||
opal_list_item_t* item;
|
||||
orte_node_t* node;
|
||||
char ** hostlist = NULL;
|
||||
int num_hosts = 0;
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_ras_loadleveler_get_hostlist(&num_hosts, &hostlist);
|
||||
if(ORTE_SUCCESS != ret) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_loadleveler_discover(nodes))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
for (i = 0; i < num_hosts; i++) {
|
||||
/* check for duplicated nodes */
|
||||
for (item = opal_list_get_first(nodes);
|
||||
opal_list_get_end(nodes) != item;
|
||||
item = opal_list_get_next(item)) {
|
||||
node = (orte_node_t*) item;
|
||||
if (0 == strcmp(node->name, hostlist[i])) {
|
||||
++node->slots;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(opal_list_get_end(nodes) == item) {
|
||||
/* we did not find a duplicate, so add a new item to the list */
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
if (NULL == node) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
ret = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
node->name = strdup(hostlist[i]);
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = 0;
|
||||
node->slots = 1;
|
||||
opal_list_append(nodes, &node->super);
|
||||
}
|
||||
/* If we didn't find anything, then this
|
||||
* is an unrecoverable error - report it
|
||||
*/
|
||||
if (opal_list_is_empty(nodes)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: No nodes were found in the LOADL_HOSTFILE - %s",
|
||||
getenv("LOADL_HOSTFILE"));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
opal_argv_free(hostlist);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -121,292 +86,96 @@ cleanup:
|
||||
*/
|
||||
static int orte_ras_loadleveler_finalize(void)
|
||||
{
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:finalize: success (nothing to do)");
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
|
||||
"ras:loadleveler:finalize: success (nothing to do)"));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the hostlist from LoadLeveler
|
||||
* *hostlist should either be NULL or a valid argv and *num_hosts
|
||||
* should be 0 or the number of elements in the hostlist argv
|
||||
/**
|
||||
* Discover the available resources. Obtain directly from LoadLeveler (and
|
||||
* therefore have no need to validate) -- ignore hostfile or any other
|
||||
* user-specified parameters.
|
||||
*/
|
||||
static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
static int orte_ras_loadleveler_discover(opal_list_t* nodelist)
|
||||
{
|
||||
LL_element *queryObject = NULL, *job = NULL, *step = NULL;
|
||||
LL_element *node = NULL, *task = NULL, *task_instance = NULL;
|
||||
int rc, obj_count, err_code, ll_master_task, job_step_count;
|
||||
char *ll_step_id= NULL, *job_step_list[2], *task_machine_name = NULL;
|
||||
char *schedd_host_name = NULL;
|
||||
int step_mode;
|
||||
|
||||
/* Get the step ID from LOADL_STEP_ID environment variable. */
|
||||
if(NULL == (ll_step_id = getenv("LOADL_STEP_ID"))) {
|
||||
orte_node_t *node;
|
||||
opal_list_item_t* item;
|
||||
FILE *fp;
|
||||
char *hostname;
|
||||
char *filename;
|
||||
char input[LL_FILE_MAX_LINE_LENGTH];
|
||||
|
||||
/* Ignore anything that the user already specified -- we're
|
||||
getting nodes only from LoadLeveler. */
|
||||
filename = getenv("LOADL_HOSTFILE");
|
||||
if(NULL == filename) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: could not get LOADL_STEP_ID "
|
||||
"from environment!");
|
||||
"ras:loadleveler:allocate:discover: LOADL_HOSTFILE not set. "
|
||||
"Unable to discover allocated nodes.");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
job_step_list[0] = ll_step_id;
|
||||
job_step_list[1] = NULL;
|
||||
|
||||
/* STEP 1: Get Job object from Central Manager to find out the name of the
|
||||
* Schedd daemon that handles this job. In a Multicluster environment we
|
||||
* can not get the schedd name from the job step id. */
|
||||
|
||||
/* Initialize the LL API. Specify that query type is JOBS. */
|
||||
if(NULL == (queryObject = ll_query(JOBS))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: 1 ll_query faild on JOBS!");
|
||||
return ORTE_ERROR;
|
||||
fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
return ORTE_ERR_FILE_OPEN_FAILURE;
|
||||
}
|
||||
|
||||
/* Specify that this is a QUERY_STEPID type of query. */
|
||||
rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);
|
||||
if(0 > rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: 1 ll_set_request failed: "
|
||||
"error %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* Iterate through all the nodes and make an entry for each */
|
||||
while (0 != ll_getline(fp, input)) {
|
||||
hostname = strdup(input);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
|
||||
"%s ras:loadleveler:allocate:discover: got hostname %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));
|
||||
|
||||
/* Get a Job object from LoadL_schedd that contains the relevant job step */
|
||||
job = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code);
|
||||
if(NULL == job) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_objs LL_CM "
|
||||
"failed: err_code=%d", err_code);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* Remember that LoadLeveler may list the same node more than once.
|
||||
So we have to check for duplicates. */
|
||||
for (item = opal_list_get_first(nodelist);
|
||||
opal_list_get_end(nodelist) != item;
|
||||
item = opal_list_get_next(item)) {
|
||||
node = (orte_node_t*) item;
|
||||
if (0 == strcmp(node->name, hostname)) {
|
||||
++node->slots;
|
||||
|
||||
if (obj_count != 1) { /* Only 1 Job object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_objs LL_CM "
|
||||
"expected one job to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(0 != (rc = ll_get_data(job, LL_JobSchedd, &schedd_host_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data LL_JobSchedd"
|
||||
" failure, RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (schedd_host_name != NULL) {
|
||||
job_step_list[0] = ll_step_id;
|
||||
job_step_list[1] = NULL;
|
||||
} else {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data() Error: Could "
|
||||
"not determine managing schedd for job %s.\n",
|
||||
job_step_list[0]);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
ll_free_objs(queryObject);
|
||||
ll_deallocate(queryObject);
|
||||
|
||||
/* STEP 2: Get Job object from Schedd that manages this job step. */
|
||||
/* Only schedd query gives us all the relevant task instance info. */
|
||||
|
||||
/* Initialize the LL API. Specify that query type is JOBS. */
|
||||
if(NULL == (queryObject = ll_query(JOBS))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: 2 ll_query faild on JOBS!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Specify that this is a QUERY_STEPID type of query. */
|
||||
rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: 2 ll_set_request failed: "
|
||||
"error %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Get a Job object from LoadL_schedd that contains the relevant job step */
|
||||
job = ll_get_objs(queryObject, LL_SCHEDD, schedd_host_name, &obj_count,
|
||||
&err_code);
|
||||
if(NULL == job) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_objs LL_SCHEDD "
|
||||
"failed: err_code=%d", err_code);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if (obj_count != 1) { /* Only 1 Job object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_objs LL_SCHEDD "
|
||||
"expected one job to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(0 != (rc = ll_get_data(job, LL_JobStepCount, &job_step_count))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data LL_JobStepCount"
|
||||
" failure, RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (job_step_count != 1) { /* Only 1 Job Step object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data LL_JobStepCount"
|
||||
" expected one jobstep to match, got %d!", job_step_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
step = NULL;
|
||||
if(0 != (rc = ll_get_data(job, LL_JobGetFirstStep, &step))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: 3 ll_get_data: failure on "
|
||||
"LL_JobGetFirstStep. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if(NULL == step) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: 3 ll_get_data: Error: "
|
||||
"Unable to obtain Job Step information.\n");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
step_mode = -1;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepParallelMode, &step_mode))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: 4 ll_get_data: failure on "
|
||||
"LL_StepParallelMode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Serial job step: step_mode==0; Parallel: step_mode==1; Others:2,3,4. */
|
||||
if ((step_mode != 0) && (step_mode != 1)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: We support only Serial and "
|
||||
"Parallel LoadLeveler job types. PVM, NQS, and Blue Gene"
|
||||
"jobs are not supported by the LoadLeveler RAS!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(step_mode == 0) { /* serial job */
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on serial LL_StepGetFirstNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on serial LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task_instance = NULL;
|
||||
rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on serial LL_TaskGetFirstInstance. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task_machine_name = NULL;
|
||||
if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName,
|
||||
&task_machine_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on serial LL_TaskInstanceMachineName. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
opal_argv_append(num_hosts, hostlist, task_machine_name);
|
||||
|
||||
} else { /* parallel job */
|
||||
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on LL_StepGetFirstNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
while(NULL != node) { /* Loop through the "Node" objects. */
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
while(task) { /* Loop through the "Task" objects. */
|
||||
ll_master_task = 0;
|
||||
rc = ll_get_data(task, LL_TaskIsMaster, &ll_master_task);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: "
|
||||
"failure on LL_TaskIsMaster. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* The "master task" Task object is a LoadLeveler abstraction
|
||||
* and is not relevant here. Look at only Task objects that
|
||||
* are not "master".*/
|
||||
if (!ll_master_task) {
|
||||
task_instance = NULL;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance,
|
||||
&task_instance))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data:"
|
||||
" failure on LL_TaskGetFirstTaskInstance."
|
||||
" RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Loop through the "Task Instance" objects. */
|
||||
while (task_instance) {
|
||||
task_machine_name = NULL;
|
||||
rc = ll_get_data(task_instance,
|
||||
LL_TaskInstanceMachineName,
|
||||
&task_machine_name);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data:"
|
||||
" failure on LL_TaskInstanceMachineName"
|
||||
"RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
opal_argv_append(num_hosts, hostlist, task_machine_name);
|
||||
task_instance = NULL;
|
||||
rc = ll_get_data(task, LL_TaskGetNextTaskInstance,
|
||||
&task_instance);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data:"
|
||||
" failure on LL_TaskGetNextTaskInstance. "
|
||||
"RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetNextTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: "
|
||||
"failure on LL_NodeGetNextTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetNextNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: "
|
||||
"failure on LL_StepGetNextNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
|
||||
"%s ras:loadleveler:allocate:discover: found -- bumped slots to %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Did we find it? */
|
||||
if (opal_list_get_end(nodelist) == item) {
|
||||
/* Nope -- didn't find it, so add a new item to the list */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
|
||||
"%s ras:loadleveler:allocate:discover: not found -- added to list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = hostname;
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = 0;
|
||||
node->slots = 1;
|
||||
opal_list_append(nodelist, &node->super);
|
||||
} else {
|
||||
/* Yes, so we need to free the hostname that came back */
|
||||
free(hostname);
|
||||
}
|
||||
}
|
||||
ll_free_objs(queryObject);
|
||||
ll_deallocate(queryObject);
|
||||
fclose(fp);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Fallback so this helper is self-contained; the file-level definition
 * (same value) takes precedence when it is already in scope. */
#ifndef LL_FILE_MAX_LINE_LENGTH
#define LL_FILE_MAX_LINE_LENGTH 512
#endif

/**
 * Read one line from fp into input and strip its trailing newline.
 *
 * @param fp     Open stream to read from.
 * @param input  Caller-supplied buffer of at least
 *               LL_FILE_MAX_LINE_LENGTH bytes.
 *
 * @return 1 if a line was read, 0 if fgets() returned NULL
 *         (end-of-file or read error).
 */
static int ll_getline(FILE *fp, char *input)
{
    if (NULL == fgets(input, LL_FILE_MAX_LINE_LENGTH, fp)) {
        return 0;
    }

    /* fgets() stores the newline only when one was actually read.  The
     * last line of a file may lack it, and an over-long line is cut
     * before it, so strip a newline only if present instead of blindly
     * dropping the final character (which would truncate the hostname
     * or index input[-1] on an empty string). */
    input[strcspn(input, "\n")] = '\0';

    return 1;
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user