Fix for bug #369.
LoadLeveler only sets LOADL_PROCESSOR_LIST when 128 or fewer tasks are allocated to a job. The POE RAS relied on this variable, so I created a new RAS which uses the LoadLeveler API instead of relying on the environment variable. This still needs some testing, so for now we use the POE RAS whenever LOADL_PROCESSOR_LIST is set; otherwise we fall back on this component. Unfortunately, this will require an autogen... This commit was SVN r11732.
Этот коммит содержится в:
родитель
645790dd9c
Коммит
83a7f6e4de
@ -70,6 +70,7 @@ m4_include(config/ompi_check_portals.m4)
|
||||
m4_include(config/ompi_check_psm.m4)
|
||||
m4_include(config/ompi_check_udapl.m4)
|
||||
m4_include(config/ompi_check_package.m4)
|
||||
m4_include(config/ompi_check_loadleveler.m4)
|
||||
m4_include(config/ompi_check_slurm.m4)
|
||||
m4_include(config/ompi_check_tm.m4)
|
||||
m4_include(config/ompi_check_xgrid.m4)
|
||||
|
52
config/ompi_check_loadleveler.m4
Обычный файл
52
config/ompi_check_loadleveler.m4
Обычный файл
@ -0,0 +1,52 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
|
||||
# OMPI_CHECK_LOADLEVELER(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check whether the LoadLeveler API is usable by looking for the llapi.h
# header and the ll_query function in the llapi library.
#
#   prefix               passed through to OMPI_CHECK_PACKAGE as its first
#                        argument
#   action-if-found      run when the header and library were both found
#   action-if-not-found  run when the check failed or was disabled
#
# --with-loadleveler=DIR  search DIR for the LoadLeveler installation
# --without-loadleveler   skip the check and run action-if-not-found
#
# If --with-loadleveler was given with any value other than "no" and the
# check fails, configure aborts with an error.
AC_DEFUN([OMPI_CHECK_LOADLEVELER],[
    AC_ARG_WITH([loadleveler],
                [AC_HELP_STRING([--with-loadleveler],
                                [Directory where the loadleveler software is installed])])

    AS_IF([test "$with_loadleveler" = "no"],
          [ompi_check_loadleveler_happy="no"],
          [ompi_check_loadleveler_happy="yes"
           AS_IF([test ! -z "$with_loadleveler" -a "$with_loadleveler" != "yes"],
                 [ompi_check_loadleveler_dir="$with_loadleveler"],
                 [ompi_check_loadleveler_dir=""])])

    AS_IF([test "$ompi_check_loadleveler_happy" = "yes"],
          [OMPI_CHECK_PACKAGE([$1],
                              [llapi.h],
                              [llapi],
                              [ll_query],
                              [],
                              [$ompi_check_loadleveler_dir],
                              [],
                              [ompi_check_loadleveler_happy="yes"],
                              [ompi_check_loadleveler_happy="no"])])

    AS_IF([test "$ompi_check_loadleveler_happy" = "yes"],
          [$2],
          [AS_IF([test ! -z "$with_loadleveler" -a "$with_loadleveler" != "no"],
                 [AC_MSG_ERROR([LOADLEVELER support requested but not found.  Aborting])])
           $3])
])
|
59
orte/mca/ras/loadleveler/Makefile.am
Обычный файл
59
orte/mca/ras/loadleveler/Makefile.am
Обычный файл
@ -0,0 +1,59 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
|
||||
|
||||
# Preprocessor flags come from the ras_loadleveler_CPPFLAGS substitution
# made by this component's configure.m4.
AM_CPPFLAGS = $(ras_loadleveler_CPPFLAGS)

# Files that make up the component, regardless of how it is built.
sources = ras_loadleveler.h ras_loadleveler_component.c ras_loadleveler_module.c

# The component is built in this directory as exactly one of:
#   mca_<type>_<name>.la     when built as a DSO
#   libmca_<type>_<name>.la  when built statically
# The unused variant gets an empty name and an empty source list.

if OMPI_BUILD_ras_loadleveler_DSO
component = mca_ras_loadleveler.la
component_sources = $(sources)
lib =
lib_sources =
else
component =
component_sources =
lib = libmca_ras_loadleveler.la
lib_sources = $(sources)
endif

# DSO variant: installed into the openmpi component directory.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component)
mca_ras_loadleveler_la_SOURCES = $(component_sources)
mca_ras_loadleveler_la_LDFLAGS = -module -avoid-version $(ras_loadleveler_LDFLAGS)
mca_ras_loadleveler_la_LIBADD = \
    $(ras_loadleveler_LIBS) \
    $(top_ompi_builddir)/orte/liborte.la \
    $(top_ompi_builddir)/opal/libopal.la

# Static variant: not installed on its own.
noinst_LTLIBRARIES = $(lib)
libmca_ras_loadleveler_la_SOURCES = $(lib_sources)
libmca_ras_loadleveler_la_LDFLAGS = -module -avoid-version $(ras_loadleveler_LDFLAGS)
libmca_ras_loadleveler_la_LIBADD = $(ras_loadleveler_LIBS)
|
37
orte/mca/ras/loadleveler/configure.m4
Обычный файл
37
orte/mca/ras/loadleveler/configure.m4
Обычный файл
@ -0,0 +1,37 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ras_loadleveler_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure the LoadLeveler RAS component.  Runs OMPI_CHECK_LOADLEVELER
# with the ras_loadleveler prefix, records the outcome in
# ras_loadleveler_good and then runs the matching caller action.
AC_DEFUN([MCA_ras_loadleveler_CONFIG],[
    OMPI_CHECK_LOADLEVELER([ras_loadleveler],
                           [ras_loadleveler_good=1],
                           [ras_loadleveler_good=0])

    # When the check succeeded, copy the link flags into the
    # WRAPPER_EXTRA variables before running the found action.
    AS_IF([test "$ras_loadleveler_good" = "1"],
          [ras_loadleveler_WRAPPER_EXTRA_LDFLAGS="$ras_loadleveler_LDFLAGS"
           ras_loadleveler_WRAPPER_EXTRA_LIBS="$ras_loadleveler_LIBS"
           $1],
          [$2])

    # Substitute the build flags referenced by Makefile.am.
    AC_SUBST([ras_loadleveler_CPPFLAGS])
    AC_SUBST([ras_loadleveler_LDFLAGS])
    AC_SUBST([ras_loadleveler_LIBS])
])dnl
|
21
orte/mca/ras/loadleveler/configure.params
Обычный файл
21
orte/mca/ras/loadleveler/configure.params
Обычный файл
@ -0,0 +1,21 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# NOTE(review): presumably a source file the build system uses to confirm it
# is in this component's directory -- confirm against the autogen tooling.
PARAM_INIT_FILE=ras_loadleveler_component.c
|
||||
# NOTE(review): presumably the list of files configure generates for this
# component -- confirm against the autogen tooling.
PARAM_CONFIG_FILES="Makefile"
|
40
orte/mca/ras/loadleveler/ras_loadleveler.h
Обычный файл
40
orte/mca/ras/loadleveler/ras_loadleveler.h
Обычный файл
@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Resource Allocation (Loadleveler)
|
||||
*/
|
||||
#ifndef ORTE_RAS_LOADLEVELER_H
|
||||
#define ORTE_RAS_LOADLEVELER_H
|
||||
|
||||
#include "orte/mca/ras/ras.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Component descriptor for the LoadLeveler RAS; defined in
 * ras_loadleveler_component.c. */
ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_loadleveler_component;
|
||||
/* Module function table for the LoadLeveler RAS; defined in
 * ras_loadleveler_module.c. */
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_loadleveler_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
109
orte/mca/ras/loadleveler/ras_loadleveler_component.c
Обычный файл
109
orte/mca/ras/loadleveler/ras_loadleveler_component.c
Обычный файл
@ -0,0 +1,109 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "ras_loadleveler.h"
|
||||
|
||||
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
static int param_priority;
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_loadleveler_open(void);
|
||||
static orte_ras_base_module_t *ras_loadleveler_init(int*);
|
||||
|
||||
|
||||
orte_ras_base_component_t mca_ras_loadleveler_component = {
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
{
|
||||
/* Indicate that we are a ras v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||
|
||||
/* Component name and version */
|
||||
|
||||
"loadleveler",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
|
||||
orte_ras_loadleveler_open,
|
||||
NULL
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
false
|
||||
},
|
||||
|
||||
ras_loadleveler_init
|
||||
};
|
||||
|
||||
|
||||
static int orte_ras_loadleveler_open(void)
|
||||
{
|
||||
/* for now we set the priority lower then the priority of the POE RAS
|
||||
* so that it is used whenever the LOADL_PROCESSOR_LIST is actually set */
|
||||
param_priority =
|
||||
mca_base_param_reg_int(&mca_ras_loadleveler_component.ras_version,
|
||||
"priority",
|
||||
"Priority of the loadleveler ras component",
|
||||
false, false, 90, NULL);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static orte_ras_base_module_t *ras_loadleveler_init(int* priority)
|
||||
{
|
||||
/* if we are not an HNP, then we must not be selected */
|
||||
if (!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Are we running under a LOADLEVELER job? */
|
||||
if (NULL != getenv("LOADL_STEP_ID")) {
|
||||
mca_base_param_lookup_int(param_priority, priority);
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler: available for selection with priority %d",
|
||||
param_priority);
|
||||
return &orte_ras_loadleveler_module;
|
||||
}
|
||||
|
||||
/* Sadly, no */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler: NOT available for selection");
|
||||
return NULL;
|
||||
}
|
||||
|
399
orte/mca/ras/loadleveler/ras_loadleveler_module.c
Обычный файл
399
orte/mca/ras/loadleveler/ras_loadleveler_module.c
Обычный файл
@ -0,0 +1,399 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/* Much of the code in this file is taken from the file ll_get_machine_list.c,
|
||||
* which is provided by IBM as part of their sample programs for LoadLeveler
|
||||
* in the samples/llmpich directory. The documentation has the following license:
|
||||
* COPYRIGHT LICENSE:
|
||||
* This information contains sample application programs in source language, which
|
||||
* illustrate programming techniques on various operating platforms. You may copy,
|
||||
* modify, and distribute these sample programs in any form without payment to
|
||||
* IBM, for the purposes of developing, using, marketing or distributing
|
||||
* application programs conforming to the application programming interface for
|
||||
* the operating platform for which the sample programs are written. These
|
||||
* examples have not been thoroughly tested under all conditions. IBM,
|
||||
* therefore, cannot guarantee or imply reliability, serviceability, or
|
||||
* function of these programs.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <llapi.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_loadleveler.h"
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_loadleveler_allocate(orte_jobid_t jobid);
|
||||
static int orte_ras_loadleveler_deallocate(orte_jobid_t jobid);
|
||||
static int orte_ras_loadleveler_finalize(void);
|
||||
static int orte_ras_loadleveler_get_hostlist(int * num_hosts, char*** hostlist);
|
||||
|
||||
|
||||
/*
|
||||
* Global variable
|
||||
*/
|
||||
orte_ras_base_module_t orte_ras_loadleveler_module = {
|
||||
orte_ras_loadleveler_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_loadleveler_deallocate,
|
||||
orte_ras_loadleveler_finalize
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Discover available (pre-allocated) nodes. Allocate the
|
||||
* requested number of nodes/process slots to the job.
|
||||
*
|
||||
*/
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
/*
 * Build the node list for a job from the hosts reported by LoadLeveler.
 *
 * The host list contains one entry per task instance, so a host that
 * appears k times becomes a single node with k slots.  The resulting
 * list is passed to orte_ras_base_node_insert and then to
 * orte_ras_base_allocate_nodes for the given job.
 *
 * @param jobid  Job the nodes are allocated to.
 *
 * @return ORTE_SUCCESS on success; the error from
 *         orte_ras_loadleveler_get_hostlist, orte_ras_base_node_insert
 *         or orte_ras_base_allocate_nodes if one of them fails;
 *         ORTE_ERR_OUT_OF_RESOURCE if a node cannot be created.
 */
static int orte_ras_loadleveler_allocate(orte_jobid_t jobid)
{
    int i, rc;
    opal_list_t nodes_list;
    opal_list_item_t* item;
    orte_ras_node_t* node;
    char ** hostlist = NULL;
    int num_hosts = 0;

    rc = orte_ras_loadleveler_get_hostlist(&num_hosts, &hostlist);
    if (ORTE_SUCCESS != rc) {
        /* the list may have been partially filled before the failure */
        opal_argv_free(hostlist);
        return rc;
    }

    OBJ_CONSTRUCT(&nodes_list, opal_list_t);
    for (i = 0; i < num_hosts; i++) {
        /* check for duplicated nodes */
        for (item = opal_list_get_first(&nodes_list);
             opal_list_get_end(&nodes_list) != item;
             item = opal_list_get_next(item)) {
            node = (orte_ras_node_t*) item;
            if (0 == strcmp(node->node_name, hostlist[i])) {
                ++node->node_slots;
                break;
            }
        }
        if (opal_list_get_end(&nodes_list) != item) {
            /* host already known; its slot count was bumped above */
            continue;
        }

        /* we did not find a duplicate, so add a new item to the list */
        node = OBJ_NEW(orte_ras_node_t);
        if (NULL == node) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        node->node_name = strdup(hostlist[i]);
        if (NULL == node->node_name) {
            OBJ_RELEASE(node);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        node->node_arch = NULL;
        node->node_state = ORTE_NODE_STATE_UP;
        node->node_cellid = 0;
        node->node_slots_inuse = 0;
        node->node_slots_max = 0;
        node->node_slots = 1;
        opal_list_append(&nodes_list, &node->super);
    }

    rc = orte_ras_base_node_insert(&nodes_list);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_ras_base.ras_output,
                    "ras:loadleveler:allocate: node insert failed: error %d!", rc);
        goto cleanup;
    }

    rc = orte_ras_base_allocate_nodes(jobid, &nodes_list);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_ras_base.ras_output,
                    "ras:loadleveler:allocate: allocate nodes failed: error %d!", rc);
    }

cleanup:
    /* release everything on both the success and the failure path */
    while (NULL != (item = opal_list_remove_first(&nodes_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes_list);
    opal_argv_free(hostlist);

    return rc;
}
|
||||
|
||||
/*
|
||||
* There's really nothing to do here
|
||||
*/
|
||||
static int orte_ras_loadleveler_deallocate(orte_jobid_t jobid)
|
||||
{
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:deallocate: success (nothing to do)");
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* There's really nothing to do here
|
||||
*/
|
||||
static int orte_ras_loadleveler_finalize(void)
|
||||
{
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:finalize: success (nothing to do)");
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the hostlist from LoadLeveler
|
||||
* *hostlist should either by NULL or a valid argv and *num_hosts
|
||||
* should be 0 or the number of elements in the hostlist argv
|
||||
*/
|
||||
static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
{
|
||||
LL_element *queryObject = NULL, *job = NULL, *step = NULL;
|
||||
LL_element *node = NULL, *task = NULL, *task_instance = NULL;
|
||||
int rc, obj_count, err_code, ll_master_task, job_step_count;
|
||||
char *ll_step_id= NULL, *job_step_list[2], *task_machine_name = NULL;
|
||||
char *schedd_host_name = NULL;
|
||||
int step_mode;
|
||||
|
||||
/* Get the step ID from LOADL_STEP_ID environment variable. */
|
||||
if(NULL == (ll_step_id = getenv("LOADL_STEP_ID"))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: could not get LOADL_STEP_ID from environment!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
job_step_list[0] = ll_step_id;
|
||||
job_step_list[1] = NULL;
|
||||
|
||||
/* STEP 1: Get Job object from Central Manager to find out the name of the Schedd */
|
||||
/* daemon that handles this job. In a Multicluster environment we can not get */
|
||||
/* the schedd name from the job step id. */
|
||||
|
||||
/* Initialize the LL API. Specify that query type is JOBS. */
|
||||
if(NULL == (queryObject = ll_query(JOBS))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_query faild on JOBS!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Specify that this is a QUERY_STEPID type of query. */
|
||||
if(0 > (rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Get a Job object from LoadL_schedd that contains the relevant job step. */
|
||||
if(NULL == (job = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs faild!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if (obj_count != 1) { /* Only 1 Job object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(0 != (rc = ll_get_data(job, LL_JobSchedd, &schedd_host_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (schedd_host_name != NULL) {
|
||||
job_step_list[0] = ll_step_id;
|
||||
job_step_list[1] = NULL;
|
||||
} else {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs() Error: Could not "
|
||||
"determine managing schedd for job %s.\n",
|
||||
job_step_list[0]);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
ll_free_objs(queryObject);
|
||||
ll_deallocate(queryObject);
|
||||
|
||||
/* STEP 2: Get Job object from Schedd that manages this job step. */
|
||||
/* Only schedd query gives us all the relevant task instance info. */
|
||||
|
||||
/* Initialize the LL API. Specify that query type is JOBS. */
|
||||
if(NULL == (queryObject = ll_query(JOBS))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_query faild on JOBS!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Specify that this is a QUERY_STEPID type of query. */
|
||||
if(0 != (rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Get a Job object from LoadL_schedd that contains the relevant job step. */
|
||||
if(NULL == (job = ll_get_objs(queryObject, LL_SCHEDD, schedd_host_name, &obj_count, &err_code))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if (obj_count != 1) { /* Only 1 Job object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(0 != (rc = ll_get_data(job, LL_JobStepCount, &job_step_count))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (job_step_count != 1) { /* Only 1 Job Step object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job step to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
step = NULL;
|
||||
if(0 != (rc = ll_get_data(job, LL_JobGetFirstStep, &step))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (!step) {
|
||||
fprintf(stderr, "ll_get_data() Error: Unable to obtain Job Step information.\n");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
step_mode = -1;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepParallelMode, &step_mode))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_StepParallelMode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Serial job step: step_mode == 0; Parallel: step_mode == 1; Others: 2, 3, 4. */
|
||||
if ((step_mode != 0) && (step_mode != 1)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: We support only Serial and Parallel LoadLeveler job types."
|
||||
"PVM, NQS, and Blue Gene jobs are not supported by the LoadLeveler RAS!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(step_mode == 0) { /* serial job */
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_StepGetFirstNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task_instance = NULL;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_TaskGetFirstInstance. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task_machine_name = NULL;
|
||||
if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName, &task_machine_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_TaskInstanceMachineName. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
opal_argv_append(num_hosts, hostlist, task_machine_name);
|
||||
ll_free_objs(queryObject);
|
||||
ll_deallocate(queryObject);
|
||||
|
||||
} else { /* parallel job */
|
||||
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_StepGetFirstNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
while(NULL != node) { /* Loop through the "Node" objects. */
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
while(task) { /* Loop through the "Task" objects. */
|
||||
ll_master_task = 0;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskIsMaster, &ll_master_task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_TaskIsMaster. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* The "master task" Task object is a LoadLeveler abstraction and is not relevant here. */
|
||||
/* Look at only Task objects that are not "master". */
|
||||
if (!ll_master_task) {
|
||||
task_instance = NULL;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_TaskGetFirstTaskInstance."
|
||||
" RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
while (task_instance) { /* Loop through the "Task Instance" objects. */
|
||||
task_machine_name = NULL;
|
||||
if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName, &task_machine_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_TaskInstanceMachineName. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
opal_argv_append(num_hosts, hostlist, task_machine_name);
|
||||
printf("added %s\n", task_machine_name);
|
||||
task_instance = NULL;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetNextTaskInstance, &task_instance))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_TaskGetNextInstance. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetNextTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_NodeGetNextTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetNextNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_StepGetNextNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user