
* Added new ras and pls components to support Sun N1 Grid Engine (N1GE) 6 and its open source version as the job launchers for ORTE.
This commit was SVN r11153.
This commit is contained in:
Pak Lui 2006-08-10 21:46:52 +00:00
parent 726e92e3c5
commit 08352878cc
17 changed files: 2088 additions and 0 deletions
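
In short, these components let an MPI job be launched under a pre-configured Grid Engine Parallel Environment. The fuller example lives in pls_gridengine.h below; the basic flow (PE name 'orte' taken from that example) is:

% source /opt/n1ge/default/common/settings.csh
% qsub -pe orte 4 gridengine.csh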

@@ -11,6 +11,8 @@ Copyright (c) 2004-2006 The Regents of the University of California.
Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
Copyright (c) 2006 Voltaire, Inc. All rights reserved.
Copyright (c) 2006 Sandia National Laboratories. All rights reserved.
Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
Use is subject to license terms.
$COPYRIGHT$
Additional copyrights may follow

NEWS

@@ -10,6 +10,8 @@ Copyright (c) 2004-2006 The Regents of the University of California.
All rights reserved.
Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
Copyright (c) 2006 Voltaire, Inc. All rights reserved.
Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
Use is subject to license terms.
$COPYRIGHT$
Additional copyrights may follow
@@ -25,6 +27,8 @@ version 1.0.
1.2
---
- Added tight integration with Sun N1 Grid Engine (N1GE) 6 and the
open source Grid Engine.
- Allow building the F90 MPI bindings as shared libraries for most
compilers / platforms. Explicitly disallow building the F90
bindings as shared libraries on OS X because of complicated

README

@@ -10,6 +10,8 @@ Copyright (c) 2004-2006 The Regents of the University of California.
All rights reserved.
Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
Copyright (c) 2006 Voltaire, Inc. All rights reserved.
Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
Use is subject to license terms.
$COPYRIGHT$
Additional copyrights may follow
@@ -73,6 +75,7 @@ base as of this writing (17 Jun 2006):
- SLURM
- XGrid
- Cray XT-3 / Red Storm
- Sun N1 Grid Engine (N1GE) 6 and open source Grid Engine
- The majority of Open MPI's documentation is here in this file and on
the web site FAQ (http://www.open-mpi.org/). This will eventually

orte/mca/pls/gridengine/Makefile.am (new file)

@@ -0,0 +1,50 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-pls-gridengine.txt
sources = \
pls_gridengine.h \
pls_gridengine_component.c \
pls_gridengine_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_pls_gridengine_DSO
component_noinst =
component_install = mca_pls_gridengine.la
else
component_noinst = libmca_pls_gridengine.la
component_install =
endif
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_pls_gridengine_la_SOURCES = $(sources)
mca_pls_gridengine_la_LDFLAGS = -module -avoid-version
mca_pls_gridengine_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
noinst_LTLIBRARIES = $(component_noinst)
libmca_pls_gridengine_la_SOURCES =$(sources)
libmca_pls_gridengine_la_LDFLAGS = -module -avoid-version

orte/mca/pls/gridengine/configure.m4 (new file)

@@ -0,0 +1,26 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_pls_gridengine_CONFIG([action-if-found], [action-if-not-found])
# --------------------------------------------------------------------
AC_DEFUN([MCA_pls_gridengine_CONFIG],[
AC_CHECK_FUNC([fork], [$1], [$2])
])dnl

orte/mca/pls/gridengine/configure.params (new file)

@@ -0,0 +1,26 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_INIT_FILE=pls_gridengine_component.c
PARAM_CONFIG_HEADER_FILE="pls_gridengine_config.h"
PARAM_CONFIG_FILES="Makefile"

orte/mca/pls/gridengine/help-pls-gridengine.txt (new file)

@@ -0,0 +1,53 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[no-local-orted]
The gridengine pls component was not able to find the executable "orted" in
your PATH or in the directory where Open MPI was initially installed,
and therefore cannot continue.
For reference, your current PATH is:
%s
We also looked for orted in the following directory:
%s
[bad-qrsh-path]
The gridengine pls component is not able to find the
executable "qrsh" in $SGE_ROOT/bin/$ARC/qrsh, and therefore cannot
continue.
For reference, your current path to qrsh is:
%s
The $SGE_ROOT environment variable points to:
%s
The $ARC environment variable points to:
%s
[insufficient-pe-slot]
Exiting from the gridengine pls module. There are not enough PE slots
available in the Grid Engine Parallel Environment (PE) to satisfy the
slot requirement needed to launch the ORTE daemon on this host:
%s
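
The messages above refer to the qrsh executable under $SGE_ROOT/bin/$ARC and to orted on the PATH. A minimal sanity check along those lines (illustrative only, not part of this commit; it assumes the Grid Engine settings file has already been sourced so that $SGE_ROOT and $ARC are set) could be:

% echo $SGE_ROOT $ARC
% ls $SGE_ROOT/bin/$ARC/qrsh
% which orted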

orte/mca/pls/gridengine/pls_gridengine.h (new file)

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
* Process launcher for the Grid Engine
*
* (1) Example of running a batch job under the Grid Engine:
*
* Write a script that runs the MPI job
* @code
* % cat gridengine.csh
* #!/usr/bin/csh
*
* # you can optionally set PATH and LD_LIBRARY_PATH instead of setting prefix
* set prefix=/path_to/open-mpi-build
*
* /path_to/mpirun -np 4 -prefix $prefix ./connectivity -v
* @endcode
*
* Source the grid engine environment:
* @code
* % source /opt/n1ge/default/common/settings.csh
* @endcode
*
* Submit the job with 4 PE slots
* under a predefined Parallel Environment 'orte':
* @code
* % qsub -pe orte 4 gridengine.csh
* your job 305 ("gridengine.csh") has been submitted
* @endcode
*
* Getting the output:
* @code
* % more gridengine.csh.o305
* Warning: no access to tty (Bad file number).
* Thus no job control in this shell.
* Sun Microsystems Inc. SunOS 5.10 Generic January 2005
* checking connection 0 <-> 1
* checking connection 0 <-> 2
* checking connection 1 <-> 2
* checking connection 0 <-> 3
* checking connection 1 <-> 3
* checking connection 2 <-> 3
* Connectivity test on 4 processes PASSED.
* @endcode
*
* (2) Example of running an interactive job under the Grid Engine:
*
* Source the grid engine environment:
* @code
* % source /opt/n1ge/default/common/settings.csh
* @endcode
*
* Start an interactive job with 4 slots
* under a predefined Parallel Environment 'orte':
* @code
* % qsh -pe orte 4
* waiting for interactive job to be scheduled ...
* Your interactive job 324 has been successfully scheduled.
* @endcode
*
* Run the MPI job. You may need to set PATH and LD_LIBRARY_PATH or -prefix
* @code
* % /path_to/mpirun -np 4 hostname
* host-5
* host-5
* host-4
* host-4
* @endcode
*/
#ifndef ORTE_PLS_GRIDENGINE_EXPORT_H
#define ORTE_PLS_GRIDENGINE_EXPORT_H
#include "orte_config.h"
#include "orte/mca/pls/pls.h"
#include "opal/mca/mca.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
* Module open / close
*/
int orte_pls_gridengine_component_open(void);
int orte_pls_gridengine_component_close(void);
orte_pls_base_module_t* orte_pls_gridengine_component_init(int *priority);
/*
* Startup / Shutdown
*/
int orte_pls_gridengine_finalize(void);
/*
* Interface
*/
int orte_pls_gridengine_launch(orte_jobid_t);
int orte_pls_gridengine_terminate_job(orte_jobid_t);
int orte_pls_gridengine_terminate_proc(const orte_process_name_t*);
int orte_pls_gridengine_signal_job(orte_jobid_t, int32_t);
int orte_pls_gridengine_signal_proc(const orte_process_name_t*, int32_t);
/**
* PLS Component
*/
struct orte_pls_gridengine_component_t {
orte_pls_base_component_t super;
orte_jobid_t jobid;
int priority;
int verbose;
int debug;
char* orted;
};
typedef struct orte_pls_gridengine_component_t orte_pls_gridengine_component_t;
ORTE_DECLSPEC extern orte_pls_gridengine_component_t mca_pls_gridengine_component;
ORTE_DECLSPEC extern orte_pls_base_module_t orte_pls_gridengine_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* ORTE_PLS_GRIDENGINE_EXPORT_H */

orte/mca/pls/gridengine/pls_gridengine_component.c (new file)

@@ -0,0 +1,150 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
/**
* @file:
* Part of the gridengine launcher.
* See pls_gridengine.h for an overview of how it works.
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/pls/pls.h"
#include "pls_gridengine.h"
#include "opal/util/path.h"
#include "opal/util/argv.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/pls/base/base.h"
#include "opal/util/output.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/ras/base/ras_base_node.h"
/**
* Public string showing the pls ompi_gridengine component version number
*/
const char *mca_pls_gridengine_component_version_string =
"Open MPI gridengine pls MCA component version " ORTE_VERSION;
/**
* Local functions
*/
/**
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
orte_pls_gridengine_component_t mca_pls_gridengine_component = {
{
/* First, the mca_component_t struct containing meta information
about the component itself */
{
/* Indicate that we are a pls v1.0.0 component (which also
implies a specific MCA version) */
ORTE_PLS_BASE_VERSION_1_0_0,
/* Component name and version */
"gridengine",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
orte_pls_gridengine_component_open,
orte_pls_gridengine_component_close
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
},
/* Initialization / querying functions */
orte_pls_gridengine_component_init
}
};
/**
orte_pls_gridengine_component_open - open component and register all parameters
@return error number
*/
int orte_pls_gridengine_component_open(void)
{
mca_base_component_t *c = &mca_pls_gridengine_component.super.pls_version;
mca_base_param_reg_int(c, "debug",
"Enable debugging of gridengine pls component",
false, false, 0, &mca_pls_gridengine_component.debug);
mca_base_param_reg_int(c, "verbose",
"Enable verbose output of the gridengine qrsh -inherit command",
false, false, 0, &mca_pls_gridengine_component.verbose);
mca_base_param_reg_int(c, "priority",
"Priority of the gridengine pls component",
false , false, 100, &mca_pls_gridengine_component.priority);
mca_base_param_reg_string(c, "orted",
"The command name that the gridengine pls component will invoke for the ORTE daemon",
false, false, "orted", &mca_pls_gridengine_component.orted);
return ORTE_SUCCESS;
}
/**
orte_pls_gridengine_component_close - close the component and clean up
@return error number
*/
int orte_pls_gridengine_component_close(void)
{
/* cleanup state */
if (NULL != mca_pls_gridengine_component.orted) {
free(mca_pls_gridengine_component.orted);
}
return ORTE_SUCCESS;
}
/**
orte_pls_gridengine_component_init - initialize component, check if we can run on this machine.
@return error number
*/
orte_pls_base_module_t *orte_pls_gridengine_component_init(int *priority)
{
if (NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") &&
NULL != getenv("PE_HOSTFILE")){
opal_output(orte_pls_base.pls_output,
"pls:gridengine: available for selection");
*priority = mca_pls_gridengine_component.priority;
return &orte_pls_gridengine_module;
}
opal_output(orte_pls_base.pls_output,
"pls:gridengine: NOT available for selection");
return NULL;
}
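
As a usage note (illustrative, not part of this commit): the parameters registered in orte_pls_gridengine_component_open() follow the usual <framework>_<component>_<param> MCA naming, so they can be set from the mpirun command line, for example:

% mpirun -mca pls_gridengine_debug 1 -mca pls_gridengine_verbose 1 \
  -mca pls_gridengine_priority 100 -np 4 ./a.out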

orte/mca/pls/gridengine/pls_gridengine_module.c (new file)

@@ -0,0 +1,860 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
/**
* @file:
* Part of the gridengine launcher.
* See pls_gridengine.h for an overview of how it works.
*/
#include "orte_config.h"
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <string.h>
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <fcntl.h>
#include <signal.h>
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif
#include "opal/install_dirs.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/if.h"
#include "opal/util/path.h"
#include "opal/event/event.h"
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/util/basename.h"
#include "orte/orte_constants.h"
#include "orte/util/univ_info.h"
#include "orte/util/session_dir.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/base/ras_base_node.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/pls/gridengine/pls_gridengine.h"
#include "orte/util/sys_info.h"
extern char **environ;
orte_pls_base_module_1_0_0_t orte_pls_gridengine_module = {
orte_pls_gridengine_launch,
orte_pls_gridengine_terminate_job,
orte_pls_gridengine_terminate_proc,
orte_pls_gridengine_signal_job,
orte_pls_gridengine_signal_proc,
orte_pls_gridengine_finalize
};
/**
* struct used to have enough information to clean up the state of the
* universe if a daemon aborts
*/
struct gridengine_daemon_info_t {
opal_object_t super;
orte_ras_node_t* node;
orte_jobid_t jobid;
};
typedef struct gridengine_daemon_info_t gridengine_daemon_info_t;
static OBJ_CLASS_INSTANCE(gridengine_daemon_info_t,
opal_object_t,
NULL, NULL);
static void set_handler_default(int sig);
static int update_slot_keyval(orte_ras_node_t* node, int* slot_cnt);
/**
* Fill the orted_path variable with the directory to the orted
*/
static int orte_pls_gridengine_fill_orted_path(char** orted_path)
{
struct stat buf;
asprintf(orted_path, "%s/orted", OPAL_BINDIR);
if (0 != stat(*orted_path, &buf)) {
char *path = getenv("PATH");
if (NULL == path) {
path = ("PATH is empty!");
}
opal_show_help("help-pls-gridengine.txt", "no-local-orted",
true, path, OPAL_BINDIR);
return ORTE_ERR_NOT_FOUND;
}
return ORTE_SUCCESS;
}
/**
* Callback on daemon exit.
*/
static void orte_pls_gridengine_wait_daemon(pid_t pid, int status, void* cbdata)
{
gridengine_daemon_info_t *info = (gridengine_daemon_info_t*) cbdata;
opal_list_t map;
opal_list_item_t* item;
int rc;
/* if qrsh exited abnormally, set the child processes to aborted
and print something useful to the user. The usual reasons for
qrsh to exit abnormally all are a pretty good indication that
the child processes aren't going to start up properly.
This should somehow be pushed up to the calling level, but we
don't really have a way to do that just yet.
*/
#ifdef __WINDOWS__
printf("This is not implemented yet for windows\n");
ORTE_ERROR_LOG(ORTE_ERROR);
return;
#else
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
/* get the mapping for our node so we can cancel the right things */
OBJ_CONSTRUCT(&map, opal_list_t);
rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid,
info->jobid,
info->node->node_name,
&map);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set state of all processes associated with the daemon as
terminated */
for(item = opal_list_get_first(&map);
item != opal_list_get_end(&map);
item = opal_list_get_next(item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item;
size_t i;
for (i = 0 ; i < map->num_procs ; ++i) {
/* Clean up the session directory as if we were the
process itself. This covers the case where the
process died abnormally and didn't cleanup its own
session directory. */
orte_session_dir_finalize(&(map->procs[i])->proc_name);
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
ORTE_PROC_STATE_ABORTED, status);
}
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&map);
cleanup:
/* tell the user something went wrong */
opal_output(0, "ERROR: A daemon on node %s failed to start as expected.",
info->node->node_name);
opal_output(0, "ERROR: There may be more information available from");
opal_output(0, "ERROR: the 'qstat -t' command on the Grid Engine tasks.");
opal_output(0, "ERROR: If the problem persists, please restart the");
opal_output(0, "ERROR: Grid Engine PE job");
if (WIFEXITED(status)) {
opal_output(0, "ERROR: The daemon exited unexpectedly with status %d.",
WEXITSTATUS(status));
} else if (WIFSIGNALED(status)) {
#ifdef WCOREDUMP
if (WCOREDUMP(status)) {
opal_output(0, "The daemon received a signal %d (with core).",
WTERMSIG(status));
} else {
opal_output(0, "The daemon received a signal %d.", WTERMSIG(status));
}
#else
opal_output(0, "The daemon received a signal %d.", WTERMSIG(status));
#endif /* WCOREDUMP */
} else {
opal_output(0, "No extra status information is available: %d.", status);
}
}
#endif /* __WINDOWS__ */
/* cleanup */
OBJ_RELEASE(info->node);
OBJ_RELEASE(info);
}
/**
* Launch a daemon (bootproxy) on each node. The daemon will be responsible
* for launching the application.
*/
int orte_pls_gridengine_launch(orte_jobid_t jobid)
{
opal_list_t mapping;
opal_list_item_t* m_item, *n_item;
size_t num_nodes;
orte_vpid_t vpid;
int node_name_index1;
int node_name_index2;
int proc_name_index;
int orted_index;
int call_yield_index;
char *jobid_string;
char *uri, *param;
char **argv;
int argc;
int rc;
sigset_t sigs;
char *lib_base = NULL, *bin_base = NULL;
char *sge_root, *sge_arch;
/* Query the list of nodes allocated and mapped to this job.
* We need the entire mapping for a couple of reasons:
* - need the prefix to start with.
* - need to know if we are launching on a subset of the allocated nodes
* All other mapping responsibilities fall to orted in the fork PLS
*/
OBJ_CONSTRUCT(&mapping, opal_list_t);
rc = orte_rmaps_base_get_map(jobid, &mapping);
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
num_nodes = 0;
for(m_item = opal_list_get_first(&mapping);
m_item != opal_list_get_end(&mapping);
m_item = opal_list_get_next(m_item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)m_item;
num_nodes += opal_list_get_size(&map->nodes);
}
/*
* Allocate a range of vpids for the daemons.
*/
if (num_nodes == 0) {
return ORTE_ERR_BAD_PARAM;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
/* need integer value for command line parameter */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/*
* Build argv array
*/
argv = opal_argv_split("qrsh", ' ');
argc = opal_argv_count(argv);
/* gridengine specific flags */
opal_argv_append(&argc, &argv, "-inherit");/*run tasks within curr job*/
opal_argv_append(&argc, &argv, "-noshell");/*execute w/o wrapping shell*/
opal_argv_append(&argc, &argv, "-nostdin");/*suppress input stream stdin*/
opal_argv_append(&argc, &argv, "-V"); /*task to have the env as job*/
if (mca_pls_gridengine_component.verbose) {
opal_argv_append(&argc, &argv, "-verbose");
}
node_name_index1 = argc;
opal_argv_append(&argc, &argv, "<template>");
/* add the orted daemon command and
* force orted into the same process tree as sge_shepherd, with no daemonize */
orted_index = argc;
opal_argv_append(&argc, &argv, mca_pls_gridengine_component.orted);
opal_argv_append(&argc, &argv, "--no-daemonize");
/* check for debug flags */
orte_pls_base_proxy_mca_argv(&argc, &argv);
opal_argv_append(&argc, &argv, "--bootproxy");
opal_argv_append(&argc, &argv, jobid_string);
opal_argv_append(&argc, &argv, "--name");
proc_name_index = argc;
opal_argv_append(&argc, &argv, "<template>");
/* tell the daemon how many procs are in the daemon's job */
opal_argv_append(&argc, &argv, "--num_procs");
asprintf(&param, "%lu", (unsigned long)(vpid + num_nodes));
opal_argv_append(&argc, &argv, param);
free(param);
/* tell the daemon the starting vpid of the daemon's job */
opal_argv_append(&argc, &argv, "--vpid_start");
opal_argv_append(&argc, &argv, "0");
opal_argv_append(&argc, &argv, "--nodename");
node_name_index2 = argc;
opal_argv_append(&argc, &argv, "<template>");
/* pass along the universe name and location info */
opal_argv_append(&argc, &argv, "--universe");
asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
orte_universe_info.host, orte_universe_info.name);
opal_argv_append(&argc, &argv, param);
free(param);
/* setup ns contact info */
opal_argv_append(&argc, &argv, "--nsreplica");
if (NULL != orte_process_info.ns_replica_uri) {
uri = strdup(orte_process_info.ns_replica_uri);
} else {
uri = orte_rml.get_uri();
}
asprintf(&param, "\"%s\"", uri);
opal_argv_append(&argc, &argv, param);
free(uri);
free(param);
/* setup gpr contact info */
opal_argv_append(&argc, &argv, "--gprreplica");
if (NULL != orte_process_info.gpr_replica_uri) {
uri = strdup(orte_process_info.gpr_replica_uri);
} else {
uri = orte_rml.get_uri();
}
asprintf(&param, "\"%s\"", uri);
opal_argv_append(&argc, &argv, param);
free(uri);
free(param);
opal_argv_append(&argc, &argv, "--mpi-call-yield");
call_yield_index = argc;
opal_argv_append(&argc, &argv, "0");
if (mca_pls_gridengine_component.debug) {
param = opal_argv_join(argv, ' ');
if (NULL != param) {
opal_output(0, "pls:gridengine: final template argv:");
opal_output(0, "pls:gridengine: %s", param);
free(param);
}
}
/* Figure out the basenames for the libdir and bindir. There is a
lengthy comment about this in pls_rsh_module.c explaining all
the rationale for how / why we're doing this. */
lib_base = opal_basename(OPAL_LIBDIR);
bin_base = opal_basename(OPAL_BINDIR);
/*
* Iterate through each of the contexts
*/
for(m_item = opal_list_get_first(&mapping);
m_item != opal_list_get_end(&mapping);
m_item = opal_list_get_next(m_item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)m_item;
char *prefix_dir = map->app->prefix_dir;
/*
* For each of the contexts - iterate through the nodes.
*/
for(n_item = opal_list_get_first(&map->nodes);
n_item != opal_list_get_end(&map->nodes);
n_item = opal_list_get_next(n_item)) {
orte_rmaps_base_node_t* rmaps_node = (orte_rmaps_base_node_t*)n_item;
orte_ras_node_t* ras_node = rmaps_node->node;
orte_process_name_t* name;
pid_t pid;
char *exec_path, *orted_path;
char **exec_argv;
/* already launched on this node */
if(ras_node->node_launched++ != 0) {
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: already launched on this node, %s",
ras_node->node_name);
}
continue;
}
/* query the registry for the remaining gridengine slot count on
* this node, and update the registry for the count for the
* current process launch */
int remain_slot_cnt;
if (ORTE_SUCCESS != (rc =
update_slot_keyval(ras_node, &remain_slot_cnt))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* check for this unlikely scenario; the gridengine ras already
* checks for it, but still provide a safety check here. */
if (remain_slot_cnt < 0) {
opal_show_help("help-pls-gridengine.txt", "insufficient-pe-slot",
true, ras_node->node_name, true);
exit(-1); /* exit instead of return ORTE_ERR_OUT_OF_RESOURCE */
}
/* setup node name */
free(argv[node_name_index1]);
if (NULL != ras_node->node_username &&
0 != strlen (ras_node->node_username)) {
asprintf(&argv[node_name_index1], "%s@%s",
ras_node->node_username, ras_node->node_name);
} else {
argv[node_name_index1] = strdup(ras_node->node_name);
}
free(argv[node_name_index2]);
argv[node_name_index2] = strdup(ras_node->node_name);
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, ras_node->node_cellid, 0, vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
#ifdef __WINDOWS__
printf("Unimplemented feature for windows\n");
return;
#else
/* fork a child to do qrsh */
pid = fork();
#endif
if (pid < 0) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
/* child */
if (pid == 0) {
char* name_string;
char** env;
char* var;
long fd, fdmax = sysconf(_SC_OPEN_MAX);
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: launching on node %s",
ras_node->node_name);
}
/* set the progress engine schedule for this node.
* if node_slots is set to zero, then we default to
* NOT being oversubscribed
*/
if (ras_node->node_slots > 0 &&
opal_list_get_size(&rmaps_node->node_procs) > ras_node->node_slots) {
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: oversubscribed -- setting mpi_yield_when_idle to 1 (%d %d)",
ras_node->node_slots, opal_list_get_size(&rmaps_node->node_procs));
}
free(argv[call_yield_index]);
argv[call_yield_index] = strdup("1");
} else {
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: not oversubscribed -- setting mpi_yield_when_idle to 0");
}
free(argv[call_yield_index]);
argv[call_yield_index] = strdup("0");
}
/* setting exec_argv and exec_path for qrsh */
exec_argv = &argv[0];
sge_root = getenv("SGE_ROOT");
sge_arch = getenv("ARC");
asprintf(&exec_path, "%s/bin/%s/qrsh", sge_root, sge_arch);
exec_path = opal_path_findv(exec_path, X_OK, environ, NULL);
if (NULL == exec_path) {
opal_show_help("help-pls-gridengine.txt", "bad-qrsh-path",
true, exec_path, sge_root, sge_arch);
return ORTE_ERR_NOT_FOUND;
}
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: exec_argv[0]=%s, exec_path=%s",
exec_argv[0], exec_path);
}
/* setting orted_path for orted */
orted_path = opal_path_findv(exec_argv[orted_index], 0, environ, NULL);
if (NULL == orted_path && NULL == prefix_dir) {
rc = orte_pls_gridengine_fill_orted_path(&orted_path);
if (ORTE_SUCCESS != rc) {
return rc;
}
} else {
if (NULL != prefix_dir) {
asprintf(&argv[orted_index], "%s/%s/orted",
prefix_dir, bin_base);
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: orted path=%s\n",
argv[orted_index]);
}
}
/* If we have not yet filled in the orted_path, do so now */
if (NULL == orted_path) {
rc = orte_pls_gridengine_fill_orted_path(&orted_path);
if (ORTE_SUCCESS != rc) {
return rc;
}
}
}
/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables. We're
already in the child process, so it's ok to modify
environ. */
if (NULL != prefix_dir) {
char *oldenv, *newenv;
/* Reset PATH */
oldenv = getenv("PATH");
if (NULL != oldenv) {
asprintf(&newenv, "%s/%s:%s", prefix_dir,
bin_base, oldenv);
} else {
asprintf(&newenv, "%s/%s", prefix_dir, bin_base);
}
opal_setenv("PATH", newenv, true, &environ);
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: reset PATH: %s", newenv);
}
free(newenv);
/* Reset LD_LIBRARY_PATH */
oldenv = getenv("LD_LIBRARY_PATH");
if (NULL != oldenv) {
asprintf(&newenv, "%s/%s:%s", prefix_dir,
lib_base, oldenv);
} else {
asprintf(&newenv, "%s/%s", prefix_dir, lib_base);
}
opal_setenv("LD_LIBRARY_PATH", newenv, true, &environ);
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: reset LD_LIBRARY_PATH: %s",
newenv);
}
free(newenv);
}
var = getenv("HOME");
if (NULL != var) {
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: changing to directory %s",
var);
}
/* Ignore errors -- what are we going to do?
(and we ignore errors on the remote nodes
in the fork pls, so this is consistent) */
chdir(var);
}
/* setup process name */
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:gridengine: unable to create process name");
exit(-1);
}
free(argv[proc_name_index]);
argv[proc_name_index] = strdup(name_string);
if (!mca_pls_gridengine_component.debug) {
/* setup stdin */
int fd = open("/dev/null", O_RDWR);
dup2(fd, 0);
close(fd);
}
/* close all file descriptors w/ exception of stdin/stdout/stderr */
for(fd=3; fd<fdmax; fd++)
close(fd);
/* Set signal handlers back to the default. Do this close
to the execve() because the event library may (and likely
will) reset them. If we don't do this, the event
library may have left some set that, at least on some
OS's, don't get reset via fork() or exec(). Hence, the
orted could be unkillable (for example). */
set_handler_default(SIGTERM);
set_handler_default(SIGINT);
#ifndef __WINDOWS__
set_handler_default(SIGHUP);
set_handler_default(SIGPIPE);
#endif
set_handler_default(SIGCHLD);
/* Unblock all signals, for many of the same reasons that
we set the default handlers, above. This is noticeable
on Linux where the event library blocks SIGTERM, but we
don't want that blocked by the orted (or, more
specifically, we don't want it to be blocked by the
orted and then inherited by the ORTE processes that it
forks, making them unkillable by SIGTERM). */
#ifndef __WINDOWS__
sigprocmask(0, 0, &sigs);
sigprocmask(SIG_UNBLOCK, &sigs, 0);
#endif
/* setup environment */
env = opal_argv_copy(environ);
var = mca_base_param_environ_variable("seed",NULL,NULL);
opal_setenv(var, "0", true, &env);
/* exec the daemon */
if (mca_pls_gridengine_component.debug) {
param = opal_argv_join(exec_argv, ' ');
if (NULL != param) {
opal_output(0, "pls:gridengine: executing: %s", param);
free(param);
}
}
execve(exec_path, exec_argv, env);
opal_output(0, "pls:gridengine: execve failed with errno=%d\n", errno);
exit(-1);
} else { /* parent */
gridengine_daemon_info_t *daemon_info;
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: parent");
}
/* save the daemons name on the node */
if (ORTE_SUCCESS != (rc = orte_pls_base_proxy_set_node_name(ras_node,jobid,name))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup callback on sigchild - wait until setup above is complete
* as the callback can occur in the call to orte_wait_cb
*/
daemon_info = OBJ_NEW(gridengine_daemon_info_t);
OBJ_RETAIN(ras_node);
daemon_info->node = ras_node;
daemon_info->jobid = jobid;
orte_wait_cb(pid, orte_pls_gridengine_wait_daemon, daemon_info);
vpid++;
}
free(name);
}
}
cleanup:
while (NULL != (m_item = opal_list_remove_first(&mapping))) {
OBJ_RELEASE(m_item);
}
OBJ_DESTRUCT(&mapping);
if (NULL != lib_base) {
free(lib_base);
}
if (NULL != bin_base) {
free(bin_base);
}
free(jobid_string); /* done with this variable */
opal_argv_free(argv);
return rc;
}
/**
* Query the registry for the gridengine slot count, and update it
*/
static int update_slot_keyval(orte_ras_node_t* ras_node, int* slot_cnt)
{
int rc, *iptr, ivalue;
size_t num_tokens, i, get_cnt;
orte_gpr_value_t** get_values;
char **tokens;
char *get_keys[] = {"orte-gridengine-slot-cnt", NULL};
/* get token */
if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&tokens,
&num_tokens, ras_node->node_cellid, ras_node->node_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* setup condition/filter for query - return only processes that
* are assigned to the specified node name
*/
orte_gpr_keyval_t *condition;
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&condition, ORTE_NODE_NAME_KEY, ORTE_STRING, (void*)ras_node->node_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_gpr.get_conditional(
ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
ORTE_NODE_SEGMENT,
tokens,
get_keys,
1,
&condition,
&get_cnt,
&get_values);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* parse the response */
for(i=0; i<get_cnt; i++) {
orte_gpr_value_t* value = get_values[i];
size_t k;
/* looking in each GPR container for the keyval */
for(k=0; k < value->cnt; k++) {
orte_gpr_keyval_t* keyval = value->keyvals[k];
if(strcmp(keyval->key, "orte-gridengine-slot-cnt") == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get(
(void**)&iptr, keyval->value, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
continue;
}
*slot_cnt = *iptr;
free(iptr);
if (mca_pls_gridengine_component.debug) {
opal_output(0, "pls:gridengine: %s: registry shows PE slots=%d",
ras_node->node_name, *slot_cnt);
}
(*slot_cnt)--; /* account for the current launch */
if (mca_pls_gridengine_component.debug) {
opal_output(0,"pls:gridengine: %s: decrementing, PE slots=%d",
ras_node->node_name, *slot_cnt);
}
orte_data_value_t *put_value;
put_value = OBJ_NEW(orte_data_value_t);
if (NULL == put_value) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
ivalue = *slot_cnt;
put_value->type = ORTE_INT;
put_value->data = &ivalue;
/* put the keyvalue in the segment */
if (ORTE_SUCCESS != (rc = orte_gpr.put_1(
ORTE_GPR_OVERWRITE|ORTE_GPR_TOKENS_XAND,
ORTE_NODE_SEGMENT,
tokens,
"orte-gridengine-slot-cnt",
put_value
))) {
ORTE_ERROR_LOG(rc);
}
continue;
}
}
}
cleanup:
for(i=1; i<get_cnt; i++)
OBJ_RELEASE(get_values[i]);
if (NULL != get_values) free(get_values);
opal_argv_free(tokens);
return rc;
}
/**
* Terminate all processes associated with this job
*/
int orte_pls_gridengine_terminate_job(orte_jobid_t jobid)
{
return orte_pls_base_proxy_terminate_job(jobid);
}
int orte_pls_gridengine_terminate_proc(const orte_process_name_t* proc)
{
return orte_pls_base_proxy_terminate_proc(proc);
}
/**
* Signal all processes associated with this job
*/
int orte_pls_gridengine_signal_job(orte_jobid_t jobid, int32_t signal)
{
return orte_pls_base_proxy_signal_job(jobid, signal);
}
/**
* Signal a specific process.
*/
int orte_pls_gridengine_signal_proc(const orte_process_name_t* proc, int32_t signal)
{
return orte_pls_base_proxy_signal_proc(proc, signal);
}
/**
* Finalize
*/
int orte_pls_gridengine_finalize(void)
{
/* cleanup any pending recvs */
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR_CLNT);
return ORTE_SUCCESS;
}
/**
* Set signal handler
*/
static void set_handler_default(int sig)
{
#ifndef __WINDOWS__
struct sigaction act;
act.sa_handler = SIG_DFL;
act.sa_flags = 0;
sigemptyset(&act.sa_mask);
sigaction(sig, &act, (struct sigaction *)0);
#endif
}
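
To summarize the argv assembly in orte_pls_gridengine_launch() above: the command handed to execve() for each remote node looks roughly like the following sketch (angle-bracket placeholders are filled in at run time; -verbose is added only when pls_gridengine_verbose is set, and any forwarded MCA arguments are inserted after --no-daemonize):

$SGE_ROOT/bin/$ARC/qrsh -inherit -noshell -nostdin -V <hostname> \
  orted --no-daemonize --bootproxy <jobid> --name <daemon process name> \
  --num_procs <vpid + num_nodes> --vpid_start 0 --nodename <hostname> \
  --universe <uid>@<host>:<universe name> --nsreplica "<ns uri>" \
  --gprreplica "<gpr uri>" --mpi-call-yield <0|1>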

orte/mca/ras/gridengine/Makefile.am (new file)

@@ -0,0 +1,50 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-ras-gridengine.txt
sources = \
ras_gridengine.h \
ras_gridengine_component.c \
ras_gridengine_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_ras_gridengine_DSO
component_noinst =
component_install = mca_ras_gridengine.la
else
component_noinst = libmca_ras_gridengine.la
component_install =
endif
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_ras_gridengine_la_SOURCES = $(sources)
mca_ras_gridengine_la_LDFLAGS = -module -avoid-version
mca_ras_gridengine_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
noinst_LTLIBRARIES = $(component_noinst)
libmca_ras_gridengine_la_SOURCES =$(sources)
libmca_ras_gridengine_la_LDFLAGS = -module -avoid-version

orte/mca/ras/gridengine/configure.m4 (new file)

@@ -0,0 +1,26 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ras_gridengine_CONFIG([action-if-found], [action-if-not-found])
# --------------------------------------------------------------------
AC_DEFUN([MCA_ras_gridengine_CONFIG],[
AC_CHECK_FUNC([fork], [$1], [$2])
])dnl

orte/mca/ras/gridengine/configure.params (new file)

@@ -0,0 +1,26 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_INIT_FILE=ras_gridengine.c
PARAM_CONFIG_HEADER_FILE="ras_gridengine.h"
PARAM_CONFIG_FILES="Makefile"

orte/mca/ras/gridengine/help-ras-gridengine.txt (new file)

@@ -0,0 +1,39 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[empty-nodelist-error]
There are not enough slots available in the Grid Engine
Parallel Environment (PE) to satisfy the slot requirement needed to
launch ORTE daemons to the remote hosts, as requested by the application.
Either request fewer slots for the application (a smaller -np value),
or make more PE slots available for use.
[cannot-read-pe-hostfile]
The Grid Engine ras component is not able to read the $PE_HOSTFILE for
the Grid Engine nodes. The $PE_HOSTFILE environment variable shows the
file is located at:
%s
The following error is returned:
%s

orte/mca/ras/gridengine/ras_gridengine.h (new file)

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* Resource allocation for Grid Engine
*/
#ifndef ORTE_RAS_GRIDENGINE_H
#define ORTE_RAS_GRIDENGINE_H
#include "orte/mca/ras/ras.h"
#include "orte/mca/ras/base/base.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
* RAS Component
*/
struct orte_ras_gridengine_component_t {
orte_ras_base_component_t super;
int debug;
int verbose;
int priority;
};
typedef struct orte_ras_gridengine_component_t orte_ras_gridengine_component_t;
OMPI_COMP_EXPORT extern orte_ras_gridengine_component_t mca_ras_gridengine_component;
OMPI_COMP_EXPORT extern orte_ras_base_module_t orte_ras_gridengine_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

orte/mca/ras/gridengine/ras_gridengine_component.c (new file)

@@ -0,0 +1,116 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* Resource allocation for Grid Engine
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "ras_gridengine.h"
/*
* Local functions
*/
static int orte_ras_gridengine_open(void);
static int orte_ras_gridengine_close(void);
static orte_ras_base_module_t* orte_ras_gridengine_init(int* priority);
orte_ras_gridengine_component_t mca_ras_gridengine_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
/* Indicate that we are a ras v1.0.0 component (which also
implies a specific MCA version) */
ORTE_RAS_BASE_VERSION_1_0_0,
"gridengine", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_ras_gridengine_open, /* component open */
orte_ras_gridengine_close /* component close */
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
},
orte_ras_gridengine_init
}
};
/**
* component open/close/init function
*/
static int orte_ras_gridengine_open(void)
{
int value;
mca_base_component_t *c = &mca_ras_gridengine_component.super.ras_version;
mca_base_param_reg_int(c, "debug",
"Enable debugging output for the gridengine ras component",
false, false, 0, &mca_ras_gridengine_component.debug);
mca_base_param_reg_int(c, "priority",
"Priority of the gridengine ras component",
false , false, 100, &mca_ras_gridengine_component.priority);
mca_base_param_reg_int(c, "verbose",
"Enable verbose output for the gridengine ras component",
false, false, 0, &value);
if (value != 0) {
mca_ras_gridengine_component.verbose = opal_output_open(NULL);
} else {
mca_ras_gridengine_component.verbose = -1;
}
return ORTE_SUCCESS;
}
static orte_ras_base_module_t *orte_ras_gridengine_init(int* priority)
{
*priority = mca_ras_gridengine_component.priority;
if (NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") &&
NULL != getenv("PE_HOSTFILE")) {
opal_output(orte_ras_base.ras_output,
"ras:gridengine: available for selection");
return &orte_ras_gridengine_module;
}
opal_output(orte_ras_base.ras_output,
"ras:gridengine: NOT available for selection");
return NULL;
}
/**
* Close all subsystems.
*/
static int orte_ras_gridengine_close(void)
{
return ORTE_SUCCESS;
}

orte/mca/ras/gridengine/ras_gridengine_module.c (new file)

@@ -0,0 +1,460 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
* Resource Allocation for Grid Engine
*/
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <sys/systeminfo.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/ras/gridengine/ras_gridengine.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/ras/base/ras_base_node.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
/*
* Local functions
*/
static int orte_ras_gridengine_allocate(orte_jobid_t jobid);
static int orte_ras_gridengine_discover(opal_list_t* nodelist,
orte_app_context_t** context, size_t num_context);
static int orte_ras_gridengine_node_insert(opal_list_t* nodes);
static int orte_ras_gridengine_node_query(opal_list_t* nodes);
static int orte_ras_gridengine_deallocate(orte_jobid_t jobid);
static int orte_ras_gridengine_finalize(void);
static int get_slot_count(char* node_name, int* slot_cnt);
static int put_slot_keyval(orte_ras_node_t* node, int slot_cnt);
static int get_slot_keyval(orte_ras_node_t* node, int* slot_cnt);
/*
* Global variable
*/
orte_ras_base_module_t orte_ras_gridengine_module = {
orte_ras_gridengine_allocate,
orte_ras_gridengine_node_insert,
orte_ras_gridengine_node_query,
orte_ras_gridengine_deallocate,
orte_ras_gridengine_finalize
};
/**
* Discover available (pre-allocated) nodes. Allocate the
* requested number of nodes/process slots to the job.
*
*/
static int orte_ras_gridengine_allocate(orte_jobid_t jobid)
{
opal_list_t nodes;
opal_list_item_t* item;
int rc;
orte_app_context_t **context = NULL;
size_t i, num_context;
/* get the context */
rc = orte_rmgr_base_get_app_context(jobid, &context, &num_context);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* construct a node object and pass to discover to gather valid nodes */
OBJ_CONSTRUCT(&nodes, opal_list_t);
if(ORTE_SUCCESS != (rc =
orte_ras_gridengine_discover(&nodes, context, num_context))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* call the base allocator to allocate the nodes to the jobid */
if(ORTE_SUCCESS != (rc = orte_ras_base_allocate_nodes(jobid, &nodes))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (mca_ras_gridengine_component.debug) {
opal_output(0, "ras:gridengine: dumping the orte node segment");
orte_gpr.dump_segment(ORTE_NODE_SEGMENT);
}
cleanup:
while(NULL != (item = opal_list_remove_first(&nodes))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&nodes);
for(i=0; i<num_context; i++) {
OBJ_RELEASE(context[i]);
}
if (NULL != context) {
free(context);
}
return rc;
}
/**
* Discover the available resources.
* - validate any nodes specified via hostfile/commandline
* - check for additional nodes that have already been allocated
*/
static int orte_ras_gridengine_discover(opal_list_t* nodelist,
orte_app_context_t** context, size_t num_context)
{
char *pe_hostfile = getenv("PE_HOSTFILE");
char buf[1024];
int rc, gridengine_slot_cnt;
opal_list_item_t* item;
opal_list_t new_nodes;
FILE *fp;
/* query the nodelist from the registry */
if(ORTE_SUCCESS != (rc = orte_ras_gridengine_node_query(nodelist))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* check the PE_HOSTFILE before continuing on */
if (!(fp = fopen(pe_hostfile, "r"))) {
opal_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
true, pe_hostfile, strerror(errno));
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* parse the pe_hostfile for hostname, slots, etc., then compare the
* current node with the list of hosts in the nodelist; if the current
* node is not found in the nodelist, add it */
OBJ_CONSTRUCT(&new_nodes, opal_list_t);
while (fgets(buf, sizeof(buf), fp)) {
char *tok, *ptr = strtok_r(buf, " \n", &tok);
char *num = strtok_r(NULL, " \n", &tok);
char *queue = strtok_r(NULL, " \n", &tok);
char *arch = strtok_r(NULL, " \n", &tok);
orte_ras_node_t *node;
/* is this node already in the list */
for(item = opal_list_get_first(nodelist);
item != opal_list_get_end(nodelist);
item = opal_list_get_next(item)) {
node = (orte_ras_node_t*)item;
if(strcmp(node->node_name, ptr) == 0) {
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: %s: node already in nodelist", node->node_name);
break; /* break so that the current 'item' is in nodelist */
}
}
/* If the current 'item' is already in the nodelist, then continue
* with the while loop to check next node in the PE_HOSTFILE. */
if(item != opal_list_get_end(nodelist)) {
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: checking next node in pe_hostfile");
continue;
}
/* otherwise, it's a new node. Then create a new node entry */
node = OBJ_NEW(orte_ras_node_t);
if (NULL == node) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
node->node_name = strdup(ptr);
node->node_arch = strdup(arch);
node->node_state = ORTE_NODE_STATE_UP;
node->node_cellid = 0;
node->node_slots_inuse = 0;
node->node_slots_max = 0;
if (ORTE_SUCCESS != (rc =
get_slot_count(node->node_name, &gridengine_slot_cnt))) {
ORTE_ERROR_LOG(rc);
return rc;
}
node->node_slots = gridengine_slot_cnt;
opal_list_append(&new_nodes, &node->super);
/* put the gridengine slot into the gpr to use later */
if (ORTE_SUCCESS != (rc = put_slot_keyval(node, gridengine_slot_cnt))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
} /* finished reading the $PE_HOSTFILE */
/* adding new / undiscovered nodes to the registry */
if(opal_list_get_size(&new_nodes)) {
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: adding new nodes to the registry");
rc = orte_ras_gridengine_node_insert(&new_nodes);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
}
/* append new_nodes to the nodelist */
while(NULL != (item = opal_list_remove_first(&new_nodes)))
opal_list_append(nodelist, item);
/* get the registry key of the remaining gridengine slot count for
* each node. The keyval for each node tells us how many gridengine
* launches are left for that node in the nodelist */
for(item = opal_list_get_first(nodelist);
item != opal_list_get_end(nodelist);
item = opal_list_get_next(item)) {
orte_ras_node_t *node = (orte_ras_node_t*)item;
int remain_slot_cnt;
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: %s: checking gpr key", node->node_name);
if (ORTE_SUCCESS != (rc = get_slot_keyval(node, &remain_slot_cnt))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: %s: remaining PE slots=%d",
node->node_name, remain_slot_cnt);
/* if the remaining gridengine slot count reaches 0 for this node,
* then remove this node from the nodelist */
if (remain_slot_cnt == 0) {
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: %s: used up all PE slots, removing node",
node->node_name);
opal_list_remove_item(nodelist,item);
OBJ_DESTRUCT(item);
}
}
/* If there are no more nodes available in the nodelist, then quit this job;
* otherwise, another RAS (like localhost) might allocate the resource and
* use the gridengine PLS to do the process launching with qrsh,
* which would eventually fail in the gridengine PLS. */
if(opal_list_get_size(nodelist) == 0) {
opal_show_help("help-ras-gridengine.txt", "empty-nodelist-error", true);
rc = ORTE_ERR_NOT_AVAILABLE;
goto cleanup;
}
cleanup:
OBJ_DESTRUCT(&new_nodes);
return rc;
}
/**
* Use this function to set the initial gridengine slot count for the given node
* to the registry.
*/
static int put_slot_keyval(orte_ras_node_t* node, int slot_cnt)
{
/* put the slot count for this node into the registry */
orte_data_value_t *put_value;
int rc, ivalue;
size_t num_tokens;
char **tokens;
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: %s: putting PE slots=%d",
node->node_name, slot_cnt);
put_value = OBJ_NEW(orte_data_value_t);
if (NULL == put_value) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
ivalue = slot_cnt;
put_value->type = ORTE_INT;
put_value->data = &ivalue;
/* get token */
if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&tokens,
&num_tokens, node->node_cellid, node->node_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* put the keyval in the segment */
if (ORTE_SUCCESS != (rc = orte_gpr.put_1(
ORTE_GPR_OVERWRITE|ORTE_GPR_TOKENS_XAND,
ORTE_NODE_SEGMENT,
tokens,
"orte-gridengine-slot-cnt",
put_value
))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
return rc;
}
/**
* Use this function to get the remaining gridengine slot count for the given
* node. This will query the registry for the slot count by providing a
* key and set the remaining slot count as a result.
*/
static int get_slot_keyval(orte_ras_node_t* node, int* slot_cnt) {
char **tokens;
size_t num_tokens, i, get_cnt=0;
int rc, *iptr;
orte_gpr_keyval_t *condition;
orte_gpr_value_t** get_values;
char *get_keys[] = {"orte-gridengine-slot-cnt", NULL};
/* get token */
if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&tokens,
&num_tokens, node->node_cellid, node->node_name))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup condition/filter for query - return only processes that
* are assigned to the specified node name */
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&condition,
ORTE_NODE_NAME_KEY, ORTE_STRING, (void*)node->node_name))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* get the keyvalue from the node segment */
if(ORTE_SUCCESS != (rc = orte_gpr.get_conditional(
ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
ORTE_NODE_SEGMENT,
tokens,
get_keys,
1,
&condition,
&get_cnt,
&get_values))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* parse the response */
for(i=0; i<get_cnt; i++) {
orte_gpr_value_t* value = get_values[i];
size_t k;
/* looking in each GPR container for keyvals */
for(k=0; k < value->cnt; k++) {
orte_gpr_keyval_t* keyval = value->keyvals[k];
if(strcmp(keyval->key, "orte-gridengine-slot-cnt") == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get(
(void**)&iptr, keyval->value, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
continue;
}
*slot_cnt = *iptr;
free(iptr);
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: %s: registry shows PE slots=%d",
node->node_name, *slot_cnt);
continue;
}
}
}
cleanup:
for(i=1; i<get_cnt; i++)
OBJ_RELEASE(get_values[i]);
if (NULL != get_values) free(get_values);
opal_argv_free(tokens);
return rc;
}
/**
* Parse the PE_HOSTFILE to determine the number of process
* slots/processors available on the node.
*/
static int get_slot_count(char* node_name, int* slot_cnt)
{
char buf[1024];
char *pe_hostfile = getenv("PE_HOSTFILE");
FILE *fp;
/* check the PE_HOSTFILE before continuing on */
if (!(fp = fopen(pe_hostfile, "r"))) {
opal_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
true, pe_hostfile, strerror(errno));
ORTE_ERROR_LOG(ORTE_ERROR);
return(ORTE_ERROR);
}
while (fgets(buf, sizeof(buf), fp)) {
char *tok, *name = strtok_r(buf, " \n", &tok);
char *num = strtok_r(NULL, " \n", &tok);
char *queue = strtok_r(NULL, " \n", &tok);
char *arch = strtok_r(NULL, " \n", &tok);
if(strcmp(node_name,name) == 0) {
*slot_cnt = (int) strtol(num, (char **)NULL, 10);
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine: %s: PE_HOSTFILE shows slots=%d",
node_name, *slot_cnt);
return ORTE_SUCCESS;
}
}
/* when there is no match */
return ORTE_ERROR;
}
/**
* call the base class to insert nodes
*/
static int orte_ras_gridengine_node_insert(opal_list_t *nodes)
{
return orte_ras_base_node_insert(nodes);
}
/**
* call the base class to query nodes
*/
static int orte_ras_gridengine_node_query(opal_list_t *nodes)
{
return orte_ras_base_node_query(nodes);
}
/**
* call the base class to deallocate nodes
*/
static int orte_ras_gridengine_deallocate(orte_jobid_t jobid)
{
/* Nothing to do */
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine:deallocate: success (nothing to do)");
return ORTE_SUCCESS;
}
/**
* finalize
*/
static int orte_ras_gridengine_finalize(void)
{
/* Nothing to do */
opal_output(mca_ras_gridengine_component.verbose,
"ras:gridengine:finalize: success (nothing to do)");
return ORTE_SUCCESS;
}
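
For reference (illustrative, not part of this commit): the $PE_HOSTFILE parsed by orte_ras_gridengine_discover() and get_slot_count() above has one line per allocated host, giving the hostname, the slot count, the queue, and a fourth field that this code stores as the node architecture. A made-up two-host allocation could look like:

host-4 2 all.q@host-4 <arch>
host-5 2 all.q@host-5 <arch>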