1
1
This commit was SVN r26244.
Этот коммит содержится в:
Ralph Castain 2012-04-06 15:31:13 +00:00
родитель abf60337de
Коммит ed197acaa2
8 изменённых файлов: 0 добавлений и 1090 удалений

Просмотреть файл

Просмотреть файл

@ -1,60 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Use the top-level Makefile.options
# Add $(top_ompi_builddir)/include to the preprocessor include path.
AM_CPPFLAGS = -I$(top_ompi_builddir)/include
# Objective-C compiler flags; plm_xgrid_OBJCFLAGS is AC_SUBST'ed by this
# component's configure.m4.
AM_OBJCFLAGS = $(plm_xgrid_OBJCFLAGS)
# Automake and Libtool don't completely speak Objective C. Since the
# only Objective C we'll be using is GCC on Mac OS X, we can pretend
# to be C instead of ObjC for libtool and it works well enough. If CC
# and OBJC aren't the same, Libtool doesn't automatically infer that
# we're using C and Automake doesn't add the --tag, so we need to
# explicitly pass the --tag=CC flag to libtool.
AM_LIBTOOLFLAGS = --tag=CC
# Source list shared by both the DSO and the static variant of the component.
xgrid_sources = \
src/plm_xgrid.h \
src/plm_xgrid_component.m \
src/plm_xgrid_module.m \
src/plm_xgrid_client.h \
src/plm_xgrid_client.m
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# selected by the MCA_BUILD_orte_plm_xgrid_DSO conditional.
if MCA_BUILD_orte_plm_xgrid_DSO
component_noinst =
component_install = mca_plm_xgrid.la
else
component_noinst = libmca_plm_xgrid.la
component_install =
endif
# DSO variant: installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_plm_xgrid_la_SOURCES = $(xgrid_sources)
mca_plm_xgrid_la_LDFLAGS = -module -avoid-version $(plm_xgrid_LDFLAGS)
# Static variant: built but not installed (noinst).
noinst_LTLIBRARIES = $(component_noinst)
libmca_plm_xgrid_la_SOURCES = $(xgrid_sources)
libmca_plm_xgrid_la_LIBADD =
libmca_plm_xgrid_la_LDFLAGS = -module -avoid-version $(plm_xgrid_LDFLAGS)

Просмотреть файл

@ -1,44 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_plm_xgrid_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_plm_xgrid_CONFIG],[
    AC_CONFIG_FILES([orte/mca/plm/xgrid/Makefile])

    # Probe for XGrid; records the outcome in plm_xgrid_good (1 = usable).
    ORTE_CHECK_XGRID([plm_xgrid], [plm_xgrid_good=1], [plm_xgrid_good=0])

    # Reject the component when EITHER the XGrid probe failed OR the build
    # was configured without full run-time support.  The previous test
    # only rejected when the probe failed AND full support was enabled, so
    # a failed probe combined with orte_without_full_support=1 fell through
    # to the build branch.  Two separate test invocations are used because
    # the -a operator of test is not portable.
    #
    # For very dumb reasons involving linking, it's near impossible
    # to build the XGrid components as static libraries. Disable if that's
    # the case.
    AS_IF([test "$plm_xgrid_good" != "1" || test "$orte_without_full_support" != "0"],
          [$2],
          [AS_IF([test "$compile_mode" = "dso"],
                 [ # plm_xgrid_LDFLAGS will be set by ORTE_CHECK_XGRID
                  plm_xgrid_WRAPPER_EXTRA_LDFLAGS="$plm_xgrid_LDFLAGS"
                  $1],
                 [AC_MSG_WARN([XGrid components must be built as DSOs. Disabling])
                  $2])])

    # set build flags to use in makefile
    AC_SUBST([plm_xgrid_OBJCFLAGS])
    AC_SUBST([plm_xgrid_LDFLAGS])
])dnl

Просмотреть файл

@ -1,39 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* NOTE: This header is an Objective-C file. It might not do what
* you intend with a C/C++ compiler
*/
#import "orte/mca/plm/plm.h"
#import "plm_xgrid_client.h"
/**
* PLM Component
*
* Extends the base PLM component descriptor with the Objective-C state
* this component keeps for its lifetime. Both pointers are assigned in
* orte_plm_xgrid_component_query() and released in
* orte_plm_xgrid_finalize().
*/
struct orte_plm_xgrid_component_t {
/* base component descriptor; kept as the first member */
orte_plm_base_component_t super;
/* client object that talks to the XGrid controller */
PlmXGridClient *client;
/* autorelease pool created when the component is queried */
NSAutoreleasePool *pool;
};
typedef struct orte_plm_xgrid_component_t orte_plm_xgrid_component_t;
/* The single component instance (defined in plm_xgrid_component.m). */
extern orte_plm_xgrid_component_t mca_plm_xgrid_component;
/* The module function table (defined in plm_xgrid_module.m). */
extern orte_plm_base_module_1_0_0_t orte_plm_xgrid_module;
/* Progress callback: runs the current NSRunLoop briefly. */
int orte_plm_xgrid_progress(void);
/* Module teardown. Declared here because plm_xgrid_component.m calls it
 * on a failed connection attempt; without this prototype that call is an
 * implicit function declaration. */
int orte_plm_xgrid_finalize(void);

Просмотреть файл

@ -1,76 +0,0 @@
/* -*- ObjC -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#import <Foundation/Foundation.h>
#import <XgridFoundation/XgridFoundation.h>
#import <Foundation/NSString.h>
#import "opal/threads/condition.h"
/**
 * Client that connects to an XGrid controller and submits, stops and
 * deletes the XGrid jobs used to start ORTE daemons.
 *
 * NOTE(review): orte_job_t is used below but no header imported here
 * visibly declares it; importers presumably bring it in first - confirm.
 */
@interface PlmXGridClient : NSObject
{
    /* name of the daemon executable to launch */
    NSString *orted;
    /* controller contact information; a nil password selects Kerberos */
    NSString *controller_hostname;
    NSString *controller_password;

    /* state of the world... */
    opal_condition_t state_cond;
    opal_mutex_t state_mutex;

    XGConnection *connection;
    XGController *controller;
    XGGrid *grid;
    /* non-zero: delete submitted jobs from the controller in -dealloc */
    int cleanup;

    /* maps the ORTE jobid (as a string) to the XGrid job identifier */
    NSMutableDictionary *active_xgrid_jobs;
}

/* init / finalize */
-(id) init;
-(id) initWithControllerHostname: (char*) hostname
           AndControllerPassword: (char*) password
                        AndOrted: (char*) ortedname
                      AndCleanup: (int) val;
-(void) dealloc;

/* accessors */
-(NSString*) getOrted;
-(void) setOrtedAsCString: (char*) name;
-(void) setControllerPasswordAsCString: (char*) password;
-(void) setControllerHostnameAsCString: (char*) hostname;
-(void) setCleanUp: (int) val;
-(NSString*)servicePrincipal;

/* interface for launch */
-(int) connect;
-(int) launchOrteds:(orte_job_t*) jdata;
-(int) terminateOrteds;

/* delegate for changes */
-(void) connectionDidOpen:(XGConnection*) connection;
-(void) connectionDidNotOpen:(XGConnection*)connection withError:(NSError*) error;
-(void) connectionDidClose:(XGConnection *) connection;

/* Helper function */
-(NSMutableArray*) getArgumentsForOrtedLaunch;
@end

Просмотреть файл

@ -1,454 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#import "orte_config.h"
#import <stdio.h>
#import "opal/util/path.h"
#import "orte/constants.h"
#import "orte/mca/rml/rml.h"
#import "orte/mca/plm/base/base.h"
#import "orte/mca/plm/base/plm_private.h"
#import "orte/mca/plm/plm.h"
#import "orte/mca/errmgr/errmgr.h"
#import "orte/mca/ras/ras_types.h"
#import "orte/mca/rmaps/rmaps.h"
#import "plm_xgrid_client.h"
@implementation PlmXGridClient
/* init / finalize */
/* Plain initializer: forwards to the full initializer with no hostname,
 * password or daemon name, and with job cleanup enabled (1). */
-(id) init
{
return [self initWithControllerHostname: NULL
AndControllerPassword: NULL
AndOrted: NULL
AndCleanup: 1];
}
/* Full initializer. Each C-string argument may be NULL, in which case the
 * matching instance variable is left untouched. `val` is stored as the
 * cleanup flag consulted by -dealloc.
 *
 * NOTE(review): the strings and the dictionary come from convenience
 * constructors and are not retained here - confirm that the enclosing
 * autorelease pool outlives this object. */
-(id) initWithControllerHostname: (char*) hostname
           AndControllerPassword: (char*) password
                        AndOrted: (char*) ortedname
                      AndCleanup: (int) val
{
    self = [super init];
    if (!self) {
        return self;
    }

    /* synchronisation objects used while waiting for the connection */
    OBJ_CONSTRUCT(&state_cond, opal_condition_t);
    OBJ_CONSTRUCT(&state_mutex, opal_mutex_t);

    if (password != NULL) {
        controller_password = [NSString stringWithUTF8String: password];
    }
    if (hostname != NULL) {
        controller_hostname = [NSString stringWithUTF8String: hostname];
    }

    cleanup = val;

    if (ortedname != NULL) {
        orted = [NSString stringWithUTF8String: ortedname];
    }

    /* starts empty; filled by -launchOrteds: */
    active_xgrid_jobs = [NSMutableDictionary dictionary];

    return self;
}
/* Tears the client down. When the cleanup flag is set, every job recorded
 * in active_xgrid_jobs is deleted from the controller first, polling
 * opal_progress() until each delete action reports an outcome. */
-(void) dealloc
{
    /* if supposed to clean up jobs, do so */
    if (cleanup) {
        NSEnumerator *enumerator = [[active_xgrid_jobs allKeys] objectEnumerator];
        NSString *key;

        while (nil != (key = [enumerator nextObject])) {
            XGJob *job = [grid jobForIdentifier: [active_xgrid_jobs objectForKey: key]];
            XGActionMonitor *actionMonitor = [job performDeleteAction];

            /* A nil job or monitor answers 0 to -outcome. That presumably
               equals XGActionMonitorOutcomeNone, in which case the wait
               loop below would never end, so skip the entry instead. */
            if (nil == actionMonitor) {
                fprintf(stderr, "orte:plm:xgrid: cleanup failed: no XGrid job for job %s\n",
                        [key UTF8String]);
                continue;
            }

            while (XGActionMonitorOutcomeNone == [actionMonitor outcome]) {
                opal_progress();
            }

            /* we should have a result - find out if it worked */
            if (XGActionMonitorOutcomeSuccess != [actionMonitor outcome]) {
                NSError *err = [actionMonitor error];
                fprintf(stderr, "orte:plm:xgrid: cleanup failed: %s\n",
                        [[err localizedDescription] UTF8String]);
            }
        }
    }

    /* need to shut down connection */
    /* NOTE(review): -finalize is normally invoked by the runtime rather
       than called directly; confirm whether -close was intended. */
    [connection finalize];

    OBJ_DESTRUCT(&state_mutex);
    OBJ_DESTRUCT(&state_cond);

    [super dealloc];
}
/* accessors */
/* Returns the stored daemon executable name (nil if none was set). */
-(NSString*) getOrted
{
return orted;
}
/* Stores the daemon executable name from a UTF-8 C string.
 * A NULL argument clears the value instead of being handed to
 * +stringWithUTF8String:, which raises on NULL; the caller in
 * orte_plm_xgrid_component_query() treats its string as possibly NULL. */
-(void) setOrtedAsCString: (char*) name
{
    orted = (NULL != name) ? [NSString stringWithUTF8String: name] : nil;
}
/* Stores the controller password from a UTF-8 C string. A NULL argument
 * clears the password (which -connect reads as "use Kerberos") rather
 * than raising inside +stringWithUTF8String:. */
-(void) setControllerPasswordAsCString: (char*) password
{
    controller_password = (NULL != password) ? [NSString stringWithUTF8String: password] : nil;
}
/* Stores the controller hostname from a UTF-8 C string. A NULL argument
 * clears the hostname rather than raising inside +stringWithUTF8String:. */
-(void) setControllerHostnameAsCString: (char*) hostname
{
    controller_hostname = (NULL != hostname) ? [NSString stringWithUTF8String: hostname] : nil;
}
/* Sets the flag -dealloc checks before deleting submitted jobs
 * (non-zero means delete). */
-(void) setCleanUp: (int) val
{
cleanup = val;
}
/* Returns the Kerberos service principal for the current connection.
 * Uses the connection's own value when it has one, otherwise builds
 * "xgrid/<connection name>". The result is logged at verbosity 1 on
 * every call. */
- (NSString *)servicePrincipal
{
    NSString *myServicePrincipal = [connection servicePrincipal];

    if (myServicePrincipal == nil) {
        myServicePrincipal = [NSString stringWithFormat:@"xgrid/%@", [connection name]];
    }

    opal_output_verbose(1, orte_plm_globals.output,
                        "orte:plm:xgrid: Kerberos servicePrincipal: %s",
                        [myServicePrincipal UTF8String]);

    return myServicePrincipal;
}
/* interface for launch */
/* Opens the connection to the controller named by controller_hostname and
 * looks up the controller's default grid.
 *
 * Authentication: Kerberos (GSS) when no password is stored, otherwise
 * two-way-random password authentication.
 *
 * Returns ORTE_SUCCESS once the connection is open and the grid has been
 * fetched, or ORTE_ERR_NOT_AVAILABLE if the connection did not reach the
 * open state. */
-(int) connect
{
connection = [[[XGConnection alloc] initWithHostname: controller_hostname
portnumber:0] autorelease];
if (nil == controller_password) {
opal_output_verbose(1, orte_plm_globals.output,
"orte:plm:xgrid: Using Kerberos authentication");
XGGSSAuthenticator *authenticator =
[[[XGGSSAuthenticator alloc] init] autorelease];
opal_output_verbose(1, orte_plm_globals.output,
"orte:plm:xgrid: Kerberos principal: %s",
[[self servicePrincipal] UTF8String]);
[authenticator setServicePrincipal:[self servicePrincipal]];
[connection setAuthenticator:authenticator];
} else {
opal_output_verbose(1, orte_plm_globals.output,
"orte:plm:xgrid: Using password authentication");
XGTwoWayRandomAuthenticator *authenticator =
[[[XGTwoWayRandomAuthenticator alloc] init] autorelease];
/* this seems to be hard coded */
[authenticator setUsername:@"one-xgrid-client"];
[authenticator setPassword:controller_password];
[connection setAuthenticator:authenticator];
}
/* self receives the connectionDid... delegate callbacks, each of which
   broadcasts state_cond */
[connection setDelegate: self];
/* get us connected */
opal_mutex_lock(&state_mutex);
[connection open];
/* block until the connection leaves the "opening" state */
while ([connection state] == XGConnectionStateOpening) {
opal_condition_wait(&state_cond, &state_mutex);
}
opal_mutex_unlock(&state_mutex);
/* if we're not connected when the condition is triggered, we
don't have a connection and can't start. exit. */
if ([connection state] != XGConnectionStateOpen) {
return ORTE_ERR_NOT_AVAILABLE;
}
opal_output_verbose(1, orte_plm_globals.output,
"orte:plm:xgrid: connection name: %s",
[[connection name] UTF8String]);
/* NOTE(review): the controller is alloc'ed here and no matching release
   is visible in -dealloc - confirm ownership */
controller = [[XGController alloc] initWithConnection:connection];
/* need to call progress exactly once for some reason to get the
controller happy enough to allow us to assign the grid */
opal_progress();
grid = [controller defaultGrid];
opal_output_verbose(1, orte_plm_globals.output,
"plm: xgrid: grid name: %s",
[[grid identifier] UTF8String]);
return ORTE_SUCCESS;
}
/* Submits one XGrid job containing one daemon task per node in the job's
 * map, then waits for the daemons to call back.
 *
 * jdata   job whose map names the nodes to start daemons on
 * returns ORTE_SUCCESS when the job was accepted and the daemons reported
 *         in (or when no new daemons are needed); ORTE_ERR_NOT_FOUND if
 *         the job has no map or the daemon executable cannot be located;
 *         ORTE_ERROR if the controller rejected the submission; otherwise
 *         the error returned by a failing helper. */
-(int) launchOrteds:(orte_job_t*) jdata
{
    orte_job_map_t *map = NULL;
    int rc = ORTE_SUCCESS;
    char *orted_path = NULL;
    orte_node_t **nodes;
    orte_std_cntr_t nnode;
    char *vpid_string;

    /* Get the map for this job */
    if (NULL == (map = orte_rmaps.get_job_map(jdata->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    /* get the nodes list */
    nodes = (orte_node_t**)map->nodes->addr;

    /* Shortcut out of here */
    if (0 == map->num_new_daemons) {
        /* have all the daemons we need - launch app */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:xgrid: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_SUCCESS;
    }

    /* find orted; skip the lookup when no daemon name was configured */
    if (nil != orted) {
        orted_path = opal_path_findv((char*) [orted UTF8String], 0, environ, NULL);
    }
    if (NULL == orted_path) {
        /* +stringWithUTF8String: raises on NULL, so fail cleanly here */
        opal_output(0, "orte:plm:xgrid: unable to find daemon executable %s",
                    (nil != orted) ? [orted UTF8String] : "(unset)");
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }
    /* the command is the same for every task, so build it once */
    NSString *ortedCommand = [NSString stringWithUTF8String: orted_path];

    /* build up the array of task specifications */
    NSMutableDictionary *taskSpecifications = [NSMutableDictionary dictionary];

    for (nnode = 0 ; nnode < map->num_nodes ; nnode++) {
        opal_output_verbose(1, orte_plm_globals.output,
                            "orte:plm:xgrid: launching on node %s",
                            nodes[nnode]->name);

        /* Create the task */
        NSMutableDictionary *task = [NSMutableDictionary dictionary];

        /* fill in application to start */
        [task setObject: ortedCommand
                 forKey: XGJobSpecificationCommandKey];

        /* fill in task arguments */
        NSMutableArray *taskArguments = [self getArgumentsForOrtedLaunch];
        [taskArguments addObject: @"-mca"];
        [taskArguments addObject: @"orte_ess_vpid"];
        rc = orte_util_convert_vpid_to_string(&vpid_string,
                                              nodes[nnode]->daemon->name.vpid);
        if (ORTE_SUCCESS != rc) {
            opal_output(0, "orte:plm:xgrid: unable to get daemon vpid as string");
            goto cleanup;
        }
        [taskArguments addObject: [NSString stringWithUTF8String: vpid_string]];
        free(vpid_string);

        [taskArguments addObject: @"--nodename"];
        [taskArguments addObject: [NSString stringWithUTF8String: nodes[nnode]->name]];

        [task setObject: taskArguments forKey: XGJobSpecificationArgumentsKey];

        /* Add task to the task specification dictionary */
        [taskSpecifications setObject: task
                               forKey: [NSString stringWithFormat: @"%d", nnode]];
    }

    /* job specification */
    NSMutableDictionary *jobSpecification = [NSMutableDictionary dictionary];
    [jobSpecification setObject:XGJobSpecificationTypeTaskListValue
                         forKey:XGJobSpecificationTypeKey];
    [jobSpecification setObject: [NSString stringWithFormat:
                                      @"org.open-mpi.plm.xgrid"]
                         forKey:XGJobSpecificationSubmissionIdentifierKey];
    [jobSpecification setObject: [NSString stringWithFormat: @"Open MPI Job %u",
                                      jdata->jobid]
                         forKey:XGJobSpecificationNameKey];
    [jobSpecification setObject:taskSpecifications
                         forKey:XGJobSpecificationTaskSpecificationsKey];

    /* Submit the request and get our monitor */
    XGActionMonitor *actionMonitor =
        [controller performSubmitJobActionWithJobSpecification: jobSpecification
                                                gridIdentifier: [grid identifier]];

    /* wait until we have some idea if job succeeded or not */
    while (XGActionMonitorOutcomeNone == [actionMonitor outcome]) {
        opal_progress();
    }

    /* we should have a result - find out if it worked */
    if (XGActionMonitorOutcomeSuccess == [actionMonitor outcome]) {
        rc = ORTE_SUCCESS;
    } else {
        NSError *err = [actionMonitor error];
        fprintf(stderr, "orte:plm:xgrid: launch failed: (%d) %s\n",
                [actionMonitor outcome],
                [[err localizedDescription] UTF8String]);
        rc = ORTE_ERROR;
        goto cleanup;
    }

    /* save the XGJob identifier somewhere we can get to it */
    [active_xgrid_jobs setObject: [[actionMonitor results] objectForKey: @"jobIdentifier"]
                          forKey: [NSString stringWithFormat: @"%u", jdata->jobid]];

    /* wait for daemons to callback */
    if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:xgrid: daemon launch failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }

cleanup:
    /* the located path was never freed before; free(NULL) is a no-op */
    free(orted_path);

    opal_output_verbose(1, orte_plm_globals.output,
                        "orte:plm:xgrid:launch: finished, rc=%d\n", rc);

    return rc;
}
/* Asks the controller to stop every job recorded in active_xgrid_jobs,
 * polling opal_progress() until each stop action reports an outcome.
 *
 * returns ORTE_SUCCESS if every stop succeeded, ORTE_ERROR if any failed
 *         or a recorded job could not be resolved. */
-(int) terminateOrteds
{
    NSEnumerator *enumerator = [[active_xgrid_jobs allKeys] objectEnumerator];
    NSString *key;
    int ret = ORTE_SUCCESS;

    while (nil != (key = [enumerator nextObject])) {
        XGJob *job = [grid jobForIdentifier: [active_xgrid_jobs objectForKey: key]];
        XGActionMonitor *actionMonitor = [job performStopAction];

        /* A nil job or monitor answers 0 to -outcome. That presumably
           equals XGActionMonitorOutcomeNone, in which case the wait loop
           below would never end, so report the failure and move on. */
        if (nil == actionMonitor) {
            fprintf(stderr, "orte:plm:xgrid: terminate failed: no XGrid job for job %s\n",
                    [key UTF8String]);
            ret = ORTE_ERROR;
            continue;
        }

        while (XGActionMonitorOutcomeNone == [actionMonitor outcome]) {
            opal_progress();
        }

        /* we should have a result - find out if it worked */
        if (XGActionMonitorOutcomeSuccess != [actionMonitor outcome]) {
            NSError *err = [actionMonitor error];
            fprintf(stderr, "orte:plm:xgrid: terminate failed: %s\n",
                    [[err localizedDescription] UTF8String]);
            ret = ORTE_ERROR;
        }
    }

    return ret;
}
/* delegate for changes */
/* Connection delegate callback for a successfully opened connection. */
-(void) connectionDidOpen:(XGConnection*) myConnection
{
/* this isn't an error condition -- we finally opened the
connection, so wake whoever is blocked on state_cond
(-connect waits on it) */
opal_condition_broadcast(&state_cond);
}
/* Connection delegate callback for a failed open: logs the error code and
 * description, then wakes whoever is blocked on state_cond. */
-(void) connectionDidNotOpen:(XGConnection*) myConnection withError: (NSError*) error
{
    long errorCode = (long)[error code];
    const char *reason = [[error localizedDescription] UTF8String];

    opal_output(orte_plm_globals.output,
                "orte:plm:xgrid: Controller connection did not open: (%ld) %s",
                errorCode,
                reason);

    opal_condition_broadcast(&state_cond);
}
/* Connection delegate callback for a closed connection. Logs why the
 * connection closed, unless the error code is 200 (treated as success),
 * and then wakes whoever is blocked on state_cond. */
-(void) connectionDidClose:(XGConnection*) myConnection
{
    NSError *closeError = [myConnection error];

    if (nil == closeError) {
        /* closed without any error object attached */
        opal_output(orte_plm_globals.output,
                    "orte:plm:xgrid: Connection to XGrid controller unexpectedly closed");
    } else {
        long closeCode = (long)[closeError code];

        if (200 == closeCode) {
            /* success - nothing to report */
        } else if (530 == closeCode || 535 == closeCode) {
            opal_output(orte_plm_globals.output,
                        "orte:plm:xgrid: Connection to XGrid controller failed due to authentication error (%ld):",
                        closeCode);
        } else {
            opal_output(orte_plm_globals.output,
                        "orte:plm:xgrid: Connection to XGrid controller unexpectedly closed: (%ld) %s",
                        closeCode,
                        [[closeError localizedDescription] UTF8String]);
        }
    }

    opal_condition_broadcast(&state_cond);
}
/* Builds the basic daemon command-line arguments (via
 * orte_plm_base_orted_append_basic_args with "env") and returns them as a
 * mutable array of NSStrings so the caller can append more. */
-(NSMutableArray*) getArgumentsForOrtedLaunch
{
    char **daemonArgv = NULL;
    int daemonArgc = 0;

    orte_plm_base_orted_append_basic_args(&daemonArgc, &daemonArgv,
                                          "env",
                                          NULL,
                                          NULL);

    /* the capacity is only an initial size hint, not a limit */
    NSMutableArray *arguments = [NSMutableArray arrayWithCapacity: daemonArgc];

    int idx = 0;
    while (idx < daemonArgc) {
        [arguments addObject: [NSString stringWithUTF8String: daemonArgv[idx]]];
        idx++;
    }

    if (daemonArgv != NULL) {
        opal_argv_free(daemonArgv);
    }

    return arguments;
}
@end

Просмотреть файл

@ -1,172 +0,0 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#import "orte_config.h"
#import <stdlib.h>
#import <unistd.h>
#import "orte/constants.h"
#import "opal/util/argv.h"
#import "opal/util/path.h"
#import "opal/util/basename.h"
#import "orte/util/proc_info.h"
#import "orte/mca/plm/plm.h"
#import "orte/mca/plm/base/base.h"
#import "orte/mca/plm/base/plm_private.h"
#import "opal/mca/base/mca_base_param.h"
#import "plm_xgrid.h"
#import "plm_xgrid_client.h"
/* Component entry points; they are referenced by the
 * mca_plm_xgrid_component initializer and defined further down. */
int orte_plm_xgrid_component_open(void);
int orte_plm_xgrid_component_close(void);
int orte_plm_xgrid_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it.
*
* Only the `super` member is initialized here; `client` and `pool` are
* assigned later in orte_plm_xgrid_component_query().
*/
orte_plm_xgrid_component_t mca_plm_xgrid_component = {
{
/* First, the mca_component_t struct containing meta information
about the component itself */
{
ORTE_PLM_BASE_VERSION_2_0_0,
/* Component name and version */
"xgrid",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
orte_plm_xgrid_component_open,
orte_plm_xgrid_component_close,
orte_plm_xgrid_component_query
},
{
/* This component is not checkpointable */
MCA_BASE_METADATA_PARAM_NONE
}
}
};
/* Registers this component's MCA parameters (orted, priority, delete_job,
 * num_slots) together with their defaults. Always returns ORTE_SUCCESS;
 * the return values of the registration calls are not examined. */
int
orte_plm_xgrid_component_open(void)
{
/* daemon executable name, default "orted" */
mca_base_param_reg_string(&mca_plm_xgrid_component.super.base_version,
"orted",
"The command name that the component will invoke for the ORTE daemon",
false, false, "orted", NULL);
/* selection priority, default 20 */
mca_base_param_reg_int(&mca_plm_xgrid_component.super.base_version,
"priority",
"Priority of the xgrid plm component",
false, false, 20, NULL);
/* whether to delete finished jobs from the controller, default 1 */
mca_base_param_reg_int(&mca_plm_xgrid_component.super.base_version,
"delete_job",
"Delete job from XGrid controller's database on job completion",
false, false, 1, NULL);
/* slot reservation, default 0; not read elsewhere in the code shown here */
mca_base_param_reg_int(&mca_plm_xgrid_component.super.base_version,
"num_slots",
"Number of slots to reserve for job (including future spawned processes). "
"0 will result in number of processes requested in initial launch.",
false, false, 0, NULL);
return ORTE_SUCCESS;
}
/* Component close hook; does nothing and returns ORTE_SUCCESS. */
int
orte_plm_xgrid_component_close(void)
{
return ORTE_SUCCESS;
}
/* Defined in plm_xgrid_module.m. Declared here so the failure path below
 * does not rely on an implicit function declaration. */
int orte_plm_xgrid_finalize(void);

/* Decides whether this component can be used and, if so, hands back its
 * module and priority.
 *
 * The component is available only when XGRID_CONTROLLER_HOSTNAME is set in
 * the environment and a connection to that controller can be opened.
 * XGRID_CONTROLLER_PASSWORD is optional.
 *
 * module    out: set to the xgrid module on success, NULL otherwise
 * priority  out: filled from the "priority" MCA parameter
 * returns   ORTE_SUCCESS if the component is usable, ORTE_ERROR otherwise */
int orte_plm_xgrid_component_query(mca_base_module_t **module, int *priority)
{
    /* both are initialized because the lookups' results are not checked:
       string starts NULL, val starts at delete_job's registered default */
    char *string = NULL;
    int val = 1;
    int ret, param;
    char *hostname = getenv("XGRID_CONTROLLER_HOSTNAME");
    char *password = getenv("XGRID_CONTROLLER_PASSWORD");

    if (NULL == hostname) {
        opal_output_verbose(10, orte_plm_globals.output,
                            "orte:plm:xgrid: not available: controller info not set");
        *module = NULL;
        return ORTE_ERROR;
    }

    opal_output_verbose(1, orte_plm_globals.output,
                        "orte:plm:xgrid: initializing PlmXGridClient");
    mca_plm_xgrid_component.pool = [[NSAutoreleasePool alloc] init];
    mca_plm_xgrid_component.client = [[PlmXGridClient alloc] init];

    /* setup daemon name */
    param = mca_base_param_find("plm", "xgrid", "orted");
    mca_base_param_lookup_string(param, &string);
    if (NULL != string) {
        [mca_plm_xgrid_component.client setOrtedAsCString: string];
        free(string);
    } else {
        /* never pass NULL on; fall back to the registered default name */
        [mca_plm_xgrid_component.client setOrtedAsCString: (char*) "orted"];
    }

    /* setup contact information */
    if (NULL != password) {
        [mca_plm_xgrid_component.client setControllerPasswordAsCString: password];
    }
    [mca_plm_xgrid_component.client setControllerHostnameAsCString: hostname];

    /* info we need */
    param = mca_base_param_find("plm", "xgrid", "priority");
    mca_base_param_lookup_int(param, priority);

    param = mca_base_param_find("plm", "xgrid", "delete_job");
    mca_base_param_lookup_int(param, &val);
    [mca_plm_xgrid_component.client setCleanUp: val];

    /* registered before connecting: -connect calls opal_progress() */
    opal_progress_register(orte_plm_xgrid_progress);

    ret = [mca_plm_xgrid_component.client connect];
    if (ret != ORTE_SUCCESS) {
        opal_output_verbose(10, orte_plm_globals.output,
                            "orte:plm:xgrid: not available: connection failed");
        orte_plm_xgrid_finalize();
        *module = NULL;
        return ORTE_ERROR;
    }

    opal_output_verbose(10, orte_plm_globals.output,
                        "orte:plm:xgrid: initialized");
    *module = (mca_base_module_t *) &orte_plm_xgrid_module;
    return ORTE_SUCCESS;
}
/* Progress callback registered with opal_progress_register(): runs the
 * current thread's NSRunLoop for up to one second.
 *
 * Always returns 0. The function is declared int but previously fell off
 * its end without returning a value, so any caller reading the result got
 * an indeterminate one.
 * NOTE(review): presumably the value is a count of completed events -
 * confirm against the opal_progress callback contract. */
int
orte_plm_xgrid_progress(void)
{
    /* tick the event loop */
    [[NSRunLoop currentRunLoop] runUntilDate:
        [NSDate dateWithTimeIntervalSinceNow:1]];

    return 0;
}

Просмотреть файл

@ -1,245 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#import "orte_config.h"
#import <stdlib.h>
#import <unistd.h>
#import <errno.h>
#import <string.h>
#import <sys/types.h>
#import <sys/stat.h>
#import <sys/wait.h>
#import <fcntl.h>
#ifdef HAVE_SYS_TIME_H
#import <sys/time.h>
#endif
#import "orte/constants.h"
#import "opal/util/argv.h"
#import "opal/class/opal_pointer_array.h"
#import "orte/util/show_help.h"
#import "orte/util/session_dir.h"
#import "opal/mca/event/event.h"
#import "orte/runtime/orte_wait.h"
#import "orte/mca/plm/plm.h"
#import "orte/mca/plm/base/plm_private.h"
#import "orte/mca/rml/rml.h"
#import "orte/mca/errmgr/errmgr.h"
#import "orte/mca/rmaps/rmaps.h"
#import "orte/mca/iof/iof.h"
#import "plm_xgrid.h"
/* Module entry points; they are referenced by the orte_plm_xgrid_module
 * table and defined further down. */
int orte_plm_xgrid_init(void);
int orte_plm_xgrid_spawn(orte_job_t *jdata);
int orte_plm_xgrid_terminate_orteds(void);
int orte_plm_xgrid_signal_job(orte_jobid_t job, int32_t signal);
int orte_plm_xgrid_finalize(void);
/* Module function table. The initializers are positional, so their order
 * must match the member order of orte_plm_base_module_1_0_0_t (declared
 * outside this file). It mixes xgrid-specific functions with
 * orte_plm_base_* functions; the fourth slot is left NULL. */
orte_plm_base_module_1_0_0_t orte_plm_xgrid_module = {
orte_plm_xgrid_init,
orte_plm_base_set_hnp_name,
orte_plm_xgrid_spawn,
NULL,
orte_plm_base_orted_terminate_job,
orte_plm_xgrid_terminate_orteds,
orte_plm_base_orted_kill_local_procs,
orte_plm_xgrid_signal_job,
orte_plm_xgrid_finalize
};
/* Counter of the number of "nodes" created so far. It is used as the
 * numeric suffix of generated node names in orte_plm_xgrid_make_nodes(). */
static int node_counter = 0;
/* Module init: starts the base PLM communication. Logs and returns the
 * error if that fails, ORTE_SUCCESS otherwise. */
int
orte_plm_xgrid_init(void)
{
    int status = orte_plm_base_comm_start();

    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
/* Creates one synthetic single-slot node per requested process and adds
 * each one to orte_node_pool.
 *
 * jdata   job whose app contexts supply the process counts; its
 *         total_slots_alloc is increased by one per node and its
 *         oversubscribe_override flag is set
 * returns ORTE_SUCCESS; ORTE_ERR_NOT_SUPPORTED if any app requests zero
 *         processes; ORTE_ERR_OUT_OF_RESOURCE if a node or its name cannot
 *         be allocated; otherwise the error reported by the pointer array. */
static int
orte_plm_xgrid_make_nodes(orte_job_t *jdata)
{
    int num_nodes = 0, i, rc;
    orte_app_context_t *app, **apps;

    /* figure out how many slots we need */
    apps = (orte_app_context_t**)jdata->apps->addr;
    for (i = 0 ; i < jdata->num_apps ; i++) {
        app = apps[i];
        if (0 == app->num_procs) return ORTE_ERR_NOT_SUPPORTED;
        num_nodes += app->num_procs;
    }

    /* Create node entries for the orteds we're going to spawn. */
    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(orte_node_pool, num_nodes))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    for (i = 0 ; i < num_nodes ; ++i) {
        int pool_index;
        orte_node_t *node = OBJ_NEW(orte_node_t);

        if (NULL == node) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            return rc;
        }

        /* asprintf leaves the pointer unspecified on failure on some
           platforms, so reset it before the node is released */
        if (0 > asprintf(&node->name, "ompi-xgrid-node-%d", node_counter)) {
            node->name = NULL;
            OBJ_RELEASE(node);
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        node_counter++;

        node->state = ORTE_NODE_STATE_UP;
        node->slots_inuse = 0;
        node->slots_max = 0;
        node->slots = 1;
        node->slots_alloc = 1;

        /* a negative index means the node could not be stored */
        pool_index = opal_pointer_array_add(orte_node_pool, (void*)node);
        if (0 > pool_index) {
            rc = pool_index;
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(node);
            return rc;
        }
        node->index = pool_index;

        /* update the total slots in the job */
        jdata->total_slots_alloc += node->slots_alloc;
    }

    jdata->oversubscribe_override = true;

    return ORTE_SUCCESS;
}
/* Launches a job through XGrid: assigns a jobid, creates the synthetic
 * nodes, maps the job, starts the daemons and finally launches the apps.
 *
 * If any step fails, the job is reported to the error manager as
 * ORTE_JOB_STATE_FAILED_TO_START before returning.
 *
 * jdata   job to launch; it is added to the global orte_job_data pool
 * returns ORTE_SUCCESS, or the error code of the first failing step. */
int
orte_plm_xgrid_spawn(orte_job_t *jdata)
{
    int rc;
    bool failed_launch = true;

    /* create a jobid for this job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:xgrid: launching job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* insert the job object into the global pool */
    opal_pointer_array_add(orte_job_data, jdata);

    if (ORTE_SUCCESS != (rc = orte_plm_xgrid_make_nodes(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:xgrid: mapping job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:xgrid: setting up I/O for %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* launch new daemons */
    rc = [mca_plm_xgrid_component.client launchOrteds: jdata];
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* Daemons are running - launch the applications */
    if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:xgrid: launch of apps failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }

    /* get here if launch went okay */
    failed_launch = false;

cleanup:
    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_errmgr.update_state(jdata->jobid, ORTE_JOB_STATE_FAILED_TO_START,
                                 NULL, ORTE_PROC_STATE_UNDEF,
                                 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    return rc;
}
/* Shuts the daemons down. First asks them to exit through the base PLM
 * command; only if that fails are the XGrid jobs stopped through the
 * client. Logs and returns the error if the fallback fails as well. */
int
orte_plm_xgrid_terminate_orteds(void)
{
    int status = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD);

    if (ORTE_SUCCESS == status) {
        return status;
    }

    /* the orderly exit failed - stop the XGrid jobs directly */
    status = [mca_plm_xgrid_component.client terminateOrteds];
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
/* Forwards `signal` to the processes of `jobid` by asking the daemons to
 * deliver it to their local procs. Logs and returns the error on failure. */
int
orte_plm_xgrid_signal_job(orte_jobid_t jobid, int32_t signal)
{
    int status = orte_plm_base_orted_signal_local_procs(jobid, signal);

    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
/* Module teardown: stops base PLM communication, releases the XGrid
 * client and the autorelease pool, and unregisters the progress callback.
 *
 * A failure to stop communication is logged but not propagated; the
 * function always returns ORTE_SUCCESS. */
int
orte_plm_xgrid_finalize(void)
{
    int rc;

    /* cleanup any pending recvs */
    if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
        ORTE_ERROR_LOG(rc);
    }

    /* The client goes first: its -dealloc polls opal_progress() while
       deleting jobs, so the progress callback presumably still has to be
       registered at this point. Both pointers are cleared afterwards so
       the global component struct never holds a dangling reference and a
       second call degrades to harmless messages to nil. */
    [mca_plm_xgrid_component.client release];
    mca_plm_xgrid_component.client = nil;
    [mca_plm_xgrid_component.pool release];
    mca_plm_xgrid_component.pool = nil;

    opal_progress_unregister(orte_plm_xgrid_progress);

    return ORTE_SUCCESS;
}