1
1

Added xcpu component in pls and soh.

This commit was SVN r9491.
Этот коммит содержится в:
Sushant Sharma 2006-03-31 02:19:52 +00:00
родитель c2b6e86766
Коммит 46f84b1e8e
13 изменённых файлов: 1103 добавлений и 0 удалений

63
config/ompi_check_xcpu.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,63 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# OMPI_CHECK_XCPU(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check whether the xcpu library can be linked.
#
# --with-xcpu=no skips the check and runs action-if-not-found.
# --with-xcpu=DIR adds DIR/include and DIR/lib to the search paths that
# are used while checking.
#
# When the library is found, -lxcpu is appended to prefix_LIBS, the
# -I/-L flags derived from --with-xcpu=DIR are appended to
# prefix_CPPFLAGS and prefix_LDFLAGS, and action-if-found is run.
# When it is not found, configure aborts if --with-xcpu was given
# explicitly; otherwise action-if-not-found is run.
#
# CPPFLAGS, LDFLAGS and LIBS are restored to their previous values.
AC_DEFUN([OMPI_CHECK_XCPU],[
    AC_ARG_WITH([xcpu],
                [AC_HELP_STRING([--with-xcpu],
                                [Path to xcpu installation])])

    # This macro takes three arguments, so the not-found action is the
    # third one.
    AS_IF([test ! -z "$with_xcpu" -a "$with_xcpu" = "no"],[$3], [
        ompi_check_xcpu_save_CPPFLAGS="$CPPFLAGS"
        ompi_check_xcpu_save_LDFLAGS="$LDFLAGS"
        ompi_check_xcpu_save_LIBS="$LIBS"

        AS_IF([test ! -z "$with_xcpu" -a "$with_xcpu" != "yes"],
              [CPPFLAGS="$CPPFLAGS -I$with_xcpu/include"
               LDFLAGS="$LDFLAGS -L$with_xcpu/lib"])

        # The header is probed for information only.  The library check
        # decides the outcome whether or not sys/xcpu.h is present,
        # because it is not yet clear that the header is required.
        AC_CHECK_HEADERS([sys/xcpu.h])
        AC_CHECK_LIB([xcpu],
                     [check_for_xcpu],
                     [ompi_check_xcpu_works="yes"],
                     [ompi_check_xcpu_works="no"])

        CPPFLAGS="$ompi_check_xcpu_save_CPPFLAGS"
        LDFLAGS="$ompi_check_xcpu_save_LDFLAGS"
        LIBS="$ompi_check_xcpu_save_LIBS"

        AS_IF([test "$ompi_check_xcpu_works" = "yes"],
              [AS_IF([test ! -z "$with_xcpu" -a "$with_xcpu" != "yes"],
                     [$1_CPPFLAGS="$$1_CPPFLAGS -I$with_xcpu/include"
                      $1_LDFLAGS="$$1_LDFLAGS -L$with_xcpu/lib"])
               $1_LIBS="$$1_LIBS -lxcpu"
               $2],
              [AS_IF([test ! -z "$with_xcpu"],
                     [AC_MSG_ERROR([xcpu support requested but not found. Perhaps
you need to specify the location of the xcpu libraries.])])
               $3])
    ])
])

52
orte/mca/pls/xcpu/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,52 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#dist_pkgdata_DATA = help-pls-bproc.txt
# Preprocessor flags for this component: the src/include directory of the
# build tree, the xcpu flags substituted by configure (pls_xcpu_CPPFLAGS),
# and ORTE_BINDIR defined as the installation bindir in double quotes.
AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include $(pls_xcpu_CPPFLAGS) -DORTE_BINDIR="\"$(bindir)\""
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).  Exactly one of the two variables below is
# non-empty, depending on the OMPI_BUILD_pls_xcpu_DSO conditional.
if OMPI_BUILD_pls_xcpu_DSO
component_noinst =
component_install = mca_pls_xcpu.la
else
component_noinst = libmca_pls_xcpu.la
component_install =
endif
# Source files shared by the DSO and the static variant.
sources = \
pls_xcpu.h \
pls_xcpu.c \
pls_xcpu_component.c
# DSO variant: installed into $(libdir)/openmpi and linked against the
# xcpu libraries plus liborte and libopal from the build tree.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_pls_xcpu_la_SOURCES = $(sources)
mca_pls_xcpu_la_LIBADD = \
$(pls_xcpu_LIBS) \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
mca_pls_xcpu_la_LDFLAGS = -module -avoid-version $(pls_xcpu_LDFLAGS)
# Static variant: built but not installed; only the xcpu libraries are
# added here.
noinst_LTLIBRARIES = $(component_noinst)
libmca_pls_xcpu_la_SOURCES = $(sources)
libmca_pls_xcpu_la_LIBADD = $(pls_xcpu_LIBS)
libmca_pls_xcpu_la_LDFLAGS = -module -avoid-version $(pls_xcpu_LDFLAGS)

37
orte/mca/pls/xcpu/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_pls_xcpu_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure hook of the pls xcpu component.  Runs OMPI_CHECK_XCPU with the
# prefix pls_xcpu, copies the resulting linker flags and libraries into the
# pls_xcpu_WRAPPER_EXTRA_* variables on success, runs the matching action,
# and substitutes pls_xcpu_CPPFLAGS, pls_xcpu_LDFLAGS and pls_xcpu_LIBS for
# use in the component Makefile.
AC_DEFUN([MCA_pls_xcpu_CONFIG],[
OMPI_CHECK_XCPU([pls_xcpu], [pls_xcpu_good=1], [pls_xcpu_good=0])
# if xcpu is present and working, pls_xcpu_good=1.
# Any other value, including an unset variable, counts as failure.
# Evaluate succeed / fail
AS_IF([test "$pls_xcpu_good" = "1"],
[pls_xcpu_WRAPPER_EXTRA_LDFLAGS="$pls_xcpu_LDFLAGS"
pls_xcpu_WRAPPER_EXTRA_LIBS="$pls_xcpu_LIBS"
$1],
[$2])
# set build flags to use in makefile
AC_SUBST([pls_xcpu_CPPFLAGS])
AC_SUBST([pls_xcpu_LDFLAGS])
AC_SUBST([pls_xcpu_LIBS])
])dnl

24
orte/mca/pls/xcpu/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# PARAM_INIT_FILE names a source file of this component and
# PARAM_CONFIG_FILES lists the files to generate for it.
# NOTE(review): presumably both are read by the Open MPI autogen/configure
# machinery - confirm against the build system documentation.
PARAM_INIT_FILE=pls_xcpu.c
PARAM_CONFIG_FILES="Makefile"

182
orte/mca/pls/xcpu/pls_xcpu.c Обычный файл
Просмотреть файл

@ -0,0 +1,182 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
/* @file:
 * xcpu launcher: launches jobs on compute nodes.
 */
#include "orte_config.h"
#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <signal.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/event/event.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/util/show_help.h"
#include "orte/dss/dss.h"
#include "orte/util/sys_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/gpr/base/base.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/soh/base/base.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include "pls_xcpu.h"
/**
 * The xcpu pls module: the table of function pointers handed out by
 * orte_pls_xcpu_init().  The initializer is positional, so the order
 * (launch, terminate_job, terminate_proc, finalize) has to match the field
 * order of orte_pls_base_module_t, which is declared outside this file.
 */
orte_pls_base_module_t orte_pls_xcpu_module = {
orte_pls_xcpu_launch,
orte_pls_xcpu_terminate_job,
orte_pls_xcpu_terminate_proc,
orte_pls_xcpu_finalize
};
/* Launch entry point that is defined outside this file.
 * orte_pls_xcpu_launch() calls it once per process with an argument vector
 * and later passes the returned value to pthread_join().
 * NOTE(review): presumably provided by the xcpu library - confirm, and
 * confirm that the int return value really is a joinable thread id. */
int lrx(int argc, char **argv);
/**
 * Count the entries of a NULL-terminated argument vector.
 *
 * @param argv NULL-terminated array of strings; may itself be NULL
 * @return the number of entries in front of the terminating NULL, or 0
 *         when argv is NULL
 */
int get_argc(char **argv){
    int count = 0;

    /* an absent vector has no arguments */
    if (NULL == argv) {
        return 0;
    }
    while (NULL != argv[count]) {
        ++count;
    }
    return count;
}
/**
 * Release every node of a tid_stack list.
 *
 * The list is walked in a loop so that the depth of the call stack does
 * not grow with the number of launched processes.
 *
 * @param s head of the list; may be NULL, in which case nothing is done
 */
void free_stack(tid_stack *s){
    tid_stack *next;

    while (NULL != s) {
        /* read the link before the node is released */
        next = s->next;
        free(s);
        s = next;
    }
}
/**
 * Launch all processes of a job on their mapped compute nodes via xcpu.
 *
 * The job map is read with orte_rmaps_base_get_map().  For every process
 * slot of every mapped node, lrx() is called with an argument vector of
 * the form { placeholder, node name, application argv..., NULL }.  The
 * values returned by lrx() are collected and handed to pthread_join()
 * before orte_soh.begin_monitoring_job() is invoked.
 *
 * Only the application of the first map entry supplies the command line;
 * the same vector is reused for every node of every map entry.
 *
 * @param jobid the jobid of the job to launch
 * @retval ORTE_SUCCESS on success
 * @retval ORTE_ERR_NOT_FOUND if the job has no map entries
 * @retval ORTE_ERR_OUT_OF_RESOURCE if memory could not be allocated
 * @retval other the error returned by the map lookup or by
 *         orte_soh.begin_monitoring_job()
 */
int orte_pls_xcpu_launch(orte_jobid_t jobid){
    opal_list_t mapping;
    opal_list_item_t *item, *temp;
    orte_rmaps_base_map_t *map;
    orte_rmaps_base_node_t *rnode;
    tid_stack *t_stack = NULL, *t_entry;
    char **app_argv, **new_argv = NULL;
    char *placeholder = NULL;
    int app_argc, new_argc, i;
    int rc;
    size_t nprocs, p;

    /* OBJ_CONSTRUCT initializes objects that are not dynamically
     * allocated; see opal/class/opal_object.h for details */
    OBJ_CONSTRUCT(&mapping, opal_list_t);

    /* 1. get map from registry */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_map(jobid, &mapping))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* an empty list has no first element that could be used below */
    if (opal_list_get_first(&mapping) == opal_list_get_end(&mapping)) {
        rc = ORTE_ERR_NOT_FOUND;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* 2. build the argument vector that is shared by all launches:
     *    [0] placeholder, [1] node name, [2..] application arguments,
     *    followed by the terminating NULL */
    map = (orte_rmaps_base_map_t*)opal_list_get_first(&mapping);
    app_argv = map->app->argv;
    app_argc = (NULL == app_argv) ? 0 : get_argc(app_argv);
    /* NOTE(review): this count includes the slot of the terminating NULL.
     * It is the value that has always been passed to lrx(); confirm
     * whether lrx() expects it or the number of real arguments. */
    new_argc = app_argc + 3;

    new_argv = malloc((size_t)new_argc * sizeof *new_argv);
    placeholder = malloc(1);
    if (NULL == new_argv || NULL == placeholder) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* the content of argv[0] does not matter, but it has to be a string */
    placeholder[0] = '\0';
    new_argv[0] = placeholder;
    new_argv[1] = NULL;
    for (i = 0; i < app_argc; ++i) {
        new_argv[i + 2] = app_argv[i];
    }
    new_argv[app_argc + 2] = NULL;

    /* 3. use the map to launch the processes.  It is still open whether
     *    all node names should be passed to xcpu at once; for now every
     *    process is launched separately with a single node name. */
    for (item = opal_list_get_first(&mapping);
         item != opal_list_get_end(&mapping);
         item = opal_list_get_next(item)) {
        map = (orte_rmaps_base_map_t*) item;
        for (temp = opal_list_get_first(&map->nodes);
             temp != opal_list_get_end(&map->nodes);
             temp = opal_list_get_next(temp)) {
            rnode = (orte_rmaps_base_node_t*) temp;
            /* node on which the next processes are to be launched.
             * NOTE(review): the vector is shared between launches; if
             * lrx() reads it after returning, this overwrite races with
             * the earlier launches - confirm. */
            new_argv[1] = rnode->node->node_name;
            nprocs = opal_list_get_size(&rnode->node_procs);
            for (p = 0; p < nprocs; ++p) {
                t_entry = malloc(sizeof *t_entry);
                if (NULL == t_entry) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    ORTE_ERROR_LOG(rc);
                    /* still wait for what has been launched so far */
                    goto join_threads;
                }
                t_entry->tid = lrx(new_argc, new_argv);
                t_entry->next = t_stack;
                t_stack = t_entry;
            }
        }
    }

join_threads:
    /* wait for all threads that have launched processes on remote nodes.
     * NOTE(review): tid is an int while pthread_join() takes a pthread_t;
     * confirm that the value returned by lrx() survives this round trip. */
    for (t_entry = t_stack; NULL != t_entry; t_entry = t_entry->next) {
        pthread_join(t_entry->tid, NULL);
    }
    free_stack(t_stack);

    if (ORTE_SUCCESS == rc) {
        rc = orte_soh.begin_monitoring_job(jobid);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }

cleanup:
    /* new_argv[1] and the application arguments are borrowed from the
     * map, so only the placeholder and the vector itself are freed */
    free(placeholder);
    free(new_argv);
    /* the map entries belong to the caller of orte_rmaps_base_get_map() */
    while (NULL != (item = opal_list_remove_first(&mapping))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&mapping);
    return rc;
}
/**
 * Terminate all processes of a job.
 *
 * Not implemented for xcpu: the request is accepted and ignored.
 *
 * @param jobid job to terminate (unused)
 * @retval ORTE_SUCCESS always
 */
int orte_pls_xcpu_terminate_job(orte_jobid_t jobid){
    (void)jobid;
    return ORTE_SUCCESS;
}
/**
 * Terminate a single process.
 *
 * Not implemented for xcpu: the request is accepted and ignored.
 *
 * @param proc_name name of the process to terminate (unused)
 * @retval ORTE_SUCCESS always
 */
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name){
    (void)proc_name;
    return ORTE_SUCCESS;
}
/**
 * Shut the module down.  There is nothing to release at the moment.
 *
 * @retval ORTE_SUCCESS always
 */
int orte_pls_xcpu_finalize(void){
    const int status = ORTE_SUCCESS;

    return status;
}

107
orte/mca/pls/xcpu/pls_xcpu.h Обычный файл
Просмотреть файл

@ -0,0 +1,107 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*
*/
/**
 * @file:
 * Header file for the xcpu launcher. This will use xcpu to launch jobs on
 * the list of nodes that it will get from the RAS (resource allocation
 * system, e.g. slurm??).
 * -# pls_xcpu is called by orterun. It reads the ompi registry and launches
 * the binary on the nodes specified in the registry.
 */
/* Include guard: the macro that is tested and the macro that is defined
 * have to be the same one, otherwise a second inclusion is not skipped. */
#ifndef ORTE_PLS_XCPU_H_
#define ORTE_PLS_XCPU_H_
#include "orte_config.h"
#include "orte/class/orte_pointer_array.h"
#include "orte/orte_constants.h"
#include "orte/mca/pls/base/base.h"
#include "orte/util/proc_info.h"
#include "opal/threads/condition.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Component open / close -- defined in pls_xcpu_component.c.
 * open registers the MCA parameters and sets up the component state,
 * close tears that state down again.
 */
int orte_pls_xcpu_component_open(void);
int orte_pls_xcpu_component_close(void);
/*
 * Startup / Shutdown
 *
 * orte_pls_xcpu_init() is defined in pls_xcpu_component.c; it stores the
 * component priority in *priority and returns the xcpu module.  The
 * matching orte_pls_xcpu_finalize() is declared with the interface below.
 */
orte_pls_base_module_t* orte_pls_xcpu_init(int *priority);
/*
 * Interface -- the functions referenced by orte_pls_xcpu_module,
 * defined in pls_xcpu.c
 */
int orte_pls_xcpu_launch(orte_jobid_t);
int orte_pls_xcpu_terminate_job(orte_jobid_t);
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_xcpu_finalize(void);
/**
 * (P)rocess (L)aunch (S)ubsystem xcpu Component
 *
 * Only the base class member is strictly needed.  Most of the other
 * members are expected to be removed from this structure, together with
 * their registration in the component open function.
 */
struct orte_pls_xcpu_component_t {
/* base class; has to stay the first member */
orte_pls_base_component_t super;
/* meant to be true once the user's app has been launched; the component
 * open function sets it to false */
bool done_launching;
/* MCA parameter "debug": if greater than 0 print debugging information */
int debug;
/* the number of processes that are running; set to 0 on open */
int num_procs;
/* MCA parameter "priority": the value that orte_pls_xcpu_init() reports
 * for this component */
int priority;
/* MCA parameter "terminate_sig": the signal that gets sent to a process
 * to kill it (default 9) */
int terminate_sig;
/* the number of daemons that are currently running; set to 0 on open */
size_t num_daemons;
/* lock used to prevent some race conditions */
opal_mutex_t lock;
/* condition that is signaled when all the daemons have died */
opal_condition_t condition;
/* array of the process names of all the daemons; this is used to send
 * the daemons a termination signal when all the user processes are done */
orte_pointer_array_t * daemon_names;
/* NOTE(review): not assigned anywhere in the visible pls xcpu sources */
orte_cellid_t cellid;
};
/**
 * Convenience typedef
 */
typedef struct orte_pls_xcpu_component_t orte_pls_xcpu_component_t;
/**
 * Singly linked stack of launch ids.  orte_pls_xcpu_launch() pushes one
 * node per launched process and frees the list with free_stack().
 */
struct tid_stack {
/* value returned by lrx(); it is later passed to pthread_join().
 * NOTE(review): pthread_join() takes a pthread_t - confirm that int is
 * wide enough on the supported platforms. */
int tid;
/* next node of the stack, NULL at the bottom */
struct tid_stack *next;
};
typedef struct tid_stack tid_stack;
/* the component instance, defined in pls_xcpu_component.c */
ORTE_DECLSPEC extern orte_pls_xcpu_component_t mca_pls_xcpu_component;
/* the module (table of interface functions), defined in pls_xcpu.c */
ORTE_DECLSPEC extern orte_pls_base_module_t orte_pls_xcpu_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* ORTE_PLS_XCPU_H_ */

113
orte/mca/pls/xcpu/pls_xcpu_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,113 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
/**
* @file:
* Takes care of the component stuff for the MCA.
*/
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "pls_xcpu.h"
/**
 * The xcpu component data structure that stores all the relevant data about
 * this component.
 *
 * Only the first member (super) is initialized here; the remaining members
 * are filled in by orte_pls_xcpu_component_open().
 */
orte_pls_xcpu_component_t mca_pls_xcpu_component = {
{ /* super: version, data and init members */
{
ORTE_PLS_BASE_VERSION_1_0_0,
"xcpu", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_pls_xcpu_component_open, /* component open */
orte_pls_xcpu_component_close /* component close */
},
{
false /* checkpoint / restart */
},
orte_pls_xcpu_init /* component init */
}
};
/**
 * Component open.
 *
 * Registers the MCA parameters "priority", "debug" and "terminate_sig",
 * resets the bookkeeping members, constructs the lock and the condition
 * and creates the pointer array that holds the daemon names.
 *
 * @retval ORTE_SUCCESS on success
 * @retval other the error reported by orte_pointer_array_init()
 */
int orte_pls_xcpu_component_open(void) {
    mca_base_component_t *self = &mca_pls_xcpu_component.super.pls_version;
    int status;

    /* MCA parameters; see opal/mca/base/mca_base_param.h for the details
     * of mca_base_param_reg_int() */
    mca_base_param_reg_int(self, "priority", NULL, false, false, 0,
                           &mca_pls_xcpu_component.priority);
    mca_base_param_reg_int(self, "debug",
                           "If > 0 prints library debugging information",
                           false, false, 0, &mca_pls_xcpu_component.debug);
    mca_base_param_reg_int(self, "terminate_sig",
                           "Signal sent to processes to terminate them",
                           false, false, 9,
                           &mca_pls_xcpu_component.terminate_sig);

    /* nothing has been launched yet */
    mca_pls_xcpu_component.done_launching = false;
    mca_pls_xcpu_component.num_daemons = 0;
    mca_pls_xcpu_component.num_procs = 0;

    OBJ_CONSTRUCT(&mca_pls_xcpu_component.lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_pls_xcpu_component.condition, opal_condition_t);

    /* the array that holds the daemon names */
    status = orte_pointer_array_init(&mca_pls_xcpu_component.daemon_names,
                                     8, 200000, 8);
    if (ORTE_SUCCESS == status) {
        return ORTE_SUCCESS;
    }
    ORTE_ERROR_LOG(status);
    return status;
}
/**
 * Closes the pls_xcpu component: destructs the lock and the condition and
 * drops the reference to the daemon name array, i.e. the objects that
 * orte_pls_xcpu_component_open() set up.
 *
 * @retval ORTE_SUCCESS always
 */
int orte_pls_xcpu_component_close(void) {
OBJ_DESTRUCT(&mca_pls_xcpu_component.lock);
OBJ_DESTRUCT(&mca_pls_xcpu_component.condition);
OBJ_RELEASE(mca_pls_xcpu_component.daemon_names);
return ORTE_SUCCESS;
}
/**
 * Component init: report the priority and hand out the module.
 *
 * The module should eventually only be offered when xcpu is running and
 * this is the control node.  No such check exists yet, so the module is
 * always returned.  Presumably the resource manager calls this function
 * and then uses the function pointers of the returned module (launch,
 * terminate_job, terminate_proc, finalize).
 *
 * @param priority receives the value of the "priority" MCA parameter
 * @return the xcpu module, which is defined in pls_xcpu.c
 */
orte_pls_base_module_t* orte_pls_xcpu_init(int *priority) {
    orte_pls_base_module_t *module = &orte_pls_xcpu_module;

    *priority = mca_pls_xcpu_component.priority;
    return module;
}

57
orte/mca/soh/xcpu/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,57 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Header files of this component; installed only when
# WANT_INSTALL_HEADERS is set.
headers = \
soh_xcpu.h
# Name the output library mca_soh_xcpu.la for DSO builds or
# libmca_soh_xcpu.la for static builds.  Exactly one of the two variables
# below is non-empty.
if OMPI_BUILD_soh_xcpu_DSO
component_noinst =
component_install = mca_soh_xcpu.la
else
component_noinst = libmca_soh_xcpu.la
component_install =
endif
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ortedir = $(includedir)/openmpi/orte/mca/soh/xcpu
orte_HEADERS = $(headers)
else
ortedir = $(includedir)
endif
# Source files shared by the DSO and the static variant.
host_SOURCES = \
soh_xcpu.c \
soh_xcpu.h \
soh_xcpu_component.c
# DSO variant: installed into $(libdir)/openmpi and linked against liborte
# and libopal from the build tree.  The soh_xcpu_CPPFLAGS, soh_xcpu_LDFLAGS
# and soh_xcpu_LIBS values substituted by configure are not referenced in
# this file.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_soh_xcpu_la_SOURCES = $(host_SOURCES)
mca_soh_xcpu_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
mca_soh_xcpu_la_LDFLAGS = -module -avoid-version
# Static variant: built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_soh_xcpu_la_SOURCES = $(host_SOURCES)
libmca_soh_xcpu_la_LIBADD =
libmca_soh_xcpu_la_LDFLAGS = -module -avoid-version

37
orte/mca/soh/xcpu/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_soh_xcpu_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure hook of the soh xcpu component.  Runs OMPI_CHECK_XCPU with the
# prefix soh_xcpu, copies the resulting linker flags and libraries into the
# soh_xcpu_WRAPPER_EXTRA_* variables on success, runs the matching action,
# and substitutes soh_xcpu_CPPFLAGS, soh_xcpu_LDFLAGS and soh_xcpu_LIBS.
AC_DEFUN([MCA_soh_xcpu_CONFIG],[
OMPI_CHECK_XCPU([soh_xcpu], [soh_xcpu_good=1], [soh_xcpu_good=0])
# if xcpu is present and working, soh_xcpu_good=1.
# Any other value, including an unset variable, counts as failure.
# Evaluate succeed / fail
AS_IF([test "$soh_xcpu_good" = "1"],
[soh_xcpu_WRAPPER_EXTRA_LDFLAGS="$soh_xcpu_LDFLAGS"
soh_xcpu_WRAPPER_EXTRA_LIBS="$soh_xcpu_LIBS"
$1],
[$2])
# set build flags to use in makefile
AC_SUBST([soh_xcpu_CPPFLAGS])
AC_SUBST([soh_xcpu_LDFLAGS])
AC_SUBST([soh_xcpu_LIBS])
])dnl

23
orte/mca/soh/xcpu/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# PARAM_INIT_FILE names a source file of this component and
# PARAM_CONFIG_FILES lists the files to generate for it.
# NOTE(review): presumably both are read by the Open MPI autogen/configure
# machinery - confirm against the build system documentation.
PARAM_INIT_FILE=soh_xcpu.c
PARAM_CONFIG_FILES="Makefile"

238
orte/mca/soh/xcpu/soh_xcpu.c Обычный файл
Просмотреть файл

@ -0,0 +1,238 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <pwd.h>
#include <grp.h>
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/gpr/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/soh/xcpu/soh_xcpu.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "opal/util/output.h"
/* Module functions; they are defined further down in this file and are
 * referenced by the orte_soh_xcpu_module table. */
static int orte_soh_xcpu_get_proc_soh(orte_proc_state_t *, int *, orte_process_name_t *);
static int orte_soh_xcpu_set_proc_soh(orte_process_name_t *, orte_proc_state_t, int);
static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t);
static int orte_soh_xcpu_finalize(void);
/**
 * Write a "terminated" process state, an exit code of 0 and a "terminated"
 * job state into the job segment of the registry.
 *
 * The tokens of the entry are taken from orte_process_info.my_name, i.e.
 * from the calling process, so every call addresses the same entry.
 *
 * @param jobid     job whose segment is updated
 * @param proc_name node name handed over by the caller; unused so far
 *                  (NOTE(review): presumably meant for per-node tokens)
 * @retval ORTE_SUCCESS if the values were stored
 * @retval other    the error of the first registry or schema call that
 *                  failed
 */
static int update_registry(orte_jobid_t jobid, char *proc_name){
    orte_gpr_value_t *value = NULL;
    char *segment = NULL;
    orte_proc_state_t state = ORTE_PROC_STATE_TERMINATED;
    orte_job_state_t jstate = ORTE_JOB_STATE_TERMINATED;
    int exit_code = 0;
    int rc;

    (void)proc_name;

    if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    rc = orte_gpr.create_value(&value, ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND,
                               segment, 3, 0);
    /* NOTE(review): create_value() is expected to keep its own copy of
     * the segment name, so the string obtained above is released here -
     * confirm against the gpr implementation. */
    free(segment);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&(value->tokens), &(value->num_tokens),
                                                          orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_PROC_STATE_KEY,
                                                     ORTE_PROC_STATE, &state))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* like the other two values, the exit code is passed by address; a
     * literal 0 here would be a NULL data pointer */
    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_PROC_EXIT_CODE_KEY,
                                                     ORTE_INT, &exit_code))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]), ORTE_JOB_STATE_KEY,
                                                     ORTE_JOB_STATE, &jstate))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_gpr.put(1, &value))) {
        ORTE_ERROR_LOG(rc);
    }

cleanup:
    OBJ_RELEASE(value);
    return rc;
}
/*
static int do_update(){
return 1;
}
static void orte_soh_xcpu_notify_handler(int fd, short flags, void *user)
{
}
*/
/**
 * Module initialization: look up the cell id of this process and store it
 * in the component.  Registering for xcpu update notifications is not
 * implemented yet.
 *
 * @retval ORTE_SUCCESS if the cell id was obtained
 * @retval other the error returned by orte_ns.get_cellid()
 */
int orte_soh_xcpu_module_init(void)
{
    int status = orte_ns.get_cellid(&mca_soh_xcpu_component.cellid,
                                    orte_process_info.my_name);

    if (ORTE_SUCCESS == status) {
        return ORTE_SUCCESS;
    }
    fprintf(stderr, "orte_soh_xcpu_module_init error\n");
    ORTE_ERROR_LOG(status);
    return status;
}
/*
 * Set initial node status
 */
/* if(!do_update()){
fprintf(stderr, "do_update error\n");
}
*/
/*
* Now regiser notify event
*/
/*` mca_soh_xcpu_component.notify_fd = 0;*/ /*bproc_notifier();*/
/*
memset(&mca_soh_xcpu_component.notify_event, 0, sizeof(opal_event_t));
opal_event_set(
&mca_soh_xcpu_component.notify_event,
mca_soh_xcpu_component.notify_fd,
OPAL_EV_READ|OPAL_EV_PERSIST,
orte_soh_xcpu_notify_handler,
0);
opal_event_add(&mca_soh_xcpu_component.notify_event, 0);
return ORTE_SUCCESS;
}
*/
/**
 * The xcpu soh module.  The process state functions, begin_monitoring_job
 * and finalize are implemented in this file; the node and job state
 * entries use the orte_soh_base_* functions.  The initializer is
 * positional, so the order has to match the field order of
 * orte_soh_base_module_t, which is declared outside this file.
 */
orte_soh_base_module_t orte_soh_xcpu_module = {
orte_soh_xcpu_get_proc_soh,
orte_soh_xcpu_set_proc_soh,
orte_soh_base_get_node_soh_not_available,
orte_soh_base_set_node_soh_not_available,
orte_soh_base_get_job_soh,
orte_soh_base_set_job_soh,
orte_soh_xcpu_begin_monitoring_job,
orte_soh_xcpu_finalize
};
/**
 * Query the state of a process.
 *
 * Placeholder: prints a trace line and reports success without writing
 * *state or *status.  The call to orte_soh_base_get_proc_soh() that used
 * to follow the return statement could never be reached and has been
 * removed.
 * NOTE(review): callers that read *state or *status afterwards see
 * whatever they stored there before; delegate to the base function once
 * the xcpu registry entries are in place.
 *
 * @param state  not written
 * @param status not written
 * @param proc   unused
 * @retval ORTE_SUCCESS always
 */
static int orte_soh_xcpu_get_proc_soh(orte_proc_state_t *state, int *status, orte_process_name_t *proc)
{
    (void)state;
    (void)status;
    (void)proc;
    fprintf(stdout, "soh_xcpu: get_proc_soh\n");
    return ORTE_SUCCESS;
}
/**
 * Store the state of a process.
 *
 * Placeholder: prints a trace line and reports success without storing
 * anything.  The call to orte_soh_base_set_proc_soh() that used to follow
 * the return statement could never be reached and has been removed.
 * NOTE(review): delegate to the base function once the xcpu registry
 * entries are in place.
 *
 * @param proc   unused
 * @param state  unused
 * @param status unused
 * @retval ORTE_SUCCESS always
 */
static int orte_soh_xcpu_set_proc_soh(orte_process_name_t *proc, orte_proc_state_t state, int status)
{
    (void)proc;
    (void)state;
    (void)status;
    fprintf(stdout, "soh_xcpu: set_proc_soh\n");
    return ORTE_SUCCESS;
}
/**
 * Begin monitoring a job.
 *
 * No real monitoring takes place yet.  The function only updates the
 * registry, once per mapped process, so that mpirun can exit normally;
 * pls_xcpu waits for all of its launch threads before it calls this
 * function.
 *
 * A failing registry update does not stop the remaining updates; the
 * first error is remembered and returned.
 *
 * @param jobid job whose processes are marked in the registry
 * @retval ORTE_SUCCESS if every step succeeded
 * @retval other the error of the map lookup, of the cell id lookup or of
 *         the first registry update that failed
 */
static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t jobid){
    opal_list_t mapping;
    opal_list_item_t *item, *temp;
    orte_rmaps_base_map_t *map;
    orte_rmaps_base_node_t *rnode;
    size_t nprocs, p;
    int rc, update_rc;

    OBJ_CONSTRUCT(&mapping, opal_list_t);

    /* 1. get map from registry */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_map(jobid, &mapping))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    fprintf(stdout, "soh_xcpu: begin monitoring\n");

    if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_soh_xcpu_component.cellid,
                                                 orte_process_info.my_name))) {
        fprintf(stderr, "soh_xcpu: get_cell_id error\n");
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* 2. one registry update per process of every mapped node */
    for (item = opal_list_get_first(&mapping);
         item != opal_list_get_end(&mapping);
         item = opal_list_get_next(item)) {
        map = (orte_rmaps_base_map_t*) item;
        for (temp = opal_list_get_first(&map->nodes);
             temp != opal_list_get_end(&map->nodes);
             temp = opal_list_get_next(temp)) {
            rnode = (orte_rmaps_base_node_t*) temp;
            nprocs = opal_list_get_size(&rnode->node_procs);
            for (p = 0; p < nprocs; ++p) {
                /* update_registry() logs its own errors */
                update_rc = update_registry(jobid, rnode->node->node_name);
                if (ORTE_SUCCESS != update_rc && ORTE_SUCCESS == rc) {
                    rc = update_rc;
                }
            }
        }
    }

cleanup:
    /* the map entries belong to the caller of orte_rmaps_base_get_map() */
    while (NULL != (item = opal_list_remove_first(&mapping))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&mapping);
    return rc;
}
/**
 * Module cleanup.  Prints a trace line; there is no notification event to
 * remove yet.
 *
 * @retval ORTE_SUCCESS always
 */
static int orte_soh_xcpu_finalize(void)
{
    fputs("soh_xcpu: finalize\n", stdout);
    return ORTE_SUCCESS;
}

66
orte/mca/soh/xcpu/soh_xcpu.h Обычный файл
Просмотреть файл

@ -0,0 +1,66 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef ORTE_SOH_XCPU_H
#define ORTE_SOH_XCPU_H
#include "orte/mca/soh/soh.h"
#include "opal/event/event.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
 * xcpu node registry keys
 * NOTE(review): not referenced by the visible soh xcpu sources.
 */
#define ORTE_SOH_XCPU_NODE_STATUS "orte-node-xcpu-status"
#define ORTE_SOH_XCPU_NODE_MODE "orte-node-xcpu-mode"
#define ORTE_SOH_XCPU_NODE_USER "orte-node-xcpu-user"
#define ORTE_SOH_XCPU_NODE_GROUP "orte-node-xcpu-group"
/**
 * Module init/fini
 *
 * orte_soh_xcpu_module_init() is defined in soh_xcpu.c.
 * NOTE(review): no definition of orte_soh_xcpu_module_finalize() is
 * visible in soh_xcpu.c; the module table uses the static function
 * orte_soh_xcpu_finalize() instead - confirm whether this declaration is
 * still needed.
 */
int orte_soh_xcpu_module_init(void);
int orte_soh_xcpu_module_finalize(void);
/**
 * (S)tate (o)f (H)ealth xcpu component.  It is not yet settled which of
 * the members after the base class are needed.
 */
struct orte_soh_xcpu_component_t {
/* base class; has to stay the first member */
orte_soh_base_component_t super;
/* MCA parameter "debug" (default 0) */
int debug;
/* MCA parameter "priority" (default 1); reported by the component init */
int priority;
/* notification event; only used by code that is commented out so far */
opal_event_t notify_event;
/* notification descriptor; only used by code that is commented out so far */
int notify_fd;
/* cell id of this process, filled in by orte_ns.get_cellid() */
orte_cellid_t cellid;
/*struct xcpu_node_set_t node_set;*/
};
typedef struct orte_soh_xcpu_component_t orte_soh_xcpu_component_t;
/* the module (table of interface functions), defined in soh_xcpu.c */
OMPI_COMP_EXPORT extern orte_soh_base_module_t orte_soh_xcpu_module;
/* the component instance, defined in soh_xcpu_component.c */
OMPI_COMP_EXPORT extern orte_soh_xcpu_component_t mca_soh_xcpu_component;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

104
orte/mca/soh/xcpu/soh_xcpu_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,104 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "orte/mca/soh/xcpu/soh_xcpu.h"
/*
 * Local functions: the component hooks that are referenced by the
 * mca_soh_xcpu_component initializer below
 */
static int orte_soh_xcpu_open(void);
static int orte_soh_xcpu_close(void);
static orte_soh_base_module_t* orte_soh_xcpu_init(int*);
/*
 * The xcpu soh component instance.  Only the first member (super) is
 * initialized here; debug and priority are set by orte_soh_xcpu_open().
 */
orte_soh_xcpu_component_t mca_soh_xcpu_component = {
{
/* First, the mca_base_module_t struct containing meta
information about the module itself */
{
/* Indicate that we are a xcpu soh v1.0.0 module (which also
implies a specific MCA version) */
ORTE_SOH_BASE_VERSION_1_0_0,
"xcpu", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
orte_soh_xcpu_open, /* component open */
orte_soh_xcpu_close /* component close */
},
/* Next the MCA v1.0.0 module meta data */
{
/* Whether the module is checkpointable or not */
false
},
/* component init */
orte_soh_xcpu_init
}
};
/**
 * Register an integer MCA parameter of the soh xcpu component and return
 * its current value.
 *
 * @param param_name    name of the parameter
 * @param default_value value to register; also returned when the lookup
 *                      does not store anything
 * @return the value found by the lookup, or default_value
 */
static int orte_soh_xcpu_param_register_int(
    const char* param_name,
    int default_value)
{
    int value = default_value;
    int index = mca_base_param_register_int("soh", "xcpu", param_name,
                                            NULL, default_value);

    mca_base_param_lookup_int(index, &value);
    return value;
}
static int orte_soh_xcpu_open(void)
{
mca_soh_xcpu_component.debug =
orte_soh_xcpu_param_register_int("debug", 0);
mca_soh_xcpu_component.priority =
orte_soh_xcpu_param_register_int("priority", 1);
/*fprintf(stdout, "soh_xcpu: open\n");*/
return ORTE_SUCCESS;
}
/**
 * Component init: offer the module on the seed process only.
 *
 * On the seed, the priority is reported, the module is initialized (the
 * result of that call is not examined) and the module is returned.  On
 * any other process a message is printed and NULL is returned.
 *
 * @param priority receives the component priority; written only on the
 *                 seed
 * @return the xcpu soh module, or NULL when this process is not the seed
 */
static orte_soh_base_module_t* orte_soh_xcpu_init(int *priority)
{
    if (orte_process_info.seed) {
        *priority = mca_soh_xcpu_component.priority;
        /* it is not clear yet whether this call is needed */
        orte_soh_xcpu_module_init();
        return &orte_soh_xcpu_module;
    }

    fprintf(stderr, "soh_xcpu: no seed found\n");
    return NULL;
}
/**
 * Component close.  Prints a trace line; there is nothing to release.
 *
 * @retval ORTE_SUCCESS always
 */
static int orte_soh_xcpu_close(void)
{
    fputs("soh_xcpu: close\n", stdout);
    return ORTE_SUCCESS;
}