diff --git a/config/ompi_check_xcpu.m4 b/config/ompi_check_xcpu.m4 new file mode 100644 index 0000000000..698bccd664 --- /dev/null +++ b/config/ompi_check_xcpu.m4 @@ -0,0 +1,63 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# OMPI_CHECK_XCPU(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +AC_DEFUN([OMPI_CHECK_XCPU],[ + AC_ARG_WITH([xcpu], + [AC_HELP_STRING([--with-xcpu], + [Path to xcpu installation])]) + + AS_IF([test ! -z "$with_xcpu" -a "$with_xcpu" = "no"],[$3], [ + ompi_check_xcpu_save_CPPFLAGS="$CPPFLAGS" + ompi_check_xcpu_save_LDFLAGS="$LDFLAGS" + ompi_check_xcpu_save_LIBS="$LIBS" + + AS_IF([test ! -z "$with_xcpu" -a "$with_xcpu" != "yes"], + [CPPFLAGS="$CPPFLAGS -I$with_xcpu/include" + LDFLAGS="$LDFLAGS -L$with_xcpu/lib"]) + AC_CHECK_HEADERS([sys/xcpu.h], + [AC_CHECK_LIB([xcpu], + [check_for_xcpu], + [ompi_check_xcpu_works="yes"], + [ompi_check_xcpu_works="no"])], + [AC_CHECK_LIB([xcpu], + [check_for_xcpu], + [ompi_check_xcpu_works="yes"], + [ompi_check_xcpu_works="no"])]) + # check for library irrespective of if xcpu.h is there or not + # 'cause I am not sure + # if we need to check for xcpu.h + + CPPFLAGS="$ompi_check_xcpu_save_CPPFLAGS" + LDFLAGS="$ompi_check_xcpu_save_LDFLAGS" + LIBS="$ompi_check_xcpu_save_LIBS" + + AS_IF([test "$ompi_check_xcpu_works" != "no"], + [AS_IF([test !
-z "$with_xcpu" -a "$with_xcpu" != "yes"], + [$1_CPPFLAGS="$$1_CPPFLAGS -I$with_xcpu/include" + $1_LDFLAGS="$$1_LDFLAGS -L$with_xcpu/lib"]) + $1_LIBS="$$1_LIBS -lxcpu" + AS_IF([test "$ompi_check_xcpu_works" = "yes"], [$2], [$3])], + [AS_IF([test ! -z "$with_xcpu"], + [AC_MSG_ERROR([xcpu support requested but not found. Perhaps +you need to specify the location of the xcpu libraries.])]) + $3]) + ]) +]) diff --git a/orte/mca/pls/xcpu/Makefile.am b/orte/mca/pls/xcpu/Makefile.am new file mode 100644 index 0000000000..bca8e5515a --- /dev/null +++ b/orte/mca/pls/xcpu/Makefile.am @@ -0,0 +1,52 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +#dist_pkgdata_DATA = help-pls-bproc.txt + +AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include $(pls_xcpu_CPPFLAGS) -DORTE_BINDIR="\"$(bindir)\"" + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds).
+ +if OMPI_BUILD_pls_xcpu_DSO +component_noinst = +component_install = mca_pls_xcpu.la +else +component_noinst = libmca_pls_xcpu.la +component_install = +endif + +sources = \ + pls_xcpu.h \ + pls_xcpu.c \ + pls_xcpu_component.c + +mcacomponentdir = $(libdir)/openmpi +mcacomponent_LTLIBRARIES = $(component_install) +mca_pls_xcpu_la_SOURCES = $(sources) +mca_pls_xcpu_la_LIBADD = \ + $(pls_xcpu_LIBS) \ + $(top_ompi_builddir)/orte/liborte.la \ + $(top_ompi_builddir)/opal/libopal.la +mca_pls_xcpu_la_LDFLAGS = -module -avoid-version $(pls_xcpu_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_pls_xcpu_la_SOURCES = $(sources) +libmca_pls_xcpu_la_LIBADD = $(pls_xcpu_LIBS) +libmca_pls_xcpu_la_LDFLAGS = -module -avoid-version $(pls_xcpu_LDFLAGS) diff --git a/orte/mca/pls/xcpu/configure.m4 b/orte/mca/pls/xcpu/configure.m4 new file mode 100644 index 0000000000..9699eb2789 --- /dev/null +++ b/orte/mca/pls/xcpu/configure.m4 @@ -0,0 +1,37 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_pls_xcpu_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_pls_xcpu_CONFIG],[ + OMPI_CHECK_XCPU([pls_xcpu], [pls_xcpu_good=1], [pls_xcpu_good=0]) + # if xcpu is present and working, pls_xcpu_good=1. 
+ # Evaluate succeed / fail + + AS_IF([test "$pls_xcpu_good" = "1"], + [pls_xcpu_WRAPPER_EXTRA_LDFLAGS="$pls_xcpu_LDFLAGS" + pls_xcpu_WRAPPER_EXTRA_LIBS="$pls_xcpu_LIBS" + $1], + [$2]) + + # set build flags to use in makefile + AC_SUBST([pls_xcpu_CPPFLAGS]) + AC_SUBST([pls_xcpu_LDFLAGS]) + AC_SUBST([pls_xcpu_LIBS]) +])dnl diff --git a/orte/mca/pls/xcpu/configure.params b/orte/mca/pls/xcpu/configure.params new file mode 100644 index 0000000000..8407213f09 --- /dev/null +++ b/orte/mca/pls/xcpu/configure.params @@ -0,0 +1,24 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + + +PARAM_INIT_FILE=pls_xcpu.c +PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/pls/xcpu/pls_xcpu.c b/orte/mca/pls/xcpu/pls_xcpu.c new file mode 100644 index 0000000000..ec75df3d4a --- /dev/null +++ b/orte/mca/pls/xcpu/pls_xcpu.c @@ -0,0 +1,182 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +/* @file: + * xcpu Lancher to launch jobs on compute nodes.. + */ + +#include "orte_config.h" +#if HAVE_SYS_TYPES_H +#include +#endif /* HAVE_SYS_TYPES_H */ +#ifdef HAVE_SYS_STAT_H +#include +#endif /* HAVE_SYS_STAT_H */ +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#include +#include +#ifdef HAVE_FCNTL_H +#include +#endif /* HAVE_FCNTL_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ + +#include "opal/event/event.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "opal/util/opal_environ.h" +#include "opal/util/path.h" +#include "opal/util/show_help.h" + +#include "orte/dss/dss.h" +#include "orte/util/sys_info.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/gpr/base/base.h" +#include "orte/mca/iof/iof.h" +#include "orte/mca/ns/base/base.h" +#include "orte/mca/sds/base/base.h" +#include "orte/mca/oob/base/base.h" +#include "orte/mca/ras/base/base.h" +#include "orte/mca/rmgr/base/base.h" +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/rmaps/base/rmaps_base_map.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/soh/base/base.h" +#include "orte/runtime/orte_wait.h" +#include "orte/runtime/runtime.h" + +#include "pls_xcpu.h" + +/** + * Initialization of the xcpu module with all the needed function pointers + */ +orte_pls_base_module_t orte_pls_xcpu_module = { + orte_pls_xcpu_launch, + orte_pls_xcpu_terminate_job, + orte_pls_xcpu_terminate_proc, + orte_pls_xcpu_finalize +}; +int lrx(int argc, char **argv); +int get_argc(char **argv){ + int i=0; + while(argv[i]){ + i++; + } + return i; +} + +void free_stack(tid_stack *s){ + if(s){ + free_stack(s->next); + free(s); + } +} + +/* This is the main function that will launch jobs on remote compute modes + * @param jobid the jobid of the job to launch + * @retval ORTE_SUCCESS or error + */ +int 
orte_pls_xcpu_launch(orte_jobid_t jobid){ + opal_list_t mapping; + char **new_argv; + int new_argc, nprocs=0; + int rc, i=0; + tid_stack *t_stack, *temp_stack; + opal_list_item_t *item, *temp; + orte_rmaps_base_map_t* map; + /* first get the list of nodes on which we are going to launch job */ + /* OBJ_CONSTRUCT construct/initialize objects that are not dynamically allocated. + * see file opal/class/opal_object.h for detils + */ + /*fprintf(stdout, "\nxcpu launch called, job id: %d\n", jobid);*/ + OBJ_CONSTRUCT(&mapping, opal_list_t); + /* 1. get map from registry*/ + if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_map(jobid, &mapping))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* 2. use the map to launch jobs*/ + map=(orte_rmaps_base_map_t*)opal_list_get_first(&mapping); + new_argc=get_argc(map->app->argv)+3; + new_argv=(char**)malloc(new_argc*sizeof(char*)); + new_argv[0]=(char*)malloc(1);/*it could be anything ... doesn't matter*/ + for(i=2; iapp->argv[i-2]; + /*fprintf(stdout, "new_argv[%d]:%s\n", i, new_argv[i]);*/ + } + new_argv[i]=NULL; + /*printf("new_argv[%d] is nulled\n", i);*/ + t_stack=NULL; + for(item = opal_list_get_first(&mapping); + item != opal_list_get_end(&mapping); + item = opal_list_get_next(item)) { + map = (orte_rmaps_base_map_t*) item; + /* now here.. do we want to pass all node-names and binary as + * arguments to xcpu_launch or do we want to launch then one + * by one, by providing only one node-name and binary at a time? 
+ */ + for(temp = opal_list_get_first(&map->nodes); + temp != opal_list_get_end(&map->nodes); + temp = opal_list_get_next(temp)){ + + new_argv[1]=((orte_rmaps_base_node_t*)temp)->node->node_name; + /*above should contain node name where process is to be launched*/ + /*fprintf(stdout, "node name: %s\n", new_argv[1]);*/ + nprocs=((orte_rmaps_base_node_t*)temp)->node_procs.opal_list_length; + /*fprintf(stdout, "list length: %d\n", nprocs);*/ + for (i = 0; inext=t_stack; + t_stack=temp_stack; + t_stack->tid=lrx(new_argc, new_argv); + } + } + } + /* wait for all thrads that have launched processes on remote nodes + * */ + temp_stack=t_stack; + while(t_stack){ + pthread_join(t_stack->tid, NULL); + t_stack=t_stack->next; + } + orte_soh.begin_monitoring_job(jobid); + + free_stack(temp_stack); + free(new_argv[0]); + /*free(new_argv[1]);*/ + free(new_argv); + OBJ_DESTRUCT(&mapping); + /*fprintf(stdout, "launch finished\n");*/ + return ORTE_SUCCESS; +} + +int orte_pls_xcpu_terminate_job(orte_jobid_t jobid){ + return ORTE_SUCCESS; +} +int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name){ + return ORTE_SUCCESS; +} +int orte_pls_xcpu_finalize(void){ + return ORTE_SUCCESS; +} diff --git a/orte/mca/pls/xcpu/pls_xcpu.h b/orte/mca/pls/xcpu/pls_xcpu.h new file mode 100644 index 0000000000..218c2bed05 --- /dev/null +++ b/orte/mca/pls/xcpu/pls_xcpu.h @@ -0,0 +1,107 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * + */ +/** + * @file: + * Header file for the xcpu launcher. This will use xcpu to launch jobs on + * the list of nodes that it will get from RAS (resource allocation + * system (slurm??) + * -# pls_xcpu is called by orterun. It reads the ompi registry and launches + * the binary on the nodes specified in the registry. + */ + +#ifndef ORTE_PLS_XCPU_H_ +#define ORTE_PLS_XCPU_H_ + +#include "orte_config.h" +#include "orte/class/orte_pointer_array.h" +#include "orte/orte_constants.h" +#include "orte/mca/pls/base/base.h" +#include "orte/util/proc_info.h" +#include "opal/threads/condition.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/* + * Module open / close -- defined in component file + */ +int orte_pls_xcpu_component_open(void); /*probably do nothing*/ +int orte_pls_xcpu_component_close(void); /*probably do nothing*/ + +/* + * Startup / Shutdown + */ +orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file */ +/* int orte_pls_xcpu_finalize(void); */ /* should be with interface */ + +/* + * Interface + */ +int orte_pls_xcpu_launch(orte_jobid_t); +int orte_pls_xcpu_terminate_job(orte_jobid_t); +int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name); +int orte_pls_xcpu_finalize(void); + + +/** + * (P)rocess (L)aunch (S)ubsystem xcpu Component + */ +struct orte_pls_xcpu_component_t { + orte_pls_base_component_t super;/*base_class this is needed others below this are not*/ + + /* most of the members below are going to get removed from this structure + * and so are their registrations from open() function + */ + bool done_launching; /* Is true if we are done launching the user's app. */ + int debug; /* If greater than 0 print debugging information */ + int num_procs; /* The number of processes that are running */ + int priority; /* The priority of this component. 
This will be returned if + * we determine that xcpu is available and running on this node, + */ + int terminate_sig; /* The signal that gets sent to a process to kill it. */ + size_t num_daemons; /* The number of daemons that are currently running. */ + opal_mutex_t lock; /* Lock used to prevent some race conditions */ + opal_condition_t condition; /* Condition that is signaled when all the daemons have died */ + orte_pointer_array_t * daemon_names; + /* Array of the process names of all the daemons. This is used to send + * the daemons a termonation signal when all the user processes are done */ + orte_cellid_t cellid; +}; +/** + * Convenience typedef + */ +typedef struct orte_pls_xcpu_component_t orte_pls_xcpu_component_t; + +struct tid_stack { + int tid; + struct tid_stack *next; +}; +typedef struct tid_stack tid_stack; + +ORTE_DECLSPEC extern orte_pls_xcpu_component_t mca_pls_xcpu_component; +ORTE_DECLSPEC extern orte_pls_base_module_t orte_pls_xcpu_module; /* this is defined in pls_xcpu.c file */ + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif /* ORTE_PLS_XCPU_H_ */ + diff --git a/orte/mca/pls/xcpu/pls_xcpu_component.c b/orte/mca/pls/xcpu/pls_xcpu_component.c new file mode 100644 index 0000000000..44bbe65909 --- /dev/null +++ b/orte/mca/pls/xcpu/pls_xcpu_component.c @@ -0,0 +1,113 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ +/** + * @file: + * Takes care of the component stuff for the MCA. + */ +#include "orte_config.h" +#include "orte/mca/errmgr/errmgr.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/mca_base_param.h" +#include "pls_xcpu.h" + +/** + * The xcpu component data structure that stores all the relevent data about + * this component. + */ +orte_pls_xcpu_component_t mca_pls_xcpu_component = { + { /* version, data and init members of only first + * structure (called super) being initialized + */ + { + ORTE_PLS_BASE_VERSION_1_0_0, + "xcpu", /* MCA component name */ + ORTE_MAJOR_VERSION, /* MCA component major version */ + ORTE_MINOR_VERSION, /* MCA component minor version */ + ORTE_RELEASE_VERSION, /* MCA component release version */ + orte_pls_xcpu_component_open, /* component open */ + orte_pls_xcpu_component_close /* component close */ + }, + { + false /* checkpoint / restart */ + }, + orte_pls_xcpu_init /* component init */ + } +}; + +/** + * Opens the pls_xcpu component, setting all the needed mca parameters and + * finishes setting up the component struct. 
+ */ +int orte_pls_xcpu_component_open(void) { + int rc; + /* init parameters */ + /*read trunk/opal/mca/base/mca_base_param.h for reg_int details*/ + mca_base_component_t *c = &mca_pls_xcpu_component.super.pls_version; + mca_base_param_reg_int(c, "priority", NULL, false, false,0, + &mca_pls_xcpu_component.priority); + mca_base_param_reg_int(c, "debug", + "If > 0 prints library debugging information", + false, false, 0, &mca_pls_xcpu_component.debug); + mca_base_param_reg_int(c, "terminate_sig", + "Signal sent to processes to terminate them", false, + false, 9, &mca_pls_xcpu_component.terminate_sig); + mca_pls_xcpu_component.num_procs = 0; + mca_pls_xcpu_component.num_daemons = 0; + mca_pls_xcpu_component.done_launching = false; + OBJ_CONSTRUCT(&mca_pls_xcpu_component.lock, opal_mutex_t); + OBJ_CONSTRUCT(&mca_pls_xcpu_component.condition, opal_condition_t); + /* init the list to hold the daemon names */ + rc = orte_pointer_array_init(&mca_pls_xcpu_component.daemon_names, 8, 200000, 8); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + } + return rc; +} + +/** + * Closes the pls_xcpu component + */ +int orte_pls_xcpu_component_close(void) { + OBJ_DESTRUCT(&mca_pls_xcpu_component.lock); + OBJ_DESTRUCT(&mca_pls_xcpu_component.condition); + OBJ_RELEASE(mca_pls_xcpu_component.daemon_names); + return ORTE_SUCCESS; +} + +/** + * Initializes the module. We do not want to run unless, xcpu + * is running and we are on the control node. + */ +/* What I thnk is that this function will be called some where from (R)esource (M)ana(G)e(R) + * and then it will return orte_pls_xcpu_module that contains function pointers for launch, + * finalize etc. 
and then resource manager can call these functions + */ +orte_pls_base_module_t* orte_pls_xcpu_init(int *priority) { + /* check if xcpu component should be loaded or not + * if not, then return NULL here + */ + /*return NULL; *//*for time being*/ + *priority = mca_pls_xcpu_component.priority; + return &orte_pls_xcpu_module; /* this is defined in pls_xcpu.c and will contains + * function pointers for launch, terminate_job + * terminate_proc and finalize + */ +} + diff --git a/orte/mca/soh/xcpu/Makefile.am b/orte/mca/soh/xcpu/Makefile.am new file mode 100644 index 0000000000..d5a5960f2b --- /dev/null +++ b/orte/mca/soh/xcpu/Makefile.am @@ -0,0 +1,57 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + + +headers = \ + soh_xcpu.h + +if OMPI_BUILD_soh_xcpu_DSO +component_noinst = +component_install = mca_soh_xcpu.la +else +component_noinst = libmca_soh_xcpu.la +component_install = +endif + +# Conditionally install the header files + +if WANT_INSTALL_HEADERS +ortedir = $(includedir)/openmpi/orte/mca/soh/xcpu +orte_HEADERS = $(headers) +else +ortedir = $(includedir) +endif + +host_SOURCES = \ + soh_xcpu.c \ + soh_xcpu.h \ + soh_xcpu_component.c + +mcacomponentdir = $(libdir)/openmpi +mcacomponent_LTLIBRARIES = $(component_install) +mca_soh_xcpu_la_SOURCES = $(host_SOURCES) +mca_soh_xcpu_la_LIBADD = \ + $(top_ompi_builddir)/orte/liborte.la \ + $(top_ompi_builddir)/opal/libopal.la +mca_soh_xcpu_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_soh_xcpu_la_SOURCES = $(host_SOURCES) +libmca_soh_xcpu_la_LIBADD = +libmca_soh_xcpu_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/soh/xcpu/configure.m4 b/orte/mca/soh/xcpu/configure.m4 new file mode 100644 index 0000000000..edc3ab8996 --- /dev/null +++ b/orte/mca/soh/xcpu/configure.m4 @@ -0,0 +1,37 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_soh_xcpu_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_soh_xcpu_CONFIG],[ + OMPI_CHECK_XCPU([soh_xcpu], [soh_xcpu_good=1], [soh_xcpu_good=0]) + # if xcpu is present and working, soh_xcpu_good=1. + # Evaluate succeed / fail + + AS_IF([test "$soh_xcpu_good" = "1"], + [soh_xcpu_WRAPPER_EXTRA_LDFLAGS="$soh_xcpu_LDFLAGS" + soh_xcpu_WRAPPER_EXTRA_LIBS="$soh_xcpu_LIBS" + $1], + [$2]) + + # set build flags to use in makefile + AC_SUBST([soh_xcpu_CPPFLAGS]) + AC_SUBST([soh_xcpu_LDFLAGS]) + AC_SUBST([soh_xcpu_LIBS]) +])dnl diff --git a/orte/mca/soh/xcpu/configure.params b/orte/mca/soh/xcpu/configure.params new file mode 100644 index 0000000000..b209d3785f --- /dev/null +++ b/orte/mca/soh/xcpu/configure.params @@ -0,0 +1,23 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_INIT_FILE=soh_xcpu.c +PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/soh/xcpu/soh_xcpu.c b/orte/mca/soh/xcpu/soh_xcpu.c new file mode 100644 index 0000000000..5e67afda82 --- /dev/null +++ b/orte/mca/soh/xcpu/soh_xcpu.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#include + +#ifdef HAVE_UNISTD_H +#include +#endif + +#include "orte/orte_constants.h" +#include "orte/orte_types.h" + +#include "orte/util/proc_info.h" +#include "orte/mca/ns/ns.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/gpr/base/base.h" +#include "orte/mca/soh/base/base.h" +#include "orte/mca/soh/xcpu/soh_xcpu.h" +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/rmaps/base/rmaps_base_map.h" +#include "opal/util/output.h" + +static int orte_soh_xcpu_get_proc_soh(orte_proc_state_t *, int *, orte_process_name_t *); +static int orte_soh_xcpu_set_proc_soh(orte_process_name_t *, orte_proc_state_t, int); +static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t); +static int orte_soh_xcpu_finalize(void); + +static int update_registry(orte_jobid_t jobid, char *proc_name){ + orte_gpr_value_t *value; + int rc; + char *segment; + orte_proc_state_t state; + orte_job_state_t jstate; + orte_schema.get_job_segment_name(&segment, jobid); + /*fprintf(stdout, "soh_xcpu: segment: %s\n", segment);*/ + if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND, + segment, 3, 0))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /*fprintf(stdout, "debug 1\n");*/ + + + if(ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&(value->tokens), &(value->num_tokens), + orte_process_info.my_name) ) ){ + ORTE_ERROR_LOG(rc); + } + /* + if(ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&(value->tokens), &(value->num_tokens), mca_soh_xcpu_component.cellid, + 
proc_name))) { + ORTE_ERROR_LOG(rc); + return rc; + }*/ + /*fprintf(stdout, "debug 1.1\n");*/ + state=ORTE_PROC_STATE_TERMINATED; + if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_PROC_STATE_KEY, + ORTE_PROC_STATE, &state))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(value); + return rc; + } + /*fprintf(stdout, "debug 2\n");*/ + + if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_PROC_EXIT_CODE_KEY, + ORTE_INT, 0))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(value); + return 0; + } + /*fprintf(stdout, "debug 3\n");*/ + jstate=ORTE_JOB_STATE_TERMINATED; + if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]), ORTE_JOB_STATE_KEY, + ORTE_JOB_STATE, &jstate))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(value); + return 0; + } + + /*fprintf(stdout, "debug 4\n");*/ + + if ((rc = orte_gpr.put(1, &value)) != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + } + /*fprintf(stdout, "debug 4\n");*/ + OBJ_RELEASE(value); + /*fprintf(stdout, "soh_xcpu: registry updated\n");*/ + return rc; +} + +/* +static int do_update(){ + return 1; + +} + +static void orte_soh_xcpu_notify_handler(int fd, short flags, void *user) +{ + +} +*/ + +/** + * Register a callback to receive xcpu update notifications + */ +int orte_soh_xcpu_module_init(void) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_soh_xcpu_component.cellid, orte_process_info.my_name))) { + fprintf(stderr, "orte_soh_xcpu_module_init error\n"); + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; +} /* + * Set initial node status + */ +/* if(!do_update()){ + fprintf(stderr, "do_update error\n"); + } +*/ + /* + * Now regiser notify event + */ + +/*` mca_soh_xcpu_component.notify_fd = 0;*/ /*bproc_notifier();*/ +/* + memset(&mca_soh_xcpu_component.notify_event, 0, sizeof(opal_event_t)); + + opal_event_set( + &mca_soh_xcpu_component.notify_event, + mca_soh_xcpu_component.notify_fd, + OPAL_EV_READ|OPAL_EV_PERSIST, + orte_soh_xcpu_notify_handler, + 0); + + 
opal_event_add(&mca_soh_xcpu_component.notify_event, 0); + + return ORTE_SUCCESS; +} +*/ +orte_soh_base_module_t orte_soh_xcpu_module = { + orte_soh_xcpu_get_proc_soh, + orte_soh_xcpu_set_proc_soh, + orte_soh_base_get_node_soh_not_available, + orte_soh_base_set_node_soh_not_available, + orte_soh_base_get_job_soh, + orte_soh_base_set_job_soh, + orte_soh_xcpu_begin_monitoring_job, + orte_soh_xcpu_finalize +}; + +static int orte_soh_xcpu_get_proc_soh(orte_proc_state_t *state, int *status, orte_process_name_t *proc) +{ + fprintf(stdout, "soh_xcpu: get_proc_soh\n"); + return ORTE_SUCCESS; + return orte_soh_base_get_proc_soh(state, status, proc); +} + +static int orte_soh_xcpu_set_proc_soh(orte_process_name_t *proc, orte_proc_state_t state, int status) +{ + fprintf(stdout, "soh_xcpu: set_proc_soh\n"); + return ORTE_SUCCESS; + return orte_soh_base_set_proc_soh(proc, state, status); +} + +/* begin monitoring right now only trying to update registry so + * that mpirun can exit normally + * pls_xcpu is waiting for all threads to finish before calling this function + */ +static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t jobid){ + int rc, nprocs, i; + opal_list_item_t *item, *temp; + orte_rmaps_base_map_t* map; + opal_list_t mapping; + + OBJ_CONSTRUCT(&mapping, opal_list_t); + /* 1. 
get map from registry*/ + if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_map(jobid, &mapping))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + fprintf(stdout, "soh_xcpu: begin monitoring\n"); + if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_soh_xcpu_component.cellid, orte_process_info.my_name))) { + fprintf(stderr, "soh_xcpu: get_cell_id error\n"); + ORTE_ERROR_LOG(rc); + return rc; + }else + for(item = opal_list_get_first(&mapping); + item != opal_list_get_end(&mapping); + item = opal_list_get_next(item)) { + map = (orte_rmaps_base_map_t*) item; + + for(temp = opal_list_get_first(&map->nodes); + temp != opal_list_get_end(&map->nodes); + temp = opal_list_get_next(temp)){ + + nprocs=((orte_rmaps_base_node_t*)temp)->node_procs.opal_list_length; + + for (i = 0; inode->node_name);*/ + update_registry(jobid, ((orte_rmaps_base_node_t*)temp)->node->node_name); + } + } + } + return ORTE_SUCCESS; +} + +/** + * Cleanup + */ + +static int orte_soh_xcpu_finalize(void) +{ + fprintf(stdout, "soh_xcpu: finalize\n"); + /* opal_event_del(&mca_soh_xcpu_component.notify_event);*/ + return ORTE_SUCCESS; +} diff --git a/orte/mca/soh/xcpu/soh_xcpu.h b/orte/mca/soh/xcpu/soh_xcpu.h new file mode 100644 index 0000000000..c69a52215d --- /dev/null +++ b/orte/mca/soh/xcpu/soh_xcpu.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef ORTE_SOH_XCPU_H +#define ORTE_SOH_XCPU_H + +#include "orte/mca/soh/soh.h" +#include "opal/event/event.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/** + * xcpu node registry keys + */ +#define ORTE_SOH_XCPU_NODE_STATUS "orte-node-xcpu-status" +#define ORTE_SOH_XCPU_NODE_MODE "orte-node-xcpu-mode" +#define ORTE_SOH_XCPU_NODE_USER "orte-node-xcpu-user" +#define ORTE_SOH_XCPU_NODE_GROUP "orte-node-xcpu-group" + + +/** + * Module init/fini + */ +int orte_soh_xcpu_module_init(void); +int orte_soh_xcpu_module_finalize(void); + +struct orte_soh_xcpu_component_t { + orte_soh_base_component_t super; + /* not sure which of the following variables are + * needed + * */ + int debug; + int priority; + opal_event_t notify_event; + int notify_fd; + orte_cellid_t cellid; + /*struct xcpu_node_set_t node_set;*/ +}; +typedef struct orte_soh_xcpu_component_t orte_soh_xcpu_component_t; + +OMPI_COMP_EXPORT extern orte_soh_base_module_t orte_soh_xcpu_module; +OMPI_COMP_EXPORT extern orte_soh_xcpu_component_t mca_soh_xcpu_component; + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/orte/mca/soh/xcpu/soh_xcpu_component.c b/orte/mca/soh/xcpu/soh_xcpu_component.c new file mode 100644 index 0000000000..c2db7688d7 --- /dev/null +++ b/orte/mca/soh/xcpu/soh_xcpu_component.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/orte_constants.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "orte/util/proc_info.h" +#include "opal/util/output.h" +#include "orte/mca/soh/xcpu/soh_xcpu.h" + +/* + * Local functions + */ + +static int orte_soh_xcpu_open(void); +static int orte_soh_xcpu_close(void); +static orte_soh_base_module_t* orte_soh_xcpu_init(int*); + +orte_soh_xcpu_component_t mca_soh_xcpu_component = { + { + /* First, the mca_base_module_t struct containing meta + information about the module itself */ + { + /* Indicate that we are a xcpu soh v1.0.0 module (which also + implies a specific MCA version) */ + + ORTE_SOH_BASE_VERSION_1_0_0, + + "xcpu", /* MCA module name */ + ORTE_MAJOR_VERSION, /* MCA module major version */ + ORTE_MINOR_VERSION, /* MCA module minor version */ + ORTE_RELEASE_VERSION, /* MCA module release version */ + orte_soh_xcpu_open, /* component open */ + orte_soh_xcpu_close /* component close */ + }, + + /* Next the MCA v1.0.0 module meta data */ + + { + /* Whether the module is checkpointable or not */ + + false + }, + + orte_soh_xcpu_init + } +}; + +/** + * Utility function: registers an integer parameter for the "soh"/"xcpu" component and returns the value obtained by looking it up (starts from default_value) + */ +static int orte_soh_xcpu_param_register_int( + const char* param_name, + int default_value) +{ + int id = mca_base_param_register_int("soh","xcpu",param_name,NULL,default_value); + int param_value = default_value; + mca_base_param_lookup_int(id,&param_value); + return param_value; +} + +static int orte_soh_xcpu_open(void) +{ + mca_soh_xcpu_component.debug = + orte_soh_xcpu_param_register_int("debug", 0); + mca_soh_xcpu_component.priority = + orte_soh_xcpu_param_register_int("priority", 1); + /*fprintf(stdout, "soh_xcpu: open\n");*/ + return ORTE_SUCCESS; +} + +static orte_soh_base_module_t* orte_soh_xcpu_init(int *priority) +{ + if (!orte_process_info.seed){ + fprintf(stderr, "soh_xcpu: no seed found\n"); + 
return NULL; + } + + *priority = mca_soh_xcpu_component.priority; + orte_soh_xcpu_module_init();/*do we need this???*/ + return &orte_soh_xcpu_module; +} + +static int orte_soh_xcpu_close(void) +{ + fprintf(stdout, "soh_xcpu: close\n"); + return ORTE_SUCCESS; +}