* First take at a slurm pcm / pcmclient set of components. Will compile if
slurm is available, but will not allow itself to be selected yet. Still have to figure out a couple of things about slurm.

This commit was SVN r3202.
This commit is contained in:
parent af59f53ff7
commit 09458c7dc2
src/mca/pcm/slurm/Makefile.am (new regular file, 32 lines)
@@ -0,0 +1,32 @@
#
# $HEADER$
#

# Use the top-level Makefile.options

include $(top_ompi_srcdir)/config/Makefile.options

SUBDIRS = src

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if OMPI_BUILD_pcm_slurm_DSO
component_noinst =
component_install = mca_pcm_slurm.la
else
component_noinst = libmca_pcm_slurm.la
component_install =
endif

mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_pcm_slurm_la_SOURCES =
mca_pcm_slurm_la_LIBADD = src/libmca_pcm_slurm.la
mca_pcm_slurm_la_LDFLAGS = -module -avoid-version

noinst_LTLIBRARIES = $(component_noinst)
libmca_pcm_slurm_la_SOURCES =
libmca_pcm_slurm_la_LIBADD = src/libmca_pcm_slurm.la
libmca_pcm_slurm_la_LDFLAGS = -module -avoid-version
src/mca/pcm/slurm/configure.params (new regular file, 10 lines)
@@ -0,0 +1,10 @@
# -*- shell-script -*-
#
# $HEADER$
#

# Specific to this module

PARAM_INIT_FILE=src/pcm_slurm.c
PARAM_CONFIG_HEADER_FILE="src/slurm_config.h"
PARAM_CONFIG_FILES="Makefile src/Makefile"
src/mca/pcm/slurm/configure.stub (new regular file, 101 lines)
@@ -0,0 +1,101 @@
# -*- shell-script -*-
#
# $HEADER$
#

#
# Main function.  This will be invoked in the middle of the templated
# configure script.
#
AC_DEFUN([MCA_CONFIGURE_STUB],[
    #
    # Make a best effort to see if we are on a SLURM system.  Also allow
    # user to specify location to the slurm prefix, which will work just as
    # well :).
    #

    OMPI_HAVE_SLURM=0
    SLURM_LDFLAGS=""

    #
    AC_ARG_WITH(slurm,
        AC_HELP_STRING([--with-slurm=DIR],
                       [directory where the slurm software was installed]))


    #
    # Case 1: --without-slurm specified - overrides autodetect
    #
    if test "$with_slurm" = "no"; then
        AC_MSG_ERROR([*** SLURM explicitly disabled])

    #
    # Case 2: --with-slurm specified - look in generic places for slurm libs
    #
    elif test "$with_slurm" = "yes"; then
        # See if we can find the slurm libraries...
        LIBS_save="$LIBS"
        AC_CHECK_LIB(slurm, slurm_api_version,
                     OMPI_HAVE_SLURM=1,
                     AC_MSG_ERROR([*** Cannot find working libslurm.]))
        LIBS="$LIBS_save"

    #
    # Case 3: --with-slurm=<foo> specified - try where they said to find it
    #
    else
        SLURM_DIR=$with_slurm

        if test -n "$SLURM_DIR"; then
            # Make the tests work...
            OLDLDFLAGS="$LDFLAGS"
            OLDCPPFLAGS="$CPPFLAGS"
            SLURM_LDFLAGS="-L$SLURM_DIR/lib"
            LDFLAGS="$LDFLAGS $SLURM_LDFLAGS"
            CPPFLAGS="$CPPFLAGS -I$SLURM_DIR/include"
            LIBS_save="$LIBS"

            AC_CHECK_LIB(slurm, slurm_api_version, OMPI_HAVE_SLURM=1,
                         AC_MSG_ERROR([*** Cannot find working libslurm.]))

            # Since we are going to add the -L and -l to LIBMPI_EXTRA_LIBS,
            # we reset this to the start ...
            LDFLAGS="$OLDLDFLAGS"
            CPPFLAGS="$OLDCPPFLAGS"
            LIBS="$LIBS_save"

        else
            AC_CHECK_LIB(slurm, slurm_api_version, OMPI_HAVE_SLURM=1,
                         AC_MSG_ERROR([*** Cannot find working libslurm.]))
        fi
    fi

    AC_MSG_CHECKING([if want SLURM support])

    if test "$OMPI_HAVE_SLURM" = "1"; then

        AC_MSG_RESULT([yes])
        #
        # Ok, we have slurm support.  Add proper things to the various
        # compiler flags..
        #
        LIBMPI_EXTRA_LDFLAGS="$SLURM_LDFLAGS"
        LIBMPI_EXTRA_LIBS="-lslurm"

        WRAPPER_EXTRA_LDFLAGS="$SLURM_LDFLAGS"
        WRAPPER_EXTRA_LIBS="-lslurm"

        LDFLAGS="$LDFLAGS $SLURM_LDFLAGS"
        LIBS="$LIBS -lslurm"

    else
        AC_MSG_RESULT([no])
    fi


    AC_DEFINE_UNQUOTED(OMPI_HAVE_SLURM, $OMPI_HAVE_SLURM,
                       [Whether we have slurm support or not])

    # Clean up
    unset SLURM_LDFLAGS
])dnl
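For reference, the AC_CHECK_LIB(slurm, slurm_api_version, ...) probes above only verify that a symbol named slurm_api_version can be resolved when linking against -lslurm; autoconf's generated conftest declares a dummy prototype rather than including any SLURM header. A minimal sketch of the equivalent hand-written link probe (the dummy declaration is the same assumption autoconf itself makes):

/* conftest-style link probe: compile with
 *   cc probe.c -L$SLURM_DIR/lib -lslurm
 * A successful link is what sets OMPI_HAVE_SLURM=1 above. */
char slurm_api_version(void);   /* dummy prototype; only the symbol matters */

int main(void)
{
    return (int) slurm_api_version();
}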
src/mca/pcm/slurm/src/Makefile.am (new regular file, 11 lines)
@@ -0,0 +1,11 @@
#
# $HEADER$
#

include $(top_ompi_srcdir)/config/Makefile.options

noinst_LTLIBRARIES = libmca_pcm_slurm.la
libmca_pcm_slurm_la_SOURCES = \
        pcm_slurm.c \
        pcm_slurm_component.c \
        pcm_slurm.h
src/mca/pcm/slurm/src/pcm_slurm.c (new regular file, 262 lines)
@@ -0,0 +1,262 @@
/* -*- C -*-
 *
 * $HEADER$
 *
 */

#include "ompi_config.h"

#include <errno.h>
#include <signal.h>
#include <sys/types.h>
#include <unistd.h>

#include "pcm_slurm.h"
#include "mca/pcm/base/base_job_track.h"
#include "include/constants.h"
#include "mca/pcm/pcm.h"
#include "mca/pcm/base/base.h"
#include "event/event.h"
#include "class/ompi_list.h"
#include "mca/ns/ns.h"
#include "mca/ns/base/base.h"
#include "util/argv.h"
#include "util/numtostr.h"
#include "runtime/ompi_rte_wait.h"


static void internal_wait_cb(pid_t pid, int status, void *data);


/* ok, this is fairly simple in the SLURM world */
ompi_list_t *
mca_pcm_slurm_allocate_resources(struct mca_pcm_base_module_1_0_0_t* me,
                                 mca_ns_base_jobid_t jobid,
                                 int nodes, int procs)
{
    ompi_list_t *ret;
    ompi_rte_node_allocation_t *node_alloc;
    int total_procs;

    ret = OBJ_NEW(ompi_list_t);
    if (NULL == ret) {
        errno = ENOMEM;
        return NULL;
    }

    node_alloc = OBJ_NEW(ompi_rte_node_allocation_t);
    if (NULL == node_alloc) {
        OBJ_RELEASE(ret);
        errno = ENOMEM;
        return NULL;
    }

    /* For now, just punt on whether we can actually fulfill the request or not */
    total_procs = (nodes == 0) ? procs : nodes * procs;
    node_alloc->start =
        (int) ompi_name_server.reserve_range(jobid, total_procs);
    node_alloc->nodes = nodes;
    node_alloc->count = procs;

    ompi_list_append(ret, (ompi_list_item_t*) node_alloc);

    return ret;
}


int
mca_pcm_slurm_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me,
                          mca_ns_base_jobid_t jobid, ompi_list_t *schedlist)
{
    ompi_rte_node_allocation_t *nodes;
    ompi_rte_node_schedule_t *sched;
    char **argv = NULL;
    int argc = 0;
    char *num;
    int i;
    int ret;
    char *tmp;
    pid_t child;

    /* quick sanity check */
    if (ompi_list_get_size(schedlist) > 1) {
        /* BWB: show_help */
        printf("SLURM pcm cannot cope with multiple schedlist items at this time\n");
        return OMPI_ERROR;
    }
    sched = (ompi_rte_node_schedule_t*) ompi_list_get_first(schedlist);
    if (ompi_list_get_size(sched->nodelist) > 1) {
        /* BWB: show_help */
        printf("SLURM pcm cannot cope with multiple nodelists at this time\n");
        return OMPI_ERROR;
    }

    nodes = (ompi_rte_node_allocation_t*) ompi_list_get_first(sched->nodelist);

    /* start building up the argv array */
    ompi_argv_append(&argc, &argv, "prun");
    if (nodes->nodes > 0) {
        /* copy over the number of nodes */
        num = ompi_ltostr(nodes->nodes);
        ompi_argv_append(&argc, &argv, "-N");
        ompi_argv_append(&argc, &argv, num);
        free(num);
        /* and map to fix how they look at their num procs */
        num = ompi_ltostr(nodes->nodes * nodes->count);
        ompi_argv_append(&argc, &argv, "-n");
        ompi_argv_append(&argc, &argv, num);
        free(num);
    } else {
        num = ompi_ltostr(nodes->count);
        ompi_argv_append(&argc, &argv, "-n");
        ompi_argv_append(&argc, &argv, num);
        free(num);
    }

    /* copy over the command line arguments */
    for (i = 0 ; i < sched->argc ; ++i) {
        ompi_argv_append(&argc, &argv, (sched->argv)[i]);
    }

    /* ok, fork! */
    child = fork();
    if (child < 0) {
        /* show_help */
        printf("SLURM pcm unable to fork\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    } else if (0 == child) {
        /* set up environment */
        /* these pointers will last until we exec, so safe to putenv them in the child */
        for (i = 0 ; sched->env[i] != NULL ; ++i) {
            putenv(sched->env[i]);
        }

        /* give our starting vpid count to the other side... */
        asprintf(&tmp, "OMPI_MCA_pcmclient_slurm_start_vpid=%d",
                 nodes->start);
        putenv(tmp);

        asprintf(&tmp, "OMPI_MCA_pcmclient_slurm_jobid=%d", jobid);
        putenv(tmp);

        /* set cwd */
        ret = chdir(sched->cwd);
        if (0 != ret) {
            /* BWB show_help */
            printf("SLURM pcm can not chdir to %s\n", sched->cwd);
            exit(1);
        }

        /* go, go, go! */
        ret = execvp(argv[0], argv);
        exit(1);
    }

    /* ok, I'm the parent - stick the pids where they belong;
       the upper end of the range is start + total procs started */
    ret = mca_pcm_base_add_started_pids(jobid, child, nodes->start,
                                        nodes->start +
                                        ((nodes->nodes == 0) ?
                                         nodes->count :
                                         nodes->nodes * nodes->count));
    if (OMPI_SUCCESS != ret) {
        /* BWB show_help */
        printf("show_help: unable to record child pid\n");
        kill(child, SIGKILL);
    }
    ret = ompi_rte_wait_cb(child, internal_wait_cb, NULL);
    if (OMPI_SUCCESS != ret) {
        /* BWB - show_help */
        printf("show_help: unable to register callback\n");
        kill(child, SIGKILL);
    }

    return OMPI_SUCCESS;
}
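To make the argv construction in mca_pcm_slurm_spawn_procs() concrete, here is a standalone sketch with the OMPI argv helpers replaced by plain C. The request sizes (2 nodes, 4 procs per node) and the application name ./my_app are assumed purely for illustration; the total matches the total_procs = nodes * procs computation in mca_pcm_slurm_allocate_resources():

#include <stdio.h>

int main(void)
{
    int nodes = 2, procs = 4;                      /* assumed example request */
    char N[16], n[16];

    snprintf(N, sizeof(N), "%d", nodes);
    snprintf(n, sizeof(n), "%d", nodes * procs);   /* total procs = 8 */

    const char *argv_sketch[] = { "prun", "-N", N, "-n", n,
                                  "./my_app", NULL };  /* app argv appended last */
    for (int i = 0 ; argv_sketch[i] != NULL ; ++i) {
        printf("%s ", argv_sketch[i]);
    }
    printf("\n");   /* prints: prun -N 2 -n 8 ./my_app */
    return 0;
}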
int
mca_pcm_slurm_kill_proc(struct mca_pcm_base_module_1_0_0_t* me,
                        ompi_process_name_t *name, int flags)
{
    pid_t doomed;

    doomed = mca_pcm_base_get_started_pid(ns_base_get_jobid(name),
                                          ns_base_get_vpid(name), true);
    if (doomed > 0) {
        kill(doomed, SIGTERM);
    } else {
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}


int
mca_pcm_slurm_kill_job(struct mca_pcm_base_module_1_0_0_t* me,
                       mca_ns_base_jobid_t jobid, int flags)
{
    pid_t *doomed;
    size_t doomed_len;
    size_t i;
    int ret;

    ret = mca_pcm_base_get_started_pid_list(jobid, &doomed, &doomed_len, true);
    if (OMPI_SUCCESS != ret) return ret;

    for (i = 0 ; i < doomed_len ; ++i) {
        kill(doomed[i], SIGTERM);
    }

    if (NULL != doomed) {
        free(doomed);
    }

    return OMPI_SUCCESS;
}


int
mca_pcm_slurm_deallocate_resources(struct mca_pcm_base_module_1_0_0_t* me,
                                   mca_ns_base_jobid_t jobid,
                                   ompi_list_t *nodelist)
{
    if (nodelist != NULL) OBJ_RELEASE(nodelist);

    mca_pcm_base_remove_job(jobid);

    return OMPI_SUCCESS;
}


static void
internal_wait_cb(pid_t pid, int status, void *data)
{
    mca_ns_base_jobid_t jobid = 0;
    mca_ns_base_vpid_t upper = 0;
    mca_ns_base_vpid_t lower = 0;
    mca_ns_base_vpid_t i = 0;
    int ret;
    char *test;
    ompi_process_name_t *proc_name;

    printf("pcm_slurm was notified that process %d exited with status %d\n",
           pid, status);

    ret = mca_pcm_base_get_job_info(pid, &jobid, &lower, &upper);
    if (ret != OMPI_SUCCESS) {
        printf("Unfortunately, we could not find the associated job info\n");
    } else {
        printf(" It appears that this starter was associated with jobid %d\n"
               "    vpids %d to %d\n\n",
               jobid, lower, upper);
    }

    /* unregister all the procs */
#if 0
    /* BWB - fix me when deadlock in gpr is fixed */
    for (i = lower ; i <= upper ; ++i) {
        test = ns_base_get_proc_name_string(ns_base_create_process_name(0, jobid, i));
        ompi_registry.rte_unregister(test);
    }
#endif
}
src/mca/pcm/slurm/src/pcm_slurm.h (new regular file, 66 lines)
@@ -0,0 +1,66 @@
/* -*- C -*-
 *
 * $HEADER$
 *
 * BWB: COMPONENT TODO
 *
 *  - add process reaping code
 *  - trigger status change events on process death
 *
 */

#include "ompi_config.h"

#include "mca/pcm/pcm.h"
#include "include/types.h"
#include "class/ompi_list.h"

#include <sys/types.h>

#ifndef MCA_PCM_SLURM_H_
#define MCA_PCM_SLURM_H_

#ifdef __cplusplus
extern "C" {
#endif

    /*
     * Module open / close
     */
    int mca_pcm_slurm_component_open(void);
    int mca_pcm_slurm_component_close(void);

    /*
     * Startup / Shutdown
     */
    struct mca_pcm_base_module_1_0_0_t* mca_pcm_slurm_init(int *priority,
                                                           bool have_threads,
                                                           int constraints);
    int mca_pcm_slurm_finalize(struct mca_pcm_base_module_1_0_0_t* me);

    /*
     * Interface
     */
    ompi_list_t* mca_pcm_slurm_allocate_resources(struct mca_pcm_base_module_1_0_0_t* me,
                                                  mca_ns_base_jobid_t jobid,
                                                  int nodes, int procs);
    int mca_pcm_slurm_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me,
                                  mca_ns_base_jobid_t jobid,
                                  ompi_list_t *schedule_list);
    int mca_pcm_slurm_kill_proc(struct mca_pcm_base_module_1_0_0_t* me,
                                ompi_process_name_t *name, int flags);
    int mca_pcm_slurm_kill_job(struct mca_pcm_base_module_1_0_0_t* me,
                               mca_ns_base_jobid_t jobid, int flags);
    int mca_pcm_slurm_deallocate_resources(struct mca_pcm_base_module_1_0_0_t* me,
                                           mca_ns_base_jobid_t jobid,
                                           ompi_list_t *nodelist);

#ifdef __cplusplus
}
#endif

/*
 * Module variables
 */
extern int mca_pcm_slurm_output;

#endif /* MCA_PCM_SLURM_H_ */
src/mca/pcm/slurm/src/pcm_slurm_component.c (new regular file, 158 lines)
@@ -0,0 +1,158 @@
/* -*- C -*-
 *
 * $HEADER$
 *
 */

#include "ompi_config.h"

#include "pcm_slurm.h"
#include "include/constants.h"
#include "include/types.h"
#include "util/malloc.h"
#include "util/output.h"
#include "class/ompi_list.h"
#include "mca/mca.h"
#include "mca/base/mca_base_param.h"
#include "mca/pcm/pcm.h"
#include "mca/pcm/base/base.h"
#include "mca/llm/base/base.h"
#include "util/path.h"
#include "runtime/runtime.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

extern char **environ;

/*
 * Struct of function pointers and all that to let us be initialized
 */
mca_pcm_base_component_1_0_0_t mca_pcm_slurm_component = {
    {
        MCA_PCM_BASE_VERSION_1_0_0,

        "slurm",                       /* MCA component name */
        1,                             /* MCA component major version */
        0,                             /* MCA component minor version */
        0,                             /* MCA component release version */
        mca_pcm_slurm_component_open,  /* component open */
        mca_pcm_slurm_component_close  /* component close */
    },
    {
        false                          /* checkpoint / restart */
    },
    mca_pcm_slurm_init,                /* component init */
    NULL                               /* unique name */
};


/* need to create output stream to dump in file */
ompi_output_stream_t mca_pcm_slurm_output_stream = {
    false,          /* lds_is_debugging  BWB - change me for release */
    0,              /* lds_verbose_level */
    false,          /* lds_want_syslog */
    0,              /* lds_syslog_priority */
    NULL,           /* lds_syslog_ident */
    "pcm: slurm: ", /* lds_prefix */
    true,           /* lds_want_stdout */
    false,          /* lds_want_stderr */
    true,           /* lds_want_file */
    true,           /* lds_want_file_append */
    "pcm_slurm"     /* lds_file_suffix */
};


/*
 * Module variables handles
 */
static int mca_pcm_slurm_param_priority;
static int mca_pcm_slurm_param_debug;

/*
 * Component variables.  All of these are shared among the module
 * instances, so they don't need to go in a special structure or
 * anything.
 */
int mca_pcm_slurm_output = -1;


int
mca_pcm_slurm_component_open(void)
{
    mca_pcm_slurm_param_debug =
        mca_base_param_register_int("pcm", "slurm", "debug", NULL, 100);

    mca_pcm_slurm_param_priority =
        mca_base_param_register_int("pcm", "slurm", "priority", NULL, 5);

    mca_pcm_slurm_output = ompi_output_open(&mca_pcm_slurm_output_stream);

    return OMPI_SUCCESS;
}


int
mca_pcm_slurm_component_close(void)
{
    return OMPI_SUCCESS;
}


mca_pcm_base_module_t*
mca_pcm_slurm_init(int *priority,
                   bool have_threads,
                   int constraints)
{
    int debug;
    char *srun;
    mca_pcm_base_module_t *me;

    /* BWB - temporarily disable */
    return NULL;

    /* debugging gorp */
    mca_base_param_lookup_int(mca_pcm_slurm_param_debug, &debug);
    ompi_output_set_verbosity(mca_pcm_slurm_output, debug);

    /* get our priority - if 0, we don't run */
    mca_base_param_lookup_int(mca_pcm_slurm_param_priority, priority);
    if (0 == *priority) return NULL;

    /* check constraints */
    /* no daemon */
    if (0 != (constraints & OMPI_RTE_SPAWN_DAEMON)) return NULL;
    /* no MPI_COMM_SPAWN* */
    if (0 != (constraints & OMPI_RTE_SPAWN_FROM_MPI)) return NULL;

    srun = ompi_path_env_findv("srun", X_OK, environ, NULL);
    if (NULL == srun) return NULL;
    free(srun);

    /* ok, now let's try to fire up */
    me = malloc(sizeof(mca_pcm_base_module_t));
    if (NULL == me) return NULL;

    me->pcm_allocate_resources = mca_pcm_slurm_allocate_resources;
    me->pcm_spawn_procs = mca_pcm_slurm_spawn_procs;
    me->pcm_kill_proc = mca_pcm_slurm_kill_proc;
    me->pcm_kill_job = mca_pcm_slurm_kill_job;
    me->pcm_deallocate_resources = mca_pcm_slurm_deallocate_resources;
    me->pcm_finalize = mca_pcm_slurm_finalize;

    return me;
}


int
mca_pcm_slurm_finalize(struct mca_pcm_base_module_1_0_0_t* me)
{
    if (mca_pcm_slurm_output > 0) {
        ompi_output_close(mca_pcm_slurm_output);
    }

    return OMPI_SUCCESS;
}
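The parameters registered in mca_pcm_slurm_component_open() follow OMPI's usual OMPI_MCA_<framework>_<component>_<param> environment naming - the same convention the spawn code relies on when it putenv()s OMPI_MCA_pcmclient_slurm_start_vpid into the child. An illustrative-only sketch (not part of this commit; once the component stops returning NULL early, a priority of 0 deselects it per the check in mca_pcm_slurm_init()):

#include <stdlib.h>

int main(void)
{
    /* force the slurm pcm out of the selection race before starting an
       OMPI process; "priority" matches the name registered in _open() */
    setenv("OMPI_MCA_pcm_slurm_priority", "0", 1);
    return 0;
}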
src/mca/pcmclient/slurm/Makefile.am (new regular file, 37 lines)
@@ -0,0 +1,37 @@
#
# $HEADER$
#

# Use the top-level Makefile.options

include $(top_ompi_srcdir)/config/Makefile.options

EXTRA_DIST = VERSION

sources = \
        pcmclient_slurm.h \
        pcmclient_slurm.c \
        pcmclient_slurm_component.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if OMPI_BUILD_pcmclient_slurm_DSO
component_noinst =
component_install = mca_pcmclient_slurm.la
else
component_noinst = libmca_pcmclient_slurm.la
component_install =
endif

mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_pcmclient_slurm_la_SOURCES = $(sources)
mca_pcmclient_slurm_la_LIBADD =
mca_pcmclient_slurm_la_LDFLAGS = -module -avoid-version

noinst_LTLIBRARIES = $(component_noinst)
libmca_pcmclient_slurm_la_SOURCES = $(sources)
libmca_pcmclient_slurm_la_LIBADD =
libmca_pcmclient_slurm_la_LDFLAGS = -module -avoid-version
src/mca/pcmclient/slurm/VERSION (new regular file, 6 lines)
@@ -0,0 +1,6 @@
major=1
minor=0
release=0
alpha=0
beta=0
svn=1
src/mca/pcmclient/slurm/configure.params (new regular file, 9 lines)
@@ -0,0 +1,9 @@
# -*- shell-script -*-
#
# $HEADER$
#

# Specific to this module

PARAM_INIT_FILE=pcmclient_slurm_component.c
PARAM_CONFIG_FILES="Makefile"
src/mca/pcmclient/slurm/pcmclient_slurm.c (new regular file, 40 lines)
@@ -0,0 +1,40 @@
/*
 * $HEADER$
 */

#include "ompi_config.h"

#include "mca/pcmclient/pcmclient.h"
#include "mca/pcmclient/slurm/pcmclient_slurm.h"
#include "include/types.h"
#include "include/constants.h"

#include <stdio.h>
#include <sys/types.h>
#include <string.h>
#include <unistd.h>

extern int mca_pcmclient_slurm_num_procs;
extern int mca_pcmclient_slurm_procid;
extern ompi_process_name_t *mca_pcmclient_slurm_procs;


int
mca_pcmclient_slurm_get_peers(ompi_process_name_t **procs,
                              size_t *num_procs)
{
    if (NULL == mca_pcmclient_slurm_procs) return OMPI_ERROR;

    *num_procs = mca_pcmclient_slurm_num_procs;
    *procs = mca_pcmclient_slurm_procs;

    return OMPI_SUCCESS;
}


ompi_process_name_t*
mca_pcmclient_slurm_get_self(void)
{
    if (NULL == mca_pcmclient_slurm_procs) return NULL;

    return &mca_pcmclient_slurm_procs[mca_pcmclient_slurm_procid];
}
src/mca/pcmclient/slurm/pcmclient_slurm.h (new regular file, 29 lines)
@@ -0,0 +1,29 @@
/* -*- C -*-
 *
 * $HEADER$
 *
 */
#include "ompi_config.h"

#include "mca/pcmclient/pcmclient.h"
#include "include/types.h"

/*
 * Module open / close
 */
int mca_pcmclient_slurm_open(void);
int mca_pcmclient_slurm_close(void);

/*
 * Startup / Shutdown
 */
struct mca_pcmclient_base_module_1_0_0_t*
mca_pcmclient_slurm_init(int *priority,
                         bool *allow_multi_user_threads,
                         bool *have_hidden_threads);
int mca_pcmclient_slurm_finalize(void);


/*
 * "Action" functions
 */
int mca_pcmclient_slurm_get_peers(ompi_process_name_t **peers, size_t *npeers);
ompi_process_name_t* mca_pcmclient_slurm_get_self(void);
src/mca/pcmclient/slurm/pcmclient_slurm_component.c (new regular file, 147 lines)
@@ -0,0 +1,147 @@
/*
 * $HEADER$
 */


#include "ompi_config.h"
#include "pcmclient-slurm-version.h"

#include "include/constants.h"
#include "include/types.h"
#include "mca/mca.h"
#include "mca/pcmclient/pcmclient.h"
#include "mca/pcmclient/slurm/pcmclient_slurm.h"
#include "util/proc_info.h"
#include "mca/base/mca_base_param.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Struct of function pointers and all that to let us be initialized
 */
mca_pcmclient_base_component_1_0_0_t mca_pcmclient_slurm_component = {
    {
        MCA_PCMCLIENT_BASE_VERSION_1_0_0,

        "slurm",                             /* MCA component name */
        MCA_pcmclient_slurm_MAJOR_VERSION,   /* MCA component major version */
        MCA_pcmclient_slurm_MINOR_VERSION,   /* MCA component minor version */
        MCA_pcmclient_slurm_RELEASE_VERSION, /* MCA component release version */
        mca_pcmclient_slurm_open,            /* component open */
        mca_pcmclient_slurm_close            /* component close */
    },
    {
        false                                /* checkpoint / restart */
    },
    mca_pcmclient_slurm_init,                /* component init */
    mca_pcmclient_slurm_finalize
};


struct mca_pcmclient_base_module_1_0_0_t mca_pcmclient_slurm_1_0_0 = {
    mca_pcmclient_slurm_get_self,
    mca_pcmclient_slurm_get_peers,
};


/*
 * component-global variables
 */
int mca_pcmclient_slurm_num_procs;
int mca_pcmclient_slurm_procid;

ompi_process_name_t *mca_pcmclient_slurm_procs = NULL;

/*
 * local variables
 */
static int slurm_jobid_handle;
static int slurm_start_vpid_handle;
static int slurm_cellid_handle;

int mca_pcmclient_slurm_cellid;
int mca_pcmclient_slurm_jobid;

int
mca_pcmclient_slurm_open(void)
{
    slurm_jobid_handle =
        mca_base_param_register_int("pcmclient", "slurm", "jobid", NULL, -1);
    slurm_start_vpid_handle =
        mca_base_param_register_int("pcmclient", "slurm", "start_vpid", NULL, 0);
    slurm_cellid_handle =
        mca_base_param_register_int("pcmclient", "slurm", "cellid", NULL, 0);

    return OMPI_SUCCESS;
}


int
mca_pcmclient_slurm_close(void)
{
    return OMPI_SUCCESS;
}


struct mca_pcmclient_base_module_1_0_0_t *
mca_pcmclient_slurm_init(int *priority,
                         bool *allow_multiple_user_threads,
                         bool *have_hidden_threads)
{
    int i;
    char *tmp;
    int start_vpid;

    *priority = 5;  /* make sure we are above env / singleton */
    *allow_multiple_user_threads = true;
    *have_hidden_threads = false;

    mca_base_param_lookup_int(slurm_jobid_handle, &mca_pcmclient_slurm_jobid);
    mca_base_param_lookup_int(slurm_cellid_handle, &mca_pcmclient_slurm_cellid);
    mca_base_param_lookup_int(slurm_start_vpid_handle, &start_vpid);

    if (mca_pcmclient_slurm_jobid < 0) {
        tmp = getenv("SLURM_JOBID");
        if (NULL == tmp) return NULL;
        mca_pcmclient_slurm_jobid = atoi(tmp);
    }

    tmp = getenv("SLURM_PROCID");
    if (NULL == tmp) return NULL;
    mca_pcmclient_slurm_procid = atoi(tmp);

    tmp = getenv("SLURM_NPROCS");
    if (NULL == tmp) return NULL;
    mca_pcmclient_slurm_num_procs = atoi(tmp);

    mca_pcmclient_slurm_procs =
        (ompi_process_name_t*) malloc(sizeof(ompi_process_name_t) *
                                      mca_pcmclient_slurm_num_procs);
    if (NULL == mca_pcmclient_slurm_procs) return NULL;

    for (i = 0 ; i < mca_pcmclient_slurm_num_procs ; ++i) {
        mca_pcmclient_slurm_procs[i].cellid = mca_pcmclient_slurm_cellid;
        mca_pcmclient_slurm_procs[i].jobid = mca_pcmclient_slurm_jobid;
        mca_pcmclient_slurm_procs[i].vpid = start_vpid + i;
    }

    return &mca_pcmclient_slurm_1_0_0;
}


int
mca_pcmclient_slurm_finalize(void)
{
    if (NULL != mca_pcmclient_slurm_procs) {
        free(mca_pcmclient_slurm_procs);
        mca_pcmclient_slurm_procs = NULL;
        mca_pcmclient_slurm_num_procs = 0;
    }

    return OMPI_SUCCESS;
}
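Putting the two halves together: the pcm side reserves a vpid range and exports it via environment variables, and mca_pcmclient_slurm_init() above combines that with SLURM's own per-task variables to compute every process name locally, with no communication. A standalone sketch of that computation (plain ints stand in for the ompi_process_name_t fields; run under the launcher so the SLURM_* variables are set):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* set by SLURM for every launched task */
    const char *procid_s = getenv("SLURM_PROCID");
    const char *nprocs_s = getenv("SLURM_NPROCS");
    const char *jobid_s  = getenv("SLURM_JOBID");
    if (NULL == procid_s || NULL == nprocs_s || NULL == jobid_s) return 1;

    /* normally delivered via the OMPI_MCA_pcmclient_slurm_* parameters
       that the pcm side putenv()s before exec'ing the launcher */
    const char *vpid_s = getenv("OMPI_MCA_pcmclient_slurm_start_vpid");
    int start_vpid = (NULL != vpid_s) ? atoi(vpid_s) : 0; /* 0 = registered default */

    int jobid  = atoi(jobid_s);
    int procid = atoi(procid_s);
    int nprocs = atoi(nprocs_s);

    /* my name - and, by the same formula, any peer's: vpid = start_vpid + rank */
    printf("(cellid 0, jobid %d, vpid %d), one of %d procs\n",
           jobid, start_vpid + procid, nprocs);
    return 0;
}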