2005-10-02 14:57:44 +00:00
|
|
|
/* -*- C -*-
|
2006-02-13 15:28:29 +00:00
|
|
|
*
|
2005-11-05 19:57:48 +00:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2006-08-23 03:32:36 +00:00
|
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
2005-11-05 19:57:48 +00:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2006-02-13 15:28:29 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-10-02 14:57:44 +00:00
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
2006-02-13 15:28:29 +00:00
|
|
|
*
|
2005-10-02 14:57:44 +00:00
|
|
|
* Additional copyrights may follow
|
2006-02-13 15:28:29 +00:00
|
|
|
*
|
2005-10-02 14:57:44 +00:00
|
|
|
* $HEADER$
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
/**
|
|
|
|
* @file:
|
|
|
|
* Part of the bproc launcher. See pls_bproc.h for an overview of how it works.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
2005-12-31 12:35:24 +00:00
|
|
|
#if HAVE_SYS_TYPES_H
|
2005-10-02 14:57:44 +00:00
|
|
|
#include <sys/types.h>
|
2005-12-31 12:35:24 +00:00
|
|
|
#endif /* HAVE_SYS_TYPES_H */
|
|
|
|
#ifdef HAVE_SYS_STAT_H
|
2005-10-02 14:57:44 +00:00
|
|
|
#include <sys/stat.h>
|
2005-12-31 12:35:24 +00:00
|
|
|
#endif /* HAVE_SYS_STAT_H */
|
|
|
|
#ifdef HAVE_UNISTD_H
|
2005-10-02 14:57:44 +00:00
|
|
|
#include <unistd.h>
|
2005-12-31 12:35:24 +00:00
|
|
|
#endif /* HAVE_UNISTD_H */
|
|
|
|
#include <errno.h>
|
2005-10-02 14:57:44 +00:00
|
|
|
#include <signal.h>
|
2005-12-31 12:35:24 +00:00
|
|
|
#ifdef HAVE_FCNTL_H
|
2005-10-02 14:57:44 +00:00
|
|
|
#include <fcntl.h>
|
2005-12-31 12:35:24 +00:00
|
|
|
#endif /* HAVE_FCNTL_H */
|
|
|
|
#ifdef HAVE_STRING_H
|
2005-10-02 14:57:44 +00:00
|
|
|
#include <string.h>
|
2005-12-31 12:35:24 +00:00
|
|
|
#endif /* HAVE_STRING_H */
|
2005-10-02 14:57:44 +00:00
|
|
|
|
2006-03-14 14:40:52 +00:00
|
|
|
#include "opal/install_dirs.h"
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
#include "opal/class/opal_list.h"
|
2005-10-02 14:57:44 +00:00
|
|
|
#include "opal/event/event.h"
|
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "opal/util/argv.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
#include "opal/util/path.h"
|
2006-09-14 21:29:51 +00:00
|
|
|
#include "opal/util/os_path.h"
|
2005-10-02 14:57:44 +00:00
|
|
|
#include "opal/util/show_help.h"
|
2006-10-02 14:58:22 +00:00
|
|
|
#include "opal/util/trace.h"
|
2006-02-13 15:28:29 +00:00
|
|
|
|
|
|
|
#include "orte/dss/dss.h"
|
2005-10-02 14:57:44 +00:00
|
|
|
#include "orte/util/sys_info.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/mca/iof/iof.h"
|
2006-08-03 05:29:49 +00:00
|
|
|
#include "orte/mca/gpr/gpr.h"
|
2006-09-14 21:29:51 +00:00
|
|
|
#include "orte/mca/ns/ns.h"
|
2005-10-02 14:57:44 +00:00
|
|
|
#include "orte/mca/sds/base/base.h"
|
|
|
|
#include "orte/mca/oob/base/base.h"
|
2006-10-07 15:45:24 +00:00
|
|
|
#include "orte/mca/ras/ras.h"
|
2006-09-14 21:29:51 +00:00
|
|
|
#include "orte/mca/rmgr/rmgr.h"
|
2006-10-07 15:45:24 +00:00
|
|
|
#include "orte/mca/rmaps/rmaps.h"
|
2005-10-02 14:57:44 +00:00
|
|
|
#include "orte/mca/rml/rml.h"
|
2006-11-22 20:23:17 +00:00
|
|
|
#include "orte/mca/schema/schema_types.h"
|
2006-08-16 16:35:09 +00:00
|
|
|
#include "orte/mca/smr/smr.h"
|
2005-10-02 14:57:44 +00:00
|
|
|
#include "orte/runtime/orte_wait.h"
|
|
|
|
#include "orte/runtime/runtime.h"
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
#include "orte/mca/pls/base/pls_private.h"
|
2005-10-02 14:57:44 +00:00
|
|
|
#include "pls_bproc.h"
|
|
|
|
|
|
|
|
/**
 * The environment of the current process.
 */
extern char **environ;

/* Cleared at the start of a daemon launch; the daemon waitpid callback
 * checks it to tell whether a daemon exited while launching was still
 * in progress. */
static bool daemons_launched;

/* Not referenced in the visible part of this file.
 * NOTE(review): presumably selects by-node process placement - confirm. */
static bool bynode;

/* Alternate launch entry point; it is installed in the module table below
 * under the same preprocessor condition. */
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
int orte_pls_bproc_launch_threaded(orte_jobid_t);
#endif
|
2006-06-08 18:27:17 +00:00
|
|
|
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/**
|
|
|
|
* Initialization of the bproc module with all the needed function pointers
|
|
|
|
*/
|
|
|
|
orte_pls_base_module_t orte_pls_bproc_module = {
|
2006-03-16 00:06:48 +00:00
|
|
|
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
|
|
|
|
orte_pls_bproc_launch_threaded,
|
|
|
|
#else
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_pls_bproc_launch,
|
2006-03-16 00:06:48 +00:00
|
|
|
#endif
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_pls_bproc_terminate_job,
|
2006-09-14 21:29:51 +00:00
|
|
|
orte_pls_bproc_terminate_orteds,
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_pls_bproc_terminate_proc,
|
2006-06-08 18:27:17 +00:00
|
|
|
orte_pls_bproc_signal_job,
|
|
|
|
orte_pls_bproc_signal_proc,
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_pls_bproc_finalize
|
|
|
|
};
|
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
static int orte_pls_bproc_node_list(orte_job_map_t *map,
|
|
|
|
int *node_array, int * num_nodes,
|
2005-10-02 14:57:44 +00:00
|
|
|
int num_procs);
|
2006-02-13 15:28:29 +00:00
|
|
|
static int orte_pls_bproc_setup_io(orte_jobid_t jobid, struct bproc_io_t * io,
|
2005-10-02 14:57:44 +00:00
|
|
|
int node_rank, int app_context);
|
|
|
|
static void orte_pls_bproc_waitpid_cb(pid_t wpid, int status, void *data);
|
|
|
|
static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data);
|
|
|
|
#ifdef MCA_pls_bproc_scyld
|
|
|
|
/* compatibility functions for scyld bproc and pre 3.2.0 LANL bproc */
|
2006-02-13 15:28:29 +00:00
|
|
|
static int bproc_vexecmove_io(int nnodes, int *nodes, int *pids,
|
2005-10-02 14:57:44 +00:00
|
|
|
struct bproc_io_t *io, int iolen, const char *cmd,
|
|
|
|
char * const argv[], char * envp[]);
|
2006-02-13 15:28:29 +00:00
|
|
|
static int bproc_vexecmove(int nnodes, int *nodes, int *pids, const char *cmd,
|
2005-10-02 14:57:44 +00:00
|
|
|
char * const argv[], char * envp[]);
|
|
|
|
#endif
|
2006-02-13 21:08:35 +00:00
|
|
|
static void orte_pls_bproc_setup_env(char *** env);
|
2006-12-15 02:34:14 +00:00
|
|
|
static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp);
|
|
|
|
static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
|
|
|
orte_vpid_t vpid_start, int app_context);
|
2005-10-02 14:57:44 +00:00
|
|
|
|
|
|
|
/**
|
2006-12-15 02:34:14 +00:00
|
|
|
* Creates a list of nodes from a job map that should participate in the next launch cycle.
|
|
|
|
* @param map a pointer to the job map
|
|
|
|
* @param node_array a pointer to an integer array that will contain the node names
|
|
|
|
* @param num_nodes a pointer to the place where we will store the number of nodes in the array
|
|
|
|
* @param num_procs the number of processes that a node must have to be placed on the list
|
2005-10-02 14:57:44 +00:00
|
|
|
*/
|
2006-12-15 02:34:14 +00:00
|
|
|
static int orte_pls_bproc_node_list(orte_job_map_t *map, int *node_array, int *num_nodes, int num_procs)
|
|
|
|
{
|
|
|
|
opal_list_item_t *item;
|
|
|
|
orte_mapped_node_t *node;
|
2006-10-02 14:58:22 +00:00
|
|
|
|
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* initialize all */
|
2005-10-02 14:57:44 +00:00
|
|
|
*num_nodes = 0;
|
2006-12-15 02:34:14 +00:00
|
|
|
memset((void*)node_array, -1, sizeof(int) * map->num_nodes);
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/* build the node list */
|
2006-12-15 02:34:14 +00:00
|
|
|
for(item = opal_list_get_first(&map->nodes);
|
|
|
|
item != opal_list_get_end(&map->nodes);
|
|
|
|
item = opal_list_get_next(item)) {
|
|
|
|
node = (orte_mapped_node_t*)item;
|
|
|
|
|
|
|
|
if (node->num_procs >= num_procs) {
|
|
|
|
node_array[(*num_nodes)++] = atoi(node->nodename);
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Sets up the bproc io structs for the specified rank on the nodes
|
|
|
|
*
|
|
|
|
* @param jobid
|
|
|
|
* @param io A pointer to an array of 3 bproc_io_t structs
|
|
|
|
* @param node_rank the rank on the node we are setting up the structs for
|
|
|
|
* @param app_context the application context number
|
|
|
|
* @retval ORTE_SUCCESS
|
|
|
|
* @retval error
|
|
|
|
*/
|
2006-02-13 15:28:29 +00:00
|
|
|
static int orte_pls_bproc_setup_io(orte_jobid_t jobid, struct bproc_io_t * io,
|
2005-10-02 14:57:44 +00:00
|
|
|
int node_rank, int app_context) {
|
|
|
|
char *frontend = NULL, *path = NULL, *job = NULL;
|
|
|
|
int rc, i;
|
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/* ensure that system info is set */
|
|
|
|
orte_sys_info();
|
|
|
|
if (NULL == orte_system_info.user) { /* error condition */
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
if (NULL == orte_universe_info.name) { /* error condition */
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
rc = orte_ns.convert_jobid_to_string(&job, jobid);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
/* build the directory tree the io files will be in */
|
2006-09-14 21:29:51 +00:00
|
|
|
if (0 > asprintf(&frontend, OPAL_PATH_SEP"tmp"OPAL_PATH_SEP"openmpi-bproc-%s"OPAL_PATH_SEP"%s"OPAL_PATH_SEP"%s-%d"OPAL_PATH_SEP"%d",
|
2006-08-24 16:20:32 +00:00
|
|
|
orte_system_info.user, orte_universe_info.name, job,
|
|
|
|
app_context, node_rank)) {
|
2005-10-02 14:57:44 +00:00
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
for(i = 0; i < 3; i++) {
|
2006-08-24 16:20:32 +00:00
|
|
|
if(0 > asprintf(&path, "%s"OPAL_PATH_SEP"%d", frontend, i)) {
|
2005-10-02 14:57:44 +00:00
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
if (mca_pls_bproc_component.debug) {
|
|
|
|
opal_output(0, "mpirun bproc io setup. Path: %s\n", path);
|
|
|
|
}
|
|
|
|
io[i].fd = i;
|
|
|
|
io[i].type = BPROC_IO_FILE;
|
|
|
|
#if defined BPROC_API_VERSION && BPROC_API_VERSION >= 4
|
|
|
|
io[i].flags = 0;
|
|
|
|
#else
|
|
|
|
io[i].send_info = 0;
|
2006-02-13 15:28:29 +00:00
|
|
|
#endif
|
2005-10-02 14:57:44 +00:00
|
|
|
if(0 == i) {
|
|
|
|
io[i].d.file.flags = O_RDONLY;
|
|
|
|
} else {
|
|
|
|
io[i].d.file.flags = O_WRONLY;
|
|
|
|
}
|
|
|
|
io[i].d.file.offset = 0;
|
|
|
|
io[i].d.file.mode = 0;
|
2006-02-13 15:28:29 +00:00
|
|
|
strncpy(io[i].d.file.name, path, 256);
|
2005-10-02 14:57:44 +00:00
|
|
|
free(path);
|
2006-02-13 15:28:29 +00:00
|
|
|
}
|
2005-10-02 14:57:44 +00:00
|
|
|
|
|
|
|
cleanup:
|
|
|
|
if (NULL != frontend) {
|
|
|
|
free(frontend);
|
|
|
|
}
|
|
|
|
if (NULL != job) {
|
|
|
|
free(job);
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2006-09-14 21:29:51 +00:00
|
|
|
* Callback for orte_wait_cb. This function ONLY gets called for
|
|
|
|
* normal termination, or termination caused by a signal. If the
|
|
|
|
* process abnormally terminates by other than a signal, we go through
|
|
|
|
* another function so it can tell us that it was abnormal.
|
|
|
|
* Bproc doesn't really let us do it through here.
|
2005-10-02 14:57:44 +00:00
|
|
|
* @param wpid the process's pid
|
|
|
|
* @param status tells why the process died
|
|
|
|
* @param data a pointer to the process's name
|
|
|
|
*/
|
|
|
|
static void orte_pls_bproc_waitpid_cb(pid_t wpid, int status, void *data) {
|
|
|
|
orte_process_name_t * proc = (orte_process_name_t*) data;
|
|
|
|
int rc;
|
2006-09-14 21:29:51 +00:00
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/* set the state of this process */
|
|
|
|
if(WIFEXITED(status)) {
|
2006-09-14 21:29:51 +00:00
|
|
|
rc = orte_smr.set_proc_state(proc, ORTE_PROC_STATE_TERMINATED, status);
|
2005-10-02 14:57:44 +00:00
|
|
|
} else {
|
2006-08-16 16:35:09 +00:00
|
|
|
rc = orte_smr.set_proc_state(proc, ORTE_PROC_STATE_ABORTED, status);
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
free(proc);
|
|
|
|
}
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/**
|
|
|
|
* Callback for orte_wait_cb for the daemons. If a daemon unexpectedly dies
|
2006-02-13 15:28:29 +00:00
|
|
|
* before we are done launching, we abort the job.
|
2005-10-02 14:57:44 +00:00
|
|
|
* @param wpid the daemons's pid
|
|
|
|
* @param status tells why the daemon died
|
|
|
|
* @param data a pointer to the node the daemon was on
|
|
|
|
*/
|
|
|
|
static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data) {
|
2006-10-02 14:58:22 +00:00
|
|
|
|
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
if(!daemons_launched) {
|
2005-10-02 14:57:44 +00:00
|
|
|
/* if a daemon exits before we are done launching the user apps we send a
|
2006-09-14 21:29:51 +00:00
|
|
|
* message to ourself so we will break out of the receive loop and exit */
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_buffer_t ack;
|
|
|
|
int rc;
|
|
|
|
int src[4] = {-1, -1};
|
|
|
|
src[2] = wpid;
|
|
|
|
src[3] = *(int*)data;
|
|
|
|
if(WIFSIGNALED(status)) {
|
|
|
|
src[1] = WTERMSIG(status);
|
|
|
|
}
|
|
|
|
OBJ_CONSTRUCT(&ack, orte_buffer_t);
|
2006-02-13 15:28:29 +00:00
|
|
|
rc = orte_dss.pack(&ack, &src, 4, ORTE_INT);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
rc = mca_oob_send_packed(ORTE_PROC_MY_NAME, &ack, ORTE_RML_TAG_BPROC, 0);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(0 > rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
OPAL_THREAD_LOCK(&mca_pls_bproc_component.lock);
|
|
|
|
if(0 < mca_pls_bproc_component.num_daemons) {
|
|
|
|
mca_pls_bproc_component.num_daemons--;
|
|
|
|
}
|
|
|
|
opal_condition_signal(&mca_pls_bproc_component.condition);
|
|
|
|
OPAL_THREAD_UNLOCK(&mca_pls_bproc_component.lock);
|
|
|
|
if(0 < mca_pls_bproc_component.debug) {
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_output(0, "in orte_pls_bproc_waitpid_daemon_cb, %d daemons left\n",
|
2005-10-02 14:57:44 +00:00
|
|
|
mca_pls_bproc_component.num_daemons);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef MCA_pls_bproc_scyld
|
2006-02-13 15:28:29 +00:00
|
|
|
/**
|
2005-10-02 14:57:44 +00:00
|
|
|
* compatibility function for scyld bproc and pre 3.2.0 LANL bproc. See the
|
|
|
|
* bproc documentation for details
|
|
|
|
*/
|
2006-02-13 15:28:29 +00:00
|
|
|
static int bproc_vexecmove_io(int nnodes, int *nodes, int *pids,
|
2005-10-02 14:57:44 +00:00
|
|
|
struct bproc_io_t *io, int iolen, const char *cmd,
|
|
|
|
char * const argv[], char * envp[]) {
|
|
|
|
int i;
|
|
|
|
char * rank;
|
2006-10-02 14:58:22 +00:00
|
|
|
|
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
for(i = 0; i < nnodes; i++) {
|
|
|
|
pids[i] = fork();
|
|
|
|
if(0 == pids[i]) {
|
|
|
|
/* set BPROC_RANK so the proc can get its name */
|
|
|
|
if (0 > asprintf(&rank, "%d", i)) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
opal_setenv("BPROC_RANK", rank, true, &envp);
|
|
|
|
bproc_execmove_io(nodes[i], io, iolen, cmd, argv, envp);
|
|
|
|
/* if we get here, there was an error */
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_show_help("help-pls-bproc.txt", "bproc-vexecmove-launch", true,
|
2005-10-02 14:57:44 +00:00
|
|
|
cmd, nodes[i], errno);
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
|
|
|
exit(-1);
|
|
|
|
} else if(-1 == pids[i]) {
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_show_help("help-pls-bproc.txt", "bproc-vexecmove-fork", true,
|
2005-10-02 14:57:44 +00:00
|
|
|
errno);
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nnodes;
|
|
|
|
}
|
|
|
|
|
2006-02-13 15:28:29 +00:00
|
|
|
/**
 * compatibility function for scyld bproc and pre 3.2.0 LANL bproc. See the
 * bproc documentation for details
 *
 * Same as bproc_vexecmove_io() but without any io setup: a NULL io array
 * of length 0 is passed through.
 *
 * @return whatever bproc_vexecmove_io() returns
 */
static int bproc_vexecmove(int nnodes, int *nodes, int *pids, const char *cmd,
                           char * const argv[], char * envp[]) {
    struct bproc_io_t *no_io = NULL;
    int launched;

    launched = bproc_vexecmove_io(nnodes, nodes, pids, no_io, 0, cmd, argv, envp);
    return launched;
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Sets up the passed environment for processes launched by the bproc launcher.
|
|
|
|
* @param env a pointer to the environment to setup
|
|
|
|
*/
|
2006-02-13 21:08:35 +00:00
|
|
|
static void orte_pls_bproc_setup_env(char *** env)
|
2005-10-02 14:57:44 +00:00
|
|
|
{
|
|
|
|
char ** merged;
|
|
|
|
char * var;
|
2006-02-16 00:16:22 +00:00
|
|
|
char * param;
|
2005-10-02 14:57:44 +00:00
|
|
|
int rc;
|
2006-02-13 21:08:35 +00:00
|
|
|
int num_env;
|
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-02-16 20:40:23 +00:00
|
|
|
num_env = opal_argv_count(*env);
|
2005-10-02 14:57:44 +00:00
|
|
|
/* append mca parameters to our environment */
|
2006-02-13 21:08:35 +00:00
|
|
|
if(ORTE_SUCCESS != (rc = mca_base_param_build_env(env, &num_env, false))) {
|
2005-10-02 14:57:44 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ns replica contact info */
|
|
|
|
if(NULL == orte_process_info.ns_replica) {
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
orte_dss.copy((void**)&orte_process_info.ns_replica, orte_process_info.my_name, ORTE_NAME);
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_process_info.ns_replica_uri = orte_rml.get_uri();
|
|
|
|
}
|
|
|
|
var = mca_base_param_environ_variable("ns","replica","uri");
|
|
|
|
opal_setenv(var,orte_process_info.ns_replica_uri, true, env);
|
|
|
|
free(var);
|
|
|
|
|
|
|
|
/* make sure the username used to create the bproc directory is the same on
|
|
|
|
* the backend as the frontend */
|
|
|
|
var = mca_base_param_environ_variable("pls","bproc","username");
|
|
|
|
opal_setenv(var, orte_system_info.user, true, env);
|
|
|
|
free(var);
|
|
|
|
|
|
|
|
/* gpr replica contact info */
|
|
|
|
if(NULL == orte_process_info.gpr_replica) {
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
orte_dss.copy((void**)&orte_process_info.gpr_replica, orte_process_info.my_name, ORTE_NAME);
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_process_info.gpr_replica_uri = orte_rml.get_uri();
|
|
|
|
}
|
|
|
|
var = mca_base_param_environ_variable("gpr","replica","uri");
|
|
|
|
opal_setenv(var,orte_process_info.gpr_replica_uri, true, env);
|
|
|
|
free(var);
|
|
|
|
|
2006-02-16 00:16:22 +00:00
|
|
|
/* universe directory - needs to match orted */
|
|
|
|
var = mca_base_param_environ_variable("universe", NULL, NULL);
|
|
|
|
asprintf(¶m, "%s@%s:%s", orte_universe_info.uid,
|
|
|
|
orte_universe_info.host, orte_universe_info.name);
|
|
|
|
opal_setenv(var, param, true, env);
|
|
|
|
free(param);
|
|
|
|
free(var);
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* merge in environment - merge ensures we don't overwrite anything we just set */
|
2005-10-02 14:57:44 +00:00
|
|
|
merged = opal_environ_merge(*env, environ);
|
|
|
|
opal_argv_free(*env);
|
|
|
|
*env = merged;
|
2006-12-15 02:34:14 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/* make sure hostname doesn't get pushed to backend node */
|
|
|
|
opal_unsetenv("HOSTNAME", env);
|
2006-12-15 02:34:14 +00:00
|
|
|
|
|
|
|
/* make sure the frontend hostname does not get pushed out to the backend */
|
|
|
|
var = mca_base_param_environ_variable("orte", "base", "nodename");
|
|
|
|
opal_unsetenv(var, env);
|
|
|
|
free(var);
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Launches the daemons
|
|
|
|
* @param cellid the cellid of the job
|
|
|
|
* @param envp a pointer to the environment to use for the daemons
|
|
|
|
* @param node_arrays an array that holds the node arrays for each app context
|
|
|
|
* @param node_array_lens an array of lengths of the node arrays
|
|
|
|
* @param num_contexts the number of application contexts
|
|
|
|
* @param num_procs the number of processes in the job
|
|
|
|
* @param global_vpid_start the starting vpid for the user's processes
|
|
|
|
* @param jobid the jobid for the user processes
|
|
|
|
* @retval ORTE_SUCCESS
|
2006-02-13 15:28:29 +00:00
|
|
|
* @retval error
|
2005-10-02 14:57:44 +00:00
|
|
|
*/
|
2006-12-15 02:34:14 +00:00
|
|
|
static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
2005-10-02 14:57:44 +00:00
|
|
|
int * daemon_list = NULL;
|
|
|
|
int num_daemons = 0;
|
2006-10-07 15:45:24 +00:00
|
|
|
int rc, i;
|
2005-10-02 14:57:44 +00:00
|
|
|
int * pids = NULL;
|
|
|
|
int argc;
|
|
|
|
char ** argv = NULL;
|
|
|
|
char * param;
|
2006-08-11 19:41:33 +00:00
|
|
|
char * var;
|
|
|
|
int stride;
|
2005-10-02 14:57:44 +00:00
|
|
|
char * orted_path;
|
2006-12-15 02:34:14 +00:00
|
|
|
orte_vpid_t daemon_vpid_start;
|
2006-08-15 19:54:10 +00:00
|
|
|
orte_std_cntr_t idx;
|
2005-10-02 14:57:44 +00:00
|
|
|
struct stat buf;
|
2006-09-14 21:29:51 +00:00
|
|
|
opal_list_t daemons;
|
|
|
|
orte_pls_daemon_info_t *dmn;
|
|
|
|
opal_list_item_t *item;
|
Bring the timing instrumentation to the trunk.
If you want to look at our launch and MPI process startup times, you can do so with two MCA params:
OMPI_MCA_orte_timing: set it to anything non-zero and you will get the launch time for different steps in the job launch procedure. The degree of detail depends on the launch environment. rsh will provide you with the average, min, and max launch time for the daemons. SLURM block launches the daemon, so you only get the time to launch the daemons and the total time to launch the job. Ditto for bproc. TM looks more like rsh. Only those four environments are currently supported - anyone interested in extending this capability to other environs is welcome to do so. In all cases, you also get the time to setup the job for launch.
OMPI_MCA_ompi_timing: set it to anything non-zero and you will get the time for mpi_init to reach the compound registry command, the time to execute that command, the time to go from our stage1 barrier to the stage2 barrier, and the time to go from the stage2 barrier to the end of mpi_init. This will be output for each process, so you'll have to compile any statistics on your own. Note: if someone develops a nice parser to do so, it would be really appreciated if you could/would share!
This commit was SVN r12302.
2006-10-25 15:27:47 +00:00
|
|
|
struct timeval joblaunchstart, launchstart, launchstop;
|
2006-09-14 21:29:51 +00:00
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
if (orte_pls_base.timing) {
|
Bring the timing instrumentation to the trunk.
If you want to look at our launch and MPI process startup times, you can do so with two MCA params:
OMPI_MCA_orte_timing: set it to anything non-zero and you will get the launch time for different steps in the job launch procedure. The degree of detail depends on the launch environment. rsh will provide you with the average, min, and max launch time for the daemons. SLURM block launches the daemon, so you only get the time to launch the daemons and the total time to launch the job. Ditto for bproc. TM looks more like rsh. Only those four environments are currently supported - anyone interested in extending this capability to other environs is welcome to do so. In all cases, you also get the time to setup the job for launch.
OMPI_MCA_ompi_timing: set it to anything non-zero and you will get the time for mpi_init to reach the compound registry command, the time to execute that command, the time to go from our stage1 barrier to the stage2 barrier, and the time to go from the stage2 barrier to the end of mpi_init. This will be output for each process, so you'll have to compile any statistics on your own. Note: if someone develops a nice parser to do so, it would be really appreciated if you could/would share!
This commit was SVN r12302.
2006-10-25 15:27:47 +00:00
|
|
|
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
|
|
|
opal_output(0, "pls_bproc: could not obtain start time");
|
|
|
|
}
|
|
|
|
}
|
2006-10-20 16:50:13 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* indicate that the daemons have not completely launched yet */
|
|
|
|
daemons_launched = false;
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
/* setup a list that will contain the info for all the daemons
|
|
|
|
* so we can store it on the registry when done
|
|
|
|
*/
|
|
|
|
OBJ_CONSTRUCT(&daemons, opal_list_t);
|
2005-10-02 14:57:44 +00:00
|
|
|
|
2006-10-07 15:45:24 +00:00
|
|
|
/* get the number of nodes in this job and allocate an array for
|
|
|
|
* their names so we can pass that to bproc - populate the list
|
|
|
|
* with the node names
|
|
|
|
*/
|
2006-12-15 02:34:14 +00:00
|
|
|
num_daemons = map->num_nodes;
|
|
|
|
if (0 == num_daemons) {
|
|
|
|
/* nothing to do */
|
|
|
|
OBJ_DESTRUCT(&daemons);
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2006-10-07 15:45:24 +00:00
|
|
|
if(NULL == (daemon_list = (int*)malloc(sizeof(int) * num_daemons))) {
|
2005-10-02 14:57:44 +00:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-10-07 15:45:24 +00:00
|
|
|
i = 0;
|
|
|
|
for (item = opal_list_get_first(&map->nodes);
|
|
|
|
item != opal_list_get_end(&map->nodes);
|
|
|
|
item = opal_list_get_next(item)) {
|
|
|
|
orte_mapped_node_t *node = (orte_mapped_node_t*)item;
|
|
|
|
|
|
|
|
daemon_list[i++] = atoi(node->nodename);
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
2006-10-07 15:45:24 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* allocate storage for bproc to return the daemon pids */
|
2005-10-02 14:57:44 +00:00
|
|
|
if(NULL == (pids = (int*)malloc(sizeof(int) * num_daemons))) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* allocate a range of vpids for the daemons */
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_ns.reserve_range(0, num_daemons, &daemon_vpid_start);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-09-14 21:29:51 +00:00
|
|
|
|
|
|
|
/* setup the orted triggers for passing their launch info */
|
2006-12-15 02:34:14 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(map->job, num_daemons, NULL, NULL))) {
|
2005-10-02 14:57:44 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-08-11 19:41:33 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* setup the daemon environment */
|
|
|
|
orte_pls_bproc_setup_env(envp);
|
|
|
|
|
2006-08-11 19:41:33 +00:00
|
|
|
/* daemons calculate their process name using a "stride" of one, so
|
|
|
|
* push that value into their environment */
|
|
|
|
stride = 1;
|
|
|
|
asprintf(¶m, "%ld", (long)stride);
|
|
|
|
var = mca_base_param_environ_variable("pls", "bproc", "stride");
|
|
|
|
opal_setenv(var, param, true, envp);
|
|
|
|
free(param);
|
|
|
|
free(var);
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2006-08-11 19:41:33 +00:00
|
|
|
/* set up the base environment so the daemons can get their names once launched */
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, 0, daemon_vpid_start,
|
|
|
|
0, num_daemons, envp);
|
2006-02-13 15:28:29 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
2005-10-02 14:57:44 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
argc = 0;
|
|
|
|
opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted);
|
|
|
|
/* check for debug flags */
|
2006-08-03 18:51:42 +00:00
|
|
|
#if 0
|
2005-10-02 14:57:44 +00:00
|
|
|
if (mca_pls_bproc_component.debug) {
|
|
|
|
opal_argv_append(&argc, &argv, "--debug");
|
|
|
|
opal_argv_append(&argc, &argv, "--debug-daemons");
|
|
|
|
}
|
2006-08-03 18:51:42 +00:00
|
|
|
#endif
|
2005-10-02 14:57:44 +00:00
|
|
|
|
|
|
|
opal_argv_append(&argc, &argv, "--bootproxy");
|
2006-12-15 02:34:14 +00:00
|
|
|
orte_ns.convert_jobid_to_string(¶m, map->job);
|
2005-10-02 14:57:44 +00:00
|
|
|
opal_argv_append(&argc, &argv, param);
|
|
|
|
free(param);
|
|
|
|
|
|
|
|
/* pass along the universe name and location info */
|
|
|
|
opal_argv_append(&argc, &argv, "--universe");
|
|
|
|
asprintf(¶m, "%s@%s:%s", orte_universe_info.uid,
|
|
|
|
orte_universe_info.host, orte_universe_info.name);
|
|
|
|
opal_argv_append(&argc, &argv, param);
|
|
|
|
free(param);
|
|
|
|
|
|
|
|
/* tell orted not to daemonize itself */
|
|
|
|
opal_argv_append(&argc, &argv, "--no-daemonize");
|
|
|
|
|
|
|
|
/* find orted */
|
|
|
|
if(0 == stat(mca_pls_bproc_component.orted, &buf)) {
|
|
|
|
orted_path = strdup(mca_pls_bproc_component.orted);
|
|
|
|
} else {
|
|
|
|
orted_path = opal_path_findv(mca_pls_bproc_component.orted, 0, environ, NULL);
|
|
|
|
if(NULL == orted_path) {
|
2006-08-23 03:32:36 +00:00
|
|
|
orted_path = opal_os_path( false, OPAL_BINDIR, mca_pls_bproc_component.orted, NULL );
|
|
|
|
if( (NULL != orted_path) || (0 != stat(orted_path, &buf)) ) {
|
2005-10-02 14:57:44 +00:00
|
|
|
char *path = getenv("PATH");
|
|
|
|
if (NULL == path) {
|
|
|
|
path = ("PATH is empty!");
|
|
|
|
}
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_show_help("help-pls-bproc.txt", "no-orted", true,
|
2005-10-02 14:57:44 +00:00
|
|
|
mca_pls_bproc_component.orted,
|
2006-03-12 04:35:01 +00:00
|
|
|
mca_pls_bproc_component.orted, path, OPAL_BINDIR);
|
2005-10-02 14:57:44 +00:00
|
|
|
rc = ORTE_ERROR;
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if(0 < mca_pls_bproc_component.debug) {
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_output(0, "PLS_BPROC DEBUG: launching %d daemons. cmd: %s ",
|
2005-10-02 14:57:44 +00:00
|
|
|
num_daemons, orted_path);
|
|
|
|
}
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/* launch the daemons */
|
2006-12-15 02:34:14 +00:00
|
|
|
if (orte_pls_base.timing) {
|
Bring the timing instrumentation to the trunk.
If you want to look at our launch and MPI process startup times, you can do so with two MCA params:
OMPI_MCA_orte_timing: set it to anything non-zero and you will get the launch time for different steps in the job launch procedure. The degree of detail depends on the launch environment. rsh will provide you with the average, min, and max launch time for the daemons. SLURM block launches the daemon, so you only get the time to launch the daemons and the total time to launch the job. Ditto for bproc. TM looks more like rsh. Only those four environments are currently supported - anyone interested in extending this capability to other environs is welcome to do so. In all cases, you also get the time to setup the job for launch.
OMPI_MCA_ompi_timing: set it to anything non-zero and you will get the time for mpi_init to reach the compound registry command, the time to execute that command, the time to go from our stage1 barrier to the stage2 barrier, and the time to go from the stage2 barrier to the end of mpi_init. This will be output for each process, so you'll have to compile any statistics on your own. Note: if someone develops a nice parser to do so, it would be really appreciated if you could/would share!
This commit was SVN r12302.
2006-10-25 15:27:47 +00:00
|
|
|
if (0 != gettimeofday(&launchstart, NULL)) {
|
|
|
|
opal_output(0, "pls_bproc: could not obtain start time");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
if (mca_pls_bproc_component.do_not_launch) {
|
|
|
|
for (i=0; i < num_daemons; i++) pids[i] = i+1;
|
|
|
|
rc = num_daemons;
|
|
|
|
} else {
|
|
|
|
rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp);
|
|
|
|
}
|
Bring the timing instrumentation to the trunk.
If you want to look at our launch and MPI process startup times, you can do so with two MCA params:
OMPI_MCA_orte_timing: set it to anything non-zero and you will get the launch time for different steps in the job launch procedure. The degree of detail depends on the launch environment. rsh will provide you with the average, min, and max launch time for the daemons. SLURM block launches the daemon, so you only get the time to launch the daemons and the total time to launch the job. Ditto for bproc. TM looks more like rsh. Only those four environments are currently supported - anyone interested in extending this capability to other environs is welcome to do so. In all cases, you also get the time to setup the job for launch.
OMPI_MCA_ompi_timing: set it to anything non-zero and you will get the time for mpi_init to reach the compound registry command, the time to execute that command, the time to go from our stage1 barrier to the stage2 barrier, and the time to go from the stage2 barrier to the end of mpi_init. This will be output for each process, so you'll have to compile any statistics on your own. Note: if someone develops a nice parser to do so, it would be really appreciated if you could/would share!
This commit was SVN r12302.
2006-10-25 15:27:47 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
if (orte_pls_base.timing) {
|
Bring the timing instrumentation to the trunk.
If you want to look at our launch and MPI process startup times, you can do so with two MCA params:
OMPI_MCA_orte_timing: set it to anything non-zero and you will get the launch time for different steps in the job launch procedure. The degree of detail depends on the launch environment. rsh will provide you with the average, min, and max launch time for the daemons. SLURM block launches the daemon, so you only get the time to launch the daemons and the total time to launch the job. Ditto for bproc. TM looks more like rsh. Only those four environments are currently supported - anyone interested in extending this capability to other environs is welcome to do so. In all cases, you also get the time to setup the job for launch.
OMPI_MCA_ompi_timing: set it to anything non-zero and you will get the time for mpi_init to reach the compound registry command, the time to execute that command, the time to go from our stage1 barrier to the stage2 barrier, and the time to go from the stage2 barrier to the end of mpi_init. This will be output for each process, so you'll have to compile any statistics on your own. Note: if someone develops a nice parser to do so, it would be really appreciated if you could/would share!
This commit was SVN r12302.
2006-10-25 15:27:47 +00:00
|
|
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
|
|
|
opal_output(0, "pls_bproc: could not obtain stop time");
|
|
|
|
} else {
|
|
|
|
opal_output(0, "pls_bproc: daemon launch time is %ld usec",
|
|
|
|
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
|
|
|
(launchstop.tv_usec - launchstart.tv_usec));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
if(rc != num_daemons) {
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true,
|
2005-10-02 14:57:44 +00:00
|
|
|
num_daemons, rc, orted_path);
|
|
|
|
rc = ORTE_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
if(0 < mca_pls_bproc_component.debug) {
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_output(0, "PLS_BPROC DEBUG: %d daemons launched. First pid: %d\n",
|
2005-10-02 14:57:44 +00:00
|
|
|
rc, *pids);
|
|
|
|
}
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
for(i = 0; i < num_daemons; i++) {
|
|
|
|
if(0 >= pids[i]) {
|
|
|
|
opal_show_help("help-pls-bproc.txt", "daemon-launch-bad-pid", true,
|
2006-02-13 15:28:29 +00:00
|
|
|
daemon_list[i], pids[i], errno, orted_path);
|
2005-10-02 14:57:44 +00:00
|
|
|
rc = ORTE_ERROR;
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
} else {
|
|
|
|
if (0 > asprintf(¶m, "%d", daemon_list[i])) {
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_pls_bproc_set_node_pid(ORTE_PROC_MY_NAME->cellid, param, map->job, pids[i]);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
dmn = OBJ_NEW(orte_pls_daemon_info_t);
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_ns.create_process_name(&(dmn->name), ORTE_PROC_MY_NAME->cellid, 0,
|
|
|
|
daemon_vpid_start + i);
|
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
dmn->cell = dmn->name->cellid;
|
2006-09-14 21:29:51 +00:00
|
|
|
dmn->nodename = strdup(param);
|
2006-12-15 02:34:14 +00:00
|
|
|
dmn->active_job = map->job;
|
2006-09-14 21:29:51 +00:00
|
|
|
opal_list_append(&daemons, &dmn->super);
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
free(param);
|
|
|
|
}
|
|
|
|
}
|
2006-11-22 22:49:22 +00:00
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
/* store the daemon info */
|
2006-10-07 15:45:24 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
|
2006-09-14 21:29:51 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
2005-10-02 14:57:44 +00:00
|
|
|
|
2006-11-22 22:49:22 +00:00
|
|
|
/* setup the callbacks - this needs to be done *after* we store the
|
|
|
|
* daemon info so that short-lived apps don't cause mpirun to
|
|
|
|
* try and terminate the orteds before we record them
|
|
|
|
*/
|
2006-12-15 02:34:14 +00:00
|
|
|
if (!mca_pls_bproc_component.do_not_launch) {
|
|
|
|
for (i=0; i < num_daemons; i++) {
|
|
|
|
rc = orte_wait_cb(pids[i], orte_pls_bproc_waitpid_daemon_cb,
|
|
|
|
&daemon_list[i]);
|
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-11-22 22:49:22 +00:00
|
|
|
}
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* wait for communication back from the daemons, which indicates they have
|
|
|
|
* successfully set up the pty/pipes and IO forwarding which the user apps
|
|
|
|
* will use */
|
|
|
|
for(i = 0; i < num_daemons; i++) {
|
|
|
|
orte_buffer_t ack;
|
|
|
|
int src[4];
|
|
|
|
OBJ_CONSTRUCT(&ack, orte_buffer_t);
|
|
|
|
rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &ack, ORTE_RML_TAG_BPROC);
|
|
|
|
if(0 > rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_DESTRUCT(&ack);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
idx = 4;
|
|
|
|
rc = orte_dss.unpack(&ack, &src, &idx, ORTE_INT);
|
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&ack);
|
|
|
|
|
|
|
|
if(-1 == src[0]) {
|
|
|
|
/* one of the daemons has failed to properly launch. The error is sent
|
|
|
|
* by orte_pls_bproc_waitpid_daemon_cb */
|
|
|
|
if(-1 == src[1]) { /* did not die on a signal */
|
|
|
|
opal_show_help("help-pls-bproc.txt", "daemon-died-no-signal", true,
|
|
|
|
src[2], src[3]);
|
|
|
|
} else { /* died on a signal */
|
|
|
|
opal_show_help("help-pls-bproc.txt", "daemon-died-signal", true,
|
|
|
|
src[2], src[3], src[1]);
|
|
|
|
}
|
|
|
|
rc = ORTE_ERROR;
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
orte_pls_bproc_terminate_job(map->job, NULL);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* indicate that the daemons have now launched */
|
|
|
|
daemons_launched = true;
|
|
|
|
|
|
|
|
if (orte_pls_base.timing) {
|
Bring the timing instrumentation to the trunk.
If you want to look at our launch and MPI process startup times, you can do so with two MCA params:
OMPI_MCA_orte_timing: set it to anything non-zero and you will get the launch time for different steps in the job launch procedure. The degree of detail depends on the launch environment. rsh will provide you with the average, min, and max launch time for the daemons. SLURM block launches the daemon, so you only get the time to launch the daemons and the total time to launch the job. Ditto for bproc. TM looks more like rsh. Only those four environments are currently supported - anyone interested in extending this capability to other environs is welcome to do so. In all cases, you also get the time to setup the job for launch.
OMPI_MCA_ompi_timing: set it to anything non-zero and you will get the time for mpi_init to reach the compound registry command, the time to execute that command, the time to go from our stage1 barrier to the stage2 barrier, and the time to go from the stage2 barrier to the end of mpi_init. This will be output for each process, so you'll have to compile any statistics on your own. Note: if someone develops a nice parser to do so, it would be really appreciated if you could/would share!
This commit was SVN r12302.
2006-10-25 15:27:47 +00:00
|
|
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
|
|
|
opal_output(0, "pls_bproc: could not obtain stop time");
|
|
|
|
} else {
|
|
|
|
opal_output(0, "pls_bproc: total job launch time is %ld usec",
|
|
|
|
(launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
|
|
|
|
(launchstop.tv_usec - joblaunchstart.tv_usec));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
cleanup:
|
|
|
|
if(NULL != argv) {
|
|
|
|
opal_argv_free(argv);
|
|
|
|
}
|
|
|
|
if(NULL != pids) {
|
|
|
|
free(pids);
|
|
|
|
}
|
|
|
|
if(NULL != orted_path) {
|
|
|
|
free(orted_path);
|
|
|
|
}
|
2006-09-14 21:29:51 +00:00
|
|
|
while (NULL != (item = opal_list_remove_first(&daemons))) {
|
|
|
|
OBJ_RELEASE(item);
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&daemons);
|
2006-08-03 18:51:42 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
return rc;
|
2006-08-03 05:29:49 +00:00
|
|
|
}
|
|
|
|
|
2006-08-03 18:51:42 +00:00
|
|
|
|
|
|
|
static int
|
2006-12-15 02:34:14 +00:00
|
|
|
orte_pls_bproc_node_failed(orte_gpr_notify_msg_t *msg)
|
2006-08-03 18:51:42 +00:00
|
|
|
{
|
2006-12-15 02:34:14 +00:00
|
|
|
orte_jobid_t job;
|
|
|
|
|
|
|
|
/* respond to a node failure reported by the smr. We know that
|
|
|
|
* this function will only be called when one or more nodes in
|
|
|
|
* our allocation fails, so we just need to respond to it. The
|
|
|
|
* complication is that the failure could occur in any of several
|
|
|
|
* states:
|
|
|
|
* (a) before we start to launch the daemons
|
|
|
|
* (b) while we are launching the daemons
|
|
|
|
* (c) after the daemons are launched, while we are launching the app
|
|
|
|
* (d) during app launch
|
|
|
|
* (e) after app launch, but before completion
|
|
|
|
* (f) while the app is finalizing
|
|
|
|
* (g) while we are cleaning up after the app has finalized
|
|
|
|
*/
|
|
|
|
|
|
|
|
printf("mpirun has detected a dead node within the job and is terminating\n");
|
|
|
|
|
|
|
|
/* extract the jobid from the returned data */
|
|
|
|
orte_schema.extract_jobid_from_std_trigger_name(&job, msg->trigger);
|
|
|
|
|
|
|
|
/* terminate all jobs in the in the job family */
|
|
|
|
orte_pls_bproc_terminate_job(job, NULL);
|
|
|
|
|
|
|
|
/* kill the daemons */
|
|
|
|
orte_pls_bproc_terminate_job(0, NULL);
|
|
|
|
|
|
|
|
/* shouldn't ever get here.. */
|
|
|
|
exit(1);
|
|
|
|
|
2006-08-03 05:29:49 +00:00
|
|
|
}
|
2005-10-02 14:57:44 +00:00
|
|
|
|
2006-08-03 18:51:42 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/**
|
|
|
|
* Launches the application processes
|
|
|
|
* @param cellid the cellid of the job
|
|
|
|
* @param jobid the jobid of the job
|
|
|
|
* @param map a pointer to the mapping of this application
|
|
|
|
* @param num_processes the number of processes in this job
|
|
|
|
* @param vpid_start the starting vpid for this app context
|
|
|
|
* @param global_vpid_start the starting vpid for the user's processes
|
|
|
|
* @param app_context the application context number
|
|
|
|
* @param node_array the node array for this context
|
|
|
|
* @param node_array_len the length of the node array
|
|
|
|
* @retval ORTE_SUCCESS
|
2006-02-13 15:28:29 +00:00
|
|
|
* @retval error
|
2005-10-02 14:57:44 +00:00
|
|
|
*/
|
2006-12-15 02:34:14 +00:00
|
|
|
static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
|
|
|
orte_vpid_t vpid_start, int app_context) {
|
|
|
|
int *node_array, num_nodes, cycle;
|
2006-08-11 19:41:33 +00:00
|
|
|
int rc, i, j, stride;
|
2006-12-15 02:34:14 +00:00
|
|
|
orte_std_cntr_t num_processes;
|
|
|
|
int *pids = NULL;
|
|
|
|
char *var, *param;
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_process_name_t * proc_name;
|
|
|
|
struct bproc_io_t bproc_io[3];
|
2006-10-07 15:45:24 +00:00
|
|
|
char **env;
|
|
|
|
int dbg;
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-10-07 15:45:24 +00:00
|
|
|
/* point to the env array for this app_context */
|
2006-12-15 02:34:14 +00:00
|
|
|
env = opal_argv_copy(map->apps[app_context]->env);
|
2005-10-02 14:57:44 +00:00
|
|
|
|
2006-02-16 00:16:22 +00:00
|
|
|
/* set up app context */
|
2005-10-02 14:57:44 +00:00
|
|
|
asprintf(¶m, "%d", app_context);
|
|
|
|
var = mca_base_param_environ_variable("pls", "bproc", "app_context");
|
2006-10-07 15:45:24 +00:00
|
|
|
opal_setenv(var, param, true, &env);
|
2005-10-02 14:57:44 +00:00
|
|
|
free(param);
|
|
|
|
free(var);
|
2006-08-11 19:41:33 +00:00
|
|
|
|
|
|
|
/* set the vpid-to-vpid stride based on the mapping mode */
|
2006-12-15 02:34:14 +00:00
|
|
|
if (bynode) {
|
2006-08-11 19:41:33 +00:00
|
|
|
/* we are mapping by node, so we want to set the stride
|
|
|
|
* length (i.e., the step size between vpids that is used
|
|
|
|
* to compute the process name) to 1
|
|
|
|
*/
|
|
|
|
stride = 1;
|
|
|
|
} else {
|
|
|
|
/* we are mapping by slot, so we want to set the stride
|
|
|
|
* length (i.e., the step size between vpids that is used
|
|
|
|
* to compute the process name) to the number of slots
|
|
|
|
*/
|
|
|
|
stride = num_slots;
|
|
|
|
}
|
|
|
|
/* and push that value into the process' environment */
|
|
|
|
asprintf(¶m, "%ld", (long)stride);
|
|
|
|
var = mca_base_param_environ_variable("pls", "bproc", "stride");
|
2006-10-07 15:45:24 +00:00
|
|
|
opal_setenv(var, param, true, &env);
|
2006-08-11 19:41:33 +00:00
|
|
|
free(param);
|
|
|
|
free(var);
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* set up the node_array to handle the launch */
|
|
|
|
node_array = (int*)malloc(map->num_nodes * sizeof(int));
|
|
|
|
if (NULL == node_array) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
2006-08-14 18:34:13 +00:00
|
|
|
/* initialize the cycle count. Computing the process name under Bproc
|
|
|
|
* is a complex matter when mapping by slot as Bproc's inherent
|
|
|
|
* methodology is to do everything by node. When mapping by slot, the
|
|
|
|
* first num_slots number of launch cycles all have a vpid_start that
|
|
|
|
* will differ by one - i.e., the processes on a given node will have
|
|
|
|
* vpids that differ by only one.
|
|
|
|
*
|
|
|
|
* However, when we oversubscribe, we enter into a cyclic arrangement.
|
|
|
|
* During each cycle, the above description of how names are assigned
|
|
|
|
* is accurate. However, each cycle (i.e., each collection of num_nodes
|
|
|
|
* processes that we launch) will have a vpid start that is offset by
|
|
|
|
* num_slots * num_nodes. We have to compensate for that here when we
|
|
|
|
* calculate and pass the vpid_start param so that the processes can
|
|
|
|
* correctly compute their name
|
|
|
|
*/
|
2006-08-14 19:16:11 +00:00
|
|
|
cycle = 1;
|
2006-08-14 18:34:13 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/* launch the processes */
|
|
|
|
i = 1;
|
2006-12-15 02:34:14 +00:00
|
|
|
num_processes = map->vpid_range;
|
|
|
|
|
|
|
|
rc = orte_pls_bproc_node_list(map, node_array, &num_nodes, i);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-10-11 20:34:12 +00:00
|
|
|
opal_output_verbose(1, orte_pls_base.pls_output,
|
|
|
|
"launching app %s", map->apps[app_context]->app);
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
while(0 != num_nodes) {
|
2006-10-11 20:34:12 +00:00
|
|
|
if (0 < mca_pls_bproc_component.debug) {
|
|
|
|
opal_output_verbose(1, orte_pls_base.pls_output,
|
|
|
|
"\tlaunching cycle %d", i);
|
|
|
|
for (dbg=0; dbg<num_nodes; dbg++) {
|
|
|
|
opal_output_verbose(1, orte_pls_base.pls_output,
|
2006-12-15 02:34:14 +00:00
|
|
|
"\t\tlaunching on node %d", node_array[dbg]);
|
2006-10-11 20:34:12 +00:00
|
|
|
}
|
|
|
|
}
|
2005-10-02 14:57:44 +00:00
|
|
|
|
|
|
|
/* setup environment so the procs can figure out their names */
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, map->job, vpid_start, map->vpid_start,
|
2006-10-07 15:45:24 +00:00
|
|
|
num_processes, &env);
|
2006-02-13 15:28:29 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
2005-10-02 14:57:44 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_pls_bproc_setup_io(map->job, bproc_io, i - 1, app_context);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
if(0 < mca_pls_bproc_component.debug) {
|
2006-10-11 20:34:12 +00:00
|
|
|
opal_output(0, "pls_bproc: launching %d processes:", num_nodes);
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
|
|
|
|
/* allocate space for bproc to return the pids */
|
|
|
|
pids = (int*)malloc(num_nodes * sizeof(int));
|
|
|
|
if (NULL == pids) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mca_pls_bproc_component.do_not_launch) {
|
|
|
|
for (j=0; j < num_nodes; j++) pids[j] = j+1;
|
|
|
|
rc = num_nodes;
|
|
|
|
} else {
|
|
|
|
rc = bproc_vexecmove_io(num_nodes, node_array, pids, bproc_io, 3,
|
|
|
|
map->apps[app_context]->app,
|
|
|
|
map->apps[app_context]->argv, env);
|
|
|
|
}
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
if(0 < mca_pls_bproc_component.debug) {
|
|
|
|
opal_output(0, "pls_bproc: %d processes launched. First pid: %d",
|
|
|
|
rc, *pids);
|
|
|
|
}
|
|
|
|
if(rc != num_nodes) {
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_show_help("help-pls-bproc.txt", "proc-launch-number", true,
|
2006-10-07 15:45:24 +00:00
|
|
|
num_nodes, rc, map->apps[app_context]->app);
|
2005-10-02 14:57:44 +00:00
|
|
|
rc = ORTE_ERROR;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
for(j = 0; j < num_nodes; j++) {
|
|
|
|
if(0 >= pids[j]) {
|
2006-02-13 15:28:29 +00:00
|
|
|
opal_show_help("help-pls-bproc.txt", "proc-launch-bad-pid", true,
|
2006-12-15 02:34:14 +00:00
|
|
|
node_array[j], pids[j], errno, map->apps[app_context]->app);
|
2005-10-02 14:57:44 +00:00
|
|
|
rc = ORTE_ERROR;
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
} else {
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_ns.create_process_name(&proc_name, ORTE_PROC_MY_NAME->cellid, map->job,
|
2005-10-02 14:57:44 +00:00
|
|
|
vpid_start + j);
|
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
orte_pls_bproc_set_proc_pid(proc_name, pids[j], node_array[j]);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
if (!mca_pls_bproc_component.do_not_launch) {
|
|
|
|
rc = orte_wait_cb(pids[j], orte_pls_bproc_waitpid_cb, proc_name);
|
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
free(pids);
|
|
|
|
pids = NULL;
|
2005-10-02 14:57:44 +00:00
|
|
|
i++;
|
2006-12-15 02:34:14 +00:00
|
|
|
if (bynode) {
|
2006-08-11 19:41:33 +00:00
|
|
|
/* we are mapping by node, so the vpid_start must increment by
|
|
|
|
* the number of nodes
|
|
|
|
*/
|
|
|
|
vpid_start += num_nodes;
|
|
|
|
} else {
|
2006-08-14 18:34:13 +00:00
|
|
|
/* we are mapping by slot. Here is where we need to check our
|
|
|
|
* cyclic condition - if we are at the end of a cycle, then
|
|
|
|
* we need to increment the vpid_start by num_slots*num_nodes.
|
|
|
|
* Otherwise, we just increment it by one.
|
2006-08-11 19:41:33 +00:00
|
|
|
*/
|
2006-08-14 18:34:13 +00:00
|
|
|
if (cycle == num_slots) {
|
|
|
|
/* end of cycle condition */
|
2006-08-14 19:16:11 +00:00
|
|
|
vpid_start += num_slots * num_nodes - 1;
|
|
|
|
cycle = 1;
|
2006-08-14 18:34:13 +00:00
|
|
|
} else {
|
|
|
|
vpid_start += 1;
|
2006-08-14 19:16:11 +00:00
|
|
|
cycle++;
|
2006-08-14 18:34:13 +00:00
|
|
|
}
|
2006-08-11 19:41:33 +00:00
|
|
|
}
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_pls_bproc_node_list(map, node_array, &num_nodes, i);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
cleanup:
|
|
|
|
if(NULL != pids) {
|
|
|
|
free(pids);
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
|
|
|
|
free(node_array);
|
|
|
|
|
|
|
|
if (NULL != env) opal_argv_free(env);
|
2005-10-02 14:57:44 +00:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The main bproc launcher. See pls_bproc.h for a high level overview of how
|
|
|
|
* the bproc launching works.
|
|
|
|
* Here we:
|
2006-02-13 15:28:29 +00:00
|
|
|
* -# Launch the deamons on the backend nodes.
|
|
|
|
* -# The daemons setup files for io forwarding then connect back to us to
|
2005-10-02 14:57:44 +00:00
|
|
|
* tells us they are ready for the actual apps.
|
|
|
|
* -# Launch the apps on the backend nodes
|
2006-02-13 15:28:29 +00:00
|
|
|
*
|
2005-10-02 14:57:44 +00:00
|
|
|
* @param jobid the jobid of the job to launch
|
|
|
|
* @retval ORTE_SUCCESS
|
|
|
|
* @retval error
|
|
|
|
*/
|
|
|
|
int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
2006-10-07 15:45:24 +00:00
|
|
|
orte_job_map_t* map;
|
|
|
|
orte_mapped_node_t *map_node;
|
2005-10-02 14:57:44 +00:00
|
|
|
orte_vpid_t vpid_launch;
|
|
|
|
int rc;
|
2006-10-07 15:45:24 +00:00
|
|
|
int num_slots;
|
|
|
|
int context;
|
2006-12-15 02:34:14 +00:00
|
|
|
int i;
|
2006-02-16 20:40:23 +00:00
|
|
|
char cwd_save[OMPI_PATH_MAX + 1];
|
2006-10-07 15:45:24 +00:00
|
|
|
orte_ras_node_t *ras_node;
|
2006-12-15 02:34:14 +00:00
|
|
|
char **daemon_env;
|
2005-10-02 14:57:44 +00:00
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
/* make sure the pls_bproc receive function has been started */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_bproc_comm_start())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* save the current working directory */
|
2006-02-16 20:40:23 +00:00
|
|
|
if (NULL == getcwd(cwd_save, sizeof(cwd_save))) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
|
|
return ORTE_ERR_NOT_FOUND;
|
|
|
|
}
|
|
|
|
cwd_save[sizeof(cwd_save) - 1] = '\0';
|
2006-09-14 21:29:51 +00:00
|
|
|
|
2006-10-07 15:45:24 +00:00
|
|
|
/* get the job map */
|
|
|
|
if(ORTE_SUCCESS != (rc = orte_rmaps.get_job_map(&map, jobid))) {
|
2005-10-02 14:57:44 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2006-10-07 15:45:24 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* set the mapping mode */
|
|
|
|
if (NULL != map->mapping_mode && 0 == strcmp("bynode", map->mapping_mode)) {
|
|
|
|
bynode = true;
|
|
|
|
} else {
|
|
|
|
bynode = false;
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
2006-09-14 21:29:51 +00:00
|
|
|
|
2006-10-07 15:45:24 +00:00
|
|
|
/* check all of the app_contexts for sanity */
|
|
|
|
for (i=0; i < map->num_apps; i++) {
|
|
|
|
/* Check that the cwd is sane. We have to chdir there in
|
|
|
|
to check the executable, because the executable could
|
|
|
|
have been specified as a relative path to the wdir */
|
|
|
|
rc = orte_rmgr.check_context_cwd(map->apps[i], true);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-02-16 20:40:23 +00:00
|
|
|
|
2006-10-07 15:45:24 +00:00
|
|
|
/* Check that the app exists and is executable */
|
|
|
|
rc = orte_rmgr.check_context_app(map->apps[i]);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-02-17 16:15:21 +00:00
|
|
|
|
2006-10-07 15:45:24 +00:00
|
|
|
/* Return to the original dir */
|
|
|
|
if (0 != chdir(cwd_save)) {
|
|
|
|
rc = ORTE_ERR_IN_ERRNO;
|
|
|
|
goto cleanup;
|
2006-02-16 20:40:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-10-07 15:45:24 +00:00
|
|
|
/* For Bproc, we need to know how many slots were allocated on each
|
|
|
|
* node so the spawned processes can computer their name. Only Bproc
|
|
|
|
* needs to do this, so we choose not to modify the mapped_node struct
|
|
|
|
* to hold this info - bproc can go get it.
|
|
|
|
*
|
|
|
|
* Since Bproc also requires that the slots allocated on each node
|
|
|
|
* be the same, we really only need to lookup a single node. So grab
|
|
|
|
* the data for the first node on the map
|
|
|
|
*/
|
|
|
|
map_node = (orte_mapped_node_t*)opal_list_get_first(&map->nodes);
|
|
|
|
if (NULL == (ras_node = orte_ras.node_lookup(map_node->cell, map_node->nodename))) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
num_slots = ras_node->node_slots;
|
|
|
|
OBJ_RELEASE(ras_node);
|
|
|
|
|
2006-08-03 18:51:42 +00:00
|
|
|
if(0 < mca_pls_bproc_component.debug) {
|
|
|
|
opal_output(0, "pls_bproc: --- starting to launch procs ---");
|
|
|
|
}
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* save the daemon environment */
|
|
|
|
daemon_env = opal_argv_copy(map->apps[0]->env);
|
2005-10-02 14:57:44 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* for each application context, setup its env */
|
2006-10-07 15:45:24 +00:00
|
|
|
for(i=0; i < map->num_apps; i++) {
|
|
|
|
orte_pls_bproc_setup_env(&map->apps[i]->env);
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
2006-08-03 05:29:49 +00:00
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
/* tell the smr which nodes to monitor so we can be notified
|
2006-10-07 15:45:24 +00:00
|
|
|
when the node's state changes, useful for aborting when
|
2006-12-15 02:34:14 +00:00
|
|
|
a bproc node up and dies */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_smr.begin_monitoring(map, orte_pls_bproc_node_failed, NULL))) {
|
2006-08-03 05:29:49 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
|
|
|
|
/* launch the daemons on all nodes which have processes assigned to them */
|
|
|
|
rc = orte_pls_bproc_launch_daemons(map, &daemon_env);
|
|
|
|
opal_argv_free(daemon_env);
|
2006-02-13 15:28:29 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
|
|
|
|
vpid_launch = map->vpid_start;
|
2006-10-11 20:34:12 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/* for each application context launch the app */
|
2006-10-07 15:45:24 +00:00
|
|
|
for(context=0; context < map->num_apps; context++) {
|
|
|
|
rc = orte_rmgr.check_context_cwd(map->apps[context], true);
|
2006-02-16 20:40:23 +00:00
|
|
|
if (ORTE_SUCCESS != rc) {
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
rc = orte_pls_bproc_launch_app(map, num_slots, vpid_launch, context);
|
2005-10-02 14:57:44 +00:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
vpid_launch += map->apps[context]->num_procs;
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
cleanup:
|
2006-02-16 20:40:23 +00:00
|
|
|
chdir(cwd_save);
|
2006-10-07 22:44:00 +00:00
|
|
|
|
|
|
|
OBJ_RELEASE(map);
|
|
|
|
|
2006-12-15 02:34:14 +00:00
|
|
|
if (mca_pls_bproc_component.do_not_launch) {
|
2006-12-15 14:03:53 +00:00
|
|
|
/* indicate that we failed to launch, but do so silently */
|
|
|
|
return ORTE_ERR_SILENT;
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
2006-12-15 02:34:14 +00:00
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2006-09-14 21:29:51 +00:00
|
|
|
* Terminate all processes associated with this job */
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) {
|
2005-10-02 14:57:44 +00:00
|
|
|
pid_t* pids;
|
2006-08-15 19:54:10 +00:00
|
|
|
orte_std_cntr_t i, num_pids;
|
2005-10-02 14:57:44 +00:00
|
|
|
int rc;
|
2006-09-14 21:29:51 +00:00
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
if(0 < mca_pls_bproc_component.debug) {
|
|
|
|
opal_output(0, "orte_pls_bproc: terminating job %ld", jobid);
|
|
|
|
}
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/* kill application process */
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs)))
|
2005-10-02 14:57:44 +00:00
|
|
|
return rc;
|
|
|
|
for(i=0; i<num_pids; i++) {
|
|
|
|
if(mca_pls_bproc_component.debug) {
|
|
|
|
opal_output(0, "orte_pls_bproc: killing proc: %d\n", pids[i]);
|
|
|
|
}
|
|
|
|
kill(pids[i], mca_pls_bproc_component.terminate_sig);
|
|
|
|
}
|
|
|
|
if(NULL != pids)
|
|
|
|
free(pids);
|
2006-09-14 21:29:51 +00:00
|
|
|
|
|
|
|
/* dont kill daemons - mpirun will do this for us */
|
2005-10-02 14:57:44 +00:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
/**
|
|
|
|
* Terminate the orteds for a given job
|
|
|
|
*/
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
|
2006-09-14 21:29:51 +00:00
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
opal_list_t daemons;
|
|
|
|
opal_list_item_t *item;
|
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
/* construct the list of active daemons on this job */
|
|
|
|
OBJ_CONSTRUCT(&daemons, opal_list_t);
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
|
2006-09-14 21:29:51 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* now tell them to die! */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
CLEANUP:
|
|
|
|
while (NULL != (item = opal_list_remove_first(&daemons))) {
|
|
|
|
OBJ_RELEASE(item);
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&daemons);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/**
|
|
|
|
* Terminate a specific process.
|
|
|
|
*/
|
|
|
|
int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name) {
|
|
|
|
int rc;
|
|
|
|
pid_t pid;
|
2006-10-02 14:58:22 +00:00
|
|
|
|
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pid(proc_name, &pid)))
|
2005-10-02 14:57:44 +00:00
|
|
|
return rc;
|
|
|
|
if(kill(pid, mca_pls_bproc_component.terminate_sig) != 0) {
|
|
|
|
switch(errno) {
|
|
|
|
case EINVAL:
|
|
|
|
return ORTE_ERR_BAD_PARAM;
|
|
|
|
case ESRCH:
|
|
|
|
return ORTE_ERR_NOT_FOUND;
|
|
|
|
case EPERM:
|
|
|
|
return ORTE_ERR_PERM;
|
|
|
|
default:
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2006-06-08 18:27:17 +00:00
|
|
|
/**
|
|
|
|
* Signal all processes associated with this job
|
|
|
|
*/
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) {
|
2006-06-08 18:27:17 +00:00
|
|
|
pid_t* pids;
|
2006-08-15 19:54:10 +00:00
|
|
|
orte_std_cntr_t i, num_pids;
|
2006-06-08 18:27:17 +00:00
|
|
|
int rc;
|
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-06-08 18:27:17 +00:00
|
|
|
/* signal application process */
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs)))
|
2006-06-08 18:27:17 +00:00
|
|
|
return rc;
|
|
|
|
for(i=0; i<num_pids; i++) {
|
|
|
|
if(mca_pls_bproc_component.debug) {
|
|
|
|
opal_output(0, "orte_pls_bproc: signaling proc: %d\n", pids[i]);
|
|
|
|
}
|
|
|
|
kill(pids[i], (int)signal);
|
|
|
|
}
|
|
|
|
if(NULL != pids)
|
|
|
|
free(pids);
|
|
|
|
|
|
|
|
/** dont signal daemons - this is strictly for signalling application processes */
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Signal a specific process.
|
|
|
|
*/
|
|
|
|
int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t signal) {
|
|
|
|
int rc;
|
|
|
|
pid_t pid;
|
|
|
|
|
2006-10-02 14:58:22 +00:00
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pid(proc_name, &pid)))
|
2006-06-08 18:27:17 +00:00
|
|
|
return rc;
|
|
|
|
if(kill(pid, (int)signal) != 0) {
|
|
|
|
switch(errno) {
|
|
|
|
case EINVAL:
|
|
|
|
return ORTE_ERR_BAD_PARAM;
|
|
|
|
case ESRCH:
|
|
|
|
return ORTE_ERR_NOT_FOUND;
|
|
|
|
case EPERM:
|
|
|
|
return ORTE_ERR_PERM;
|
|
|
|
default:
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2005-10-02 14:57:44 +00:00
|
|
|
/**
|
|
|
|
* Module cleanup
|
|
|
|
*/
|
2006-06-08 18:27:17 +00:00
|
|
|
int orte_pls_bproc_finalize(void)
|
2006-02-16 00:16:22 +00:00
|
|
|
{
|
2006-09-14 21:29:51 +00:00
|
|
|
return ORTE_SUCCESS;
|
2005-10-02 14:57:44 +00:00
|
|
|
}
|
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle threading issues.
|
|
|
|
*/
|
2006-06-08 18:27:17 +00:00
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
|
2006-06-08 18:27:17 +00:00
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
struct orte_pls_bproc_stack_t {
|
|
|
|
opal_condition_t cond;
|
|
|
|
opal_mutex_t mutex;
|
|
|
|
bool complete;
|
|
|
|
orte_jobid_t jobid;
|
|
|
|
int rc;
|
|
|
|
};
|
|
|
|
typedef struct orte_pls_bproc_stack_t orte_pls_bproc_stack_t;
|
2006-06-08 18:27:17 +00:00
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
/**
 * Constructor for orte_pls_bproc_stack_t: resets the result fields and
 * constructs the embedded mutex and condition.
 */
static void orte_pls_bproc_stack_construct(orte_pls_bproc_stack_t* self)
{
    /* plain fields first: no result yet, launch not finished */
    self->complete = false;
    self->rc = 0;

    /* then the embedded synchronisation objects, mutex before condition */
    OBJ_CONSTRUCT(&self->mutex, opal_mutex_t);
    OBJ_CONSTRUCT(&self->cond, opal_condition_t);
}
|
2006-06-08 18:27:17 +00:00
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
/**
 * Destructor for orte_pls_bproc_stack_t: destructs the embedded mutex
 * and condition. The plain fields need no cleanup.
 */
static void orte_pls_bproc_stack_destruct(orte_pls_bproc_stack_t* self)
{
    OBJ_DESTRUCT(&self->mutex);
    OBJ_DESTRUCT(&self->cond);
}
|
2006-06-08 18:27:17 +00:00
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
/*
 * Class descriptor for orte_pls_bproc_stack_t: parent class opal_object_t,
 * with the constructor and destructor defined above. File-local.
 */
static OBJ_CLASS_INSTANCE(orte_pls_bproc_stack_t,
                          opal_object_t,
                          orte_pls_bproc_stack_construct,
                          orte_pls_bproc_stack_destruct);
|
2006-06-08 18:27:17 +00:00
|
|
|
|
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
static void orte_pls_bproc_launch_cb(int fd, short event, void* args)
|
|
|
|
{
|
2006-06-08 18:27:17 +00:00
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
orte_pls_bproc_stack_t *stack = (orte_pls_bproc_stack_t*)args;
|
|
|
|
stack->rc = orte_pls_bproc_launch(stack->jobid);
|
|
|
|
OPAL_THREAD_LOCK(&stack->mutex);
|
|
|
|
stack->complete = true;
|
|
|
|
opal_condition_signal(&stack->cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&stack->mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Launch a job by way of the event system and wait for the result.
 *
 * A zero-delay timer event is registered whose callback,
 * orte_pls_bproc_launch_cb(), calls orte_pls_bproc_launch(). The caller
 * then blocks on a condition until the callback marks the hand-off
 * record complete.
 * NOTE(review): presumably this exists so the launch runs in whichever
 * thread services events - confirm against pls_bproc.h.
 *
 * @param jobid  Job to launch.
 *
 * @return The return code of orte_pls_bproc_launch() as stored by the
 *         callback.
 */
int orte_pls_bproc_launch_threaded(orte_jobid_t jobid)
{
    struct timeval tv = { 0, 0 };
    struct opal_event event;
    struct orte_pls_bproc_stack_t stack;
    int rc;

    OBJ_CONSTRUCT(&stack, orte_pls_bproc_stack_t);

    /* hand the job to the callback through a timer that fires at once */
    stack.jobid = jobid;
    opal_evtimer_set(&event, orte_pls_bproc_launch_cb, &stack);
    opal_evtimer_add(&event, &tv);

    /* sleep until the callback reports that the launch has returned */
    OPAL_THREAD_LOCK(&stack.mutex);
    while(stack.complete == false) {
        opal_condition_wait(&stack.cond, &stack.mutex);
    }
    /* take the result while the record is still alive and locked,
     * instead of reading it back after OBJ_DESTRUCT() */
    rc = stack.rc;
    OPAL_THREAD_UNLOCK(&stack.mutex);

    OBJ_DESTRUCT(&stack);
    return rc;
}
|
2006-06-08 18:27:17 +00:00
|
|
|
|
2006-03-16 00:06:48 +00:00
|
|
|
#endif
|
|
|
|
|