2005-03-14 20:57:21 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 12:43:37 +00:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2005-03-14 20:57:21 +00:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*
|
|
|
|
* These symbols are in a file by themselves to provide nice linker
|
|
|
|
* semantics. Since linkers generally pull in symbols by object
|
|
|
|
* files, keeping these symbols as the only symbols in this file
|
|
|
|
* prevents utility programs such as "ompi_info" from having to import
|
|
|
|
* entire components just to query their version and parameters.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
|
|
|
|
#if HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
#include <signal.h>
|
2005-03-18 03:43:59 +00:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/wait.h>
|
2005-03-14 20:57:21 +00:00
|
|
|
|
|
|
|
#include "include/orte_constants.h"
|
|
|
|
#include "include/orte_types.h"
|
2005-07-04 00:13:44 +00:00
|
|
|
#include "opal/util/argv.h"
|
2005-07-03 23:31:27 +00:00
|
|
|
#include "opal/util/output.h"
|
2005-07-04 01:36:20 +00:00
|
|
|
#include "opal/util/opal_environ.h"
|
2005-03-18 03:43:59 +00:00
|
|
|
#include "runtime/runtime.h"
|
|
|
|
#include "runtime/orte_wait.h"
|
2005-03-14 20:57:21 +00:00
|
|
|
#include "mca/base/mca_base_param.h"
|
|
|
|
#include "mca/rmgr/base/base.h"
|
|
|
|
#include "mca/rmaps/base/rmaps_base_map.h"
|
|
|
|
#include "mca/pls/pls.h"
|
|
|
|
#include "mca/pls/base/base.h"
|
|
|
|
#include "mca/errmgr/errmgr.h"
|
|
|
|
#include "mca/soh/soh_types.h"
|
|
|
|
#include "mca/gpr/gpr.h"
|
2005-08-19 14:46:11 +00:00
|
|
|
#include "orte/mca/sds/base/base.h"
|
2005-03-18 03:43:59 +00:00
|
|
|
#include "mca/soh/soh.h"
|
|
|
|
#include "mca/rml/rml.h"
|
|
|
|
#include "mca/ns/ns.h"
|
2005-03-14 20:57:21 +00:00
|
|
|
#include "pls_tm.h"
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Local functions
|
|
|
|
*/
|
|
|
|
static int pls_tm_launch(orte_jobid_t jobid);
|
|
|
|
static int pls_tm_terminate_job(orte_jobid_t jobid);
|
|
|
|
static int pls_tm_terminate_proc(const orte_process_name_t *name);
|
|
|
|
static int pls_tm_finalize(void);
|
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
static void do_wait_proc(pid_t pid, int status, void* cbdata);
|
|
|
|
static int kill_tids(tm_task_id *tids, orte_process_name_t *names,
|
|
|
|
size_t num_tids);
|
2005-03-14 20:57:21 +00:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Global variable
|
|
|
|
*/
|
|
|
|
orte_pls_base_module_1_0_0_t orte_pls_tm_module = {
|
|
|
|
pls_tm_launch,
|
|
|
|
pls_tm_terminate_job,
|
|
|
|
pls_tm_terminate_proc,
|
|
|
|
pls_tm_finalize
|
|
|
|
};
|
2005-03-18 03:43:59 +00:00
|
|
|
/* When false, pls_tm_terminate_job() opens its own TM connection with
   tm_init() and closes it again with tm_finalize() */
bool orte_pls_tm_connected = false;

extern char **environ;

/* Number of non-blocking tm_poll() attempts kill_tids() makes while
   waiting for a signaled task to report back */
#define NUM_SIGNAL_POLL_ITERS 50


/*
 * State private to this file
 */

/* Set by pls_tm_launch() after it registers do_wait_proc() through
   orte_wait_cb() */
static bool wait_cb_set = false;

/* PID returned by the fork() in pls_tm_launch(); -1 means "no child" */
static pid_t child_pid = -1;
|
2005-03-14 20:57:21 +00:00
|
|
|
|
|
|
|
|
|
|
|
/*
 * Launch a job by forking a child process that does the TM work.
 *
 * The child calls orte_pls_tm_child_init(), orte_pls_tm_child_launch(),
 * orte_pls_tm_child_wait() and orte_pls_tm_child_finalize() in that
 * order, stops at the first one that does not return ORTE_SUCCESS, and
 * then exits with status 0 in either case.  The parent registers
 * do_wait_proc() for the child via orte_wait_cb() and returns without
 * waiting.
 *
 * jobid: the job to launch; a heap copy of it is handed to
 *        do_wait_proc() as callback data, and do_wait_proc() frees it.
 *
 * Returns ORTE_SUCCESS once the child has been forked and the callback
 * registered, or ORTE_ERR_OUT_OF_RESOURCE if either the jobid copy
 * cannot be allocated or fork() fails.
 */
static int pls_tm_launch(orte_jobid_t jobid)
{
    orte_jobid_t *save;

    /* Copy the jobid */
    save = malloc(sizeof *save);
    if (NULL == save) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    *save = jobid;

    opal_output(orte_pls_base.pls_output,
                "pls:tm:launch: launching child to do the work");
    child_pid = fork();

    /* fork() failed: there is no child to wait for, so do not register
       a callback for pid -1 and do not leak the jobid copy.  child_pid
       is left at -1, which the rest of this file treats as "no
       child". */
    if (child_pid < 0) {
        free(save);
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* Child */
    if (0 == child_pid) {
        /* The || chain exists only for its short-circuiting: the first
           step that fails prevents the later ones from running */
        if (ORTE_SUCCESS != orte_pls_tm_child_init() ||
            ORTE_SUCCESS != orte_pls_tm_child_launch(jobid) ||
            ORTE_SUCCESS != orte_pls_tm_child_wait(jobid) ||
            ORTE_SUCCESS != orte_pls_tm_child_finalize()) {
            /* Nothing to do -- the child exits below either way */
        }
        exit(0);
    }

    /* Parent */
    printf("tm child PID: %d\n", (int) child_pid);
    fflush(stdout);

    orte_wait_cb(child_pid, do_wait_proc, save);
    wait_cb_set = true;

    return ORTE_SUCCESS;
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Kill all tasks of a job through TM.
 *
 * If a child forked by pls_tm_launch() is still recorded in child_pid,
 * it is killed and reaped first.  Then, unless orte_pls_tm_connected is
 * already true, a TM connection is opened for the duration of the call.
 * The task ids of the job are fetched with orte_pls_tm_get_tids() and
 * passed to kill_tids().
 *
 * jobid: the job whose tasks are to be killed.
 *
 * Returns ORTE_ERR_RESOURCE_BUSY if tm_init() fails.  Otherwise returns
 * the result of kill_tids() when at least one tid was found, or the
 * result of orte_pls_tm_get_tids() when there was nothing to kill.
 */
static int pls_tm_terminate_job(orte_jobid_t jobid)
{
    struct tm_roots tm_root;
    /* Initialized so that nothing below can read indeterminate values
       if orte_pls_tm_get_tids() fails without writing its outputs */
    tm_task_id *tids = NULL;
    orte_process_name_t *names = NULL;
    size_t size = 0;
    int ret;

    /* If we have a child, that child is potentially sitting inside
       tm_poll(), and we won't be able to tm_init().  Sigh.  So kill
       the child. */

    if (child_pid > 0) {
        opal_output(orte_pls_base.pls_output,
                    "pls:tm:terminate_job: killing tm shephard");
        kill(child_pid, SIGKILL);
        waitpid(child_pid, NULL, 0);
        child_pid = -1;
        /* NOTE(review): presumably gives TM a moment to notice that the
           child's connection is gone before we tm_init() -- confirm */
        sleep(1);
    }

    /* Open up our connection to tm, unless one is already open */

    /* orte_jobid_t's exact integer type is not visible here, so widen
       it explicitly instead of passing it straight to a %d */
    opal_output(orte_pls_base.pls_output,
                "pls:tm:terminate_job: killing jobid %ld", (long) jobid);
    if (!orte_pls_tm_connected) {
        ret = tm_init(NULL, &tm_root);
        if (TM_SUCCESS != ret) {
            ret = ORTE_ERR_RESOURCE_BUSY;
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    /* Get the TIDs from the registry */

    ret = orte_pls_tm_get_tids(jobid, &tids, &names, &size);
    if (ORTE_SUCCESS == ret && size > 0) {
        /* size is a size_t; it must not be passed to a %d conversion */
        opal_output(orte_pls_base.pls_output,
                    "pls:tm:terminate_job: got %lu tids from registry",
                    (unsigned long) size);
        ret = kill_tids(tids, names, size);
        /* free(NULL) is a no-op, so no guards are needed */
        free(names);
        free(tids);
    } else {
        opal_output(orte_pls_base.pls_output,
                    "pls:tm:terminate_job: got no tids from registry -- nothing to kill");
    }

    /* All done -- close the connection only if we opened it */

    if (!orte_pls_tm_connected) {
        tm_finalize();
    }
    return ret;
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TM can't kill individual processes -- PBS will kill the entire job
|
|
|
|
*/
|
|
|
|
static int pls_tm_terminate_proc(const orte_process_name_t *name)
|
|
|
|
{
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:terminate_proc: not supported");
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
|
2005-03-14 20:57:21 +00:00
|
|
|
return ORTE_ERR_NOT_SUPPORTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free stuff
|
|
|
|
*/
|
|
|
|
static int pls_tm_finalize(void)
|
|
|
|
{
|
2005-03-18 03:43:59 +00:00
|
|
|
if (wait_cb_set) {
|
|
|
|
orte_wait_cb_cancel(child_pid);
|
2005-03-14 20:57:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
/*
 * Wait callback registered by pls_tm_launch() for the forked TM child.
 *
 * pid:    unused.
 * status: unused.
 * cbdata: the heap block handed to orte_wait_cb() at registration time
 *         (pls_tm_launch() passes its malloc'ed copy of the jobid);
 *         ownership ends here -- it is freed and must not be used by
 *         the caller afterwards.  NULL is accepted.
 *
 * Announces on stdout that the child has exited.
 */
static void do_wait_proc(pid_t pid, int status, void *cbdata)
{
    (void) pid;
    (void) status;

    printf("Child TM proc has exited!\n");
    fflush(stdout);

    free(cbdata);
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Kill a bunch of tids. Don't care about errors here -- just make a
|
|
|
|
* best attempt to kill kill kill; if we fail, oh well.
|
|
|
|
*/
|
2005-03-18 03:43:59 +00:00
|
|
|
static int kill_tids(tm_task_id *tids, orte_process_name_t *names, size_t size)
|
2005-03-14 20:57:21 +00:00
|
|
|
{
|
2005-03-18 03:43:59 +00:00
|
|
|
size_t i;
|
|
|
|
int j, ret, local_errno, exit_status;
|
2005-03-14 20:57:21 +00:00
|
|
|
tm_event_t event;
|
2005-03-18 03:43:59 +00:00
|
|
|
bool died;
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
for (i = 0; i < size; ++i) {
|
|
|
|
died = false;
|
2005-03-14 20:57:21 +00:00
|
|
|
|
|
|
|
/* First, kill with SIGTERM */
|
|
|
|
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:terminate:kill_tids: killing tid %d", tids[i]);
|
|
|
|
ret = tm_kill(tids[i], SIGTERM, &event);
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
/* If we didn't find the tid, then just continue -- it may
|
|
|
|
have exited on its own */
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
if (TM_ENOTFOUND == ret) {
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:terminate:kill_tids: tid %d not found (already dead?)",
|
|
|
|
tids[i]);
|
|
|
|
died = true;
|
|
|
|
} else if (TM_SUCCESS != ret) {
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: tm_kill failed with %d", ret);
|
2005-03-14 20:57:21 +00:00
|
|
|
ret = ORTE_ERROR;
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
2005-03-18 03:43:59 +00:00
|
|
|
if (!died) {
|
|
|
|
tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: killed tid %d with SIGTERM", tids[i]);
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
/* Did it die? */
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
ret = tm_obit(tids[i], &exit_status, &event);
|
|
|
|
if (TM_SUCCESS != ret) {
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: tm_obit failed with %d", ret);
|
|
|
|
ret = ORTE_ERROR;
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
2005-03-14 20:57:21 +00:00
|
|
|
}
|
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
/* If it's dead, save the state */
|
|
|
|
|
|
|
|
if (TM_NULL_EVENT != event) {
|
|
|
|
died = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* It didn't seem to die right away; poll a few times */
|
|
|
|
|
|
|
|
else {
|
|
|
|
for (j = 0; j < NUM_SIGNAL_POLL_ITERS; ++j) {
|
|
|
|
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
|
|
|
|
if (TM_NULL_EVENT != event) {
|
|
|
|
died = true;
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: tid %d died", tids[i]);
|
|
|
|
break;
|
|
|
|
}
|
2005-04-19 04:38:48 +00:00
|
|
|
#if defined(WIN32)
|
|
|
|
sleep(1);
|
|
|
|
#else
|
2005-03-18 03:43:59 +00:00
|
|
|
usleep(1);
|
2005-04-19 04:38:48 +00:00
|
|
|
#endif
|
2005-03-14 20:57:21 +00:00
|
|
|
}
|
|
|
|
|
2005-03-18 03:43:59 +00:00
|
|
|
/* No, it did not die. Try with SIGKILL */
|
|
|
|
|
|
|
|
if (!died) {
|
|
|
|
ret = tm_kill(tids[i], SIGKILL, &event);
|
|
|
|
if (TM_SUCCESS != ret) {
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: tm_kill failed with %d",
|
|
|
|
ret);
|
|
|
|
ret = ORTE_ERROR;
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: killed tid %d with SIGKILL",
|
|
|
|
tids[i]);
|
|
|
|
/* Did it die this time? */
|
|
|
|
|
|
|
|
ret = tm_obit(tids[i], &exit_status, &event);
|
|
|
|
if (TM_SUCCESS != ret) {
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: tm_obit failed with %d",
|
|
|
|
ret);
|
|
|
|
ret = ORTE_ERROR;
|
|
|
|
ORTE_ERROR_LOG(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
|
|
|
|
|
|
|
|
/* No -- poll a few times -- just to try to clean it
|
|
|
|
up... If we don't get it here, oh well. Just let
|
|
|
|
the resources hang; TM will clean them up when the
|
|
|
|
job completed */
|
|
|
|
|
|
|
|
if (TM_NULL_EVENT == event) {
|
|
|
|
for (j = 0; j < NUM_SIGNAL_POLL_ITERS; ++j) {
|
|
|
|
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
|
|
|
|
if (TM_NULL_EVENT != event) {
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: tid %d (finally) died",
|
|
|
|
tids[i]);
|
|
|
|
died = true;
|
|
|
|
break;
|
|
|
|
}
|
2005-04-19 04:38:48 +00:00
|
|
|
#if defined(WIN32)
|
|
|
|
sleep(1);
|
|
|
|
#else
|
2005-03-18 03:43:59 +00:00
|
|
|
usleep(1);
|
2005-04-19 04:38:48 +00:00
|
|
|
#endif
|
2005-03-18 03:43:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (j >= NUM_SIGNAL_POLL_ITERS) {
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(orte_pls_base.pls_output,
|
2005-03-18 03:43:59 +00:00
|
|
|
"pls:tm:kill: tid %d did not die!",
|
|
|
|
tids[i]);
|
2005-03-14 20:57:21 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2005-03-18 03:43:59 +00:00
|
|
|
|
|
|
|
/* If it's dead, update the registry */
|
|
|
|
|
|
|
|
if (died) {
|
|
|
|
ret = orte_soh.set_proc_soh(&names[i],
|
|
|
|
ORTE_PROC_STATE_TERMINATED,
|
|
|
|
exit_status);
|
|
|
|
}
|
2005-03-14 20:57:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* All done */
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|