2006-06-29 02:33:54 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2007-01-05 00:48:34 +03:00
|
|
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
2006-06-29 02:33:54 +04:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <errno.h>
|
2006-07-11 09:25:41 +04:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2006-06-29 02:33:54 +04:00
|
|
|
#include <unistd.h>
|
2006-07-11 09:25:41 +04:00
|
|
|
#endif /* HAVE_UNISTD_H */
|
2006-06-29 02:33:54 +04:00
|
|
|
#ifdef HAVE_STDLIB_H
|
|
|
|
#include <stdlib.h>
|
|
|
|
#endif /* HAVE_STDLIB_H */
|
|
|
|
#ifdef HAVE_SYS_STAT_H
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_LIBGEN_H
|
|
|
|
#include <libgen.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
|
|
#include <sys/types.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_WAIT_H
|
|
|
|
#include <sys/wait.h>
|
|
|
|
#endif
|
2007-02-03 04:11:35 +03:00
|
|
|
#ifdef HAVE_SYS_PARAM_H
|
|
|
|
#include <sys/param.h>
|
|
|
|
#endif
|
2006-06-29 02:33:54 +04:00
|
|
|
#ifdef HAVE_STRING_H
|
|
|
|
#include <string.h>
|
|
|
|
#endif /* HAVE_STRING_H */
|
|
|
|
#include <sys/types.h>
|
2007-02-03 03:25:42 +03:00
|
|
|
#ifdef HAVE_DIRENT_H
|
|
|
|
#include <dirent.h>
|
|
|
|
#endif /* HAVE_DIRENT_H */
|
|
|
|
#include <signal.h>
|
2006-06-29 02:33:54 +04:00
|
|
|
|
|
|
|
#include "orte/orte_constants.h"
|
|
|
|
|
|
|
|
#include "opal/util/cmd_line.h"
|
|
|
|
#include "opal/util/argv.h"
|
|
|
|
#include "opal/util/show_help.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
#include "opal/util/opal_environ.h"
|
2006-07-11 09:50:15 +04:00
|
|
|
#include "opal/util/os_dirpath.h"
|
2006-06-29 02:33:54 +04:00
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
|
|
|
|
#include "orte/util/univ_info.h"
|
|
|
|
#include "orte/util/sys_info.h"
|
|
|
|
#include "orte/util/proc_info.h"
|
|
|
|
#include "opal/util/os_path.h"
|
|
|
|
#include "orte/util/session_dir.h"
|
|
|
|
#include "orte/util/universe_setup_file_io.h"
|
|
|
|
#include "orte/mca/gpr/gpr.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
|
|
|
|
#include "opal/runtime/opal.h"
|
|
|
|
#include "orte/runtime/runtime.h"
|
|
|
|
|
|
|
|
/******************
|
|
|
|
* Local Functions
|
|
|
|
******************/
|
|
|
|
/* Initializes the OPAL/ORTE layers this tool relies on. */
static int orte_clean_init(void);

/* Parses argc/argv and fills in orte_clean_globals. */
static int parse_args(int argc, char *argv[]);

#if !defined(__WINDOWS__)
/* Scans the process list and signals stray orted/orterun processes. */
static void kill_procs(void);
#endif /* !defined(__WINDOWS__) */
|
2006-06-29 02:33:54 +04:00
|
|
|
|
|
|
|
/*****************************************
|
|
|
|
* Global Vars for Command line Arguments
|
|
|
|
*****************************************/
|
|
|
|
/*
 * Flags filled in from the command line. Both members are plain
 * booleans that the option table points at directly.
 */
typedef struct {
    bool help;      /* usage message was requested */
    bool verbose;   /* report each action that is taken */
} orte_clean_globals_t;

/* The single instance holding this tool's parsed options. */
orte_clean_globals_t orte_clean_globals;
|
|
|
|
|
|
|
|
opal_cmd_line_init_t cmd_line_opts[] = {
|
|
|
|
{ NULL, NULL, NULL,
|
|
|
|
'h', NULL, "help",
|
|
|
|
0,
|
|
|
|
&orte_clean_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
|
|
|
"This help message" },
|
|
|
|
|
|
|
|
{ NULL, NULL, NULL,
|
|
|
|
'v', NULL, "verbose",
|
|
|
|
0,
|
|
|
|
&orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
2007-02-03 03:25:42 +03:00
|
|
|
"Generate verbose output" },
|
2006-06-29 02:33:54 +04:00
|
|
|
|
|
|
|
/* End of list */
|
|
|
|
{ NULL, NULL, NULL,
|
|
|
|
'\0', NULL, NULL,
|
|
|
|
0,
|
|
|
|
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
|
|
|
NULL }
|
|
|
|
};
|
|
|
|
|
2007-02-03 03:25:42 +03:00
|
|
|
/*
|
|
|
|
* This utility will do a brute force clean of a node. It will
|
|
|
|
* attempt to clean up any files in the user's session directory.
|
|
|
|
* It will also look for any orted and orterun processes that are
|
|
|
|
* not part of this job, and kill them off.
|
|
|
|
*/
|
2006-06-29 02:33:54 +04:00
|
|
|
int
|
|
|
|
main(int argc, char *argv[])
|
|
|
|
{
|
2006-06-30 18:22:58 +04:00
|
|
|
int ret, exit_status = ORTE_SUCCESS;
|
2006-06-29 02:33:54 +04:00
|
|
|
|
|
|
|
/***************
|
|
|
|
* Initialize
|
|
|
|
***************/
|
|
|
|
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_clean_init())) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
/*
|
2007-02-03 03:25:42 +03:00
|
|
|
* Clean out all /tmp directories except for our own.
|
2006-06-29 02:33:54 +04:00
|
|
|
*/
|
2007-02-03 03:25:42 +03:00
|
|
|
orte_universe_clean_directories(orte_universe_info.name, orte_clean_globals.verbose);
|
|
|
|
#if !defined(__WINDOWS__)
|
|
|
|
kill_procs();
|
|
|
|
#endif /* !defined(__WINDOWS__) */
|
2006-06-29 02:33:54 +04:00
|
|
|
|
2006-08-23 06:35:00 +04:00
|
|
|
orte_finalize();
|
|
|
|
opal_finalize();
|
2007-02-03 03:25:42 +03:00
|
|
|
cleanup:
|
2006-06-29 02:33:54 +04:00
|
|
|
return exit_status;
|
|
|
|
}
|
2007-02-03 03:25:42 +03:00
|
|
|
/*
|
|
|
|
* Parse the command line arguments using the functions command
|
|
|
|
* line utility functions.
|
|
|
|
*/
|
2006-06-29 02:33:54 +04:00
|
|
|
static int parse_args(int argc, char *argv[]) {
|
|
|
|
int i, ret, len;
|
|
|
|
opal_cmd_line_t cmd_line;
|
|
|
|
char **app_env = NULL, **global_env = NULL;
|
|
|
|
orte_clean_globals_t tmp = { false, false };
|
|
|
|
|
|
|
|
/* Parse the command line options */
|
2007-01-11 17:30:32 +03:00
|
|
|
|
|
|
|
/* NOTE: There is a bug in the PGI 6.2 series that causes the
|
|
|
|
compiler to choke when copying structs containing bool members
|
|
|
|
by value. So do a memcpy here instead. */
|
|
|
|
memcpy(&orte_clean_globals, &tmp, sizeof(tmp));
|
2006-06-29 02:33:54 +04:00
|
|
|
|
2007-02-03 03:25:42 +03:00
|
|
|
/*
|
|
|
|
* Initialize list of available command line options.
|
|
|
|
*/
|
2006-06-29 02:33:54 +04:00
|
|
|
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
|
|
|
|
|
|
|
mca_base_open();
|
|
|
|
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Put all of the MCA arguments in the environment
|
|
|
|
*/
|
|
|
|
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
|
|
|
|
|
|
|
|
len = opal_argv_count(app_env);
|
|
|
|
for(i = 0; i < len; ++i) {
|
|
|
|
putenv(app_env[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
len = opal_argv_count(global_env);
|
|
|
|
for(i = 0; i < len; ++i) {
|
|
|
|
putenv(global_env[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
opal_setenv(mca_base_param_env_var("crs_base_is_tool"),
|
2006-08-23 06:35:00 +04:00
|
|
|
"1", true, NULL);
|
2006-06-29 02:33:54 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Now start parsing our specific arguments
|
|
|
|
*/
|
|
|
|
if (OPAL_SUCCESS != ret ||
|
|
|
|
orte_clean_globals.help) {
|
|
|
|
char *args = NULL;
|
|
|
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
2007-02-03 03:25:42 +03:00
|
|
|
opal_show_help("help-orte-clean.txt", "usage", true,
|
2006-06-29 02:33:54 +04:00
|
|
|
args);
|
|
|
|
free(args);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
|
|
|
|
2007-02-03 03:25:42 +03:00
|
|
|
OBJ_DESTRUCT(&cmd_line);
|
|
|
|
|
2006-06-29 02:33:54 +04:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int orte_clean_init(void) {
|
|
|
|
int exit_status = ORTE_SUCCESS, ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are trying to attach to another process' GPR so we need to
|
|
|
|
* attach no matter if it is identified as private or not.
|
|
|
|
*/
|
|
|
|
opal_setenv(mca_base_param_env_var("universe_console"),
|
2006-08-23 06:35:00 +04:00
|
|
|
"1", true, NULL);
|
2006-06-29 02:33:54 +04:00
|
|
|
|
|
|
|
/***************************
|
|
|
|
* We need all of OPAL
|
|
|
|
***************************/
|
|
|
|
if (ORTE_SUCCESS != (ret = opal_init())) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_system_init(true))) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
#if 0
|
|
|
|
/***************************
|
|
|
|
* And ORTE, but need to do a bit of a dance first
|
|
|
|
***************************/
|
|
|
|
/* register handler for errnum -> string converstion */
|
|
|
|
opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);
|
|
|
|
|
|
|
|
/* Register all MCA Params */
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_register_params(true))) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Ensure the system_info structure is instantiated and initialized */
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_sys_info())) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Ensure the process info structure is instantiated and initialized */
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_proc_info())) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
return exit_status;
|
|
|
|
}
|
|
|
|
|
2007-02-03 03:25:42 +03:00
|
|
|
#if !defined(__WINDOWS__)
|
|
|
|
/*
|
|
|
|
* This function makes a call to "ps" to find out the processes that
|
|
|
|
* are running on this node. It then attempts to kill off any orteds
|
|
|
|
* and orteruns that are not related to this job.
|
|
|
|
*/
|
|
|
|
static
|
|
|
|
void kill_procs(void) {
|
2007-02-06 01:03:58 +03:00
|
|
|
int ortedpid;
|
2007-02-03 03:25:42 +03:00
|
|
|
char procname[MAXPATHLEN]; /* only really need 8, but being safe */
|
|
|
|
char pidstr[MAXPATHLEN]; /* only really need 8, but being safe */
|
|
|
|
char user[MAXPATHLEN];
|
|
|
|
int procpid;
|
|
|
|
FILE *psfile;
|
|
|
|
bool kill_orteruns = false;
|
2007-02-06 01:03:58 +03:00
|
|
|
int orunpid = 0;
|
2006-07-11 21:31:05 +04:00
|
|
|
|
2007-02-03 03:25:42 +03:00
|
|
|
/*
|
|
|
|
* This is the command that is used to get the information about
|
|
|
|
* all the processes that are running. The output looks like the
|
|
|
|
* following:
|
|
|
|
* COMMAND PID USER
|
|
|
|
* tcsh 12556 rolfv
|
|
|
|
* ps 14424 rolfv
|
|
|
|
* etc.
|
|
|
|
* Currently, we do not make use of the USER field, but we may later
|
|
|
|
* on so we grab it also.
|
|
|
|
*/
|
2006-06-29 02:33:54 +04:00
|
|
|
|
2007-02-03 03:25:42 +03:00
|
|
|
/*
|
|
|
|
* The configure determines if there is a valid ps command for us to
|
|
|
|
* use. If it is set to unknown, then we skip this section.
|
|
|
|
*/
|
|
|
|
char command[] = ORTE_CLEAN_PS_CMD;
|
|
|
|
if (!(strcmp("unknown", command))) {
|
|
|
|
return;
|
|
|
|
}
|
2006-06-29 02:33:54 +04:00
|
|
|
|
2007-02-03 03:25:42 +03:00
|
|
|
/*
|
|
|
|
* Try to get the pid of our orterun process from our universe name.
|
|
|
|
* This works in the case where one is using the default universe name
|
|
|
|
* which appends the pid after the 'default-universe-' string. In
|
|
|
|
* this way, we avoid killing our own mpirun process. Note that if
|
|
|
|
* we cannot determine our orterun pid, then we skip killing the
|
|
|
|
* orterun processes to avoid odd behavior for the user.
|
|
|
|
*/
|
|
|
|
if (!(strncmp(ORTE_DEFAULT_UNIVERSE, orte_universe_info.name,
|
|
|
|
sizeof(ORTE_DEFAULT_UNIVERSE)-1))) {
|
|
|
|
char *tptr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set a pointer to the pid part of the name. The pointer
|
|
|
|
* is adjusted by the name along with one extra to remove the
|
|
|
|
* dash before the pid. Then convert to a pid. If the strtol()
|
|
|
|
* returns zero, then we got an error on the conversion and we
|
|
|
|
* will skip killing the orteruns.
|
|
|
|
*/
|
|
|
|
tptr = orte_universe_info.name + sizeof(ORTE_DEFAULT_UNIVERSE);
|
|
|
|
if (0 != (orunpid = (int)strtol(tptr, (char **)NULL, 10))) {
|
|
|
|
kill_orteruns = true;
|
|
|
|
}
|
|
|
|
}
|
2006-06-29 02:33:54 +04:00
|
|
|
|
2007-02-03 03:25:42 +03:00
|
|
|
/*
|
|
|
|
* Get our parent pid which is the pid of the orted.
|
|
|
|
*/
|
|
|
|
ortedpid = getppid();
|
2006-06-29 02:33:54 +04:00
|
|
|
|
|
|
|
/*
|
2007-02-03 03:25:42 +03:00
|
|
|
* There is a race condition here. The problem is that we are looking
|
|
|
|
* for any processes named orted. However, one may erroneously find more
|
|
|
|
* orteds then there really are because the orted is doing a series of
|
|
|
|
* fork/execs. If we run with more than one orte-clean on a node, then
|
|
|
|
* one of the orte-cleans may catch the other one while it has forked,
|
|
|
|
* but not exec'ed. It will therefore kill an orte-clean. Now one
|
|
|
|
* can argue it is silly to run more than one orte-clean on a node, and
|
|
|
|
* this is true. We will have to figure out how to prevent this. For
|
|
|
|
* now, we use a big hammer and just sleep a second to decrease the
|
|
|
|
* probability.
|
2006-06-29 02:33:54 +04:00
|
|
|
*/
|
2007-02-03 03:25:42 +03:00
|
|
|
sleep(1);
|
|
|
|
|
|
|
|
psfile = popen(command, "r");
|
2006-06-29 02:33:54 +04:00
|
|
|
/*
|
2007-02-03 03:25:42 +03:00
|
|
|
* Read the first line of the output. We just throw it away
|
|
|
|
* as it is the header consisting of the words COMMAND, PID and
|
|
|
|
* USER.
|
2006-06-29 02:33:54 +04:00
|
|
|
*/
|
2007-02-03 03:25:42 +03:00
|
|
|
if ((fscanf(psfile, "%s%s%s", procname, pidstr, user)) == EOF) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
while ((fscanf(psfile, "%s%s%s", procname, pidstr, user)) != EOF) {
|
|
|
|
|
|
|
|
procpid = atoi(pidstr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Look for any orteds that are not our parent and attempt to
|
|
|
|
* kill them. We currently do not worry whether we are the
|
|
|
|
* owner or not. If we are not, we will just fail to send
|
|
|
|
* the signal and that is OK. This also allows a root process
|
|
|
|
* to kill all orteds.
|
|
|
|
*/
|
|
|
|
if (!strcmp("orted", procname)) {
|
|
|
|
if (procpid != ortedpid) {
|
|
|
|
if (orte_clean_globals.verbose) {
|
|
|
|
opal_output(0, "orte-clean: found potential rogue orted process"
|
|
|
|
" (pid=%d,user=%s), sending SIGKILL...\n",
|
|
|
|
procpid, user);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We ignore the return code here as we do not really
|
|
|
|
* care whether this worked or not.
|
|
|
|
*/
|
|
|
|
(void)kill(procpid, SIGKILL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now check for any orteruns.
|
|
|
|
*/
|
|
|
|
if (kill_orteruns) {
|
|
|
|
if (!strcmp("orterun", procname)) {
|
|
|
|
if (procpid != orunpid) {
|
|
|
|
if (orte_clean_globals.verbose) {
|
|
|
|
opal_output(0, "orte-clean: found potential rogue orterun process"
|
|
|
|
" (pid=%d,user=%s), sending SIGKILL...\n",
|
|
|
|
procpid, user);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We ignore the return code here as we do not really
|
|
|
|
* care whether this worked or not.
|
|
|
|
*/
|
|
|
|
(void)kill(procpid, SIGKILL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2006-06-29 02:33:54 +04:00
|
|
|
}
|
2007-02-03 03:25:42 +03:00
|
|
|
return;
|
2006-06-29 02:33:54 +04:00
|
|
|
}
|
2007-02-03 03:25:42 +03:00
|
|
|
#endif /* !defined(__WINDOWS__) */
|