1
1
openmpi/orte/tools/orte-clean/orte-clean.c
Rolf vandeVaart ed4920ba5f Fix a couple problems with orte-clean. Also add a new
--debug flag to help developers figure out possible future issues.

This fixes trac:1335.

This commit was SVN r18979.

The following Trac tickets were found above:
  Ticket 1335 --> https://svn.open-mpi.org/trac/ompi/ticket/1335
2008-07-22 17:41:06 +00:00

428 строки
13 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif /* HAVE_SYS_PARAM_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#include <signal.h>
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif /* HAVE_PWD_H */
#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/os_dirpath.h"
#include "opal/util/basename.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "opal/util/os_path.h"
#include "orte/util/session_dir.h"
#include "opal/runtime/opal.h"
#if OPAL_ENABLE_FT == 1
#include "opal/runtime/opal_cr.h"
#endif
#include "orte/runtime/runtime.h"
/******************
* Local Functions
******************/
static int parse_args(int argc, char *argv[]);
#if !defined(__WINDOWS__)
static void kill_procs(void);
#endif /* !defined(__WINDOWS__) */
/*****************************************
* Global Vars for Command line Arguments
*****************************************/
typedef struct {
bool help;
bool verbose;
bool debug;
} orte_clean_globals_t;
orte_clean_globals_t orte_clean_globals;
opal_cmd_line_init_t cmd_line_opts[] = {
{ NULL, NULL, NULL,
'h', NULL, "help",
0,
&orte_clean_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL, NULL, NULL,
'v', NULL, "verbose",
0,
&orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Generate verbose output" },
{ NULL, NULL, NULL,
'd', NULL, "debug",
0,
&orte_clean_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
"Extra debug output for developers to ensure that orte-clean is working" },
/* End of list */
{ NULL, NULL, NULL,
'\0', NULL, NULL,
0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
/*
* This utility will do a brute force clean of a node. It will
* attempt to clean up any files in the user's session directory.
* It will also look for any orted and orterun processes that are
* not part of this job, and kill them off.
*/
int
main(int argc, char *argv[])
{
int ret = ORTE_SUCCESS;
char *tmp_env_var;
/* This is needed so we can print the help message */
if (ORTE_SUCCESS != (ret = opal_init_util())) {
return ret;
}
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
return ret;
}
#if OPAL_ENABLE_FT == 1
/* Disable the checkpoint notification routine for this
* tool. As we will never need to checkpoint this tool.
* Note: This must happen before opal_init().
*/
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a checkpointer */
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1", true, NULL);
free(tmp_env_var);
#endif
tmp_env_var = NULL; /* Silence compiler warning */
if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL_WITH_NAME))) {
return ret;
}
/*
* Clean out all session directories - we don't have to protect
* our own session directory because (since we are a tool) we
* didn't create one!
*/
if (orte_clean_globals.verbose) {
fprintf(stderr, "orte-clean: cleaning session dir tree %s\n",
orte_process_info.top_session_dir);
}
opal_os_dirpath_destroy(orte_process_info.top_session_dir, true, NULL);
/* now kill any lingering procs, if we can */
#if !defined(__WINDOWS__)
kill_procs();
#endif /* !defined(__WINDOWS__) */
orte_finalize();
return ORTE_SUCCESS;
}
/*
* Parse the command line arguments using the functions command
* line utility functions.
*/
static int parse_args(int argc, char *argv[]) {
int ret;
opal_cmd_line_t cmd_line;
orte_clean_globals_t tmp = { false, false, false };
/* NOTE: There is a bug in the PGI 6.2 series that causes the
compiler to choke when copying structs containing bool members
by value. So do a memcpy here instead. */
memcpy(&orte_clean_globals, &tmp, sizeof(tmp));
/*
* Initialize list of available command line options.
*/
opal_cmd_line_create(&cmd_line, cmd_line_opts);
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
/**
* Now start parsing our specific arguments
*/
if (OPAL_SUCCESS != ret ||
orte_clean_globals.help) {
char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
orte_show_help("help-orte-clean.txt", "usage", true,
args);
free(args);
return ORTE_ERROR;
}
OBJ_DESTRUCT(&cmd_line);
return ORTE_SUCCESS;
}
#if !defined(__WINDOWS__)
static char *orte_getline(FILE *fp)
{
char *ret, *buff;
char input[1024];
int i;
ret = fgets(input, 1024, fp);
if (NULL != ret) {
/* remove trailing spaces */
for (i=strlen(input)-2; i > 0; i--) {
if (input[i] != ' ') {
input[i+1] = '\0';
break;
}
}
buff = strdup(input);
return buff;
}
return NULL;
}
/*
* This function makes a call to "ps" to find out the processes that
* are running on this node. It then attempts to kill off any orteds
* and orteruns that are not related to this job.
*/
static
void kill_procs(void) {
int ortedpid;
char *fullprocname;
char *procname;
char *pidstr;
char *user;
int procpid;
FILE *psfile;
char *inputline;
char *this_user;
int uid;
struct passwd *pwdent;
char *separator = " \t"; /* output can be delimited by space or tab */
/*
* This is the command that is used to get the information about
* all the processes that are running. The output looks like the
* following:
* COMMAND PID USER
* tcsh 12556 rolfv
* ps 14424 rolfv
* etc.
* Currently, we do not make use of the USER field, but we may later
* on so we grab it also.
*/
/*
* The configure determines if there is a valid ps command for us to
* use. If it is set to unknown, then we skip this section.
*/
char command[] = ORTE_CLEAN_PS_CMD;
if (0 == strcmp("unknown", command)) {
return;
}
if (orte_clean_globals.verbose) {
fprintf(stderr, "orte-clean: killing any lingering procs\n");
}
/*
* Get our parent pid which is the pid of the orted.
*/
ortedpid = getppid();
/* get the name of the user */
uid = getuid();
#ifdef HAVE_GETPWUID
pwdent = getpwuid(uid);
#else
pwdent = NULL;
#endif
if (NULL != pwdent) {
this_user = strdup(pwdent->pw_name);
} else {
if (0 > asprintf(&this_user, "%d", uid)) {
return;
}
}
/*
* There is a race condition here. The problem is that we are looking
* for any processes named orted. However, one may erroneously find more
* orteds then there really are because the orted is doing a series of
* fork/execs. If we run with more than one orte-clean on a node, then
* one of the orte-cleans may catch the other one while it has forked,
* but not exec'ed. It will therefore kill an orte-clean. Now one
* can argue it is silly to run more than one orte-clean on a node, and
* this is true. We will have to figure out how to prevent this. For
* now, we use a big hammer and just sleep a second to decrease the
* probability.
*/
sleep(1);
psfile = popen(command, "r");
/*
* Read the first line of the output. We just throw it away
* as it is the header consisting of the words COMMAND, PID and
* USER.
*/
if (NULL == (inputline = orte_getline(psfile))) {
free(this_user);
return;
}
free(inputline); /* dump the header line */
while (NULL != (inputline = orte_getline(psfile))) {
/* The three fields are typically seperated by spaces */
fullprocname = strtok(inputline, separator);
pidstr = strtok(NULL, separator);
user = strtok(NULL, separator);
if (orte_clean_globals.debug) {
fprintf(stdout, "\norte-clean: user(pid)=%s, me=%s\n",
user, this_user);
}
/* If the user is not us, and the user is not root, then skip
* further checking. If the user is root, then continue on as
* we want root to kill off everybody. */
if ((0 != strcmp(user, this_user)) && (0 != strcmp("root", this_user))) {
/* not us */
free(inputline);
continue;
}
procpid = atoi(pidstr);
procname = opal_basename(fullprocname);
if (orte_clean_globals.debug) {
fprintf(stdout, "orte-clean: fullname=%s, basename=%s, pid=%d\n",
fullprocname, procname, procpid);
}
/*
* Look for any orteds that are not our parent and attempt to
* kill them. We currently do not worry whether we are the
* owner or not. If we are not, we will just fail to send
* the signal and that is OK. This also allows a root process
* to kill all orteds.
*
* NOTE: need to also look for "(orted)" as a non-active
* proc is sometimes reported that way
*/
if (0 == strncmp("orted", procname, strlen("orted")) ||
0 == strncmp("(orted)", procname, strlen("(orted)"))) {
if (procpid != ortedpid) {
if (orte_clean_globals.verbose) {
fprintf(stderr, "orte-clean: found potential rogue orted process"
" (pid=%d,user=%s), sending SIGKILL...\n",
procpid, user);
}
/*
* We ignore the return code here as we do not really
* care whether this worked or not.
*/
(void)kill(procpid, SIGKILL);
}
}
/*
* Now check for any orteruns.
*/
if (0 == strncmp("orterun", procname, strlen("orterun")) ||
0 == strncmp("mpirun", procname, strlen("mpirun"))) {
/* if we are on the same node as the HNP, then the ortedpid
* is the same as that of our orterun, so don't kill it
*/
if (procpid != ortedpid) {
if (orte_clean_globals.verbose) {
fprintf(stderr, "orte-clean: found potential rogue orterun process"
" (pid=%d,user=%s), sending SIGKILL...\n",
procpid, user);
}
/* if we are a singleton, check the hnp_pid as well */
if (orte_process_info.singleton) {
if (procpid != orte_process_info.hnp_pid) {
(void)kill(procpid, SIGKILL);
}
} else {
/* We ignore the return code here as we do not really
* care whether this worked or not.
*/
(void)kill(procpid, SIGKILL);
}
}
}
free(inputline);
free(procname);
}
free(this_user);
return;
}
#endif /* !defined(__WINDOWS__) */