1
1

Fix a couple of problems with orte-clean. Also add a new --debug flag to help developers figure out possible future issues.

This fixes trac:1335.

This commit was SVN r18979.

The following Trac tickets were found above:
  Ticket 1335 --> https://svn.open-mpi.org/trac/ompi/ticket/1335
Этот коммит содержится в:
Rolf vandeVaart 2008-07-22 17:41:06 +00:00
родитель 26cfac94e6
Коммит ed4920ba5f
2 изменённых файлов: 79 добавлений и 89 удалений

Просмотреть файл

@ -3,7 +3,7 @@
.\" University Research and Technology .\" University Research and Technology
.\" Corporation. All rights reserved. .\" Corporation. All rights reserved.
.\" .\"
.\" Copyright 2007, Sun Microsystems, Inc. .\" Copyright (c) 2007-2008 Sun Microsystems, Inc.
.\" .\"
.TH orte-clean 1 "March 2007" "Open MPI 1.2" "OPEN MPI COMMANDS " .TH orte-clean 1 "March 2007" "Open MPI 1.2" "OPEN MPI COMMANDS "
.SH NAME .SH NAME
@ -18,9 +18,9 @@ from Open MPI jobs.
.SH SYNOPSIS .SH SYNOPSIS
.ft R .ft R
.nf .nf
orte-clean [--verbose] orte-clean [--verbose] [--debug]
.br .br
mpirun --pernode [--host | --hostfile \fIfile\fP] orte-clean [--verbose] mpirun --pernode [--host | --hostfile \fIfile\fP] orte-clean [--verbose] [--debug]
.sp .sp
@ -33,7 +33,11 @@ mpirun --pernode [--host | --hostfile \fIfile\fP] orte-clean [--verbose]
mode and print out the universes that are getting cleaned up mode and print out the universes that are getting cleaned up
as well as processes that are being killed. as well as processes that are being killed.
.sp .sp
.ft R
[-d | --debug] This argument will run the command in debug
mode and print out lots of details about what the command is
doing. This is intended for developer use only.
.sp
.\" ************************** .\" **************************
.\" Description Section .\" Description Section
.\" ************************** .\" **************************

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -18,7 +18,6 @@
* *
* $HEADER$ * $HEADER$
*/ */
#include "orte_config.h" #include "orte_config.h"
#include "orte/constants.h" #include "orte/constants.h"
@ -57,6 +56,7 @@
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/opal_environ.h" #include "opal/util/opal_environ.h"
#include "opal/util/os_dirpath.h" #include "opal/util/os_dirpath.h"
#include "opal/util/basename.h"
#include "opal/mca/base/base.h" #include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
@ -74,7 +74,6 @@
/****************** /******************
* Local Functions * Local Functions
******************/ ******************/
static int orte_clean_init(void);
static int parse_args(int argc, char *argv[]); static int parse_args(int argc, char *argv[]);
#if !defined(__WINDOWS__) #if !defined(__WINDOWS__)
static void kill_procs(void); static void kill_procs(void);
@ -86,6 +85,7 @@ static void kill_procs(void);
typedef struct { typedef struct {
bool help; bool help;
bool verbose; bool verbose;
bool debug;
} orte_clean_globals_t; } orte_clean_globals_t;
orte_clean_globals_t orte_clean_globals; orte_clean_globals_t orte_clean_globals;
@ -103,6 +103,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
&orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, &orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Generate verbose output" }, "Generate verbose output" },
{ NULL, NULL, NULL,
'd', NULL, "debug",
0,
&orte_clean_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
"Extra debug output for developers to ensure that orte-clean is working" },
/* End of list */ /* End of list */
{ NULL, NULL, NULL, { NULL, NULL, NULL,
'\0', NULL, NULL, '\0', NULL, NULL,
@ -120,18 +126,44 @@ opal_cmd_line_init_t cmd_line_opts[] = {
int int
main(int argc, char *argv[]) main(int argc, char *argv[])
{ {
int ret, exit_status = ORTE_SUCCESS; int ret = ORTE_SUCCESS;
char *tmp_env_var;
/* This is needed so we can print the help message */
if (ORTE_SUCCESS != (ret = opal_init_util())) {
return ret;
}
/***************
* Initialize
***************/
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) { if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
return ret; return ret;
} }
if (ORTE_SUCCESS != (ret = orte_clean_init())) {
exit_status = ret; #if OPAL_ENABLE_FT == 1
goto cleanup; /* Disable the checkpoint notification routine for this
* tool. As we will never need to checkpoint this tool.
* Note: This must happen before opal_init().
*/
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a checkpointer */
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1", true, NULL);
free(tmp_env_var);
#endif
tmp_env_var = NULL; /* Silence compiler warning */
if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL_WITH_NAME))) {
return ret;
} }
/* /*
* Clean out all session directories - we don't have to protect * Clean out all session directories - we don't have to protect
* our own session directory because (since we are a tool) we * our own session directory because (since we are a tool) we
@ -150,8 +182,7 @@ main(int argc, char *argv[])
orte_finalize(); orte_finalize();
cleanup: return ORTE_SUCCESS;
return exit_status;
} }
/* /*
* Parse the command line arguments using the functions command * Parse the command line arguments using the functions command
@ -160,10 +191,7 @@ main(int argc, char *argv[])
static int parse_args(int argc, char *argv[]) { static int parse_args(int argc, char *argv[]) {
int ret; int ret;
opal_cmd_line_t cmd_line; opal_cmd_line_t cmd_line;
orte_clean_globals_t tmp = { false, false }; orte_clean_globals_t tmp = { false, false, false };
char * tmp_env_var = NULL;
/* Parse the command line options */
/* NOTE: There is a bug in the PGI 6.2 series that causes the /* NOTE: There is a bug in the PGI 6.2 series that causes the
compiler to choke when copying structs containing bool members compiler to choke when copying structs containing bool members
@ -176,12 +204,6 @@ static int parse_args(int argc, char *argv[]) {
opal_cmd_line_create(&cmd_line, cmd_line_opts); opal_cmd_line_create(&cmd_line, cmd_line_opts);
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv); ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1", true, NULL);
free(tmp_env_var);
tmp_env_var = NULL;
/** /**
* Now start parsing our specific arguments * Now start parsing our specific arguments
*/ */
@ -200,35 +222,6 @@ static int parse_args(int argc, char *argv[]) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
static int orte_clean_init(void) {
int exit_status = ORTE_SUCCESS, ret;
char * tmp_env_var = NULL;
#if OPAL_ENABLE_FT == 1
/* Disable the checkpoint notification routine for this
* tool. As we will never need to checkpoint this tool.
* Note: This must happen before opal_init().
*/
opal_cr_set_enabled(false);
/* Select the none component, since we don't actually use a checkpointer */
tmp_env_var = mca_base_param_env_var("crs");
opal_setenv(tmp_env_var,
"none",
true, &environ);
free(tmp_env_var);
#endif
tmp_env_var = NULL; /* Silence compiler warning */
if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL_WITH_NAME))) {
exit_status = ret;
goto cleanup;
}
cleanup:
return exit_status;
}
#if !defined(__WINDOWS__) #if !defined(__WINDOWS__)
static char *orte_getline(FILE *fp) static char *orte_getline(FILE *fp)
{ {
@ -260,15 +253,17 @@ static char *orte_getline(FILE *fp)
static static
void kill_procs(void) { void kill_procs(void) {
int ortedpid; int ortedpid;
char *fullprocname;
char *procname; char *procname;
char *pidstr; char *pidstr;
char *user; char *user;
int procpid; int procpid;
FILE *psfile; FILE *psfile;
char *inputline, *tmpline; char *inputline;
char *this_user; char *this_user;
int uid; int uid;
struct passwd *pwdent; struct passwd *pwdent;
char *separator = " \t"; /* output can be delimited by space or tab */
/* /*
* This is the command that is used to get the information about * This is the command that is used to get the information about
@ -300,7 +295,7 @@ void kill_procs(void) {
*/ */
ortedpid = getppid(); ortedpid = getppid();
/* get the name of the user */ /* get the name of the user */
uid = getuid(); uid = getuid();
#ifdef HAVE_GETPWUID #ifdef HAVE_GETPWUID
pwdent = getpwuid(uid); pwdent = getpwuid(uid);
@ -343,39 +338,30 @@ void kill_procs(void) {
while (NULL != (inputline = orte_getline(psfile))) { while (NULL != (inputline = orte_getline(psfile))) {
/* the user name is at the end of the line, with a space /* The three fields are typically separated by spaces */
* preceding it - extract that field fullprocname = strtok(inputline, separator);
*/ pidstr = strtok(NULL, separator);
user = strrchr(inputline, ' '); user = strtok(NULL, separator);
*user = '\0'; /* null terminate the remainder of the line */
user++; /* increment to point to the beginning of the user name */
/* if we are not the user, dump this input */ if (orte_clean_globals.debug) {
if (0 != strcmp(user, this_user)) { fprintf(stdout, "\norte-clean: user(pid)=%s, me=%s\n",
user, this_user);
}
/* If the user is not us, and the user is not root, then skip
* further checking. If the user is root, then continue on as
* we want root to kill off everybody. */
if ((0 != strcmp(user, this_user)) && (0 != strcmp("root", this_user))) {
/* not us */ /* not us */
free(inputline); free(inputline);
continue; continue;
} }
/* copy just the first part so we can search
* from the back of the string
*/
tmpline = strdup(inputline);
/* parse the truncated line for the procname and pid */
pidstr = strrchr(tmpline, ' ');
*pidstr = '\0'; /* NULL terminate the front of the line */
pidstr++;
procpid = atoi(pidstr); procpid = atoi(pidstr);
procname = opal_basename(fullprocname);
/* since we null-terminated inputline at the end of the if (orte_clean_globals.debug) {
* procname field, we can now search that field to fprintf(stdout, "orte-clean: fullname=%s, basename=%s, pid=%d\n",
* separate out the base command name in case they fullprocname, procname, procpid);
* have a bunch of path stuff at the start
*/
if (NULL == (procname = strrchr(tmpline, '/'))) {
procname = tmpline; /* no path in command name */
} else {
procname++; /* move past the / */
} }
/* /*
@ -433,7 +419,7 @@ void kill_procs(void) {
} }
} }
free(inputline); free(inputline);
free(tmpline); free(procname);
} }
free(this_user); free(this_user);
return; return;