Fix a couple of problems with orte-clean. Also add a new
--debug flag to help developers figure out possible future issues. This fixes trac:1335. This commit was SVN r18979. The following Trac tickets were found above: Ticket 1335 --> https://svn.open-mpi.org/trac/ompi/ticket/1335
Этот коммит содержится в:
родитель
26cfac94e6
Коммит
ed4920ba5f
@ -3,7 +3,7 @@
|
|||||||
.\" University Research and Technology
|
.\" University Research and Technology
|
||||||
.\" Corporation. All rights reserved.
|
.\" Corporation. All rights reserved.
|
||||||
.\"
|
.\"
|
||||||
.\" Copyright 2007, Sun Microsystems, Inc.
|
.\" Copyright (c) 2007-2008 Sun Microsystems, Inc.
|
||||||
.\"
|
.\"
|
||||||
.TH orte-clean 1 "March 2007" "Open MPI 1.2" "OPEN MPI COMMANDS "
|
.TH orte-clean 1 "March 2007" "Open MPI 1.2" "OPEN MPI COMMANDS "
|
||||||
.SH NAME
|
.SH NAME
|
||||||
@ -18,9 +18,9 @@ from Open MPI jobs.
|
|||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
.ft R
|
.ft R
|
||||||
.nf
|
.nf
|
||||||
orte-clean [--verbose]
|
orte-clean [--verbose] [--debug]
|
||||||
.br
|
.br
|
||||||
mpirun --pernode [--host | --hostfile \fIfile\fP] orte-clean [--verbose]
|
mpirun --pernode [--host | --hostfile \fIfile\fP] orte-clean [--verbose] [--debug]
|
||||||
.sp
|
.sp
|
||||||
|
|
||||||
|
|
||||||
@ -33,7 +33,11 @@ mpirun --pernode [--host | --hostfile \fIfile\fP] orte-clean [--verbose]
|
|||||||
mode and print out the universes that are getting cleaned up
|
mode and print out the universes that are getting cleaned up
|
||||||
as well as processes that are being killed.
|
as well as processes that are being killed.
|
||||||
.sp
|
.sp
|
||||||
|
.ft R
|
||||||
|
[-d | --debug] This argument will run the command in debug
|
||||||
|
mode and print out lots of details about what the command is
|
||||||
|
doing. This is intended for developer use only.
|
||||||
|
.sp
|
||||||
.\" **************************
|
.\" **************************
|
||||||
.\" Description Section
|
.\" Description Section
|
||||||
.\" **************************
|
.\" **************************
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -18,7 +18,6 @@
|
|||||||
*
|
*
|
||||||
* $HEADER$
|
* $HEADER$
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "orte_config.h"
|
#include "orte_config.h"
|
||||||
#include "orte/constants.h"
|
#include "orte/constants.h"
|
||||||
|
|
||||||
@ -57,6 +56,7 @@
|
|||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/opal_environ.h"
|
#include "opal/util/opal_environ.h"
|
||||||
#include "opal/util/os_dirpath.h"
|
#include "opal/util/os_dirpath.h"
|
||||||
|
#include "opal/util/basename.h"
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
|
||||||
@ -74,7 +74,6 @@
|
|||||||
/******************
|
/******************
|
||||||
* Local Functions
|
* Local Functions
|
||||||
******************/
|
******************/
|
||||||
static int orte_clean_init(void);
|
|
||||||
static int parse_args(int argc, char *argv[]);
|
static int parse_args(int argc, char *argv[]);
|
||||||
#if !defined(__WINDOWS__)
|
#if !defined(__WINDOWS__)
|
||||||
static void kill_procs(void);
|
static void kill_procs(void);
|
||||||
@ -86,6 +85,7 @@ static void kill_procs(void);
|
|||||||
typedef struct {
|
typedef struct {
|
||||||
bool help;
|
bool help;
|
||||||
bool verbose;
|
bool verbose;
|
||||||
|
bool debug;
|
||||||
} orte_clean_globals_t;
|
} orte_clean_globals_t;
|
||||||
|
|
||||||
orte_clean_globals_t orte_clean_globals;
|
orte_clean_globals_t orte_clean_globals;
|
||||||
@ -103,6 +103,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
|
|||||||
&orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
&orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Generate verbose output" },
|
"Generate verbose output" },
|
||||||
|
|
||||||
|
{ NULL, NULL, NULL,
|
||||||
|
'd', NULL, "debug",
|
||||||
|
0,
|
||||||
|
&orte_clean_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Extra debug output for developers to ensure that orte-clean is working" },
|
||||||
|
|
||||||
/* End of list */
|
/* End of list */
|
||||||
{ NULL, NULL, NULL,
|
{ NULL, NULL, NULL,
|
||||||
'\0', NULL, NULL,
|
'\0', NULL, NULL,
|
||||||
@ -120,18 +126,44 @@ opal_cmd_line_init_t cmd_line_opts[] = {
|
|||||||
int
|
int
|
||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
int ret, exit_status = ORTE_SUCCESS;
|
int ret = ORTE_SUCCESS;
|
||||||
|
char *tmp_env_var;
|
||||||
|
|
||||||
|
/* This is needed so we can print the help message */
|
||||||
|
if (ORTE_SUCCESS != (ret = opal_init_util())) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/***************
|
|
||||||
* Initialize
|
|
||||||
***************/
|
|
||||||
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS != (ret = orte_clean_init())) {
|
|
||||||
exit_status = ret;
|
#if OPAL_ENABLE_FT == 1
|
||||||
goto cleanup;
|
/* Disable the checkpoint notification routine for this
|
||||||
|
* tool, as we will never need to checkpoint it.
|
||||||
|
* Note: This must happen before opal_init().
|
||||||
|
*/
|
||||||
|
opal_cr_set_enabled(false);
|
||||||
|
|
||||||
|
/* Select the none component, since we don't actually use a checkpointer */
|
||||||
|
tmp_env_var = mca_base_param_env_var("crs");
|
||||||
|
opal_setenv(tmp_env_var,
|
||||||
|
"none",
|
||||||
|
true, &environ);
|
||||||
|
free(tmp_env_var);
|
||||||
|
tmp_env_var = NULL;
|
||||||
|
|
||||||
|
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
||||||
|
opal_setenv(tmp_env_var,
|
||||||
|
"1", true, NULL);
|
||||||
|
free(tmp_env_var);
|
||||||
|
#endif
|
||||||
|
tmp_env_var = NULL; /* Silence compiler warning */
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL_WITH_NAME))) {
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Clean out all session directories - we don't have to protect
|
* Clean out all session directories - we don't have to protect
|
||||||
* our own session directory because (since we are a tool) we
|
* our own session directory because (since we are a tool) we
|
||||||
@ -150,8 +182,7 @@ main(int argc, char *argv[])
|
|||||||
|
|
||||||
orte_finalize();
|
orte_finalize();
|
||||||
|
|
||||||
cleanup:
|
return ORTE_SUCCESS;
|
||||||
return exit_status;
|
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
* Parse the command line arguments using the functions command
|
* Parse the command line arguments using the functions command
|
||||||
@ -160,10 +191,7 @@ main(int argc, char *argv[])
|
|||||||
static int parse_args(int argc, char *argv[]) {
|
static int parse_args(int argc, char *argv[]) {
|
||||||
int ret;
|
int ret;
|
||||||
opal_cmd_line_t cmd_line;
|
opal_cmd_line_t cmd_line;
|
||||||
orte_clean_globals_t tmp = { false, false };
|
orte_clean_globals_t tmp = { false, false, false };
|
||||||
char * tmp_env_var = NULL;
|
|
||||||
|
|
||||||
/* Parse the command line options */
|
|
||||||
|
|
||||||
/* NOTE: There is a bug in the PGI 6.2 series that causes the
|
/* NOTE: There is a bug in the PGI 6.2 series that causes the
|
||||||
compiler to choke when copying structs containing bool members
|
compiler to choke when copying structs containing bool members
|
||||||
@ -176,12 +204,6 @@ static int parse_args(int argc, char *argv[]) {
|
|||||||
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
||||||
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
|
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
|
||||||
|
|
||||||
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
|
|
||||||
opal_setenv(tmp_env_var,
|
|
||||||
"1", true, NULL);
|
|
||||||
free(tmp_env_var);
|
|
||||||
tmp_env_var = NULL;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Now start parsing our specific arguments
|
* Now start parsing our specific arguments
|
||||||
*/
|
*/
|
||||||
@ -200,35 +222,6 @@ static int parse_args(int argc, char *argv[]) {
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int orte_clean_init(void) {
|
|
||||||
int exit_status = ORTE_SUCCESS, ret;
|
|
||||||
char * tmp_env_var = NULL;
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT == 1
|
|
||||||
/* Disable the checkpoint notification routine for this
|
|
||||||
* tool, as we will never need to checkpoint it.
|
|
||||||
* Note: This must happen before opal_init().
|
|
||||||
*/
|
|
||||||
opal_cr_set_enabled(false);
|
|
||||||
|
|
||||||
/* Select the none component, since we don't actually use a checkpointer */
|
|
||||||
tmp_env_var = mca_base_param_env_var("crs");
|
|
||||||
opal_setenv(tmp_env_var,
|
|
||||||
"none",
|
|
||||||
true, &environ);
|
|
||||||
free(tmp_env_var);
|
|
||||||
#endif
|
|
||||||
tmp_env_var = NULL; /* Silence compiler warning */
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL_WITH_NAME))) {
|
|
||||||
exit_status = ret;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
return exit_status;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if !defined(__WINDOWS__)
|
#if !defined(__WINDOWS__)
|
||||||
static char *orte_getline(FILE *fp)
|
static char *orte_getline(FILE *fp)
|
||||||
{
|
{
|
||||||
@ -260,15 +253,17 @@ static char *orte_getline(FILE *fp)
|
|||||||
static
|
static
|
||||||
void kill_procs(void) {
|
void kill_procs(void) {
|
||||||
int ortedpid;
|
int ortedpid;
|
||||||
|
char *fullprocname;
|
||||||
char *procname;
|
char *procname;
|
||||||
char *pidstr;
|
char *pidstr;
|
||||||
char *user;
|
char *user;
|
||||||
int procpid;
|
int procpid;
|
||||||
FILE *psfile;
|
FILE *psfile;
|
||||||
char *inputline, *tmpline;
|
char *inputline;
|
||||||
char *this_user;
|
char *this_user;
|
||||||
int uid;
|
int uid;
|
||||||
struct passwd *pwdent;
|
struct passwd *pwdent;
|
||||||
|
char *separator = " \t"; /* output can be delimited by space or tab */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is the command that is used to get the information about
|
* This is the command that is used to get the information about
|
||||||
@ -300,7 +295,7 @@ void kill_procs(void) {
|
|||||||
*/
|
*/
|
||||||
ortedpid = getppid();
|
ortedpid = getppid();
|
||||||
|
|
||||||
/* get the name of the user */
|
/* get the name of the user */
|
||||||
uid = getuid();
|
uid = getuid();
|
||||||
#ifdef HAVE_GETPWUID
|
#ifdef HAVE_GETPWUID
|
||||||
pwdent = getpwuid(uid);
|
pwdent = getpwuid(uid);
|
||||||
@ -343,39 +338,30 @@ void kill_procs(void) {
|
|||||||
|
|
||||||
while (NULL != (inputline = orte_getline(psfile))) {
|
while (NULL != (inputline = orte_getline(psfile))) {
|
||||||
|
|
||||||
/* the user name is at the end of the line, with a space
|
/* The three fields are typically separated by spaces */
|
||||||
* preceding it - extract that field
|
fullprocname = strtok(inputline, separator);
|
||||||
*/
|
pidstr = strtok(NULL, separator);
|
||||||
user = strrchr(inputline, ' ');
|
user = strtok(NULL, separator);
|
||||||
*user = '\0'; /* null terminate the remainder of the line */
|
|
||||||
user++; /* increment to point to the beginning of the user name */
|
|
||||||
|
|
||||||
/* if we are not the user, dump this input */
|
if (orte_clean_globals.debug) {
|
||||||
if (0 != strcmp(user, this_user)) {
|
fprintf(stdout, "\norte-clean: user(pid)=%s, me=%s\n",
|
||||||
|
user, this_user);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If the user is not us, and the user is not root, then skip
|
||||||
|
* further checking. If the user is root, then continue on as
|
||||||
|
* we want root to kill off everybody. */
|
||||||
|
if ((0 != strcmp(user, this_user)) && (0 != strcmp("root", this_user))) {
|
||||||
/* not us */
|
/* not us */
|
||||||
free(inputline);
|
free(inputline);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* copy just the first part so we can search
|
|
||||||
* from the back of the string
|
|
||||||
*/
|
|
||||||
tmpline = strdup(inputline);
|
|
||||||
|
|
||||||
/* parse the truncated line for the procname and pid */
|
|
||||||
pidstr = strrchr(tmpline, ' ');
|
|
||||||
*pidstr = '\0'; /* NULL terminate the front of the line */
|
|
||||||
pidstr++;
|
|
||||||
procpid = atoi(pidstr);
|
procpid = atoi(pidstr);
|
||||||
|
procname = opal_basename(fullprocname);
|
||||||
/* since we null-terminated inputline at the end of the
|
if (orte_clean_globals.debug) {
|
||||||
* procname field, we can now search that field to
|
fprintf(stdout, "orte-clean: fullname=%s, basename=%s, pid=%d\n",
|
||||||
* separate out the base command name in case they
|
fullprocname, procname, procpid);
|
||||||
* have a bunch of path stuff at the start
|
|
||||||
*/
|
|
||||||
if (NULL == (procname = strrchr(tmpline, '/'))) {
|
|
||||||
procname = tmpline; /* no path in command name */
|
|
||||||
} else {
|
|
||||||
procname++; /* move past the / */
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -433,7 +419,7 @@ void kill_procs(void) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
free(inputline);
|
free(inputline);
|
||||||
free(tmpline);
|
free(procname);
|
||||||
}
|
}
|
||||||
free(this_user);
|
free(this_user);
|
||||||
return;
|
return;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user