/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"

/*
 * NOTE(review): the system header names were missing from the copy of
 * this file that was reviewed (bare "#include" directives).  The guarded
 * ones are restored from their HAVE_* macros; the unguarded ones are
 * inferred from what the file uses (stdio: popen/fgets, errno: strtol
 * range check, signal: kill/SIGKILL).  Confirm against the upstream file.
 */
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif  /* HAVE_STDLIB_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif  /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif  /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif  /* HAVE_SYS_WAIT_H */
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif  /* HAVE_SYS_PARAM_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif  /* HAVE_STRING_H */
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif  /* HAVE_DIRENT_H */
#include <signal.h>

#include "orte/orte_constants.h"

#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/os_dirpath.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"

#include "orte/util/univ_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/proc_info.h"
#include "opal/util/os_path.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"

#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rml/rml.h"

#include "opal/runtime/opal.h"
#if OPAL_ENABLE_FT == 1
#include "opal/runtime/opal_cr.h"
#endif
#include "orte/runtime/runtime.h"

/******************
 * Local Functions
 ******************/
static int orte_clean_init(void);
static int parse_args(int argc, char *argv[]);
#if !defined(__WINDOWS__)
static void kill_procs(void);
static bool parse_pid(const char *str, int *pid);
static bool read_ps_line(FILE *stream, char *line, size_t size);
static void kill_rogue(const char *kind, int pid, const char *user);

/* Buffer sizes used while parsing the output of the ps command. */
enum {
    PS_LINE_LEN  = 1024,  /* longest ps output line that is kept */
    PS_FIELD_LEN = 255    /* must match the %255s widths in kill_procs() */
};
#endif /* !defined(__WINDOWS__) */

/*****************************************
 * Global Vars for Command line Arguments
 *****************************************/
typedef struct {
    bool help;     /* -h / --help was given */
    bool verbose;  /* -v / --verbose was given */
} orte_clean_globals_t;

orte_clean_globals_t orte_clean_globals;

opal_cmd_line_init_t cmd_line_opts[] = {
    { NULL, NULL, NULL,
      'h', NULL, "help",
      0,
      &orte_clean_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },

    { NULL, NULL, NULL,
      'v', NULL, "verbose",
      0,
      &orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
      "Generate verbose output" },

    /* End of list */
    { NULL, NULL, NULL,
      '\0', NULL, NULL,
      0,
      NULL, OPAL_CMD_LINE_TYPE_NULL,
      NULL }
};

/*
 * This utility will do a brute force clean of a node.  It will
 * attempt to clean up any files in the user's session directory.
 * It will also look for any orted and orterun processes that are
 * not part of this job, and kill them off.
 *
 * Returns ORTE_SUCCESS, or the error code reported by argument
 * parsing or by initialization.
 */
int main(int argc, char *argv[])
{
    int ret, exit_status = ORTE_SUCCESS;

    /***************
     * Initialize
     ***************/
    if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
        return ret;
    }

    if (ORTE_SUCCESS != (ret = orte_clean_init())) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Clean out all /tmp directories except for our own.
     */
    orte_universe_clean_directories(orte_universe_info.name,
                                    orte_clean_globals.verbose);

#if !defined(__WINDOWS__)
    kill_procs();
#endif /* !defined(__WINDOWS__) */

    orte_finalize();
    opal_finalize();

 cleanup:
    return exit_status;
}

/*
 * Parse the command line arguments using the command line utility
 * functions.
 *
 * Fills in orte_clean_globals and pushes any MCA parameters given on
 * the command line into the environment.
 *
 * Returns ORTE_SUCCESS on success.  Returns ORTE_ERROR (after printing
 * the usage message) when parsing fails or when help was requested.
 */
static int parse_args(int argc, char *argv[])
{
    int i, ret, len;
    int exit_status = ORTE_SUCCESS;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    orte_clean_globals_t tmp = { false, false };

    /* Parse the command line options */

    /* NOTE: There is a bug in the PGI 6.2 series that causes the
       compiler to choke when copying structs containing bool members
       by value.  So do a memcpy here instead. */
    memcpy(&orte_clean_globals, &tmp, sizeof(tmp));

    /*
     * Initialize list of available command line options.
     */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);

    mca_base_open();
    ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);

    /**
     * Put all of the MCA arguments in the environment.
     * putenv() keeps a reference to the string it is given, so the
     * entries of app_env/global_env are deliberately not freed.
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);

    len = opal_argv_count(app_env);
    for (i = 0; i < len; ++i) {
        putenv(app_env[i]);
    }

    len = opal_argv_count(global_env);
    for (i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
                "1", true, NULL);

    /**
     * Now start parsing our specific arguments
     */
    if (OPAL_SUCCESS != ret || orte_clean_globals.help) {
        char *args = NULL;

        args = opal_cmd_line_get_usage_msg(&cmd_line);
        opal_show_help("help-orte-clean.txt", "usage", true, args);
        free(args);
        exit_status = ORTE_ERROR;
    }

    /* Release the parser on every path, including the usage/error one. */
    OBJ_DESTRUCT(&cmd_line);

    return exit_status;
}

/*
 * Prepare the environment and bring up OPAL and ORTE for this tool.
 *
 * Returns ORTE_SUCCESS, or the error code returned by opal_init() or
 * orte_system_init().
 */
static int orte_clean_init(void)
{
    int exit_status = ORTE_SUCCESS, ret;

    /*
     * We are trying to attach to another process' GPR so we need to
     * attach no matter if it is identified as private or not.
     */
    opal_setenv(mca_base_param_env_var("universe_console"),
                "1", true, NULL);

#if OPAL_ENABLE_FT == 1
    /* Disable the checkpoint notification routine for this
     * tool. As we will never need to checkpoint this tool.
     * Note: This must happen before opal_init().
     */
    opal_cr_set_enabled(false);

    /* Select the none component, since we don't actually use a checkpointer */
    opal_setenv(mca_base_param_env_var("crs"),
                "none",
                true, &environ);
#endif

    /***************************
     * We need all of OPAL
     ***************************/
    if (ORTE_SUCCESS != (ret = opal_init())) {
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_system_init(ORTE_INFRASTRUCTURE,
                                                ORTE_NON_BARRIER))) {
        exit_status = ret;
        goto cleanup;
    }

#if 0
    /***************************
     * And ORTE, but need to do a bit of a dance first
     ***************************/
    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params(true))) {
        exit_status = ret;
        goto cleanup;
    }

    /* Ensure the system_info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_sys_info())) {
        exit_status = ret;
        goto cleanup;
    }

    /* Ensure the process info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        exit_status = ret;
        goto cleanup;
    }
#endif

 cleanup:
    return exit_status;
}

#if !defined(__WINDOWS__)
/*
 * Convert a decimal string into a strictly positive pid.
 *
 * Returns true and stores the value in *pid only when the whole string
 * is a number that is greater than zero and fits in an int.  Rejecting
 * zero and negative values matters: kill() treats those as "process
 * group" targets rather than as a single process.
 */
static bool parse_pid(const char *str, int *pid)
{
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(str, &end, 10);
    if (end == str || '\0' != *end || ERANGE == errno) {
        return false;
    }
    if (value <= 0 || (long)(int)value != value) {
        return false;
    }

    *pid = (int)value;
    return true;
}

/*
 * Read one line of ps output into line (at most size - 1 characters).
 *
 * If the line is longer than the buffer, the rest of it is discarded so
 * that the next call starts at the beginning of the following line.
 * Returns false at end of file or on a read error.
 */
static bool read_ps_line(FILE *stream, char *line, size_t size)
{
    int ch;

    if (NULL == fgets(line, (int)size, stream)) {
        return false;
    }

    if (NULL == strchr(line, '\n')) {
        do {
            ch = fgetc(stream);
        } while (EOF != ch && '\n' != ch);
    }

    return true;
}

/*
 * Send SIGKILL to a process that looks like a leftover orted/orterun,
 * announcing it first when running verbosely.
 */
static void kill_rogue(const char *kind, int pid, const char *user)
{
    if (orte_clean_globals.verbose) {
        opal_output(0, "orte-clean: found potential rogue %s process"
                    " (pid=%d,user=%s), sending SIGKILL...\n",
                    kind, pid, user);
    }

    /*
     * We ignore the return code here as we do not really
     * care whether this worked or not.
     */
    (void)kill(pid, SIGKILL);
}

/*
 * This function makes a call to "ps" to find out the processes that
 * are running on this node.  It then attempts to kill off any orteds
 * and orteruns that are not related to this job.
 */
static void kill_procs(void)
{
    /*
     * The configure determines if there is a valid ps command for us to
     * use.  If it is set to unknown, then we skip this section.
     *
     * This is the command that is used to get the information about
     * all the processes that are running.  The output looks like the
     * following:
     * COMMAND    PID     USER
     * tcsh     12556    rolfv
     * ps       14424    rolfv
     * etc.
     * Currently, we only use the USER field in verbose messages.
     */
    char command[] = ORTE_CLEAN_PS_CMD;
    char line[PS_LINE_LEN];
    char procname[PS_FIELD_LEN + 1];
    char pidstr[PS_FIELD_LEN + 1];
    char user[PS_FIELD_LEN + 1];
    const size_t prefix_len = sizeof(ORTE_DEFAULT_UNIVERSE) - 1;
    const char *universe = orte_universe_info.name;
    bool kill_orteruns = false;
    int orunpid = 0;
    int ortedpid;
    int procpid;
    FILE *psfile;

    if (0 == strcmp("unknown", command)) {
        return;
    }

    /*
     * Try to get the pid of our orterun process from our universe name.
     * This works in the case where one is using the default universe name
     * which appends the pid after the 'default-universe-' string.  In
     * this way, we avoid killing our own mpirun process.  Note that if
     * we cannot determine our orterun pid, then we skip killing the
     * orterun processes to avoid odd behavior for the user.
     */
    if (NULL != universe &&
        0 == strncmp(ORTE_DEFAULT_UNIVERSE, universe, prefix_len) &&
        '\0' != universe[prefix_len]) {
        /*
         * Skip the prefix plus the one separator character (the dash)
         * in front of the pid.  The check above guarantees that the
         * separator exists, so this never points past the end of the
         * name.  Anything that is not a positive pid is treated as a
         * conversion failure and we skip killing the orteruns.
         */
        long value = strtol(universe + prefix_len + 1, (char **)NULL, 10);

        if (value > 0 && (long)(int)value == value) {
            orunpid = (int)value;
            kill_orteruns = true;
        }
    }

    /*
     * Get our parent pid which is the pid of the orted.
     */
    ortedpid = (int)getppid();

    /*
     * There is a race condition here.  The problem is that we are looking
     * for any processes named orted.  However, one may erroneously find more
     * orteds than there really are because the orted is doing a series of
     * fork/execs. If we run with more than one orte-clean on a node, then
     * one of the orte-cleans may catch the other one while it has forked,
     * but not exec'ed.  It will therefore kill an orte-clean.  Now one
     * can argue it is silly to run more than one orte-clean on a node, and
     * this is true.  We will have to figure out how to prevent this.  For
     * now, we use a big hammer and just sleep a second to decrease the
     * probability.
     */
    sleep(1);

    psfile = popen(command, "r");
    if (NULL == psfile) {
        opal_output(0, "orte-clean: unable to run \"%s\" to list processes\n",
                    command);
        return;
    }

    /*
     * Read the first line of the output.  We just throw it away
     * as it is the header consisting of the words COMMAND, PID and
     * USER.
     */
    if (!read_ps_line(psfile, line, sizeof(line))) {
        (void)pclose(psfile);
        return;
    }

    while (read_ps_line(psfile, line, sizeof(line))) {
        /* Field widths keep each token within its buffer. */
        if (3 != sscanf(line, "%255s%255s%255s", procname, pidstr, user)) {
            continue;
        }

        /* Never hand kill() a pid that did not parse as a real process id. */
        if (!parse_pid(pidstr, &procpid)) {
            continue;
        }

        if (0 == strcmp("orted", procname)) {
            /*
             * Look for any orteds that are not our parent and attempt to
             * kill them.  We currently do not worry whether we are the
             * owner or not.  If we are not, we will just fail to send
             * the signal and that is OK.  This also allows a root process
             * to kill all orteds.
             */
            if (procpid != ortedpid) {
                kill_rogue("orted", procpid, user);
            }
        } else if (kill_orteruns && 0 == strcmp("orterun", procname)) {
            /*
             * Now check for any orteruns other than our own.
             */
            if (procpid != orunpid) {
                kill_rogue("orterun", procpid, user);
            }
        }
    }

    /* Reap the ps child; its exit status is of no interest here. */
    (void)pclose(psfile);
}
#endif /* !defined(__WINDOWS__) */