1
1
openmpi/orte/util/session_dir.c
Ralph Castain 7a79b25577 Ensure we cleanup some files so session dirs can be rolled up
cmr=v1.8.2:reviewer=jsquyres

This commit was SVN r31569.
2014-04-30 17:52:10 +00:00

759 строки
23 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif /* HAVE_SYS_PARAM_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#include <sys/stat.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif /* HAVE_PWD_H */
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/os_path.h"
#include "opal/util/os_dirpath.h"
#include "opal/util/basename.h"
#include "opal/util/opal_environ.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/session_dir.h"
/*******************************
* Local function Declarations
*******************************/
/* Make sure 'directory' exists, creating it when it is missing; returns an
 * ORTE status code. */
static int orte_create_dir(char *directory);
/* File filters handed to opal_os_dirpath_destroy().  Both are intended to
 * return false for non-empty files whose name starts with "output-" (those
 * are kept) and true for everything else. */
static bool orte_dir_check_file(const char *root, const char *path);
static bool orte_dir_check_file_output(const char *root, const char *path);
/* Build <top_dir>/<job family>[/<local jobid>]; the local jobid component is
 * left out when jobid is ORTE_JOBID_WILDCARD.  The job family is taken from
 * proc->jobid.  Returns a newly allocated string that the caller frees, or
 * NULL on failure. */
static char *orte_build_job_session_dir(char *top_dir,
orte_process_name_t *proc,
orte_jobid_t jobid);
/* Substitute the literal "(null)" for a NULL string so it can be handed to a
 * "%s" conversion safely.  The argument is parenthesized so that expression
 * arguments (e.g. a conditional) expand with the intended precedence; note
 * that it is still evaluated twice. */
#define OMPI_PRINTF_FIX_STRING(a) ((NULL == (a)) ? "(null)" : (a))
/****************************
* Functionality
****************************/
/*
 * Make sure the requested directory exists with full owner rights,
 * creating it when it is not there yet.
 *
 * Returns the status reported by opal_os_dirpath_access() unless that
 * status is ORTE_ERR_NOT_FOUND, in which case the status of
 * opal_os_dirpath_create() is returned instead.  Every status other than
 * ORTE_SUCCESS is logged before returning.
 */
static int orte_create_dir(char *directory)
{
    const mode_t wanted = S_IRWXU;   /* read/write/execute for the owner */
    int status;

    /* Probe first: an existing, accessible directory needs no work and an
     * inaccessible one is reported back to the caller as-is. */
    status = opal_os_dirpath_access(directory, wanted);

    if (ORTE_ERR_NOT_FOUND == status) {
        /* nothing there yet, so build the path */
        status = opal_os_dirpath_create(directory, wanted);
    }

    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }
    return status;
}
/*
 * Construct the full path to the session directory.
 *
 * The result has the form
 *     <prefix>/<frontend>[/<job-family>/<local-jobid>[/<vpid>]]
 * where <frontend> is orte_process_info.top_session_dir when that is set and
 * "openmpi-sessions-USERNAME@HOSTNAME_BATCHID" otherwise.
 *
 * fulldirpath     - if non-NULL, any string it currently points to is freed
 *                   and replaced by the newly allocated full path
 * return_prefix   - if it points to a non-NULL string, that string is used as
 *                   the prefix; otherwise the prefix comes from
 *                   orte_process_info.tmpdir_base or opal_tmp_directory() and
 *                   a newly allocated copy of it is stored here
 * return_frontend - if non-NULL, receives a newly allocated copy of <frontend>
 * hostid          - host name to use; when NULL, orte_process_info.nodename
 *                   is used instead
 * batchid         - batch id to use; "0" when NULL
 * proc            - may be NULL; a valid vpid selects the .../JOBID/VPID form,
 *                   otherwise a valid jobid selects the .../JOBID form, and
 *                   with neither only <frontend> is used
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM (no host name available),
 * ORTE_ERR_OUT_OF_RESOURCE or ORTE_ERROR (a string could not be built), or
 * ORTE_ERR_FATAL (the prefix starts with a prohibited location).  All
 * locally allocated strings are released on every exit path.
 */
int
orte_session_dir_get_name(char **fulldirpath,
                          char **return_prefix, /* This will come back as the valid tmp dir */
                          char **return_frontend,
                          char *hostid,
                          char *batchid,
                          orte_process_name_t *proc) {
    char *hostname = NULL,
         *batchname = NULL,
         *sessions = NULL,
         *user = NULL,
         *prefix = NULL,
         *frontend = NULL,
         *jobfam = NULL,
         *job = NULL,
         *vpidstr = NULL;
    bool prefix_provided = false;
    int exit_status = ORTE_SUCCESS;
    size_t len;
    int uid;

    /* Ensure that system info is set */
    orte_proc_info();

    /* get the name of the user */
    uid = getuid();
#if OPAL_ENABLE_GETPWUID
    {
        struct passwd *pwdent;
#ifdef HAVE_GETPWUID
        pwdent = getpwuid(uid);
        if (NULL == pwdent) {
            /* this indicates a problem with the passwd system,
             * so pretty-print a message just for info
             */
            orte_show_help("help-orte-runtime.txt",
                           "orte:session:dir:nopwname", true);
        }
#else
        pwdent = NULL;
#endif
        if (NULL != pwdent) {
            user = strdup(pwdent->pw_name);
        } else if (0 > asprintf(&user, "%d", uid)) {
            /* contents are indeterminate after a failed asprintf */
            user = NULL;
        }
    }
#else
    if (0 > asprintf(&user, "%d", uid)) {
        /* contents are indeterminate after a failed asprintf */
        user = NULL;
    }
#endif
    if (NULL == user) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        exit_status = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /*
     * set the 'hostname'
     */
    if (NULL != hostid) { /* User specified version */
        hostname = strdup(hostid);
    } else if (NULL != orte_process_info.nodename) { /* set elsewhere */
        hostname = strdup(orte_process_info.nodename);
    } else {
        /* Couldn't find it, so fail */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        exit_status = ORTE_ERR_BAD_PARAM;
        goto cleanup;
    }

    /*
     * set the 'batchid'
     */
    if (NULL != batchid) {
        batchname = strdup(batchid);
    } else {
        batchname = strdup("0");
    }

    /*
     * get the front part of the session directory
     * Will look something like:
     *    openmpi-sessions-USERNAME@HOSTNAME_BATCHID
     */
    if (NULL != orte_process_info.top_session_dir) {
        frontend = strdup(orte_process_info.top_session_dir);
    } else if (0 > asprintf(&frontend, "openmpi-sessions-%s@%s_%s",
                            user, hostname, batchname)) {
        frontend = NULL;
    }
    if (NULL == frontend) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        exit_status = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /*
     * Construct the session directory
     */
    if (NULL != proc &&
        (ORTE_VPID_INVALID != proc->vpid || ORTE_JOBID_INVALID != proc->jobid)) {
        /* Both the full and the partial form start with
         *    <frontend>/JOB-FAMILY/JOBID
         */
        if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
            jobfam = NULL;
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
            job = NULL;
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }

        if (ORTE_VPID_INVALID != proc->vpid) {
            /* valid vpid: <frontend>/JOB-FAMILY/JOBID/VPID */
            if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->vpid)) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            sessions = opal_os_path(false, frontend, jobfam, job, vpidstr, NULL);
        } else {
            /* only the jobid is valid: <frontend>/JOB-FAMILY/JOBID */
            sessions = opal_os_path(false, frontend, jobfam, job, NULL);
        }
        if (NULL == sessions) {
            ORTE_ERROR_LOG(ORTE_ERROR);
            exit_status = ORTE_ERROR;
            goto cleanup;
        }
    } else {
        /* no proc, or neither id is valid: just the frontend.
         * must dup this to avoid double-free later */
        sessions = strdup(frontend);
        if (NULL == sessions) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
    }

    /* drop whatever full path the caller passed in; it is rebuilt below */
    if (NULL != fulldirpath && NULL != *fulldirpath) {
        free(*fulldirpath);
        *fulldirpath = NULL;
    }

    /*
     * Pick the prefix: the caller's, then the stored value, then the
     * general temporary directory
     */
    if (NULL != return_prefix && NULL != *return_prefix) {
        prefix = strdup(*return_prefix);
        prefix_provided = true;
    } else if (NULL != orte_process_info.tmpdir_base) { /* stored value */
        prefix = strdup(orte_process_info.tmpdir_base);
    } else { /* General Environment var */
        prefix = strdup(opal_tmp_directory());
    }
    if (NULL == prefix) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        exit_status = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* strip a trailing path separator; the length test keeps an empty
     * prefix from being indexed at [-1] */
    len = strlen(prefix);
    if (0 < len && OPAL_PATH_SEP[0] == prefix[len-1]) {
        prefix[len-1] = '\0';
    }

    /* BEFORE doing anything else, check to see if this prefix is
     * allowed by the system
     */
    if (NULL != orte_prohibited_session_dirs) {
        char **list;
        int i, count;
        bool prohibited = false;

        /* break the string into tokens - it should be
         * separated by ','
         */
        list = opal_argv_split(orte_prohibited_session_dirs, ',');
        count = opal_argv_count(list);
        for (i = 0; i < count; i++) {
            /* check if prefix starts with this entry */
            if (0 == strncmp(prefix, list[i], strlen(list[i]))) {
                /* this is a prohibited location */
                orte_show_help("help-orte-runtime.txt",
                               "orte:session:dir:prohibited",
                               true, prefix, orte_prohibited_session_dirs);
                prohibited = true;
                break;
            }
        }
        opal_argv_free(list);  /* done with this */
        if (prohibited) {
            /* leave through cleanup so nothing allocated above is leaked */
            exit_status = ORTE_ERR_FATAL;
            goto cleanup;
        }
    }

    /*
     * Construct the absolute final path, if requested
     */
    if (NULL != fulldirpath) {
        *fulldirpath = opal_os_path(false, prefix, sessions, NULL);
    }

    /*
     * Return the frontend and prefix, if user requested we do so
     */
    if (NULL != return_frontend) {
        *return_frontend = strdup(frontend);
    }
    if (!prefix_provided && NULL != return_prefix) {
        *return_prefix = strdup(prefix);
    }

 cleanup:
    free(hostname);
    free(batchname);
    free(sessions);
    free(user);
    free(prefix);
    free(frontend);
    free(jobfam);
    free(job);
    free(vpidstr);

    return exit_status;
}
/*
 * Construct the session directory name, optionally create the directory,
 * and record the resulting paths in orte_process_info.
 *
 * create  - when true, the full session directory is created if missing
 * prefix  - optional base directory; NULL lets orte_session_dir_get_name()
 *           choose one
 * hostid  - optional host name override (passed through)
 * batchid - optional batch id override (passed through)
 * proc    - process whose directory is wanted; may be NULL, in which case
 *           only tmpdir_base and top_session_dir are updated
 *
 * On success orte_process_info.tmpdir_base and .top_session_dir are replaced;
 * .proc_session_dir is replaced when proc has a valid vpid and
 * .job_session_dir when proc has a valid jobid.
 *
 * Returns ORTE_SUCCESS or an ORTE error code; ORTE_ERR_FATAL from the name
 * lookup is mapped to ORTE_ERR_SILENT.
 */
int orte_session_dir(bool create,
                     char *prefix, char *hostid,
                     char *batchid, orte_process_name_t *proc)
{
    char *fulldirpath = NULL,
         *frontend = NULL,
         *parent = NULL;
    int rc = ORTE_SUCCESS;
    char *local_prefix = NULL;

    /* use the specified prefix, if one was given */
    if (NULL != prefix) {
        local_prefix = strdup(prefix);
    }

    /*
     * Get the session directory full name
     */
    if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&fulldirpath,
                                                        &local_prefix,
                                                        &frontend,
                                                        hostid,
                                                        batchid, proc))) {
        if (ORTE_ERR_FATAL == rc) {
            /* this indicates we should abort quietly */
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
        /* otherwise, bark a little first */
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /*
     * Now that we have the full path, go ahead and create it if necessary
     */
    if (create) {
        if (ORTE_SUCCESS != (rc = orte_create_dir(fulldirpath))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }

    /* update global structure fields */
    if (NULL != orte_process_info.tmpdir_base) {
        free(orte_process_info.tmpdir_base);
    }
    orte_process_info.tmpdir_base = strdup(local_prefix);

    if (NULL != orte_process_info.top_session_dir) {
        free(orte_process_info.top_session_dir);
        orte_process_info.top_session_dir = NULL;
    }
    if (NULL != frontend) {
        orte_process_info.top_session_dir = strdup(frontend);
    }

    /*
     * Set the process session directory.  proc is allowed to be NULL by
     * orte_session_dir_get_name(), so it must not be dereferenced blindly.
     */
    if (NULL != proc && ORTE_VPID_INVALID != proc->vpid) {
        if (NULL != orte_process_info.proc_session_dir) {
            free(orte_process_info.proc_session_dir);
        }
        orte_process_info.proc_session_dir = strdup(fulldirpath);

        /* Strip off last part of directory structure so that fulldirpath
         * names the job-level directory from here on */
        parent = opal_dirname(fulldirpath);
        free(fulldirpath);
        fulldirpath = parent;
        parent = NULL;
        if (NULL == fulldirpath) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }

    /*
     * Set the job session directory
     */
    if (NULL != proc && ORTE_JOBID_INVALID != proc->jobid) {
        if (NULL != orte_process_info.job_session_dir) {
            free(orte_process_info.job_session_dir);
        }
        orte_process_info.job_session_dir = strdup(fulldirpath);
    }

    if (orte_debug_flag) {
        opal_output(0, "procdir: %s",
                    OMPI_PRINTF_FIX_STRING(orte_process_info.proc_session_dir));
        opal_output(0, "jobdir: %s",
                    OMPI_PRINTF_FIX_STRING(orte_process_info.job_session_dir));
        opal_output(0, "top: %s",
                    OMPI_PRINTF_FIX_STRING(orte_process_info.top_session_dir));
        opal_output(0, "tmp: %s",
                    OMPI_PRINTF_FIX_STRING(orte_process_info.tmpdir_base));
    }

 cleanup:
    free(local_prefix);
    free(fulldirpath);
    free(frontend);

    return rc;
}
/*
 * A job has aborted - so force cleanup of the session directory.
 *
 * jobid - job whose session directory is removed; ORTE_JOBID_WILDCARD
 *         removes the session directories of every job in this process's
 *         job family.
 *
 * Does nothing when orte_create_session_dirs is false.  Files rejected by
 * the orte_dir_check_file* filters are left in place, and a directory is only
 * removed once it is empty, so the job and top-level directories are rolled
 * up only when nothing else is using them.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_NOT_INITIALIZED when the session directory
 * names have not been set up yet, or ORTE_ERR_OUT_OF_RESOURCE when a path
 * could not be built.
 */
int
orte_session_dir_cleanup(orte_jobid_t jobid)
{
    int rc = ORTE_SUCCESS;
    char *tmp = NULL;
    char *job_session_dir = NULL;

    if (!orte_create_session_dirs) {
        /* didn't create them */
        return ORTE_SUCCESS;
    }

    /* BOTH names are required.  opal_os_path() takes a NULL-terminated
     * argument list, so a NULL component would silently truncate the path
     * and the destroy calls below would then operate on the wrong
     * directory.
     */
    if (NULL == orte_process_info.tmpdir_base ||
        NULL == orte_process_info.top_session_dir) {
        /* this should never happen - it means we are calling
         * cleanup *before* properly setting up the session
         * dir system. This leaves open the possibility of
         * accidentally removing directories we shouldn't
         * touch
         */
        rc = ORTE_ERR_NOT_INITIALIZED;
        goto CLEANUP;
    }

    /* need to setup the top_session_dir with the prefix */
    tmp = opal_os_path(false,
                       orte_process_info.tmpdir_base,
                       orte_process_info.top_session_dir, NULL);
    if (NULL == tmp) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }

    /* we can only blow away session directories for our job family */
    job_session_dir = orte_build_job_session_dir(tmp, ORTE_PROC_MY_NAME, jobid);
    if (NULL == job_session_dir) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }

    if (ORTE_JOBID_WILDCARD != jobid) {
        opal_os_dirpath_destroy(job_session_dir, true, orte_dir_check_file);
    } else {
        /* if we want the session_dir removed for ALL jobids, then
         * just recursively blow the whole session away for our job family,
         * saving only output files
         */
        opal_os_dirpath_destroy(job_session_dir, true, orte_dir_check_file_output);
    }

    /* now attempt to eliminate the top level directory itself - this
     * will fail if anything is present, but ensures we cleanup if
     * we are the last one out
     */
    opal_os_dirpath_destroy(tmp, false, orte_dir_check_file);

    if (opal_os_dirpath_is_empty(job_session_dir)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_cleanup: found job session dir empty - deleting");
        }
        rmdir(job_session_dir);
    } else {
        if (orte_debug_flag) {
            if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(job_session_dir, 0)) {
                opal_output(0, "sess_dir_cleanup: job session dir does not exist");
            } else {
                opal_output(0, "sess_dir_cleanup: job session dir not empty - leaving");
            }
        }
        /* the parent cannot be empty either, so stop here */
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(tmp)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_cleanup: found top session dir empty - deleting");
        }
        rmdir(tmp);
    } else {
        if (orte_debug_flag) {
            if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(tmp, 0)) {
                opal_output(0, "sess_dir_cleanup: top session dir does not exist");
            } else {
                opal_output(0, "sess_dir_cleanup: top session dir not empty - leaving");
            }
        }
    }

 CLEANUP:
    free(tmp);
    free(job_session_dir);

    return rc;
}
/*
 * Remove the session directories of a terminating process.
 *
 * proc - the process whose proc-level directory is cleaned; its jobid
 *        selects the job-level directory.
 *
 * Does nothing when orte_create_session_dirs is false.  The proc, job and
 * top-level directories are cleaned non-recursively, then removed from the
 * innermost outwards; the walk stops at the first directory that is not
 * empty.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_NOT_INITIALIZED when the session directory
 * names have not been set up yet, ORTE_ERR_OUT_OF_RESOURCE when a path could
 * not be built, or the error from orte_util_convert_vpid_to_string().
 */
int
orte_session_dir_finalize(orte_process_name_t *proc)
{
    int rc;
    char *tmp = NULL;
    char *vpid = NULL;
    char *job_session_dir = NULL;
    char *proc_session_dir = NULL;

    if (!orte_create_session_dirs) {
        /* didn't create them */
        return ORTE_SUCCESS;
    }

    /* BOTH names are required.  opal_os_path() takes a NULL-terminated
     * argument list, so a NULL component would silently truncate the path
     * and the destroy calls below would then operate on the wrong
     * directory.
     */
    if (NULL == orte_process_info.tmpdir_base ||
        NULL == orte_process_info.top_session_dir) {
        /* this should never happen - it means we are calling
         * cleanup *before* properly setting up the session
         * dir system. This leaves open the possibility of
         * accidentally removing directories we shouldn't
         * touch
         */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_INITIALIZED);
        return ORTE_ERR_NOT_INITIALIZED;
    }

    /* need to setup the top_session_dir with the prefix */
    tmp = opal_os_path(false,
                       orte_process_info.tmpdir_base,
                       orte_process_info.top_session_dir, NULL);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* define the proc and job session directories for this process */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid, proc->vpid))) {
        ORTE_ERROR_LOG(rc);
        free(tmp);
        return rc;
    }
    job_session_dir = orte_build_job_session_dir(tmp, proc, proc->jobid);
    if (NULL == job_session_dir) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }
    proc_session_dir = opal_os_path(false, job_session_dir, vpid, NULL);
    if (NULL == proc_session_dir) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }

    /* everything below is best effort */
    rc = ORTE_SUCCESS;

    opal_os_dirpath_destroy(proc_session_dir,
                            false, orte_dir_check_file);
    opal_os_dirpath_destroy(job_session_dir,
                            false, orte_dir_check_file);
    opal_os_dirpath_destroy(tmp,
                            false, orte_dir_check_file);

    if (opal_os_dirpath_is_empty(proc_session_dir)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found proc session dir empty - deleting");
        }
        rmdir(proc_session_dir);
    } else {
        if (orte_debug_flag) {
            if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(proc_session_dir, 0)) {
                opal_output(0, "sess_dir_finalize: proc session dir does not exist");
            } else {
                opal_output(0, "sess_dir_finalize: proc session dir not empty - leaving");
            }
        }
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(job_session_dir)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
        }
        rmdir(job_session_dir);
    } else {
        if (orte_debug_flag) {
            if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(job_session_dir, 0)) {
                opal_output(0, "sess_dir_finalize: job session dir does not exist");
            } else {
                opal_output(0, "sess_dir_finalize: job session dir not empty - leaving");
            }
        }
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(tmp)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
        }
        rmdir(tmp);
    } else {
        if (orte_debug_flag) {
            if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(tmp, 0)) {
                opal_output(0, "sess_dir_finalize: top session dir does not exist");
            } else {
                opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
            }
        }
    }

 CLEANUP:
    free(tmp);
    free(vpid);
    free(job_session_dir);
    free(proc_session_dir);
    return rc;
}
/*
 * File filter passed to opal_os_dirpath_destroy().
 *
 * root - directory being cleaned
 * path - name of an entry inside root
 *
 * Returns false for entries that must be kept and true for everything else.
 *
 * Keep:
 *   - non-zero files starting with "output-"
 *   - "output-" files whose size cannot be determined (the path could not be
 *     built or stat() failed); keeping them avoids discarding data that
 *     could not be inspected
 */
static bool
orte_dir_check_file(const char *root, const char *path)
{
    struct stat st;
    char *fullpath;
    int stat_rc;

    if (0 != strncmp(path, "output-", strlen("output-"))) {
        return true;
    }

    /* opal_os_path() returns the assembled path; every variadic argument
     * is a path component */
    fullpath = opal_os_path(false, root, path, NULL);
    if (NULL == fullpath) {
        return false;
    }

    stat_rc = stat(fullpath, &st);
    free(fullpath);
    if (0 != stat_rc) {
        /* st is not filled in on failure */
        return false;
    }

    return (0 == st.st_size);
}
/*
 * File filter passed to opal_os_dirpath_destroy() when the session
 * directories of all jobs are being removed.
 *
 * root - directory being cleaned
 * path - name of an entry inside root
 *
 * Returns false for entries that must be kept and true for everything else.
 *
 * Keep:
 *   - non-zero files starting with "output-"
 *   - "output-" files whose size cannot be determined (the path could not be
 *     built or stat() failed); keeping them avoids discarding data that
 *     could not be inspected
 */
static bool
orte_dir_check_file_output(const char *root, const char *path)
{
    struct stat st;
    char *fullpath;
    int stat_rc;

    if (0 != strncmp(path, "output-", strlen("output-"))) {
        return true;
    }

    /* opal_os_path() returns the assembled path; every variadic argument
     * is a path component */
    fullpath = opal_os_path(false, root, path, NULL);
    if (NULL == fullpath) {
        return false;
    }

    stat_rc = stat(fullpath, &st);
    free(fullpath);
    if (0 != stat_rc) {
        /* st is not filled in on failure */
        return false;
    }

    return (0 == st.st_size);
}
/*
 * Assemble the job-level session directory path below top_dir.
 *
 * The job family always comes from proc->jobid.  When jobid is
 * ORTE_JOBID_WILDCARD the result is <top_dir>/<job family>; otherwise the
 * local part of jobid is appended as one more component.
 *
 * Returns a newly allocated path that the caller frees, or NULL (after
 * logging ORTE_ERR_OUT_OF_RESOURCE) when a string could not be built.
 */
static char *orte_build_job_session_dir(char *top_dir,
                                        orte_process_name_t *proc,
                                        orte_jobid_t jobid)
{
    char *family_str = NULL;
    char *local_str = NULL;
    char *result = NULL;

    if (asprintf(&family_str, "%d", ORTE_JOB_FAMILY(proc->jobid)) < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return NULL;
    }

    if (ORTE_JOBID_WILDCARD == jobid) {
        /* every job of the family: stop at the family level */
        result = opal_os_path(false, top_dir, family_str, NULL);
    } else if (asprintf(&local_str, "%d", ORTE_LOCAL_JOBID(jobid)) < 0) {
        /* could not format the local jobid */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(family_str);
        return NULL;
    } else {
        result = opal_os_path(false, top_dir, family_str, local_str, NULL);
        free(local_str);
    }

    if (NULL == result) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    }

    free(family_str);
    return result;
}