1
1

Ensure cleanup of registered files/dirs

Resolve a race condition between registering a file for removal upon termination and the actual creation of that file. The caller now provides attributes that identify whether the path is a file or a directory, which removes the need for PMIx to detect the difference itself.

Refs #4686

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2018-01-11 11:05:30 -08:00
родитель 614696f03c
Коммит 6216225bda
8 изменённых файлов: 48 добавлений и 45 удалений

Просмотреть файл

@ -15,7 +15,7 @@
* Copyright (c) 2010-2017 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -510,9 +510,8 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
free (btls);
return NULL;
}
if (NULL != opal_pmix.register_cleanup) {
opal_pmix.register_cleanup (sm_file, false, false);
opal_pmix.register_cleanup (sm_file, false, false, false);
}
rc = opal_shmem_segment_create (&component->seg_ds, sm_file, component->segment_size);

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@ -868,7 +868,7 @@ typedef int (*opal_pmix_base_process_monitor_fn_t)(opal_list_t *monitor,
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
/* register cleanup */
typedef int (*opal_pmix_base_register_cleanup_fn_t)(char *path, bool ignore, bool jobscope);
typedef int (*opal_pmix_base_register_cleanup_fn_t)(char *path, bool directory, bool ignore, bool jobscope);
/*
* the standard public API data structure

Просмотреть файл

@ -462,7 +462,9 @@ typedef uint32_t pmix_rank_t;
#define PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned
#define PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted
#define PMIX_JOB_CTRL_TERMINATE "pmix.jctrl.term" // (bool) politely terminate the specified procs
#define PMIX_REGISTER_CLEANUP "pmix.reg.cleanup" // (char*) comma-delimited list of files/directories to
#define PMIX_REGISTER_CLEANUP "pmix.reg.cleanup" // (char*) comma-delimited list of files to
// be removed upon process termination
#define PMIX_REGISTER_CLEANUP_DIR "pmix.reg.cleanupdir" // (char*) comma-delimited list of directories to
// be removed upon process termination
#define PMIX_CLEANUP_RECURSIVE "pmix.clnup.recurse" // (bool) recursively cleanup all subdirectories under the
// specified one(s)

Просмотреть файл

@ -2106,30 +2106,36 @@ pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
rc = PMIX_ERR_BAD_PARAM;
goto exit;
}
if (0 != stat(cd->info[n].value.data.string, &statbuf)) {
cf = PMIX_NEW(pmix_cleanup_file_t);
if (NULL == cf) {
/* return an error */
rc = PMIX_ERR_NOMEM;
goto exit;
}
cf->path = strdup(cd->info[n].value.data.string);
pmix_list_append(&cachefiles, &cf->super);
} else if (0 == strncmp(cd->info[n].key, PMIX_REGISTER_CLEANUP_DIR, PMIX_MAX_KEYLEN)) {
++cnt;
/* see if we allow epilog requests */
if (NULL == epi) {
/* return an error */
rc = PMIX_ERR_BAD_PARAM;
goto exit;
}
if (S_ISDIR(statbuf.st_mode)) {
cdir = PMIX_NEW(pmix_cleanup_dir_t);
if (NULL == cdir) {
/* return an error */
rc = PMIX_ERR_NOMEM;
goto exit;
}
cdir->path = strdup(cd->info[n].value.data.string);
pmix_list_append(&cachedirs, &cdir->super);
} else {
cf = PMIX_NEW(pmix_cleanup_file_t);
if (NULL == cf) {
/* return an error */
rc = PMIX_ERR_NOMEM;
goto exit;
}
cf->path = strdup(cd->info[n].value.data.string);
pmix_list_append(&cachefiles, &cf->super);
if (PMIX_STRING != cd->info[n].value.type ||
NULL == cd->info[n].value.data.string) {
/* return an error */
rc = PMIX_ERR_BAD_PARAM;
goto exit;
}
cdir = PMIX_NEW(pmix_cleanup_dir_t);
if (NULL == cdir) {
/* return an error */
rc = PMIX_ERR_NOMEM;
goto exit;
}
cdir->path = strdup(cd->info[n].value.data.string);
pmix_list_append(&cachedirs, &cdir->super);
} else if (0 == strncmp(cd->info[n].key, PMIX_CLEANUP_RECURSIVE, PMIX_MAX_KEYLEN)) {
/* see if we allow epilog requests */
if (NULL == epi) {

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Mellanox Technologies, Inc.
@ -74,7 +74,7 @@ static void pmix3x_query(opal_list_t *queries,
static void pmix3x_log(opal_list_t *info,
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
static int pmix3x_register_cleanup(char *path, bool ignore, bool jobscope);
static int pmix3x_register_cleanup(char *path, bool directory, bool ignore, bool jobscope);
const opal_pmix_base_module_t opal_pmix_pmix3x_module = {
/* client APIs */
@ -360,14 +360,13 @@ static void cleanup_cbfunc(pmix_status_t status,
OPAL_PMIX_WAKEUP_THREAD(lk);
}
static int pmix3x_register_cleanup(char *path, bool ignore, bool jobscope)
static int pmix3x_register_cleanup(char *path, bool directory, bool ignore, bool jobscope)
{
opal_pmix_lock_t lk;
pmix_info_t pinfo[3];
size_t n, ninfo=0;
pmix_status_t rc;
int ret;
struct stat statbuf;
OPAL_PMIX_CONSTRUCT_LOCK(&lk);
@ -376,18 +375,16 @@ static int pmix3x_register_cleanup(char *path, bool ignore, bool jobscope)
PMIX_INFO_LOAD(&pinfo[ninfo], PMIX_CLEANUP_IGNORE, path, PMIX_STRING);
++ninfo;
} else {
/* order cleanup of the provided path */
PMIX_INFO_LOAD(&pinfo[ninfo], PMIX_REGISTER_CLEANUP, path, PMIX_STRING);
++ninfo;
/* if the path is a directory, then we need to tell the server
* to recursively clean up */
if (stat(path, &statbuf) != 0) {
return OPAL_ERR_NOT_FOUND;
}
if (S_ISDIR(statbuf.st_mode)) {
if (directory) {
PMIX_INFO_LOAD(&pinfo[ninfo], PMIX_REGISTER_CLEANUP_DIR, path, PMIX_STRING);
++ninfo;
/* recursively cleanup directories */
PMIX_INFO_LOAD(&pinfo[ninfo], PMIX_CLEANUP_RECURSIVE, NULL, PMIX_BOOL);
++ninfo;
} else {
/* order cleanup of the provided path */
PMIX_INFO_LOAD(&pinfo[ninfo], PMIX_REGISTER_CLEANUP, path, PMIX_STRING);
++ninfo;
}
}

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
@ -201,7 +201,6 @@ int pmix3x_server_finalize(void)
}
}
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
rc = PMIx_server_finalize();
return pmix3x_convert_rc(rc);
}

Просмотреть файл

@ -16,7 +16,7 @@
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -802,7 +802,7 @@ static int open_file(int i)
/* register it to be ignored */
if (NULL != opal_pmix.register_cleanup) {
opal_pmix.register_cleanup(filename, true, false);
opal_pmix.register_cleanup(filename, false, true, false);
}
free(filename); /* release the filename in all cases */
}

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@ -149,13 +149,13 @@ int orte_ess_base_app_setup(bool db_restrict_local)
/* register the directory for cleanup */
if (NULL != opal_pmix.register_cleanup) {
if (orte_standalone_operation) {
if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.top_session_dir, false, true))) {
if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.top_session_dir, true, false, true))) {
ORTE_ERROR_LOG(ret);
error = "register cleanup";
goto error;
}
} else {
if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.jobfam_session_dir, false, false))) {
if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.jobfam_session_dir, true, false, false))) {
ORTE_ERROR_LOG(ret);
error = "register cleanup";
goto error;