Fix bus errors caused by an inadequate amount of space during
opal_shmem_segment_create by testing whether or not the target mount has enough space to accommodate the shared-memory backing store. Fixes trac:2827. Will work with Shiqing to add Windows support (if required). This commit was SVN r27433. The following Trac tickets were found above: Ticket 2827 --> https://svn.open-mpi.org/trac/ompi/ticket/2827
Этот коммит содержится в:
родитель
9984a7143f
Коммит
0461826a4b
opal
@ -1,7 +1,7 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2010 Los Alamos National Security, LLC.
|
||||
# Copyright (c) 2010-2012 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
@ -42,3 +42,12 @@ the MCA parameter "orte_no_session_dir".
|
||||
|
||||
You can set the MCA parameter shmem_mmap_enable_nfs_warning to 0 to
|
||||
disable this message.
|
||||
#
|
||||
[target full]
|
||||
It appears as if there is not enough space for %s (the shared-memory backing
|
||||
file). It is likely that your MPI job will now either abort or experience
|
||||
performance degradation.
|
||||
|
||||
Local host: %s
|
||||
Space Requested: %ld B
|
||||
Space Available: %ld B
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
@ -88,6 +88,11 @@ segment_unlink(opal_shmem_ds_t *ds_buf);
|
||||
static int
|
||||
module_finalize(void);
|
||||
|
||||
static int
|
||||
enough_space(const char *filename,
|
||||
long space_req,
|
||||
bool *result);
|
||||
|
||||
/*
|
||||
* mmap shmem module
|
||||
*/
|
||||
@ -132,6 +137,57 @@ shmem_ds_reset(opal_shmem_ds_t *ds_buf)
|
||||
ds_buf->seg_base_addr = (unsigned char *)MAP_FAILED;
|
||||
}
|
||||
|
||||
/* ////////////////////////////////////////////////////////////////////////// */
|
||||
static int
|
||||
enough_space(const char *filename,
|
||||
long space_req,
|
||||
bool *result)
|
||||
{
|
||||
long avail = 0;
|
||||
long fluff = (long)(.05 * space_req);
|
||||
bool enough = false;
|
||||
char *last_sep = NULL;
|
||||
/* the target file name is passed here, but we need to check the parent
|
||||
* directory. store it so we can extract that info later. */
|
||||
char *target_dir = strdup(filename);
|
||||
int rc;
|
||||
|
||||
if (NULL == target_dir) {
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
goto out;
|
||||
}
|
||||
/* get the parent directory */
|
||||
last_sep = strrchr(target_dir, OPAL_PATH_SEP[0]);
|
||||
*last_sep = '\0';
|
||||
/* now check space availability */
|
||||
if (OPAL_SUCCESS != (rc = opal_path_df(target_dir, &avail))) {
|
||||
OPAL_OUTPUT_VERBOSE(
|
||||
(70, opal_shmem_base_output,
|
||||
"WARNING: opal_path_df failure!")
|
||||
);
|
||||
goto out;
|
||||
}
|
||||
/* do we have enough space? */
|
||||
if (avail >= space_req + fluff) {
|
||||
enough = true;
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE(
|
||||
(70, opal_shmem_base_output,
|
||||
"WARNING: not enough space on %s to meet request!"
|
||||
"available: %ld requested: %ld", target_dir,
|
||||
avail, space_req + fluff)
|
||||
);
|
||||
}
|
||||
|
||||
out:
|
||||
if (NULL != target_dir) {
|
||||
free(target_dir);
|
||||
}
|
||||
*result = enough;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* ////////////////////////////////////////////////////////////////////////// */
|
||||
static int
|
||||
module_init(void)
|
||||
@ -243,6 +299,7 @@ segment_create(opal_shmem_ds_t *ds_buf,
|
||||
int rc = OPAL_SUCCESS;
|
||||
char *real_file_name = NULL;
|
||||
pid_t my_pid = getpid();
|
||||
bool space_available = false;
|
||||
/* the real size of the shared memory segment. this includes enough space
|
||||
* to store our segment header.
|
||||
*/
|
||||
@ -311,6 +368,25 @@ segment_create(opal_shmem_ds_t *ds_buf,
|
||||
opal_show_help("help-opal-shmem-mmap.txt", "mmap on nfs", 1, hn,
|
||||
real_file_name);
|
||||
}
|
||||
/* let's make sure we have enough space for the backing file */
|
||||
if (OPAL_SUCCESS != (rc = enough_space(real_file_name,
|
||||
(long)real_size,
|
||||
&space_available))) {
|
||||
opal_output(0, "shmem: mmap: an error occurred while determining "
|
||||
"whether or not %s could be created.", real_file_name);
|
||||
/* rc is set */
|
||||
goto out;
|
||||
}
|
||||
if (!space_available) {
|
||||
char hn[MAXHOSTNAMELEN];
|
||||
gethostname(hn, MAXHOSTNAMELEN - 1);
|
||||
hn[MAXHOSTNAMELEN - 1] = '\0';
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
opal_show_help("help-opal-shmem-mmap.txt", "target full", 1,
|
||||
real_file_name, hn, (long)real_size, space_available);
|
||||
goto out;
|
||||
}
|
||||
/* enough space is available, so create the segment */
|
||||
if (-1 == (ds_buf->seg_id = open(real_file_name, O_CREAT | O_RDWR, 0600))) {
|
||||
int err = errno;
|
||||
char hn[MAXHOSTNAMELEN];
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -533,3 +535,55 @@ found:
|
||||
#endif /* __WINDOWS__ */
|
||||
}
|
||||
|
||||
int
|
||||
opal_path_df(const char *path,
|
||||
long *out_avail)
|
||||
{
|
||||
#if !defined(__WINDOWS__)
|
||||
int rc = -1;
|
||||
int trials = 5;
|
||||
int err = 0;
|
||||
#if defined(__SVR4) && defined(__sun)
|
||||
struct statvfs buf;
|
||||
#elif defined(__linux__) || defined (__BSD) || \
|
||||
(defined(__APPLE__) && defined(__MACH__))
|
||||
struct statfs buf;
|
||||
#endif
|
||||
|
||||
if (NULL == path || NULL == out_avail) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
do {
|
||||
#if defined(__SVR4) && defined(__sun)
|
||||
rc = statvfs(path, &buf);
|
||||
#elif defined(__linux__) || defined (__BSD) || \
|
||||
(defined(__APPLE__) && defined(__MACH__))
|
||||
rc = statfs(path, &buf);
|
||||
#endif
|
||||
err = errno;
|
||||
} while (-1 == rc && ESTALE == err && (--trials > 0));
|
||||
|
||||
if (-1 == rc) {
|
||||
OPAL_OUTPUT_VERBOSE((10, 2, "opal_path_df: stat(v)fs on "
|
||||
"path: %s failed with errno: %d (%s)\n",
|
||||
path, err, strerror(err)));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
/* now set the amount of free space available on path */
|
||||
/* sometimes buf.f_bavail is negative */
|
||||
*out_avail = buf.f_bsize * ((int)buf.f_bavail < 0 ? 0 : buf.f_bavail);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, 2, "opal_path_df: stat(v)fs states "
|
||||
"path: %s has %ld B of free space.",
|
||||
path, *out_avail));
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
|
||||
#else /* defined __WINDOWS__ */
|
||||
/* FIXME if need Windows support */
|
||||
*out_avail = 0;
|
||||
return OPAL_SUCCESS;
|
||||
#endif /* !defined(__WINDOWS__) */
|
||||
}
|
||||
|
@ -9,6 +9,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -23,6 +25,8 @@
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include "opal/constants.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
@ -136,6 +140,18 @@ OPAL_DECLSPEC char *opal_path_access(char *fname, char *path, int mode) __opal_a
|
||||
*/
|
||||
OPAL_DECLSPEC bool opal_path_nfs(char *fname) __opal_attribute_warn_unused_result__;
|
||||
|
||||
/**
|
||||
* @brief Returns the amount of free space available on the file system containing path.
|
||||
*
|
||||
* @param[in] path Path to check
|
||||
* @param[out] out_avail Amount of free space, in bytes, available on path (if successful)
|
||||
*
|
||||
* @retval OPAL_SUCCESS If the operation was successful
|
||||
* @retval OPAL_ERROR otherwise
|
||||
*/
|
||||
OPAL_DECLSPEC int
|
||||
opal_path_df(const char *path,
|
||||
long *out_avail)__opal_attribute_warn_unused_result__;
|
||||
|
||||
END_C_DECLS
|
||||
#endif /* OPAL_PATH_H */
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user