1
1

Fix bus errors caused by an inadequate amount of space during

opal_shmem_segment_create by testing whether or not the target mount has enough
space to accommodate the shared-memory backing store. Fixes trac:2827. Will work
with Shiqing to add Windows support (if required).

This commit was SVN r27433.

The following Trac tickets were found above:
  Ticket 2827 --> https://svn.open-mpi.org/trac/ompi/ticket/2827
Этот коммит содержится в:
Samuel Gutierrez 2012-10-09 20:48:04 +00:00
родитель 9984a7143f
Коммит 0461826a4b
4 изменённых файлов: 158 добавлений и 3 удалений

Просмотреть файл

@ -1,7 +1,7 @@
# -*- text -*-
#
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2010 Los Alamos National Security, LLC.
# Copyright (c) 2010-2012 Los Alamos National Security, LLC.
# All rights reserved.
#
# $COPYRIGHT$
@ -42,3 +42,12 @@ the MCA parameter "orte_no_session_dir".
You can set the MCA parameter shmem_mmap_enable_nfs_warning to 0 to
disable this message.
#
[target full]
It appears as if there is not enough space for %s (the shared-memory backing
file). It is likely that your MPI job will now either abort or experience
performance degradation.
Local host: %s
Space Requested: %ld B
Space Available: %ld B

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* $COPYRIGHT$
@ -88,6 +88,11 @@ segment_unlink(opal_shmem_ds_t *ds_buf);
static int
module_finalize(void);
/* Forward declaration. Reports through result whether the directory that will
 * hold filename has at least space_req bytes, plus a 5% margin, available. */
static int
enough_space(const char *filename,
long space_req,
bool *result);
/*
* mmap shmem module
*/
@ -132,6 +137,57 @@ shmem_ds_reset(opal_shmem_ds_t *ds_buf)
ds_buf->seg_base_addr = (unsigned char *)MAP_FAILED;
}
/* ////////////////////////////////////////////////////////////////////////// */
/**
 * Determines whether the directory that will hold filename has room for
 * space_req bytes plus a 5% safety margin.
 *
 * @param[in]  filename   path of the backing file about to be created (must
 *                        not be NULL); the space check is done on its parent
 *                        directory
 * @param[in]  space_req  number of bytes required
 * @param[out] result     set to true only when enough space is available;
 *                        set to false otherwise, including on every error path
 *
 * @retval OPAL_SUCCESS              the check completed and *result is valid
 * @retval OPAL_ERR_OUT_OF_RESOURCE  filename could not be duplicated
 * @retval other                     the error code returned by opal_path_df
 */
static int
enough_space(const char *filename,
             long space_req,
             bool *result)
{
    long avail = 0;
    long fluff = (long)(.05 * space_req);
    long needed = space_req + fluff;
    bool enough = false;
    char *last_sep = NULL;
    /* the directory that is actually handed to opal_path_df */
    const char *dir_to_check = NULL;
    /* the target file name is passed here, but we need to check the parent
     * directory. work on a private copy so it can be truncated in place. */
    char *target_dir = strdup(filename);
    int rc = OPAL_SUCCESS;

    if (NULL == target_dir) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto out;
    }
    /* get the parent directory */
    last_sep = strrchr(target_dir, OPAL_PATH_SEP[0]);
    if (NULL == last_sep) {
        /* no directory component at all: the file will be created relative to
         * the current working directory, so that is what gets checked. */
        dir_to_check = ".";
    }
    else {
        if (last_sep == target_dir) {
            /* the only separator is the leading one: keep it so that the
             * directory is the root itself rather than an empty string. */
            last_sep[1] = '\0';
        }
        else {
            *last_sep = '\0';
        }
        dir_to_check = target_dir;
    }
    /* now check space availability */
    if (OPAL_SUCCESS != (rc = opal_path_df(dir_to_check, &avail))) {
        OPAL_OUTPUT_VERBOSE(
            (70, opal_shmem_base_output,
             "WARNING: opal_path_df failure!")
        );
        goto out;
    }
    /* do we have enough space? */
    if (avail >= needed) {
        enough = true;
    }
    else {
        OPAL_OUTPUT_VERBOSE(
            (70, opal_shmem_base_output,
             "WARNING: not enough space on %s to meet request! "
             "available: %ld requested: %ld", dir_to_check,
             avail, needed)
        );
    }

out:
    /* free(NULL) is a no-op, so no guard is needed */
    free(target_dir);
    *result = enough;
    return rc;
}
/* ////////////////////////////////////////////////////////////////////////// */
static int
module_init(void)
@ -243,6 +299,7 @@ segment_create(opal_shmem_ds_t *ds_buf,
int rc = OPAL_SUCCESS;
char *real_file_name = NULL;
pid_t my_pid = getpid();
bool space_available = false;
/* the real size of the shared memory segment. this includes enough space
* to store our segment header.
*/
@ -311,6 +368,25 @@ segment_create(opal_shmem_ds_t *ds_buf,
opal_show_help("help-opal-shmem-mmap.txt", "mmap on nfs", 1, hn,
real_file_name);
}
/* let's make sure we have enough space for the backing file */
if (OPAL_SUCCESS != (rc = enough_space(real_file_name,
(long)real_size,
&space_available))) {
opal_output(0, "shmem: mmap: an error occurred while determining "
"whether or not %s could be created.", real_file_name);
/* rc is set */
goto out;
}
if (!space_available) {
char hn[MAXHOSTNAMELEN];
gethostname(hn, MAXHOSTNAMELEN - 1);
hn[MAXHOSTNAMELEN - 1] = '\0';
rc = OPAL_ERR_OUT_OF_RESOURCE;
opal_show_help("help-opal-shmem-mmap.txt", "target full", 1,
real_file_name, hn, (long)real_size, space_available);
goto out;
}
/* enough space is available, so create the segment */
if (-1 == (ds_buf->seg_id = open(real_file_name, O_CREAT | O_RDWR, 0600))) {
int err = errno;
char hn[MAXHOSTNAMELEN];

Просмотреть файл

@ -11,6 +11,8 @@
* All rights reserved.
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -533,3 +535,55 @@ found:
#endif /* __WINDOWS__ */
}
/**
 * Reports how many bytes are available to the caller on the file system that
 * contains path.
 *
 * @param[in]  path       path to query
 * @param[out] out_avail  receives the number of free bytes; the value is
 *                        clamped to the largest value a long can hold
 *
 * @retval OPAL_SUCCESS  *out_avail was set
 * @retval OPAL_ERROR    an argument was NULL or the stat(v)fs call failed
 *
 * On Windows the query is not implemented: *out_avail is set to 0 and
 * OPAL_SUCCESS is returned.
 */
int
opal_path_df(const char *path,
             long *out_avail)
{
#if !defined(__WINDOWS__)
    /* largest value representable in the long that out_avail points to */
    const unsigned long long max_avail = (unsigned long long)(~0UL >> 1);
    unsigned long long block_size = 0;
    unsigned long long free_blocks = 0;
    int rc = -1;
    int trials = 5;
    int err = 0;
#if defined(__SVR4) && defined(__sun)
    struct statvfs buf;
#elif defined(__linux__) || defined (__BSD) || \
      (defined(__APPLE__) && defined(__MACH__))
    struct statfs buf;
#endif

    if (NULL == path || NULL == out_avail) {
        return OPAL_ERROR;
    }

    /* retry a bounded number of times when the call fails with ESTALE */
    do {
#if defined(__SVR4) && defined(__sun)
        rc = statvfs(path, &buf);
#elif defined(__linux__) || defined (__BSD) || \
      (defined(__APPLE__) && defined(__MACH__))
        rc = statfs(path, &buf);
#endif
        /* save errno right away, before anything else can overwrite it */
        err = errno;
    } while (-1 == rc && ESTALE == err && (--trials > 0));

    if (-1 == rc) {
        OPAL_OUTPUT_VERBOSE((10, 2, "opal_path_df: stat(v)fs on "
                             "path: %s failed with errno: %d (%s)\n",
                             path, err, strerror(err)));
        return OPAL_ERROR;
    }

    /* now set the amount of free space available on path */
#if defined(__SVR4) && defined(__sun)
    /* statvfs counts f_bavail in units of f_frsize, not f_bsize */
    block_size = (unsigned long long)buf.f_frsize;
#elif defined(__linux__) || defined (__BSD) || \
      (defined(__APPLE__) && defined(__MACH__))
    block_size = (unsigned long long)buf.f_bsize;
#endif
    /* sometimes buf.f_bavail is negative; treat that as no free blocks. the
     * comparison is done at full width so that a large block count is not
     * truncated into a bogus negative value. */
    if ((long long)buf.f_bavail > 0) {
        free_blocks = (unsigned long long)buf.f_bavail;
    }
    /* clamp instead of overflowing when the byte count does not fit a long */
    if (0 != block_size && free_blocks > max_avail / block_size) {
        *out_avail = (long)max_avail;
    }
    else {
        *out_avail = (long)(block_size * free_blocks);
    }

    OPAL_OUTPUT_VERBOSE((10, 2, "opal_path_df: stat(v)fs states "
                         "path: %s has %ld B of free space.",
                         path, *out_avail));

    return OPAL_SUCCESS;

#else /* defined __WINDOWS__ */
    /* FIXME if need Windows support */
    if (NULL == out_avail) {
        return OPAL_ERROR;
    }
    *out_avail = 0;
    return OPAL_SUCCESS;
#endif /* !defined(__WINDOWS__) */
}

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -23,6 +25,8 @@
#include "opal_config.h"
#include "opal/constants.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
@ -136,6 +140,18 @@ OPAL_DECLSPEC char *opal_path_access(char *fname, char *path, int mode) __opal_a
*/
OPAL_DECLSPEC bool opal_path_nfs(char *fname) __opal_attribute_warn_unused_result__;
/**
 * @brief Reports the amount of free space available on path.
 *
 * @param[in]  path       Path to check
 * @param[out] out_avail  Amount of free space, in bytes, available on path
 *                        (if successful)
 *
 * @retval OPAL_SUCCESS If the operation was successful
 * @retval OPAL_ERROR otherwise
 */
OPAL_DECLSPEC int
opal_path_df(const char *path,
long *out_avail)__opal_attribute_warn_unused_result__;
END_C_DECLS
#endif /* OPAL_PATH_H */