Fixed the mpool sm size specification problem at large -np, which was caused by a variable overflowing.
Added a verbose MCA param for showing the actual size of the mpool sm allocation. See trac #1083 for details. This commit was SVN r15419.
Этот коммит содержится в:
родитель
7a9a8c7e17
Коммит
685dd6f47b
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -36,6 +37,7 @@ struct mca_mpool_sm_component_t {
|
|||||||
/* mca_allocator_base_module_t* sm_allocator; */
|
/* mca_allocator_base_module_t* sm_allocator; */
|
||||||
char* sm_allocator_name;
|
char* sm_allocator_name;
|
||||||
size_t sm_size;
|
size_t sm_size;
|
||||||
|
int verbose;
|
||||||
/* struct mca_mpool_sm_mmap_t *sm_mmap; */
|
/* struct mca_mpool_sm_mmap_t *sm_mmap; */
|
||||||
};
|
};
|
||||||
typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t;
|
typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t;
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -20,6 +21,10 @@
|
|||||||
#if HAVE_UNISTD_H
|
#if HAVE_UNISTD_H
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif /* HAVE_UNISTD_H*/
|
#endif /* HAVE_UNISTD_H*/
|
||||||
|
#ifdef HAVE_STDLIB_H
|
||||||
|
#include <stdlib.h>
|
||||||
|
#endif /* HAVE_STDLIB_H */
|
||||||
|
#include <errno.h>
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
@ -68,14 +73,21 @@ mca_mpool_sm_component_t mca_mpool_sm_component = {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static int max_size_param, min_size_param, peer_size_param;
|
static char *max_size_param, *min_size_param, *peer_size_param;
|
||||||
|
static size_t default_max, default_min, default_peer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* component open/close/init function
|
* component open/close/init function
|
||||||
*/
|
*/
|
||||||
static int mca_mpool_sm_open(void)
|
static int mca_mpool_sm_open(void)
|
||||||
{
|
{
|
||||||
|
int value = 0;
|
||||||
|
char size_str[100];
|
||||||
|
|
||||||
|
default_max = 512*1024*1024;
|
||||||
|
default_min = 128*1024*1024;
|
||||||
|
default_peer = 32*1024*1024;
|
||||||
|
|
||||||
/* register SM component parameters */
|
/* register SM component parameters */
|
||||||
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
|
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
|
||||||
"allocator",
|
"allocator",
|
||||||
@ -84,26 +96,36 @@ static int mca_mpool_sm_open(void)
|
|||||||
"bucket",
|
"bucket",
|
||||||
&mca_mpool_sm_component.sm_allocator_name);
|
&mca_mpool_sm_component.sm_allocator_name);
|
||||||
|
|
||||||
max_size_param =
|
/* register values as string instead of int, to allow max_size and sm_size
|
||||||
mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version,
|
* to be set greater than 2GB for 32 bit, and even more for 64 bit */
|
||||||
|
sprintf(size_str, "%lu", default_max);
|
||||||
|
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
|
||||||
"max_size",
|
"max_size",
|
||||||
"Maximum size of the sm mpool shared memory file",
|
"Maximum size of the sm mpool shared memory file",
|
||||||
false, false, 512 * 1024 * 1024, NULL);
|
false, false, size_str, &max_size_param);
|
||||||
|
|
||||||
min_size_param =
|
sprintf(size_str, "%lu", default_min);
|
||||||
mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version,
|
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
|
||||||
"min_size",
|
"min_size",
|
||||||
"Minimum size of the sm mpool shared memory file",
|
"Minimum size of the sm mpool shared memory file",
|
||||||
false, false, 128 * 1024 * 1024, NULL);
|
false, false, size_str, &min_size_param);
|
||||||
|
|
||||||
peer_size_param =
|
sprintf(size_str, "%lu", default_peer);
|
||||||
mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version,
|
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
|
||||||
"per_peer_size",
|
"per_peer_size",
|
||||||
"Size (in bytes) to allocate per local peer in "
|
"Size (in bytes) to allocate per local peer in "
|
||||||
"the sm mpool shared memory file, bounded by "
|
"the sm mpool shared memory file, bounded by "
|
||||||
"min_size and max_size",
|
"min_size and max_size",
|
||||||
false, false, 32 * 1024 * 1024, NULL);
|
false, false, size_str, &peer_size_param);
|
||||||
|
mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version,
|
||||||
|
"verbose",
|
||||||
|
"Enable verbose output for mpool sm component",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
mca_mpool_sm_component.verbose = opal_output_open(NULL);
|
||||||
|
} else {
|
||||||
|
mca_mpool_sm_component.verbose = -1;
|
||||||
|
}
|
||||||
mca_mpool_sm_component.sm_size = 0;
|
mca_mpool_sm_component.sm_size = 0;
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -126,16 +148,11 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
|
|||||||
char *file_name;
|
char *file_name;
|
||||||
int len;
|
int len;
|
||||||
mca_mpool_sm_module_t* mpool_module;
|
mca_mpool_sm_module_t* mpool_module;
|
||||||
mca_allocator_base_component_t* allocator_component;
|
mca_allocator_base_component_t* allocator_component;
|
||||||
int max_size, min_size, peer_size;
|
size_t max_size, min_size, peer_size;
|
||||||
ompi_proc_t **procs;
|
ompi_proc_t **procs;
|
||||||
size_t num_all_procs, i, num_local_procs = 0;
|
size_t num_all_procs, i, num_local_procs = 0;
|
||||||
|
|
||||||
/* determine size of shared memory file */
|
|
||||||
mca_base_param_lookup_int(max_size_param, &max_size);
|
|
||||||
mca_base_param_lookup_int(min_size_param, &min_size);
|
|
||||||
mca_base_param_lookup_int(peer_size_param, &peer_size);
|
|
||||||
|
|
||||||
/* README: this needs to change if procs in different jobs (even
|
/* README: this needs to change if procs in different jobs (even
|
||||||
spawned ones) are to talk using shared memory */
|
spawned ones) are to talk using shared memory */
|
||||||
procs = ompi_proc_world(&num_all_procs);
|
procs = ompi_proc_world(&num_all_procs);
|
||||||
@ -145,14 +162,60 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* parse the max, min and peer sizes, and validate them */
|
||||||
|
/* absolutely necessary to reset errno each time */
|
||||||
|
errno = 0;
|
||||||
|
max_size = strtoul(max_size_param, (char **)NULL, 10);
|
||||||
|
if (errno == ERANGE) {
|
||||||
|
opal_output(0, "mca_mpool_sm_init: max_size overflows! set to default (%lu)", default_max);
|
||||||
|
max_size = default_max;
|
||||||
|
} else if (errno == EINVAL) {
|
||||||
|
opal_output(0, "mca_mpool_sm_init: invalid max_size entered. set it to (%lu)", default_max);
|
||||||
|
max_size = default_max;
|
||||||
|
}
|
||||||
|
|
||||||
|
errno = 0;
|
||||||
|
min_size = strtoul(min_size_param, (char **)NULL, 10);
|
||||||
|
if (errno == ERANGE) {
|
||||||
|
opal_output(0, "mca_mpool_sm_init: min_size overflows! set to default (%lu)", default_min);
|
||||||
|
min_size = default_min;
|
||||||
|
} else if (errno == EINVAL) {
|
||||||
|
opal_output(0, "mca_mpool_sm_init: invalid min_size entered. set it to (%lu)", default_min);
|
||||||
|
min_size = default_min;
|
||||||
|
}
|
||||||
|
|
||||||
|
errno = 0;
|
||||||
|
peer_size = strtoul(peer_size_param, (char **)NULL, 10);
|
||||||
|
if (errno == ERANGE) {
|
||||||
|
opal_output(0, "mca_mpool_sm_init: peer_size overflows! set to default (%lu)", default_peer);
|
||||||
|
peer_size = default_peer;
|
||||||
|
} else if (errno == EINVAL) {
|
||||||
|
opal_output(0, "mca_mpool_sm_init: invalid peer_size entered. set it to (%lu)", default_peer);
|
||||||
|
peer_size = default_peer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* more checks... */
|
||||||
if (min_size > max_size) {
|
if (min_size > max_size) {
|
||||||
opal_output(0, "mca_mpool_sm_init: adjusting max_size to be min_size (%d)",
|
opal_output(0, "mca_mpool_sm_init: adjusting max_size to be min_size (%lu)",
|
||||||
min_size);
|
min_size);
|
||||||
max_size = min_size;
|
max_size = min_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* set sm_size based on num_procs, then adjust from there */
|
/* sm_size is a product of peer_size * num_local_procs. To prevent the
|
||||||
mca_mpool_sm_component.sm_size = peer_size * num_local_procs;
|
* sm_size from overflowing SIZE_MAX, we first calculate the quotient.
|
||||||
|
* If quotient is less than the peer_size, it means the product
|
||||||
|
* (peer_size * num_local_procs) is going to overflow SIZE_MAX, then we'll
|
||||||
|
* set sm_size to max_size. */
|
||||||
|
if ((double)SIZE_MAX / num_local_procs < peer_size) {
|
||||||
|
/* enable verbose would show if sm_size overflows */
|
||||||
|
opal_output(mca_mpool_sm_component.verbose,
|
||||||
|
"mca_mpool_sm_init: sm_size overflows, set sm_size to max_size (%lu)",
|
||||||
|
SIZE_MAX);
|
||||||
|
mca_mpool_sm_component.sm_size = max_size;
|
||||||
|
} else {
|
||||||
|
mca_mpool_sm_component.sm_size = peer_size * num_local_procs;
|
||||||
|
}
|
||||||
|
|
||||||
if ((size_t) min_size > mca_mpool_sm_component.sm_size) {
|
if ((size_t) min_size > mca_mpool_sm_component.sm_size) {
|
||||||
mca_mpool_sm_component.sm_size = min_size;
|
mca_mpool_sm_component.sm_size = min_size;
|
||||||
}
|
}
|
||||||
@ -163,7 +226,7 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
|
|||||||
allocator_component = mca_allocator_component_lookup(
|
allocator_component = mca_allocator_component_lookup(
|
||||||
mca_mpool_sm_component.sm_allocator_name);
|
mca_mpool_sm_component.sm_allocator_name);
|
||||||
|
|
||||||
/* if specified allocator cannout be loaded - look for an alternative */
|
/* if specified allocator cannot be loaded - look for an alternative */
|
||||||
if(NULL == allocator_component) {
|
if(NULL == allocator_component) {
|
||||||
if(opal_list_get_size(&mca_allocator_base_components) == 0) {
|
if(opal_list_get_size(&mca_allocator_base_components) == 0) {
|
||||||
mca_base_component_list_item_t* item = (mca_base_component_list_item_t*)
|
mca_base_component_list_item_t* item = (mca_base_component_list_item_t*)
|
||||||
@ -177,8 +240,7 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
mpool_module = (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t));
|
mpool_module = (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t));
|
||||||
mca_mpool_sm_module_init(mpool_module);
|
mca_mpool_sm_module_init(mpool_module);
|
||||||
|
|
||||||
@ -189,6 +251,10 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
|
|||||||
if ( 0 > len ) {
|
if ( 0 > len ) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opal_output(mca_mpool_sm_component.verbose,
|
||||||
|
"mca_mpool_sm_init: shared memory size used: (%lu)",
|
||||||
|
mca_mpool_sm_component.sm_size);
|
||||||
|
|
||||||
if(NULL ==
|
if(NULL ==
|
||||||
(mca_common_sm_mmap =
|
(mca_common_sm_mmap =
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user