From 685dd6f47b2e34f29aa0c0d938e5232bf0e03c67 Mon Sep 17 00:00:00 2001 From: Pak Lui Date: Fri, 13 Jul 2007 20:49:30 +0000 Subject: [PATCH] Fixed the mpool sm size specification problem at large -np, caused by a variable overflowing. Added a verbose MCA param for showing the actual size of the mpool sm allocation. See trac #1083 for details. This commit was SVN r15419. --- ompi/mca/mpool/sm/mpool_sm.h | 2 + ompi/mca/mpool/sm/mpool_sm_component.c | 118 +++++++++++++++++++------ 2 files changed, 94 insertions(+), 26 deletions(-) diff --git a/ompi/mca/mpool/sm/mpool_sm.h b/ompi/mca/mpool/sm/mpool_sm.h index 5686f05f1c..936c08c15d 100644 --- a/ompi/mca/mpool/sm/mpool_sm.h +++ b/ompi/mca/mpool/sm/mpool_sm.h @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,6 +37,7 @@ struct mca_mpool_sm_component_t { /* mca_allocator_base_module_t* sm_allocator; */ char* sm_allocator_name; size_t sm_size; + int verbose; /* struct mca_mpool_sm_mmap_t *sm_mmap; */ }; typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t; diff --git a/ompi/mca/mpool/sm/mpool_sm_component.c b/ompi/mca/mpool/sm/mpool_sm_component.c index b0749aa3ee..2340af7d3a 100644 --- a/ompi/mca/mpool/sm/mpool_sm_component.c +++ b/ompi/mca/mpool/sm/mpool_sm_component.c @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -20,6 +21,10 @@ #if HAVE_UNISTD_H #include <unistd.h> #endif /* HAVE_UNISTD_H*/ +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif /* HAVE_STDLIB_H */ +#include <errno.h> #include "opal/util/output.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" @@ -68,14 +73,21 @@ mca_mpool_sm_component_t mca_mpool_sm_component = { } }; -static int max_size_param, min_size_param, peer_size_param; - +static char *max_size_param, *min_size_param, *peer_size_param; +static size_t default_max, default_min, default_peer; /** * component open/close/init function */ static int mca_mpool_sm_open(void) { + int value = 0; + char size_str[100]; + + default_max = 512*1024*1024; + default_min = 128*1024*1024; + default_peer = 32*1024*1024; + /* register SM component parameters */ mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, "allocator", @@ -84,26 +96,36 @@ static int mca_mpool_sm_open(void) "bucket", &mca_mpool_sm_component.sm_allocator_name); - max_size_param = - mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version, + /* register values as string instead of int, to allow max_size and sm_size + * to be set greater than 2GB for 32 bit, and even more for 64 bit */ + sprintf(size_str, "%lu", default_max); + mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, "max_size", "Maximum size of the sm mpool shared memory file", - false, false, 512 * 1024 * 1024, NULL); + false, false, size_str, &max_size_param); - min_size_param = - mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version, + sprintf(size_str, "%lu", default_min); + mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, "min_size", "Minimum size of the sm mpool shared memory file", - false, false, 128 * 1024 * 1024, NULL); + false, false, size_str, &min_size_param); - peer_size_param = - mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version, + sprintf(size_str, "%lu", default_peer); + 
mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version, "per_peer_size", "Size (in bytes) to allocate per local peer in " "the sm mpool shared memory file, bounded by " "min_size and max_size", - false, false, 32 * 1024 * 1024, NULL); - + false, false, size_str, &peer_size_param); + mca_base_param_reg_int(&mca_mpool_sm_component.super.mpool_version, + "verbose", + "Enable verbose output for mpool sm component", + false, false, 0, &value); + if (value != 0) { + mca_mpool_sm_component.verbose = opal_output_open(NULL); + } else { + mca_mpool_sm_component.verbose = -1; + } mca_mpool_sm_component.sm_size = 0; return OMPI_SUCCESS; @@ -126,16 +148,11 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( char *file_name; int len; mca_mpool_sm_module_t* mpool_module; - mca_allocator_base_component_t* allocator_component; - int max_size, min_size, peer_size; + mca_allocator_base_component_t* allocator_component; + size_t max_size, min_size, peer_size; ompi_proc_t **procs; size_t num_all_procs, i, num_local_procs = 0; - - /* determine size of shared memory file */ - mca_base_param_lookup_int(max_size_param, &max_size); - mca_base_param_lookup_int(min_size_param, &min_size); - mca_base_param_lookup_int(peer_size_param, &peer_size); - + /* README: this needs to change if procs in different jobs (even spawned ones) are to talk using shared memory */ procs = ompi_proc_world(&num_all_procs); @@ -145,14 +162,60 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( } } + /* parse the max, min and peer sizes, and validate them */ + /* absolutely necessary to reset errno each time */ + errno = 0; + max_size = strtoul(max_size_param, (char **)NULL, 10); + if (errno == ERANGE) { + opal_output(0, "mca_mpool_sm_init: max_size overflows! set to default (%lu)", default_max); + max_size = default_max; + } else if (errno == EINVAL) { + opal_output(0, "mca_mpool_sm_init: invalid max_size entered. 
set it to (%lu)", default_max); + max_size = default_max; + } + + errno = 0; + min_size = strtoul(min_size_param, (char **)NULL, 10); + if (errno == ERANGE) { + opal_output(0, "mca_mpool_sm_init: min_size overflows! set to default (%lu)", default_min); + min_size = default_min; + } else if (errno == EINVAL) { + opal_output(0, "mca_mpool_sm_init: invalid min_size entered. set it to (%lu)", default_min); + min_size = default_min; + } + + errno = 0; + peer_size = strtoul(peer_size_param, (char **)NULL, 10); + if (errno == ERANGE) { + opal_output(0, "mca_mpool_sm_init: peer_size overflows! set to default (%lu)", default_peer); + peer_size = default_peer; + } else if (errno == EINVAL) { + opal_output(0, "mca_mpool_sm_init: invalid peer_size entered. set it to (%lu)", default_peer); + peer_size = default_peer; + } + + /* more checks... */ if (min_size > max_size) { - opal_output(0, "mca_mpool_sm_init: adjusting max_size to be min_size (%d)", + opal_output(0, "mca_mpool_sm_init: adjusting max_size to be min_size (%lu)", min_size); max_size = min_size; } - /* set sm_size based on num_procs, then adjust from there */ - mca_mpool_sm_component.sm_size = peer_size * num_local_procs; + /* sm_size is a product of peer_size * num_local_procs. To prevent the + * sm_size from overflowing SIZE_MAX, we first calculate the quotient. + * If quotient is less than the peer_size, it means the product + * (peer_size * num_local_procs) is going to overflow SIZE_MAX, then we'll + * set sm_size to max_size. 
*/ + if ((double)SIZE_MAX / num_local_procs < peer_size) { + /* enable verbose would show if sm_size overflows */ + opal_output(mca_mpool_sm_component.verbose, + "mca_mpool_sm_init: sm_size overflows, set sm_size to max_size (%lu)", + SIZE_MAX); + mca_mpool_sm_component.sm_size = max_size; + } else { + mca_mpool_sm_component.sm_size = peer_size * num_local_procs; + } + if ((size_t) min_size > mca_mpool_sm_component.sm_size) { mca_mpool_sm_component.sm_size = min_size; } @@ -163,7 +226,7 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( allocator_component = mca_allocator_component_lookup( mca_mpool_sm_component.sm_allocator_name); - /* if specified allocator cannout be loaded - look for an alternative */ + /* if specified allocator cannot be loaded - look for an alternative */ if(NULL == allocator_component) { if(opal_list_get_size(&mca_allocator_base_components) == 0) { mca_base_component_list_item_t* item = (mca_base_component_list_item_t*) @@ -177,8 +240,7 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( return NULL; } } - - + mpool_module = (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t)); mca_mpool_sm_module_init(mpool_module); @@ -189,6 +251,10 @@ static mca_mpool_base_module_t* mca_mpool_sm_init( if ( 0 > len ) { return NULL; } + + opal_output(mca_mpool_sm_component.verbose, + "mca_mpool_sm_init: shared memory size used: (%lu)", + mca_mpool_sm_component.sm_size); if(NULL == (mca_common_sm_mmap =