From be3fc7bf202546a64c299f3055b231edc0985e9e Mon Sep 17 00:00:00 2001 From: Mike Dubman Date: Tue, 25 Mar 2014 15:27:13 +0000 Subject: [PATCH] OSHMEM: better error messages when failing Provide users with right fail reason. fixes trac:4433 This commit was SVN r31202. The following Trac tickets were found above: Ticket 4433 --> https://svn.open-mpi.org/trac/ompi/ticket/4433 --- oshmem/mca/sshmem/base/Makefile.am | 2 ++ oshmem/mca/sshmem/base/help-oshmem-sshmem.txt | 25 ++++++++++++++++ .../sshmem/mmap/help-oshmem-sshmem-mmap.txt | 23 +++++++-------- oshmem/mca/sshmem/mmap/sshmem_mmap_module.c | 17 +++++------ .../sshmem/sysv/help-oshmem-sshmem-sysv.txt | 17 +++-------- oshmem/mca/sshmem/sysv/sshmem_sysv_module.c | 29 ++++++++++--------- oshmem/runtime/oshmem_shmem_init.c | 10 +++---- 7 files changed, 70 insertions(+), 53 deletions(-) create mode 100644 oshmem/mca/sshmem/base/help-oshmem-sshmem.txt diff --git a/oshmem/mca/sshmem/base/Makefile.am b/oshmem/mca/sshmem/base/Makefile.am index 79b11b1269..3b0dc81ea2 100644 --- a/oshmem/mca/sshmem/base/Makefile.am +++ b/oshmem/mca/sshmem/base/Makefile.am @@ -7,6 +7,8 @@ # $HEADER$ # +dist_ompidata_DATA = base/help-oshmem-sshmem.txt + headers += \ base/base.h diff --git a/oshmem/mca/sshmem/base/help-oshmem-sshmem.txt b/oshmem/mca/sshmem/base/help-oshmem-sshmem.txt new file mode 100644 index 0000000000..72ced52811 --- /dev/null +++ b/oshmem/mca/sshmem/base/help-oshmem-sshmem.txt @@ -0,0 +1,25 @@ +# -*- text -*- +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open SHMEM MCA error messages. +# +[create segment failure] +The OpenSHMEM "(%s)" plugin in the "sshmem" framework failed to +allocate a shared memory segment via the %s system call. 
This +usually means that there are not enough resources available to the memory subsystem on your server. + +Your OpenSHMEM job will now abort. + + Server: %s + Requested shared + memory segment size: %llu + Specific error: %s (%d) + diff --git a/oshmem/mca/sshmem/mmap/help-oshmem-sshmem-mmap.txt b/oshmem/mca/sshmem/mmap/help-oshmem-sshmem-mmap.txt index 9940243517..9ae3bae259 100644 --- a/oshmem/mca/sshmem/mmap/help-oshmem-sshmem-mmap.txt +++ b/oshmem/mca/sshmem/mmap/help-oshmem-sshmem-mmap.txt @@ -10,19 +10,16 @@ # # -[mmap segment failed] -The OpenSHMEM "mmap" plugin in the "sshmem" framework failed to -allocate a shared memory segement via the mmap system call. This -usually means that there are not enough resources available to your -memory subsystem on your server. +[mmap:create segment failure] -Your OpenSHMEM job will now abort. +You can try the following: - Server: %s - Requested mmap - segment size: %u - Specific error: %s (%d) +1. Decrease the symmetric heap area with + "-x SHMEM_SYMMETRIC_HEAP_SIZE=". +2. Set "--mca sshmem_base_start_address 0" for + automatic selection by OS of virtual start address for sshmem. -You can try to decrease the symmetric heap area with: - -"-x SHMEM_SYMMETRIC_HEAP_SIZE=". +This issue could also be related to the CONFIG_STRICT_DEVMEM +kernel option, which, if enabled, prevents access to physical +memory via "mmap". In this case you could try using other +sshmem components instead. 
diff --git a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c index 2d61aae2b8..faea0c0a6b 100644 --- a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c +++ b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c @@ -194,16 +194,17 @@ segment_create(map_segment_t *ds_buf, 0); if (MAP_FAILED == addr) { - OPAL_OUTPUT_VERBOSE( - (5, oshmem_sshmem_base_framework.framework_output, - "Failed to mmap() %llu bytes (errno=%d)", - (unsigned long long)size, errno) - ); - opal_show_help("help-oshmem-sshmem-mmap.txt", - "mmap segment failed", + /* The flag follows the topic; the remaining values fill the "create segment failure" template in order: plugin, system call, server, size, error text, errno. */ + opal_show_help("help-oshmem-sshmem.txt", + "create segment failure", true, - orte_process_info.nodename, (unsigned) size, - strerror(errno),errno); + "mmap", + "mmap", + orte_process_info.nodename, (unsigned long long) size, + strerror(errno), errno); + opal_show_help("help-oshmem-sshmem-mmap.txt", + "mmap:create segment failure", + true); return OSHMEM_ERR_OUT_OF_RESOURCE; } diff --git a/oshmem/mca/sshmem/sysv/help-oshmem-sshmem-sysv.txt b/oshmem/mca/sshmem/sysv/help-oshmem-sshmem-sysv.txt index ecd467dada..8452810600 100644 --- a/oshmem/mca/sshmem/sysv/help-oshmem-sshmem-sysv.txt +++ b/oshmem/mca/sshmem/sysv/help-oshmem-sshmem-sysv.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2013 Mellanox Technologies, Inc. +# Copyright (c) 2014 Mellanox Technologies, Inc. # All rights reserved. # Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. # $COPYRIGHT$ @@ -11,18 +11,7 @@ # # This is the US/English help file for Open SHMEM MCA error messages. # -[create segment failure] -The OpenSHMEM "sysv" plugin in the "sshmem" framework failed to -allocate a shared memory segment via the shmat(2) system call. This -usually means that there are not enough resources available to the -SYSV shared memory subsystem on your server. - -Your OpenSHMEM job will now abort. 
- - Server: %s - Requested shared - memory segment size: %u - Specific error: %s (%d) +[sysv:create segment failure] You can try the following: @@ -31,3 +20,5 @@ You can try the following: SHMEM_SYMMETRIC_HEAP_SIZE=". 3. Increase your system's allowable SYSV shared memory segment size (e.g., via the SHMMAX and/or SMMAX kernel parameters). +4. Set "--mca sshmem_base_start_address 0" for + automatic selection by OS of virtual start address for sshmem. diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c index a805984b72..674ade450e 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c @@ -194,28 +194,35 @@ segment_create(map_segment_t *ds_buf, /* Create a new shared memory segment and save the shmid. */ shmid = shmget(IPC_PRIVATE, size, flags); if (shmid == MAP_SEGMENT_SHM_INVALID) { - OPAL_OUTPUT_VERBOSE( - (5, oshmem_sshmem_base_framework.framework_output, - "Failed to shmget() %llu bytes (errno=%d)", - (unsigned long long)size, errno)); - - opal_show_help("help-oshmem-sshmem-sysv.txt", + /* Values fill the "create segment failure" template in order: plugin, system call, server, size, error text, errno. */ + opal_show_help("help-oshmem-sshmem.txt", "create segment failure", true, - orte_process_info.nodename, (unsigned) size, + "sysv", + "shmget", + orte_process_info.nodename, (unsigned long long) size, strerror(errno), errno); + opal_show_help("help-oshmem-sshmem-sysv.txt", + "sysv:create segment failure", + true); return OSHMEM_ERROR; } /* Attach to the sement */ addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0); if (addr == (void *) -1L) { - OPAL_OUTPUT_VERBOSE( - (5, oshmem_sshmem_base_framework.framework_output, - "Failed to shmat() %llu bytes (errno=%d)", - (unsigned long long)size, errno) - ); - shmctl(shmid, IPC_RMID, NULL ); + /* Same template as above; report shmat as the failing call. */ + opal_show_help("help-oshmem-sshmem.txt", + "create segment failure", + true, + "sysv", + "shmat", + orte_process_info.nodename, (unsigned long long) size, + strerror(errno), errno); + opal_show_help("help-oshmem-sshmem-sysv.txt", + "sysv:create segment failure", 
+ true); + shmctl(shmid, IPC_RMID, NULL); return OSHMEM_ERR_OUT_OF_RESOURCE; } diff --git a/oshmem/runtime/oshmem_shmem_init.c b/oshmem/runtime/oshmem_shmem_init.c index e00dd7a1aa..7be7978bfd 100644 --- a/oshmem/runtime/oshmem_shmem_init.c +++ b/oshmem/runtime/oshmem_shmem_init.c @@ -228,6 +228,11 @@ int oshmem_shmem_init(int argc, char **argv, int requested, int *provided) if (OSHMEM_SUCCESS == ret) { oshmem_shmem_initialized = true; + if (OSHMEM_SUCCESS != shmem_lock_init()) { + SHMEM_API_ERROR( "shmem_lock_init() failed"); + return OSHMEM_ERROR; + } + /* this is a collective op, implies barrier */ MCA_MEMHEAP_CALL(get_all_mkeys()); @@ -437,11 +442,6 @@ static int _shmem_init(int argc, char **argv, int requested, int *provided) goto error; } - if (OSHMEM_SUCCESS != shmem_lock_init()) { - error = "shmem_lock_init() failed"; - goto error; - } - error: if (ret != OSHMEM_SUCCESS) { const char *err_msg = opal_strerror(ret); orte_show_help("help-shmem-runtime.txt",