2005-05-23 22:06:50 +00:00
|
|
|
/*
|
2011-07-10 23:32:23 +00:00
|
|
|
* Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
|
2005-11-05 19:57:48 +00:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2009-12-15 23:34:09 +00:00
|
|
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
2005-11-05 19:57:48 +00:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2008-03-09 13:17:13 +00:00
|
|
|
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
|
2005-05-23 22:06:50 +00:00
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2007-10-22 12:07:22 +00:00
|
|
|
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
2009-12-15 23:34:09 +00:00
|
|
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
2010-06-09 16:58:52 +00:00
|
|
|
* Copyright (c) 2010 Los Alamos National Security, LLC.
|
|
|
|
* All rights reserved.
|
2005-05-23 22:06:50 +00:00
|
|
|
* $COPYRIGHT$
|
2008-03-09 13:17:13 +00:00
|
|
|
*
|
2005-05-23 22:06:50 +00:00
|
|
|
* Additional copyrights may follow
|
2008-03-09 13:17:13 +00:00
|
|
|
*
|
2005-05-23 22:06:50 +00:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "ompi_config.h"
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
2011-10-11 20:32:10 +00:00
|
|
|
#ifdef HAVE_FCNTL_H
|
2005-05-23 22:06:50 +00:00
|
|
|
#include <fcntl.h>
|
2011-10-11 20:32:10 +00:00
|
|
|
#endif /* HAVE_FCNTL_H */
|
2005-05-23 22:06:50 +00:00
|
|
|
#include <errno.h>
|
2011-10-11 20:32:10 +00:00
|
|
|
#ifdef HAVE_SYS_MMAN_H
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#endif /* HAVE_SYS_MMAN_H */
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "opal/sys/atomic.h"
|
2009-03-03 22:25:13 +00:00
|
|
|
#include "opal/class/opal_bitmap.h"
|
2009-02-14 02:26:12 +00:00
|
|
|
#include "opal/util/output.h"
|
2009-03-03 22:25:13 +00:00
|
|
|
#include "opal/util/printf.h"
|
2008-06-15 13:43:28 +00:00
|
|
|
#include "opal/mca/carto/carto.h"
|
|
|
|
#include "opal/mca/carto/base/base.h"
|
|
|
|
#include "opal/mca/paffinity/base/base.h"
|
|
|
|
#include "opal/mca/maffinity/base/base.h"
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/util/proc_info.h"
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar)
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, committed)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 04:56:31 +00:00
|
|
|
#include "opal/datatype/opal_convertor.h"
|
2005-09-12 20:22:59 +00:00
|
|
|
#include "ompi/class/ompi_free_list.h"
|
|
|
|
#include "ompi/mca/btl/btl.h"
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "ompi/mca/mpool/base/base.h"
|
2008-06-15 13:43:28 +00:00
|
|
|
#include "ompi/mca/mpool/sm/mpool_sm.h"
|
2008-10-16 15:09:00 +00:00
|
|
|
|
2010-03-12 23:57:50 +00:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2008-10-16 15:09:00 +00:00
|
|
|
#include "opal/mca/crs/base/base.h"
|
2009-09-17 14:43:06 +00:00
|
|
|
#include "opal/util/basename.h"
|
A number of C/R enhancements per RFC below:
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php
Documentation:
http://osl.iu.edu/research/ft/
Major Changes:
--------------
* Added C/R-enabled Debugging support.
Enabled with the --enable-crdebug flag. See the following website for more information:
http://osl.iu.edu/research/ft/crdebug/
* Added Stable Storage (SStore) framework for checkpoint storage
* 'central' component does a direct to central storage save
* 'stage' component stages checkpoints to central storage while the application continues execution.
* 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress)
* 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching)
* Added Compression (compress) framework to support
* Add two new ErrMgr recovery policies
* {{{crmig}}} C/R Process Migration
* {{{autor}}} C/R Automatic Recovery
* Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component
* Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option)
* {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342)
* {{{OMPI_CR_Restart}}}
* {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules)
* {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192)
* {{{OMPI_CR_Quiesce_start}}}
* {{{OMPI_CR_Quiesce_checkpoint}}}
* {{{OMPI_CR_Quiesce_end}}}
* {{{OMPI_CR_self_register_checkpoint_callback}}}
* {{{OMPI_CR_self_register_restart_callback}}}
* {{{OMPI_CR_self_register_continue_callback}}}
* The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future.
* Add a progress meter to:
* FileM rsh (filem_rsh_process_meter)
* SnapC full (snapc_full_progress_meter)
* SStore stage (sstore_stage_progress_meter)
* Added 2 new command line options to ompi-restart
* --showme : Display the full command line that would have been exec'ed.
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413)
* Deprecated some MCA params:
* crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir
* snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir
* snapc_base_global_shared deprecated, use sstore_stage_global_is_shared
* snapc_base_store_in_place deprecated, replaced with different components of SStore
* snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref
* snapc_base_establish_global_snapshot_dir deprecated, never well supported
* snapc_full_skip_filem deprecated, use sstore_stage_skip_filem
Minor Changes:
--------------
* Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing.
* Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components
* Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it.
* Fixes trac:2208 : Honor various TMPDIR variables instead of forcing {{{/tmp}}}
* Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set.
* opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality.
* Cleanup the CRS framework and components to work with the SStore framework.
* Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably).
* Add 'quiesce' hook to CRCP for a future enhancement.
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}.
* Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive.
* {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked.
* {{{opal-restart}}} also support local decompression before restarting
* {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata
* {{{orte-restart}}} now uses the SStore framework to work with the metadata
* Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality.
* Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}.
* Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped.
* Make sure to decrement the number of 'num_local_procs' in the orted when one goes away.
* odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options.
* Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities.
* Improve the checks for 'already checkpointing' error path.
* Add a recovery output timer, to show how long it takes to restart a job
* Do a better job of cleaning up the old session directory on restart.
* Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment)
* Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize.
This commit was SVN r23587.
The following Trac tickets were found above:
Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924
Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097
Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161
Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192
Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208
Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342
Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-10 20:51:11 +00:00
|
|
|
#include "orte/mca/sstore/sstore.h"
|
2008-10-16 15:09:00 +00:00
|
|
|
#include "ompi/runtime/ompi_cr.h"
|
|
|
|
#endif
|
|
|
|
|
2005-06-30 05:50:55 +00:00
|
|
|
#include "btl_sm.h"
|
|
|
|
#include "btl_sm_endpoint.h"
|
|
|
|
#include "btl_sm_frag.h"
|
2005-07-28 16:25:09 +00:00
|
|
|
#include "btl_sm_fifo.h"
|
2005-09-12 20:22:59 +00:00
|
|
|
#include "ompi/proc/proc.h"
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2007-03-20 08:15:58 +00:00
|
|
|
mca_btl_sm_t mca_btl_sm = {
|
2005-05-23 22:06:50 +00:00
|
|
|
{
|
2005-06-30 05:50:55 +00:00
|
|
|
&mca_btl_sm_component.super,
|
|
|
|
0, /* btl_eager_limit */
|
2007-12-16 08:35:17 +00:00
|
|
|
0, /* btl_rndv_eager_limit */
|
2005-06-30 05:50:55 +00:00
|
|
|
0, /* btl_max_send_size */
|
2007-06-21 07:12:40 +00:00
|
|
|
0, /* btl_rdma_pipeline_send_length */
|
2007-05-17 07:54:27 +00:00
|
|
|
0, /* btl_rdma_pipeline_frag_size */
|
|
|
|
0, /* btl_min_rdma_pipeline_size */
|
2005-06-30 05:50:55 +00:00
|
|
|
0, /* btl_exclusivity */
|
|
|
|
0, /* btl_latency */
|
|
|
|
0, /* btl_bandwidth */
|
|
|
|
0, /* btl flags */
|
2007-03-20 08:15:58 +00:00
|
|
|
mca_btl_sm_add_procs,
|
2005-06-30 05:50:55 +00:00
|
|
|
mca_btl_sm_del_procs,
|
2008-01-15 05:32:53 +00:00
|
|
|
NULL,
|
2005-06-30 05:50:55 +00:00
|
|
|
mca_btl_sm_finalize,
|
|
|
|
mca_btl_sm_alloc,
|
|
|
|
mca_btl_sm_free,
|
|
|
|
mca_btl_sm_prepare_src,
|
2009-12-15 23:34:09 +00:00
|
|
|
#if OMPI_BTL_SM_HAVE_KNEM
|
|
|
|
mca_btl_sm_prepare_dst,
|
|
|
|
#else
|
2005-06-01 14:34:22 +00:00
|
|
|
NULL,
|
2009-12-15 23:34:09 +00:00
|
|
|
#endif /* OMPI_BTL_SM_HAVE_KNEM */
|
2008-03-09 13:17:13 +00:00
|
|
|
mca_btl_sm_send,
|
2009-02-26 18:10:50 +00:00
|
|
|
mca_btl_sm_sendi,
|
2005-05-23 22:06:50 +00:00
|
|
|
NULL, /* put */
|
2009-12-15 23:34:09 +00:00
|
|
|
NULL, /* get -- optionally filled during initialization */
|
2006-08-17 22:02:01 +00:00
|
|
|
mca_btl_base_dump,
|
|
|
|
NULL, /* mpool */
|
2007-03-16 23:11:45 +00:00
|
|
|
mca_btl_sm_register_error_cb, /* register error */
|
|
|
|
mca_btl_sm_ft_event
|
2005-05-23 22:06:50 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2007-03-20 08:15:58 +00:00
|
|
|
/*
|
|
|
|
* calculate offset of an address from the beginning of a shared memory segment
|
|
|
|
*/
|
|
|
|
/* Yields the signed byte distance (a pointer difference) from BASE to ADDR. */
#define ADDR2OFFSET(ADDR, BASE) ((const char *)(ADDR) - (const char *)(BASE))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* calculate an absolute address in a local address space given an offset and
|
|
|
|
* a base address of a shared memory segment
|
|
|
|
*/
|
|
|
|
/* Yields a char * that lies OFFSET bytes past BASE. */
#define OFFSET2ADDR(OFFSET, BASE) ((char *)(BASE) + (ptrdiff_t)(OFFSET))
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
|
|
|
|
static void *mpool_calloc(size_t nmemb, size_t size)
|
|
|
|
{
|
|
|
|
void *buf;
|
|
|
|
size_t bsize = nmemb * size;
|
2008-06-15 13:43:28 +00:00
|
|
|
mca_mpool_base_module_t *mpool = mca_btl_sm_component.sm_mpool;
|
|
|
|
|
2010-07-06 14:33:36 +00:00
|
|
|
buf = mpool->mpool_alloc(mpool, bsize, opal_cache_line_size, 0, NULL);
|
2008-03-11 14:39:10 +00:00
|
|
|
|
|
|
|
if (NULL == buf)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
memset(buf, 0, bsize);
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
2008-06-15 13:43:28 +00:00
|
|
|
static void init_maffinity(int *my_mem_node, int *max_mem_node)
|
|
|
|
{
|
2009-08-13 13:08:39 +00:00
|
|
|
opal_carto_graph_t *topo;
|
2008-06-15 13:43:28 +00:00
|
|
|
opal_value_array_t dists;
|
2008-08-23 03:13:29 +00:00
|
|
|
int i, num_core, socket;
|
2008-06-15 13:43:28 +00:00
|
|
|
opal_paffinity_base_cpu_set_t cpus;
|
|
|
|
char *myslot = NULL;
|
|
|
|
opal_carto_node_distance_t *dist;
|
|
|
|
opal_carto_base_node_t *slot_node;
|
|
|
|
|
|
|
|
*my_mem_node = 0;
|
|
|
|
*max_mem_node = 1;
|
|
|
|
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
if (OMPI_SUCCESS != opal_carto_base_get_host_graph(&topo, "Memory")) {
|
2008-06-15 13:43:28 +00:00
|
|
|
return;
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
}
|
2008-06-15 13:43:28 +00:00
|
|
|
|
|
|
|
OBJ_CONSTRUCT(&dists, opal_value_array_t);
|
|
|
|
opal_value_array_init(&dists, sizeof(opal_carto_node_distance_t));
|
|
|
|
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
if (OMPI_SUCCESS != opal_paffinity_base_get_processor_info(&num_core)) {
|
2008-08-21 19:21:28 +00:00
|
|
|
num_core = 100; /* set something large */
|
|
|
|
}
|
2008-06-15 13:43:28 +00:00
|
|
|
|
|
|
|
OPAL_PAFFINITY_CPU_ZERO(cpus);
|
|
|
|
opal_paffinity_base_get(&cpus);
|
|
|
|
|
|
|
|
/* find core we are running on */
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
for (i = 0; i < num_core; i++) {
|
|
|
|
if (OPAL_PAFFINITY_CPU_ISSET(i, cpus)) {
|
2008-06-15 13:43:28 +00:00
|
|
|
break;
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
}
|
|
|
|
}
|
2008-06-15 13:43:28 +00:00
|
|
|
|
2008-08-23 03:13:29 +00:00
|
|
|
if (OMPI_SUCCESS != opal_paffinity_base_get_map_to_socket_core(i, &socket, &i)) {
|
|
|
|
/* no topology info available */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-06-15 13:43:28 +00:00
|
|
|
asprintf(&myslot, "slot%d", socket);
|
|
|
|
|
|
|
|
slot_node = opal_carto_base_find_node(topo, myslot);
|
|
|
|
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
if(NULL == slot_node) {
|
2008-06-15 13:43:28 +00:00
|
|
|
goto out;
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
}
|
2008-06-15 13:43:28 +00:00
|
|
|
|
|
|
|
opal_carto_base_get_nodes_distance(topo, slot_node, "Memory", &dists);
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
if((*max_mem_node = opal_value_array_get_size(&dists)) < 2) {
|
2008-06-15 13:43:28 +00:00
|
|
|
goto out;
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
}
|
2008-06-15 13:43:28 +00:00
|
|
|
|
2008-09-08 15:39:30 +00:00
|
|
|
dist = (opal_carto_node_distance_t *) opal_value_array_get_item(&dists, 0);
|
2008-06-15 13:43:28 +00:00
|
|
|
opal_maffinity_base_node_name_to_id(dist->node->node_name, my_mem_node);
|
|
|
|
out:
|
Fixes trac:1988. The little bug that turned out to be huge. Yoinks.
* Various cosmetic/style updates in the btl sm
* Clean up concept of mpool module (I think that code was written way
back when the concept of "modules" was fuzzy)
* Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to
fix potential segv's when mmap'ed regions were at different
addresses in different processes (thanks Tim!).
* Change sm coll to no longer use mpool as its main source of shmem;
rather, just mmap its own segment (because it's fixed size --
there was nothing to be gained by using mpool; shedding the use of
mpool saved a lot of complexity in the sm coll setup). This
effectively made Tim's fixes moot (because now everything is an
offset into the mmap that is computed locally; there are no global
pointers). :-)
* Slightly updated common/sm to allow making mmap's for a specific
set of procs (vs. ''all'' procs in the process). This potentially
allows for same-host-inter-proc mmaps -- yay!
* Fixed many, many things in the coll sm (particularly in reduce):
* Fixed handling of MPI_IN_PLACE in reduce and allreduce
* Fixed handling of non-contiguous datatypes in reduce
* Changed the order of reductions to go from process (n-1)'s data
to process 0's data, because that's how all other OMPI coll
components work
* Fixed lots of usage of ddt functions
* When using a non-contiguous datatype, if the root process is not
(n-1), now we used a 2nd convertor to copy from shmem to the rbuf
(saves a memory copy vs. what was done before)
* Lots and lots of little cleanups, clarifications, and minor
optimizations (although still more could be done -- e.g., I think
the use of write memory barriers is fairly sub-optimal; they
could be ganged together at the root, for example)
I'm marking this as "fixes trac:1988" and closing the ticket; if something
is still broken, we can re-open the ticket.
This commit was SVN r21967.
The following Trac tickets were found above:
Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988
2009-09-15 00:25:21 +00:00
|
|
|
if (myslot) {
|
|
|
|
free(myslot);
|
|
|
|
}
|
2008-06-15 13:43:28 +00:00
|
|
|
OBJ_DESTRUCT(&dists);
|
|
|
|
opal_carto_base_free_graph(topo);
|
|
|
|
}
|
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
/*
 * One-time initialization of the shared-memory BTL component state.
 *
 * Creates one mpool per memory node, allocates the peer table, creates
 * the shared control segment through mca_common_sm_init(), carves the
 * per-process arrays (FIFO pointers, segment bases, memory-node ids) out
 * of that segment, allocates this process's FIFO array, and initializes
 * the fragment free lists.
 *
 * @param sm_btl (IN/OUT) BTL module; btl_inited is set to true on success.
 * @param n      (IN)     Number of processes to size the structures for.
 *
 * @return OMPI_SUCCESS on success; OMPI_ERR_OUT_OF_RESOURCE, OMPI_ERROR,
 *         or the error code of a failed free-list initialization otherwise.
 *
 * Component-level allocations made before a failure are not rolled back
 * here.
 */
static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
{
    size_t size, length, length_payload;
    char *sm_ctl_file;
    sm_fifo_t *my_fifos;
    int my_mem_node = -1, num_mem_nodes = -1, i;
    ompi_proc_t **procs;
    size_t num_procs;

    init_maffinity(&my_mem_node, &num_mem_nodes);
    mca_btl_sm_component.mem_node = my_mem_node;
    mca_btl_sm_component.num_mem_nodes = num_mem_nodes;

    /* sm_mpools[0] is dereferenced unconditionally below, so at least
     * one memory node is required */
    if (num_mem_nodes < 1) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* lookup shared memory pool */
    mca_btl_sm_component.sm_mpools = (mca_mpool_base_module_t **)
        calloc(num_mem_nodes, sizeof(mca_mpool_base_module_t*));
    if (NULL == mca_btl_sm_component.sm_mpools) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* create mpool for each memory node */
    for (i = 0; i < num_mem_nodes; i++) {
        mca_mpool_base_resources_t res;
        mca_btl_sm_component_t* m = &mca_btl_sm_component;

        /* disable memory binding if there is only one memory node */
        res.mem_node = (num_mem_nodes == 1) ? -1 : i;

        /* determine how much memory to create */
        /*
         * This heuristic formula mostly says that we request memory for:
         * - nfifos FIFOs, each comprising:
         *   . a sm_fifo_t structure
         *   . many pointers (fifo_size of them per FIFO)
         * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks)
         * - max fragments (sm_free_list_num of them)
         *
         * On top of all that, we sprinkle in some number of "opal_cache_line_size"
         * additions to account for some padding and edge effects that may lie
         * in the allocator.
         */
        res.size =
            FIFO_MAP_NUM(n) * ( sizeof(sm_fifo_t) + sizeof(void *) * m->fifo_size + 4 * opal_cache_line_size )
            + ( 2 * n + m->sm_free_list_inc ) * ( m->eager_limit + 2 * opal_cache_line_size )
            + m->sm_free_list_num * ( m->max_frag_size + 2 * opal_cache_line_size );

        /* before we multiply by n, make sure the result won't overflow */
        /* Stick that little pad in, particularly since we'll eventually
         * need a little extra space.  E.g., in mca_mpool_sm_init() in
         * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is
         * added.
         */
        if ( ((double) res.size) * n > LONG_MAX - 4096 ) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        res.size *= n;

        /* now, create it */
        mca_btl_sm_component.sm_mpools[i] =
            mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name,
                                         sm_btl, &res);
        /* Sanity check to ensure that we found it */
        if (NULL == mca_btl_sm_component.sm_mpools[i]) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* remember the pool that belongs to this process's memory node */
        if (i == my_mem_node) {
            mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[i];
        }
    }

    mca_btl_sm_component.sm_mpool_base =
        mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]);

    /* create a list of peers */
    mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**)
        calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == mca_btl_sm_component.sm_peers) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Allocate Shared Memory BTL process coordination
     * data structure.  This will reside in shared memory */

    /* set file name */
    if (asprintf(&sm_ctl_file, "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s",
                 orte_process_info.job_session_dir,
                 orte_process_info.nodename) < 0) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Pass in a data segment alignment of 0 to get no data
       segment (only the shared control structure) */
    size = sizeof(mca_common_sm_seg_header_t) +
        n * (sizeof(sm_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + opal_cache_line_size;

    procs = ompi_proc_world(&num_procs);
    /* NOTE(review): a NULL return is treated as an allocation failure
     * rather than being handed to mca_common_sm_init() -- confirm against
     * ompi_proc_world() */
    if (NULL == procs) {
        free(sm_ctl_file);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_sm_component.sm_seg =
        mca_common_sm_init(procs, num_procs, size, sm_ctl_file,
                           sizeof(mca_common_sm_seg_header_t),
                           opal_cache_line_size);
    /* neither the proc list nor the file name is used again in this
     * function, whatever the outcome */
    free(procs);
    if (NULL == mca_btl_sm_component.sm_seg) {
        opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory "
                    "BTL coordinating strucure :: size %lu \n",
                    (unsigned long)size);
        free(sm_ctl_file);
        return OMPI_ERROR;
    }
    free(sm_ctl_file);

    /* check to make sure number of local procs is within the
     * specified limits */
    if (mca_btl_sm_component.sm_max_procs > 0 &&
        mca_btl_sm_component.num_smp_procs + n >
        mca_btl_sm_component.sm_max_procs) {
        return OMPI_ERROR;
    }

    /* lay out three consecutive n-element arrays in the control segment:
     * FIFO pointers, segment base addresses, memory-node ids */
    mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.sm_seg->module_data_addr;
    mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n);
    mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n);

    /* set the base of the shared memory segment */
    mca_btl_sm_component.shm_bases[mca_btl_sm_component.my_smp_rank] =
        (char*)mca_btl_sm_component.sm_mpool_base;
    mca_btl_sm_component.shm_mem_nodes[mca_btl_sm_component.my_smp_rank] =
        (uint16_t)my_mem_node;

    /* initialize the array of fifo's "owned" by this process */
    if (NULL == (my_fifos = (sm_fifo_t*)mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t)))) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    /* cache the pointer to the 2d fifo array.  These addresses
     * are valid in the current process space */
    mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);
    if (NULL == mca_btl_sm_component.fifo) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_sm_component.fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    mca_btl_sm_component.mem_nodes = (uint16_t *) malloc(sizeof(uint16_t) * n);
    if (NULL == mca_btl_sm_component.mem_nodes) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* initialize fragment descriptor free lists */

    /* allocation will be for the fragment descriptor and payload buffer */
    length = sizeof(mca_btl_sm_frag1_t);
    length_payload =
        sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.eager_limit;
    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_eager, length,
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag1_t),
                                length_payload, opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if ( OMPI_SUCCESS != i ) {
        return i;
    }

    length = sizeof(mca_btl_sm_frag2_t);
    length_payload =
        sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.max_frag_size;
    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_max, length,
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag2_t),
                                length_payload, opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if ( OMPI_SUCCESS != i ) {
        return i;
    }

    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_user,
                                sizeof(mca_btl_sm_user_t),
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_user_t),
                                sizeof(mca_btl_sm_hdr_t), opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if ( OMPI_SUCCESS != i ) {
        return i;
    }

    mca_btl_sm_component.num_outstanding_frags = 0;

    mca_btl_sm_component.num_pending_sends = 0;
    i = opal_free_list_init(&mca_btl_sm_component.pending_send_fl,
                            sizeof(btl_sm_pending_send_item_t),
                            OBJ_CLASS(opal_free_list_item_t),
                            16, -1, 32);
    if ( OMPI_SUCCESS != i ) {
        return i;
    }

    /* set flag indicating btl has been inited */
    sm_btl->btl_inited = true;

    return OMPI_SUCCESS;
}
|
|
|
|
|
|
|
|
static struct mca_btl_base_endpoint_t *
|
|
|
|
create_sm_endpoint(int local_proc, struct ompi_proc_t *proc)
|
|
|
|
{
|
|
|
|
struct mca_btl_base_endpoint_t *ep;
|
Update libevent to the 2.0 series, currently at 2.0.7rc. We will update to their final release when it becomes available. Currently known errors exist in unused portions of the libevent code. This revision passes the IBM test suite on a Linux machine and on a standalone Mac.
This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects.
Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems.
Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework ala installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct.
I have gone thru the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things:
1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new)
2. run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it.
There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do.
This commit was SVN r23925.
2010-10-24 18:35:54 +00:00
|
|
|
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
2008-03-11 14:39:10 +00:00
|
|
|
char path[PATH_MAX];
|
|
|
|
#endif
|
|
|
|
|
|
|
|
ep = (struct mca_btl_base_endpoint_t*)
|
|
|
|
malloc(sizeof(struct mca_btl_base_endpoint_t));
|
|
|
|
if(NULL == ep)
|
|
|
|
return NULL;
|
|
|
|
ep->peer_smp_rank = local_proc + mca_btl_sm_component.num_smp_procs;
|
|
|
|
|
|
|
|
OBJ_CONSTRUCT(&ep->pending_sends, opal_list_t);
|
2012-02-08 01:32:36 +00:00
|
|
|
OBJ_CONSTRUCT(&ep->endpoint_lock, opal_mutex_t);
|
Update libevent to the 2.0 series, currently at 2.0.7rc. We will update to their final release when it becomes available. Currently known errors exist in unused portions of the libevent code. This revision passes the IBM test suite on a Linux machine and on a standalone Mac.
This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects.
Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems.
Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework ala installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct.
I have gone thru the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things:
1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new)
2. run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it.
There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do.
This commit was SVN r23925.
2010-10-24 18:35:54 +00:00
|
|
|
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
2008-03-11 14:39:10 +00:00
|
|
|
sprintf(path, "%s"OPAL_PATH_SEP"sm_fifo.%lu",
|
2009-03-05 21:56:03 +00:00
|
|
|
orte_process_info.job_session_dir,
|
2008-03-11 14:39:10 +00:00
|
|
|
(unsigned long)proc->proc_name.vpid);
|
|
|
|
ep->fifo_fd = open(path, O_WRONLY);
|
|
|
|
if(ep->fifo_fd < 0) {
|
2008-06-09 14:53:58 +00:00
|
|
|
opal_output(0, "mca_btl_sm_add_procs: open(%s) failed with errno=%d\n",
|
2008-03-11 14:39:10 +00:00
|
|
|
path, errno);
|
|
|
|
free(ep);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
return ep;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void calc_sm_max_procs(int n)
|
|
|
|
{
|
|
|
|
/* see if need to allocate space for extra procs */
|
|
|
|
if(0 > mca_btl_sm_component.sm_max_procs) {
|
|
|
|
/* no limit */
|
|
|
|
if(0 <= mca_btl_sm_component.sm_extra_procs) {
|
|
|
|
/* limit */
|
|
|
|
mca_btl_sm_component.sm_max_procs =
|
|
|
|
n + mca_btl_sm_component.sm_extra_procs;
|
|
|
|
} else {
|
|
|
|
/* no limit */
|
|
|
|
mca_btl_sm_component.sm_max_procs = 2 * n;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-03-20 08:15:58 +00:00
|
|
|
int mca_btl_sm_add_procs(
|
2008-03-09 13:17:13 +00:00
|
|
|
struct mca_btl_base_module_t* btl,
|
|
|
|
size_t nprocs,
|
|
|
|
struct ompi_proc_t **procs,
|
2005-06-30 05:50:55 +00:00
|
|
|
struct mca_btl_base_endpoint_t **peers,
|
2009-03-03 22:25:13 +00:00
|
|
|
opal_bitmap_t* reachability)
|
2005-05-23 22:06:50 +00:00
|
|
|
{
|
2008-03-11 14:39:10 +00:00
|
|
|
int return_code = OMPI_SUCCESS;
|
2009-05-20 17:41:13 +00:00
|
|
|
int32_t n_local_procs = 0, proc, j, my_smp_rank = -1;
|
2005-05-23 22:06:50 +00:00
|
|
|
ompi_proc_t* my_proc; /* pointer to caller's proc structure */
|
2008-03-11 14:39:10 +00:00
|
|
|
mca_btl_sm_t *sm_btl;
|
2007-02-01 17:18:35 +00:00
|
|
|
bool have_connected_peer = false;
|
2008-03-16 10:01:56 +00:00
|
|
|
char **bases;
|
2005-05-23 22:06:50 +00:00
|
|
|
/* initializion */
|
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
sm_btl = (mca_btl_sm_t *)btl;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
|
|
|
/* get pointer to my proc structure */
|
2008-03-11 14:39:10 +00:00
|
|
|
if(NULL == (my_proc = ompi_proc_local()))
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
|
|
|
/* Get unique host identifier for each process in the list,
|
|
|
|
* and idetify procs that are on this host. Add procs on this
|
|
|
|
* host to shared memory reachbility list. Also, get number
|
2008-03-11 14:39:10 +00:00
|
|
|
* of local procs in the procs list. */
|
|
|
|
for(proc = 0; proc < (int32_t)nprocs; proc++) {
|
2005-07-15 15:22:41 +00:00
|
|
|
/* check to see if this proc can be reached via shmem (i.e.,
|
|
|
|
if they're on my local host and in my job) */
|
2007-02-01 17:18:35 +00:00
|
|
|
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
|
2009-02-11 15:02:38 +00:00
|
|
|
!OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
|
2007-03-21 13:29:19 +00:00
|
|
|
peers[proc] = NULL;
|
2005-05-23 22:06:50 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2007-02-01 17:18:35 +00:00
|
|
|
/* check to see if this is me */
|
2008-03-11 14:39:10 +00:00
|
|
|
if(my_proc == procs[proc]) {
|
|
|
|
my_smp_rank = mca_btl_sm_component.my_smp_rank = n_local_procs++;
|
|
|
|
continue;
|
2007-02-01 17:18:35 +00:00
|
|
|
}
|
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
/* we have someone to talk to */
|
|
|
|
have_connected_peer = true;
|
2007-10-22 12:07:22 +00:00
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
if(!(peers[proc] = create_sm_endpoint(n_local_procs, procs[proc]))) {
|
|
|
|
return_code = OMPI_ERROR;
|
2005-07-15 15:22:41 +00:00
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
n_local_procs++;
|
2008-03-11 14:39:10 +00:00
|
|
|
|
|
|
|
/* add this proc to shared memory accessibility list */
|
2009-03-03 22:25:13 +00:00
|
|
|
return_code = opal_bitmap_set_bit(reachability, proc);
|
2008-03-11 14:39:10 +00:00
|
|
|
if(OMPI_SUCCESS != return_code)
|
|
|
|
goto CLEANUP;
|
2005-05-23 22:06:50 +00:00
|
|
|
}
|
2005-07-15 15:22:41 +00:00
|
|
|
|
2007-02-01 17:18:35 +00:00
|
|
|
/* jump out if there's not someone we can talk to */
|
2008-03-11 14:39:10 +00:00
|
|
|
if (!have_connected_peer)
|
2005-05-23 22:06:50 +00:00
|
|
|
goto CLEANUP;
|
|
|
|
|
|
|
|
/* make sure that my_smp_rank has been defined */
|
2008-03-11 14:39:10 +00:00
|
|
|
if(-1 == my_smp_rank) {
|
|
|
|
return_code = OMPI_ERROR;
|
2005-05-23 22:06:50 +00:00
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
calc_sm_max_procs(n_local_procs);
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
if (!sm_btl->btl_inited) {
|
|
|
|
return_code =
|
|
|
|
sm_btl_first_time_init(sm_btl, mca_btl_sm_component.sm_max_procs);
|
|
|
|
if(return_code != OMPI_SUCCESS)
|
2005-05-23 22:06:50 +00:00
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* set local proc's smp rank in the peers structure for
|
2009-05-20 17:41:13 +00:00
|
|
|
* rapid access and calculate reachability */
|
2008-03-11 14:39:10 +00:00
|
|
|
for(proc = 0; proc < (int32_t)nprocs; proc++) {
|
|
|
|
if(NULL == peers[proc])
|
|
|
|
continue;
|
|
|
|
mca_btl_sm_component.sm_peers[peers[proc]->peer_smp_rank] = peers[proc];
|
|
|
|
peers[proc]->my_smp_rank = my_smp_rank;
|
2005-05-23 22:06:50 +00:00
|
|
|
}
|
|
|
|
|
2008-03-16 10:01:56 +00:00
|
|
|
bases = mca_btl_sm_component.shm_bases;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2009-02-17 15:58:15 +00:00
|
|
|
/* initialize own FIFOs */
|
|
|
|
/*
|
|
|
|
* The receiver initializes all its FIFOs. All components will
|
|
|
|
* be allocated near the receiver. Nothing will be local to
|
|
|
|
* "the sender" since there will be many senders.
|
|
|
|
*/
|
|
|
|
for(j = mca_btl_sm_component.num_smp_procs;
|
|
|
|
j < mca_btl_sm_component.num_smp_procs + FIFO_MAP_NUM(n_local_procs); j++) {
|
|
|
|
|
|
|
|
return_code = sm_fifo_init( mca_btl_sm_component.fifo_size,
|
|
|
|
mca_btl_sm_component.sm_mpool,
|
|
|
|
&mca_btl_sm_component.fifo[my_smp_rank][j],
|
|
|
|
mca_btl_sm_component.fifo_lazy_free);
|
|
|
|
if(return_code != OMPI_SUCCESS)
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
2009-05-19 22:50:44 +00:00
|
|
|
opal_atomic_wmb();
|
|
|
|
|
2009-03-31 21:46:27 +00:00
|
|
|
/* Sync with other local procs. Force the FIFO initialization to always
|
|
|
|
* happens before the readers access it.
|
|
|
|
*/
|
2010-06-09 16:58:52 +00:00
|
|
|
opal_atomic_add_32( &mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1);
|
2009-03-31 21:46:27 +00:00
|
|
|
while( n_local_procs >
|
2010-06-09 16:58:52 +00:00
|
|
|
mca_btl_sm_component.sm_seg->module_seg->seg_inited) {
|
2009-03-31 21:46:27 +00:00
|
|
|
opal_progress();
|
2009-05-19 22:50:44 +00:00
|
|
|
opal_atomic_rmb();
|
2009-03-31 21:46:27 +00:00
|
|
|
}
|
|
|
|
|
2009-02-17 15:58:15 +00:00
|
|
|
/* coordinate with other processes */
|
2008-03-11 14:39:10 +00:00
|
|
|
for(j = mca_btl_sm_component.num_smp_procs;
|
|
|
|
j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) {
|
|
|
|
ptrdiff_t diff;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
|
|
|
/* spin until this element is allocated */
|
2009-02-17 15:58:15 +00:00
|
|
|
/* doesn't really wait for that process... FIFO might be allocated, but not initialized */
|
2009-05-19 22:50:44 +00:00
|
|
|
opal_atomic_rmb();
|
2008-03-16 10:01:56 +00:00
|
|
|
while(NULL == mca_btl_sm_component.shm_fifo[j]) {
|
2007-03-21 10:25:10 +00:00
|
|
|
opal_progress();
|
2009-05-19 22:50:44 +00:00
|
|
|
opal_atomic_rmb();
|
2005-10-19 00:56:14 +00:00
|
|
|
}
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2005-11-12 22:32:09 +00:00
|
|
|
/* Calculate the difference as (my_base - their_base) */
|
2008-03-16 10:01:56 +00:00
|
|
|
diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]);
|
2007-03-22 12:18:44 +00:00
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
/* store local address of remote fifos */
|
|
|
|
mca_btl_sm_component.fifo[j] =
|
2009-02-17 15:58:15 +00:00
|
|
|
(sm_fifo_t*)OFFSET2ADDR(diff, mca_btl_sm_component.shm_fifo[j]);
|
2007-03-22 12:18:44 +00:00
|
|
|
|
2008-06-15 13:43:28 +00:00
|
|
|
/* cache local copy of peer memory node number */
|
2009-02-17 15:58:15 +00:00
|
|
|
mca_btl_sm_component.mem_nodes[j] = mca_btl_sm_component.shm_mem_nodes[j];
|
2005-05-23 22:06:50 +00:00
|
|
|
}
|
|
|
|
|
2008-03-11 14:39:10 +00:00
|
|
|
/* update the local smp process count */
|
|
|
|
mca_btl_sm_component.num_smp_procs += n_local_procs;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2006-10-06 21:13:49 +00:00
|
|
|
/* make sure we have enough eager fragmnents for each process */
|
2008-05-30 03:58:39 +00:00
|
|
|
return_code = ompi_free_list_resize(&mca_btl_sm_component.sm_frags_eager,
|
2008-03-11 14:39:10 +00:00
|
|
|
mca_btl_sm_component.num_smp_procs * 2);
|
|
|
|
if (OMPI_SUCCESS != return_code)
|
2006-10-06 21:13:49 +00:00
|
|
|
goto CLEANUP;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
|
|
|
CLEANUP:
|
|
|
|
return return_code;
|
|
|
|
}
|
|
|
|
|
2005-06-30 05:50:55 +00:00
|
|
|
int mca_btl_sm_del_procs(
|
2008-03-09 13:17:13 +00:00
|
|
|
struct mca_btl_base_module_t* btl,
|
2005-05-23 22:06:50 +00:00
|
|
|
size_t nprocs,
|
2008-03-09 13:17:13 +00:00
|
|
|
struct ompi_proc_t **procs,
|
2005-06-30 05:50:55 +00:00
|
|
|
struct mca_btl_base_endpoint_t **peers)
|
2005-05-23 22:06:50 +00:00
|
|
|
{
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2005-06-30 05:50:55 +00:00
|
|
|
* MCA->BTL Clean up any resources held by BTL module
|
2005-05-23 22:06:50 +00:00
|
|
|
* before the module is unloaded.
|
|
|
|
*
|
2005-06-30 05:50:55 +00:00
|
|
|
* @param btl (IN) BTL module.
|
2005-05-23 22:06:50 +00:00
|
|
|
*
|
2005-06-30 05:50:55 +00:00
|
|
|
* Prior to unloading a BTL module, the MCA framework will call
|
|
|
|
* the BTL finalize method of the module. Any resources held by
|
|
|
|
* the BTL should be released and if required the memory corresponding
|
|
|
|
* to the BTL module freed.
|
2005-05-23 22:06:50 +00:00
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2005-06-30 05:50:55 +00:00
|
|
|
int mca_btl_sm_finalize(struct mca_btl_base_module_t* btl)
|
2005-05-23 22:06:50 +00:00
|
|
|
{
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-03-09 13:17:13 +00:00
|
|
|
/*
|
2007-02-13 12:01:36 +00:00
|
|
|
* Register callback function for error handling..
|
|
|
|
*/
|
|
|
|
int mca_btl_sm_register_error_cb(
|
|
|
|
struct mca_btl_base_module_t* btl,
|
|
|
|
mca_btl_base_module_error_cb_fn_t cbfunc)
|
|
|
|
{
|
|
|
|
mca_btl_sm_t *sm_btl = (mca_btl_sm_t *)btl;
|
|
|
|
sm_btl->error_cb = cbfunc;
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
2005-05-23 22:06:50 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Allocate a segment.
|
|
|
|
*
|
2005-06-30 05:50:55 +00:00
|
|
|
* @param btl (IN) BTL module
|
2005-05-23 22:06:50 +00:00
|
|
|
* @param size (IN) Request segment size.
|
|
|
|
*/
|
2005-06-30 05:50:55 +00:00
|
|
|
extern mca_btl_base_descriptor_t* mca_btl_sm_alloc(
|
|
|
|
struct mca_btl_base_module_t* btl,
|
2007-12-09 14:00:42 +00:00
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
2007-05-24 19:51:26 +00:00
|
|
|
uint8_t order,
|
2007-12-09 14:08:01 +00:00
|
|
|
size_t size,
|
|
|
|
uint32_t flags)
|
2005-05-23 22:06:50 +00:00
|
|
|
{
|
2008-12-17 22:14:59 +00:00
|
|
|
mca_btl_sm_frag_t* frag = NULL;
|
2005-05-23 22:06:50 +00:00
|
|
|
int rc;
|
2005-06-30 05:50:55 +00:00
|
|
|
if(size <= mca_btl_sm_component.eager_limit) {
|
2008-12-17 22:14:59 +00:00
|
|
|
MCA_BTL_SM_FRAG_ALLOC_EAGER(frag,rc);
|
2005-09-07 13:40:22 +00:00
|
|
|
} else if (size <= mca_btl_sm_component.max_frag_size) {
|
2008-12-17 22:14:59 +00:00
|
|
|
MCA_BTL_SM_FRAG_ALLOC_MAX(frag,rc);
|
2005-05-23 22:06:50 +00:00
|
|
|
}
|
2006-10-06 21:13:49 +00:00
|
|
|
|
2008-12-17 22:14:59 +00:00
|
|
|
if (OPAL_LIKELY(frag != NULL)) {
|
2006-10-06 21:13:49 +00:00
|
|
|
frag->segment.seg_len = size;
|
2008-02-18 17:39:30 +00:00
|
|
|
frag->base.des_flags = flags;
|
2006-10-06 21:13:49 +00:00
|
|
|
}
|
2005-06-30 05:50:55 +00:00
|
|
|
return (mca_btl_base_descriptor_t*)frag;
|
2005-05-23 22:06:50 +00:00
|
|
|
}
|
2006-10-06 21:13:49 +00:00
|
|
|
|
2005-05-23 22:06:50 +00:00
|
|
|
/**
|
2005-06-30 05:50:55 +00:00
|
|
|
* Return a segment allocated by this BTL.
|
2005-05-23 22:06:50 +00:00
|
|
|
*
|
2005-06-30 05:50:55 +00:00
|
|
|
* @param btl (IN) BTL module
|
2005-05-23 22:06:50 +00:00
|
|
|
* @param segment (IN) Allocated segment.
|
|
|
|
*/
|
2005-06-30 05:50:55 +00:00
|
|
|
extern int mca_btl_sm_free(
|
|
|
|
struct mca_btl_base_module_t* btl,
|
|
|
|
mca_btl_base_descriptor_t* des)
|
2005-05-23 22:06:50 +00:00
|
|
|
{
|
2005-06-30 05:50:55 +00:00
|
|
|
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
|
2006-05-31 14:24:32 +00:00
|
|
|
MCA_BTL_SM_FRAG_RETURN(frag);
|
2008-03-09 13:17:13 +00:00
|
|
|
|
2005-05-23 22:06:50 +00:00
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Pack data
|
|
|
|
*
|
2005-06-30 05:50:55 +00:00
|
|
|
* @param btl (IN) BTL module
|
2005-05-23 22:06:50 +00:00
|
|
|
*/
|
2005-06-30 05:50:55 +00:00
|
|
|
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
|
|
|
|
struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
2005-06-24 21:12:38 +00:00
|
|
|
mca_mpool_base_registration_t* registration,
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, commited)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 04:56:31 +00:00
|
|
|
struct opal_convertor_t* convertor,
|
2007-05-24 19:51:26 +00:00
|
|
|
uint8_t order,
|
2005-05-23 22:06:50 +00:00
|
|
|
size_t reserve,
|
2007-12-09 14:08:01 +00:00
|
|
|
size_t* size,
|
|
|
|
uint32_t flags)
|
2005-05-23 22:06:50 +00:00
|
|
|
{
|
2005-06-30 05:50:55 +00:00
|
|
|
mca_btl_sm_frag_t* frag;
|
2005-06-01 14:34:22 +00:00
|
|
|
struct iovec iov;
|
|
|
|
uint32_t iov_count = 1;
|
2005-06-08 19:13:28 +00:00
|
|
|
size_t max_data = *size;
|
2005-06-01 14:34:22 +00:00
|
|
|
int rc;
|
2009-12-15 23:34:09 +00:00
|
|
|
#if OMPI_BTL_SM_HAVE_KNEM
|
|
|
|
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*)btl;
|
|
|
|
struct knem_cmd_create_region knem_cr;
|
|
|
|
struct knem_cmd_param_iovec knem_iov;
|
2005-06-01 14:34:22 +00:00
|
|
|
|
2009-12-15 23:34:09 +00:00
|
|
|
if( (0 != reserve) || (OPAL_UNLIKELY(!mca_btl_sm_component.use_knem)) ) {
|
|
|
|
#endif
|
|
|
|
if ( reserve + max_data <= mca_btl_sm_component.eager_limit ) {
|
|
|
|
MCA_BTL_SM_FRAG_ALLOC_EAGER(frag,rc);
|
|
|
|
} else {
|
|
|
|
MCA_BTL_SM_FRAG_ALLOC_MAX(frag, rc);
|
|
|
|
}
|
|
|
|
if( OPAL_UNLIKELY(NULL == frag) ) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-06-01 14:34:22 +00:00
|
|
|
|
2009-12-15 23:34:09 +00:00
|
|
|
if( OPAL_UNLIKELY(reserve + max_data > frag->size) ) {
|
|
|
|
max_data = frag->size - reserve;
|
|
|
|
}
|
|
|
|
iov.iov_len = max_data;
|
|
|
|
iov.iov_base =
|
|
|
|
(IOVBASE_TYPE*)(((unsigned char*)(frag->segment.seg_addr.pval)) +
|
|
|
|
reserve);
|
|
|
|
|
|
|
|
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data );
|
|
|
|
if( OPAL_UNLIKELY(rc < 0) ) {
|
|
|
|
MCA_BTL_SM_FRAG_RETURN(frag);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
frag->segment.seg_len = reserve + max_data;
|
|
|
|
#if OMPI_BTL_SM_HAVE_KNEM
|
|
|
|
} else {
|
|
|
|
MCA_BTL_SM_FRAG_ALLOC_USER(frag, rc);
|
|
|
|
if( OPAL_UNLIKELY(NULL == frag) ) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
iov.iov_len = max_data;
|
|
|
|
iov.iov_base = NULL;
|
|
|
|
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
|
|
|
if( OPAL_UNLIKELY(rc < 0) ) {
|
|
|
|
MCA_BTL_SM_FRAG_RETURN(frag);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
frag->segment.seg_addr.pval = iov.iov_base;
|
|
|
|
frag->segment.seg_len = max_data;
|
|
|
|
|
|
|
|
knem_iov.base = (uintptr_t)iov.iov_base;
|
|
|
|
knem_iov.len = max_data;
|
|
|
|
knem_cr.iovec_array = (uintptr_t)&knem_iov;
|
|
|
|
knem_cr.iovec_nr = iov_count;
|
|
|
|
knem_cr.protection = PROT_READ;
|
|
|
|
knem_cr.flags = KNEM_FLAG_SINGLEUSE;
|
|
|
|
if (OPAL_UNLIKELY(ioctl(sm_btl->knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2011-11-06 16:19:09 +00:00
|
|
|
frag->segment.seg_key.key64[0] = knem_cr.cookie;
|
2005-06-01 14:34:22 +00:00
|
|
|
}
|
2009-12-15 23:34:09 +00:00
|
|
|
#endif
|
|
|
|
frag->base.des_src = &(frag->segment);
|
|
|
|
frag->base.des_src_cnt = 1;
|
|
|
|
frag->base.order = MCA_BTL_NO_ORDER;
|
|
|
|
frag->base.des_dst = NULL;
|
|
|
|
frag->base.des_dst_cnt = 0;
|
2008-02-18 17:39:30 +00:00
|
|
|
frag->base.des_flags = flags;
|
2005-06-01 14:34:22 +00:00
|
|
|
*size = max_data;
|
|
|
|
return &frag->base;
|
2005-05-23 22:06:50 +00:00
|
|
|
}
|
|
|
|
|
2008-05-30 03:58:39 +00:00
|
|
|
/*
 * MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag)
 *
 * The active definition (the #else branch) expands to nothing.  The
 * disabled variant zeroes the bytes that follow the fragment's data: first
 * byte-wise according to the low two bits of the end address, then
 * int-wise according to bits 2-3.  Both switch statements rely on
 * case fallthrough.
 */
#if 0
#define MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag)          \
    do {                                                                \
        char* _memory = (char*)(sm_frag)->segment.seg_addr.pval +       \
            (sm_frag)->segment.seg_len;                                 \
        int* _intmem;                                                   \
        size_t align = (intptr_t)_memory & 0xFUL;                       \
        switch( align & 0x3 ) {                                         \
        case 3: *_memory = 0; _memory++;  /* fallthrough */             \
        case 2: *_memory = 0; _memory++;  /* fallthrough */             \
        case 1: *_memory = 0; _memory++;                                \
        }                                                               \
        align >>= 2;                                                    \
        _intmem = (int*)_memory;                                        \
        switch( align ) {                                               \
        case 3: *_intmem = 0; _intmem++;  /* fallthrough */             \
        case 2: *_intmem = 0; _intmem++;  /* fallthrough */             \
        case 1: *_intmem = 0; _intmem++;                                \
        }                                                               \
    } while(0)
#else
#define MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag)
#endif

/* Disabled memset-based alternative body for the macro above. */
#if 0
        if( OPAL_LIKELY(align > 0) ) {                                  \
            align = 0xFUL - align;                                      \
            memset( _memory, 0, align );                                \
        }                                                               \

#endif
|
2008-03-09 13:17:13 +00:00
|
|
|
|
2005-05-23 22:06:50 +00:00
|
|
|
/**
|
2008-05-30 03:58:39 +00:00
|
|
|
* Initiate an inline send to the peer. If failure then return a descriptor.
|
2005-05-23 22:06:50 +00:00
|
|
|
*
|
2005-06-30 05:50:55 +00:00
|
|
|
* @param btl (IN) BTL module
|
|
|
|
* @param peer (IN) BTL peer addressing
|
2005-05-23 22:06:50 +00:00
|
|
|
*/
|
2008-05-30 03:58:39 +00:00
|
|
|
int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, commited)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 04:56:31 +00:00
|
|
|
struct opal_convertor_t* convertor,
|
2008-05-30 03:58:39 +00:00
|
|
|
void* header,
|
|
|
|
size_t header_size,
|
|
|
|
size_t payload_size,
|
|
|
|
uint8_t order,
|
|
|
|
uint32_t flags,
|
|
|
|
mca_btl_base_tag_t tag,
|
|
|
|
mca_btl_base_descriptor_t** descriptor )
|
|
|
|
{
|
2009-02-26 18:10:50 +00:00
|
|
|
size_t length = (header_size + payload_size);
|
2008-05-30 03:58:39 +00:00
|
|
|
mca_btl_sm_frag_t* frag;
|
|
|
|
int rc;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2009-06-27 23:42:09 +00:00
|
|
|
if ( mca_btl_sm_component.num_outstanding_frags * 2 > (int) mca_btl_sm_component.fifo_size ) {
|
2009-06-27 00:12:56 +00:00
|
|
|
mca_btl_sm_component_progress();
|
|
|
|
}
|
|
|
|
|
2009-02-26 18:10:50 +00:00
|
|
|
/* this check should be unnecessary... turn into an assertion? */
|
2008-05-30 03:58:39 +00:00
|
|
|
if( length < mca_btl_sm_component.eager_limit ) {
|
2009-02-26 18:10:50 +00:00
|
|
|
|
|
|
|
/* allocate a fragment, giving up if we can't get one */
|
|
|
|
/* note that frag==NULL is equivalent to rc returning an error code */
|
2008-12-17 22:14:59 +00:00
|
|
|
MCA_BTL_SM_FRAG_ALLOC_EAGER(frag, rc);
|
2008-05-30 03:58:39 +00:00
|
|
|
if( OPAL_UNLIKELY(NULL == frag) ) {
|
|
|
|
*descriptor = NULL;
|
|
|
|
return rc;
|
|
|
|
}
|
2009-02-26 18:10:50 +00:00
|
|
|
|
|
|
|
/* fill in fragment fields */
|
2008-05-30 03:58:39 +00:00
|
|
|
frag->segment.seg_len = length;
|
2009-02-26 18:10:50 +00:00
|
|
|
frag->hdr->len = length;
|
2008-05-30 03:58:39 +00:00
|
|
|
assert( 0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) );
|
2009-02-26 18:10:50 +00:00
|
|
|
frag->base.des_flags = flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; /* why do any flags matter here other than OWNERSHIP? */
|
2008-05-30 03:58:39 +00:00
|
|
|
frag->hdr->tag = tag;
|
|
|
|
frag->endpoint = endpoint;
|
|
|
|
|
2009-02-26 18:10:50 +00:00
|
|
|
/* write the match header (with MPI comm/tag/etc. info) */
|
2008-05-30 03:58:39 +00:00
|
|
|
memcpy( frag->segment.seg_addr.pval, header, header_size );
|
2009-02-26 18:10:50 +00:00
|
|
|
|
|
|
|
/* write the message data if there is any */
|
|
|
|
/*
|
|
|
|
We can add MEMCHECKER calls before and after the packing.
|
|
|
|
*/
|
2008-05-30 03:58:39 +00:00
|
|
|
if( payload_size ) {
|
2009-02-26 18:10:50 +00:00
|
|
|
size_t max_data;
|
2008-05-30 03:58:39 +00:00
|
|
|
struct iovec iov;
|
2008-06-12 17:24:39 +00:00
|
|
|
uint32_t iov_count;
|
2008-05-30 03:58:39 +00:00
|
|
|
/* pack the data into the supplied buffer */
|
|
|
|
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.seg_addr.pval + header_size);
|
2008-06-12 17:24:39 +00:00
|
|
|
iov.iov_len = max_data = payload_size;
|
2008-05-30 03:58:39 +00:00
|
|
|
iov_count = 1;
|
|
|
|
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, commited)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 04:56:31 +00:00
|
|
|
(void)opal_convertor_pack( convertor, &iov, &iov_count, &max_data);
|
2008-05-30 03:58:39 +00:00
|
|
|
|
|
|
|
assert(max_data == payload_size);
|
|
|
|
}
|
2009-02-26 18:10:50 +00:00
|
|
|
|
2008-05-30 03:58:39 +00:00
|
|
|
MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag);
|
2009-02-26 18:10:50 +00:00
|
|
|
|
|
|
|
/* write the fragment pointer to the FIFO */
|
2008-05-30 03:58:39 +00:00
|
|
|
/*
|
2009-02-26 18:10:50 +00:00
|
|
|
* Note that we don't care what the FIFO-write return code is. Even if
|
|
|
|
* the return code indicates failure, the write has still "completed" from
|
|
|
|
* our point of view: it has been posted to a "pending send" queue.
|
2008-05-30 03:58:39 +00:00
|
|
|
*/
|
2009-06-27 00:12:56 +00:00
|
|
|
OPAL_THREAD_ADD32(&mca_btl_sm_component.num_outstanding_frags, +1);
|
2008-05-30 03:58:39 +00:00
|
|
|
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
|
2009-06-27 00:12:56 +00:00
|
|
|
endpoint->peer_smp_rank, (void *) VIRTUAL2RELATIVE(frag->hdr), false, true, rc);
|
2009-02-26 18:10:50 +00:00
|
|
|
return OMPI_SUCCESS;
|
2008-05-30 03:58:39 +00:00
|
|
|
}
|
2009-02-26 18:10:50 +00:00
|
|
|
|
|
|
|
/* presumably, this code path will never get executed */
|
2008-05-30 03:58:39 +00:00
|
|
|
*descriptor = mca_btl_sm_alloc( btl, endpoint, order,
|
|
|
|
payload_size + header_size, flags);
|
|
|
|
return OMPI_ERR_RESOURCE_BUSY;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initiate a send to the peer.
|
|
|
|
*
|
|
|
|
* @param btl (IN) BTL module
|
|
|
|
* @param peer (IN) BTL peer addressing
|
|
|
|
*/
|
2008-09-30 18:30:35 +00:00
|
|
|
int mca_btl_sm_send( struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
|
|
struct mca_btl_base_descriptor_t* descriptor,
|
|
|
|
mca_btl_base_tag_t tag )
|
2005-05-23 22:06:50 +00:00
|
|
|
{
|
2005-06-30 05:50:55 +00:00
|
|
|
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)descriptor;
|
2007-10-22 12:07:22 +00:00
|
|
|
int rc;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2009-06-27 23:42:09 +00:00
|
|
|
if ( mca_btl_sm_component.num_outstanding_frags * 2 > (int) mca_btl_sm_component.fifo_size ) {
|
2009-06-27 00:12:56 +00:00
|
|
|
mca_btl_sm_component_progress();
|
|
|
|
}
|
|
|
|
|
2007-10-22 12:07:22 +00:00
|
|
|
/* available header space */
|
2007-03-05 14:24:09 +00:00
|
|
|
frag->hdr->len = frag->segment.seg_len;
|
2007-08-27 21:41:04 +00:00
|
|
|
/* type of message, pt-2-pt, one-sided, etc */
|
2007-03-05 14:24:09 +00:00
|
|
|
frag->hdr->tag = tag;
|
2005-05-23 22:06:50 +00:00
|
|
|
|
2008-05-30 03:58:39 +00:00
|
|
|
MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag);
|
|
|
|
|
2007-10-22 12:07:22 +00:00
|
|
|
frag->endpoint = endpoint;
|
|
|
|
|
2008-03-09 13:17:13 +00:00
|
|
|
/*
|
2005-05-23 22:06:50 +00:00
|
|
|
* post the descriptor in the queue - post with the relative
|
2008-03-09 13:17:13 +00:00
|
|
|
* address
|
2005-05-23 22:06:50 +00:00
|
|
|
*/
|
2009-06-27 00:12:56 +00:00
|
|
|
OPAL_THREAD_ADD32(&mca_btl_sm_component.num_outstanding_frags, +1);
|
2007-10-22 12:07:22 +00:00
|
|
|
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
|
2009-06-27 00:12:56 +00:00
|
|
|
endpoint->peer_smp_rank, (void *) VIRTUAL2RELATIVE(frag->hdr), false, true, rc);
|
2008-08-17 19:00:50 +00:00
|
|
|
if( OPAL_LIKELY(0 == rc) ) {
|
|
|
|
return 1; /* the data is completely gone */
|
|
|
|
}
|
|
|
|
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
2008-08-17 20:07:53 +00:00
|
|
|
/* not yet gone, but pending. Let the upper level knows that
|
|
|
|
* the callback will be triggered when the data will be sent.
|
|
|
|
*/
|
|
|
|
return 0;
|
2005-05-23 22:06:50 +00:00
|
|
|
}
|
2007-03-16 23:11:45 +00:00
|
|
|
|
2009-12-15 23:34:09 +00:00
|
|
|
#if OMPI_BTL_SM_HAVE_KNEM
|
|
|
|
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst(
|
|
|
|
struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
|
|
struct mca_mpool_base_registration_t* registration,
|
|
|
|
struct opal_convertor_t* convertor,
|
|
|
|
uint8_t order,
|
|
|
|
size_t reserve,
|
|
|
|
size_t* size,
|
|
|
|
uint32_t flags)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
mca_btl_sm_frag_t* frag;
|
|
|
|
|
|
|
|
MCA_BTL_SM_FRAG_ALLOC_USER(frag, rc);
|
|
|
|
if(OPAL_UNLIKELY(NULL == frag)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
frag->segment.seg_len = *size;
|
|
|
|
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
|
|
|
|
|
|
|
|
frag->base.des_src = NULL;
|
|
|
|
frag->base.des_src_cnt = 0;
|
|
|
|
frag->base.des_dst = &frag->segment;
|
|
|
|
frag->base.des_dst_cnt = 1;
|
|
|
|
frag->base.des_flags = flags;
|
|
|
|
return &frag->base;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initiate an synchronous get.
|
|
|
|
*
|
|
|
|
* @param btl (IN) BTL module
|
|
|
|
* @param endpoint (IN) BTL addressing information
|
|
|
|
* @param descriptor (IN) Description of the data to be transferred
|
|
|
|
*/
|
|
|
|
int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
|
|
struct mca_btl_base_descriptor_t* des)
|
|
|
|
{
|
|
|
|
int btl_ownership;
|
|
|
|
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
|
|
|
|
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
|
|
|
|
mca_btl_base_segment_t *src = des->des_src;
|
|
|
|
mca_btl_base_segment_t *dst = des->des_dst;
|
|
|
|
struct knem_cmd_inline_copy icopy;
|
|
|
|
struct knem_cmd_param_iovec recv_iovec;
|
|
|
|
|
|
|
|
/* Fill in the ioctl data fields. There's no async completion, so
|
|
|
|
we don't need to worry about getting a slot, etc. */
|
|
|
|
recv_iovec.base = (uintptr_t) dst->seg_addr.pval;
|
|
|
|
recv_iovec.len = dst->seg_len;
|
|
|
|
icopy.local_iovec_array = (uintptr_t)&recv_iovec;
|
|
|
|
icopy.local_iovec_nr = 1;
|
2011-11-06 16:19:09 +00:00
|
|
|
icopy.remote_cookie = src->seg_key.key64[0];
|
2009-12-15 23:34:09 +00:00
|
|
|
icopy.remote_offset = 0;
|
|
|
|
icopy.write = 0;
|
|
|
|
|
|
|
|
/* Use the DMA flag if knem supports it *and* the segment length
|
|
|
|
is greater than the cutoff. Note that if the knem_dma_min
|
|
|
|
value is 0 (i.e., the MCA param was set to 0), the segment size
|
|
|
|
will never be larger than it, so DMA will never be used. */
|
|
|
|
icopy.flags = 0;
|
|
|
|
if (mca_btl_sm_component.knem_dma_min <= dst->seg_len) {
|
|
|
|
icopy.flags = mca_btl_sm_component.knem_dma_flag;
|
|
|
|
}
|
|
|
|
/* synchronous flags only, no need to specify icopy.async_status_index */
|
|
|
|
|
|
|
|
/* When the ioctl returns, the transfer is done and we can invoke
|
|
|
|
the btl callback and return the frag */
|
|
|
|
if (OPAL_UNLIKELY(0 != ioctl(sm_btl->knem_fd,
|
|
|
|
KNEM_CMD_INLINE_COPY, &icopy))) {
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */
|
|
|
|
|
|
|
|
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
|
|
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
|
|
|
frag->base.des_cbfunc(&mca_btl_sm.super,
|
|
|
|
frag->endpoint, &frag->base,
|
|
|
|
OMPI_SUCCESS);
|
|
|
|
}
|
|
|
|
if (btl_ownership) {
|
|
|
|
MCA_BTL_SM_FRAG_RETURN(frag);
|
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initiate an asynchronous get.
|
|
|
|
*
|
|
|
|
* @param btl (IN) BTL module
|
|
|
|
* @param endpoint (IN) BTL addressing information
|
|
|
|
* @param descriptor (IN) Description of the data to be transferred
|
|
|
|
*/
|
|
|
|
int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
|
|
struct mca_btl_base_descriptor_t* des)
|
|
|
|
{
|
|
|
|
int btl_ownership;
|
|
|
|
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
|
|
|
|
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
|
|
|
|
mca_btl_base_segment_t *src = des->des_src;
|
|
|
|
mca_btl_base_segment_t *dst = des->des_dst;
|
|
|
|
struct knem_cmd_inline_copy icopy;
|
|
|
|
struct knem_cmd_param_iovec recv_iovec;
|
|
|
|
|
|
|
|
/* If we have no knem slots available, return
|
|
|
|
TEMP_OUT_OF_RESOURCE */
|
|
|
|
if (sm_btl->knem_status_num_used >=
|
|
|
|
mca_btl_sm_component.knem_max_simultaneous) {
|
|
|
|
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We have a slot, so fill in the data fields. Bump the
|
|
|
|
first_avail and num_used counters. */
|
|
|
|
recv_iovec.base = (uintptr_t) dst->seg_addr.pval;
|
|
|
|
recv_iovec.len = dst->seg_len;
|
|
|
|
icopy.local_iovec_array = (uintptr_t)&recv_iovec;
|
|
|
|
icopy.local_iovec_nr = 1;
|
|
|
|
icopy.write = 0;
|
|
|
|
icopy.async_status_index = sm_btl->knem_status_first_avail++;
|
|
|
|
if (sm_btl->knem_status_first_avail >=
|
|
|
|
mca_btl_sm_component.knem_max_simultaneous) {
|
|
|
|
sm_btl->knem_status_first_avail = 0;
|
|
|
|
}
|
|
|
|
++sm_btl->knem_status_num_used;
|
2011-11-06 16:19:09 +00:00
|
|
|
icopy.remote_cookie = src->seg_key.key64[0];
|
2009-12-15 23:34:09 +00:00
|
|
|
icopy.remote_offset = 0;
|
|
|
|
|
|
|
|
/* Use the DMA flag if knem supports it *and* the segment length
|
|
|
|
is greater than the cutoff */
|
|
|
|
icopy.flags = KNEM_FLAG_ASYNCDMACOMPLETE;
|
|
|
|
if (mca_btl_sm_component.knem_dma_min <= dst->seg_len) {
|
|
|
|
icopy.flags = mca_btl_sm_component.knem_dma_flag;
|
|
|
|
}
|
|
|
|
|
|
|
|
sm_btl->knem_frag_array[icopy.async_status_index] = frag;
|
|
|
|
if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd,
|
|
|
|
KNEM_CMD_INLINE_COPY, &icopy))) {
|
|
|
|
if (icopy.current_status != KNEM_STATUS_PENDING) {
|
|
|
|
/* request completed synchronously */
|
|
|
|
|
|
|
|
/* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */
|
|
|
|
|
|
|
|
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
|
|
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
|
|
|
frag->base.des_cbfunc(&mca_btl_sm.super,
|
|
|
|
frag->endpoint, &frag->base,
|
|
|
|
OMPI_SUCCESS);
|
|
|
|
}
|
|
|
|
if (btl_ownership) {
|
|
|
|
MCA_BTL_SM_FRAG_RETURN(frag);
|
|
|
|
}
|
|
|
|
|
|
|
|
--sm_btl->knem_status_num_used;
|
|
|
|
++sm_btl->knem_status_first_used;
|
|
|
|
if (sm_btl->knem_status_first_used >=
|
|
|
|
mca_btl_sm_component.knem_max_simultaneous) {
|
|
|
|
sm_btl->knem_status_first_used = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
} else {
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2010-03-12 23:57:50 +00:00
|
|
|
#if OPAL_ENABLE_FT_CR == 0
|
2007-03-16 23:11:45 +00:00
|
|
|
int mca_btl_sm_ft_event(int state) {
|
2008-10-16 15:09:00 +00:00
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
int mca_btl_sm_ft_event(int state) {
|
|
|
|
/* Notify mpool */
|
|
|
|
if( NULL != mca_btl_sm_component.sm_mpool &&
|
|
|
|
NULL != mca_btl_sm_component.sm_mpool->mpool_ft_event) {
|
|
|
|
mca_btl_sm_component.sm_mpool->mpool_ft_event(state);
|
|
|
|
}
|
|
|
|
|
2007-03-16 23:11:45 +00:00
|
|
|
if(OPAL_CRS_CHECKPOINT == state) {
|
2010-06-09 16:58:52 +00:00
|
|
|
if( NULL != mca_btl_sm_component.sm_seg ) {
|
2008-10-16 15:09:00 +00:00
|
|
|
/* On restart we need the old file names to exist (not necessarily
|
|
|
|
* contain content) so the CRS component does not fail when searching
|
|
|
|
* for these old file handles. The restart procedure will make sure
|
|
|
|
* these files get cleaned up appropriately.
|
|
|
|
*/
|
A number of C/R enhancements per RFC below:
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php
Documentation:
http://osl.iu.edu/research/ft/
Major Changes:
--------------
* Added C/R-enabled Debugging support.
Enabled with the --enable-crdebug flag. See the following website for more information:
http://osl.iu.edu/research/ft/crdebug/
* Added Stable Storage (SStore) framework for checkpoint storage
* 'central' component does a direct to central storage save
* 'stage' component stages checkpoints to central storage while the application continues execution.
* 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress)
* 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching)
* Added Compression (compress) framework to support
* Add two new ErrMgr recovery policies
* {{{crmig}}} C/R Process Migration
* {{{autor}}} C/R Automatic Recovery
* Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component
* Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option)
* {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342)
* {{{OMPI_CR_Restart}}}
* {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules)
* {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192)
* {{{OMPI_CR_Quiesce_start}}}
* {{{OMPI_CR_Quiesce_checkpoint}}}
* {{{OMPI_CR_Quiesce_end}}}
* {{{OMPI_CR_self_register_checkpoint_callback}}}
* {{{OMPI_CR_self_register_restart_callback}}}
* {{{OMPI_CR_self_register_continue_callback}}}
* The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future.
* Add a progress meter to:
* FileM rsh (filem_rsh_process_meter)
* SnapC full (snapc_full_progress_meter)
* SStore stage (sstore_stage_progress_meter)
* Added 2 new command line options to ompi-restart
* --showme : Display the full command line that would have been exec'ed.
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413)
* Deprecated some MCA params:
* crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir
* snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir
* snapc_base_global_shared deprecated, use sstore_stage_global_is_shared
* snapc_base_store_in_place deprecated, replaced with different components of SStore
* snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref
* snapc_base_establish_global_snapshot_dir deprecated, never well supported
* snapc_full_skip_filem deprecated, use sstore_stage_skip_filem
Minor Changes:
--------------
* Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing.
* Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components
* Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it.
* Fixes trac:2208 : Honor various TMPDIR varaibles instead of forcing {{{/tmp}}}
* Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set.
* opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality.
* Cleanup the CRS framework and components to work with the SStore framework.
* Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably).
* Add 'quiesce' hook to CRCP for a future enhancement.
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}.
* Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive.
* {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked.
* {{{opal-restart}}} also support local decompression before restarting
* {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata
* {{{orte-restart}}} now uses the SStore framework to work with the metadata
* Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality.
* Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}.
* Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped.
* Make sure to decrement the number of 'num_local_procs' in the orted when one goes away.
* odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options.
* Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities.
* Improve the checks for 'already checkpointing' error path.
* A a recovery output timer, to show how long it takes to restart a job
* Do a better job of cleaning up the old session directory on restart.
* Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment)
* Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize.
This commit was SVN r23587.
The following Trac tickets were found above:
Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924
Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097
Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161
Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192
Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208
Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342
Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-10 20:51:11 +00:00
|
|
|
orte_sstore.set_attr(orte_sstore_handle_current,
|
|
|
|
SSTORE_METADATA_LOCAL_TOUCH,
|
2011-07-10 23:32:23 +00:00
|
|
|
mca_btl_sm_component.sm_seg->shmem_ds.seg_name);
|
2008-10-16 15:09:00 +00:00
|
|
|
}
|
2007-03-16 23:11:45 +00:00
|
|
|
}
|
|
|
|
else if(OPAL_CRS_CONTINUE == state) {
|
A number of C/R enhancements per RFC below:
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php
Documentation:
http://osl.iu.edu/research/ft/
Major Changes:
--------------
* Added C/R-enabled Debugging support.
Enabled with the --enable-crdebug flag. See the following website for more information:
http://osl.iu.edu/research/ft/crdebug/
* Added Stable Storage (SStore) framework for checkpoint storage
* 'central' component does a direct to central storage save
* 'stage' component stages checkpoints to central storage while the application continues execution.
* 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress)
* 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching)
* Added Compression (compress) framework to support
* Add two new ErrMgr recovery policies
* {{{crmig}}} C/R Process Migration
* {{{autor}}} C/R Automatic Recovery
* Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component
* Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option)
* {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342)
* {{{OMPI_CR_Restart}}}
* {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules)
* {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192)
* {{{OMPI_CR_Quiesce_start}}}
* {{{OMPI_CR_Quiesce_checkpoint}}}
* {{{OMPI_CR_Quiesce_end}}}
* {{{OMPI_CR_self_register_checkpoint_callback}}}
* {{{OMPI_CR_self_register_restart_callback}}}
* {{{OMPI_CR_self_register_continue_callback}}}
* The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future.
* Add a progress meter to:
* FileM rsh (filem_rsh_process_meter)
* SnapC full (snapc_full_progress_meter)
* SStore stage (sstore_stage_progress_meter)
* Added 2 new command line options to ompi-restart
* --showme : Display the full command line that would have been exec'ed.
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413)
* Deprecated some MCA params:
* crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir
* snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir
* snapc_base_global_shared deprecated, use sstore_stage_global_is_shared
* snapc_base_store_in_place deprecated, replaced with different components of SStore
* snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref
* snapc_base_establish_global_snapshot_dir deprecated, never well supported
* snapc_full_skip_filem deprecated, use sstore_stage_skip_filem
Minor Changes:
--------------
* Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing.
* Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components
* Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it.
* Fixes trac:2208 : Honor various TMPDIR varaibles instead of forcing {{{/tmp}}}
* Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set.
* opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality.
* Cleanup the CRS framework and components to work with the SStore framework.
* Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably).
* Add 'quiesce' hook to CRCP for a future enhancement.
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}.
* Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive.
* {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked.
* {{{opal-restart}}} also support local decompression before restarting
* {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata
* {{{orte-restart}}} now uses the SStore framework to work with the metadata
* Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality.
* Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}.
* Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped.
* Make sure to decrement the number of 'num_local_procs' in the orted when one goes away.
* odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options.
* Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities.
* Improve the checks for 'already checkpointing' error path.
* A a recovery output timer, to show how long it takes to restart a job
* Do a better job of cleaning up the old session directory on restart.
* Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment)
* Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize.
This commit was SVN r23587.
The following Trac tickets were found above:
Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924
Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097
Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161
Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192
Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208
Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342
Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-10 20:51:11 +00:00
|
|
|
if( orte_cr_continue_like_restart ) {
|
2010-06-09 16:58:52 +00:00
|
|
|
if( NULL != mca_btl_sm_component.sm_seg ) {
|
2008-10-16 15:09:00 +00:00
|
|
|
/* Add shared memory file */
|
2011-07-10 23:32:23 +00:00
|
|
|
opal_crs_base_cleanup_append(mca_btl_sm_component.sm_seg->shmem_ds.seg_name, false);
|
2008-10-16 15:09:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Clear this so we force the module to re-init the sm files */
|
|
|
|
mca_btl_sm_component.sm_mpool = NULL;
|
|
|
|
}
|
2007-03-16 23:11:45 +00:00
|
|
|
}
|
2008-10-16 15:09:00 +00:00
|
|
|
else if(OPAL_CRS_RESTART == state ||
|
|
|
|
OPAL_CRS_RESTART_PRE == state) {
|
2010-06-09 16:58:52 +00:00
|
|
|
if( NULL != mca_btl_sm_component.sm_seg ) {
|
2008-10-16 15:09:00 +00:00
|
|
|
/* Add shared memory file */
|
2011-07-10 23:32:23 +00:00
|
|
|
opal_crs_base_cleanup_append(mca_btl_sm_component.sm_seg->shmem_ds.seg_name, false);
|
2008-10-16 15:09:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Clear this so we force the module to re-init the sm files */
|
|
|
|
mca_btl_sm_component.sm_mpool = NULL;
|
2007-03-16 23:11:45 +00:00
|
|
|
}
|
|
|
|
else if(OPAL_CRS_TERM == state ) {
|
|
|
|
;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
2010-03-12 23:57:50 +00:00
|
|
|
#endif /* OPAL_ENABLE_FT_CR */
|