From ff384daab459874c2956b7a23eed794ba40c7ae7 Mon Sep 17 00:00:00 2001 From: Mike Dubman Date: Tue, 12 Feb 2013 15:33:21 +0000 Subject: [PATCH] Added new project: oshmem. This commit was SVN r28048. --- CMakeLists.txt | 7 + Makefile.am | 11 + README-SHMEM-WITH-VALGRIND.txt | 36 + README-SHMEM.txt | 19 + VERSION | 3 + autogen.pl | 13 +- config/opal_check_attributes.m4 | 12 + config/oshmem_config_files.m4 | 26 + config/oshmem_configure_options.m4 | 188 + configure.ac | 25 +- contrib/Makefile.am | 10 + .../win32/CMakeModules/opal_get_version.cmake | 11 + contrib/tau/readme | 178 + contrib/tau/shmem_wiki_tau.sh | 176 + contrib/tau/tau_openshmem.patch | 101 + distr/README | 24 + distr/build_knem_rpm.sh | 13 + distr/buildrpm.sh | 165 + distr/buildtarball.sh | 22 + distr/oshmem.spec.in | 151 + distr/rpm2cpio | 118 + knem_patch/README-knem-patch | 16 + knem_patch/shmem_knem.patch | 88 + ompi/class/ompi_free_list.c | 4 +- ompi/mca/btl/btl.h | 9 + ompi/mca/btl/openib/btl_openib.c | 33 +- ompi/mca/btl/openib/btl_openib_async.c | 5 +- ompi/mca/btl/openib/btl_openib_endpoint.c | 8 +- ompi/mca/btl/openib/btl_openib_endpoint.h | 18 +- ompi/mca/btl/openib/btl_openib_frag.h | 5 + ompi/mca/btl/openib/connect/base.h | 6 +- .../openib/connect/btl_openib_connect_oob.c | 51 +- ompi/mca/btl/sm/btl_sm.c | 116 +- ompi/mca/btl/sm/btl_sm.h | 11 + ompi/mca/btl/sm/btl_sm_component.c | 65 +- ompi/mca/btl/sm/btl_sm_frag.h | 11 +- ompi/mca/mpool/grdma/mpool_grdma_module.c | 7 +- ompi/mca/rte/rte.h | 7 + ompi/mca/sbgp/ibnet/Makefile.am | 5 +- ompi/mca/sbgp/ibnet/sbgp_ibnet_module.c | 1 - ompi/mpi/c/finalize.c | 8 + ompi/mpi/c/init.c | 10 + ompi/mpi/c/init_thread.c | 11 + ompi/tools/Makefile.am | 16 +- ompi/tools/ompi_info/Makefile.am | 7 +- ompi/tools/ompi_info/components.c | 59 +- ompi/tools/ompi_info/ompi_info.c | 12 + ompi/tools/ompi_info/version.c | 32 +- opal/etc/openmpi-mca-params.conf | 5 + opal/include/opal_config_bottom.h | 8 + opal/threads/condition.c | 7 + opal/threads/condition.h | 42 
+ orte/etc/Makefile.am | 22 + oshmem/CMakeLists.txt | 128 + oshmem/Makefile.am | 130 + oshmem/include/Makefile.am | 67 + oshmem/include/mpif-common.h | 457 ++ oshmem/include/mpif-config.h.in | 99 + oshmem/include/mpif-mpi-io.h | 74 + oshmem/include/mpif.h.in | 76 + oshmem/include/mpp/shmem.fh | 11 + oshmem/include/mpp/shmem.h | 16 + oshmem/include/oshmem/Makefile.am | 15 + oshmem/include/oshmem/constants.h | 127 + oshmem/include/oshmem/types.h | 23 + oshmem/include/oshmem/version.h.in | 30 + oshmem/include/oshmem_config.h.in | 125 + oshmem/include/shmem.fh | 55 + oshmem/include/shmem.h.in | 389 ++ oshmem/include/shmem_portable_platform.h.in | 401 ++ oshmem/mca/atomic/Makefile.am | 35 + oshmem/mca/atomic/atomic.h | 124 + oshmem/mca/atomic/base/Makefile.am | 20 + .../mca/atomic/base/atomic_base_available.c | 180 + oshmem/mca/atomic/base/atomic_base_close.c | 50 + oshmem/mca/atomic/base/atomic_base_open.c | 70 + oshmem/mca/atomic/base/atomic_base_select.c | 259 + oshmem/mca/atomic/base/base.h | 123 + oshmem/mca/atomic/basic/.windows | 12 + oshmem/mca/atomic/basic/Makefile.am | 42 + oshmem/mca/atomic/basic/atomic_basic.h | 51 + .../mca/atomic/basic/atomic_basic_component.c | 88 + oshmem/mca/atomic/basic/atomic_basic_cswap.c | 47 + oshmem/mca/atomic/basic/atomic_basic_fadd.c | 54 + oshmem/mca/atomic/basic/atomic_basic_module.c | 211 + oshmem/mca/atomic/basic/configure.params | 13 + oshmem/mca/memheap/Makefile.am | 39 + oshmem/mca/memheap/README | 50 + oshmem/mca/memheap/base/Makefile.am | 28 + oshmem/mca/memheap/base/base.h | 200 + oshmem/mca/memheap/base/help-shmem-mca.txt | 23 + oshmem/mca/memheap/base/memheap_base_alloc.c | 529 ++ oshmem/mca/memheap/base/memheap_base_close.c | 75 + oshmem/mca/memheap/base/memheap_base_mkey.c | 684 ++ oshmem/mca/memheap/base/memheap_base_open.c | 123 + .../mca/memheap/base/memheap_base_register.c | 140 + oshmem/mca/memheap/base/memheap_base_select.c | 267 + oshmem/mca/memheap/base/memheap_base_static.c | 224 + 
oshmem/mca/memheap/buddy/.windows | 12 + oshmem/mca/memheap/buddy/Makefile.am | 41 + oshmem/mca/memheap/buddy/configure.params | 13 + oshmem/mca/memheap/buddy/memheap_buddy.c | 693 +++ oshmem/mca/memheap/buddy/memheap_buddy.h | 97 + .../memheap/buddy/memheap_buddy_component.c | 73 + .../memheap/buddy/memheap_buddy_component.h | 26 + oshmem/mca/memheap/configure.m4 | 19 + oshmem/mca/memheap/memheap.h | 147 + oshmem/mca/memheap/ptmalloc/Makefile.am | 43 + oshmem/mca/memheap/ptmalloc/configure.params | 13 + oshmem/mca/memheap/ptmalloc/malloc.c | 5527 +++++++++++++++++ oshmem/mca/memheap/ptmalloc/malloc_defs.h | 32 + .../mca/memheap/ptmalloc/memheap_ptmalloc.c | 184 + .../mca/memheap/ptmalloc/memheap_ptmalloc.h | 75 + .../ptmalloc/memheap_ptmalloc_component.c | 73 + .../ptmalloc/memheap_ptmalloc_component.h | 26 + oshmem/mca/scoll/Makefile.am | 35 + oshmem/mca/scoll/base/Makefile.am | 20 + oshmem/mca/scoll/base/base.h | 196 + oshmem/mca/scoll/base/scoll_base_available.c | 180 + oshmem/mca/scoll/base/scoll_base_close.c | 60 + oshmem/mca/scoll/base/scoll_base_open.c | 107 + oshmem/mca/scoll/base/scoll_base_select.c | 359 ++ oshmem/mca/scoll/basic/.windows | 12 + oshmem/mca/scoll/basic/Makefile.am | 44 + oshmem/mca/scoll/basic/configure.params | 13 + oshmem/mca/scoll/basic/scoll_basic.h | 74 + oshmem/mca/scoll/basic/scoll_basic_barrier.c | 601 ++ .../mca/scoll/basic/scoll_basic_broadcast.c | 209 + oshmem/mca/scoll/basic/scoll_basic_collect.c | 511 ++ .../mca/scoll/basic/scoll_basic_component.c | 138 + oshmem/mca/scoll/basic/scoll_basic_module.c | 70 + oshmem/mca/scoll/basic/scoll_basic_reduce.c | 691 +++ oshmem/mca/scoll/fca/.windows | 12 + oshmem/mca/scoll/fca/Makefile.am | 38 + oshmem/mca/scoll/fca/configure.m4 | 39 + oshmem/mca/scoll/fca/configure.params | 13 + oshmem/mca/scoll/fca/scoll_fca.h | 126 + oshmem/mca/scoll/fca/scoll_fca_api.h | 75 + oshmem/mca/scoll/fca/scoll_fca_component.c | 272 + oshmem/mca/scoll/fca/scoll_fca_debug.h | 36 + 
oshmem/mca/scoll/fca/scoll_fca_module.c | 555 ++ oshmem/mca/scoll/fca/scoll_fca_ops.c | 240 + oshmem/mca/scoll/scoll.h | 181 + oshmem/mca/spml/Makefile.am | 35 + oshmem/mca/spml/base/Makefile.am | 29 + oshmem/mca/spml/base/base.h | 79 + oshmem/mca/spml/base/spml_base.c | 148 + oshmem/mca/spml/base/spml_base_atomicreq.c | 45 + oshmem/mca/spml/base/spml_base_atomicreq.h | 101 + oshmem/mca/spml/base/spml_base_close.c | 74 + oshmem/mca/spml/base/spml_base_getreq.c | 45 + oshmem/mca/spml/base/spml_base_getreq.h | 97 + oshmem/mca/spml/base/spml_base_open.c | 168 + oshmem/mca/spml/base/spml_base_putreq.c | 40 + oshmem/mca/spml/base/spml_base_putreq.h | 101 + oshmem/mca/spml/base/spml_base_request.c | 38 + oshmem/mca/spml/base/spml_base_request.h | 83 + oshmem/mca/spml/base/spml_base_request_dbg.h | 30 + oshmem/mca/spml/base/spml_base_select.c | 332 + oshmem/mca/spml/configure.m4 | 19 + oshmem/mca/spml/ikrit/.windows | 12 + oshmem/mca/spml/ikrit/Makefile.am | 42 + oshmem/mca/spml/ikrit/configure.m4 | 35 + oshmem/mca/spml/ikrit/configure.params | 14 + .../mca/spml/ikrit/help-shmem-spml-ikrit.txt | 68 + oshmem/mca/spml/ikrit/spml_ikrit.c | 1343 ++++ oshmem/mca/spml/ikrit/spml_ikrit.h | 134 + oshmem/mca/spml/ikrit/spml_ikrit_component.c | 279 + oshmem/mca/spml/ikrit/spml_ikrit_component.h | 25 + oshmem/mca/spml/spml.h | 294 + oshmem/mca/spml/yoda/.windows | 12 + oshmem/mca/spml/yoda/Makefile.am | 45 + oshmem/mca/spml/yoda/configure.params | 14 + oshmem/mca/spml/yoda/help-shmem-spml-yoda.txt | 21 + oshmem/mca/spml/yoda/post_configure.sh | 4 + oshmem/mca/spml/yoda/spml_yoda.c | 1171 ++++ oshmem/mca/spml/yoda/spml_yoda.h | 97 + oshmem/mca/spml/yoda/spml_yoda_component.c | 146 + oshmem/mca/spml/yoda/spml_yoda_component.h | 25 + oshmem/mca/spml/yoda/spml_yoda_getreq.c | 111 + oshmem/mca/spml/yoda/spml_yoda_getreq.h | 61 + oshmem/mca/spml/yoda/spml_yoda_putreq.c | 107 + oshmem/mca/spml/yoda/spml_yoda_putreq.h | 58 + oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h | 47 + 
oshmem/op/Makefile.am | 19 + oshmem/op/op.c | 398 ++ oshmem/op/op.h | 206 + oshmem/proc/Makefile.am | 21 + oshmem/proc/proc.c | 785 +++ oshmem/proc/proc.h | 505 ++ oshmem/proc/proc_group_cache.c | 102 + oshmem/proc/proc_group_cache.h | 39 + oshmem/request/Makefile.am | 19 + oshmem/request/request.c | 168 + oshmem/request/request.h | 439 ++ oshmem/request/request_dbg.h | 47 + oshmem/runtime/Makefile.am | 27 + oshmem/runtime/help-shmem-runtime.txt | 64 + oshmem/runtime/oshmem_shmem_abort.c | 111 + oshmem/runtime/oshmem_shmem_finalize.c | 163 + oshmem/runtime/oshmem_shmem_init.c | 437 ++ oshmem/runtime/oshmem_shmem_params.c | 35 + oshmem/runtime/oshmem_shmem_preconnect.h | 24 + oshmem/runtime/params.h | 42 + oshmem/runtime/runtime.h | 200 + oshmem/shmem/CMakeLists.txt | 22 + oshmem/shmem/Makefile.am | 13 + oshmem/shmem/c/Makefile.am | 75 + oshmem/shmem/c/help-shmem-api.txt | 21 + oshmem/shmem/c/profile/Makefile.am | 96 + oshmem/shmem/c/profile/defines.h | 271 + oshmem/shmem/c/shmem_add.c | 53 + oshmem/shmem/c/shmem_addr_accessible.c | 30 + oshmem/shmem/c/shmem_align.c | 41 + oshmem/shmem/c/shmem_alloc.c | 39 + oshmem/shmem/c/shmem_barrier.c | 91 + oshmem/shmem/c/shmem_broadcast.c | 111 + oshmem/shmem/c/shmem_clear_cache_inv.c | 27 + oshmem/shmem/c/shmem_clear_cache_line_inv.c | 28 + oshmem/shmem/c/shmem_clear_lock.c | 26 + oshmem/shmem/c/shmem_collect.c | 114 + oshmem/shmem/c/shmem_cswap.c | 55 + oshmem/shmem/c/shmem_fadd.c | 55 + oshmem/shmem/c/shmem_fence.c | 22 + oshmem/shmem/c/shmem_finalize.c | 29 + oshmem/shmem/c/shmem_finc.c | 56 + oshmem/shmem/c/shmem_free.c | 38 + oshmem/shmem/c/shmem_g.c | 53 + oshmem/shmem/c/shmem_get.c | 85 + oshmem/shmem/c/shmem_iget.c | 86 + oshmem/shmem/c/shmem_inc.c | 55 + oshmem/shmem/c/shmem_init.c | 94 + oshmem/shmem/c/shmem_iput.c | 86 + oshmem/shmem/c/shmem_lock.c | 1050 ++++ oshmem/shmem/c/shmem_p.c | 54 + oshmem/shmem/c/shmem_pe_accessible.c | 31 + oshmem/shmem/c/shmem_ptr.c | 32 + oshmem/shmem/c/shmem_put.c | 88 + 
oshmem/shmem/c/shmem_query.c | 56 + oshmem/shmem/c/shmem_quiet.c | 22 + oshmem/shmem/c/shmem_realloc.c | 42 + oshmem/shmem/c/shmem_reduce.c | 148 + oshmem/shmem/c/shmem_set_cache_inv.c | 28 + oshmem/shmem/c/shmem_set_cache_line_inv.c | 28 + oshmem/shmem/c/shmem_set_lock.c | 25 + oshmem/shmem/c/shmem_swap.c | 55 + oshmem/shmem/c/shmem_test_lock.c | 28 + oshmem/shmem/c/shmem_udcflush.c | 26 + oshmem/shmem/c/shmem_udcflush_line.c | 26 + oshmem/shmem/c/shmem_wait.c | 70 + oshmem/shmem/f77/Makefile.am | 135 + oshmem/shmem/f77/bindings.h | 25 + oshmem/shmem/f77/my_pe_f.c | 29 + oshmem/shmem/f77/num_pes_f.c | 29 + oshmem/shmem/f77/prototypes_shmem.h | 154 + oshmem/shmem/f77/shmem_addr_accessible_f.c | 29 + oshmem/shmem/f77/shmem_and_to_all_f.c | 101 + oshmem/shmem/f77/shmem_barrier_all_f.c | 27 + oshmem/shmem/f77/shmem_barrier_f.c | 31 + oshmem/shmem/f77/shmem_broadcast_f.c | 127 + oshmem/shmem/f77/shmem_cache_f.c | 93 + oshmem/shmem/f77/shmem_character_get_f.c | 38 + oshmem/shmem/f77/shmem_character_put_f.c | 39 + oshmem/shmem/f77/shmem_collect_f.c | 150 + oshmem/shmem/f77/shmem_complex_get_f.c | 38 + oshmem/shmem/f77/shmem_complex_iget_f.c | 46 + oshmem/shmem/f77/shmem_complex_iput_f.c | 47 + oshmem/shmem/f77/shmem_complex_put_f.c | 38 + oshmem/shmem/f77/shmem_double_get_f.c | 38 + oshmem/shmem/f77/shmem_double_iget_f.c | 46 + oshmem/shmem/f77/shmem_double_iput_f.c | 47 + oshmem/shmem/f77/shmem_double_put_f.c | 38 + oshmem/shmem/f77/shmem_fence_f.c | 27 + oshmem/shmem/f77/shmem_finalize_f.c | 27 + oshmem/shmem/f77/shmem_fortran_pointer.h | 17 + oshmem/shmem/f77/shmem_get128_f.c | 34 + oshmem/shmem/f77/shmem_get32_f.c | 34 + oshmem/shmem/f77/shmem_get4_f.c | 34 + oshmem/shmem/f77/shmem_get64_f.c | 34 + oshmem/shmem/f77/shmem_get8_f.c | 34 + oshmem/shmem/f77/shmem_getmem_f.c | 34 + oshmem/shmem/f77/shmem_iget128_f.c | 43 + oshmem/shmem/f77/shmem_iget32_f.c | 43 + oshmem/shmem/f77/shmem_iget4_f.c | 43 + oshmem/shmem/f77/shmem_iget64_f.c | 43 + 
oshmem/shmem/f77/shmem_iget8_f.c | 43 + oshmem/shmem/f77/shmem_int4_add_f.c | 41 + oshmem/shmem/f77/shmem_int4_cswap_f.c | 41 + oshmem/shmem/f77/shmem_int4_fadd_f.c | 43 + oshmem/shmem/f77/shmem_int4_finc_f.c | 44 + oshmem/shmem/f77/shmem_int4_inc_f.c | 41 + oshmem/shmem/f77/shmem_int4_swap_f.c | 41 + oshmem/shmem/f77/shmem_int4_wait_f.c | 30 + oshmem/shmem/f77/shmem_int4_wait_until_f.c | 33 + oshmem/shmem/f77/shmem_int8_add_f.c | 41 + oshmem/shmem/f77/shmem_int8_cswap_f.c | 41 + oshmem/shmem/f77/shmem_int8_fadd_f.c | 43 + oshmem/shmem/f77/shmem_int8_finc_f.c | 44 + oshmem/shmem/f77/shmem_int8_inc_f.c | 42 + oshmem/shmem/f77/shmem_int8_swap_f.c | 41 + oshmem/shmem/f77/shmem_int8_wait_f.c | 30 + oshmem/shmem/f77/shmem_int8_wait_until_f.c | 33 + oshmem/shmem/f77/shmem_integer_get_f.c | 38 + oshmem/shmem/f77/shmem_integer_iget_f.c | 46 + oshmem/shmem/f77/shmem_integer_iput_f.c | 47 + oshmem/shmem/f77/shmem_integer_put_f.c | 38 + oshmem/shmem/f77/shmem_iput128_f.c | 44 + oshmem/shmem/f77/shmem_iput32_f.c | 43 + oshmem/shmem/f77/shmem_iput4_f.c | 44 + oshmem/shmem/f77/shmem_iput64_f.c | 44 + oshmem/shmem/f77/shmem_iput8_f.c | 44 + oshmem/shmem/f77/shmem_lock_f.c | 55 + oshmem/shmem/f77/shmem_logical_get_f.c | 38 + oshmem/shmem/f77/shmem_logical_iget_f.c | 46 + oshmem/shmem/f77/shmem_logical_iput_f.c | 47 + oshmem/shmem/f77/shmem_logical_put_f.c | 38 + oshmem/shmem/f77/shmem_max_to_all_f.c | 125 + oshmem/shmem/f77/shmem_min_to_all_f.c | 117 + oshmem/shmem/f77/shmem_or_to_all_f.c | 98 + oshmem/shmem/f77/shmem_pe_accessible_f.c | 29 + oshmem/shmem/f77/shmem_prod_to_all_f.c | 117 + oshmem/shmem/f77/shmem_ptr_f.c | 29 + oshmem/shmem/f77/shmem_put32_f.c | 35 + oshmem/shmem/f77/shmem_put4_f.c | 35 + oshmem/shmem/f77/shmem_put64_f.c | 35 + oshmem/shmem/f77/shmem_put8_f.c | 35 + oshmem/shmem/f77/shmem_put_f.c | 33 + oshmem/shmem/f77/shmem_putmem_f.c | 34 + oshmem/shmem/f77/shmem_quiet_f.c | 27 + oshmem/shmem/f77/shmem_real4_swap_f.c | 41 + oshmem/shmem/f77/shmem_real8_swap_f.c | 
41 + oshmem/shmem/f77/shmem_real_get_f.c | 38 + oshmem/shmem/f77/shmem_real_iget_f.c | 46 + oshmem/shmem/f77/shmem_real_iput_f.c | 47 + oshmem/shmem/f77/shmem_real_put_f.c | 38 + oshmem/shmem/f77/shmem_sum_to_all_f.c | 117 + oshmem/shmem/f77/shmem_swap_f.c | 43 + oshmem/shmem/f77/shmem_wait_f.c | 30 + oshmem/shmem/f77/shmem_wait_until_f.c | 33 + oshmem/shmem/f77/shmem_xor_to_all_f.c | 102 + oshmem/shmem/f77/shpalloc_f.c | 45 + oshmem/shmem/f77/shpclmove_f.c | 60 + oshmem/shmem/f77/shpdeallc_f.c | 30 + oshmem/shmem/f77/start_pes_f.c | 27 + oshmem/shmem/shmem_api_logger.h | 31 + oshmem/shmem/shmem_lock.h | 26 + oshmem/tools/Makefile.am | 21 + oshmem/tools/wrappers/Makefile.am | 41 + .../wrappers/shmemcc-wrapper-data.txt.in | 30 + .../wrappers/shmemf77-wrapper-data.txt.in | 30 + .../wrappers/shmemf90-wrapper-data.txt.in | 32 + 351 files changed, 37807 insertions(+), 27 deletions(-) create mode 100644 README-SHMEM-WITH-VALGRIND.txt create mode 100644 README-SHMEM.txt create mode 100644 config/oshmem_config_files.m4 create mode 100644 config/oshmem_configure_options.m4 create mode 100644 contrib/tau/readme create mode 100644 contrib/tau/shmem_wiki_tau.sh create mode 100644 contrib/tau/tau_openshmem.patch create mode 100644 distr/README create mode 100644 distr/build_knem_rpm.sh create mode 100644 distr/buildrpm.sh create mode 100644 distr/buildtarball.sh create mode 100644 distr/oshmem.spec.in create mode 100644 distr/rpm2cpio create mode 100644 knem_patch/README-knem-patch create mode 100644 knem_patch/shmem_knem.patch create mode 100644 oshmem/CMakeLists.txt create mode 100644 oshmem/Makefile.am create mode 100644 oshmem/include/Makefile.am create mode 100644 oshmem/include/mpif-common.h create mode 100644 oshmem/include/mpif-config.h.in create mode 100644 oshmem/include/mpif-mpi-io.h create mode 100644 oshmem/include/mpif.h.in create mode 100644 oshmem/include/mpp/shmem.fh create mode 100644 oshmem/include/mpp/shmem.h create mode 100644 
oshmem/include/oshmem/Makefile.am create mode 100644 oshmem/include/oshmem/constants.h create mode 100644 oshmem/include/oshmem/types.h create mode 100644 oshmem/include/oshmem/version.h.in create mode 100644 oshmem/include/oshmem_config.h.in create mode 100644 oshmem/include/shmem.fh create mode 100644 oshmem/include/shmem.h.in create mode 100644 oshmem/include/shmem_portable_platform.h.in create mode 100644 oshmem/mca/atomic/Makefile.am create mode 100644 oshmem/mca/atomic/atomic.h create mode 100644 oshmem/mca/atomic/base/Makefile.am create mode 100644 oshmem/mca/atomic/base/atomic_base_available.c create mode 100644 oshmem/mca/atomic/base/atomic_base_close.c create mode 100644 oshmem/mca/atomic/base/atomic_base_open.c create mode 100644 oshmem/mca/atomic/base/atomic_base_select.c create mode 100644 oshmem/mca/atomic/base/base.h create mode 100644 oshmem/mca/atomic/basic/.windows create mode 100644 oshmem/mca/atomic/basic/Makefile.am create mode 100644 oshmem/mca/atomic/basic/atomic_basic.h create mode 100644 oshmem/mca/atomic/basic/atomic_basic_component.c create mode 100644 oshmem/mca/atomic/basic/atomic_basic_cswap.c create mode 100644 oshmem/mca/atomic/basic/atomic_basic_fadd.c create mode 100644 oshmem/mca/atomic/basic/atomic_basic_module.c create mode 100644 oshmem/mca/atomic/basic/configure.params create mode 100644 oshmem/mca/memheap/Makefile.am create mode 100644 oshmem/mca/memheap/README create mode 100644 oshmem/mca/memheap/base/Makefile.am create mode 100644 oshmem/mca/memheap/base/base.h create mode 100644 oshmem/mca/memheap/base/help-shmem-mca.txt create mode 100644 oshmem/mca/memheap/base/memheap_base_alloc.c create mode 100644 oshmem/mca/memheap/base/memheap_base_close.c create mode 100644 oshmem/mca/memheap/base/memheap_base_mkey.c create mode 100644 oshmem/mca/memheap/base/memheap_base_open.c create mode 100644 oshmem/mca/memheap/base/memheap_base_register.c create mode 100644 oshmem/mca/memheap/base/memheap_base_select.c create mode 100644 
oshmem/mca/memheap/base/memheap_base_static.c create mode 100644 oshmem/mca/memheap/buddy/.windows create mode 100644 oshmem/mca/memheap/buddy/Makefile.am create mode 100644 oshmem/mca/memheap/buddy/configure.params create mode 100644 oshmem/mca/memheap/buddy/memheap_buddy.c create mode 100644 oshmem/mca/memheap/buddy/memheap_buddy.h create mode 100644 oshmem/mca/memheap/buddy/memheap_buddy_component.c create mode 100644 oshmem/mca/memheap/buddy/memheap_buddy_component.h create mode 100644 oshmem/mca/memheap/configure.m4 create mode 100644 oshmem/mca/memheap/memheap.h create mode 100644 oshmem/mca/memheap/ptmalloc/Makefile.am create mode 100644 oshmem/mca/memheap/ptmalloc/configure.params create mode 100644 oshmem/mca/memheap/ptmalloc/malloc.c create mode 100644 oshmem/mca/memheap/ptmalloc/malloc_defs.h create mode 100644 oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.c create mode 100644 oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h create mode 100644 oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.c create mode 100644 oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h create mode 100644 oshmem/mca/scoll/Makefile.am create mode 100644 oshmem/mca/scoll/base/Makefile.am create mode 100644 oshmem/mca/scoll/base/base.h create mode 100644 oshmem/mca/scoll/base/scoll_base_available.c create mode 100644 oshmem/mca/scoll/base/scoll_base_close.c create mode 100644 oshmem/mca/scoll/base/scoll_base_open.c create mode 100644 oshmem/mca/scoll/base/scoll_base_select.c create mode 100644 oshmem/mca/scoll/basic/.windows create mode 100644 oshmem/mca/scoll/basic/Makefile.am create mode 100644 oshmem/mca/scoll/basic/configure.params create mode 100644 oshmem/mca/scoll/basic/scoll_basic.h create mode 100644 oshmem/mca/scoll/basic/scoll_basic_barrier.c create mode 100644 oshmem/mca/scoll/basic/scoll_basic_broadcast.c create mode 100644 oshmem/mca/scoll/basic/scoll_basic_collect.c create mode 100644 oshmem/mca/scoll/basic/scoll_basic_component.c create mode 100644 
oshmem/mca/scoll/basic/scoll_basic_module.c create mode 100644 oshmem/mca/scoll/basic/scoll_basic_reduce.c create mode 100644 oshmem/mca/scoll/fca/.windows create mode 100644 oshmem/mca/scoll/fca/Makefile.am create mode 100644 oshmem/mca/scoll/fca/configure.m4 create mode 100644 oshmem/mca/scoll/fca/configure.params create mode 100644 oshmem/mca/scoll/fca/scoll_fca.h create mode 100644 oshmem/mca/scoll/fca/scoll_fca_api.h create mode 100644 oshmem/mca/scoll/fca/scoll_fca_component.c create mode 100644 oshmem/mca/scoll/fca/scoll_fca_debug.h create mode 100644 oshmem/mca/scoll/fca/scoll_fca_module.c create mode 100644 oshmem/mca/scoll/fca/scoll_fca_ops.c create mode 100644 oshmem/mca/scoll/scoll.h create mode 100644 oshmem/mca/spml/Makefile.am create mode 100644 oshmem/mca/spml/base/Makefile.am create mode 100644 oshmem/mca/spml/base/base.h create mode 100644 oshmem/mca/spml/base/spml_base.c create mode 100644 oshmem/mca/spml/base/spml_base_atomicreq.c create mode 100644 oshmem/mca/spml/base/spml_base_atomicreq.h create mode 100644 oshmem/mca/spml/base/spml_base_close.c create mode 100644 oshmem/mca/spml/base/spml_base_getreq.c create mode 100644 oshmem/mca/spml/base/spml_base_getreq.h create mode 100644 oshmem/mca/spml/base/spml_base_open.c create mode 100644 oshmem/mca/spml/base/spml_base_putreq.c create mode 100644 oshmem/mca/spml/base/spml_base_putreq.h create mode 100644 oshmem/mca/spml/base/spml_base_request.c create mode 100644 oshmem/mca/spml/base/spml_base_request.h create mode 100644 oshmem/mca/spml/base/spml_base_request_dbg.h create mode 100644 oshmem/mca/spml/base/spml_base_select.c create mode 100644 oshmem/mca/spml/configure.m4 create mode 100644 oshmem/mca/spml/ikrit/.windows create mode 100644 oshmem/mca/spml/ikrit/Makefile.am create mode 100644 oshmem/mca/spml/ikrit/configure.m4 create mode 100644 oshmem/mca/spml/ikrit/configure.params create mode 100644 oshmem/mca/spml/ikrit/help-shmem-spml-ikrit.txt create mode 100644 
oshmem/mca/spml/ikrit/spml_ikrit.c create mode 100644 oshmem/mca/spml/ikrit/spml_ikrit.h create mode 100644 oshmem/mca/spml/ikrit/spml_ikrit_component.c create mode 100644 oshmem/mca/spml/ikrit/spml_ikrit_component.h create mode 100644 oshmem/mca/spml/spml.h create mode 100644 oshmem/mca/spml/yoda/.windows create mode 100644 oshmem/mca/spml/yoda/Makefile.am create mode 100644 oshmem/mca/spml/yoda/configure.params create mode 100644 oshmem/mca/spml/yoda/help-shmem-spml-yoda.txt create mode 100644 oshmem/mca/spml/yoda/post_configure.sh create mode 100644 oshmem/mca/spml/yoda/spml_yoda.c create mode 100644 oshmem/mca/spml/yoda/spml_yoda.h create mode 100644 oshmem/mca/spml/yoda/spml_yoda_component.c create mode 100644 oshmem/mca/spml/yoda/spml_yoda_component.h create mode 100644 oshmem/mca/spml/yoda/spml_yoda_getreq.c create mode 100644 oshmem/mca/spml/yoda/spml_yoda_getreq.h create mode 100644 oshmem/mca/spml/yoda/spml_yoda_putreq.c create mode 100644 oshmem/mca/spml/yoda/spml_yoda_putreq.h create mode 100644 oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h create mode 100644 oshmem/op/Makefile.am create mode 100644 oshmem/op/op.c create mode 100644 oshmem/op/op.h create mode 100644 oshmem/proc/Makefile.am create mode 100644 oshmem/proc/proc.c create mode 100644 oshmem/proc/proc.h create mode 100644 oshmem/proc/proc_group_cache.c create mode 100644 oshmem/proc/proc_group_cache.h create mode 100644 oshmem/request/Makefile.am create mode 100644 oshmem/request/request.c create mode 100644 oshmem/request/request.h create mode 100644 oshmem/request/request_dbg.h create mode 100644 oshmem/runtime/Makefile.am create mode 100644 oshmem/runtime/help-shmem-runtime.txt create mode 100644 oshmem/runtime/oshmem_shmem_abort.c create mode 100644 oshmem/runtime/oshmem_shmem_finalize.c create mode 100644 oshmem/runtime/oshmem_shmem_init.c create mode 100644 oshmem/runtime/oshmem_shmem_params.c create mode 100644 oshmem/runtime/oshmem_shmem_preconnect.h create mode 100644 
oshmem/runtime/params.h create mode 100644 oshmem/runtime/runtime.h create mode 100644 oshmem/shmem/CMakeLists.txt create mode 100644 oshmem/shmem/Makefile.am create mode 100644 oshmem/shmem/c/Makefile.am create mode 100644 oshmem/shmem/c/help-shmem-api.txt create mode 100644 oshmem/shmem/c/profile/Makefile.am create mode 100644 oshmem/shmem/c/profile/defines.h create mode 100644 oshmem/shmem/c/shmem_add.c create mode 100644 oshmem/shmem/c/shmem_addr_accessible.c create mode 100644 oshmem/shmem/c/shmem_align.c create mode 100644 oshmem/shmem/c/shmem_alloc.c create mode 100644 oshmem/shmem/c/shmem_barrier.c create mode 100644 oshmem/shmem/c/shmem_broadcast.c create mode 100644 oshmem/shmem/c/shmem_clear_cache_inv.c create mode 100644 oshmem/shmem/c/shmem_clear_cache_line_inv.c create mode 100644 oshmem/shmem/c/shmem_clear_lock.c create mode 100644 oshmem/shmem/c/shmem_collect.c create mode 100644 oshmem/shmem/c/shmem_cswap.c create mode 100644 oshmem/shmem/c/shmem_fadd.c create mode 100644 oshmem/shmem/c/shmem_fence.c create mode 100644 oshmem/shmem/c/shmem_finalize.c create mode 100644 oshmem/shmem/c/shmem_finc.c create mode 100644 oshmem/shmem/c/shmem_free.c create mode 100644 oshmem/shmem/c/shmem_g.c create mode 100644 oshmem/shmem/c/shmem_get.c create mode 100644 oshmem/shmem/c/shmem_iget.c create mode 100644 oshmem/shmem/c/shmem_inc.c create mode 100644 oshmem/shmem/c/shmem_init.c create mode 100644 oshmem/shmem/c/shmem_iput.c create mode 100644 oshmem/shmem/c/shmem_lock.c create mode 100644 oshmem/shmem/c/shmem_p.c create mode 100644 oshmem/shmem/c/shmem_pe_accessible.c create mode 100644 oshmem/shmem/c/shmem_ptr.c create mode 100644 oshmem/shmem/c/shmem_put.c create mode 100644 oshmem/shmem/c/shmem_query.c create mode 100644 oshmem/shmem/c/shmem_quiet.c create mode 100644 oshmem/shmem/c/shmem_realloc.c create mode 100644 oshmem/shmem/c/shmem_reduce.c create mode 100644 oshmem/shmem/c/shmem_set_cache_inv.c create mode 100644 
oshmem/shmem/c/shmem_set_cache_line_inv.c create mode 100644 oshmem/shmem/c/shmem_set_lock.c create mode 100644 oshmem/shmem/c/shmem_swap.c create mode 100644 oshmem/shmem/c/shmem_test_lock.c create mode 100644 oshmem/shmem/c/shmem_udcflush.c create mode 100644 oshmem/shmem/c/shmem_udcflush_line.c create mode 100644 oshmem/shmem/c/shmem_wait.c create mode 100644 oshmem/shmem/f77/Makefile.am create mode 100644 oshmem/shmem/f77/bindings.h create mode 100644 oshmem/shmem/f77/my_pe_f.c create mode 100644 oshmem/shmem/f77/num_pes_f.c create mode 100644 oshmem/shmem/f77/prototypes_shmem.h create mode 100644 oshmem/shmem/f77/shmem_addr_accessible_f.c create mode 100644 oshmem/shmem/f77/shmem_and_to_all_f.c create mode 100644 oshmem/shmem/f77/shmem_barrier_all_f.c create mode 100644 oshmem/shmem/f77/shmem_barrier_f.c create mode 100644 oshmem/shmem/f77/shmem_broadcast_f.c create mode 100644 oshmem/shmem/f77/shmem_cache_f.c create mode 100644 oshmem/shmem/f77/shmem_character_get_f.c create mode 100644 oshmem/shmem/f77/shmem_character_put_f.c create mode 100644 oshmem/shmem/f77/shmem_collect_f.c create mode 100644 oshmem/shmem/f77/shmem_complex_get_f.c create mode 100644 oshmem/shmem/f77/shmem_complex_iget_f.c create mode 100644 oshmem/shmem/f77/shmem_complex_iput_f.c create mode 100644 oshmem/shmem/f77/shmem_complex_put_f.c create mode 100644 oshmem/shmem/f77/shmem_double_get_f.c create mode 100644 oshmem/shmem/f77/shmem_double_iget_f.c create mode 100644 oshmem/shmem/f77/shmem_double_iput_f.c create mode 100644 oshmem/shmem/f77/shmem_double_put_f.c create mode 100644 oshmem/shmem/f77/shmem_fence_f.c create mode 100644 oshmem/shmem/f77/shmem_finalize_f.c create mode 100644 oshmem/shmem/f77/shmem_fortran_pointer.h create mode 100644 oshmem/shmem/f77/shmem_get128_f.c create mode 100644 oshmem/shmem/f77/shmem_get32_f.c create mode 100644 oshmem/shmem/f77/shmem_get4_f.c create mode 100644 oshmem/shmem/f77/shmem_get64_f.c create mode 100644 oshmem/shmem/f77/shmem_get8_f.c create 
mode 100644 oshmem/shmem/f77/shmem_getmem_f.c create mode 100644 oshmem/shmem/f77/shmem_iget128_f.c create mode 100644 oshmem/shmem/f77/shmem_iget32_f.c create mode 100644 oshmem/shmem/f77/shmem_iget4_f.c create mode 100644 oshmem/shmem/f77/shmem_iget64_f.c create mode 100644 oshmem/shmem/f77/shmem_iget8_f.c create mode 100644 oshmem/shmem/f77/shmem_int4_add_f.c create mode 100644 oshmem/shmem/f77/shmem_int4_cswap_f.c create mode 100644 oshmem/shmem/f77/shmem_int4_fadd_f.c create mode 100644 oshmem/shmem/f77/shmem_int4_finc_f.c create mode 100644 oshmem/shmem/f77/shmem_int4_inc_f.c create mode 100644 oshmem/shmem/f77/shmem_int4_swap_f.c create mode 100644 oshmem/shmem/f77/shmem_int4_wait_f.c create mode 100644 oshmem/shmem/f77/shmem_int4_wait_until_f.c create mode 100644 oshmem/shmem/f77/shmem_int8_add_f.c create mode 100644 oshmem/shmem/f77/shmem_int8_cswap_f.c create mode 100644 oshmem/shmem/f77/shmem_int8_fadd_f.c create mode 100644 oshmem/shmem/f77/shmem_int8_finc_f.c create mode 100644 oshmem/shmem/f77/shmem_int8_inc_f.c create mode 100644 oshmem/shmem/f77/shmem_int8_swap_f.c create mode 100644 oshmem/shmem/f77/shmem_int8_wait_f.c create mode 100644 oshmem/shmem/f77/shmem_int8_wait_until_f.c create mode 100644 oshmem/shmem/f77/shmem_integer_get_f.c create mode 100644 oshmem/shmem/f77/shmem_integer_iget_f.c create mode 100644 oshmem/shmem/f77/shmem_integer_iput_f.c create mode 100644 oshmem/shmem/f77/shmem_integer_put_f.c create mode 100644 oshmem/shmem/f77/shmem_iput128_f.c create mode 100644 oshmem/shmem/f77/shmem_iput32_f.c create mode 100644 oshmem/shmem/f77/shmem_iput4_f.c create mode 100644 oshmem/shmem/f77/shmem_iput64_f.c create mode 100644 oshmem/shmem/f77/shmem_iput8_f.c create mode 100644 oshmem/shmem/f77/shmem_lock_f.c create mode 100644 oshmem/shmem/f77/shmem_logical_get_f.c create mode 100644 oshmem/shmem/f77/shmem_logical_iget_f.c create mode 100644 oshmem/shmem/f77/shmem_logical_iput_f.c create mode 100644 oshmem/shmem/f77/shmem_logical_put_f.c 
create mode 100644 oshmem/shmem/f77/shmem_max_to_all_f.c create mode 100644 oshmem/shmem/f77/shmem_min_to_all_f.c create mode 100644 oshmem/shmem/f77/shmem_or_to_all_f.c create mode 100644 oshmem/shmem/f77/shmem_pe_accessible_f.c create mode 100644 oshmem/shmem/f77/shmem_prod_to_all_f.c create mode 100644 oshmem/shmem/f77/shmem_ptr_f.c create mode 100644 oshmem/shmem/f77/shmem_put32_f.c create mode 100644 oshmem/shmem/f77/shmem_put4_f.c create mode 100644 oshmem/shmem/f77/shmem_put64_f.c create mode 100644 oshmem/shmem/f77/shmem_put8_f.c create mode 100644 oshmem/shmem/f77/shmem_put_f.c create mode 100644 oshmem/shmem/f77/shmem_putmem_f.c create mode 100644 oshmem/shmem/f77/shmem_quiet_f.c create mode 100644 oshmem/shmem/f77/shmem_real4_swap_f.c create mode 100644 oshmem/shmem/f77/shmem_real8_swap_f.c create mode 100644 oshmem/shmem/f77/shmem_real_get_f.c create mode 100644 oshmem/shmem/f77/shmem_real_iget_f.c create mode 100644 oshmem/shmem/f77/shmem_real_iput_f.c create mode 100644 oshmem/shmem/f77/shmem_real_put_f.c create mode 100644 oshmem/shmem/f77/shmem_sum_to_all_f.c create mode 100644 oshmem/shmem/f77/shmem_swap_f.c create mode 100644 oshmem/shmem/f77/shmem_wait_f.c create mode 100644 oshmem/shmem/f77/shmem_wait_until_f.c create mode 100644 oshmem/shmem/f77/shmem_xor_to_all_f.c create mode 100644 oshmem/shmem/f77/shpalloc_f.c create mode 100644 oshmem/shmem/f77/shpclmove_f.c create mode 100644 oshmem/shmem/f77/shpdeallc_f.c create mode 100644 oshmem/shmem/f77/start_pes_f.c create mode 100644 oshmem/shmem/shmem_api_logger.h create mode 100644 oshmem/shmem/shmem_lock.h create mode 100644 oshmem/tools/Makefile.am create mode 100644 oshmem/tools/wrappers/Makefile.am create mode 100644 oshmem/tools/wrappers/shmemcc-wrapper-data.txt.in create mode 100644 oshmem/tools/wrappers/shmemf77-wrapper-data.txt.in create mode 100644 oshmem/tools/wrappers/shmemf90-wrapper-data.txt.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 0a14c3899d..3938bcb257 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,8 @@ # # Copyright (c) 2007-2011 High Performance Computing Center Stuttgart, # University of Stuttgart. All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -68,6 +70,8 @@ INCLUDE_DIRECTORIES ( ${OpenMPI_SOURCE_DIR}/opal/include ${OpenMPI_SOURCE_DIR}/ompi ${OpenMPI_SOURCE_DIR}/ompi/include + ${OpenMPI_SOURCE_DIR}/oshmem + ${OpenMPI_SOURCE_DIR}/oshmem/include ${OpenMPI_SOURCE_DIR}/orte ${OpenMPI_SOURCE_DIR}/orte/include ${OpenMPI_BINARY_DIR}/ @@ -75,6 +79,8 @@ INCLUDE_DIRECTORIES ( ${OpenMPI_BINARY_DIR}/opal/include ${OpenMPI_BINARY_DIR}/ompi ${OpenMPI_BINARY_DIR}/ompi/include + ${OpenMPI_BINARY_DIR}/oshmem + ${OpenMPI_BINARY_DIR}/oshmem/include ${OpenMPI_BINARY_DIR}/orte ${OpenMPI_BINARY_DIR}/orte/include ) @@ -107,6 +113,7 @@ ADD_SUBDIRECTORY(opal) # set up the libevent and hwloc include dirs INCLUDE_DIRECTORIES (${LIBEVENT_INCLUDE_DIRS} ${HWLOC_PATH}/hwloc/include) ADD_SUBDIRECTORY(ompi) +ADD_SUBDIRECTORY(oshmem) ADD_SUBDIRECTORY(orte) ADD_SUBDIRECTORY(contrib/platform/win32/examples) diff --git a/Makefile.am b/Makefile.am index 0423f3067a..ca29422097 100644 --- a/Makefile.am +++ b/Makefile.am @@ -11,6 +11,8 @@ # All rights reserved. # Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -28,3 +30,12 @@ dist-hook: csh "$(top_srcdir)/config/distscript.csh" "$(top_srcdir)" "$(distdir)" "$(OMPI_VERSION)" "$(OMPI_SVN_R)" ACLOCAL_AMFLAGS = -I config + +if OSHMEM_SUPPORT +DISTCHECK_CONFIGURE_FLAGS = --with-oshmem --disable-debug +cov: + PATH=/hpc/local/commercial/coverity/cov-sa/bin:$$PATH && rm -rf $(PWD)/cov-build && make clean && cov-build --dir $(PWD)/cov-build make all && cov-analyze --dir $(PWD)/cov-build && cov-format-errors --dir $(PWD)/cov-build + +get_tarball: + cp $(distdir).tar.gz $(to) +endif diff --git a/README-SHMEM-WITH-VALGRIND.txt b/README-SHMEM-WITH-VALGRIND.txt new file mode 100644 index 0000000000..5be3e7f1d9 --- /dev/null +++ b/README-SHMEM-WITH-VALGRIND.txt @@ -0,0 +1,36 @@ +Copyright (c) 2012 Mellanox Technologies, Inc. + All rights reserved. +In order to enable valgrind support: +1) download and build (into the same dir for convenience) two libraries: libmlx4 and libibverbs with valgrind support. + - http://www.openfabrics.org/downloads/libmlx4/ + - ./configure --prefix=/PATH_TO_LIBS --with-valgrind && make && make install + - http://www.openfabrics.org/downloads/libibverbs/ + - ./configure --prefix=/PATH_TO_LIBS --with-valgrind && make && make install + +NOTE: libmlx4 and libibverbs should match your OFED version + +2) build shmem with valgrind, memchecker module and these two libs: +./autogen.sh && ./configure --prefix=$PWD/install --with-openib=/usr --with-oshmem --with-openib-libdir=PATH_TO_LIBS --with-valgrind=PATH_TO_VALGRIND --enable-memchecker --disable-dlopen && make clean && make && make install + + - --with-openib=/usr - the place where to look for infiniband + - --with-openib-libdir=PATH_TO_LIBS - two libs previously installed + - --with-valgrind - the --prefix of the valgrind install. It must contain $prefix/include/valgrind/valgrind.h and $prefix/include/valgrind/memcheck.h.
It should be /usr if valgrind was installed from an rpm + +==8511== Conditional jump or move depends on uninitialised value(s) +==8511== at 0x3849A17486: index (in /lib64/ld-2.12.so) +==8511== by 0x3849A06254: expand_dynamic_string_token (in /lib64/ld-2.12.so) +==8511== by 0x3849A07CAF: _dl_map_object (in /lib64/ld-2.12.so) +==8511== by 0x3849A016EA: map_doit (in /lib64/ld-2.12.so) +==8511== by 0x3849A0E0A5: _dl_catch_error (in /lib64/ld-2.12.so) +==8511== by 0x3849A015EE: do_preload (in /lib64/ld-2.12.so) +==8511== by 0x3849A03BAA: dl_main (in /lib64/ld-2.12.so) +==8511== by 0x3849A15A7D: _dl_sysdep_start (in /lib64/ld-2.12.so) +==8511== by 0x3849A01493: _dl_start (in /lib64/ld-2.12.so) +==8511== by 0x3849A00AF7: ??? (in /lib64/ld-2.12.so) +==8511== by 0x2: ??? +==8511== by 0x7FF0000A2: ??? +==8511== Uninitialised value was created by a stack allocation +==8511== at 0x3849A0328D: dl_main (in /lib64/ld-2.12.so) + +To suppress reports like the one above, pass these extra command line options to valgrind: --suppressions=${prefix}/share/openshmem/openmpi-valgrind.supp --suppressions=shmem_suppressions, where shmem_suppressions is the file in the root of the shmem code and ${prefix} is the shmem install prefix as set by configure. + diff --git a/README-SHMEM.txt b/README-SHMEM.txt new file mode 100644 index 0000000000..f1b50bc505 --- /dev/null +++ b/README-SHMEM.txt @@ -0,0 +1,19 @@ +Copyright (c) 2012 Mellanox Technologies, Inc. + All rights reserved. + + +Build SHMEM +----------- + +./autogen.sh +./configure --prefix=$PWD/install --with-oshmem --enable-contrib-no-build=libnbc,vt --with-openib +make +make install +export SHMEM_HOME=$PWD/install + +Run SHMEM +--------- + +$SHMEM_HOME/bin/shmemrun -np 10 --host amd1,amd5,amd7 -mca btl openib,self hello_shmem.exe + + diff --git a/VERSION b/VERSION index f8f6c8d208..5a73404a73 100644 --- a/VERSION +++ b/VERSION @@ -1,6 +1,8 @@ # Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2008-2011 Cisco Systems, Inc.
All rights reserved. # Copyright (c) 2011 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. # This is the VERSION file for Open MPI, describing the precise # version of Open MPI in this distribution. The various components of @@ -96,6 +98,7 @@ libmpi_usempi_ignore_tkr_so_version=0:0:0 libopen_rte_so_version=0:0:0 libopen_pal_so_version=0:0:0 libmpi_java_so_version=0:0:0 +libshmem_so_version=0:0:0 # "Common" components install standalone libraries that are run-time # linked by one or more components. So they need to be versioned as diff --git a/autogen.pl b/autogen.pl index a762b354b2..c8f896bbd2 100755 --- a/autogen.pl +++ b/autogen.pl @@ -2,7 +2,8 @@ # # Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. -# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -41,6 +42,7 @@ my @subdirs; # Command line parameters my $no_ompi_arg = 0; my $no_orte_arg = 0; +my $no_oshmem_arg = 0; my $quiet_arg = 0; my $debug_arg = 0; my $help_arg = 0; @@ -935,6 +937,7 @@ sub patch_autotools_output { my $ok = Getopt::Long::GetOptions("no-ompi" => \$no_ompi_arg, "no-orte" => \$no_orte_arg, + "no-oshmem" => \$no_oshmem_arg, "quiet|q" => \$quiet_arg, "debug|d" => \$debug_arg, "help|h" => \$help_arg, @@ -949,6 +952,7 @@ if (!$ok || $help_arg) { print "Options: --no-ompi | -no-ompi Do not build the Open MPI layer --no-orte | -no-orte Do not build the ORTE layer + --no-oshmem | -no-oshmem Do not build the OSHMEM layer --quiet | -q Do not display normal verbose output --debug | -d Output lots of debug information --help | -h This help list @@ -977,6 +981,10 @@ if (! -e "orte") { $no_orte_arg = 1; debug "No orte subdirectory found - will not build ORTE\n"; } +if (! 
-e "oshmem") { + $no_oshmem_arg = 1; + debug "No oshmem subdirectory found - will not build OSHMEM\n"; +} if ($no_ompi_arg == 1 && $no_orte_arg == 0) { $project_name_long = "Open MPI Run Time Environment"; @@ -1143,6 +1151,9 @@ push(@{$projects}, { name => "orte", dir => "orte", need_base => 1 }) if (!$no_orte_arg); push(@{$projects}, { name => "ompi", dir => "ompi", need_base => 1 }) if (!$no_ompi_arg); +push(@{$projects}, { name => "oshmem", dir => "oshmem", need_base => 1 }) + if (!$no_ompi_arg && !$no_orte_arg && !$no_oshmem_arg); + # Save the list of projects in the m4 file my $str; diff --git a/config/opal_check_attributes.m4 b/config/opal_check_attributes.m4 index 5bf8263a93..9dcfee1d0f 100644 --- a/config/opal_check_attributes.m4 +++ b/config/opal_check_attributes.m4 @@ -12,6 +12,8 @@ # All rights reserved. # Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -215,6 +217,7 @@ AC_DEFUN([OPAL_CHECK_ATTRIBUTES], [ opal_cv___attribute__visibility=0 opal_cv___attribute__warn_unused_result=0 opal_cv___attribute__weak_alias=0 + opal_cv___attribute__destructor=0 else AC_MSG_RESULT([yes]) @@ -533,6 +536,13 @@ AC_DEFUN([OPAL_CHECK_ATTRIBUTES], [ [], []) + _OPAL_CHECK_SPECIFIC_ATTRIBUTE([destructor], + [ + void foo(void) __attribute__ ((__destructor__)); + void foo(void) { return ; } + ], + [], + []) fi # Now that all the values are set, define them @@ -581,4 +591,6 @@ AC_DEFUN([OPAL_CHECK_ATTRIBUTES], [ [Whether your compiler has __attribute__ warn unused result or not]) AC_DEFINE_UNQUOTED(OPAL_HAVE_ATTRIBUTE_WEAK_ALIAS, [$opal_cv___attribute__weak_alias], [Whether your compiler has __attribute__ weak alias or not]) + AC_DEFINE_UNQUOTED(OPAL_HAVE_ATTRIBUTE_DESTRUCTOR, [$opal_cv___attribute__destructor], + [Whether your compiler has __attribute__ destructor or not]) ]) diff --git a/config/oshmem_config_files.m4 b/config/oshmem_config_files.m4 new file mode 100644 index 0000000000..b3c5f188ff --- /dev/null +++ b/config/oshmem_config_files.m4 @@ -0,0 +1,26 @@ +# -*- shell-script -*- +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AC_DEFUN([OSHMEM_CONFIG_FILES],[ + AC_CONFIG_FILES([ + oshmem/Makefile + oshmem/include/Makefile + oshmem/shmem/c/Makefile + oshmem/shmem/f77/Makefile + + oshmem/shmem/c/profile/Makefile + + oshmem/tools/wrappers/Makefile + oshmem/tools/wrappers/shmemcc-wrapper-data.txt + oshmem/tools/wrappers/shmemf77-wrapper-data.txt + oshmem/tools/wrappers/shmemf90-wrapper-data.txt + ]) +]) diff --git a/config/oshmem_configure_options.m4 b/config/oshmem_configure_options.m4 new file mode 100644 index 0000000000..9892ff6c03 --- /dev/null +++ b/config/oshmem_configure_options.m4 @@ -0,0 +1,188 @@ +dnl -*- shell-script -*- +dnl +dnl Copyright (c) 2012 Mellanox Technologies, Inc. +dnl All rights reserved. +dnl +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + + + +AC_DEFUN([OSHMEM_CONFIGURE_OPTIONS],[ +ompi_show_subtitle "OSHMEM Configuration options" + + +AC_SUBST(OSHMEM_LIBSHMEM_EXTRA_LIBS) +AC_SUBST(OSHMEM_LIBSHMEM_EXTRA_LDFLAGS) + +# +# OSHMEM support +# +AC_MSG_CHECKING([if want OSHMEM support]) +AC_ARG_WITH([oshmem], + [AC_HELP_STRING([--with-oshmem], + [Build with OSHMEM support (default=no)])]) +if test "$with_oshmem" = "no"; then + AC_MSG_RESULT([no]) + oshmem_with_support=0 +else + AC_MSG_RESULT([yes]) + oshmem_with_support=1 +fi +AM_CONDITIONAL(OSHMEM_SUPPORT, test "$oshmem_with_support" = 1) +AC_DEFINE_UNQUOTED([OSHMEM_ENABLED], [$oshmem_with_support], + [Whether user wants OSHMEM support or not]) + +# +# Enable compatibility mode +# +AC_MSG_CHECKING([if want SGI/Quadrix compatibility mode]) +AC_ARG_ENABLE(oshmem-compat, + AC_HELP_STRING([--enable-oshmem-compat], + [enable compatibility mode (default: enabled)])) +if test "$enable_oshmem_compat" != "no"; then + AC_MSG_RESULT([yes]) + OSHMEM_SPEC_COMPAT=1 +else + AC_MSG_RESULT([no]) + OSHMEM_SPEC_COMPAT=0 +fi +AC_DEFINE_UNQUOTED([OSHMEM_SPEC_COMPAT], [$OSHMEM_SPEC_COMPAT], + [Whether user wants OSHMEM in 
compatibility mode or not]) + + + +# +# Do we want to disable OSHMEM parameter checking at run-time? +# +AC_MSG_CHECKING([if want SHMEM API parameter checking]) +AC_ARG_WITH(shmem-param-check, + AC_HELP_STRING([--shmem-param-check(=VALUE)], + [behavior of SHMEM function parameter checking. Valid values are: always, never. If --with-shmem-param-check is specified with no VALUE argument, it is equivalent to a VALUE of "always"; --without-shmem-param-check is equivalent to "never" (default: always).])) +shmem_param_check=1 +if test "$with_shmem_param_check" = "no" -o \ + "$with_shmem_param_check" = "never"; then + shmem_param_check=0 + AC_MSG_RESULT([never]) +elif test "$with_shmem_param_check" = "yes" -o \ + "$with_shmem_param_check" = "always" -o \ + -z "$with_shmem_param_check"; then + shmem_param_check=1 + AC_MSG_RESULT([always]) +else + AC_MSG_RESULT([unknown]) + AC_MSG_WARN([*** Unrecognized --with-shmem-param-check value]) + AC_MSG_WARN([*** See "configure --help" output]) + AC_MSG_WARN([*** Defaulting to "runtime"]) +fi +AC_DEFINE_UNQUOTED(OSHMEM_PARAM_CHECK, $shmem_param_check, + [Whether we want to check SHMEM parameters always or never]) + + +# +# OSHMEM profiling support +# +AC_MSG_CHECKING([if want pshmem_]) +AC_ARG_ENABLE(oshmem-profile, + AC_HELP_STRING([--enable-oshmem-profile], + [enable OSHMEM profiling (default: enabled)])) +if test "$enable_oshmem_profile" != "no"; then + AC_MSG_RESULT([yes]) + oshmem_progiling_support=1 +else + AC_MSG_RESULT([no]) + oshmem_progiling_support=0 +fi +AM_CONDITIONAL(OSHMEM_PROFILING, test "$oshmem_progiling_support" = 1) +#AC_DEFINE_UNQUOTED([OSHMEM_PROFILING], [$oshmem_progiling_support], +# [Whether user wants OSHMEM profiling]) + +]) + + +AC_DEFUN([OSHMEM_SETUP_CFLAGS],[ + + +OMPI_C_COMPILER_VENDOR([oshmem_c_vendor]) + +# +# OSHMEM force warnings as errors +# +# +# Since SHMEM libraries are not fully ISO99 C compliant +# -pedantic and -Wundef raise a bunch of warnings, so +# we just strip them off for this 
component +AC_MSG_WARN([Removed -pedantic and -Wundef from CFLAGS for OSHMEM]) + +oshmem_CFLAGS="$CFLAGS" + +# Strip off problematic arguments +oshmem_CFLAGS="`echo $oshmem_CFLAGS | sed 's/-pedantic//g'`" +oshmem_CFLAGS="`echo $oshmem_CFLAGS | sed 's/-Wundef//g'`" +oshmem_CFLAGS="`echo $oshmem_CFLAGS | sed 's/-Wno-long-double//g'`" +CFLAGS="$oshmem_CFLAGS" + +case "$oshmem_c_vendor" in + gnu) + OSHMEM_CFLAGS=" -Werror " + OSHMEM_TEST_CFLAGS="$CFLAGS -Wall -Wundef -Werror " + ;; + intel) + # we want specifically the warning on format string conversion + OSHMEM_CFLAGS=" -Werror " + OSHMEM_TEST_CFLAGS="$CFLAGS -Wall -Werror -wd188,981,1419,810" + ;; +esac + +AC_SUBST([OSHMEM_CFLAGS]) +AC_SUBST([OSHMEM_TEST_CFLAGS]) + + + +OMPI_CHECK_OPENFABRICS([openib], + [openib_happy="yes"], + [openib_happy="no"]) + +# substitute in the things needed to build MEMHEAP BASE +AC_SUBST([openib_CFLAGS]) +AC_SUBST([openib_CPPFLAGS]) +AC_SUBST([openib_LDFLAGS]) +AC_SUBST([openib_LIBS]) + +# If we have the openib stuff available, find out what we've got +AS_IF( + [test "$openib_happy" = "yes"], + [ + OSHMEM_LIBSHMEM_EXTRA_LDFLAGS="$OSHMEM_LIBSHMEM_EXTRA_LDFLAGS $openib_LDFLAGS" + OSHMEM_LIBSHMEM_EXTRA_LIBS="$OSHMEM_LIBSHMEM_EXTRA_LIBS $openib_LIBS" + + # ibv_reg_shared_mr was added in MOFED 1.8 + oshmem_have_mpage=0 + + openib_save_CPPFLAGS="$CPPFLAGS" + openib_save_LDFLAGS="$LDFLAGS" + openib_save_LIBS="$LIBS" + + CPPFLAGS="$CPPFLAGS $openib_CPPFLAGS" + LDFLAGS="$LDFLAGS $openib_LDFLAGS" + LIBS="$LIBS $openib_LIBS" + + AC_CHECK_DECLS([IBV_ACCESS_ALLOCATE_MR], + [oshmem_have_mpage=1], [], + [#include ]) + + AC_CHECK_LIB([ibverbs], [ibv_reg_shared_mr], [oshmem_have_mpage=2]) + + CPPFLAGS="$openib_save_CPPFLAGS" + LDFLAGS="$openib_save_LDFLAGS" + LIBS="$openib_save_LIBS" + + AC_DEFINE_UNQUOTED(MPAGE_ENABLE, $oshmem_have_mpage, + [Whether we can use M-PAGE supported since MOFED 1.8]) + ]) +])dnl + diff --git a/configure.ac b/configure.ac index 7f5ec905ea..6634b3e24a 100644 --- a/configure.ac 
+++ b/configure.ac @@ -17,6 +17,8 @@ # Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. # Copyright (c) 2011 NVIDIA Corporation. All rights reserved. # Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -99,6 +101,8 @@ AC_SUBST([CONFIGURE_DEPENDENCIES], ['$(top_srcdir)/VERSION']) # Set up project specific AM_CONDITIONALs AM_CONDITIONAL([PROJECT_OMPI], m4_ifdef([project_ompi], [true], [false])) AM_CONDITIONAL([PROJECT_ORTE], m4_ifdef([project_orte], [true], [false])) +AM_CONDITIONAL([PROJECT_OSHMEM], m4_ifdef([project_oshmem], [true], [false])) + ompi_show_subtitle "Checking versions" @@ -113,6 +117,11 @@ m4_ifdef([project_orte], [$srcdir/VERSION], [orte/include/orte/version.h])]) +m4_ifdef([project_oshmem], + [OPAL_SAVE_VERSION([OSHMEM], [Open SHMEM], + [$srcdir/VERSION], + [oshmem/include/oshmem/version.h])]) + OPAL_SAVE_VERSION([OPAL], [Open Portable Access Layer], [$srcdir/VERSION], [opal/include/opal/version.h]) @@ -137,6 +146,8 @@ m4_ifdef([project_ompi], AC_SUBST(libmca_common_portals_so_version)]) m4_ifdef([project_orte], [AC_SUBST(libopen_rte_so_version)]) +m4_ifdef([project_oshmem], + [AC_SUBST(libshmem_so_version)]) AC_SUBST(libmca_opal_common_hwloc_so_version) AC_SUBST(libopen_pal_so_version) @@ -160,6 +171,8 @@ m4_ifdef([project_orte], [AC_CONFIG_HEADERS([orte/include/orte_config.h])]) m4_ifdef([project_ompi], [AC_CONFIG_HEADERS([ompi/include/ompi_config.h ompi/include/mpi.h])]) +m4_ifdef([project_oshmem], + [AC_CONFIG_HEADER([oshmem/include/oshmem_config.h oshmem/include/shmem.h oshmem/include/shmem_portable_platform.h])]) # override/fixup the version numbers set by AC_INIT, since on # developer builds, there's no good way to know what the version is @@ -239,6 +252,7 @@ AC_SUBST(top_ompi_builddir) OPAL_CONFIGURE_OPTIONS m4_ifdef([project_orte], [ORTE_CONFIGURE_OPTIONS]) 
m4_ifdef([project_ompi], [OMPI_CONFIGURE_OPTIONS]) +m4_ifdef([project_oshmem], [OSHMEM_CONFIGURE_OPTIONS]) if test "$enable_binaries" = "no" -a "$enable_dist" = "yes"; then AC_MSG_WARN([--disable-binaries is incompatible with --enable dist]) @@ -671,6 +685,7 @@ ompi_show_title "Type tests" # Size of pid_t AC_CHECK_SIZEOF(pid_t) +AC_CHECK_SIZEOF(long) AC_CHECK_TYPES([socklen_t, struct sockaddr_in, struct sockaddr_in6, struct sockaddr_storage], @@ -845,6 +860,12 @@ AC_DEFINE_UNQUOTED(OMPI_MPI_OFFSET_TYPE, $MPI_OFFSET_TYPE, [Type of MPI_Offset - AC_DEFINE_UNQUOTED(OMPI_MPI_OFFSET_SIZE, $MPI_OFFSET_SIZE, [Size of the MPI_Offset]) AC_DEFINE_UNQUOTED(OMPI_OFFSET_DATATYPE, $MPI_OFFSET_DATATYPE, [MPI datatype corresponding to MPI_Offset]) +AC_DEFINE_UNQUOTED(OSHMEM_SHMEM_OFFSET_TYPE, $MPI_OFFSET_TYPE, [Type of SHMEM_Offset -- has to be defined here and typedef'ed later because shmem.h does not get AC SUBST's]) +AC_DEFINE_UNQUOTED(OSHMEM_SHMEM_OFFSET_SIZE, $MPI_OFFSET_SIZE, [Size of the SHMEM_Offset]) +AC_DEFINE_UNQUOTED(OSHMEM_OFFSET_DATATYPE, $MPI_OFFSET_DATATYPE, [SHMEM datatype corresponding to SHMEM_Offset]) + +AC_DEFINE_UNQUOTED(OPAL_SIZEOF_LONG, $ac_cv_sizeof_long, "Size of 'long' type") + # # Check for MPI_Aint type. Yes, there are platforms where # sizeof(void*) != sizeof(long) (64 bit Windows, apparently). @@ -1172,7 +1193,7 @@ if test "$OMPI_TOP_BUILDDIR" != "$OMPI_TOP_SRCDIR"; then # rather than have successive assignments to these shell # variables, lest the $(foo) names try to get evaluated here. # Yuck! 
- CPPFLAGS='-I$(top_srcdir) -I$(top_builddir) -I$(top_srcdir)/opal/include m4_ifdef([project_orte], [-I$(top_srcdir)/orte/include]) m4_ifdef([project_ompi], [-I$(top_srcdir)/ompi/include])'" $CPPFLAGS" + CPPFLAGS='-I$(top_srcdir) -I$(top_builddir) -I$(top_srcdir)/opal/include m4_ifdef([project_orte], [-I$(top_srcdir)/orte/include]) m4_ifdef([project_ompi], [-I$(top_srcdir)/ompi/include]) m4_ifdef([project_oshmem], [-I$(top_srcdir)/oshmem/include])'" $CPPFLAGS" # C++ is only relevant if we're building OMPI m4_ifdef([project_ompi], [CXXCPPFLAGS='-I$(top_srcdir) -I$(top_builddir) -I$(top_srcdir)/opal/include -I$(top_srcdir)/orte/include -I$(top_srcdir)/ompi/include'" $CXXCPPFLAGS"]) else @@ -1186,6 +1207,7 @@ fi m4_ifdef([project_orte], [ORTE_SETUP_DEBUGGER_FLAGS], [m4_ifdef([project_ompi], [ORTE_SETUP_DEBUGGER_FLAGS])]) +m4_ifdef([project_oshmem], [OSHMEM_SETUP_CFLAGS]) # # Delayed the substitution of CFLAGS and CXXFLAGS until now because @@ -1292,5 +1314,6 @@ AC_CONFIG_FILES([ OPAL_CONFIG_FILES m4_ifdef([project_orte], [ORTE_CONFIG_FILES]) m4_ifdef([project_ompi], [OMPI_CONFIG_FILES]) +m4_ifdef([project_oshmem], [OSHMEM_CONFIG_FILES]) AC_OUTPUT diff --git a/contrib/Makefile.am b/contrib/Makefile.am index 8d5f441de2..554aaab660 100644 --- a/contrib/Makefile.am +++ b/contrib/Makefile.am @@ -12,6 +12,9 @@ # Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010 IBM Corporation. All rights reserved. # Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# # $COPYRIGHT$ # # Additional copyrights may follow @@ -160,3 +163,10 @@ EXTRA_DIST = \ platform/greenplum/mrplus/linux-optimized.conf dist_pkgdata_DATA = openmpi-valgrind.supp + +if OSHMEM_SUPPORT +dist_pkgdata_DATA += \ + tau/shmem_wiki_tau.sh \ + tau/tau_openshmem.patch \ + tau/readme +endif diff --git a/contrib/platform/win32/CMakeModules/opal_get_version.cmake b/contrib/platform/win32/CMakeModules/opal_get_version.cmake index 88eb974635..7cdff62658 100644 --- a/contrib/platform/win32/CMakeModules/opal_get_version.cmake +++ b/contrib/platform/win32/CMakeModules/opal_get_version.cmake @@ -1,6 +1,8 @@ # # Copyright (c) 2007-2012 High Performance Computing Center Stuttgart, # University of Stuttgart. All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -107,6 +109,15 @@ OMPI_DEF(OMPI_MINOR_VERSION ${MINOR_VERSION} "Minor release number of Open MPI." OMPI_DEF(OMPI_RELEASE_VERSION ${RELEASE_VERSION} "Release number of Open MPI." 0 1) OMPI_DEF(OMPI_VERSION ${VERSION_STRING} "Complete release number of Open MPI." 1 1) +# Set OSHMEM versions +OMPI_DEF(OSHMEM_WANT_REPO_REV ${WANT_REPO_REV} "SVN verstion of OSHMEM" 1 1) +OMPI_DEF(OSHMEM_REPO_REV "${SVN_VERSION}" "SVN verstion of OSHMEM" 1 1) +OMPI_DEF(OSHMEM_GREEK_VERSION "${GREEK_VERSION}" "Greek - alpha, beta, etc - release number of Open Portable Access Layer." 1 1) +OMPI_DEF(OSHMEM_MAJOR_VERSION ${MAJOR_VERSION} "Major release number of OSHMEM." 0 1) +OMPI_DEF(OSHMEM_MINOR_VERSION ${MINOR_VERSION} "Minor release number of OSHMEM." 0 1) +OMPI_DEF(OSHMEM_RELEASE_VERSION ${RELEASE_VERSION} "Release number of OSHMEM." 0 1) +OMPI_DEF(OSHMEM_VERSION ${VERSION_STRING} "Complete release number of OSHMEM." 
1 1) + # Set ORTE versions OMPI_DEF(ORTE_WANT_REPO_REV ${WANT_REPO_REV} "SVN verstion of ORTE" 1 1) OMPI_DEF(ORTE_REPO_REV "${SVN_VERSION}" "SVN verstion of ORTE" 1 1) diff --git a/contrib/tau/readme b/contrib/tau/readme new file mode 100644 index 0000000000..4c5d6cd944 --- /dev/null +++ b/contrib/tau/readme @@ -0,0 +1,178 @@ +Copyright (c) 2012 Mellanox Technologies, Inc. + All rights reserved. +22 May 2012 + +Description +=========== + +Using TAU with OpenSHMEM +======================== + + +Building PDT +============= + +Download, configure and build PDT: + +----- +wget -nc http://tau.uoregon.edu/pdt_releases/pdtoolkit-3.17.tar.gz +tar -xzf pdtoolkit-3.17.tar.gz +cd pdtoolkit-* +PDT_INST=$PWD +./configure +make install +cd .. +----- + + +Building TAU +============ + +Download: + +----- +wget -nc http://www.cs.uoregon.edu/research/paracomp/tau/tauprofile/dist/tau_latest.tar.gz +tar -xzf tau_latest.tar.gz +cd tau-* +----- + +Apply patch: + +----- +patch -p1 -i tau_openshmem.patch +----- + +Note: +The patch is needed to define profiling API that is not part of the official openshmem.org standard. + +Configure and build: + +----- +TAU_INST=$PWD/inst-tau-shmem +./configure -prefix=$TAU_INST -shmem -tag=oshmem -cc=gcc -pdt=$PDT_INST -PROFILEPARAM -useropt="-I$OSHMEM_INST/include/mpp" -shmemlib=$OSHMEM_INST/lib -shmemlibrary=-lshmem#-lpmi +make install +cd .. +----- + +Note: +-useropt : specifies additional user options such as -g or -I. For multiple options, the options list should be enclosed in single quotes.
+-lpmi : SLURM support +-shmeminc : is not set due to bug in TAU (http://nic.uoregon.edu/pipermail/tau-users/2011-December/000599.html) +TAU documentation link http://www.cs.uoregon.edu/Research/tau/docs/old/bk04ch01.html + + +Building Application +==================== + +Add OpenSHMEM installation location to the PATH environment: + +----- +export PATH=$OSHMEM_INST/bin:$PATH +----- + +Set TAU environment variables: + +----- +export PATH=$TAU_INST/x86_64/bin:$PATH +export TAU_MAKEFILE=$TAU_INST/x86_64/lib/Makefile.tau-oshmem-param-shmem-pdt +export TAU_OPTIONS= +----- + +Note: +TAU provides shell scripts tau_cxx.sh, tau_f90.sh and tau_cc.sh for C++, F90 and C compilers. These are typically installed in the <taudir>/<arch>/bin directory. You may replace the compiler in your Makefiles with these scripts. These scripts may also be used on the command line. Each configuration of TAU has a stub makefile associated with it. For instance, +inst-tau-shmem/x86_64/lib/Makefile.tau-oshmem-param-shmem-pdt +This makefile specifies the measurement components that have been configured with it. In this case it shows that TAU's MPI/SHMEM wrapper interposition library and PDT have been configured. You need to set the environment variable: +TAU_MAKEFILE +to point to this stub makefile.
You can also pass arguments to the four stages of compilation using the environment variable +TAU_OPTIONS + +Build and run example: + +----- +tau_cc.sh $TAU_INST/../examples/shmem/c/simple.c -o simple-tau.out +env LD_LIBRARY_PATH=$OSHMEM_INST/lib:$LD_LIBRARY_PATH srun -n 2 simple-tau.out +----- + +After job completes, view simple report: + +----- +pprof +paraprof --pack example_shmem_tau.ppk +----- + +Output example +============== + +----- +$pprof +Reading Profile files in profile.* + +NODE 0;CONTEXT 0;THREAD 0: +--------------------------------------------------------------------------------------- +%Time Exclusive Inclusive #Call #Subrs Inclusive Name + msec total msec usec/call +--------------------------------------------------------------------------------------- +100.0 0.102 10,479 1 5 10479786 int main(int, char **) C +100.0 10,479 10,479 1 0 10479221 void start_pes(int) C + 0.0 0.423 0.423 1 0 423 void shmem_barrier_all(void) C + 0.0 0.04 0.04 1 0 40 void shmem_long_put(long *, const long *, size_t, int) C + 0.0 0.04 0.04 1 0 40 void shmem_long_put(long *, const long *, size_t, int) C [ = <80> ] + 0.0 0 0 2 0 0 int shmem_my_pe(void) C +--------------------------------------------------------------------------------------- + +USER EVENTS Profile :NODE 0, CONTEXT 0, THREAD 0 +--------------------------------------------------------------------------------------- +NumSamples MaxValue MinValue MeanValue Std. Dev. 
Event Name +--------------------------------------------------------------------------------------- + 1 80 80 80 0 Message size sent to all nodes +--------------------------------------------------------------------------------------- + +NODE 1;CONTEXT 0;THREAD 0: +--------------------------------------------------------------------------------------- +%Time Exclusive Inclusive #Call #Subrs Inclusive Name + msec total msec usec/call +--------------------------------------------------------------------------------------- +100.0 0.042 9,270 1 4 9270413 int main(int, char **) C +100.0 9,270 9,270 1 0 9270317 void start_pes(int) C + 0.0 0.053 0.053 1 0 53 void shmem_barrier_all(void) C + 0.0 0.001 0.001 2 0 0 int shmem_my_pe(void) C + +FUNCTION SUMMARY (total): +--------------------------------------------------------------------------------------- +%Time Exclusive Inclusive #Call #Subrs Inclusive Name + msec total msec usec/call +--------------------------------------------------------------------------------------- +100.0 0.144 19,750 2 9 9875100 int main(int, char **) C +100.0 19,749 19,749 2 0 9874769 void start_pes(int) C + 0.0 0.476 0.476 2 0 238 void shmem_barrier_all(void) C + 0.0 0.04 0.04 1 0 40 void shmem_long_put(long *, const long *, size_t, int) C + 0.0 0.04 0.04 1 0 40 void shmem_long_put(long *, const long *, size_t, int) C [ = <80> ] + 0.0 0.001 0.001 4 0 0 int shmem_my_pe(void) C + +FUNCTION SUMMARY (mean): +--------------------------------------------------------------------------------------- +%Time Exclusive Inclusive #Call #Subrs Inclusive Name + msec total msec usec/call +--------------------------------------------------------------------------------------- +100.0 0.072 9,875 1 4.5 9875100 int main(int, char **) C +100.0 9,874 9,874 1 0 9874769 void start_pes(int) C + 0.0 0.238 0.238 1 0 238 void shmem_barrier_all(void) C + 0.0 0.02 0.02 0.5 0 40 void shmem_long_put(long *, const long *, size_t, int) C + 0.0 0.02 0.02 0.5 0 40 void 
shmem_long_put(long *, const long *, size_t, int) C [ = <80> ] + 0.0 0.0005 0.0005 2 0 0 int shmem_my_pe(void) C +----- + + +A sample session +================ + +The following shell script, shmem_wiki_tau.sh, can be launched to try out OpenSHMEM with TAU. +This script downloads, configures and installs all needed tools, compiles a simple OpenSHMEM application, runs it using SLURM, shows the result using pprof and also prepares the result for viewing with paraprof. + +ParaProf - User's Manual: http://www.cs.uoregon.edu/Research/tau/docs/paraprof/index.html + +----- +./shmem_wiki_tau.sh +----- + +Enjoy. diff --git a/contrib/tau/shmem_wiki_tau.sh b/contrib/tau/shmem_wiki_tau.sh new file mode 100644 index 0000000000..a0fb4f98c2 --- /dev/null +++ b/contrib/tau/shmem_wiki_tau.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. + +if [[ -n "$1" ]]; then + OSHMEM_INST=$1 +else + echo "Path to OpenShmem install should be passed as an argument and built with --with-pmi" +fi + +export PATH=$OSHMEM_INST/bin:$PATH + +OSHMEM_TAU_PATCH=`pwd`/tau_openshmem.patch + +# TAU expects having shmcc compiler name for OpenSHMEM +#ln -s $OSHMEM_INST/bin/shmemcc $OSHMEM_INST/bin/oshcc +#ln -s $OSHMEM_INST/share/openshmem/shmemcc-wrapper-data.txt $OSHMEM_INST/share/openshmem/oshcc-wrapper-data.txt + + +# download PDT sources +wget -nc http://tau.uoregon.edu/pdt_releases/pdtoolkit-3.17.tar.gz + +# build PDT +tar -xzf pdtoolkit-3.17.tar.gz +cd pdtoolkit-* +PDT_INST=$PWD +./configure +make install +cd ..
+ + +function install_openshmem_tau_patch +{ +cat > ${OSHMEM_TAU_PATCH} < ++ # define COMPLEXIFY(T) std::complex ++ #else /* _cplusplus */ ++ # include ++ # define COMPLEXIFY(T) T complex ++ #endif /* __cplusplus */ ++ void pshmem_complexd_put (COMPLEXIFY (double) * dest, ++ const COMPLEXIFY (double) * src, ++ size_t nelems, int pe) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return ; ++ } ++ ++ /* Old API */ ++ ++ + /********************************************************** + start_pes + **********************************************************/ +END_MSG + +patch --dry-run -p1 -i ${OSHMEM_TAU_PATCH} +patch -p1 -i ${OSHMEM_TAU_PATCH} + + return 0 +} + + +# download TAU sources +wget -nc http://www.cs.uoregon.edu/research/paracomp/tau/tauprofile/dist/tau_latest.tar.gz +#wget http://tau.uoregon.edu/tau.tgz + +tar -xzf tau_latest.tar.gz +cd tau-* +install_openshmem_tau_patch +TAU_INST=$PWD/inst-tau-shmem +./configure -prefix=$TAU_INST -shmem -tag=oshmem -cc=gcc -pdt=$PDT_INST -PROFILEPARAM -useropt="-g" -shmemlib=$OSHMEM_INST/lib -shmemlibrary="-lshmem -lpmi" +make install +cd .. + +export PATH=$TAU_INST/x86_64/bin:$PATH +export TAU_MAKEFILE=$TAU_INST/x86_64/lib/Makefile.tau-oshmem-param-shmem-pdt +export TAU_OPTIONS= + + +# Example +# Note: +# srun reports error message as ORTE_ERROR_LOG: A message is attempting to be sent to a process whose contact information is unknown in file +# in case shmem library is built w/o option --with-pmi +mkdir example +cd example +if [[ -n "$2" ]]; then + tau_cc.sh $2/test/shmem/vs/osu_latency.c -DOSHMEM -o osu_latency-tau.out + env LD_LIBRARY_PATH=$OSHMEM_INST/lib:$LD_LIBRARY_PATH srun -n 2 osu_latency-tau.out + cd .. +else + tau_cc.sh $TAU_INST/../examples/shmem/c/simple.c -o simple-tau.out + env LD_LIBRARY_PATH=$OSHMEM_INST/lib:$LD_LIBRARY_PATH srun -n 2 simple-tau.out +fi +pprof +paraprof --pack example_shmem_tau.ppk +cd .. 
diff --git a/contrib/tau/tau_openshmem.patch b/contrib/tau/tau_openshmem.patch new file mode 100644 index 0000000000..e8278e4850 --- /dev/null +++ b/contrib/tau/tau_openshmem.patch @@ -0,0 +1,101 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +*** tau-2.21.2/src/Profile/TauShmemOpenShmemC.c 2012-01-27 20:43:12.000000000 +0200 +--- new/src/Profile/TauShmemOpenShmemC.c 2012-05-21 14:14:51.000000000 +0300 +*************** +*** 6,11 **** +--- 6,99 ---- + #define TAU_SHMEM_TAGID tau_shmem_tagid_f=tau_shmem_tagid_f%250 + #define TAU_SHMEM_TAGID_NEXT (++tau_shmem_tagid_f) % 250 + ++ ++ /* This section contains old API that are not part of openshmem.org specification ++ * ++ */ ++ void pshmem_init (void) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return ; ++ } ++ ++ void pshmem_finalize (void) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return ; ++ } ++ ++ char *pshmem_nodename (void) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return NULL; ++ } ++ ++ int pshmem_version (int *major, int *minor) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return 0; ++ } ++ ++ void *pshmem_malloc (size_t size) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return NULL; ++ } ++ ++ void pshmem_free (void *ptr) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return ; ++ } ++ ++ void *pshmem_realloc (void *ptr, size_t size) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return NULL; ++ } ++ ++ void *pshmem_memalign (size_t alignment, size_t size) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return NULL; ++ } ++ ++ char *psherror (void) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return NULL; ++ } ++ ++ char *pshmem_error (void) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return NULL; ++ } ++ ++ void pshmem_sync_init (long *pSync) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return ; ++ } ++ ++ #ifdef __cplusplus ++ # include ++ # define 
COMPLEXIFY(T) std::complex ++ #else /* _cplusplus */ ++ # include ++ # define COMPLEXIFY(T) T complex ++ #endif /* __cplusplus */ ++ void pshmem_complexd_put (COMPLEXIFY (double) * dest, ++ const COMPLEXIFY (double) * src, ++ size_t nelems, int pe) ++ { ++ fprintf(stderr, "Dummy %s\n", __FUNCTION__); ++ return ; ++ } ++ ++ /* Old API */ ++ ++ + /********************************************************** + start_pes + **********************************************************/ diff --git a/distr/README b/distr/README new file mode 100644 index 0000000000..8f6312f6d6 --- /dev/null +++ b/distr/README @@ -0,0 +1,24 @@ +Copyright (c) 2012 Mellanox Technologies, Inc. + All rights reserved. + +Files oshmem.spec and buildrpm.sh provide a way to create binary and source rpms of OSHMEM. +In order to create rpms: +./buildrpm.sh +Created rpms are located in ./rpm-dist/. + +Several parameters are hardcoded in buildrpm.sh: package name, version, release. Apart from that, one can specify whether to create a source rpm or not. +See comments in buildrpm.sh. + +Binary rpms created by the script are relocatable, implying that they can be installed in user specified directory with: +sudo rpm -ihv --prefix= + +Default params: +package_name=openshmem +version=1.1 +release=1 +_prefix=/opt/ - place to install + + +In order to create binary tarball from binary rpm: +./buildtarball.sh +The output will appear in ./tarballs/ diff --git a/distr/build_knem_rpm.sh b/distr/build_knem_rpm.sh new file mode 100644 index 0000000000..9629d43c4f --- /dev/null +++ b/distr/build_knem_rpm.sh @@ -0,0 +1,13 @@ +#!/bin/sh +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+ + +mydir=$(cd `dirname $0`;pwd) +rpmdir=$mydir/rpm-dist/`hostname` +rpmdist=$rpmdir/build + +knem_tgz="$mydir/knem-0.9.7.tar.gz" + +mkdir -p $rpmdir $rpmdist +rpmbuild --define="_rpmdir $rpmdir" --define="_srcrpmdir $rpmdir" --define="_sourcedir $rpmdist" --define="_specdir $rpmdist" --define="_builddir $rpmdist" -tb $knem_tgz diff --git a/distr/buildrpm.sh b/distr/buildrpm.sh new file mode 100644 index 0000000000..ffa46cd35d --- /dev/null +++ b/distr/buildrpm.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. + + +#this three parameters define the name of rpm package: -- +package_name=openshmem +#version - defined from VERSION file +#release - define from VERSION file + +if [ -d /opt/knem-0.9.7mlnx1 ];then + KNEM_FOLDER="/opt/knem-0.9.7mlnx1" +else + KNEM_FOLDER="/opt/knem-0.9.7" +fi + + +fca='no' +debug='no' +extra_cflags="" +slurm_dir="" +while getopts :k:hf:dmw:scl: OPTION +do + case $OPTION in + k)KNEM_FOLDER=$OPTARG + ;; + h)echo "`basename $0` -[h] [-k ] [-f ] [-d (debug)] [-w (-DOSHMEM_WAIT_COMPLETION_DEBUG=number)] [-s (-DOSHMEM_SM_PUT_SYNC_MODE)]" + exit 0; + ;; + l)slurm_dir=$OPTARG + if [ -d "$slurm_dir/include" ]; then + echo using pmi/slurm + oshmem_configure_params="$oshmem_configure_params --with-pmi=$slurm_dir --with-slurm=$slurm_dir" + else + echo PMI selected but not found + exit -1 + fi + ;; + f)rpmspec=$OPTARG + ;; + \?)echo "`basename $0` -[h] [-k ] [-f ] [-d (debug)] [-w (-DOSHMEM_WAIT_COMPLETION_DEBUG=number)] [-s (-DOSHMEM_SM_PUT_SYNC_MODE)]" + ;; + d)debug='yes' + ;; + c)fca='yes' + ;; + m)mxm='yes' + ;; + w) if [ $OPTARG -lt 0 ]; then + echo '-w key should be > 0'; + exit 1; + fi + extra_cflags="$extra_cflags -DOSHMEM_WAIT_COMPLETION_DEBUG=$OPTARG" + ;; + s)extra_cflags="$extra_cflags -DOSHMEM_SM_PUT_SYNC_MODE" + ;; + esac +done + +if [ $debug == "yes" ]; then + DEBUG_CONF="--enable-debug" + oshmem_name_prefix="debug-" + extra_cflags="$extra_cflags -g -O0" +elif [ $debug == "no" 
]; then + DEBUG_CONF="--disable-debug" + extra_cflags="$extra_cflags -g -O2" + oshmem_name_prefix=""; +fi +if [ "x" == "x$rpmspec" ]; then + echo "please provide a spec file: -f "; + exit 1; +fi + +if [ ! -f "$rpmspec" ]; then + echo "$rpmspec does not exist"; + exit 1; +fi + +if [ ! -d "$KNEM_FOLDER" ]; then + echo "$KNEM_FOLDER does not exist"; + exit 1; +fi +#parameters that are passed to the ./configure script +oshmem_configure_params="$oshmem_configure_params --with-oshmem --enable-mpirun-prefix-by-default $DEBUG_CONF --with-knem=$KNEM_FOLDER" + + +if [ $fca == "yes" ]; then + echo using fca + oshmem_configure_params="$oshmem_configure_params --with-fca=/opt/mellanox/fca" +fi + +if [ "${mxm:-no}" == "yes" ]; then # mxm is set only by -m; treat unset as 'no' + echo using mxm + oshmem_configure_params="$oshmem_configure_params --with-mxm=/opt/mellanox/mxm" +fi + +#build the binary rpm only +build_binary_rpm=${build_binary_rpm:='no'} +#build both: binary and source rpms +build_source_rpm=${build_source_rpm:='no'} + +which rpmbuild &> /dev/null; +if [ $? -ne 0 ]; then \ + echo "*** This make target requires an rpm-based linux distribution."; \ + (exit 1); exit 1; \ +fi + +mkdir -p rpm-dist sources build + +work_dir=$(cd `dirname $0` && pwd) +version="`grep major= $work_dir/../VERSION | sed -e s/major=//`"."`grep minor= $work_dir/../VERSION | sed -e s/minor=//`" +release="`grep release= $work_dir/../VERSION | sed -e s/release=//`" +build=$(hg id -n | sed -e s/\+//g) +release=$build +echo oshmem version defined: $version-$release build: $build + + +echo Making source tarball... +(cd $work_dir/.. && ./autogen.sh && ./configure $oshmem_configure_params) +make -C $work_dir/.. distcheck && make -C $work_dir/.. 
get_tarball to=$work_dir/sources + + +rpmmacros="--define='_rpmdir $work_dir/rpm-dist' --define='_srcrpmdir $work_dir/rpm-dist' --define='_sourcedir $work_dir/sources' --define='_specdir $work_dir' --define='_builddir $work_dir/build'" +rpmopts="--nodeps --buildroot=$work_dir/_rpm" + +# Generate spec file for rpm (@OSHMEM_NAME_PREFIX@ is filled from $oshmem_name_prefix, set in the debug/no-debug branch above) +echo Generating oshmem.spec file +if [ ! -z "$oshmem_configure_params" ] +then + oshmem_configure_params=${oshmem_configure_params//\//\\\/} +fi +if [ ! -z "$extra_cflags" ] +then + extra_cflags=${extra_cflags//\//\\\/} +fi + +sed -e s/@OSHMEM_NAME@/openshmem/ -e s/@OSHMEM_CONFIGURE_PARAMS@/"$oshmem_configure_params"/ \ +-e s/@OSHMEM_VERSION@/"$version"/ -e s/@OSHMEM_RELEASE@/"$release"/ -e s/@OSHMEM_CFLAGS@/"$extra_cflags"/ \ +-e s/@OSHMEM_BUILD@/"$build"/ -e s/@OSHMEM_NAME_PREFIX@/"$oshmem_name_prefix"/ \ +$rpmspec > oshmem.spec +echo ${version}-${build} > latest.txt + +if [ $build_source_rpm == 'yes' ] +then + echo running source rpmbuild... + echo -ba -v $rpmmacros $rpmopts oshmem.spec | xargs rpmbuild + if [ $? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi + exit 0; +fi + + +if [ $build_binary_rpm == 'yes' ] +then + echo running binary rpmbuild... +# echo "$rpmmacros $rpmopts $rpmspec" + echo -bb -v $rpmmacros $rpmopts oshmem.spec | xargs rpmbuild + if [ $? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi +fi + + + diff --git a/distr/buildtarball.sh b/distr/buildtarball.sh new file mode 100644 index 0000000000..e5b9f066f9 --- /dev/null +++ b/distr/buildtarball.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. + +rpmname="$1" +if test "$rpmname" = ""; then + echo "Usage: buildtarball.sh " + exit 1 +fi +if test ! 
-f $rpmname; then + echo "Can't find $rpmname" + exit 1 +fi +echo "--> Found rpm: $rpmname" + +version="`rpm -qp $rpmname | sed s/openshmem-//`" +mkdir -p tarballs +rpm2cpio $rpmname | cpio -id +cd opt/mellanox +tar jcvf ../../tarballs/openshmem-$version.tar.bz openshmem +cd ../.. 
+rm -rf opt/ diff --git a/distr/oshmem.spec.in b/distr/oshmem.spec.in new file mode 100644 index 0000000000..5dd5cd613f --- /dev/null +++ b/distr/oshmem.spec.in @@ -0,0 +1,151 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# don't stop with an error if we don't pack all files at once +%define _unpackaged_files_terminate_build 0 +# avoid this error: +# /usr/lib/rpm/debugedit: canonicalization unexpectedly shrank by one character +%define debug_package %{nil} + + +# variables replaced by buildrpm.sh script +%define oshmem_name @OSHMEM_NAME@ +%define oshmem_configure_params @OSHMEM_CONFIGURE_PARAMS@ +%define oshmem_version @OSHMEM_VERSION@ +%define oshmem_release @OSHMEM_RELEASE@ +%define oshmem_cflags @OSHMEM_CFLAGS@ +%define oshmem_build @OSHMEM_BUILD@ +%{!?configure_opts: %define configure_opts %{nil}} + + + +# +# global Open SHMEM stuff +# + +%{!?oshmem_name: %define oshmem_name openshmem} +%{!?oshmem_version: %define oshmem_version 2.0} +%{!?oshmem_release: %define oshmem_release 0} +%{!?oshmem_build: %define oshmem_build %{nil}} +%{!?oshmem_name_prefix: %define oshmem_name_prefix %{nil}} +%{!?oshmem_prefix: %define oshmem_prefix /opt/mellanox/%{oshmem_name_prefix}%{oshmem_name}/%{oshmem_version}} + +%define oshmem_long_ver %{oshmem_name}-%{oshmem_version}.%{oshmem_build} +%define oshmem_source %{oshmem_long_ver}.tar.gz +%define shell_scripts_path %{_bindir} +%define shell_scripts_basename shmemvars +# fix configure +# +%{!?extra_cflags: %define extra_cflags %{nil}} +# +%define _prefix %{oshmem_prefix} +%define _sysconfdir %{_prefix}/etc +%define _libdir %{_prefix}/lib +%define _includedir %{_prefix}/include + +# +# compiler settings +# +%define oshmem_compiler default +%define oshmem_cc " " +%define oshmem_cxx " " +%define oshmem_f77 " " +%define oshmem_fc " " + + +###################################################################### +# +# Build section +# 
+###################################################################### +Summary: Mellanox SHMEM parallel programming library. +Name: %{oshmem_name_prefix}%{oshmem_name} +Version: %{oshmem_version} +Release: %{oshmem_release} +License: Proprietary +Group: Development/Libraries +URL: http://www.mellanox.com +Source0: %{oshmem_source} +BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +Requires: libibverbs mxm fca +BuildRequires: gcc-c++ libstdc++ libstdc++-devel libibverbs-devel fca mxm +Provides: shmem +Packager: mellanox +Vendor: mellanox +Prefix: %{_prefix} +%description + +The Mellanox SHMEM library provides fast inter-processor communication for large +messages using data passing and one-sided communication techniques. + +The Mellanox SHMEM API based on OpenSHMEM standard from http://www.openshmem.org/ + +%prep +rm -rf $RPM_BUILD_ROOT +%setup -q -n %{oshmem_long_ver} + +%build +OSHMEM_CONFIGURE_FLAGS="%{oshmem_configure_params}" +EXTRA_CFLAGS="%{extra_cflags}" +if [ "%{oshmem_compiler}" != "default" ]; then +OSHMEM_CONFIGURE_FLAGS="$OSHMEM_CONFIGURE_FLAGS CC=%{oshmem_cc} CXX=%{oshmem_cxx} F77=%{oshmem_f77} FC=%{oshmem_fc}" +fi +CFLAGS="$EXTRA_CFLAGS" && %configure $OSHMEM_CONFIGURE_FLAGS %{configure_opts} +make -j4 + +%install +rm -rf $RPM_BUILD_ROOT +%{__make} -j4 install DESTDIR=$RPM_BUILD_ROOT + +%{__mkdir_p} $RPM_BUILD_ROOT/%{shell_scripts_path} +cat < $RPM_BUILD_ROOT/%{shell_scripts_path}/%{shell_scripts_basename}.sh +# NOTE: This is an automatically-generated file! (generated by the +# Open MPI RPM). Any changes made here will be lost if the RPM is +# uninstalled or upgraded. 
+ +# PATH +if test -z "\`echo \$PATH | grep %{_bindir}\`"; then + PATH=%{_bindir}:\${PATH} + export PATH + fi + +# LD_LIBRARY_PATH +if test -z "\`echo \$LD_LIBRARY_PATH | grep %{_libdir}\`"; then + LD_LIBRARY_PATH=%{_libdir}\${LD_LIBRARY_PATH:+:}\${LD_LIBRARY_PATH} + export LD_LIBRARY_PATH + fi + +# MANPATH +if test -z "\`echo \$MANPATH | grep %{_mandir}\`"; then + MANPATH=%{_mandir}:\${MANPATH} + export MANPATH + fi + +# MPI_ROOT +MPI_ROOT=%{_prefix} +export MPI_ROOT +EOF + +cat>>$RPM_BUILD_ROOT/%{_sysconfdir}/openmpi-mca-params.conf< file.cpio + +Here's how to use cpio: + list of contents: cpio -t -i < /file/name + extract files: cpio -d -i < /file/name +HERE + + exit 0; +} + +if ($#ARGV == -1) { + printhelp if -t STDIN; + $f = "STDIN"; +} elsif ($#ARGV == 0) { + open(F, "< $ARGV[0]") or die "Can't read file $ARGV[0]\n"; + $f = 'F'; +} else { + printhelp; +} + +printhelp if -t STDOUT; + +read $f, $rpm, 96; + +($magic, $major, $minor, $crap) = unpack("NCC C90", $rpm); + +die "Not an RPM\n" if $magic != 0xedabeedb; +die "Not a version 3 or 4 RPM\n" if $major != 3 && $major != 4; + +while (!eof($f)) { + $pos = tell($f); + read $f,$rpm,16; + $smagic = unpack("n", $rpm); + last if $smagic == 0x1f8b; # ZIP file magic, end of headers. + last if $smagic == 0x425a; # BZIP2 file magic, end of headers. + # Turns out that every header except the start of the gzip one is + # padded to an 8 bytes boundary. 
+ if ($pos & 0x7) { + $pos += 7; + $pos &= ~0x7; # Round to 8 byte boundary + seek $f, $pos, 0; + read $f,$rpm,16; + } + ($magic, $crap, $sections, $bytes) = unpack("N4", $rpm); + die "Error: header not recognized\n" if $magic != 0x8eade801; + $pos += 16; # for header + $pos += 16 * $sections; + $pos += $bytes; + seek $f, $pos, 0; +} + +if (eof($f)) { + die "bogus RPM\n"; +} + +if ($smagic == 0x1f8b) { + open(ZCAT, "|$gzip -cd") || die "can't pipe to gzip\n"; + +} elsif ($smagic == 0x425a) { + open(ZCAT, "|$bzip2 -cd") || die "can't pipe to bzip2\n"; + +} else { + die "Unknown magic '$smagic' for compressed file contents. Expected 0x1f8b (gzip) or 0x425a (bzip2)."; +} + +print STDERR "CPIO archive found!\n"; + +print ZCAT $rpm; + +while (read($f, ($_=''), 16384) > 0) { + print ZCAT; +} + +close ZCAT; + diff --git a/knem_patch/README-knem-patch b/knem_patch/README-knem-patch new file mode 100644 index 0000000000..f143de461f --- /dev/null +++ b/knem_patch/README-knem-patch @@ -0,0 +1,16 @@ +#Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +#$COPYRIGHT$ +# +#Additional copyrights may follow +# +#$HEADER$ +# + +File shmem_knem.patch contains the modification to the Knem kernel module version 0.9.7 that is required for the Knem-based SM BTL to work correctly within SHMEM. + +Install: +1. Get the Knem-0.9.7 source tarball from http://runtime.bordeaux.inria.fr/knem/download/knem-0.9.7.tar.gz +2. Untar the archive and copy shmem_knem.patch to the top directory of the Knem source tree +3. patch -p1 -i shmem_knem.patch +4. Build and install Knem kernel module as usual. diff --git a/knem_patch/shmem_knem.patch b/knem_patch/shmem_knem.patch new file mode 100644 index 0000000000..ff0059f79f --- /dev/null +++ b/knem_patch/shmem_knem.patch @@ -0,0 +1,88 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+diff -cr knem-0.9.7/driver/linux/knem_main.c knem-fixed/driver/linux/knem_main.c +*** knem-0.9.7/driver/linux/knem_main.c 2011-07-28 20:06:58.000000000 +0300 +--- knem-fixed/driver/linux/knem_main.c 2011-10-04 13:25:12.000000000 +0200 +*************** +*** 1145,1150 **** +--- 1145,1195 ---- + * Copying between pinned iovecs + */ + ++ #define KNEM_FOR_SHMEM 1 ++ ++ #if KNEM_FOR_SHMEM ++ static void perform_copy(void *dst, void *src, unsigned long length) ++ { ++ int i; ++ int num64 = length / sizeof(uint64_t); ++ int num32 = (length % sizeof(uint64_t))/sizeof(uint32_t); ++ int num16 = (length % sizeof(uint32_t))/sizeof(uint16_t); ++ int num8 = (length % sizeof(uint16_t))/sizeof(uint8_t); ++ uint64_t *dst64 = (uint64_t*)dst; ++ uint64_t *src64 = (uint64_t*)src; ++ uint32_t *dst32; ++ uint32_t *src32; ++ uint16_t *src16; ++ uint16_t *dst16; ++ uint8_t *dst8; ++ uint8_t *src8; ++ ++ for (i=0; i PAGE_SIZE)) + chunk = PAGE_SIZE - dst_first_page_offset; +! + memcpy(dst_addr + dst_first_page_offset, + src_addr + src_first_page_offset, + chunk); +! + remaining -= chunk; + if (unlikely(!remaining)) + break; +--- 1204,1218 ---- + chunk = PAGE_SIZE - src_first_page_offset; + if (likely(dst_first_page_offset + chunk > PAGE_SIZE)) + chunk = PAGE_SIZE - dst_first_page_offset; +! #if (!KNEM_FOR_SHMEM) + memcpy(dst_addr + dst_first_page_offset, + src_addr + src_first_page_offset, + chunk); +! #else +! perform_copy(dst_addr + dst_first_page_offset, +! src_addr + src_first_page_offset, +! chunk); +! #endif + remaining -= chunk; + if (unlikely(!remaining)) + break; diff --git a/ompi/class/ompi_free_list.c b/ompi/class/ompi_free_list.c index 909e1a5556..d3380b50a3 100644 --- a/ompi/class/ompi_free_list.c +++ b/ompi/class/ompi_free_list.c @@ -10,11 +10,12 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved. 
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -194,6 +195,7 @@ int ompi_free_list_grow(ompi_free_list_t* flist, size_t num_elements) if(NULL == alloc_ptr) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + memset(alloc_ptr,0,alloc_size); if (0 != flist->fl_payload_buffer_size) { elem_size = OPAL_ALIGN(flist->fl_payload_buffer_size, diff --git a/ompi/mca/btl/btl.h b/ompi/mca/btl/btl.h index bde52a1d29..692713fcda 100644 --- a/ompi/mca/btl/btl.h +++ b/ompi/mca/btl/btl.h @@ -14,6 +14,8 @@ * reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -309,6 +311,13 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t); #define MCA_BTL_DES_FLAGS_PUT 0x0010 #define MCA_BTL_DES_FLAGS_GET 0x0020 +#ifdef OSHMEM_ENABLED +#define MCA_BTL_DES_FLAGS_SHMEM_REQUEST 0x0030 + +#define BTL_SM_HDR_TYPE_PUT_AS_SEND (MCA_BTL_TAG_USR + 0x0A) +#define BTL_SM_HDR_TYPE_GET_AS_SEND (MCA_BTL_TAG_USR + 0x0B) +#endif /* OSHMEM_ENABLED */ + /** * Maximum number of allowed segments in src/dst fields of a descriptor. */ diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 4ce3916f36..8ba95a6828 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -11,12 +11,13 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. 
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -1347,11 +1348,12 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( return NULL; } +#if OSHMEM_ENABLED +#else /* max_msg_sz is the maximum message size of the HCA (hw limitation) set the minimum between local max_msg_sz and the remote */ max_msg_sz = MIN(openib_btl->ib_port_attr.max_msg_sz, endpoint->endpoint_btl->ib_port_attr.max_msg_sz); - /* check if user has explicitly limited the max message size */ if (openib_component->max_hw_msg_size > 0 && max_msg_sz > (size_t)openib_component->max_hw_msg_size) { @@ -1363,7 +1365,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( *size = (size_t)max_msg_sz; BTL_VERBOSE(("message size limited to %" PRIsize_t "\n", *size)); } - +#endif /* OSHMEM_ENABLED */ opal_convertor_get_current_pointer(convertor, &buffer); if(NULL == registration){ @@ -1388,6 +1390,12 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) buffer; to_base_frag(frag)->segment.base.seg_len = *size; to_base_frag(frag)->segment.key = openib_reg->mr->rkey; + +#if OSHMEM_ENABLED + /** Keep lkey of pre-registered user buffer */ + to_base_frag(frag)->segment.lkey = frag->sg_entry.lkey; +#endif /* OSHMEM_ENABLED */ + to_base_frag(frag)->base.order = order; to_base_frag(frag)->base.des_flags = flags; @@ -1797,6 +1805,12 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, to_com_frag(frag)->sg_entry.addr = src_seg->base.seg_addr.lval; to_com_frag(frag)->sg_entry.length = src_seg->base.seg_len; + +#if OSHMEM_ENABLED + /** Keep lkey of pre-registered user buffer */ + 
to_com_frag(frag)->sg_entry.lkey = src_seg->key; +#endif /* OSHMEM_ENABLED */ + to_com_frag(frag)->endpoint = ep; #if HAVE_XRC if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) @@ -1834,7 +1848,9 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, uint64_t rem_addr = src_seg->base.seg_addr.lval; uint32_t rkey = src_seg->key; +#if OSHMEM_ENABLED == 0 assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER); +#endif /* OSHMEM_ENABLED */ descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; @@ -1883,6 +1899,12 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, to_com_frag(frag)->sg_entry.addr = dst_seg->base.seg_addr.lval; to_com_frag(frag)->sg_entry.length = dst_seg->base.seg_len; + +#if OSHMEM_ENABLED + /** Keep lkey of pre-registered user buffer */ + to_com_frag(frag)->sg_entry.lkey = dst_seg->lkey; +#endif /* OSHMEM_ENABLED */ + to_com_frag(frag)->endpoint = ep; #if HAVE_XRC @@ -1893,6 +1915,11 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); qp_reset_signal_count(ep, qp); + +#if OSHMEM_ENABLED + frag->sr_desc.opcode = IBV_WR_RDMA_READ; + frag->sr_desc.send_flags = IBV_SEND_SIGNALED; +#endif /* OSHMEM_ENABLED */ if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) return OMPI_ERROR; diff --git a/ompi/mca/btl/openib/btl_openib_async.c b/ompi/mca/btl/openib/btl_openib_async.c index 4ff9812cc3..e41f56cb1e 100644 --- a/ompi/mca/btl/openib/btl_openib_async.c +++ b/ompi/mca/btl/openib/btl_openib_async.c @@ -1,8 +1,9 @@ /* - * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -531,7 +532,7 @@ void* btl_openib_async_thread(void * async) int btl_openib_async_command_done(int exp) { - int comp; + int comp = 0; if (read(mca_btl_openib_component.async_comp_pipe[0], &comp, sizeof(int)) < 0){ BTL_ERROR(("Failed to read from pipe")); diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index 557c98cdac..6cfcca6284 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -14,9 +14,10 @@ * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2010-2011 IBM Corporation. All rights reserved. * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. * * $COPYRIGHT$ * @@ -346,6 +347,11 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) endpoint->rem_info.rem_srqs = NULL; } +#ifdef OSHMEM_ENABLED + endpoint->rdma_qp = mca_btl_openib_component.rdma_qp; +#endif /* OSHMEM_ENABLED */ + + endpoint->ib_addr = NULL; endpoint->xrc_recv_qp_num = 0; endpoint->endpoint_btl = 0; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 991c446007..69c2422b01 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -13,8 +13,9 @@ * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -257,6 +258,9 @@ struct mca_btl_base_endpoint_t { /** Whether we've send out CTS to the peer or not (only used in CTS protocol) */ bool endpoint_cts_sent; +#ifdef OSHMEM_ENABLED + int rdma_qp; +#endif /* OSHMEM_ENABLED */ }; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; @@ -509,6 +513,18 @@ static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep, return rc; } +#if OSHMEM_ENABLED +/* hack, since there is no clean way to find inline limits */ +#include "ompi/mca/bml/bml.h" +static inline int +mca_btl_openib_rdma_inline_size(mca_bml_base_btl_t *bml_btl) +{ + mca_btl_base_endpoint_t *ep = bml_btl->btl_endpoint; + + return ep->qps[ep->rdma_qp].ib_inline_max; +} +#endif /* OSHMEM_ENABLED */ + static inline __opal_attribute_always_inline__ int ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp, int do_signal) { diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index a4b4611fef..735d97c9f2 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -14,6 +14,8 @@ * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -287,6 +289,9 @@ typedef enum mca_btl_openib_frag_type_t mca_btl_openib_frag_type_t; typedef struct mca_btl_openib_segment_t { mca_btl_base_segment_t base; uint32_t key; +#if OSHMEM_ENABLED + uint32_t lkey; +#endif } mca_btl_openib_segment_t; /* base openib frag */ diff --git a/ompi/mca/btl/openib/connect/base.h b/ompi/mca/btl/openib/connect/base.h index b4de20304d..19d523c327 100644 --- a/ompi/mca/btl/openib/connect/base.h +++ b/ompi/mca/btl/openib/connect/base.h @@ -1,6 +1,8 @@ /* * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. 
- * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * * * $COPYRIGHT$ * @@ -12,7 +14,7 @@ #ifndef BTL_OPENIB_CONNECT_BASE_H #define BTL_OPENIB_CONNECT_BASE_H -#include "connect/connect.h" +#include "ompi/mca/btl/openib/connect/connect.h" #ifdef OMPI_HAVE_RDMAOE #define BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl) \ diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c index 84bd0266c2..4066db1b8a 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c @@ -12,7 +12,8 @@ * Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2008-2011 Mellanox Technologies. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. * Copyright (c) 2009-2011 IBM Corporation. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved * @@ -371,6 +372,40 @@ static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint) } +static void permute_array(int *permuted_qps, int nqps) +{ + int i; + int idx; + int tmp; + int control[nqps]; + + for (i = 0; i < nqps; i++) { + permuted_qps[i] = i; + control[i] = 0; + } + + for (i = 0; i < nqps - 1; i++) { + idx = i + random() % (nqps - i); + tmp = permuted_qps[i]; + permuted_qps[i] = permuted_qps[idx]; + permuted_qps[idx] = tmp; + } + + /* verify that permutation is ok: */ + for (i = 0; i < nqps; i++) { + control[permuted_qps[i]] ++; + } + for (i = 0; i < nqps; i++) { + if (control[i] != 1) { + printf("bad permutation detected: "); + for (i = 0; i < nqps; i++) printf("%d ", permuted_qps[i]); + printf("\n"); + abort(); + } + } +} + + /* * Create the local side of all the qp's. The remote sides will be * connected later. 
@@ -380,6 +415,12 @@ static int qp_create_all(mca_btl_base_endpoint_t* endpoint) int qp, rc, pp_qp_num = 0; int32_t rd_rsv_total = 0; + int rand_qpns[mca_btl_openib_component.num_qps]; + int i; + + permute_array(rand_qpns, mca_btl_openib_component.num_qps); + + for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) if(BTL_OPENIB_QP_TYPE_PP(qp)) { rd_rsv_total += @@ -392,11 +433,12 @@ static int qp_create_all(mca_btl_base_endpoint_t* endpoint) if(0 == pp_qp_num && true == endpoint->use_eager_rdma) pp_qp_num = 1; - for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { + for (i = 0; i < mca_btl_openib_component.num_qps; ++i) { struct ibv_srq *srq = NULL; uint32_t max_recv_wr, max_send_wr; int32_t rd_rsv, rd_num_credits; + qp = rand_qpns[i]; /* QP used for SW flow control need some additional recourses */ if(qp == mca_btl_openib_component.credits_qp) { rd_rsv = rd_rsv_total; @@ -463,7 +505,12 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp, memset(&attr, 0, sizeof(attr)); init_attr.qp_type = IBV_QPT_RC; +#if OSHMEM_ENABLED + // bump priority for rmda qp + init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_RDMA_QP(qp) ? BTL_OPENIB_HP_CQ: BTL_OPENIB_LP_CQ]; +#else init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ]; +#endif /* OSHMEM_ENABLED */ init_attr.recv_cq = openib_btl->device->ib_cq[qp_cq_prio(qp)]; init_attr.srq = srq; init_attr.cap.max_inline_data = req_inline = diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index 1235c27992..d52fae2634 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -15,6 +15,8 @@ * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. 
+ * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -785,6 +787,9 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( } else { #if OMPI_BTL_SM_HAVE_KNEM struct knem_cmd_create_region knem_cr; +#if OSHMEM_ENABLED + memset(&knem_cr,0,sizeof(struct knem_cmd_create_region)); +#endif /* OSHMEM_ENABLED */ struct knem_cmd_param_iovec knem_iov; #endif /* OMPI_BTL_SM_HAVE_KNEM */ MCA_BTL_SM_FRAG_ALLOC_USER(frag, rc); @@ -807,8 +812,13 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( knem_iov.len = max_data; knem_cr.iovec_array = (uintptr_t)&knem_iov; knem_cr.iovec_nr = iov_count; +#if OSHMEM_ENABLED + knem_cr.protection = PROT_READ | PROT_WRITE; + knem_cr.flags = 0; +#else knem_cr.protection = PROT_READ; knem_cr.flags = KNEM_FLAG_SINGLEUSE; +#endif /* OSHMEM_ENABLED */ if (OPAL_UNLIKELY(ioctl(sm_btl->knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) { return NULL; } @@ -974,7 +984,20 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl, frag->hdr->len = frag->segment.base.seg_len; /* type of message, pt-2-pt, one-sided, etc */ frag->hdr->tag = tag; - +#if OSHMEM_ENABLED + /* + * This ugly hack is done to support following configuration as: + * OSHMEM + SM => put/get for small messages using send() + */ + if (frag->hdr->tag == BTL_SM_HDR_TYPE_GET_AS_SEND) /*this is shmem_get request */ + { + frag->hdr->src_addr = (void *)descriptor->des_src->seg_addr.lval; /*from where to take the data for get operation*/ + } + else if (frag->hdr->tag == BTL_SM_HDR_TYPE_PUT_AS_SEND)/*this is shmem_put operation*/ + { + frag->hdr->dst_addr = descriptor->des_dst->seg_addr.pval; + } +#endif /* OSHMEM_ENABLED */ MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag); frag->endpoint = endpoint; @@ -1057,6 +1080,12 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, icopy.local_iovec_nr = 1; icopy.remote_cookie = src->key; icopy.remote_offset = 0; +#if OSHMEM_ENABLED + if (des->des_flags & MCA_BTL_DES_FLAGS_SHMEM_REQUEST) + { + 
icopy.remote_offset = dst->key; + } +#endif icopy.write = 0; /* Use the DMA flag if knem supports it *and* the segment length @@ -1076,6 +1105,13 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, return OMPI_ERROR; } +#if OSHMEM_ENABLED + if ( icopy.current_status == KNEM_STATUS_FAILED) + { + opal_output(0,"KNEM FAILED\n"); + return OMPI_ERROR; + } +#endif /* OSHMEM_ENABLED */ /* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */ } #endif /* OMPI_BTL_SM_HAVE_KNEM */ @@ -1134,6 +1170,84 @@ int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl, #if OMPI_BTL_SM_HAVE_KNEM /* No support async_get for CMA yet */ +#if OSHMEM_ENABLED +int mca_btl_sm_put_async(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_btl_base_descriptor_t* des) +{ + int btl_ownership; + mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl; + mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des; + mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_src; + mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_dst; + struct knem_cmd_inline_copy icopy; + struct knem_cmd_param_iovec recv_iovec; + /* If we have no knem slots available, return + TEMP_OUT_OF_RESOURCE */ +#ifndef OSHMEM_SM_PUT_SYNC_MODE + if (sm_btl->knem_status_num_used >= + mca_btl_sm_component.knem_max_simultaneous) { + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } +#endif /* OSHMEM_SM_PUT_SYNC_MODE */ + /* We have a slot, so fill in the data fields. Bump the + first_avail and num_used counters. 
*/ + recv_iovec.base = (uintptr_t) src->base.seg_addr.pval; + recv_iovec.len = src->base.seg_len; + icopy.local_iovec_array = (uintptr_t)&recv_iovec; + icopy.local_iovec_nr = 1; + icopy.write = 1; + icopy.remote_cookie = dst->key; + icopy.remote_offset = src->key; +#ifndef OSHMEM_SM_PUT_SYNC_MODE + icopy.async_status_index = sm_btl->knem_status_first_avail++; + if (sm_btl->knem_status_first_avail >= + mca_btl_sm_component.knem_max_simultaneous) { + sm_btl->knem_status_first_avail = 0; + } + ++sm_btl->knem_status_num_used; + /* Use the DMA flag if knem supports it *and* the segment length + is greater than the cutoff */ + sm_btl->knem_frag_array[icopy.async_status_index] = frag; + icopy.flags = KNEM_FLAG_ASYNCDMACOMPLETE; +#else + icopy.flags = 0; +#endif /* OSHMEM_SM_PUT_SYNC_MODE */ + if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) { + icopy.flags = mca_btl_sm_component.knem_dma_flag; + } + if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd, + KNEM_CMD_INLINE_COPY, &icopy))) { +#ifndef OSHMEM_SM_PUT_SYNC_MODE + if (icopy.current_status != KNEM_STATUS_PENDING) { + /* request completed synchronously */ + /* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */ +#endif /* OSHMEM_SM_PUT_SYNC_MODE */ + btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) { + frag->base.des_cbfunc(&mca_btl_sm.super, + frag->endpoint, &frag->base, + OMPI_SUCCESS); + } + if (btl_ownership) { + MCA_BTL_SM_FRAG_RETURN(frag); + } +#ifndef OSHMEM_SM_PUT_SYNC_MODE + --sm_btl->knem_status_num_used; + ++sm_btl->knem_status_first_used; + if (sm_btl->knem_status_first_used >= + mca_btl_sm_component.knem_max_simultaneous) { + sm_btl->knem_status_first_used = 0; + } + } +#endif /* OSHMEM_SM_PUT_SYNC_MODE */ + return OMPI_SUCCESS; + } else { + return OMPI_ERROR; + } +} +#endif /* OSHMEM_ENABLED */ + /** * Initiate an asynchronous get. 
* diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index e3646a24f9..2495ebae89 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -14,6 +14,8 @@ * Copyright (c) 2010-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -182,6 +184,9 @@ struct mca_btl_sm_component_t { #if OMPI_BTL_SM_HAVE_KNEM /* Knem capabilities info */ struct knem_cmd_info knem_info; +#if OSHMEM_ENABLED + unsigned int knem_threshold; +#endif /* OSHMEM_ENABLED */ #endif /* OMPI_BTL_SM_HAVE_KNEM */ /** MCA: should we be using knem or not? neg=try but continue if @@ -508,6 +513,12 @@ extern int mca_btl_sm_get_sync( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_descriptor_t* des ); +#if OSHMEM_ENABLED +extern int mca_btl_sm_put_async( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_btl_base_descriptor_t* des ); +#endif /* OSHMEM_ENABLED */ extern struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst( struct mca_btl_base_module_t* btl, diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index c5689fe5b4..caa938f92a 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -15,6 +15,8 @@ * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. 
+ * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -168,6 +170,25 @@ static int sm_register(void) } mca_btl_sm_component.use_knem = 0; } + +#if OSHMEM_ENABLED +#if OMPI_BTL_SM_HAVE_KNEM + mca_btl_sm.knem_fd = -1; + mca_base_param_reg_int(&mca_btl_sm_component.super.btl_version, + "knem_threshold", "Messages of the size greater than this value are sent via KNEM others go through SM Send", + false, false, 12228, &i); + + if (i < 0) + { + opal_output(0,"Error: knem threshold has to be positive value or zero; using default value: 12228 bytes"); + i = 12228; + } + mca_btl_sm_component.knem_threshold = (unsigned int)i; + mca_base_param_reg_int(&mca_btl_sm_component.super.btl_version, + "component_use_knem_value", NULL, + true, false, mca_btl_sm_component.use_knem, NULL); +#endif /* OMPI_BTL_SM_HAVE_KNEM */ +#endif /* OSHMEM_ENABLED */ /* Currently disabling DMA mode by default; it's not clear that this is useful in all applications and architectures. */ mca_base_param_reg_int(&mca_btl_sm_component.super.btl_version, @@ -179,7 +200,7 @@ static int sm_register(void) mca_base_param_reg_int(&mca_btl_sm_component.super.btl_version, "knem_max_simultaneous", "Max number of simultaneous ongoing knem operations to support (0 = do everything synchronously, which probably gives the best large message latency; >0 means to do all operations asynchronously, which supports better overlap for simultaneous large message sends)", - false, false, 0, + false, false, 100, &mca_btl_sm_component.knem_max_simultaneous); /* CMA parameters */ @@ -223,6 +244,9 @@ static int sm_register(void) #if OMPI_BTL_SM_HAVE_KNEM || OMPI_BTL_SM_HAVE_CMA if (mca_btl_sm_component.use_knem || mca_btl_sm_component.use_cma) { mca_btl_sm.super.btl_flags |= MCA_BTL_FLAGS_GET; +#if OSHMEM_ENABLED + mca_btl_sm.super.btl_flags |= MCA_BTL_FLAGS_PUT; +#endif /* OSHMEM_ENABLED */ } if (mca_btl_sm_component.use_knem && mca_btl_sm_component.use_cma) { @@ -251,6 +275,9 @@ static int 
sm_register(void) static int mca_btl_sm_component_open(void) { mca_btl_sm_component.sm_max_btls = 1; +#if OSHMEM_ENABLED + OBJ_CONSTRUCT(&mca_btl_sm_component.sm_lock, opal_mutex_t); +#endif /* OSHMEM_ENABLED */ /* make sure the number of fifos is a power of 2 */ mca_btl_sm_component.nfifos = opal_next_poweroftwo_inclusive (mca_btl_sm_component.nfifos); @@ -265,7 +292,9 @@ static int mca_btl_sm_component_open(void) mca_btl_sm_component.eager_limit = mca_btl_sm.super.btl_eager_limit; /* initialize objects */ +#if !OSHMEM_ENABLED OBJ_CONSTRUCT(&mca_btl_sm_component.sm_lock, opal_mutex_t); +#endif /* !OSHMEM_ENABLED */ OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags_eager, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags_max, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags_user, ompi_free_list_t); @@ -707,6 +736,12 @@ mca_btl_sm_component_init(int *num_btls, ompi_node_rank_t my_node_rank = OMPI_NODE_RANK_INVALID; #if OMPI_BTL_SM_HAVE_KNEM int rc; +#if OSHMEM_ENABLED + if (mca_btl_sm_component.use_knem == 0) + { + opal_output_verbose(1, mca_btl_base_output, "SM BTL will operate without KNEM kernel module"); + } +#endif /* OSHMEM_ENABLED */ #endif /* OMPI_BTL_SM_HAVE_KNEM */ *num_btls = 0; @@ -879,6 +914,11 @@ mca_btl_sm_component_init(int *num_btls, } } } +#if OSHMEM_ENABLED + /*always use sync get and sync put according to shmem logic*/ + mca_btl_sm.super.btl_get = mca_btl_sm_get_sync; + mca_btl_sm.super.btl_put = mca_btl_sm_put_async; +#else /* Set the BTL get function pointer if we're supporting KNEM; choose between synchronous and asynchronous. 
*/ if (mca_btl_sm_component.knem_max_simultaneous > 0) { @@ -886,6 +926,7 @@ mca_btl_sm_component_init(int *num_btls, } else { mca_btl_sm.super.btl_get = mca_btl_sm_get_sync; } +#endif /* OSHMEM_ENABLED */ } #endif /* OMPI_BTL_SM_HAVE_KNEM */ @@ -920,6 +961,9 @@ mca_btl_sm_component_init(int *num_btls, /* If "use_knem" is positive, then it's an error if knem support is not available -- deactivate the sm btl. */ if (mca_btl_sm_component.use_knem > 0) { +#if OSHMEM_ENABLED + opal_output(0,"Error: SM BTL was explicitly requested to operate with KNEM support but KNEM module is not available! SM BTL will be deactivated"); +#endif return NULL; } @@ -1051,8 +1095,27 @@ int mca_btl_sm_component_progress(void) seg.seg_len = hdr->len; Frag.base.des_dst_cnt = 1; Frag.base.des_dst = &seg; +#if OSHMEM_ENABLED + /* + * This ugly hack is done to support following configuration as: + * OSHMEM + SM => put/get for small messages using send() + */ + if (hdr->tag == (MCA_BTL_TAG_USR + 0xA)) + { + memcpy(hdr->dst_addr, seg.seg_addr.pval, hdr->len); + } + else if (hdr->tag == (MCA_BTL_TAG_USR + 0xB)) + { + memcpy(seg.seg_addr.pval, hdr->src_addr, hdr->len); + } + else{ + reg->cbfunc(&mca_btl_sm.super, hdr->tag, &(Frag.base), + reg->cbdata); + } +#else reg->cbfunc(&mca_btl_sm.super, hdr->tag, &(Frag.base), reg->cbdata); +#endif /* OSHMEM_ENABLED */ /* return the fragment */ MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], diff --git a/ompi/mca/btl/sm/btl_sm_frag.h b/ompi/mca/btl/sm/btl_sm_frag.h index 11e882bee7..184e2acc48 100644 --- a/ompi/mca/btl/sm/btl_sm_frag.h +++ b/ompi/mca/btl/sm/btl_sm_frag.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. 
+ * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -42,13 +44,20 @@ struct mca_btl_sm_hdr_t { size_t len; int my_smp_rank; mca_btl_base_tag_t tag; +#if OSHMEM_ENABLED + void *dst_addr; + void *src_addr; +#endif /* OSHMEM_ENABLED */ }; typedef struct mca_btl_sm_hdr_t mca_btl_sm_hdr_t; struct mca_btl_sm_segment_t { mca_btl_base_segment_t base; -#if OMPI_BTL_SM_HAVE_KNEM || OMPI_BTL_SM_HAVE_CMA +#if OMPI_BTL_SM_HAVE_KNEM || OMPI_BTL_SM_HAVE_CMA uint64_t key; +#if OSHMEM_ENABLED + uint32_t lkey; +#endif /* OSHMEM_ENABLED */ #endif /* OMPI_BTL_SM_HAVE_KNEM || OMPI_BTL_SM_HAVE_CMA */ }; typedef struct mca_btl_sm_segment_t mca_btl_sm_segment_t; diff --git a/ompi/mca/mpool/grdma/mpool_grdma_module.c b/ompi/mca/mpool/grdma/mpool_grdma_module.c index 4ab8a9420a..2463c08b8f 100644 --- a/ompi/mca/mpool/grdma/mpool_grdma_module.c +++ b/ompi/mca/mpool/grdma/mpool_grdma_module.c @@ -12,10 +12,11 @@ * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2007 Mellanox Technologies. All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
* * $COPYRIGHT$ * @@ -140,13 +141,13 @@ void* mca_mpool_grdma_alloc(mca_mpool_base_module_t *mpool, size_t size, #ifdef HAVE_POSIX_MEMALIGN if((errno = posix_memalign(&base_addr, align, size)) != 0) return NULL; - + memset(base_addr,0,size); addr = base_addr; #else base_addr = malloc(size + align); if(NULL == base_addr) return NULL; - + memset(base_addr,0,size + align); addr = (void*)OPAL_ALIGN((uintptr_t)base_addr, align, uintptr_t); #endif diff --git a/ompi/mca/rte/rte.h b/ompi/mca/rte/rte.h index ec696b04f7..e8ce7145d3 100644 --- a/ompi/mca/rte/rte.h +++ b/ompi/mca/rte/rte.h @@ -1,5 +1,7 @@ /* * Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. * * $COPYRIGHT$ * @@ -213,6 +215,11 @@ BEGIN_C_DECLS #define OMPI_RML_TAG_OFACM OMPI_RML_TAG_BASE+11 #define OMPI_RML_TAG_XOFACM OMPI_RML_TAG_BASE+12 +#if OSHMEM_ENABLED +/* open shmem oob communication */ +#define OMPI_RML_TAG_SHMEM OMPI_RML_TAG_BASE+13 +#endif + #define OMPI_RML_TAG_DYNAMIC OMPI_RML_TAG_BASE+200 typedef struct { diff --git a/ompi/mca/sbgp/ibnet/Makefile.am b/ompi/mca/sbgp/ibnet/Makefile.am index b82f5f9eac..b2b839b1eb 100644 --- a/ompi/mca/sbgp/ibnet/Makefile.am +++ b/ompi/mca/sbgp/ibnet/Makefile.am @@ -43,7 +43,7 @@ mca_sbgp_ibnet_la_LDFLAGS = -module -avoid-version $(sbgp_ibnet_LDFLAGS) $(btl_o mca_sbgp_ibnet_la_LIBADD = $(sbgp_ibnet_LIBS) $(btl_openib_LIBS) \ $(top_ompi_builddir)/ompi/mca/common/verbs/libmca_common_verbs.la \ $(top_ompi_builddir)/ompi/mca/common/ofacm/libmca_common_ofacm.la - +#ompi/mca/sbgp/ibnet/Makefile.am noinst_LTLIBRARIES = $(component_noinst) libmca_sbgp_ibnet_la_SOURCES =$(sources) libmca_sbgp_ibnet_la_LDFLAGS = -module -avoid-version @@ -54,4 +54,7 @@ $(top_ompi_builddir)/ompi/mca/common/ofacm/libmca_common_ofacm.la: foo.c $(top_ompi_builddir)/ompi/mca/common/ofautils/libmca_common_ofautils.la: foo.c cd $(top_ompi_builddir)/ompi/mca/common/ofautils && $(MAKE) 
+#$(top_ompi_builddir)/ompi/mca/common/commpatterns/libmca_common_commpatterns.la: foo.c +# cd $(top_ompi_builddir)/ompi/mca/common/commpatterns && $(MAKE) + foo.c: diff --git a/ompi/mca/sbgp/ibnet/sbgp_ibnet_module.c b/ompi/mca/sbgp/ibnet/sbgp_ibnet_module.c index fd2a913638..e2d5d2e4a1 100644 --- a/ompi/mca/sbgp/ibnet/sbgp_ibnet_module.c +++ b/ompi/mca/sbgp/ibnet/sbgp_ibnet_module.c @@ -25,7 +25,6 @@ #include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h" #include "ompi/mca/common/ofacm/base.h" #include "ompi/mca/common/ofacm/connect.h" -#include "ompi/patterns/comm/coll_ops.h" /* * Unused static int ibnet_module_enable(mca_sbgp_base_module_t *module, diff --git a/ompi/mpi/c/finalize.c b/ompi/mpi/c/finalize.c index 7c69eb481d..2cb91be279 100644 --- a/ompi/mpi/c/finalize.c +++ b/ompi/mpi/c/finalize.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -22,6 +24,11 @@ #include "ompi/runtime/params.h" #include "ompi/errhandler/errhandler.h" +#if OSHMEM_ENABLED + /* Do nothing because SHMEM has made MPI ready and destroy + */ + #pragma weak MPI_Finalize +#else #if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES #pragma weak MPI_Finalize = PMPI_Finalize #endif @@ -29,6 +36,7 @@ #if OMPI_PROFILING_DEFINES #include "ompi/mpi/c/profile/defines.h" #endif +#endif /* OSHMEM_ENABLED */ static const char FUNC_NAME[] = "MPI_Finalize"; diff --git a/ompi/mpi/c/init.c b/ompi/mpi/c/init.c index 121315ab5f..53a4ba3aed 100644 --- a/ompi/mpi/c/init.c +++ b/ompi/mpi/c/init.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. 
+ * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -27,6 +29,13 @@ #include "ompi/errhandler/errhandler.h" #include "ompi/constants.h" +#if OSHMEM_ENABLED + /* start_pes() is called instead of MPI_Init() in case OpenSHMEM usage + * Do nothing because SHMEM has made MPI ready + */ + #pragma weak MPI_Init +#else + #if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES #pragma weak MPI_Init = PMPI_Init #endif @@ -34,6 +43,7 @@ #if OMPI_PROFILING_DEFINES #include "ompi/mpi/c/profile/defines.h" #endif +#endif /* OSHMEM_ENABLED */ static const char FUNC_NAME[] = "MPI_Init"; diff --git a/ompi/mpi/c/init_thread.c b/ompi/mpi/c/init_thread.c index becbceed56..b468d277ea 100644 --- a/ompi/mpi/c/init_thread.c +++ b/ompi/mpi/c/init_thread.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -25,6 +27,14 @@ #include "ompi/errhandler/errhandler.h" #include "ompi/constants.h" +#if OSHMEM_ENABLED + /* start_pes() is called instead of MPI_Init_thread() in case OpenSHMEM usage + * Do nothing because SHMEM has made MPI ready + */ + #pragma weak MPI_Init_thread +#else + + #if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES #pragma weak MPI_Init_thread = PMPI_Init_thread #endif @@ -32,6 +42,7 @@ #if OMPI_PROFILING_DEFINES #include "ompi/mpi/c/profile/defines.h" #endif +#endif /* OSHMEM_ENABLED */ static const char FUNC_NAME[] = "MPI_Init_thread"; diff --git a/ompi/tools/Makefile.am b/ompi/tools/Makefile.am index 044845740a..9c13fd1804 100644 --- a/ompi/tools/Makefile.am +++ b/ompi/tools/Makefile.am @@ -10,6 +10,9 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. 
+# All rights reserved +# # $COPYRIGHT$ # # Additional copyrights may follow @@ -22,9 +25,18 @@ EXTRA_DIST += tools/win_makefile tools/CMakeLists.txt SUBDIRS += \ - tools/ompi_info \ tools/wrappers DIST_SUBDIRS += \ - tools/ompi_info \ tools/wrappers + +# Don`t build this tool during OMPI build +# it is built as part of OSHMEM build +if OSHMEM_SUPPORT +else + SUBDIRS += \ + tools/ompi_info + DIST_SUBDIRS += \ + tools/ompi_info +endif + diff --git a/ompi/tools/ompi_info/Makefile.am b/ompi/tools/ompi_info/Makefile.am index 1e21e545be..e2c7ea16ae 100644 --- a/ompi/tools/ompi_info/Makefile.am +++ b/ompi/tools/ompi_info/Makefile.am @@ -13,6 +13,8 @@ # Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2012 Los Alamos National Security, LLC. # All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved # $COPYRIGHT$ # # Additional copyrights may follow @@ -62,8 +64,11 @@ ompi_info_SOURCES = \ param.c \ components.c \ version.c - +if OSHMEM_SUPPORT +ompi_info_LDADD = $(top_builddir)/oshmem/libshmem.la +else ompi_info_LDADD = $(top_builddir)/ompi/libmpi.la +endif if OMPI_RTE_ORTE ompi_info_LDADD += $(top_builddir)/orte/libopen-rte.la endif diff --git a/ompi/tools/ompi_info/components.c b/ompi/tools/ompi_info/components.c index 517e08472a..33b9d8aba2 100644 --- a/ompi/tools/ompi_info/components.c +++ b/ompi/tools/ompi_info/components.c @@ -13,6 +13,8 @@ * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011-2012 University of Houston. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. 
+ * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -61,11 +63,23 @@ #include "ompi/mca/sharedfp/base/base.h" #include "ompi/runtime/params.h" +#if OSHMEM_ENABLED +#include "oshmem/runtime/runtime.h" +#endif + #if OPAL_ENABLE_FT_CR == 1 #include "ompi/mca/crcp/crcp.h" #include "ompi/mca/crcp/base/base.h" #endif +#if OSHMEM_ENABLED +#include "oshmem/mca/scoll/base/base.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/mca/atomic/base/base.h" +#endif + + #include "ompi/tools/ompi_info/ompi_info.h" @@ -125,7 +139,14 @@ int ompi_info_register_components(opal_pointer_array_t *mca_types, } goto error; } - + +#if OSHMEM_ENABLED + if (OMPI_SUCCESS != oshmem_shmem_register_params()) { + str = "oshmem_shmem_Register_params failed"; + goto error; + } +#endif + /* Find / open all components */ map = OBJ_NEW(opal_info_component_map_t); map->type = strdup("base"); @@ -160,6 +181,32 @@ int ompi_info_register_components(opal_pointer_array_t *mca_types, goto breakout; } +#if OSHMEM_ENABLED + if (OMPI_SUCCESS != mca_scoll_base_open()) { + goto error; + } + map = OBJ_NEW(opal_info_component_map_t); + map->type = strdup("scoll"); + map->components = &mca_scoll_base_components_opened; + opal_pointer_array_add(component_map, map); + + if (OMPI_SUCCESS != mca_memheap_base_open()) { + goto error; + } + map = OBJ_NEW(opal_info_component_map_t); + map->type = strdup("memheap"); + map->components = &mca_memheap_base_components_opened; + opal_pointer_array_add(component_map, map); + + if (OMPI_SUCCESS != mca_atomic_base_open()) { + goto error; + } + map = OBJ_NEW(opal_info_component_map_t); + map->type = strdup("atomic"); + map->components = &mca_atomic_base_components_opened; + opal_pointer_array_add(component_map, map); +#endif + if (OMPI_SUCCESS != (rc = mca_coll_base_open()) && OMPI_ERR_BAD_PARAM != rc) { str = "coll open"; @@ -316,6 +363,16 @@ int ompi_info_register_components(opal_pointer_array_t *mca_types, 
goto breakout; } +#if OSHMEM_ENABLED + if (OMPI_SUCCESS != mca_spml_base_open()) { + goto error; + } + map = OBJ_NEW(opal_info_component_map_t); + map->type = strdup("spml"); + map->components = &mca_spml_base_components_available; + opal_pointer_array_add(component_map, map); +#endif + /* No need to call the bml_base_open() because the ob1 pml calls it. * mca_bml_base_open(); */ diff --git a/ompi/tools/ompi_info/ompi_info.c b/ompi/tools/ompi_info/ompi_info.c index 409df6e16d..a9db34a69a 100644 --- a/ompi/tools/ompi_info/ompi_info.c +++ b/ompi/tools/ompi_info/ompi_info.c @@ -13,6 +13,8 @@ * Copyright (c) 2011-2012 University of Houston. All rights reserved. * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -115,6 +117,16 @@ int main(int argc, char *argv[]) opal_pointer_array_add(&mca_types, "btl"); opal_pointer_array_add(&mca_types, "coll"); opal_pointer_array_add(&mca_types, "common"); +#if OSHMEM_ENABLED + opal_pointer_array_add(&mca_types, "scoll"); + opal_pointer_array_add(&mca_types, "scoll_basic"); + opal_pointer_array_add(&mca_types, "scoll_fca"); + opal_pointer_array_add(&mca_types, "spml"); + opal_pointer_array_add(&mca_types, "memheap"); + opal_pointer_array_add(&mca_types, "shmalloc"); + opal_pointer_array_add(&mca_types, "atomic"); + opal_pointer_array_add(&mca_types, "shmem"); +#endif #if OPAL_ENABLE_FT_CR == 1 opal_pointer_array_add(&mca_types, "crcp"); #endif diff --git a/ompi/tools/ompi_info/version.c b/ompi/tools/ompi_info/version.c index 40c7daac9c..c6328ac221 100644 --- a/ompi/tools/ompi_info/version.c +++ b/ompi/tools/ompi_info/version.c @@ -13,6 +13,8 @@ * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. 
+ * All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -27,6 +29,10 @@ #include "mpi.h" +#if OSHMEM_ENABLED +#include "shmem.h" +#endif + #include "opal/version.h" #if OMPI_RTE_ORTE #include "orte/version.h" @@ -129,19 +135,36 @@ void ompi_info_show_ompi_version(const char *scope) opal_info_out("Package", "package", OPAL_PACKAGE_STRING); (void)asprintf(&tmp, "%s:version:full", ompi_info_type_ompi); +#if OSHMEM_ENABLED + tmp2 = opal_info_make_version_str(scope, + OSHMEM_MAJOR_VERSION, OSHMEM_MINOR_VERSION, + OSHMEM_RELEASE_VERSION, + OSHMEM_GREEK_VERSION, + OSHMEM_WANT_REPO_REV, OSHMEM_REPO_REV); + opal_info_out("Open SHMEM", tmp, tmp2); +#else tmp2 = opal_info_make_version_str(scope, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, OMPI_RELEASE_VERSION, OMPI_GREEK_VERSION, OMPI_WANT_REPO_REV, OMPI_REPO_REV); opal_info_out("Open MPI", tmp, tmp2); +#endif free(tmp); free(tmp2); (void)asprintf(&tmp, "%s:version:repo", ompi_info_type_ompi); +#if OSHMEM_ENABLED + opal_info_out("Open SHMEM repo revision", tmp, OSHMEM_REPO_REV); +#else opal_info_out("Open MPI repo revision", tmp, OMPI_REPO_REV); +#endif free(tmp); (void)asprintf(&tmp, "%s:version:release_date", ompi_info_type_ompi); +#if OSHMEM_ENABLED + opal_info_out("Open SHMEM release date", tmp, OSHMEM_RELEASE_DATE); +#else opal_info_out("Open MPI release date", tmp, OMPI_RELEASE_DATE); +#endif free(tmp); #if OMPI_RTE_ORTE @@ -151,11 +174,18 @@ void ompi_info_show_ompi_version(const char *scope) /* show the opal version */ opal_info_show_opal_version(scope); - + +#if OSHMEM_ENABLED + tmp2 = opal_info_make_version_str(scope, + SHMEM_VERSION, SHMEM_SUBVERSION, + 0, "", 0, ""); + opal_info_out("SHMEM API", "shmem-api:version:full", tmp2); +#else tmp2 = opal_info_make_version_str(scope, MPI_VERSION, MPI_SUBVERSION, 0, "", 0, ""); opal_info_out("MPI API", "mpi-api:version:full", tmp2); +#endif free(tmp2); opal_info_out("Ident string", "ident", OPAL_IDENT_STRING); diff --git a/opal/etc/openmpi-mca-params.conf 
b/opal/etc/openmpi-mca-params.conf index 7a1f92367a..a4df4253bc 100644 --- a/opal/etc/openmpi-mca-params.conf +++ b/opal/etc/openmpi-mca-params.conf @@ -10,6 +10,8 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -56,3 +58,6 @@ # See "ompi_info --param all all" for a full listing of Open MPI MCA # parameters available and their default values. +coll_fca_enable = 0 +scoll_fca_enable = 0 +mca_component_show_load_errors = 0 diff --git a/opal/include/opal_config_bottom.h b/opal/include/opal_config_bottom.h index e8263a997d..c17c0b19b4 100644 --- a/opal/include/opal_config_bottom.h +++ b/opal/include/opal_config_bottom.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -220,6 +222,12 @@ # define __opal_attribute_weak_alias__(a) #endif +#if OPAL_HAVE_ATTRIBUTE_DESTRUCTOR +# define __opal_attribute_destructor__ __attribute__((__destructor__)) +#else +# define __opal_attribute_destructor__ +#endif + /*********************************************************************** * * Windows library interface declaration code diff --git a/opal/threads/condition.c b/opal/threads/condition.c index 44f4a45da1..6c552bfdb8 100644 --- a/opal/threads/condition.c +++ b/opal/threads/condition.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -27,6 +29,11 @@ static void opal_condition_construct(opal_condition_t *c) c->c_signaled = 0; #if OPAL_HAVE_POSIX_THREADS pthread_cond_init(&c->c_cond, NULL); +#endif +#ifdef OSHMEM_WAIT_COMPLETION_DEBUG + c->my_pe = -1; + c->puts_counter_sm = 0; + c->puts_counter_openib = 0; #endif c->name = NULL; } diff --git a/opal/threads/condition.h b/opal/threads/condition.h index 53d7cbf3e9..067cfc15c7 100644 --- a/opal/threads/condition.h +++ b/opal/threads/condition.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -54,11 +56,39 @@ struct opal_condition_t { pthread_cond_t c_cond; #elif OPAL_HAVE_SOLARIS_THREADS cond_t c_cond; +#endif +#ifdef OSHMEM_WAIT_COMPLETION_DEBUG + int my_pe; + int *pe_dest; + int *msg_length; + char **btl_name; + char **op_name; + int puts_counter_sm; + int puts_counter_openib; + uint64_t *src; + uint64_t *dst; #endif char *name; }; typedef struct opal_condition_t opal_condition_t; +#ifdef OSHMEM_WAIT_COMPLETION_DEBUG +static void print_oshmem_wait_condition_dbg_info(opal_condition_t *c) +{ + char output[10000]; + int i; + int stride=0; + stride += sprintf(&(output[stride]),"rank=%i, puts_count_sm=%i, puts_count_openib=%i:\n",c->my_pe,c->puts_counter_sm,c->puts_counter_openib); + for (i=OSHMEM_WAIT_COMPLETION_DEBUG-1; i>=0; i--) + { + stride += sprintf(&(output[stride]),"\t%i: %s, pe_dst=%i, btl=%s, msg_length=%i, src=%p, dst=%p\n",OSHMEM_WAIT_COMPLETION_DEBUG-1-i, + c->op_name[i],c->pe_dest[i],c->btl_name[i],c->msg_length[i],(void *)(uintptr_t)c->src[i],(void *)(uintptr_t)c->dst[i]); + } + fprintf(stderr,"%s",output); + fflush(stderr); +} +#endif + OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_condition_t); @@ -96,7 +126,19 @@ static inline int opal_condition_wait(opal_condition_t *c, opal_mutex_t *m) 
} #endif } else { +#ifdef OSHMEM_WAIT_COMPLETION_DEBUG + time_t wait_time = time(NULL); + time_t show_dbg_info_timeout = 60*5; //wait for 5 minutes +#endif while (c->c_signaled == 0) { +#ifdef OSHMEM_WAIT_COMPLETION_DEBUG + if (wait_time && (time(NULL)-wait_time > show_dbg_info_timeout) && (c->my_pe >= 0)) + { + wait_time = 0; + print_oshmem_wait_condition_dbg_info(c); + } +#endif + opal_progress(); OPAL_CR_TEST_CHECKPOINT_READY_STALL(); } diff --git a/orte/etc/Makefile.am b/orte/etc/Makefile.am index e657449169..8d994e854b 100644 --- a/orte/etc/Makefile.am +++ b/orte/etc/Makefile.am @@ -28,6 +28,22 @@ EXTRA_DIST = $(orte_config_files) # (http://www.open-mpi.org/community/lists/devel/2008/06/4080.php) for # details why the mkdir is in install-data-local. +module_file=$(DESTDIR)$(sysconfdir)/shmem_modulefile +module_file_text="\#%Module\n\ +\n\ +\# NOTE: This is an automatically-generated file! Any changes\n\ +\# made here will be lost a) if the RPM is uninstalled, or\n\ +\# if the RPM is upgraded or uninstalled.\n\ +\n\ +proc ModulesHelp { } {\n\ +puts stderr \"This module adds Mellanox ScalableSHMEM to various paths\"\n\ +}\n\ +\n\ +module-whatis \"Sets up Mellanox ScalableSHMEM in your enviornment\"\n\ +\n\ +prepend-path PATH \"$(bindir)\"\n\ +prepend-path LD_LIBRARY_PATH \"$(libdir)\"" + install-data-local: $(MKDIR_P) $(DESTDIR)$(sysconfdir) @ p="$(orte_config_files)"; \ @@ -44,6 +60,9 @@ install-data-local: $(INSTALL_DATA) $$d$$file $(DESTDIR)$(sysconfdir)/$$f; \ fi; \ done +if OSHMEM_SUPPORT + printf '%b\n' $(module_file_text) > $(module_file) +endif # Only remove if exactly the same as what in our tree # NOTE TO READER: Bourne shell if ... 
fi evaluates the body if @@ -60,3 +79,6 @@ uninstall-local: fi ; \ fi ; \ done +if OSHMEM_SUPPORT + rm -f $(module_file) +endif diff --git a/oshmem/CMakeLists.txt b/oshmem/CMakeLists.txt new file mode 100644 index 0000000000..001a71d2ec --- /dev/null +++ b/oshmem/CMakeLists.txt @@ -0,0 +1,128 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + + +# The source code is compiled as C++ for dynamic build +# and compiled as C for static build + +PROJECT (OSHMEM) + +# Recursive search sub directories excluding mca, shmem, class and info. +# Add sources in different source groups. +INCLUDE(list_subdirs) +CHECK_SUBDIRS("${PROJECT_SOURCE_DIR}" OSHMEM_SUBDIRS) + + +SET(OSHMEM_EXCLUDE_SUBDIRS mca shmem class info) + +FOREACH(OSHMEM_SUBDIR ${OSHMEM_SUBDIRS}) + + LIST(FIND OSHMEM_EXCLUDE_SUBDIRS ${OSHMEM_SUBDIR} OSHMEM_EXCLUDE_SUBDIR) + + IF(${OSHMEM_EXCLUDE_SUBDIR} EQUAL -1) + + FILE(GLOB_RECURSE OSHMEM_${OSHMEM_SUBDIR}_FILES + "${OSHMEM_SUBDIR}/*.h" "${OSHMEM_SUBDIR}/*.c" "${OSHMEM_SUBDIR}/*.cc" "${OSHMEM_SUBDIR}/*.cpp") + + SET (OSHMEM_SOURCE_FILES + ${OSHMEM_SOURCE_FILES} + ${OSHMEM_${OSHMEM_SUBDIR}_FILES} + ) + + SOURCE_GROUP("${OSHMEM_SUBDIR}" FILES ${OSHMEM_${OSHMEM_SUBDIR}_FILES}) + + ENDIF(${OSHMEM_EXCLUDE_SUBDIR} EQUAL -1) + +ENDFOREACH(OSHMEM_SUBDIR ${OSHMEM_SUBDIRS}) + + +# Add SHMEM C files +FILE(GLOB OSHMEM_SHMEM_C_FILES "shmem/c/*.h" "shmem/c/*.c") + + +SET (OSHMEM_SOURCE_FILES ${OSHMEM_SOURCE_FILES} ${OSHMEM_SHMEM_C_FILES}) +SOURCE_GROUP(shmem"") +SOURCE_GROUP(shmem\\c FILES ${OSHMEM_SHMEM_C_FILES}) + + + +IF(OMPI_WANT_F77_BINDINGS) + # A handful of files in mpi/f77/base must be included in libshmem, in order to build the + # Fortran 77 glue into libshmem + FILE(GLOB OMPI_F77_BASE_FILES "mpi/f77/base/*.c") + SET_SOURCE_FILES_PROPERTIES(${OMPI_F77_BASE_FILES} + PROPERTIES COMPILE_FLAGS "-DOMPI_COMPILING_F77_WRAPPERS=1 -DOMPI_PROFILE_LAYER=0") + SET(OSHMEM_SOURCE_FILES 
${OSHMEM_SOURCE_FILES} ${OMPI_F77_BASE_FILES}) + SOURCE_GROUP(mpi\\f77\\base FILES ${OMPI_F77_BASE_FILES}) +ENDIF(OMPI_WANT_F77_BINDINGS) + + +INCLUDE (check_mca_subdirs) +SET (OSHMEM_SOURCE_FILES ${OSHMEM_SOURCE_FILES} ${MCA_FILES}) + +ADD_LIBRARY (libshmem ${OSHMEM_SOURCE_FILES}) + +SET_TARGET_PROPERTIES(libshmem PROPERTIES COMPILE_FLAGS + "${OSHMEM_C_DEF_PRE}OMPI_MPIHANDLES_DLL_PREFIX=liboshmem_dbg_mpihandles + ${OSHMEM_C_DEF_PRE}OMPI_MSGQ_DLL_PREFIX=libompi_dbg_msgq + ${OSHMEM_C_DEF_PRE}OMPI_BUILDING + ${OSHMEM_C_DEF_PRE}OMPI_BUILDING_CXX_BINDINGS_LIBRARY") + +# Set compile flags for this target +IF (BUILD_SHARED_LIBS) + ADD_DEPENDENCIES (libshmem libopen-pal libopen-rte) + SET_TARGET_PROPERTIES(libshmem PROPERTIES + COMPILE_FLAGS "-D_USRDLL -DHAVE_CONFIG_H -DOSHMEM_EXPORTS /TP") + TARGET_LINK_LIBRARIES (libshmem Ws2_32.lib shlwapi.lib) +ELSE (BUILD_SHARED_LIBS) + SET_TARGET_PROPERTIES(libshmem PROPERTIES COMPILE_FLAGS "-D_LIB") +ENDIF(BUILD_SHARED_LIBS) + +# generate shmem.h +CONFIGURE_FILE(${OpenMPI_SOURCE_DIR}/oshmem/include/shmem.h.in ${OpenMPI_BINARY_DIR}/oshmem/include/shmem.h) + +# generate oshmem_config.h +CONFIGURE_FILE(${OpenMPI_SOURCE_DIR}/oshmem/include/oshmem_config.h.in ${OpenMPI_BINARY_DIR}/oshmem/include/oshmem_config.h) + +# generate shmem_portable_platform.h +CONFIGURE_FILE(${OpenMPI_SOURCE_DIR}/oshmem/include/shmem_portable_platform.h.in ${OpenMPI_BINARY_DIR}/oshmem/include/shmem_portable_platform.h) + +# generate version.h +CONFIGURE_FILE(${OpenMPI_SOURCE_DIR}/oshmem/include/oshmem/version.h.in ${OpenMPI_BINARY_DIR}/oshmem/include/oshmem/version.h) + +ADD_SUBDIRECTORY(shmem) + + +# Install libraries headers, and shared files +INSTALL(TARGETS libshmem + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +INSTALL(FILES ${PROJECT_BINARY_DIR}/include/shmem.h ${PROJECT_BINARY_DIR}/include/shmem_portable_platform.h + DESTINATION include) + +IF (OMPI_DEBUG_BUILD) + INSTALL(FILES 
${OpenMPI_BINARY_DIR}/Debug/libmpi${CMAKE_DEBUG_POSTFIX}.pdb + DESTINATION bin) +ENDIF (OMPI_DEBUG_BUILD) + +IF(OMPI_WANT_F77_BINDINGS) + INSTALL(FILES ${PROJECT_BINARY_DIR}/include/mpif.h + ${PROJECT_BINARY_DIR}/include/mpif-config.h + ${PROJECT_SOURCE_DIR}/include/mpif-common.h + ${PROJECT_SOURCE_DIR}/include/mpif-mpi-io.h + DESTINATION include) +ENDIF(OMPI_WANT_F77_BINDINGS) + +INSTALL(FILES ${PROJECT_SOURCE_DIR}/runtime/help-shmem-runtime.txt ${PROJECT_SOURCE_DIR}/shmem/help-shmem-api.txt + DESTINATION share/openmpi) +INSTALL(DIRECTORY shmem/cxx/ DESTINATION include/openshmem/oshmem/shmem/cxx + FILES_MATCHING PATTERN "*.h" PATTERN ".svn" EXCLUDE) + diff --git a/oshmem/Makefile.am b/oshmem/Makefile.am new file mode 100644 index 0000000000..bb84887df6 --- /dev/null +++ b/oshmem/Makefile.am @@ -0,0 +1,130 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +if OSHMEM_SUPPORT + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +EXTRA_DIST = CMakeLists.txt + +# Do we have profiling? +if OSHMEM_PROFILING +c_pshmem_lib = shmem/c/profile/libshmem_c_pshmem.la +else +c_pshmem_lib = +endif + +#Irit +f77_shmem_lib = shmem/f77/libshmem_f77.la + +# Note that the ordering of "." in SUBDIRS is important: the C++, F77, +# and F90 bindings are all in standalone .la files that depend on +# libshmem.la. So we must fully build libshmem.la first. + +# NOTE: A handful of files in mpi/f77/base must be included in +# libshmem.la. 
But we wanted to keep all the Fortran sources together +# in the same tree, so we moved those sources to a separate +# subdirectory with its own Makefile.include that is included in this +# Makefile.am (NOTE: it did *not* work to put all the files -- base +# +# and non-base -- into mpi/f77 and have both a regular Makefile.am for +# building the f77 bindings library and a separate Makefile.include +# that was included in this top-level Makefile.am; problems occurred +# with "make distclean" and files in the ompi/mpi/f77/.deps directory +# -- it's not clear whether this is an AM bug or whether this behavior +# is simply not supported). This ompi/mpi/f77/base/Makefile.include +# file makes a convenience LT library that is then sucked into +# libshmem.la (the ompi/mpi/f77/base sources must be compiled with +# special CPPFLAGS; we can't just add the raw sources to +# libshmem_la_SOURCES, unfortunately). + +# The end result is that libshmem.la -- including a few sources +# from mpi/f77/base -- is fully built before the C++, F77, and F90 +# bindings are built. Therefore, the C++, F77 and F90 bindings +# libraries can all link against libshmem.la. + +SUBDIRS = \ + include \ + shmem/c \ + shmem/f77 \ + $(EXT_oshmem_FRAMEWORKS_SUBDIRS) \ + $(EXT_oshmem_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \ + $(MCA_oshmem_FRAMEWORKS_SUBDIRS) \ + $(MCA_oshmem_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \ + . 
\ + $(MCA_oshmem_FRAMEWORK_COMPONENT_DSO_SUBDIRS) + +DIST_SUBDIRS = \ + include \ + shmem/c \ + shmem/f77 \ + $(EXT_oshmem_FRAMEWORKS_SUBDIRS) \ + $(EXT_oshmem_FRAMEWORK_COMPONENT_ALL_SUBDIRS) \ + $(MCA_oshmem_FRAMEWORKS_SUBDIRS) \ + $(MCA_oshmem_FRAMEWORK_COMPONENT_ALL_SUBDIRS) + +#Build The main SHMEM library +lib_LTLIBRARIES = libshmem.la +libshmem_la_SOURCES = +libshmem_la_LIBADD = \ + shmem/c/libshmem_c.la \ + $(c_pshmem_lib) \ + $(f77_shmem_lib) \ + $(MCA_oshmem_FRAMEWORK_LIBS) \ + $(top_ompi_builddir)/ompi/libmpi.la +libshmem_la_DEPENDENCIES = $(libshmem_la_LIBADD) +libshmem_la_LDFLAGS = \ + -version-info $(libshmem_so_version) \ + $(OSHMEM_LIBSHMEM_EXTRA_LDFLAGS) + +# included subdirectory Makefile.am's and appended-to variables +headers = +noinst_LTLIBRARIES = +include_HEADERS = +nobase_oshmem_HEADERS = +dist_pkgdata_DATA = +libshmem_la_SOURCES += $(headers) +nodist_man_MANS = + +# Conditionally install the header files + +if WANT_INSTALL_HEADERS +oshmemdir = $(includedir)/oshmem/oshmem +nobase_oshmem_HEADERS += $(headers) +else +oshmemdir = $(includedir) +endif + +include op/Makefile.am +include proc/Makefile.am +include request/Makefile.am +include runtime/Makefile.am +include shmem/Makefile.am +include tools/Makefile.am + +# Ensure that the man page directory exists before we try to make man +# page files (because oshmem/shmem/man/man3 has no config.status-generated +# Makefile) +dir_stamp = $(top_builddir)/$(subdir)/shmem/man/man3/.dir-stamp + +# Also ensure that the man pages are rebuilt if the opal_config.h file +# changes (e.g., configure was run again, meaning that the release +# date or version may have changed) +$(nodist_man_MANS): $(dir_stamp) $(top_builddir)/opal/include/opal_config.h + +$(dir_stamp): + $(mkdir_p) `dirname $@` + touch "$@" + +# Remove the generated man pages +distclean-local: + rm -f $(nodist_man_MANS) $(dir_stamp) + +endif # OSHMEM_SUPPORT diff --git a/oshmem/include/Makefile.am b/oshmem/include/Makefile.am new file mode 
100644 index 0000000000..9a8ae53d9b --- /dev/null +++ b/oshmem/include/Makefile.am @@ -0,0 +1,67 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# mpif-common.h is not generated, but mpif.h and mpif-config.h are. +# See big comments in these files for an explanation. + +# note - headers and nodist_headers will go in ${includedir}/openmpi, +# not ${includedir}/ +headers = +nodist_headers = \ + oshmem_config.h + +# Install these in $(includedir) +include_HEADERS = + +# Install these in $(includedir) +mppincludedir = $(includedir)/mpp +mppinclude_HEADERS = mpp/shmem.h \ + mpp/shmem.fh + +# Always install these in $(pkgincludedir) +pkginclude_HEADERS = + +include_HEADERS += shmem.fh + +# pasha # if OMPI_WANT_F77_BINDINGS +# pasha # include_HEADERS = +# pasha # +# pasha # #include_HEADERS += \ +# pasha # # mpif-common.h \ +# pasha # # mpif-mpi-io.h +# pasha # endif + +# These files are always installed in $(includedir), but shouldn't be +# shipped since they are generated by configure from their .in +# counterparts (which AM automatically ships). +nodist_include_HEADERS = \ + shmem.h \ + shmem_portable_platform.h + +# pasha # if OMPI_WANT_F77_BINDINGS +# pasha # nodist_include_HEADERS = +# pasha # +# pasha # #nodist_include_HEADERS += \ +# pasha # # mpif.h \ +# pasha # # mpif-config.h +# pasha # endif + +if WANT_INSTALL_HEADERS +oshmemdir = $(includedir)/oshmem +nobase_dist_oshmem_HEADERS = $(headers) +nobase_nodist_oshmem_HEADERS = $(nodist_headers) +else +oshmemdir = $(includedir) +nobase_dist_noinst_HEADERS = $(headers) +nobase_nodist_noinst_HEADERS = $(nodist_headers) +endif + +distclean-local: + +include oshmem/Makefile.am diff --git a/oshmem/include/mpif-common.h b/oshmem/include/mpif-common.h new file mode 100644 index 0000000000..bdfe43f460 --- /dev/null +++ b/oshmem/include/mpif-common.h @@ -0,0 +1,457 @@ +! +! Copyright (c) 2012 Mellanox Technologies, Inc. 
+! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! Do ***not*** copy this file to the directory where your Fortran +! application is compiled unless it is absolutely necessary! Most +! modern Fortran compilers now support the -I command line flag, which +! tells the compiler where to find .h files (specifically, this one). For +! example: +! +! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include +! +! will probably do the trick (assuming that you have set OMPI_HOME +! properly). +! +! That being said, OMPI's "mpif77" wrapper compiler should +! automatically include the -I option for you. The following command +! should be equivalent to the command listed above: +! +! shell$ mpif77 foo.f -o foo +! +! You should not copy this file to your local directory because it is +! possible that this file will be changed between versions of Open MPI. +! Indeed, this mpif.h is incompatible with the mpif.h of other +! implementations of MPI. Using this mpif.h with other implementations +! of MPI, or with other versions of Open MPI will result in undefined +! behavior (to include incorrect results, segmentation faults, +! unexplainable "hanging" in your application, etc.). Always use the +! -I command line option instead (or let mpif77 do it for you). +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +! +! This file contains the bulk of the Open MPI Fortran interface. It +! is included as a back-end file to both mpif.h (i.e., the +! standardized MPI Fortran header file) and mpi.f90 (the MPI-2 +! 
Fortran module source file, found in ompi/mpi/f90). +! +! This file is marginally different than mpif.h. mpif.h includes +! some "external" statements that are not suitable for use with the +! MPI-2 F90 module, and therefore cannot be included in the mpi.f90 +! source file. Hence, this file is essentially everything that +! needs to be in the standardized mpif.h *except* the "external" +! statements, and is therefore suitable to be included in mpi.f90. +! + +! First, however, include some output from configure. +! + include 'mpif-config.h' + +! +! MPI version +! + integer MPI_VERSION, MPI_SUBVERSION + + parameter (MPI_VERSION=2) + parameter (MPI_SUBVERSION=1) +! +! Miscellaneous constants +! + integer MPI_ANY_SOURCE, MPI_ANY_TAG + integer MPI_PROC_NULL + integer MPI_ROOT + integer MPI_UNDEFINED + integer MPI_CART, MPI_GRAPH, MPI_KEYVAL_INVALID + integer MPI_SOURCE, MPI_TAG, MPI_ERROR + integer MPI_TAG_UB, MPI_HOST, MPI_IO, MPI_WTIME_IS_GLOBAL + integer MPI_APPNUM, MPI_LASTUSEDCODE, MPI_UNIVERSE_SIZE + integer IMPI_CLIENT_SIZE, IMPI_CLIENT_COLOR + integer IMPI_HOST_SIZE, IMPI_HOST_COLOR + integer MPI_BSEND_OVERHEAD + integer MPI_ORDER_C, MPI_ORDER_FORTRAN + integer MPI_DISTRIBUTE_BLOCK, MPI_DISTRIBUTE_CYCLIC + integer MPI_DISTRIBUTE_NONE, MPI_DISTRIBUTE_DFLT_DARG + integer MPI_TYPECLASS_INTEGER, MPI_TYPECLASS_REAL + integer MPI_TYPECLASS_COMPLEX + integer MPI_MODE_NOCHECK, MPI_MODE_NOPRECEDE, MPI_MODE_NOPUT + integer MPI_MODE_NOSTORE, MPI_MODE_NOSUCCEED + integer MPI_LOCK_EXCLUSIVE, MPI_LOCK_SHARED + integer MPI_WIN_BASE, MPI_WIN_SIZE, MPI_WIN_DISP_UNIT + + parameter (MPI_ANY_SOURCE=-1) + parameter (MPI_ANY_TAG=-1) + parameter (MPI_PROC_NULL=-2) + parameter (MPI_ROOT=-4) + parameter (MPI_UNDEFINED=-32766) + parameter (MPI_CART=1) + parameter (MPI_GRAPH=2) + parameter (MPI_KEYVAL_INVALID=-1) + parameter (MPI_SOURCE=1) + parameter (MPI_TAG=2) + parameter (MPI_ERROR=3) + parameter (MPI_TAG_UB=0) + parameter (MPI_HOST=1) + parameter (MPI_IO=2) + parameter 
(MPI_WTIME_IS_GLOBAL=3) + parameter (MPI_APPNUM=4) + parameter (MPI_LASTUSEDCODE=5) + parameter (MPI_UNIVERSE_SIZE=6) + parameter (MPI_WIN_BASE=7) + parameter (MPI_WIN_SIZE=8) + parameter (MPI_WIN_DISP_UNIT=9) + parameter (IMPI_CLIENT_SIZE=10) + parameter (IMPI_CLIENT_COLOR=11) + parameter (IMPI_HOST_SIZE=12) + parameter (IMPI_HOST_COLOR=13) + + parameter (MPI_BSEND_OVERHEAD=128) + parameter (MPI_ORDER_C=0) + parameter (MPI_ORDER_FORTRAN=1) + parameter (MPI_DISTRIBUTE_BLOCK=0) + parameter (MPI_DISTRIBUTE_CYCLIC=1) + parameter (MPI_DISTRIBUTE_NONE=2) + parameter (MPI_DISTRIBUTE_DFLT_DARG=-1) + parameter (MPI_TYPECLASS_INTEGER=1) + parameter (MPI_TYPECLASS_REAL=2) + parameter (MPI_TYPECLASS_COMPLEX=3) + parameter (MPI_MODE_NOCHECK=1) + parameter (MPI_MODE_NOPRECEDE=2) + parameter (MPI_MODE_NOPUT=4) + parameter (MPI_MODE_NOSTORE=8) + parameter (MPI_MODE_NOSUCCEED=16) + parameter (MPI_LOCK_EXCLUSIVE=1) + parameter (MPI_LOCK_SHARED=2) + +! +! MPI sentinel values +! +! Several of these types were chosen with care to match specific +! overloaded functions in the F90 bindings. They should also match +! the types of their corresponding C variables. Do not arbitrarily +! change their types without also updating the F90 bindings and +! their corresponding types in ompi/mpi/f77/constants.h and +! ompi/mpi/runtime/ompi_init.c! +! +! MPI_BOTTOM is only used where choice buffers can be used (meaning +! that we already have overloaded F90 bindings for all available +! types), so any type is fine. + integer MPI_BOTTOM +! MPI_IN_PLACE has the same rationale as MPI_BOTTOM. + integer MPI_IN_PLACE +! Making MPI_ARGV_NULL be the same type as the parameter that is +! exepected in the F90 binding for MPI_COMM_SPAWN means that we +! don't need another binding for MPI_COMM_SPAWN. + character MPI_ARGV_NULL(1) +! The array_of_argv parameter in the F90 bindings for +! MPI_COMM_SPAWN_MULTIPLE takes a variable number of dimensions +! 
(specified by the "count" parameter), so it's not possible to have +! a single variable match all possible values. Hence, make it an +! entirely different type (one that would never likely be used by a +! correct program, e.g., double) and have a separate F90 binding for +! matching just this type. + double precision MPI_ARGVS_NULL +! MPI_ERRCODES_IGNORE has similar rationale to MPI_ARGV_NULL. The +! F77 functions are all smart enough to check that the errcodes +! parameter is not ERRCODES_IGNORE before assigning values into it +! (hence, the fact that this is an array of only 1 element does not +! matter -- we'll never overrun it because we never assign values +! into it). + integer MPI_ERRCODES_IGNORE(1) +! MPI_STATUS_IGNORE has similar rationale to MPI_ERRCODES_IGNORE. + integer MPI_STATUS_IGNORE(MPI_STATUS_SIZE) +! MPI_STATUSES_IGNORE has similar rationale to MPI_ARGVS_NULL. + double precision MPI_STATUSES_IGNORE + + common/mpi_fortran_bottom/MPI_BOTTOM + common/mpi_fortran_in_place/MPI_IN_PLACE + common/mpi_fortran_argv_null/MPI_ARGV_NULL + common/mpi_fortran_argvs_null/MPI_ARGVS_NULL + common/mpi_fortran_errcodes_ignore/MPI_ERRCODES_IGNORE + common/mpi_fortran_status_ignore/MPI_STATUS_IGNORE + common/mpi_fortran_statuses_ignore/MPI_STATUSES_IGNORE +! +! NULL "handles" (indices) +! + integer MPI_GROUP_NULL, MPI_COMM_NULL, MPI_DATATYPE_NULL + integer MPI_REQUEST_NULL, MPI_OP_NULL, MPI_ERRHANDLER_NULL + integer MPI_INFO_NULL, MPI_WIN_NULL + + parameter (MPI_GROUP_NULL=0) + parameter (MPI_COMM_NULL=2) + parameter (MPI_DATATYPE_NULL=0) + parameter (MPI_REQUEST_NULL=0) + parameter (MPI_OP_NULL=0) + parameter (MPI_ERRHANDLER_NULL=0) + parameter (MPI_INFO_NULL=0) + parameter (MPI_WIN_NULL=0) +! +! MPI_Init_thread constants +! 
+ integer MPI_THREAD_SINGLE, MPI_THREAD_FUNNELED + integer MPI_THREAD_SERIALIZED, MPI_THREAD_MULTIPLE + + parameter (MPI_THREAD_SINGLE=0) + parameter (MPI_THREAD_FUNNELED=1) + parameter (MPI_THREAD_SERIALIZED=2) + parameter (MPI_THREAD_MULTIPLE=3) +! +! error classes +! + integer SHMEM_SUCCESS + integer SHMEM_ERR_BUFFER + integer SHMEM_ERR_COUNT + integer SHMEM_ERR_TYPE + integer SHMEM_ERR_TAG + integer SHMEM_ERR_COMM + integer SHMEM_ERR_RANK + integer SHMEM_ERR_REQUEST + integer SHMEM_ERR_ROOT + integer SHMEM_ERR_GROUP + integer SHMEM_ERR_OP + integer SHMEM_ERR_TOPOLOGY + integer SHMEM_ERR_DIMS + integer SHMEM_ERR_ARG + integer SHMEM_ERR_UNKNOWN + integer SHMEM_ERR_TRUNCATE + integer SHMEM_ERR_OTHER + integer SHMEM_ERR_INTERN + integer SHMEM_ERR_IN_STATUS + integer SHMEM_ERR_PENDING + integer SHMEM_ERR_ACCESS + integer SHMEM_ERR_AMODE + integer SHMEM_ERR_ASSERT + integer SHMEM_ERR_BAD_FILE + integer SHMEM_ERR_BASE + integer SHMEM_ERR_CONVERSION + integer SHMEM_ERR_DISP + integer SHMEM_ERR_DUP_DATAREP + integer SHMEM_ERR_FILE_EXISTS + integer SHMEM_ERR_FILE_IN_USE + integer SHMEM_ERR_FILE + integer SHMEM_ERR_INFO_KEY + integer SHMEM_ERR_INFO_NOKEY + integer SHMEM_ERR_INFO_VALUE + integer SHMEM_ERR_INFO + integer SHMEM_ERR_IO + integer SHMEM_ERR_KEYVAL + integer SHMEM_ERR_LOCKTYPE + integer SHMEM_ERR_NAME + integer SHMEM_ERR_NO_MEM + integer SHMEM_ERR_NOT_SAME + integer SHMEM_ERR_NO_SPACE + integer SHMEM_ERR_NO_SUCH_FILE + integer SHMEM_ERR_PORT + integer SHMEM_ERR_QUOTA + integer SHMEM_ERR_READ_ONLY + integer SHMEM_ERR_RMA_CONFLICT + integer SHMEM_ERR_RMA_SYNC + integer SHMEM_ERR_SERVICE + integer SHMEM_ERR_SIZE + integer SHMEM_ERR_SPAWN + integer SHMEM_ERR_UNSUPPORTED_DATAREP + integer SHMEM_ERR_UNSUPPORTED_OPERATION + integer SHMEM_ERR_WIN + + integer SHMEM_ERR_SYSRESOURCE + integer SHMEM_ERR_LASTCODE + + parameter( SHMEM_SUCCESS = 0) + parameter( SHMEM_ERR_BUFFER = 1) + parameter( SHMEM_ERR_COUNT = 2) + parameter( SHMEM_ERR_TYPE = 3) + parameter( SHMEM_ERR_TAG = 
4) + parameter( SHMEM_ERR_COMM = 5) + parameter( SHMEM_ERR_RANK = 6) + parameter( SHMEM_ERR_REQUEST = 7) + parameter( SHMEM_ERR_ROOT = 8) + parameter( SHMEM_ERR_GROUP = 9) + parameter( SHMEM_ERR_OP = 10) + parameter( SHMEM_ERR_TOPOLOGY = 11) + parameter( SHMEM_ERR_DIMS = 12) + parameter( SHMEM_ERR_ARG = 13) + parameter( SHMEM_ERR_UNKNOWN = 14) + parameter( SHMEM_ERR_TRUNCATE = 15) + parameter( SHMEM_ERR_OTHER = 16) + parameter( SHMEM_ERR_INTERN = 17) + parameter( SHMEM_ERR_IN_STATUS = 18) + parameter( SHMEM_ERR_PENDING = 19) + parameter( SHMEM_ERR_ACCESS = 20) + parameter( SHMEM_ERR_AMODE = 21) + parameter( SHMEM_ERR_ASSERT = 22) + parameter( SHMEM_ERR_BAD_FILE = 23) + parameter( SHMEM_ERR_BASE = 24) + parameter( SHMEM_ERR_CONVERSION = 25) + parameter( SHMEM_ERR_DISP = 26) + parameter( SHMEM_ERR_DUP_DATAREP = 27) + parameter( SHMEM_ERR_FILE_EXISTS = 28) + parameter( SHMEM_ERR_FILE_IN_USE = 29) + parameter( SHMEM_ERR_FILE = 30) + parameter( SHMEM_ERR_INFO_KEY = 31) + parameter( SHMEM_ERR_INFO_NOKEY = 32) + parameter( SHMEM_ERR_INFO_VALUE = 33) + parameter( SHMEM_ERR_INFO = 34) + parameter( SHMEM_ERR_IO = 35) + parameter( SHMEM_ERR_KEYVAL = 36) + parameter( SHMEM_ERR_LOCKTYPE = 37) + parameter( SHMEM_ERR_NAME = 38) + parameter( SHMEM_ERR_NO_MEM = 39) + parameter( SHMEM_ERR_NOT_SAME = 40) + parameter( SHMEM_ERR_NO_SPACE = 41) + parameter( SHMEM_ERR_NO_SUCH_FILE = 42) + parameter( SHMEM_ERR_PORT = 43) + parameter( SHMEM_ERR_QUOTA = 44) + parameter( SHMEM_ERR_READ_ONLY = 45) + parameter( SHMEM_ERR_RMA_CONFLICT = 46) + parameter( SHMEM_ERR_RMA_SYNC = 47) + parameter( SHMEM_ERR_SERVICE = 48) + parameter( SHMEM_ERR_SIZE = 49) + parameter( SHMEM_ERR_SPAWN = 50) + parameter( SHMEM_ERR_UNSUPPORTED_DATAREP = 51) + parameter( SHMEM_ERR_UNSUPPORTED_OPERATION= 52) + parameter( SHMEM_ERR_WIN = 53) + + parameter( SHMEM_ERR_SYSRESOURCE = -2) + parameter( SHMEM_ERR_LASTCODE = 54) + +! +! comparison results +! 
+ integer MPI_IDENT, MPI_CONGRUENT, MPI_SIMILAR, MPI_UNEQUAL + + parameter (MPI_IDENT=0) + parameter (MPI_CONGRUENT=1) + parameter (MPI_SIMILAR=2) + parameter (MPI_UNEQUAL=3) +! +! datatype combiners +! + integer MPI_COMBINER_NAMED + integer MPI_COMBINER_DUP + integer MPI_COMBINER_CONTIGUOUS + integer MPI_COMBINER_VECTOR + integer MPI_COMBINER_HVECTOR_INTEGER + integer MPI_COMBINER_HVECTOR + integer MPI_COMBINER_INDEXED + integer MPI_COMBINER_HINDEXED_INTEGER + integer MPI_COMBINER_HINDEXED + integer MPI_COMBINER_INDEXED_BLOCK + integer MPI_COMBINER_STRUCT_INTEGER + integer MPI_COMBINER_STRUCT + integer MPI_COMBINER_SUBARRAY + integer MPI_COMBINER_DARRAY + integer MPI_COMBINER_F90_REAL + integer MPI_COMBINER_F90_COMPLEX + integer MPI_COMBINER_F90_INTEGER + integer MPI_COMBINER_RESIZED + + parameter (MPI_COMBINER_NAMED=0) + parameter (MPI_COMBINER_DUP=1) + parameter (MPI_COMBINER_CONTIGUOUS=2) + parameter (MPI_COMBINER_VECTOR=3) + parameter (MPI_COMBINER_HVECTOR_INTEGER=4) + parameter (MPI_COMBINER_HVECTOR=5) + parameter (MPI_COMBINER_INDEXED=6) + parameter (MPI_COMBINER_HINDEXED_INTEGER=7) + parameter (MPI_COMBINER_HINDEXED=8) + parameter (MPI_COMBINER_INDEXED_BLOCK=9) + parameter (MPI_COMBINER_STRUCT_INTEGER=10) + parameter (MPI_COMBINER_STRUCT=11) + parameter (MPI_COMBINER_SUBARRAY=12) + parameter (MPI_COMBINER_DARRAY=13) + parameter (MPI_COMBINER_F90_REAL=14) + parameter (MPI_COMBINER_F90_COMPLEX=15) + parameter (MPI_COMBINER_F90_INTEGER=16) + parameter (MPI_COMBINER_RESIZED=17) +! +! lookup table indices +! 
+ integer MPI_COMM_WORLD, MPI_COMM_SELF + integer MPI_GROUP_EMPTY + integer MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN + + parameter (MPI_COMM_WORLD=0) + parameter (MPI_COMM_SELF=1) + parameter (MPI_GROUP_EMPTY=1) + parameter (MPI_ERRORS_ARE_FATAL=1) + parameter (MPI_ERRORS_RETURN=2) + + integer MPI_BYTE, MPI_PACKED, MPI_UB, MPI_LB + integer MPI_CHARACTER, MPI_LOGICAL + integer MPI_INTEGER, MPI_INTEGER1, MPI_INTEGER2, MPI_INTEGER4 + integer MPI_INTEGER8, MPI_INTEGER16 + integer MPI_REAL, MPI_REAL2, MPI_REAL4, MPI_REAL8, MPI_REAL16 + integer MPI_DOUBLE_PRECISION + integer MPI_COMPLEX, MPI_COMPLEX8, MPI_COMPLEX16, MPI_COMPLEX32 + integer MPI_DOUBLE_COMPLEX + integer MPI_2REAL, MPI_2DOUBLE_PRECISION, MPI_2INTEGER + integer MPI_2COMPLEX, MPI_2DOUBLE_COMPLEX +! Note that MPI_LOGICALx are not defined by the MPI spec, but there are +! other MPI implementations that have them, so it's good for us to have +! as well. + integer MPI_LOGICAL1, MPI_LOGICAL2, MPI_LOGICAL4, MPI_LOGICAL8 + +! +! Do NOT change the order of these parameters +! 
+ parameter (MPI_BYTE=1) + parameter (MPI_PACKED=2) + parameter (MPI_UB=3) + parameter (MPI_LB=4) + parameter (MPI_CHARACTER=5) + parameter (MPI_LOGICAL=6) + parameter (MPI_INTEGER=7) + parameter (MPI_INTEGER1=8) + parameter (MPI_INTEGER2=9) + parameter (MPI_INTEGER4=10) + parameter (MPI_INTEGER8=11) + parameter (MPI_INTEGER16=12) + parameter (MPI_REAL=13) + parameter (MPI_REAL4=14) + parameter (MPI_REAL8=15) + parameter (MPI_REAL16=16) + parameter (MPI_DOUBLE_PRECISION=17) + parameter (MPI_COMPLEX=18) + parameter (MPI_COMPLEX8=19) + parameter (MPI_COMPLEX16=20) + parameter (MPI_COMPLEX32=21) + parameter (MPI_DOUBLE_COMPLEX=22) + parameter (MPI_2REAL=23) + parameter (MPI_2DOUBLE_PRECISION=24) + parameter (MPI_2INTEGER=25) + parameter (MPI_2COMPLEX=26) + parameter (MPI_2DOUBLE_COMPLEX=27) + parameter (MPI_REAL2=28) + parameter (MPI_LOGICAL1=29) + parameter (MPI_LOGICAL2=30) + parameter (MPI_LOGICAL4=31) + parameter (MPI_LOGICAL8=32) + + integer MPI_MAX, MPI_MIN, MPI_SUM, MPI_PROD, MPI_LAND + integer MPI_BAND, MPI_LOR, MPI_BOR, MPI_LXOR, MPI_BXOR + integer MPI_MAXLOC, MPI_MINLOC, MPI_REPLACE + + parameter (MPI_MAX=1) + parameter (MPI_MIN=2) + parameter (MPI_SUM=3) + parameter (MPI_PROD=4) + parameter (MPI_LAND=5) + parameter (MPI_BAND=6) + parameter (MPI_LOR=7) + parameter (MPI_BOR=8) + parameter (MPI_LXOR=9) + parameter (MPI_BXOR=10) + parameter (MPI_MAXLOC=11) + parameter (MPI_MINLOC=12) + parameter (MPI_REPLACE=13) diff --git a/oshmem/include/mpif-config.h.in b/oshmem/include/mpif-config.h.in new file mode 100644 index 0000000000..7e7d6fa8ee --- /dev/null +++ b/oshmem/include/mpif-config.h.in @@ -0,0 +1,99 @@ +! -*- fortran -*- +! +! Copyright (c) 2012 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! 
WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! Do ***not*** copy this file to the directory where your Fortran +! application is compiled unless it is absolutely necessary! Most +! modern Fortran compilers now support the -I command line flag, which +! tells the compiler where to find .h files (specifically, this one). For +! example: +! +! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include +! +! will probably do the trick (assuming that you have set OMPI_HOME +! properly). +! +! That being said, OMPI's "mpif77" wrapper compiler should +! automatically include the -I option for you. The following command +! should be equivalent to the command listed above: +! +! shell$ mpif77 foo.f -o foo +! +! You should not copy this file to your local directory because it is +! possible that this file will be changed between versions of Open MPI. +! Indeed, this mpif.h is incompatible with the mpif.h of other +! implementations of MPI. Using this mpif.h with other implementations +! of MPI, or with other versions of Open MPI will result in undefined +! behavior (to include incorrect results, segmentation faults, +! unexplainable "hanging" in your application, etc.). Always use the +! -I command line option instead (or let mpif77 do it for you). +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +! +! This file is included as a back-end file to both mpif.h (i.e., the +! standardized MPI Fortran header file) and a bunch of the MPI +! Fortran 90 subroutine implementations found in ompi/mpi/f90. +! +! This file contains the output from configure that is relevant for +! Fortran applications (both 77 and 90) and a few values that are +! 
necessary to compile the F90 module (e.g., MPI_STATUS_SIZE). +! + +! Include the MPI I/O stuff, if needed + @OMPI_MPIF_MPI_IO_INCLUDE@ + +! +! OMPI version +! This file is generated from configure; do not edit it manually. +! + integer OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION + integer OMPI_RELEASE_VERSION + character*32 OMPI_GREEK_VERSION + character*32 OMPI_SVN_VERSION + parameter (OMPI_MAJOR_VERSION=@OMPI_MAJOR_VERSION@) + parameter (OMPI_MINOR_VERSION=@OMPI_MINOR_VERSION@) + parameter (OMPI_RELEASE_VERSION=@OMPI_RELEASE_VERSION@) + parameter (OMPI_GREEK_VERSION="@OMPI_GREEK_VERSION@") + parameter (OMPI_SVN_VERSION="@OMPI_SVN_R@") +! +! Kind parameters +! + integer MPI_OFFSET_KIND, MPI_ADDRESS_KIND, MPI_INTEGER_KIND + parameter (MPI_INTEGER_KIND=@OMPI_MPI_INTEGER_KIND@) + parameter (MPI_ADDRESS_KIND=@OMPI_MPI_ADDRESS_KIND@) + parameter (MPI_OFFSET_KIND=@OMPI_MPI_OFFSET_KIND@) +! +! Miscellaneous constants +! + integer MPI_STATUS_SIZE + parameter (MPI_STATUS_SIZE=5) +! +! Configurable length constants +! + integer MPI_MAX_PROCESSOR_NAME + integer MPI_MAX_ERROR_STRING + integer MPI_MAX_OBJECT_NAME + integer MPI_MAX_INFO_KEY + integer MPI_MAX_INFO_VAL + integer MPI_MAX_PORT_NAME + integer MPI_MAX_DATAREP_STRING + parameter (MPI_MAX_PROCESSOR_NAME=@OPAL_MAX_PROCESSOR_NAME@-1) + parameter (MPI_MAX_ERROR_STRING=@OPAL_MAX_ERROR_STRING@-1) + parameter (MPI_MAX_OBJECT_NAME=@OPAL_MAX_OBJECT_NAME@-1) + parameter (MPI_MAX_INFO_KEY=@OPAL_MAX_INFO_KEY@-1) + parameter (MPI_MAX_INFO_VAL=@OPAL_MAX_INFO_VAL@-1) + parameter (MPI_MAX_PORT_NAME=@OPAL_MAX_PORT_NAME@-1) + parameter (MPI_MAX_DATAREP_STRING=@OPAL_MAX_DATAREP_STRING@-1) diff --git a/oshmem/include/mpif-mpi-io.h b/oshmem/include/mpif-mpi-io.h new file mode 100644 index 0000000000..a3acc78204 --- /dev/null +++ b/oshmem/include/mpif-mpi-io.h @@ -0,0 +1,74 @@ +!Copyright (c) 2012 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! 
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! Do ***not*** copy this file to the directory where your Fortran +! application is compiled unless it is absolutely necessary! Most +! modern Fortran compilers now support the -I command line flag, which +! tells the compiler where to find .h files (specifically, this one). For +! example: +! +! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include +! +! will probably do the trick (assuming that you have set OMPI_HOME +! properly). +! +! That being said, OMPI's "mpif77" wrapper compiler should +! automatically include the -I option for you. The following command +! should be equivalent to the command listed above: +! +! shell$ mpif77 foo.f -o foo +! +! You should not copy this file to your local directory because it is +! possible that this file will be changed between versions of Open MPI. +! Indeed, this mpif.h is incompatible with the mpif.h of other +! implementations of MPI. Using this mpif.h with other implementations +! of MPI, or with other versions of Open MPI will result in undefined +! behavior (to include incorrect results, segmentation faults, +! unexplainable "hanging" in your application, etc.). Always use the +! -I command line option instead (or let mpif77 do it for you). +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +! +! This file is included as a back-end file to both mpif.h (i.e., the +! standardized MPI Fortran header file) and a bunch of the MPI +! Fortran 90 subroutine implementations found in ompi/mpi/f90. +! +! This file contains the output from configure that is relevant for +! 
Fortran applications (both 77 and 90) and a few values that are +! necessary to compile the F90 module (e.g., MPI_STATUS_SIZE). +! + + integer MPI_FILE_NULL + integer MPI_SEEK_SET, MPI_SEEK_CUR, MPI_SEEK_END + integer MPI_MODE_CREATE + integer MPI_MODE_RDONLY, MPI_MODE_WRONLY, MPI_MODE_RDWR + integer MPI_MODE_DELETE_ON_CLOSE, MPI_MODE_UNIQUE_OPEN + integer MPI_MODE_EXCL, MPI_MODE_APPEND, MPI_MODE_SEQUENTIAL + integer MPI_DISPLACEMENT_CURRENT + + parameter (MPI_FILE_NULL=0) + parameter (MPI_SEEK_SET=600) + parameter (MPI_SEEK_CUR=602) + parameter (MPI_SEEK_END=604) + parameter (MPI_MODE_CREATE=1) + parameter (MPI_MODE_RDONLY=2) + parameter (MPI_MODE_WRONLY=4) + parameter (MPI_MODE_RDWR=8) + parameter (MPI_MODE_DELETE_ON_CLOSE=16) + parameter (MPI_MODE_UNIQUE_OPEN=32) + parameter (MPI_MODE_EXCL=64) + parameter (MPI_MODE_APPEND=128) + parameter (MPI_MODE_SEQUENTIAL=256) + parameter (MPI_DISPLACEMENT_CURRENT=-54278278) diff --git a/oshmem/include/mpif.h.in b/oshmem/include/mpif.h.in new file mode 100644 index 0000000000..4540ba2f11 --- /dev/null +++ b/oshmem/include/mpif.h.in @@ -0,0 +1,76 @@ +! -*- fortran -*- +! +! Copyright (c) 2012 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! +! Do ***not*** copy this file to the directory where your Fortran +! application is compiled unless it is absolutely necessary! Most +! modern Fortran compilers now support the -I command line flag, which +! tells the compiler where to find .h files (specifically, this one). For +! example: +! +! shell$ mpif77 foo.f -o foo -I$OMPI_HOME/include +! +! will probably do the trick (assuming that you have set OMPI_HOME +! properly). +! +! 
That being said, OMPI's "mpif77" wrapper compiler should +! automatically include the -I option for you. The following command +! should be equivalent to the command listed above: +! +! shell$ mpif77 foo.f -o foo +! +! You should not copy this file to your local directory because it is +! possible that this file will be changed between versions of Open MPI. +! Indeed, this mpif.h is incompatible with the mpif.h of other +! implementations of MPI. Using this mpif.h with other implementations +! of MPI, or with other versions of Open MPI will result in undefined +! behavior (to include incorrect results, segmentation faults, +! unexplainable "hanging" in your application, etc.). Always use the +! -I command line option instead (or let mpif77 do it for you). +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +! +! Include the back-end file that has the bulk of the MPI Fortran +! interface. +! + + include 'mpif-common.h' + +! +! These "external" statements are specific to the MPI F77 interface +! (and are toxic to the MPI F90 interface), and are therefore in the +! MPI F77-specific header file (i.e., this one). +! + external MPI_NULL_COPY_FN, MPI_NULL_DELETE_FN + external MPI_COMM_NULL_COPY_FN, MPI_COMM_NULL_DELETE_FN + external MPI_TYPE_NULL_COPY_FN, MPI_TYPE_NULL_DELETE_FN + external MPI_DUP_FN, MPI_COMM_DUP_FN, MPI_TYPE_DUP_FN + external MPI_WIN_NULL_COPY_FN + external MPI_WIN_NULL_DELETE_FN + external MPI_WIN_DUP_FN +! Note that MPI_CONVERSION_FN_NULL is a "constant" (it is only ever +! checked for comparison; it is never invoked), but it is passed as +! a function pointer (to MPI_REGISTER_DATAREP) and therefore must be +! the same size/type. It is therefore external'ed here, and not +! defined with an integer value in mpif-common.h. + external MPI_CONVERSION_FN_NULL + +! +! 
double precision functions +! + external MPI_WTIME, MPI_WTICK @MPIF_H_PMPI_W_FUNCS@ + double precision MPI_WTIME, MPI_WTICK @MPIF_H_PMPI_W_FUNCS@ + diff --git a/oshmem/include/mpp/shmem.fh b/oshmem/include/mpp/shmem.fh new file mode 100644 index 0000000000..791d5b514a --- /dev/null +++ b/oshmem/include/mpp/shmem.fh @@ -0,0 +1,11 @@ +! +! Copyright (c) 2012 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! + + include 'shmem.fh' diff --git a/oshmem/include/mpp/shmem.h b/oshmem/include/mpp/shmem.h new file mode 100644 index 0000000000..91e2908213 --- /dev/null +++ b/oshmem/include/mpp/shmem.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef __MPP_SHMEM_H__ +#define __MPP_SHMEM_H__ + +#include <shmem.h> + +#endif /* __MPP_SHMEM_H__ */ \ No newline at end of file diff --git a/oshmem/include/oshmem/Makefile.am b/oshmem/include/oshmem/Makefile.am new file mode 100644 index 0000000000..bad2701b27 --- /dev/null +++ b/oshmem/include/oshmem/Makefile.am @@ -0,0 +1,15 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ + + +headers += \ + oshmem/constants.h \ + oshmem/types.h + +nodist_headers += \ + oshmem/version.h diff --git a/oshmem/include/oshmem/constants.h b/oshmem/include/oshmem/constants.h new file mode 100644 index 0000000000..b50c22682f --- /dev/null +++ b/oshmem/include/oshmem/constants.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSHMEM_CONSTANTS_H +#define OSHMEM_CONSTANTS_H + +#include "orte/constants.h" +#if defined(OSHMEM_PROFILING) && (OSHMEM_PROFILING == 1) +#include "oshmem/shmem/c/profile/defines.h" +#endif +#include "oshmem/include/shmem.h" + + +#define OSHMEM_ERR_BASE ORTE_ERR_MAX + +/* error codes */ +enum { + /* Error codes inherited from ORTE/OPAL. Still enum values so + that we might get nice debugger help */ + OSHMEM_SUCCESS = ORTE_SUCCESS, + + OSHMEM_ERROR = ORTE_ERROR, + OSHMEM_ERR_OUT_OF_RESOURCE = ORTE_ERR_OUT_OF_RESOURCE, + OSHMEM_ERR_TEMP_OUT_OF_RESOURCE = ORTE_ERR_TEMP_OUT_OF_RESOURCE, + OSHMEM_ERR_RESOURCE_BUSY = ORTE_ERR_RESOURCE_BUSY, + OSHMEM_ERR_BAD_PARAM = ORTE_ERR_BAD_PARAM, + OSHMEM_ERR_FATAL = ORTE_ERR_FATAL, + OSHMEM_ERR_NOT_IMPLEMENTED = ORTE_ERR_NOT_IMPLEMENTED, + OSHMEM_ERR_NOT_SUPPORTED = ORTE_ERR_NOT_SUPPORTED, + OSHMEM_ERR_INTERUPTED = ORTE_ERR_INTERUPTED, + OSHMEM_ERR_WOULD_BLOCK = ORTE_ERR_WOULD_BLOCK, + OSHMEM_ERR_IN_ERRNO = ORTE_ERR_IN_ERRNO, + OSHMEM_ERR_UNREACH = ORTE_ERR_UNREACH, + OSHMEM_ERR_NOT_FOUND = ORTE_ERR_NOT_FOUND, + OSHMEM_EXISTS = ORTE_EXISTS, /* indicates that the specified object already exists */ + OSHMEM_ERR_TIMEOUT = ORTE_ERR_TIMEOUT, + OSHMEM_ERR_NOT_AVAILABLE = ORTE_ERR_NOT_AVAILABLE, + OSHMEM_ERR_PERM = ORTE_ERR_PERM, + OSHMEM_ERR_VALUE_OUT_OF_BOUNDS = ORTE_ERR_VALUE_OUT_OF_BOUNDS, + OSHMEM_ERR_FILE_READ_FAILURE = ORTE_ERR_FILE_READ_FAILURE, + OSHMEM_ERR_FILE_WRITE_FAILURE = ORTE_ERR_FILE_WRITE_FAILURE, + OSHMEM_ERR_FILE_OPEN_FAILURE = ORTE_ERR_FILE_OPEN_FAILURE, + + OSHMEM_ERR_RECV_LESS_THAN_POSTED = ORTE_ERR_RECV_LESS_THAN_POSTED, + OSHMEM_ERR_RECV_MORE_THAN_POSTED = ORTE_ERR_RECV_MORE_THAN_POSTED, + OSHMEM_ERR_NO_MATCH_YET = ORTE_ERR_NO_MATCH_YET, + OSHMEM_ERR_BUFFER = ORTE_ERR_BUFFER, + OSHMEM_ERR_REQUEST = ORTE_ERR_REQUEST, + OSHMEM_ERR_NO_CONNECTION_ALLOWED = ORTE_ERR_NO_CONNECTION_ALLOWED, + OSHMEM_ERR_CONNECTION_REFUSED = 
ORTE_ERR_CONNECTION_REFUSED , + OSHMEM_ERR_CONNECTION_FAILED = ORTE_ERR_CONNECTION_FAILED, + OSHMEM_PACK_MISMATCH = ORTE_ERR_PACK_MISMATCH, + OSHMEM_ERR_PACK_FAILURE = ORTE_ERR_PACK_FAILURE, + OSHMEM_ERR_UNPACK_FAILURE = ORTE_ERR_UNPACK_FAILURE, + OSHMEM_ERR_COMM_FAILURE = ORTE_ERR_COMM_FAILURE, + OSHMEM_UNPACK_INADEQUATE_SPACE = ORTE_ERR_UNPACK_INADEQUATE_SPACE, + OSHMEM_UNPACK_READ_PAST_END_OF_BUFFER = ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER, + OSHMEM_ERR_TYPE_MISMATCH = ORTE_ERR_TYPE_MISMATCH, + OSHMEM_ERR_COMPARE_FAILURE = ORTE_ERR_COMPARE_FAILURE, + OSHMEM_ERR_COPY_FAILURE = ORTE_ERR_COPY_FAILURE, + OSHMEM_ERR_UNKNOWN_DATA_TYPE = ORTE_ERR_UNKNOWN_DATA_TYPE, + OSHMEM_ERR_DATA_TYPE_REDEF = ORTE_ERR_DATA_TYPE_REDEF, + OSHMEM_ERR_DATA_OVERWRITE_ATTEMPT = ORTE_ERR_DATA_OVERWRITE_ATTEMPT +}; + +#define OSHMEM_ERR_MAX (OSHMEM_ERR_BASE - 1) + + +/* C datatypes */ +/* + * Datatype identifiers; the values are implicit and sequential. + * Do not change the order of these, since reordering renumbers them. + */ +enum { + SHMEM_NULL = 0, + SHMEM_CHAR, + SHMEM_UCHAR, + SHMEM_SHORT, + SHMEM_USHORT, + SHMEM_INT, + SHMEM_UINT, + SHMEM_LONG, + SHMEM_ULONG, + SHMEM_LLONG, + SHMEM_ULLONG, + SHMEM_FLOAT, + SHMEM_DOUBLE, + SHMEM_LDOUBLE, + + SHMEM_FINT, + SHMEM_FINT4, + SHMEM_FINT8 +}; + + +/* + * Miscellaneous constants + */ +#define SHMEM_ANY_SOURCE -1 /* match any source rank */ +#define SHMEM_PROC_NULL -2 /* rank of null process */ +#define SHMEM_UNDEFINED -32766 /* undefined stuff */ + + +#ifndef UNREFERENCED_PARAMETER +#define UNREFERENCED_PARAMETER(P) ((void)P) +#endif + +#define OSHMEM_PREDEFINED_GLOBAL(type, global) ((type) ((void *) &(global))) + +#if OMPI_WANT_MEMCHECKER +#define MEMCHECKER(x) do { \ + x; \ + } while(0) +#else +#define MEMCHECKER(x) +#endif /* OMPI_WANT_MEMCHECKER */ + + +#endif /* OSHMEM_CONSTANTS_H */ + diff --git a/oshmem/include/oshmem/types.h b/oshmem/include/oshmem/types.h new file mode 100644 index 0000000000..08696b3218 --- /dev/null +++ b/oshmem/include/oshmem/types.h @@ -0,0 
+1,23 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef SHMEM_TYPES_H +#define SHMEM_TYPES_H + + +/* + * Predefine some internal types so we don't need all the include + * dependencies. + */ + + struct oshmem_proc_t; + struct oshmem_group_t; + struct oshmem_op_t; + +#endif diff --git a/oshmem/include/oshmem/version.h.in b/oshmem/include/oshmem/version.h.in new file mode 100644 index 0000000000..d36adcdc3f --- /dev/null +++ b/oshmem/include/oshmem/version.h.in @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * This file should be included by any file that needs full + * version information for the OSHMEM project + */ + +#ifndef OSHMEM_VERSIONS_H +#define OSHMEM_VERSIONS_H + +#define OSHMEM_MAJOR_VERSION @OSHMEM_MAJOR_VERSION@ +#define OSHMEM_MINOR_VERSION @OSHMEM_MINOR_VERSION@ +#define OSHMEM_RELEASE_VERSION @OSHMEM_RELEASE_VERSION@ +#define OSHMEM_GREEK_VERSION "@OSHMEM_GREEK_VERSION@" +#define OSHMEM_WANT_REPO_REV @OSHMEM_WANT_REPO_REV@ +#define OSHMEM_REPO_REV "@OSHMEM_REPO_REV@" +#ifdef OSHMEM_VERSION +/* If we included version.h, we want the real version, not the + stripped (no-r number) version */ +#undef OSHMEM_VERSION +#endif +#define OSHMEM_VERSION "@OSHMEM_VERSION@" + +#endif diff --git a/oshmem/include/oshmem_config.h.in b/oshmem/include/oshmem_config.h.in new file mode 100644 index 0000000000..6e87cd79bb --- /dev/null +++ b/oshmem/include/oshmem_config.h.in @@ -0,0 +1,125 @@ +/* -*- c -*- + * + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * Function: - OS, CPU and compiler dependent configuration + */ + +#ifndef OSHMEM_CONFIG_H +#define OSHMEM_CONFIG_H + +#include "opal_config.h" + +#define OSHMEM_IDENT_STRING OPAL_IDENT_STRING + +/*********************************************************************** + * + * OMPI-specific Fortran code that should be in ompi_config.h, but not + * in the other projects. + * + **********************************************************************/ + +/* MPI_Fint is the same as ompi_fortran_INTEGER_t */ +#define MPI_Fint ompi_fortran_integer_t + +#if OMPI_HAVE_FORTRAN_REAL && OMPI_HAVE_FORTRAN_COMPLEX +/* * C type for Fortran COMPLEX */ +/*typedef struct { + ompi_fortran_real_t real; + ompi_fortran_real_t imag; +} ompi_fortran_complex_t;*/ +#endif + +#if OMPI_HAVE_FORTRAN_REAL4 && OMPI_HAVE_FORTRAN_COMPLEX8 +/* * C type for Fortran COMPLEX*8 */ +/*typedef struct { + ompi_fortran_real4_t real; + ompi_fortran_real4_t imag; +} ompi_fortran_complex8_t;*/ +#endif + +#if OMPI_HAVE_FORTRAN_REAL8 && OMPI_HAVE_FORTRAN_COMPLEX16 +/* * C type for Fortran COMPLEX*16 */ +/*typedef struct { + ompi_fortran_real8_t real; + ompi_fortran_real8_t imag; +} ompi_fortran_complex16_t;*/ +#endif + +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_HAVE_FORTRAN_COMPLEX32 +/* * C type for Fortran COMPLEX*32 */ +/*typedef struct { + ompi_fortran_real16_t real; + ompi_fortran_real16_t imag; +} ompi_fortran_complex32_t;*/ +#endif + +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +/* * C type for Fortran DOUBLE COMPLEX */ +/*typedef struct { + ompi_fortran_double_precision_t real; + ompi_fortran_double_precision_t imag; +} ompi_fortran_double_complex_t;*/ +#endif + +#if OPAL_HAVE_ATTRIBUTE_DESTRUCTOR +# define __opal_attribute_destructor__ __attribute__((__destructor__)) +#else +# define __opal_attribute_destructor__ +#endif + +#if defined(__WINDOWS__) + +# if defined(_USRDLL) /* building shared libraries (.DLL) */ +# if 
defined(OSHMEM_EXPORTS) +# define OSHMEM_DECLSPEC __declspec(dllexport) +# define OSHMEM_MODULE_DECLSPEC +# else +# define OSHMEM_DECLSPEC __declspec(dllimport) +# if defined(OSHMEM_MODULE_EXPORTS) +# define OSHMEM_MODULE_DECLSPEC __declspec(dllexport) +# else +# define OSHMEM_MODULE_DECLSPEC __declspec(dllimport) +# endif /* defined(OSHMEM_MODULE_EXPORTS) */ +# endif /* defined(OSHMEM_EXPORTS) */ +# else /* building static library */ +# if defined(OSHMEM_IMPORTS) +# define OSHMEM_DECLSPEC __declspec(dllimport) +# else +# define OSHMEM_DECLSPEC +# endif /* defined(OSHMEM_IMPORTS) */ +# define OSHMEM_MODULE_DECLSPEC +# endif /* defined(_USRDLL) */ + +#else + +# if OPAL_C_HAVE_VISIBILITY +# ifndef OSHMEM_DECLSPEC +# define OSHMEM_DECLSPEC __opal_attribute_visibility__("default") +# endif +# ifndef OSHMEM_MODULE_DECLSPEC +# define OSHMEM_MODULE_DECLSPEC __opal_attribute_visibility__("default") +# endif +# ifndef OSHMEM_DESTRUCTOR +# define OSHMEM_DESTRUCTOR __opal_attribute_destructor__ +# endif +# else +# ifndef OSHMEM_DECLSPEC +# define OSHMEM_DECLSPEC +# endif +# ifndef OSHMEM_MODULE_DECLSPEC +# define OSHMEM_MODULE_DECLSPEC +# endif +# ifndef OSHMEM_DESTRUCTOR +# define OSHMEM_DESTRUCTOR +# endif +# endif +#endif /* defined(__WINDOWS__) */ + +#endif diff --git a/oshmem/include/shmem.fh b/oshmem/include/shmem.fh new file mode 100644 index 0000000000..16618db756 --- /dev/null +++ b/oshmem/include/shmem.fh @@ -0,0 +1,55 @@ +! Emacs: -*- mode: fortran; -*- +! +! Copyright (c) 2012 Mellanox Technologies, Inc. +! All rights reserved. +! $COPYRIGHT$ +! +! Additional copyrights may follow +! +! $HEADER$ +! + +! +! TODO: exact values should be found during configuration +! 
+ + integer SHMEM_BARRIER_SYNC_SIZE + parameter ( SHMEM_BARRIER_SYNC_SIZE = 4 ) + + integer SHMEM_BCAST_SYNC_SIZE + parameter ( SHMEM_BCAST_SYNC_SIZE = 8 ) + + + integer SHMEM_COLLECT_SYNC_SIZE + parameter ( SHMEM_COLLECT_SYNC_SIZE = 8 ) + + integer SHMEM_REDUCE_SYNC_SIZE + parameter ( SHMEM_REDUCE_SYNC_SIZE = 8 ) + + integer SHMEM_SYNC_VALUE + parameter ( SHMEM_SYNC_VALUE = -1 ) + + integer SHMEM_REDUCE_MIN_WRKDATA_SIZE + parameter ( SHMEM_REDUCE_MIN_WRKDATA_SIZE = 8 ) + +! +! waits +! + integer SHMEM_CMP_EQ + parameter ( SHMEM_CMP_EQ = 0 ) + integer SHMEM_CMP_NE + parameter ( SHMEM_CMP_NE = 1 ) + integer SHMEM_CMP_GT + parameter ( SHMEM_CMP_GT = 2 ) + integer SHMEM_CMP_LE + parameter ( SHMEM_CMP_LE = 3 ) + integer SHMEM_CMP_LT + parameter ( SHMEM_CMP_LT = 4 ) + integer SHMEM_CMP_GE + parameter ( SHMEM_CMP_GE = 5 ) + + + logical shmem_pe_accessible + logical shmem_addr_accessible + + integer*8 shmem_ptr diff --git a/oshmem/include/shmem.h.in b/oshmem/include/shmem.h.in new file mode 100644 index 0000000000..0f7727bdea --- /dev/null +++ b/oshmem/include/shmem.h.in @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSHMEM_SHMEM_H +#define OSHMEM_SHMEM_H + + +#include <stddef.h> /* include for ptrdiff_t and size_t */ + +#if defined(WIN32) || defined(_WIN32) +# define OSHMEM_COMPLEX_TYPE(type) +#else +# if defined(c_plusplus) || defined(__cplusplus) +# include <complex> +# define OSHMEM_COMPLEX_TYPE(type) std::complex<type> +# else +# include <complex.h> +# define OSHMEM_COMPLEX_TYPE(type) type complex +# endif /* defined(c_plusplus) || defined(__cplusplus) */ +#endif /* defined(WIN32) || defined(_WIN32) */ + + +/* + * SHMEM version + */ +#define SHMEM_VERSION 1 +#define SHMEM_SUBVERSION 5 + + +#ifndef OSHMEM_DECLSPEC +# if defined(WIN32) || defined(_WIN32) +# if defined(OSHMEM_IMPORTS) +# define OSHMEM_DECLSPEC __declspec(dllimport) +# else +# define OSHMEM_DECLSPEC +# endif /* defined(OSHMEM_IMPORTS) */ +# else +# if defined(OPAL_C_HAVE_VISIBILITY) && (OPAL_C_HAVE_VISIBILITY == 1) +# define OSHMEM_DECLSPEC __attribute__((visibility("default"))) +# else +# define OSHMEM_DECLSPEC +# endif +# endif +#endif + +#ifndef OSHMEM_DESTRUCTOR +# if defined(OPAL_C_HAVE_VISIBILITY) && (OPAL_C_HAVE_VISIBILITY == 1) +# define OSHMEM_DESTRUCTOR __attribute__((__destructor__)) +# else +# define OSHMEM_DESTRUCTOR +# endif +#endif + + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + + +/* + * OpenSHMEM API (www.openshmem.org) + */ + +/* + * Environment variables + */ + +/* size of symmetric heap in bytes. 
+ * Can be qualified with the letter 'K', 'M', 'G' or 'T' + */ +#define SHMEM_HEAP_SIZE "SHMEM_SYMMETRIC_HEAP_SIZE" + +/* The following environment variables are Mellanox extensions */ + +/* + * Type of allocator used by symmetric heap + */ +#define SHMEM_HEAP_TYPE "SHMEM_SYMMETRIC_HEAP_ALLOCATOR" + +/* + * Constants and definitions + */ +enum shmem_wait_ops { + SHMEM_CMP_EQ, + SHMEM_CMP_NE, + SHMEM_CMP_GT, + SHMEM_CMP_LE, + SHMEM_CMP_LT, + SHMEM_CMP_GE +}; + +#define _SHMEM_BARRIER_SYNC_SIZE (1) +#define _SHMEM_BCAST_SYNC_SIZE (1 + _SHMEM_BARRIER_SYNC_SIZE) +#define _SHMEM_COLLECT_SYNC_SIZE (1 + _SHMEM_BCAST_SYNC_SIZE) +#define _SHMEM_REDUCE_SYNC_SIZE (1 + _SHMEM_BCAST_SYNC_SIZE) +#define _SHMEM_REDUCE_MIN_WRKDATA_SIZE (1) +#define _SHMEM_SYNC_VALUE (-1) + +#define SHMEM_BARRIER_SYNC_SIZE _SHMEM_BARRIER_SYNC_SIZE +#define SHMEM_BCAST_SYNC_SIZE _SHMEM_BCAST_SYNC_SIZE +#define SHMEM_COLLECT_SYNC_SIZE _SHMEM_COLLECT_SYNC_SIZE +#define SHMEM_REDUCE_SYNC_SIZE _SHMEM_REDUCE_SYNC_SIZE +#define SHMEM_REDUCE_MIN_WRKDATA_SIZE _SHMEM_REDUCE_MIN_WRKDATA_SIZE +#define SHMEM_SYNC_VALUE _SHMEM_SYNC_VALUE + + +/* + * Initialization routines + */ +OSHMEM_DECLSPEC void start_pes(int npes); + + +/* + * Query routines + */ +OSHMEM_DECLSPEC int _num_pes(void); +OSHMEM_DECLSPEC int _my_pe(void); + + +/* + * Accessibility routines + */ +OSHMEM_DECLSPEC int shmem_pe_accessible(int pe); +OSHMEM_DECLSPEC int shmem_addr_accessible(void *addr, int pe); + +/* + * Symmetric heap routines + */ +OSHMEM_DECLSPEC void* shmalloc(size_t size); +OSHMEM_DECLSPEC void* shmemalign(size_t align, size_t size); +OSHMEM_DECLSPEC void* shrealloc(void *ptr, size_t size); +OSHMEM_DECLSPEC void shfree(void* ptr); + +/* + * Remote pointer operations + */ +OSHMEM_DECLSPEC void *shmem_ptr(void *ptr, int pe); + +/* + * Elemental put routines + */ +OSHMEM_DECLSPEC void shmem_short_p(short* addr, short value, int pe); +OSHMEM_DECLSPEC void shmem_int_p(int* addr, int value, int pe); +OSHMEM_DECLSPEC void shmem_long_p(long* 
addr, long value, int pe); +OSHMEM_DECLSPEC void shmem_float_p(float* addr, float value, int pe); +OSHMEM_DECLSPEC void shmem_double_p(double* addr, double value, int pe); +OSHMEM_DECLSPEC void shmem_longlong_p(long long* addr, long long value, int pe); +OSHMEM_DECLSPEC void shmem_longdouble_p(long double* addr, long double value, int pe); + +/* + * Block data put routines + */ +OSHMEM_DECLSPEC void shmem_char_put(char *target, const char *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_short_put(short *target, const short *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_int_put(int* target, const int* source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_long_put(long *target, const long *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_float_put(float *target, const float *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_double_put(double *target, const double *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longlong_put(long long *target, const long long *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longdouble_put(long double *target, const long double *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_put32(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_put64(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_put128(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_putmem(void *target, const void *source, size_t len, int pe); + +/* + * Strided put routines + */ +OSHMEM_DECLSPEC void shmem_int_iput(int* target, const int* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_short_iput(short* target, const short* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_float_iput(float* target, const float* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_double_iput(double* target, const double* 
source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longlong_iput(long long* target, const long long* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longdouble_iput(long double* target, const long double* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_long_iput(long* target, const long* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iput32(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iput64(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iput128(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); + +/* + * Elemental get routines + */ +OSHMEM_DECLSPEC short shmem_short_g(short* addr, int pe); +OSHMEM_DECLSPEC int shmem_int_g(int* addr, int pe); +OSHMEM_DECLSPEC long shmem_long_g(long* addr, int pe); +OSHMEM_DECLSPEC float shmem_float_g(float* addr, int pe); +OSHMEM_DECLSPEC double shmem_double_g(double* addr, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_g(long long* addr, int pe); +OSHMEM_DECLSPEC long double shmem_longdouble_g(long double* addr, int pe); + +/* + * Block data get routines + */ +OSHMEM_DECLSPEC void shmem_char_get(char *target, const char *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_short_get(short *target, const short *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_int_get(int *target, const int *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_long_get(long *target, const long *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_float_get(float *target, const float *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_double_get(double *target, const double *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longlong_get(long long *target, const long long *source, size_t len, int pe); 
+OSHMEM_DECLSPEC void shmem_longdouble_get(long double *target, const long double *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_get32(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_get64(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_get128(void *target, const void *source, size_t len, int pe); +OSHMEM_DECLSPEC void shmem_getmem(void *target, const void *source, size_t len, int pe); + +/* + * Strided get routines + */ +OSHMEM_DECLSPEC void shmem_int_iget(int* target, const int* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_short_iget(short* target, const short* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_float_iget(float* target, const float* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_double_iget(double* target, const double* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longlong_iget(long long* target, const long long* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_longdouble_iget(long double* target, const long double* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_long_iget(long* target, const long* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iget32(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iget64(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); +OSHMEM_DECLSPEC void shmem_iget128(void* target, const void* source, ptrdiff_t tst, ptrdiff_t sst,size_t len, int pe); + +/* + * Atomic operations + */ +/* Atomic swap */ +OSHMEM_DECLSPEC long shmem_swap(long *target, long value, int pe); +OSHMEM_DECLSPEC double shmem_double_swap(double *target, double value, int pe); +OSHMEM_DECLSPEC float 
shmem_float_swap(float *target, float value, int pe); +OSHMEM_DECLSPEC int shmem_int_swap(int *target, int value, int pe); +OSHMEM_DECLSPEC long shmem_long_swap(long *target, long value, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_swap(long long*target, long long value, int pe); + +/* Atomic conditional swap */ +OSHMEM_DECLSPEC int shmem_int_cswap(int *target, int cond, int value, int pe); +OSHMEM_DECLSPEC long shmem_long_cswap(long *target, long cond, long value, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_cswap(long long *target, long long cond, long long value, int pe); + +/* Atomic Fetch&Add */ +OSHMEM_DECLSPEC int shmem_int_fadd(int *target, int value, int pe); +OSHMEM_DECLSPEC long shmem_long_fadd(long *target, long value, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_fadd(long long *target, long long value, int pe); + +/* Atomic Fetch&Inc */ +OSHMEM_DECLSPEC int shmem_int_finc(int *target, int pe); +OSHMEM_DECLSPEC long shmem_long_finc(long *target, int pe); +OSHMEM_DECLSPEC long long shmem_longlong_finc(long long *target, int pe); + +/* Atomic Add*/ +OSHMEM_DECLSPEC void shmem_int_add(int *target, int value, int pe); +OSHMEM_DECLSPEC void shmem_long_add(long *target, long value, int pe); +OSHMEM_DECLSPEC void shmem_longlong_add(long long *target, long long value, int pe); + +/* Atomic Inc */ +OSHMEM_DECLSPEC void shmem_int_inc(int *target, int pe); +OSHMEM_DECLSPEC void shmem_long_inc(long *target, int pe); +OSHMEM_DECLSPEC void shmem_longlong_inc(long long *target, int pe); + +/* + * Lock functions + */ +OSHMEM_DECLSPEC void shmem_set_lock(long *lock); +OSHMEM_DECLSPEC void shmem_clear_lock(long *lock); +OSHMEM_DECLSPEC int shmem_test_lock(long *lock); + +/* + * P2P sync routines + */ +OSHMEM_DECLSPEC void shmem_short_wait(short *addr, short value); +OSHMEM_DECLSPEC void shmem_int_wait(int *addr, int value); +OSHMEM_DECLSPEC void shmem_long_wait(long *addr, long value); +OSHMEM_DECLSPEC void shmem_longlong_wait(long long *addr, long long 
value); +OSHMEM_DECLSPEC void shmem_wait(long *addr, long value); + +OSHMEM_DECLSPEC void shmem_short_wait_until(short *addr, int cmp, short value); +OSHMEM_DECLSPEC void shmem_int_wait_until(int *addr, int cmp, int value); +OSHMEM_DECLSPEC void shmem_long_wait_until(long *addr, int cmp, long value); +OSHMEM_DECLSPEC void shmem_longlong_wait_until(long long *addr, int cmp, long long value); +OSHMEM_DECLSPEC void shmem_wait_until(long *addr, int cmp, long value); + +/* + * Barrier sync routines + */ +OSHMEM_DECLSPEC void shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_barrier_all(void); +OSHMEM_DECLSPEC void shmem_fence(void); +OSHMEM_DECLSPEC void shmem_quiet(void); + +/* + * Collective routines + */ +OSHMEM_DECLSPEC void shmem_broadcast32(void *target, const void *source, size_t nlong, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_broadcast64(void *target, const void *source, size_t nlong, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_collect32(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_collect64(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_fcollect32(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync); +OSHMEM_DECLSPEC void shmem_fcollect64(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync); + +/* + * Reduction routines + */ +OSHMEM_DECLSPEC void shmem_short_and_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_and_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, 
long *pSync); +OSHMEM_DECLSPEC void shmem_long_and_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_and_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_or_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_or_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_or_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_or_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_xor_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_xor_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_xor_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_xor_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_max_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_max_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_max_to_all(long 
*target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_max_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_float_max_to_all(float *target, float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_double_max_to_all(double *target, double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longdouble_max_to_all(long double *target, long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_min_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_min_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_min_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_min_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_float_min_to_all(float *target, float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_double_min_to_all(double *target, double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longdouble_min_to_all(long double *target, long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_sum_to_all(short 
*target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_sum_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_sum_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_sum_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_float_sum_to_all(float *target, float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_double_sum_to_all(double *target, double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longdouble_sum_to_all(long double *target, long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_complexf_sum_to_all(OSHMEM_COMPLEX_TYPE(float) *target, OSHMEM_COMPLEX_TYPE(float) *source, int nreduce, int PE_start, int logPE_stride, int PE_size, OSHMEM_COMPLEX_TYPE(float) *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_complexd_sum_to_all(OSHMEM_COMPLEX_TYPE(double) *target, OSHMEM_COMPLEX_TYPE(double) *source, int nreduce, int PE_start, int logPE_stride, int PE_size, OSHMEM_COMPLEX_TYPE(double) *pWrk, long *pSync); + +OSHMEM_DECLSPEC void shmem_short_prod_to_all(short *target, short *source, int nreduce, int PE_start, int logPE_stride, int PE_size, short *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_int_prod_to_all(int *target, int *source, int nreduce, int PE_start, int logPE_stride, int PE_size, int *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_long_prod_to_all(long *target, long *source, int nreduce, int PE_start, int logPE_stride, int 
PE_size, long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longlong_prod_to_all(long long *target, long long *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long long *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_float_prod_to_all(float *target, float *source, int nreduce, int PE_start, int logPE_stride, int PE_size, float *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_double_prod_to_all(double *target, double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_longdouble_prod_to_all(long double *target, long double *source, int nreduce, int PE_start, int logPE_stride, int PE_size, long double *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_complexf_prod_to_all(OSHMEM_COMPLEX_TYPE(float) *target, OSHMEM_COMPLEX_TYPE(float) *source, int nreduce, int PE_start, int logPE_stride, int PE_size, OSHMEM_COMPLEX_TYPE(float) *pWrk, long *pSync); +OSHMEM_DECLSPEC void shmem_complexd_prod_to_all(OSHMEM_COMPLEX_TYPE(double) *target, OSHMEM_COMPLEX_TYPE(double) *source, int nreduce, int PE_start, int logPE_stride, int PE_size, OSHMEM_COMPLEX_TYPE(double) *pWrk, long *pSync); + +/* + * Platform specific cache management routines + */ +OSHMEM_DECLSPEC void shmem_udcflush(void); +OSHMEM_DECLSPEC void shmem_udcflush_line(void* target); +OSHMEM_DECLSPEC void shmem_set_cache_inv(void); +OSHMEM_DECLSPEC void shmem_set_cache_line_inv(void* target); +OSHMEM_DECLSPEC void shmem_clear_cache_inv(void); +OSHMEM_DECLSPEC void shmem_clear_cache_line_inv(void* target); + +/* + * Legacy API + */ +OSHMEM_DECLSPEC int num_pes(void); +OSHMEM_DECLSPEC int my_pe(void); + +/* old init/destruct functions - not in the open shmem spec but still supported */ +OSHMEM_DECLSPEC void shmem_init(void); +OSHMEM_DECLSPEC int shmem_finalize(void) OSHMEM_DESTRUCTOR; +OSHMEM_DECLSPEC int shmem_n_pes(void); +OSHMEM_DECLSPEC int shmem_my_pe(void); + +OSHMEM_DECLSPEC void shmem_put(void *target, const void *source, 
size_t len, int pe); +OSHMEM_DECLSPEC void shmem_get(void *target, const void *source, size_t len, int pe); + + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif + + +#endif /* OSHMEM_SHMEM_H */ diff --git a/oshmem/include/shmem_portable_platform.h.in b/oshmem/include/shmem_portable_platform.h.in new file mode 100644 index 0000000000..ac36e511f9 --- /dev/null +++ b/oshmem/include/shmem_portable_platform.h.in @@ -0,0 +1,401 @@ +/* + * Header file with preprocessor magic to figure out which compiler the user has been calling! + * + * This code is adapted from the file other/portable_platform.h of GASNet-1.12.0: + * - Ripping out the required parts. + * - Get rid of brackets as they mess up autoconf + * - Delete version tests for older PGI versions (#include "omp.h" not acceptable) + * - Indent ('#' should be in column 0) + * + * External packages (vt, romio) depend on top_build_dir/ompi/include, therefore + * although this is not changed in the configure process, this has to be set as + * a .in file... + * --------------------------------------------------------------------------- + */ +#ifndef MPI_PORTABLE_PLATFORM_H +#define MPI_PORTABLE_PLATFORM_H + +/* All files in this directory and all sub-directories (except where otherwise noted) + * are subject to the following licensing terms: + * + * --------------------------------------------------------------------------- + * "Copyright (c) 2012 Mellanox Technologies, Inc. + * " All rights reserved. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose, without fee, and without written agreement is + * hereby granted, provided that the above copyright notice and the following + * two paragraphs appear in all copies of this software.
+ * + * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT + * OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF + * CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS." + * --------------------------------------------------------------------------- + * + * Please see the license.txt files within the gm-conduit, lapi-conduit and + * vapi-conduit directories for the licensing terms governing those + * contributed components. + * + * The authors/contributors of GASNet include: + * + * Dan Bonachea : + * General infrastructure & documentation + * mpi-conduit + * elan-conduit + * smp-conduit + * udp-conduit + * extended-ref + * template-conduit + * Christian Bell : gm-conduit, shmem-conduit + * Mike Welcome : lapi-conduit, portals-conduit + * Paul H. Hargrove : vapi-conduit, ibv-conduit + * Rajesh Nishtala : collectives, dcmf-conduit + * Parry Husbands (PJRHusbands@lbl.gov): lapi-conduit + * + * For more information about GASNet, visit our home page at: + * http://gasnet.cs.berkeley.edu/ + * Or send email to: + * + * + * Source code contributions (fixes, patches, extensions etc.) should be + * sent to to be reviewed for acceptance into the primary + * distribution. Contributions are most likely to be accepted if they + * are provided as public domain, or under a BSD-style license such as + * the one above. 
+ * + */ +#ifndef _STRINGIFY +#define _STRINGIFY_HELPER(x) #x +#define _STRINGIFY(x) _STRINGIFY_HELPER(x) +#endif + +#if defined(__INTEL_COMPILER) +# define PLATFORM_COMPILER_FAMILYNAME INTEL +# define PLATFORM_COMPILER_FAMILYID 2 +# ifdef __cplusplus +# define PLATFORM_COMPILER_INTEL_CXX 1 +# else +# define PLATFORM_COMPILER_INTEL_C 1 +# endif +# define _PLATFORM_COMPILER_INTEL_MIN_BUILDDATE 19700000 /* year 1970: predates most intel products :) */ +# ifdef __INTEL_COMPILER_BUILD_DATE +# define _PLATFORM_INTEL_COMPILER_BUILD_DATE __INTEL_COMPILER_BUILD_DATE +# else +# define _PLATFORM_INTEL_COMPILER_BUILD_DATE _PLATFORM_COMPILER_INTEL_MIN_BUILDDATE +# endif + /* patch number is a decimal build date: YYYYMMDD */ +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + (((((maj) * 10) | (min)) << 20) | \ + ((pat) < _PLATFORM_COMPILER_INTEL_MIN_BUILDDATE ? \ + _PLATFORM_COMPILER_INTEL_MIN_BUILDDATE : ((pat)-_PLATFORM_COMPILER_INTEL_MIN_BUILDDATE))) +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(__INTEL_COMPILER/10, __INTEL_COMPILER/100, _PLATFORM_INTEL_COMPILER_BUILD_DATE) +# define PLATFORM_COMPILER_VERSION_STR \ + _STRINGIFY(__INTEL_COMPILER)"."_STRINGIFY(_PLATFORM_INTEL_COMPILER_BUILD_DATE) + +#elif defined(__PATHSCALE__) +# define PLATFORM_COMPILER_PATHSCALE 1 +# define PLATFORM_COMPILER_FAMILYNAME PATHSCALE +# define PLATFORM_COMPILER_FAMILYID 3 +# ifdef __cplusplus +# define PLATFORM_COMPILER_PATHSCALE_CXX 1 +# else +# define PLATFORM_COMPILER_PATHSCALE_C 1 +# endif +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(__PATHCC__,__PATHCC_MINOR__,__PATHCC_PATCHLEVEL__) +# define PLATFORM_COMPILER_VERSION_STR __PATHSCALE__ + +#elif defined(__PGI) +# define PLATFORM_COMPILER_PGI 1 +# define PLATFORM_COMPILER_FAMILYNAME PGI +# define PLATFORM_COMPILER_FAMILYID 4 +# ifdef __cplusplus +# define PLATFORM_COMPILER_PGI_CXX 1 +# else +# define PLATFORM_COMPILER_PGI_C 1 +# endif +# if __PGIC__ == 99 + /* bug 2230: PGI versioning was 
broken for some platforms in 7.0 + no way to know exact version, but provide something slightly more accurate */ +# define PLATFORM_COMPILER_VERSION 0x070000 +# define PLATFORM_COMPILER_VERSION_STR "7.?-?" +# elif defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(__PGIC__,__PGIC_MINOR__,__PGIC_PATCHLEVEL__) +# define PLATFORM_COMPILER_VERSION_STR \ + _STRINGIFY(__PGIC__)"."_STRINGIFY(__PGIC_MINOR__)"-"_STRINGIFY(__PGIC_PATCHLEVEL__) +# else + /* PGI before 6.1-4 lacks any version ID preprocessor macros - so use this filthy hack */ + /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + * We cannot do these within mpi.h.in, as we should not include ompi.h + * Hopefully, compilers with integrated preprocessors will not analyse code within the #if 0-block + * XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + */ +#if 0 +# ifdef PLATFORM_PGI_IS_ANCIENT + /* Include below might fail for ancient versions lacking this header, but testing shows it + works back to at least 5.1-3 (Nov 2003), and based on docs probably back to 3.2 (Sep 2000) */ +# define PLATFORM_COMPILER_VERSION 0 +# elif defined(__x86_64__) /* bug 1753 - 64-bit omp.h upgrade happened in <6.0-8,6.1-1) */ +# include "omp.h" +# if defined(_PGOMP_H) + /* 6.1.1 or newer */ +# define PLATFORM_COMPILER_VERSION 0x060101 +# define PLATFORM_COMPILER_VERSION_STR ">=6.1-1" +# else + /* 6.0.8 or older */ +# define PLATFORM_COMPILER_VERSION 0 +# define PLATFORM_COMPILER_VERSION_STR "<=6.0-8" +# endif +# else /* 32-bit omp.h upgrade happened in <5.2-4,6.0-8 */ +# include "omp.h" +# if defined(_PGOMP_H) + /* 6.0-8 or newer */ +# define PLATFORM_COMPILER_VERSION 0x060008 +# define PLATFORM_COMPILER_VERSION_STR ">=6.0-8" +# else + /* 5.2-4 or older */ +# define PLATFORM_COMPILER_VERSION 0 +# define PLATFORM_COMPILER_VERSION_STR "<=5.2-4" +# endif +# endif +#endif /* 0 */ + /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ +# endif + +#elif defined(__xlC__)
+# define PLATFORM_COMPILER_XLC 1 +# define PLATFORM_COMPILER_FAMILYNAME XLC +# define PLATFORM_COMPILER_FAMILYID 5 +# ifdef __cplusplus +# define PLATFORM_COMPILER_XLC_CXX 1 +# else +# define PLATFORM_COMPILER_XLC_C 1 +# endif +# define PLATFORM_COMPILER_VERSION __xlC__ +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) << 8) | ((min) << 4) | (pat) ) + +#elif defined(__DECC) || defined(__DECCXX) +# define PLATFORM_COMPILER_COMPAQ 1 +# define PLATFORM_COMPILER_FAMILYNAME COMPAQ +# define PLATFORM_COMPILER_FAMILYID 6 +# ifdef __cplusplus +# define PLATFORM_COMPILER_COMPAQ_CXX 1 +# else +# define PLATFORM_COMPILER_COMPAQ_C 1 +# endif +# if defined(__DECC_VER) +# define PLATFORM_COMPILER_VERSION __DECC_VER +# elif defined(__DECCXX_VER) +# define PLATFORM_COMPILER_VERSION __DECCXX_VER +# endif + +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) * 10000000) + ((min) * 100000) + (90000) + (pat) ) + /* 90000 = official ver, 80000 = customer special ver, 60000 = field test ver */ + +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# define PLATFORM_COMPILER_SUN 1 +# define PLATFORM_COMPILER_FAMILYNAME SUN +# define PLATFORM_COMPILER_FAMILYID 7 +# ifdef __cplusplus +# define PLATFORM_COMPILER_SUN_CXX 1 +# else +# define PLATFORM_COMPILER_SUN_C 1 +# endif +# if defined(__SUNPRO_C) && __SUNPRO_C > 0 +# define PLATFORM_COMPILER_VERSION __SUNPRO_C +# elif defined(__SUNPRO_CC) && __SUNPRO_CC > 0 +# define PLATFORM_COMPILER_VERSION __SUNPRO_CC +# endif +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) << 8) | ((min) << 4) | (pat) ) + +#elif defined(__HP_cc) || defined(__HP_aCC) +# define PLATFORM_COMPILER_HP 1 +# define PLATFORM_COMPILER_FAMILYNAME HP +# define PLATFORM_COMPILER_FAMILYID 8 +# ifdef __cplusplus +# define PLATFORM_COMPILER_HP_CXX 1 +# else +# define PLATFORM_COMPILER_HP_C 1 +# endif +# if defined(__HP_cc) && __HP_cc > 0 +# define PLATFORM_COMPILER_VERSION __HP_cc +# elif defined(__HP_aCC) && __HP_aCC > 0 +# define 
PLATFORM_COMPILER_VERSION __HP_aCC +# endif +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) << 16) | ((min) << 8) | (pat) ) + +#elif defined(_SGI_COMPILER_VERSION) || \ + (defined(_COMPILER_VERSION) && defined(__sgi) && !defined(__GNUC__)) /* 7.3.0 and earlier lack _SGI_COMPILER_VERSION */ +# define PLATFORM_COMPILER_SGI 1 +# define PLATFORM_COMPILER_FAMILYNAME SGI +# define PLATFORM_COMPILER_FAMILYID 9 +# ifdef __cplusplus +# define PLATFORM_COMPILER_SGI_CXX 1 +# else +# define PLATFORM_COMPILER_SGI_C 1 +# endif +# if defined(_SGI_COMPILER_VERSION) && _SGI_COMPILER_VERSION > 0 +# define PLATFORM_COMPILER_VERSION _SGI_COMPILER_VERSION +# elif defined(_COMPILER_VERSION) && _COMPILER_VERSION > 0 +# define PLATFORM_COMPILER_VERSION _COMPILER_VERSION +# endif +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + ( ((maj) << 8) | ((min) << 4) | (pat) ) + +#elif defined(_CRAYC) +# define PLATFORM_COMPILER_CRAY 1 +# define PLATFORM_COMPILER_FAMILYNAME CRAY +# define PLATFORM_COMPILER_FAMILYID 10 +# ifdef __cplusplus +# define PLATFORM_COMPILER_CRAY_CXX 1 +# else +# define PLATFORM_COMPILER_CRAY_C 1 +# endif +# if defined(_RELEASE) && defined(_RELEASE_MINOR) /* X1 */ +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(_RELEASE,_RELEASE_MINOR,0) +# elif defined(_RELEASE) /* T3E */ +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(_RELEASE,0,0) +# endif +# ifdef _RELEASE_STRING /* X1 */ +# define PLATFORM_COMPILER_VERSION_STR _RELEASE_STRING +# endif + +#elif defined(__KCC) +# define PLATFORM_COMPILER_KAI 1 +# define PLATFORM_COMPILER_FAMILYNAME KAI +# define PLATFORM_COMPILER_FAMILYID 11 +# ifdef __cplusplus +# define PLATFORM_COMPILER_KAI_CXX 1 +# else +# define PLATFORM_COMPILER_KAI_C 1 +# endif + +#elif defined(__MTA__) +# define PLATFORM_COMPILER_MTA 1 +# define PLATFORM_COMPILER_FAMILYNAME MTA +# define PLATFORM_COMPILER_FAMILYID 12 +# ifdef __cplusplus +# define PLATFORM_COMPILER_MTA_CXX 1 +# else +# define 
PLATFORM_COMPILER_MTA_C 1 +# endif + +#elif defined(_SX) +# define PLATFORM_COMPILER_NECSX 1 +# define PLATFORM_COMPILER_FAMILYNAME NECSX +# define PLATFORM_COMPILER_FAMILYID 13 +# ifdef __cplusplus +# define PLATFORM_COMPILER_NECSX_CXX 1 +# else +# define PLATFORM_COMPILER_NECSX_C 1 +# endif + +#elif defined(_MSC_VER) +# define PLATFORM_COMPILER_MICROSOFT 1 +# define PLATFORM_COMPILER_FAMILYNAME MICROSOFT +# define PLATFORM_COMPILER_FAMILYID 14 +# ifdef __cplusplus +# define PLATFORM_COMPILER_MICROSOFT_CXX 1 +# else +# define PLATFORM_COMPILER_MICROSOFT_C 1 +# endif +# define PLATFORM_COMPILER_VERSION _MSC_VER + +#elif defined(__TINYC__) +# define PLATFORM_COMPILER_TINY 1 +# define PLATFORM_COMPILER_FAMILYNAME TINY +# define PLATFORM_COMPILER_FAMILYID 15 +# ifdef __cplusplus +# define PLATFORM_COMPILER_TINY_CXX 1 +# else +# define PLATFORM_COMPILER_TINY_C 1 +# endif + +#elif defined(__LCC__) +# define PLATFORM_COMPILER_LCC 1 +# define PLATFORM_COMPILER_FAMILYNAME LCC +# define PLATFORM_COMPILER_FAMILYID 16 +# ifdef __cplusplus +# define PLATFORM_COMPILER_LCC_CXX 1 +# else +# define PLATFORM_COMPILER_LCC_C 1 +# endif + +#else /* unknown compiler */ +# define PLATFORM_COMPILER_UNKNOWN 1 +#endif + +/* this stanza comes last, because many vendor compilers lie and claim + to be GNU C for compatibility reasons and/or because they share a frontend */ +#if defined(__GNUC__) +# undef PLATFORM_COMPILER_UNKNOWN +# ifndef PLATFORM_COMPILER_FAMILYID +# define PLATFORM_COMPILER_GNU 1 +# define PLATFORM_COMPILER_FAMILYNAME GNU +# define PLATFORM_COMPILER_FAMILYID 1 +# ifdef __cplusplus +# define PLATFORM_COMPILER_GNU_CXX 1 +# else +# define PLATFORM_COMPILER_GNU_C 1 +# endif +# if defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) +# elif defined(__GNUC_MINOR__) /* older versions of egcs lack __GNUC_PATCHLEVEL__ */ +# define PLATFORM_COMPILER_VERSION \ + 
PLATFORM_COMPILER_VERSION_INT(__GNUC__,__GNUC_MINOR__,0) +# else +# define PLATFORM_COMPILER_VERSION \ + PLATFORM_COMPILER_VERSION_INT(__GNUC__,0,0) +# endif +# define PLATFORM_COMPILER_VERSION_STR __PLATFORM_COMPILER_GNU_VERSION_STR +# else +# define _PLATFORM_COMPILER_GNU_VERSION_STR __PLATFORM_COMPILER_GNU_VERSION_STR +# endif + /* gather any advertised GNU version number info, even for non-gcc compilers */ +# if defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) +# define __PLATFORM_COMPILER_GNU_VERSION_STR \ + _STRINGIFY(__GNUC__)"."_STRINGIFY(__GNUC_MINOR__)"."_STRINGIFY(__GNUC_PATCHLEVEL__) +# elif defined(__GNUC_MINOR__) +# define __PLATFORM_COMPILER_GNU_VERSION_STR \ + _STRINGIFY(__GNUC__)"."_STRINGIFY(__GNUC_MINOR__)".?" +# else +# define __PLATFORM_COMPILER_GNU_VERSION_STR \ + _STRINGIFY(__GNUC__)".?.?" +# endif +#elif defined(PLATFORM_COMPILER_UNKNOWN) /* unknown compiler */ +# define PLATFORM_COMPILER_FAMILYNAME UNKNOWN +# define PLATFORM_COMPILER_FAMILYID 0 +#endif + +/* Default Values */ +#ifndef PLATFORM_COMPILER_VERSION +# define PLATFORM_COMPILER_VERSION 0 /* don't know */ +#endif + +#ifndef PLATFORM_COMPILER_VERSION_STR +# define PLATFORM_COMPILER_VERSION_STR _STRINGIFY(PLATFORM_COMPILER_VERSION) +#endif + +#ifndef PLATFORM_COMPILER_VERSION_INT +# define PLATFORM_COMPILER_VERSION_INT(maj,min,pat) \ + (((maj) << 16) | ((min) << 8) | (pat)) +#endif + + +#endif /* MPI_PORTABLE_PLATFORM_H */ diff --git a/oshmem/mca/atomic/Makefile.am b/oshmem/mca/atomic/Makefile.am new file mode 100644 index 0000000000..fc0912b95e --- /dev/null +++ b/oshmem/mca/atomic/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_atomic.la +libmca_atomic_la_SOURCES = + +# header setup +nobase_oshmem_HEADERS = +nobase_nodist_oshmem_HEADERS = + +# local files +headers = atomic.h +libmca_atomic_la_SOURCES += $(headers) $(nodist_headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +nobase_oshmem_HEADERS += $(headers) +nobase_nodist_oshmem_HEADERS += $(nodist_headers) +oshmemdir = $(includedir)/oshmem/oshmem/mca/atomic +else +oshmemdir = $(includedir) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/oshmem/mca/atomic/atomic.h b/oshmem/mca/atomic/atomic.h new file mode 100644 index 0000000000..6d28292959 --- /dev/null +++ b/oshmem/mca/atomic/atomic.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Atomic Operations Interface + * + */ + +#ifndef OSHMEM_MCA_ATOMIC_H +#define OSHMEM_MCA_ATOMIC_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "opal/util/output.h" +#include "mpi.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "oshmem/mca/atomic/base/base.h" + + +BEGIN_C_DECLS + + +/* ******************************************************************** */ + +struct oshmem_op_t; + + +/* ******************************************************************** */ + + +typedef int (*mca_atomic_base_component_init_fn_t) + (bool enable_progress_threads, bool enable_threads); + +typedef int (*mca_atomic_base_component_finalize_fn_t)(void); + +typedef struct mca_atomic_base_module_1_0_0_t* (*mca_atomic_base_component_query_fn_t) + (int *priority); + + +/* ******************************************************************** */ + + +/** + * Atomic component interface + * + * Component 
interface for the atomic framework. A public + * instance of this structure, called + * mca_atomic_[component_name]_component, must exist in any atomic + * component. + */ +struct mca_atomic_base_component_1_0_0_t { + /** Base component description */ + mca_base_component_t atomic_version; + /** Base component data block */ + mca_base_component_data_t atomic_data; + + /** Component initialization, finalization and query functions */ + mca_atomic_base_component_init_fn_t atomic_init; + mca_atomic_base_component_finalize_fn_t atomic_finalize; + mca_atomic_base_component_query_fn_t atomic_query; +}; +typedef struct mca_atomic_base_component_1_0_0_t mca_atomic_base_component_1_0_0_t; + +/** Per guidance in mca.h, use the unversioned struct name if you just + want to always keep up with the most recent version of the + interface. */ +typedef struct mca_atomic_base_component_1_0_0_t mca_atomic_base_component_t; + + +/** + * Atomic module interface + * + */ +struct mca_atomic_base_module_1_0_0_t { + /** Atomic modules all inherit from opal_object */ + opal_object_t super; + + /* Atomic operation function pointers */ + int (*atomic_fadd)(void *target, void *prev, const void *value, size_t nlong, int pe, struct oshmem_op_t *op); + int (*atomic_cswap)(void *target, void *prev, const void *cond, const void *value, size_t nlong, int pe); +}; +typedef struct mca_atomic_base_module_1_0_0_t mca_atomic_base_module_1_0_0_t; + +/** Per guidance in mca.h, use the unversioned struct name if you just + want to always keep up with the most recent version of the + interface.
*/ +typedef struct mca_atomic_base_module_1_0_0_t mca_atomic_base_module_t; +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_atomic_base_module_t); + + +/* ******************************************************************** */ + + +/* + * Macro for use in components (despite the 2_0_0 in its name it supplies "atomic", 1, 0, 0) + */ +#define MCA_ATOMIC_BASE_VERSION_2_0_0 \ + MCA_BASE_VERSION_2_0_0, \ + "atomic", 1, 0, 0 + + +/* ******************************************************************** */ + + +OSHMEM_DECLSPEC extern mca_atomic_base_component_t mca_atomic_base_selected_component; +OSHMEM_DECLSPEC extern mca_atomic_base_module_t mca_atomic; +#define MCA_ATOMIC_CALL(a) mca_atomic.atomic_ ## a + + +END_C_DECLS + +#endif /* OSHMEM_MCA_ATOMIC_H */ diff --git a/oshmem/mca/atomic/base/Makefile.am b/oshmem/mca/atomic/base/Makefile.am new file mode 100644 index 0000000000..51678a6d48 --- /dev/null +++ b/oshmem/mca/atomic/base/Makefile.am @@ -0,0 +1,20 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +headers += \ + base/base.h + +libmca_atomic_la_SOURCES += \ + base/atomic_base_close.c \ + base/atomic_base_available.c \ + base/atomic_base_select.c \ + base/atomic_base_open.c diff --git a/oshmem/mca/atomic/base/atomic_base_available.c b/oshmem/mca/atomic/base/atomic_base_available.c new file mode 100644 index 0000000000..dc282b2b71 --- /dev/null +++ b/oshmem/mca/atomic/base/atomic_base_available.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include + +#include "oshmem_config.h" + +#include "orte/util/show_help.h" + +#include "opal/class/opal_list.h" +#include "opal/util/output.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_component_repository.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" + + +/* + * Global variables + */ +bool mca_atomic_base_components_available_valid = false; +opal_list_t mca_atomic_base_components_available; + + +/* + * Private functions + */ +static int init_query(const mca_base_component_t * ls, + mca_base_component_priority_list_item_t * entry, + bool enable_progress_threads, + bool enable_threads); + +/* + * Scan down the list of successfully opened components and query each of + * them (the opened list will be one or more components. If the user + * requested a specific component, it will be the only component in the + * opened list). Create and populate the available list of all + * components who indicate that they want to be considered for selection. + * Close all components who do not want to be considered for selection, + * and destroy the opened list. + * + * Once the available list has been built, finish by calling + * mca_atomic_base_select() and return its result; if no component is + * available, return OSHMEM_ERROR instead. + */ +int mca_atomic_base_find_available(bool enable_progress_threads, + bool enable_threads) +{ + bool found = false; + mca_base_component_priority_list_item_t *entry; + opal_list_item_t *p; + const mca_base_component_t *component; + + /* Initialize the list */ + + OBJ_CONSTRUCT(&mca_atomic_base_components_available, opal_list_t); + mca_atomic_base_components_available_valid = true; + + /* The list of components that we should check has already been + established in mca_atomic_base_open.
*/ + + for (found = false, + p = opal_list_remove_first(&mca_atomic_base_components_opened); + p != NULL; + p = opal_list_remove_first(&mca_atomic_base_components_opened)) { + component = ((mca_base_component_list_item_t *) p)->cli_component; + + /* Call a subroutine to do the work, because the component may + represent different versions of the atomic MCA. */ + + entry = OBJ_NEW(mca_base_component_priority_list_item_t); + entry->super.cli_component = component; + entry->cpli_priority = 0; + if (OSHMEM_SUCCESS == init_query(component, entry, + enable_progress_threads, + enable_threads)) { + opal_list_append(&mca_atomic_base_components_available, + (opal_list_item_t *) entry); + found = true; + } else { + + /* The component doesn't want to run. When its atomic_init failed, + init_query() already invoked its close() method; now release + it from the DSO repository (if it's there). */ + + mca_base_component_repository_release(component); + OBJ_RELEASE(entry); + } + + /* Free the entry from the "opened" list */ + + OBJ_RELEASE(p); + } + + /* The opened list is now no longer useful and we can free it */ + + OBJ_DESTRUCT(&mca_atomic_base_components_opened); + mca_atomic_base_components_opened_valid = false; + + /* If we have no atomic components available, it's an error. + Thanks for playing! */ + + if (!found) { + /* Nothing was appended (found is false), so the list is empty */ + OBJ_DESTRUCT(&mca_atomic_base_components_available); + mca_atomic_base_components_available_valid = false; + ATOMIC_VERBOSE(10,"atomic:find_available: no atomic components available!"); + return OSHMEM_ERROR; + } + + /* All done */ + + return mca_atomic_base_select(); +} + + +/* + * Query a component, see if it wants to run at all. If it does, save + * some information. If it doesn't, close it.
+ */ +static int init_query(const mca_base_component_t * component, + mca_base_component_priority_list_item_t * entry, + bool enable_progress_threads, bool enable_threads) +{ + int ret; + + ATOMIC_VERBOSE(10,"atomic:find_available: querying atomic component %s", + component->mca_component_name); + + /* This component has already been successfully opened. So now + initialize it (only the 1.0.0 atomic API is understood). */ + + if (1 == component->mca_type_major_version && + 0 == component->mca_type_minor_version && + 0 == component->mca_type_release_version) { + + mca_atomic_base_component_t *atomic = + (mca_atomic_base_component_t *) component; + + ret = atomic->atomic_init(enable_progress_threads, + enable_threads); + } else { + /* Unrecognized atomic API version; the component is not closed on this path */ + + ATOMIC_VERBOSE(10,"atomic:find_available: unrecognized atomic API version (%d.%d.%d, ignored)", + component->mca_type_major_version, + component->mca_type_minor_version, + component->mca_type_release_version); + return OSHMEM_ERROR; + } + + /* Query done -- look at the return value to see what happened */ + + if (OSHMEM_SUCCESS != ret) { + ATOMIC_VERBOSE(10,"atomic:find_available: atomic component %s is not available", + component->mca_component_name); + if (NULL != component->mca_close_component) { + component->mca_close_component(); + } + } else { + ATOMIC_VERBOSE(10,"atomic:find_available: atomic component %s is available", + component->mca_component_name); + } + + /* All done */ + + return ret; +} diff --git a/oshmem/mca/atomic/base/atomic_base_close.c b/oshmem/mca/atomic/base/atomic_base_close.c new file mode 100644 index 0000000000..b7ccd0029f --- /dev/null +++ b/oshmem/mca/atomic/base/atomic_base_close.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + +#include "oshmem_config.h" + +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" + + +int mca_atomic_base_close(void) +{ + int ret = OSHMEM_SUCCESS; + if (NULL != mca_atomic_base_selected_component.atomic_finalize) { + ret = mca_atomic_base_selected_component.atomic_finalize(); + } + /* Close all components that are still open. This may be the opened + * list (if we're in ompi_info), or it may be the available list (if + * we're anywhere else). */ + + if (mca_atomic_base_components_opened_valid) { + mca_base_components_close(mca_atomic_base_output, + &mca_atomic_base_components_opened, NULL); + OBJ_DESTRUCT(&mca_atomic_base_components_opened); + mca_atomic_base_components_opened_valid = false; + } else if (mca_atomic_base_components_available_valid) { + mca_base_components_close(mca_atomic_base_output, + &mca_atomic_base_components_available, + NULL); + OBJ_DESTRUCT(&mca_atomic_base_components_available); + mca_atomic_base_components_available_valid = false; + } + + + /* All done */ + + return ret; +} diff --git a/oshmem/mca/atomic/base/atomic_base_open.c b/oshmem/mca/atomic/base/atomic_base_open.c new file mode 100644 index 0000000000..ef70a419fa --- /dev/null +++ b/oshmem/mca/atomic/base/atomic_base_open.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include + +#include "oshmem_config.h" + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" + +/* + * The following file was created by configure. 
It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. + */ +#include "oshmem/mca/atomic/base/static-components.h" + + +/* + * Global variables; most of which are loaded by back-ends of MCA + * variables + */ +int mca_atomic_base_output = -1; + +bool mca_atomic_base_components_opened_valid = false; +opal_list_t mca_atomic_base_components_opened; + +OBJ_CLASS_INSTANCE(mca_atomic_base_module_t, opal_object_t, NULL, NULL); + +/* + * Function for finding and opening either all MCA components, or the one + * that was specifically requested via a MCA parameter. + */ +int mca_atomic_base_open(void) +{ + /* Open an output stream for this framework */ + int value = -1; + + mca_atomic_base_output = opal_output_open(NULL); + mca_base_param_reg_int_name("atomic_base","verbose", + "Verbose level of the shmem atomic component",false,false,0,&value); + opal_output_set_verbosity(mca_atomic_base_output, value); + + /* Open up all available components */ + if (OSHMEM_SUCCESS != + mca_base_components_open("atomic", mca_atomic_base_output, + mca_atomic_base_static_components, + &mca_atomic_base_components_opened, true)) { + return OSHMEM_ERROR; + } + mca_atomic_base_components_opened_valid = true; + + /* All done */ + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/atomic/base/atomic_base_select.c b/oshmem/mca/atomic/base/atomic_base_select.c new file mode 100644 index 0000000000..444035db9b --- /dev/null +++ b/oshmem/mca/atomic/base/atomic_base_select.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include +#include +#include + +#include "oshmem/constants.h" + +#include "opal/class/opal_list.h" +#include "opal/util/output.h" +#include "orte/util/show_help.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_component_repository.h" + +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" + + +/* + * Global variables; most of which are loaded by back-ends of MCA + * variables + */ +mca_atomic_base_module_t mca_atomic; +mca_atomic_base_component_t mca_atomic_base_selected_component; + + +/* + * Local types + */ +struct avail_atomic_t { + opal_list_item_t super; + + int ac_priority; + mca_atomic_base_component_t *ac_component; + mca_atomic_base_module_t *ac_module; +}; +typedef struct avail_atomic_t avail_atomic_t; + + +/* + * Local functions + */ +static opal_list_t *check_components(opal_list_t * components); +static int check_one_component(const mca_base_component_t * component, + mca_atomic_base_module_1_0_0_t ** module); + +static int query(const mca_base_component_t * component, + int *priority, + mca_atomic_base_module_1_0_0_t ** module); + +static int query_1_0_0(const mca_atomic_base_component_1_0_0_t * atomic_component, + int *priority, + mca_atomic_base_module_1_0_0_t ** module); + +/* + * Stuff for the OBJ interface + */ +static OBJ_CLASS_INSTANCE(avail_atomic_t, opal_list_item_t, NULL, NULL); + +/* + * This function is called at the initialization. + * It is used to select which atomic component will be + * active for a given group. 
+ */ +int mca_atomic_base_select(void) +{ + opal_list_t *selectable; + opal_list_item_t *item; + + /* Announce */ + ATOMIC_VERBOSE(10,"atomic:base:atomic_select: Checking all available modules"); + selectable = check_components(&mca_atomic_base_components_available); + + /* Upon return from the above, the modules list will contain the + list of modules that returned (priority >= 0). If we have no + atomic modules available, then print error and return. */ + if (NULL == selectable) { + /* There's no modules available */ + return OSHMEM_ERROR; + } + + /* do the selection loop: the list is ordered lowest priority first, + so the last module copied below is the highest-priority one */ + for (item = opal_list_remove_first(selectable); + NULL != item; item = opal_list_remove_first(selectable)) + { + avail_atomic_t *avail = (avail_atomic_t *)item; + + /* Set module having the highest priority */ + memcpy(&mca_atomic, avail->ac_module, sizeof(mca_atomic)); + memcpy(&mca_atomic_base_selected_component, avail->ac_component, sizeof(mca_atomic_base_selected_component)); + + OBJ_RELEASE(avail->ac_module); + OBJ_RELEASE(avail); + /* check correctness */ + if ( !(mca_atomic.atomic_fadd) || + !(mca_atomic.atomic_cswap) ) + { + /* Do not leak the entries still queued, nor the list itself */ + while (NULL != (item = opal_list_remove_first(selectable))) + { + avail = (avail_atomic_t *)item; + OBJ_RELEASE(avail->ac_module); + OBJ_RELEASE(avail); + } + OBJ_RELEASE(selectable); + return OSHMEM_ERR_NOT_FOUND; + } + } + + /* Done with the list from the check_components() call so release it. */ + OBJ_RELEASE(selectable); + + return OSHMEM_SUCCESS; +} + + +/* + * For each module in the list, check and see if it wants to run, and + * do the resulting priority comparison. Make a list of modules to be + * only those who returned that they want to run, and put them in + * priority order. + */ +static opal_list_t *check_components(opal_list_t * components) +{ + int priority; + const mca_base_component_t *component; + opal_list_item_t *item, *item2; + mca_atomic_base_module_1_0_0_t *module; + opal_list_t *selectable; + avail_atomic_t *avail, *avail2; + + /* Make a list of the components that query successfully */ + selectable = OBJ_NEW(opal_list_t); + + /* Scan through the list of components.
This nested loop is + O(N^2), but we should never have too many components, so this + *hopefully* shouldn't matter... */ + + for (item = opal_list_get_first(components); + ((item != opal_list_get_end(components)) && (item != NULL)); + item = opal_list_get_next(item)) { + component = ((mca_base_component_priority_list_item_t *) + item)->super.cli_component; + + priority = check_one_component(component, &module); + if (priority >= 0) { + + /* We have a component that indicated that it wants to run + by giving us a module */ + avail = OBJ_NEW(avail_atomic_t); + avail->ac_priority = priority; + avail->ac_module = module; + avail->ac_component = (mca_atomic_base_component_t *)component; + + /* Put this item on the list in priority order (lowest + priority first). Should it go first? */ + for (item2 = opal_list_get_first(selectable); + item2 != opal_list_get_end(selectable); + item2 = opal_list_get_next(item2)) { + avail2 = (avail_atomic_t *) item2; + if (avail->ac_priority < avail2->ac_priority) { + opal_list_insert_pos(selectable, + item2, + (opal_list_item_t *) avail); + break; + } + } + + if (opal_list_get_end(selectable) == item2) { + opal_list_append(selectable, + (opal_list_item_t *) avail); + } + } + } + /*TODO: copy over any of the pointers */ + + /* If we didn't find any available components, return an error */ + if (0 == opal_list_get_size(selectable)) { + OBJ_RELEASE(selectable); + return NULL; + } + + /* All done */ + return selectable; +} + + +/* + * Check a single component + */ +static int check_one_component(const mca_base_component_t * component, + mca_atomic_base_module_1_0_0_t ** module) +{ + int err; + int priority = -1; + + err = query(component, &priority, module); + + if (OSHMEM_SUCCESS == err) { + priority = (priority < 100) ? 
priority : 100; + ATOMIC_VERBOSE(10,"atomic:base:atomic_select: component available: %s, priority: %d", + component->mca_component_name, priority); + + } else { + priority = -1; + ATOMIC_VERBOSE(10,"atomic:base:atomic_select: component not available: %s", + component->mca_component_name); + } + + return priority; +} + + +/************************************************************************** + * Query functions + **************************************************************************/ + +/* + * Take any version of a atomic module, query it, and return the right + * module struct + */ +static int query(const mca_base_component_t * component, + int *priority, mca_atomic_base_module_1_0_0_t ** module) +{ + *module = NULL; + if (1 == component->mca_type_major_version && + 0 == component->mca_type_minor_version && + 0 == component->mca_type_release_version) { + const mca_atomic_base_component_1_0_0_t *atomic100 = + (mca_atomic_base_component_1_0_0_t *) component; + + return query_1_0_0(atomic100, priority, module); + } + + /* Unknown atomic API version -- return error */ + + return OSHMEM_ERROR; +} + + +static int query_1_0_0(const mca_atomic_base_component_1_0_0_t * component, + int *priority, + mca_atomic_base_module_1_0_0_t ** module) +{ + mca_atomic_base_module_1_0_0_t *ret; + + /* There's currently no need for conversion */ + + ret = component->atomic_query(priority); + if (NULL != ret) { + *module = ret; + return OSHMEM_SUCCESS; + } + + return OSHMEM_ERROR; +} diff --git a/oshmem/mca/atomic/base/base.h b/oshmem/mca/atomic/base/base.h new file mode 100644 index 0000000000..a117a2b96e --- /dev/null +++ b/oshmem/mca/atomic/base/base.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#ifndef MCA_ATOMIC_BASE_H +#define MCA_ATOMIC_BASE_H + +#include "oshmem_config.h" + +#include "oshmem/mca/atomic/atomic.h" +#include "opal/class/opal_list.h" + + +/* + * Global functions for MCA overall atomic open and close + */ + +BEGIN_C_DECLS + +/** + * Initialize the atomic MCA framework + * + * @retval OSHMEM_SUCCESS Upon success + * @retval OSHMEM_ERROR Upon failure + * + */ +OSHMEM_DECLSPEC int mca_atomic_base_open(void); + +/** + * Create list of available atomic components. + * + * @param enable_progress_threads Presumably forwarded to each component's + * atomic_init() -- NOTE(review): confirm against the implementation + * @param enable_threads Presumably forwarded to each component's + * atomic_init() -- NOTE(review): confirm against the implementation + * + * @retval OSHMEM_SUCCESS If one or more atomic components are available. + * @retval OSHMEM_ERROR If no atomic components are found to be available. + * + */ +int mca_atomic_base_find_available(bool enable_progress_threads, + bool enable_threads); + + +int mca_atomic_base_select(void); + + +/** + * Shut down the atomic MCA framework. + * + * @retval OSHMEM_SUCCESS Unless the selected component's atomic_finalize() fails + * + * This function shuts down everything in the atomic MCA framework, + * and is called during oshmem_shmem_finalize(). + * + * It must be the last function invoked on the atomic MCA framework. + */ +OSHMEM_DECLSPEC int mca_atomic_base_close(void); + + +/* + * Globals + */ + + +/** + * ATOMIC framework debugging stream ID used with opal_output() and + * opal_output_verbose(). + */ +OSHMEM_DECLSPEC extern int mca_atomic_base_output; + +/** + * Indicator as to whether the list of opened atomic components is valid or + * not. + */ +extern bool mca_atomic_base_components_opened_valid; + +/** + * List of all opened components; created when the atomic framework is + * initialized and destroyed when we reduce the list to all available + * atomic components.
+ */ +OSHMEM_DECLSPEC extern opal_list_t mca_atomic_base_components_opened; + +/** + * Indicator as to whether the list of available atomic components is valid + * or not. + */ +extern bool mca_atomic_base_components_available_valid; + +/** + * List of all available components; created by reducing the list of open + * components to all those who indicate that they may run during this + * process. + */ +extern opal_list_t mca_atomic_base_components_available; + + +/* ******************************************************************** */ +#ifdef __BASE_FILE__ +#define __ATOMIC_FILE__ __BASE_FILE__ +#else +#define __ATOMIC_FILE__ __FILE__ +#endif + +#define ATOMIC_VERBOSE(level, format, ...) \ + opal_output_verbose(level, mca_atomic_base_output, "%s:%d - %s() " format, \ + __ATOMIC_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) +/* Level-0 variant of ATOMIC_VERBOSE whose message is prefixed with "Error: " */ +#define ATOMIC_ERROR(format, ... ) \ + opal_output_verbose(0, mca_atomic_base_output, "Error: %s:%d - %s() " format, \ + __ATOMIC_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +END_C_DECLS + +#endif /* MCA_ATOMIC_BASE_H */ diff --git a/oshmem/mca/atomic/basic/.windows b/oshmem/mca/atomic/basic/.windows new file mode 100644 index 0000000000..104768dd6a --- /dev/null +++ b/oshmem/mca/atomic/basic/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libshmem diff --git a/oshmem/mca/atomic/basic/Makefile.am b/oshmem/mca/atomic/basic/Makefile.am new file mode 100644 index 0000000000..f726c3c58c --- /dev/null +++ b/oshmem/mca/atomic/basic/Makefile.am @@ -0,0 +1,42 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +sources = \ + atomic_basic.h \ + atomic_basic_module.c \ + atomic_basic_component.c \ + atomic_basic_fadd.c \ + atomic_basic_cswap.c + + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_oshmem_atomic_basic_DSO +component_noinst = +component_install = mca_atomic_basic.la +else +component_noinst = libmca_atomic_basic.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_atomic_basic_la_SOURCES = $(sources) +mca_atomic_basic_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_atomic_basic_la_SOURCES =$(sources) +libmca_atomic_basic_la_LDFLAGS = -module -avoid-version diff --git a/oshmem/mca/atomic/basic/atomic_basic.h b/oshmem/mca/atomic/basic/atomic_basic.h new file mode 100644 index 0000000000..eb8474aec2 --- /dev/null +++ b/oshmem/mca/atomic/basic/atomic_basic.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_ATOMIC_BASIC_H +#define MCA_ATOMIC_BASIC_H + +#include "oshmem_config.h" + +#include "opal/mca/mca.h" +#include "oshmem/mca/atomic/atomic.h" + +BEGIN_C_DECLS + +/* Globally exported variables */ + +OSHMEM_MODULE_DECLSPEC extern mca_atomic_base_component_1_0_0_t + mca_atomic_basic_component; + +extern int mca_atomic_basic_priority_param; + +OSHMEM_DECLSPEC void atomic_basic_lock(int pe); +OSHMEM_DECLSPEC void atomic_basic_unlock(int pe); + +/* API functions */ + +int mca_atomic_basic_init(bool enable_progress_threads, + bool enable_threads); +int mca_atomic_basic_finalize(void); +mca_atomic_base_module_t* + mca_atomic_basic_query(int *priority); + +int mca_atomic_basic_fadd(void *target, void *prev, const void *value, size_t nlong, int pe, struct oshmem_op_t *op); +int mca_atomic_basic_cswap(void *target, void *prev, const void *cond, const void *value, size_t nlong, int pe); + + +struct mca_atomic_basic_module_t { + mca_atomic_base_module_t super; +}; +typedef struct mca_atomic_basic_module_t mca_atomic_basic_module_t; +OBJ_CLASS_DECLARATION(mca_atomic_basic_module_t); + +END_C_DECLS + +#endif /* MCA_ATOMIC_BASIC_H */ diff --git a/oshmem/mca/atomic/basic/atomic_basic_component.c b/oshmem/mca/atomic/basic/atomic_basic_component.c new file mode 100644 index 0000000000..d507a29557 --- /dev/null +++ b/oshmem/mca/atomic/basic/atomic_basic_component.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" +#include "atomic_basic.h" + + +/* + * Public string showing the atomic basic component version number + */ +const char *mca_atomic_basic_component_version_string = + "Open SHMEM basic atomic MCA component version " OSHMEM_VERSION; + +/* + * Global variable + */ +int mca_atomic_basic_priority_param = -1; + +/* + * Local function + */ +static int __basic_open(void); + + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ + +mca_atomic_base_component_t mca_atomic_basic_component = { + + /* First, the mca_component_t struct containing meta information + about the component itself */ + + { + MCA_ATOMIC_BASE_VERSION_2_0_0, + + /* Component name and version */ + "basic", + OSHMEM_MAJOR_VERSION, + OSHMEM_MINOR_VERSION, + OSHMEM_RELEASE_VERSION, + + /* Component open and close functions */ + __basic_open, + NULL + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + /* Initialization / querying functions */ + + mca_atomic_basic_init, + mca_atomic_basic_finalize, + mca_atomic_basic_query +}; + + +static int __basic_open(void) +{ + /* Register the "priority" MCA parameter (default 75); the returned + index is kept for the lookup in mca_atomic_basic_query() */ + int default_value = 75; + int param_value = default_value; + + mca_atomic_basic_priority_param = + mca_base_param_reg_int(&mca_atomic_basic_component.atomic_version, "priority", NULL, false, false, default_value, &param_value); + + return OSHMEM_SUCCESS; +} + + +OBJ_CLASS_INSTANCE(mca_atomic_basic_module_t, + mca_atomic_base_module_t, + NULL, NULL); diff --git a/oshmem/mca/atomic/basic/atomic_basic_cswap.c b/oshmem/mca/atomic/basic/atomic_basic_cswap.c new file mode 100644 index 0000000000..9e0f3cad6d --- /dev/null +++
b/oshmem/mca/atomic/basic/atomic_basic_cswap.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include + +#include "oshmem/constants.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" +#include "atomic_basic.h" + +/* Compare-and-swap nlong bytes at target on PE pe, under atomic_basic_lock(); the fetched value is stored in prev */ +int mca_atomic_basic_cswap(void *target, void *prev, const void *cond, const void *value, size_t nlong, int pe) +{ + int rc = OSHMEM_SUCCESS; + /* prev receives the fetched value and value is the put source, so both are mandatory (as is target) */ + if (!target || !prev || !value) + { + rc = OSHMEM_ERROR; + } + + if ( rc == OSHMEM_SUCCESS ) + { + atomic_basic_lock(pe); + + rc = MCA_SPML_CALL(get(target, nlong, prev, pe)); + /* a NULL cond means: swap unconditionally */ + if (( rc == OSHMEM_SUCCESS ) && (!cond || !memcmp(prev, cond, nlong))) + { + rc = MCA_SPML_CALL(put(target, nlong, (void*)value, pe)); + shmem_quiet(); + } + + atomic_basic_unlock(pe); + } + + return rc; +} diff --git a/oshmem/mca/atomic/basic/atomic_basic_fadd.c b/oshmem/mca/atomic/basic/atomic_basic_fadd.c new file mode 100644 index 0000000000..ace56a04cd --- /dev/null +++ b/oshmem/mca/atomic/basic/atomic_basic_fadd.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include + +#include "oshmem/constants.h" +#include "oshmem/op/op.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/atomic/base/base.h" +#include "atomic_basic.h" + +/* Fetch nlong bytes from target on PE pe, combine them with value through op, and write the result back, under atomic_basic_lock(); the fetched value is copied to prev when prev is non-NULL */ +int mca_atomic_basic_fadd(void *target, void *prev, const void *value, size_t nlong, int pe, struct oshmem_op_t *op) +{ + int rc = OSHMEM_SUCCESS; + long long temp_value = 0; + + /* op is dereferenced below, and nlong bytes are fetched into temp_value, + so reject anything that would not fit the scratch value */ + if (!target || !value || !op || (nlong > sizeof(temp_value))) + { + rc = OSHMEM_ERROR; + } + + if ( rc == OSHMEM_SUCCESS ) + { + atomic_basic_lock(pe); + + rc = MCA_SPML_CALL(get(target, nlong, (void*)&temp_value, pe)); + + /* only report and update the value when the fetch actually succeeded */ + if (rc == OSHMEM_SUCCESS) + { + if (prev) memcpy(prev, (void*)&temp_value, nlong); + + op->o_func.c_fn((void*)value, (void*)&temp_value, nlong / op->dt_size); + + rc = MCA_SPML_CALL(put(target, nlong, (void*)&temp_value, pe)); + shmem_quiet(); + } + + atomic_basic_unlock(pe); + } + + return rc; +} diff --git a/oshmem/mca/atomic/basic/atomic_basic_module.c b/oshmem/mca/atomic/basic/atomic_basic_module.c new file mode 100644 index 0000000000..2500e04088 --- /dev/null +++ b/oshmem/mca/atomic/basic/atomic_basic_module.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include + +#include "opal/mca/base/mca_base_param.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/atomic/atomic.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/proc/proc.h" +#include "atomic_basic.h" + + +static char *atomic_lock_sync; +static int *atomic_lock_turn; +static char *local_lock_sync; +static int *local_lock_turn; + +enum +{ + ATOMIC_LOCK_IDLE = 0, + ATOMIC_LOCK_WAITING = 1, + ATOMIC_LOCK_ACTIVE = 2 +}; + +/* + * Initial query function that is invoked during initialization, allowing + * this module to indicate what level of thread support it provides. + */ +int mca_atomic_basic_init(bool enable_progress_threads, + bool enable_threads) +{ + int rc = OSHMEM_SUCCESS; + void* ptr = NULL; + int num_pe = oshmem_num_procs(); + + rc = MCA_MEMHEAP_CALL(private_alloc((num_pe * sizeof(char)), &ptr)); + if(rc == OSHMEM_SUCCESS) + { + atomic_lock_sync = (char*)ptr; + memset(atomic_lock_sync, ATOMIC_LOCK_IDLE, sizeof(char) * num_pe); + + rc = MCA_MEMHEAP_CALL(private_alloc(sizeof(int), &ptr)); + if(rc == OSHMEM_SUCCESS) + { + atomic_lock_turn = (int*)ptr; + *atomic_lock_turn = 0; + if(rc == OSHMEM_SUCCESS) + { + local_lock_sync = (char*)malloc(num_pe * sizeof(char)); + local_lock_turn = (int*)malloc(sizeof(int)); + if (!local_lock_sync || !local_lock_turn) + { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + else + { + memcpy((void*)local_lock_sync, (void*)atomic_lock_sync, sizeof(char) * num_pe); + *local_lock_turn = *atomic_lock_turn; + } + } + } + } + + return rc; +} + + +int mca_atomic_basic_finalize(void) +{ + void* ptr = NULL; + + ptr = (void*)atomic_lock_sync; + MCA_MEMHEAP_CALL(private_free(ptr)); + atomic_lock_sync = NULL; + + ptr = (void*)atomic_lock_turn; + MCA_MEMHEAP_CALL(private_free(ptr)); + atomic_lock_turn = NULL; + + if (local_lock_sync) + { + free((void*)local_lock_sync); + local_lock_sync 
= NULL; + } + + if (local_lock_turn) + { + free((void*)local_lock_turn); + local_lock_turn = NULL; + } + + return OSHMEM_SUCCESS; +} + + +mca_atomic_base_module_t * +mca_atomic_basic_query(int *priority) +{ + mca_atomic_basic_module_t *module; + + if (OSHMEM_SUCCESS == + mca_base_param_lookup_int(mca_atomic_basic_priority_param, + priority)) + { + module = OBJ_NEW(mca_atomic_basic_module_t); + if (module) + { + module->super.atomic_fadd = mca_atomic_basic_fadd; + module->super.atomic_cswap = mca_atomic_basic_cswap; + return &(module->super); + } + } + + return NULL; +} + +void atomic_basic_lock(int pe) +{ + int rc = OSHMEM_SUCCESS; + int index = -1; + int me = oshmem_my_proc_id(); + int num_pe = oshmem_num_procs(); + char lock_required = ATOMIC_LOCK_WAITING; + char lock_active = ATOMIC_LOCK_ACTIVE; + int root_pe = pe; + + do + { + /* announce that we need the resource */ + do + { + rc = MCA_SPML_CALL(put((void*)(atomic_lock_sync + me), sizeof(lock_required), (void*)&lock_required, root_pe)); + //MCA_SPML_CALL(fence()); /* quiet */ + MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe)); + } while (local_lock_sync[me] != lock_required); + + MCA_SPML_CALL(get((void*)atomic_lock_turn, sizeof(index), (void*)&index, root_pe)); + while (index != me) + { + if (local_lock_sync[index] != ATOMIC_LOCK_IDLE) + { + MCA_SPML_CALL(get((void*)atomic_lock_turn, sizeof(index), (void*)&index, root_pe)); + MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe)); + } + else + { + index = (index + 1) % num_pe; + } + } + + /* now tentatively claim the resource */ + do + { + rc = MCA_SPML_CALL(put((void*)(atomic_lock_sync + me), sizeof(lock_active), (void*)&lock_active, root_pe)); + //MCA_SPML_CALL(fence()); /* quiet */ + MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe)); + } while (local_lock_sync[me] != 
lock_active); + + index = 0; + while((index < num_pe)&& + ((index == me)||(local_lock_sync[index] != ATOMIC_LOCK_ACTIVE))) + { + index = index + 1; + } + + MCA_SPML_CALL(get((void*)atomic_lock_turn, sizeof(*atomic_lock_turn), (void*)local_lock_turn, root_pe)); + } while (!((index >= num_pe) && + ((*local_lock_turn == me)||(local_lock_sync[*local_lock_turn] == ATOMIC_LOCK_IDLE)))); + + rc = MCA_SPML_CALL(put((void*)atomic_lock_turn, sizeof(me), (void*)&me, root_pe)); + //MCA_SPML_CALL(fence()); /* quiet */ +} + + +void atomic_basic_unlock(int pe) +{ + int rc = OSHMEM_SUCCESS; + int index = -1; + int me = oshmem_my_proc_id(); + int num_pe = oshmem_num_procs(); + char lock_idle = ATOMIC_LOCK_IDLE; + int root_pe = pe; + + MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe)); + MCA_SPML_CALL(get((void*)atomic_lock_turn, sizeof(index), (void*)&index, root_pe)); + + do + { + index = (index + 1) % num_pe; + } while (local_lock_sync[index] == ATOMIC_LOCK_IDLE); + + rc = MCA_SPML_CALL(put((void*)atomic_lock_turn, sizeof(index), (void*)&index, root_pe)); + + do + { + rc = MCA_SPML_CALL(put((void*)(atomic_lock_sync + me), sizeof(lock_idle), (void*)&lock_idle, root_pe)); + //MCA_SPML_CALL(fence()); /* quiet */ + MCA_SPML_CALL(get((void*)atomic_lock_sync, num_pe * sizeof(*atomic_lock_sync), (void*)local_lock_sync, root_pe)); + } while (local_lock_sync[me] != lock_idle); +} diff --git a/oshmem/mca/atomic/basic/configure.params b/oshmem/mca/atomic/basic/configure.params new file mode 100644 index 0000000000..5a3f93008f --- /dev/null +++ b/oshmem/mca/atomic/basic/configure.params @@ -0,0 +1,13 @@ +# -*- shell-script -*- +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/memheap/Makefile.am b/oshmem/mca/memheap/Makefile.am new file mode 100644 index 0000000000..a60f94beb2 --- /dev/null +++ b/oshmem/mca/memheap/Makefile.am @@ -0,0 +1,39 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_memheap.la +libmca_memheap_la_SOURCES = +libmca_memheap_la_LDFLAGS = +libmca_memheap_la_LIBADD = + +# header setup +nobase_oshmem_HEADERS = +nobase_nodist_oshmem_HEADERS = + +dist_pkgdata_DATA = + +# local files +headers = memheap.h +libmca_memheap_la_SOURCES += $(headers) $(nodist_headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +nobase_oshmem_HEADERS += $(headers) +nobase_nodist_oshmem_HEADERS += $(nodist_headers) +oshmemdir = $(includedir)/oshmem/oshmem/mca/memheap +else +oshmemdir = $(includedir) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/oshmem/mca/memheap/README b/oshmem/mca/memheap/README new file mode 100644 index 0000000000..4f06a45099 --- /dev/null +++ b/oshmem/mca/memheap/README @@ -0,0 +1,50 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved +# $COPYRIGHT$ +MEMHEAP Infrastructure documentation +------------------------------------ + +MEMHEAP Infrastructure is responsible for managing the symmetric heap. +The framework currently has the following components: buddy and ptmalloc. The buddy component uses a buddy allocator to manage memory allocations on the symmetric heap. Ptmalloc is an adaptation of ptmalloc3. + +Additional components may be added easily to the framework by defining the component's and the module's base and extended structures, and their functionality.
+ +The buddy allocator has the following data structures: +1. Base component - of type struct mca_memheap_base_component_2_0_0_t +2. Base module - of type struct mca_memheap_base_module_t +3. Buddy component - of type struct mca_memheap_base_component_2_0_0_t +4. Buddy module - of type struct mca_memheap_buddy_module_t extending the base module (struct mca_memheap_base_module_t) + +Each data structure includes the following fields: +1. Base component - memheap_version, memheap_data and memheap_init +2. Base module - Holds pointers to the base component and to the functions: alloc, free and finalize +3. Buddy component - is a base component. +4. Buddy module - Extends the base module and holds additional data on the component's priority, buddy allocator, + maximal order of the symmetric heap, symmetric heap, pointer to the symmetric heap and hashtable maintaining the size of each allocated address. + +In the case that the user decides to implement additional components, the Memheap infrastructure chooses a component with the maximal priority. +Handling the component opening is done under the base directory, in three stages: +1. Open all available components. Implemented by memheap_base_open.c and called from shmem_init. +2. Select the maximal priority component. This procedure involves the initialization of all components and then their + finalization, except for the chosen component. It is implemented by memheap_base_select.c and called from shmem_init. +3. Close the max priority active component. Implemented by memheap_base_close.c and called from shmem finalize. + + +Buddy Component/Module +---------------------- + +Responsible for handling all activities of the symmetric heap. +The supported activities are: + - buddy_init (Initialization) + - buddy_alloc (Allocates a variable on the symmetric heap) + - buddy_free (frees a variable previously allocated on the symmetric heap) + - buddy_finalize (Finalization). + +Data members of buddy module: - priority.
The module's priority. + - buddy allocator: bits, num_free, lock and the maximal order (log2 of the maximal size) + of a variable on the symmetric heap. Buddy Allocator gives the offset in the symmetric heap + where a variable should be allocated. + - symmetric_heap: a range of reserved addresses (equal in all executing PE's) dedicated to "shared memory" allocation. + - symmetric_heap_hashtable (holding the size of an allocated variable on the symmetric heap. + used to free an allocated variable on the symmetric heap) + diff --git a/oshmem/mca/memheap/base/Makefile.am b/oshmem/mca/memheap/base/Makefile.am new file mode 100644 index 0000000000..e8efe488b8 --- /dev/null +++ b/oshmem/mca/memheap/base/Makefile.am @@ -0,0 +1,28 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) +AM_CPPFLAGS = $(openib_CPPFLAGS) + +dist_pkgdata_DATA += base/help-shmem-mca.txt + +headers += \ + base/base.h + +libmca_memheap_la_SOURCES += \ + base/memheap_base_open.c \ + base/memheap_base_select.c \ + base/memheap_base_alloc.c \ + base/memheap_base_static.c \ + base/memheap_base_register.c \ + base/memheap_base_mkey.c \ + base/memheap_base_close.c + +libmca_memheap_la_LDFLAGS += -module -avoid-version $(openib_LDFLAGS) +libmca_memheap_la_LIBADD += $(openib_LIBS) diff --git a/oshmem/mca/memheap/base/base.h b/oshmem/mca/memheap/base/base.h new file mode 100644 index 0000000000..57f980973c --- /dev/null +++ b/oshmem/mca/memheap/base/base.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_MEMHEAP_BASE_H +#define MCA_MEMHEAP_BASE_H + +#include "oshmem_config.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_value_array.h" +#include "opal/mca/mca.h" + +#include "oshmem/mca/memheap/memheap.h" + + +BEGIN_C_DECLS + + +/* + * Global functions for MCA: overall MEMHEAP open and close + */ +OSHMEM_DECLSPEC int mca_memheap_base_open(void); +OSHMEM_DECLSPEC int mca_memheap_base_select(void); +OSHMEM_DECLSPEC int mca_memheap_base_close(void); + + +/* + * Globals + */ +OSHMEM_DECLSPEC extern int mca_memheap_base_output; +OSHMEM_DECLSPEC extern opal_list_t mca_memheap_base_components_opened; +OSHMEM_DECLSPEC extern struct mca_memheap_base_module_t* mca_memheap_base_module_initialized; + + +/* only used within base -- no need to DECLSPEC */ +#define MEMHEAP_BASE_START_ADDRESS 0xFF000000 +#define MEMHEAP_BASE_MIN_ORDER 3 /* forces 64 bit alignment */ +#define MEMHEAP_BASE_PAGE_ORDER 21 +#define MEMHEAP_BASE_PRIVATE_SIZE (1ULL << MEMHEAP_BASE_PAGE_ORDER) /* should be at least the same as a huge page size */ +#define MEMHEAP_BASE_MIN_SIZE (1ULL << MEMHEAP_BASE_PAGE_ORDER) /* must fit into at least one huge page */ + +extern char* mca_memheap_base_include; +extern char* mca_memheap_base_exclude; +extern int mca_memheap_base_already_opened; +extern int mca_memheap_base_shmalloc_use_hugepages; +extern int mca_memheap_buddy_use_modex; +extern int mca_memheap_base_mr_interleave_factor; + + +#define MCA_MEMHEAP_MAX_SEGMENTS 256 +#define HEAP_SEG_INDEX 0 +#define SYMB_SEG_INDEX 1 + + +#define MEMHEAP_SHM_INVALID (-1) + +#define MEMHEAP_SHM_CODE( type, id ) ((((uint64_t)(type)) << 32) | ((uint32_t)(id))) +#define MEMHEAP_SHM_GET_TYPE( x ) (((uint32_t)((x) >> 32)) & 0xFFFFFFFF) +#define MEMHEAP_SHM_GET_ID( x ) ((uint32_t)((x) & 0xFFFFFFFF)) + + +typedef enum { + MAP_SEGMENT_STATIC = 0, + MAP_SEGMENT_ALLOC_MMAP, + MAP_SEGMENT_ALLOC_SHM, + 
MAP_SEGMENT_ALLOC_IBV, + MAP_SEGMENT_UNKNOWN +} segment_type_t; + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) +#include + +typedef struct openib_device_t { + struct ibv_device **ib_devs; + struct ibv_device *ib_dev; + struct ibv_context *ib_dev_context; + struct ibv_device_attr ib_dev_attr; + struct ibv_pd *ib_pd; + opal_value_array_t ib_mr_array;; + struct ibv_mr *ib_mr_shared; +} openib_device_t; +#endif /* MPAGE_ENABLE */ + + +typedef struct map_segment_t { + mca_spml_mkey_t **mkeys_cache; /* includes remote segment bases in va_base */ + mca_spml_mkey_t *mkeys; /* includes local segment bases in va_base */ + int is_active; /* enable/disable flag */ + int shmid; + + uint64_t start; /* base address of the segment */ + uint64_t end; /* final address of the segment */ + size_t size; /* length of the segment */ + + segment_type_t type; /* type of the segment */ + void *context; /* additional data related the segment */ +} map_segment_t; + +typedef struct mca_memheap_map { + map_segment_t mem_segs[MCA_MEMHEAP_MAX_SEGMENTS]; /* TODO: change into pointer array */ + int n_segments; + int num_transports; +} mca_memheap_map_t; + + +extern mca_memheap_map_t mca_memheap_base_map; + +OSHMEM_DECLSPEC int mca_memheap_base_alloc_init(mca_memheap_map_t *, size_t); +OSHMEM_DECLSPEC void mca_memheap_base_alloc_exit(mca_memheap_map_t *); +OSHMEM_DECLSPEC int mca_memheap_base_static_init(mca_memheap_map_t *); +OSHMEM_DECLSPEC void mca_memheap_base_static_exit(mca_memheap_map_t *); +OSHMEM_DECLSPEC int mca_memheap_base_register(mca_memheap_map_t *); +OSHMEM_DECLSPEC int mca_memheap_base_deregister(mca_memheap_map_t *); +OSHMEM_DECLSPEC int memheap_oob_init(mca_memheap_map_t *); +OSHMEM_DECLSPEC void memheap_oob_destruct(void); + +OSHMEM_DECLSPEC uint64_t mca_memheap_base_find_offset(int pe, int tr_id, unsigned long va, uint64_t rva); +OSHMEM_DECLSPEC int mca_memheap_base_is_symmetric_addr(unsigned long va); +OSHMEM_DECLSPEC mca_spml_mkey_t *mca_memheap_base_get_mkey(unsigned long 
va, int tr_id); +OSHMEM_DECLSPEC mca_spml_mkey_t * mca_memheap_base_get_cached_mkey(int pe, unsigned long va, int btl_id, uint64_t *rva); +OSHMEM_DECLSPEC void mca_memheap_modex_recv_all(void); + +/* This function is for internal usage only + * return value: + * 0 - addr is not symmetric address + * 1 - addr is part of user memheap + * 2 - addr is part of private memheap + * 3 - addr is static variable + */ +typedef enum { + ADDR_INVALID = 0, + ADDR_USER, + ADDR_PRIVATE, + ADDR_STATIC, +} addr_type_t; + +OSHMEM_DECLSPEC int mca_memheap_base_detect_addr_type(unsigned long va); + + +static inline unsigned memheap_log2(unsigned long long val) +{ + /* add 1 if val is NOT a power of 2 (to do the ceil) */ + unsigned int count = (val & (val-1) ? 1 : 0); + + while(val > 0) + { + val = val >> 1; + count++; + } + + return count > 0 ? count-1: 0; +} + +static inline void *memheap_down_align_addr(void* addr, unsigned int shift) +{ + return (void*) (((intptr_t) addr) & (~(intptr_t) 0) << shift); +} + +static inline void *memheap_up_align_addr(void*addr, unsigned int shift) +{ + return (void*) ((((intptr_t) addr) | ~((~(intptr_t) 0) << shift))); +} + +static inline unsigned long long memheap_align(unsigned long top) +{ + return ((top + MEMHEAP_BASE_MIN_SIZE - 1) & ~(MEMHEAP_BASE_MIN_SIZE - 1)); +} + + +/*----------------------------------------------------------------------------------*/ +/*logger macros*/ + +#ifdef __BASE_FILE__ +#define __SPML_FILE__ __BASE_FILE__ +#else +#define __SPML_FILE__ __FILE__ +#endif + +#define MEMHEAP_VERBOSE(level, format, ...) \ + opal_output_verbose(level, mca_memheap_base_output, "%s:%d - %s() " format, \ + __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define MEMHEAP_ERROR(format, ... ) \ + opal_output_verbose(0, mca_memheap_base_output, "Error: %s:%d - %s() " format, \ + __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define MEMHEAP_WARN(format, ... 
) \ + opal_output_verbose(0, mca_memheap_base_output, "Warning: %s:%d - %s() " format, \ + __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) +END_C_DECLS + +#endif /* MCA_MEMHEAP_BASE_H */ diff --git a/oshmem/mca/memheap/base/help-shmem-mca.txt b/oshmem/mca/memheap/base/help-shmem-mca.txt new file mode 100644 index 0000000000..e59ee1a1aa --- /dev/null +++ b/oshmem/mca/memheap/base/help-shmem-mca.txt @@ -0,0 +1,23 @@ +# -*- text -*- +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open SHMEM MCA error messages. +# +[find-available:none-found] +No available %s components were found! + +This means that there are no components of this type installed on your +system or all the components reported that they could not be used. + +This is a fatal error; your SHMEM process is likely to abort. Check the +output of the "ompi_info" command and ensure that components of this +type are available on your system. You may also wish to check the +value of the "component_path" MCA parameter and ensure that it has at +least one directory that contains valid MCA components. diff --git a/oshmem/mca/memheap/base/memheap_base_alloc.c b/oshmem/mca/memheap/base/memheap_base_alloc.c new file mode 100644 index 0000000000..f3712480dc --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_alloc.c @@ -0,0 +1,529 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include "opal/util/output.h" +#include "orte/util/show_help.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" + +#ifdef HAVE_SYS_MMAN_H +#include +#endif + +#include +#include + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) +#include +#endif /* MPAGE_ENABLE */ + +static int __shm_attach(map_segment_t *, size_t, int, int); +static void __shm_detach(map_segment_t *); + +static int __mmap_attach(map_segment_t *, size_t); +static void __mmap_detach(map_segment_t *); + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) +static int __ibv_attach(map_segment_t *, size_t); +static void __ibv_detach(map_segment_t *); +#endif /* MPAGE_ENABLE */ + +static int __adaptive_attach(map_segment_t *, size_t); + + +int mca_memheap_base_alloc_init(mca_memheap_map_t *map, size_t size) +{ + int ret = OSHMEM_SUCCESS; + int value = mca_memheap_base_shmalloc_use_hugepages; + + assert(map); + assert(HEAP_SEG_INDEX == map->n_segments); + + MEMHEAP_VERBOSE(5,"memheap method : %d", + mca_memheap_base_shmalloc_use_hugepages); + + map_segment_t *s = &map->mem_segs[map->n_segments]; + memset(s, 0, sizeof(*s)); + s->is_active = 0; + s->shmid = MEMHEAP_SHM_INVALID; + s->start = 0; + s->end = 0; + s->size = 0; + s->type = MAP_SEGMENT_UNKNOWN; + s->context = NULL; + + switch(value) { + case 0: + /* use sysv alloc without hugepages */ + ret = __shm_attach(s, size, 0, 1); + break; + + case 1: + ret = __shm_attach(s, size, 1, 1); + if (OSHMEM_SUCCESS != ret) + ret = __shm_attach(s, size, 0, 1); + break; + + case 2: + /* huge pages only */ + ret = __shm_attach(s, size, 1, 1); + if (OSHMEM_SUCCESS != ret) + MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d", errno); + break; + + case 3: + /* huge pages only + cleanup shmid */ + ret = __shm_attach(s, size, 1, 0); + if (OSHMEM_SUCCESS != ret) + 
MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d", errno); + break; + + case 4: + /* use sysv alloc without hugepages */ + ret = __shm_attach(s, size, 0, 0); + break; + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + case 5: + /* use shared memory registration (mpages) */ + ret = __ibv_attach(s, size); + break; +#endif /* MPAGE_ENABLE */ + + case 100: + /* use mmap. It will severaly impact performance of intra node communication */ + ret = __mmap_attach(s, size); + MEMHEAP_VERBOSE(1, "mmap() memheap allocation will severely impact performance of intra node communication"); + break; + + case 101: + ret = __shm_attach(s, size, 1, 1); + if (OSHMEM_SUCCESS != ret) { + MEMHEAP_ERROR("Failed to allocate hugepages. Falling back on regular allocation"); + ret = __mmap_attach(s, size); + } + else { + s->shmid = MEMHEAP_SHM_INVALID; + } + MEMHEAP_VERBOSE(1, "SM BTL will be always used for intranode comm\n"); + break; + + case 102: + ret = __shm_attach(s, size, 1, 1); + if (OSHMEM_SUCCESS != ret) { + MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d", errno); + } + else { + s->shmid = MEMHEAP_SHM_INVALID; + } + break; + + default: + ret = __adaptive_attach(s, size); +// MEMHEAP_ERROR("Unknown memheap allocation method: %d", value); +// ret = OSHMEM_ERR_BAD_PARAM; + } + + if (OSHMEM_SUCCESS == ret) + { + map->n_segments++; + MEMHEAP_VERBOSE(1, "Memheap alloc memory: %llu byte(s), %d segments by method: %d", + (unsigned long long)size, map->n_segments, s->type); + } + + return ret; +} + + +void mca_memheap_base_alloc_exit(mca_memheap_map_t *map) +{ + if (map) + { + map_segment_t *s = &map->mem_segs[HEAP_SEG_INDEX]; + + assert(s); + + switch(s->type) { + case MAP_SEGMENT_ALLOC_SHM: + __shm_detach(s); + break; + + case MAP_SEGMENT_ALLOC_MMAP: + __mmap_detach(s); + break; + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + case MAP_SEGMENT_ALLOC_IBV: + __ibv_detach(s); + break; 
+#endif /* MPAGE_ENABLE */
+
+        default:
+            MEMHEAP_ERROR("Unknown segment type: %d", (int)s->type);
+        }
+    }
+}
+
+
+/*
+ * Allocate the symmetric heap segment by trying the available methods in
+ * order until one of them succeeds:
+ *   1. __ibv_attach()                      (only when MPAGE_ENABLE > 0)
+ *   2. __shm_attach(use_hp = 1, do_rmid = 1)
+ *   3. __shm_attach(use_hp = 0, do_rmid = 1)
+ *   4. __shm_attach(use_hp = 0, do_rmid = 0)
+ *   5. __mmap_attach()
+ *
+ * Returns OSHMEM_SUCCESS as soon as one method has filled in *s, otherwise
+ * the error code of the last method that was tried.
+ */
+static int __adaptive_attach(map_segment_t *s, size_t size)
+{
+    /*
+     * Start from a failure code.  When MPAGE_ENABLE is off nothing has been
+     * attempted yet, so starting from OSHMEM_SUCCESS would skip every
+     * fallback below and report success without attaching any memory.
+     */
+    int rc = OSHMEM_ERROR;
+
+#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
+    rc = __ibv_attach(s, size);
+#endif /* MPAGE_ENABLE */
+
+    if (rc)
+    {
+        rc = __shm_attach(s, size, 1, 1);
+    }
+
+    if (rc)
+    {
+        rc = __shm_attach(s, size, 0, 1);
+    }
+
+    if (rc)
+    {
+        rc = __shm_attach(s, size, 0, 0);
+    }
+
+    if (rc)
+    {
+        rc = __mmap_attach(s, size);
+    }
+
+    return rc;
+}
+
+
+/*
+ * Create a private SysV shared memory segment of `size` bytes and attach it
+ * at MEMHEAP_BASE_START_ADDRESS.
+ *
+ *   use_hp  - non-zero adds SHM_HUGETLB to the shmget() flags
+ *   do_rmid - non-zero marks the segment for removal (IPC_RMID) right after
+ *             a successful attach
+ *
+ * On success *s is filled in (type, shmid, start, size, end, context) and
+ * OSHMEM_SUCCESS is returned.  Returns OSHMEM_ERROR when shmget() fails and
+ * OSHMEM_ERR_OUT_OF_RESOURCE when shmat() fails (the segment is removed).
+ */
+static int __shm_attach(map_segment_t *s, size_t size, int use_hp, int do_rmid)
+{
+    /* single static flag shared by all calls; s->context points at it and
+     * __shm_detach() reads it to learn whether huge pages were requested */
+    static int shm_context = 0;
+    void *addr = NULL;
+    int shmid = MEMHEAP_SHM_INVALID;
+    int flags;
+
+    assert(s);
+
+    shm_context = use_hp;
+
+    flags = IPC_CREAT | IPC_EXCL | SHM_R | SHM_W;
+    flags |= ( use_hp ? SHM_HUGETLB : 0 );
+
+    /* Create a new shared memory segment and save the shmid. */
+    shmid = shmget(IPC_PRIVATE, size, flags);
+    if (shmid == MEMHEAP_SHM_INVALID) {
+        MEMHEAP_VERBOSE(1, "Failed to get shm segment (errno=%d)", errno);
+        return OSHMEM_ERROR;
+    }
+
+    /* Attach to the segment */
+    addr = shmat(shmid, (void *)MEMHEAP_BASE_START_ADDRESS, 0);
+    if (addr == (void *)-1L) {
+        MEMHEAP_VERBOSE(1, "Failed to attach to shm segment (errno=%d)", errno);
+
+        shmctl(shmid, IPC_RMID, NULL);
+        return OSHMEM_ERR_OUT_OF_RESOURCE;
+    }
+
+    MEMHEAP_VERBOSE(5, "got shmid %d", shmid);
+
+    if (do_rmid)
+        shmctl(shmid, IPC_RMID, NULL);
+
+    s->type = MAP_SEGMENT_ALLOC_SHM;
+    s->shmid = shmid;
+    s->start = (uintptr_t)addr;
+    s->size = size;
+    s->end = s->start + s->size;
+    s->context = &shm_context;
+
+    return OSHMEM_SUCCESS;
+}
+
+
+static void __shm_detach(map_segment_t *s)
+{
+    assert(s);
+
+    if (s->shmid != MEMHEAP_SHM_INVALID)
+    {
+        shmctl(s->shmid, IPC_RMID, NULL);
+    }
+
+    if (s->context && (*((int *)(s->context))) > 0) {
+        /**
+         * Workaround kernel panic when detaching huge pages from user space simultaneously from several processes
+         * 
dont detach here instead let kernel do it during process cleanup + */ + //shmdt((void *)s->start); + } +} + + +static int __mmap_attach(map_segment_t *s, size_t size) +{ + void *addr = NULL; + + assert(s); + + addr = mmap((void *)MEMHEAP_BASE_START_ADDRESS, + size, + PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, 0, 0); + + if (MAP_FAILED == addr) { + MEMHEAP_ERROR("Failed to mmap() %llu bytes (errno=%d)", (unsigned long long)size, errno); + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + s->type = MAP_SEGMENT_ALLOC_MMAP; + s->shmid = MEMHEAP_SHM_INVALID; + s->start = (uintptr_t)addr; + s->size = size; + s->end = s->start + s->size; + s->context = NULL; + + return OSHMEM_SUCCESS; +} + + +static void __mmap_detach(map_segment_t *s) +{ + assert(s); + + munmap((void *)s->start, s->size); +} + + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + +static int __ibv_attach(map_segment_t *s, size_t size) +{ + int rc = OSHMEM_SUCCESS; + static openib_device_t memheap_device; + openib_device_t *device = &memheap_device; + int num_devs = 0; + + assert(s); + + memset(device, 0, sizeof(*device)); + +#ifdef HAVE_IBV_GET_DEVICE_LIST + device->ib_devs = ibv_get_device_list(&num_devs); +#else + #error unsupported ibv_get_device_list in infiniband/verbs.h +#endif + + if (num_devs == 0 || !device->ib_devs) + { + rc = OSHMEM_ERR_NOT_SUPPORTED; + } + + /* Open device */ + if (!rc) + { + int i = 0; + + if (num_devs > 1) + MEMHEAP_VERBOSE(5, "found %d HCAs, choosing the first", num_devs); + + for (i = 0; i < num_devs; i++) + { + device->ib_dev = device->ib_devs[i]; + + device->ib_dev_context = ibv_open_device(device->ib_dev); + if (NULL == device->ib_dev_context) + { + MEMHEAP_ERROR("error obtaining device context for %s errno says %d: %s", + ibv_get_device_name(device->ib_dev), errno, strerror(errno)); + rc = OSHMEM_ERR_RESOURCE_BUSY; + } + else + { + MEMHEAP_VERBOSE(5, "selected %s as %d of %d", ibv_get_device_name(device->ib_dev), i, num_devs); + rc = OSHMEM_SUCCESS; + break; 
+ } + } + } + + /* Obtain device attributes */ + if (!rc) + { + if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) + { + MEMHEAP_ERROR("error obtaining device attributes for %s errno says %d: %s", + ibv_get_device_name(device->ib_dev), errno, strerror(errno)); + rc = OSHMEM_ERR_RESOURCE_BUSY; + } + else + { + MEMHEAP_VERBOSE(5, "ibv device %s", + ibv_get_device_name(device->ib_dev)); + } + } + + /* Allocate the protection domain for the device */ + if (!rc) + { + device->ib_pd = ibv_alloc_pd(device->ib_dev_context); + if (NULL == device->ib_pd) + { + MEMHEAP_ERROR("error allocating protection domain for %s errno says %d: %s", + ibv_get_device_name(device->ib_dev), errno, strerror(errno)); + rc = OSHMEM_ERR_RESOURCE_BUSY; + } + } + + /* Allocate memory */ + if (!rc) + { + void *addr = NULL; + struct ibv_mr *ib_mr = NULL; + int access_flag = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ; + + OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); + opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); + +#if defined(MPAGE_ENABLE) && ((MPAGE_ENABLE == 1) || (MPAGE_ENABLE == 2)) + access_flag |= IBV_ACCESS_ALLOCATE_MR | + IBV_ACCESS_SHARED_MR_USER_READ | + IBV_ACCESS_SHARED_MR_USER_WRITE; +#endif /* MPAGE_ENABLE */ + + ib_mr = ibv_reg_mr(device->ib_pd, addr, size, access_flag); + if (NULL == ib_mr) + { + MEMHEAP_ERROR("error to ibv_reg_mr() %llu bytes errno says %d: %s", + (unsigned long long)size, errno, strerror(errno)); + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + else + { + device->ib_mr_shared = ib_mr; + opal_value_array_append_item(&device->ib_mr_array, &ib_mr); + } + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE == 2) + if (!rc) + { + access_flag = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ| + IBV_ACCESS_NO_RDMA; + + addr = (void *)MEMHEAP_BASE_START_ADDRESS; + ib_mr = ibv_reg_shared_mr(device->ib_mr_shared->handle, + device->ib_pd, addr, access_flag); + if (NULL == 
ib_mr) + { + MEMHEAP_ERROR("error to ibv_reg_shared_mr() %llu bytes errno says %d: %s", + (unsigned long long)size, errno, strerror(errno)); + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + else + { + opal_value_array_append_item(&device->ib_mr_array, &ib_mr); + } + } +#endif /* MPAGE_ENABLE */ + + if (!rc) + { + assert(size == device->ib_mr_shared->length); + + s->type = MAP_SEGMENT_ALLOC_IBV; + s->shmid = device->ib_mr_shared->handle; + s->start = (intptr_t)ib_mr->addr; + s->size = size; + s->end = s->start + s->size; + s->context = &memheap_device; + } + } + + return rc; +} + + +static void __ibv_detach(map_segment_t *s) +{ + int rc = OSHMEM_SUCCESS; + openib_device_t *device = NULL; + + assert(s); + + device = (openib_device_t *)s->context; + + if (device) + { + if(!rc && opal_value_array_get_size(&device->ib_mr_array)) + { + struct ibv_mr** array; + struct ibv_mr* ib_mr = NULL; + array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *); + while (opal_value_array_get_size(&device->ib_mr_array) > 0) + { + ib_mr = array[0]; + if(ibv_dereg_mr(ib_mr)) + { + MEMHEAP_ERROR("error ibv_dereg_mr(): %d: %s", errno, strerror(errno)); + rc = OSHMEM_ERROR; + } + opal_value_array_remove_item(&device->ib_mr_array, 0); + } + + if(!rc && device->ib_mr_shared) + { + device->ib_mr_shared = NULL; + } + OBJ_DESTRUCT(&device->ib_mr_array); + } + + if(!rc && device->ib_pd) + { + if(ibv_dealloc_pd(device->ib_pd)) + { + MEMHEAP_ERROR("error ibv_dealloc_pd(): %d: %s", errno, strerror(errno)); + rc = OSHMEM_ERROR; + } + else + { + device->ib_pd = NULL; + } + } + + if(!rc && device->ib_dev_context) + { + if(ibv_close_device(device->ib_dev_context)) + { + MEMHEAP_ERROR("error ibv_close_device(): %d: %s", errno, strerror(errno)); + rc = OSHMEM_ERROR; + } + else + { + device->ib_dev_context = NULL; + } + } + + if(!rc && device->ib_devs) + { + ibv_free_device_list(device->ib_devs); + device->ib_devs = NULL; + } + } +} + +#endif /* MPAGE_ENABLE */ diff --git 
a/oshmem/mca/memheap/base/memheap_base_close.c b/oshmem/mca/memheap/base/memheap_base_close.c new file mode 100644 index 0000000000..5ee00e6c9c --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_close.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "orte/util/show_help.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_component_repository.h" +#include "opal/runtime/opal.h" +#include "oshmem/constants.h" +#include "opal/mca/mca.h" +#include "oshmem/mca/memheap/base/base.h" + + +int mca_memheap_base_close(void) +{ + int rc = OSHMEM_SUCCESS; + + if( mca_memheap_base_already_opened <= 0 ) { + return OSHMEM_ERROR; + } + mca_memheap_base_already_opened--; + if (mca_memheap_base_already_opened > 0) { + return OSHMEM_SUCCESS; + } +#if 0 + /* disable event processing while cleaning up memheaps */ + opal_event_disable(); +#endif + + /* Free allocated module */ + /*free(mca_memheap_base_module_initialized); + if(NULL != mca_memheap_base_module_initialized){ + rc = mca_memheap_base_module_initialized->memheap_finalize(mca_memheap_base_module_initialized); + if (OSHMEM_SUCCESS != rc){ + return OSHMEM_ERROR; + } + }*/ + + memheap_oob_destruct(); + + rc = mca_memheap_base_deregister(&mca_memheap_base_map); + + mca_memheap_base_alloc_exit(&mca_memheap_base_map); + mca_memheap_base_static_exit(&mca_memheap_base_map); + + /* Close the maximal priority component which is the only component remained opened */ + if (0 != opal_list_get_size(&mca_memheap_base_components_opened)) { + mca_base_components_close(mca_memheap_base_output, + &mca_memheap_base_components_opened, NULL); + } + + /* cleanup */ + if(NULL != mca_memheap_base_include) + free(mca_memheap_base_include); + if(NULL != mca_memheap_base_exclude) + 
free(mca_memheap_base_exclude); + +#if 0 + /* restore event processing */ + opal_event_enable(); +#endif + + /* All done */ + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/memheap/base/memheap_base_mkey.c b/oshmem/mca/memheap/base/memheap_base_mkey.c new file mode 100644 index 0000000000..c55279fbe6 --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_mkey.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include "opal/util/output.h" +#include "opal/dss/dss.h" + +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/runtime/orte_globals.h" + +#include "ompi/mca/bml/bml.h" +#include "ompi/mca/dpm/dpm.h" + +#include "oshmem/proc/proc.h" +#include "oshmem/runtime/runtime.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" + +#ifdef HAVE_SYS_MMAN_H +#include +#endif + +#include +#include + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) +#include +#endif /* MPAGE_ENABLE */ + + +#define MEMHEAP_RKEY_REQ 0xA1 +#define MEMHEAP_RKEY_RESP 0xA2 +#define MEMHEAP_RKEY_RESP_FAIL 0xA3 + +struct oob_comm { + opal_mutex_t lck; + opal_condition_t cond; + mca_spml_mkey_t *mkeys; + int mkeys_rcvd; +}; + +#define MEMHEAP_VERBOSE_FASTPATH(...) 
+ +static mca_memheap_map_t* memheap_map = NULL; + +struct oob_comm memheap_oob; + +/* pickup list of rkeys and remote va */ +static int memheap_oob_get_mkeys(int pe, uint32_t va_seg_num, mca_spml_mkey_t *mkey); + + +static inline unsigned long __seg2base_va(int seg) +{ + return memheap_map->mem_segs[seg].start; +} + + +static int __seg_cmp(const void *k, const void *v) +{ + unsigned long va = (unsigned long)k; + map_segment_t *s = (map_segment_t *)v; + + if (va < s->start) + return -1; + if (va >= s->end) + return 1; + + return 0; +} + + +static inline map_segment_t *__find_va(unsigned long va) +{ + map_segment_t *s; + + if ( OPAL_LIKELY(va >= (unsigned long)memheap_map->mem_segs[HEAP_SEG_INDEX].start && + va < (unsigned long)memheap_map->mem_segs[HEAP_SEG_INDEX].end)) + { + s = &memheap_map->mem_segs[HEAP_SEG_INDEX]; + } + else + { + s = bsearch((const void *)va, &memheap_map->mem_segs[SYMB_SEG_INDEX], memheap_map->n_segments - 1, sizeof(*s), __seg_cmp); + } + +#if 0 + if (s) { + MEMHEAP_VERBOSE(5, "match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p", + s - memheap_map->mem_segs, + (long long)s->start, + (long long)s->end, + (long long)(s->end - s->start), + (void *)va); + } +#endif + return s; +} + +static int do_mkey_req(opal_buffer_t *msg, int pe, int seg) +{ + uint8_t msg_type; + oshmem_proc_t *proc; + int i, n, tr_id; + mca_spml_mkey_t *mkey; + + msg_type = MEMHEAP_RKEY_RESP; + opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8); + + /* go over all transports to remote pe and pack mkeys */ + n = oshmem_get_transport_count(pe); + proc = oshmem_proc_group_find(oshmem_group_all, pe); + opal_dss.pack(msg, &n, 1, OPAL_UINT32); + MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe); + for (i = 0; i < n; i++) { + tr_id = proc->transport_ids[i]; + + mkey = mca_memheap_base_get_mkey(__seg2base_va(seg), tr_id); + if (!mkey) { + MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey", seg, tr_id); + return OSHMEM_ERROR; + } + opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32); + 
opal_dss.pack(msg, &mkey->key, 1, OPAL_UINT64); + opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64); + MEMHEAP_VERBOSE(5, "seg#%d tr_id: %d key %llx base_va %llx", + seg, tr_id, + (unsigned long long)mkey->key, + (unsigned long long)mkey->va_base); + } + return OSHMEM_SUCCESS; +} + +static void memheap_attach_segment(mca_spml_mkey_t *mkey, int tr_id) +{ + /* process special case when va was got using shmget(IPC_PRIVATE) + * this case is notable for: + * - key is set as (type|shmid); + * - va_base is set as 0; + */ + if (!mkey->va_base && ((int)MEMHEAP_SHM_GET_ID(mkey->key) != MEMHEAP_SHM_INVALID)) + { + MEMHEAP_VERBOSE(5, "shared memory usage tr_id: %d key %llx base_va %llx shmid 0x%X|0x%X", + tr_id, + (unsigned long long)mkey->key, + (unsigned long long)mkey->va_base, + MEMHEAP_SHM_GET_TYPE(mkey->key), + MEMHEAP_SHM_GET_ID(mkey->key)); + + if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_SHM) + { + mkey->va_base = (intptr_t)shmat(MEMHEAP_SHM_GET_ID(mkey->key), 0, 0); + } + else if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_IBV) + { +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE == 2) + openib_device_t *device = NULL; + struct ibv_mr *ib_mr; + void *addr; + static int mr_count; + + int access_flag = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_NO_RDMA; + + device = (openib_device_t *)memheap_map->mem_segs[HEAP_SEG_INDEX].context; + assert(device); + + /* workaround mtt problem - request aligned addresses */ + ++mr_count; + addr = (void *)(MEMHEAP_BASE_START_ADDRESS + mca_memheap_base_mr_interleave_factor*1024ULL*1024ULL*1024ULL*mr_count); + ib_mr = ibv_reg_shared_mr(MEMHEAP_SHM_GET_ID(mkey->key), + device->ib_pd, addr, access_flag); + if (NULL == ib_mr) + { + mkey->va_base = -1; + MEMHEAP_ERROR("error to ibv_reg_shared_mr() errno says %d: %s", + errno, strerror(errno)); + } + else + { + if (ib_mr->addr != addr) { + MEMHEAP_WARN("Failed to map shared region to address %p got addr %p. 
Try to increase 'memheap_mr_interleave_factor' from %d", addr, ib_mr->addr, mca_memheap_base_mr_interleave_factor); + } + + opal_value_array_append_item(&device->ib_mr_array, &ib_mr); + mkey->va_base = (intptr_t)ib_mr->addr; + } +#endif /* MPAGE_ENABLE */ + } + else + { + MEMHEAP_ERROR("tr_id: %d key %llx attach failed: incorrect shmid 0x%X|0x%X", + tr_id, + (unsigned long long)mkey->key, + MEMHEAP_SHM_GET_TYPE(mkey->key), + MEMHEAP_SHM_GET_ID(mkey->key)); + oshmem_shmem_abort(-1); + } + + if ((void *)-1 == (void *)mkey->va_base) + { + MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d", + tr_id, + (unsigned long long)mkey->key, + errno); + oshmem_shmem_abort(-1); + } + } +} + +static void do_mkey_resp(opal_buffer_t *msg) +{ + int32_t cnt; + int32_t n; + int32_t tr_id; + int i; + + cnt = 1; + opal_dss.unpack(msg, &n, &cnt, OPAL_UINT32); + for (i = 0; i < n; i++) { + opal_dss.unpack(msg, &tr_id, &cnt, OPAL_UINT32); + opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].key, &cnt, OPAL_UINT64); + opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].va_base, &cnt, OPAL_UINT64); + + memheap_attach_segment(&memheap_oob.mkeys[tr_id], tr_id); + + MEMHEAP_VERBOSE(5, "tr_id: %d key %llx base_va %llx", + tr_id, + (unsigned long long)memheap_oob.mkeys[tr_id].key, + (unsigned long long)memheap_oob.mkeys[tr_id].va_base); + } +} + +static void memheap_buddy_rml_recv_cb(int status, orte_process_name_t* process_name, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + MEMHEAP_VERBOSE(5,"**** get request from %u:%d", process_name->jobid, process_name->vpid); + int32_t cnt = 1; + int rc; + opal_buffer_t *msg; + uint8_t msg_type; + uint32_t seg; + + MEMHEAP_VERBOSE(5,"unpacking %d of %d", cnt, OPAL_UINT8); + rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8); + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + goto send_fail; + } + + switch (msg_type) { + case MEMHEAP_RKEY_REQ: + cnt = 1; + rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32); + if (ORTE_SUCCESS != 
rc) {
+                MEMHEAP_ERROR("bad RKEY_REQ msg");
+                goto send_fail;
+            }
+
+            /* a peer asks for our keys of segment `seg`: build and send the reply */
+            MEMHEAP_VERBOSE(5,"*** RKEY REQ");
+            msg = OBJ_NEW(opal_buffer_t);
+            if (!msg) {
+                MEMHEAP_ERROR("failed to get msg buffer");
+                ORTE_ERROR_LOG(rc);
+                return;
+            }
+
+            if (OSHMEM_SUCCESS != do_mkey_req(msg, process_name->vpid, seg)) {
+                OBJ_RELEASE(msg);
+                goto send_fail;
+            }
+
+            rc = orte_rml.send_buffer(process_name, msg, OMPI_RML_TAG_SHMEM, 0);
+            OBJ_RELEASE(msg);
+            if (0 >= rc) {
+                MEMHEAP_ERROR("FAILED to send rml message %d", rc);
+                ORTE_ERROR_LOG(rc);
+                goto send_fail;
+            }
+            break;
+
+        case MEMHEAP_RKEY_RESP:
+            /* reply to our own request: unpack the keys and wake the waiter */
+            MEMHEAP_VERBOSE(5,"*** RKEY RESP");
+            OPAL_THREAD_LOCK(&memheap_oob.lck);
+            do_mkey_resp(buffer);
+            memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP;
+            opal_condition_broadcast(&memheap_oob.cond);
+            OPAL_THREAD_UNLOCK(&memheap_oob.lck);
+            break;
+
+        case MEMHEAP_RKEY_RESP_FAIL:
+            MEMHEAP_VERBOSE(5,"*** RKEY RESP FAIL");
+            /* take the lock before touching the shared state, as the
+             * MEMHEAP_RKEY_RESP case does; unlocking a mutex that was
+             * never locked here is an error */
+            OPAL_THREAD_LOCK(&memheap_oob.lck);
+            memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL;
+            opal_condition_broadcast(&memheap_oob.cond);
+            OPAL_THREAD_UNLOCK(&memheap_oob.lck);
+            break;
+
+        default:
+            MEMHEAP_VERBOSE(5,"Unknown message type %x",msg_type);
+            goto send_fail;
+    }
+    return;
+
+send_fail:
+    /* tell the sender that its message could not be served */
+    msg = OBJ_NEW(opal_buffer_t);
+    if (!msg) {
+        MEMHEAP_ERROR("failed to get msg buffer");
+        ORTE_ERROR_LOG(rc);
+        return;
+    }
+    msg_type = MEMHEAP_RKEY_RESP_FAIL;
+    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);
+
+    rc = orte_rml.send_buffer(process_name, msg, OMPI_RML_TAG_SHMEM, 0);
+    if (0 >= rc) {
+        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
+        ORTE_ERROR_LOG(rc);
+    }
+
+    OBJ_RELEASE(msg);
+}
+
+/*
+ * Remember the segment map used by this file, construct the lock and
+ * condition variable of memheap_oob, and post a persistent non-blocking RML
+ * receive on OMPI_RML_TAG_SHMEM from any peer, served by
+ * memheap_buddy_rml_recv_cb().  Returns the result of recv_buffer_nb().
+ */
+int memheap_oob_init(mca_memheap_map_t *map)
+{
+    int rc;
+
+    memheap_map = map;
+
+    OBJ_CONSTRUCT(&memheap_oob.lck, opal_mutex_t);
+    OBJ_CONSTRUCT(&memheap_oob.cond, opal_condition_t);
+
+    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+                                 OMPI_RML_TAG_SHMEM,
+                                 ORTE_RML_PERSISTENT,
+                                 memheap_buddy_rml_recv_cb,
+                                 NULL);
+
+    return rc;
+}
+
+void memheap_oob_destruct(void)
+{
+    
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, OMPI_RML_TAG_SHMEM);
+    OBJ_DESTRUCT(&memheap_oob.lck);
+    OBJ_DESTRUCT(&memheap_oob.cond);
+}
+
+/*
+ * Fill mkeys[] with the memory keys of segment `seg` on PE `pe`.
+ *
+ * The SPML is asked first (oob_get_mkeys); when it can supply the keys,
+ * va_base of every transport is set to the local base address of the segment.
+ * Otherwise a MEMHEAP_RKEY_REQ message is sent to `pe` and the caller waits
+ * on memheap_oob.cond until memheap_oob.mkeys_rcvd becomes non-zero.
+ *
+ * Returns OSHMEM_SUCCESS when the keys are available (MEMHEAP_RKEY_RESP
+ * received or SPML answered), OSHMEM_ERROR otherwise.
+ */
+static int memheap_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys)
+{
+    orte_process_name_t name;
+    opal_buffer_t *msg;
+    int rc;
+    uint8_t cmd;
+    int i;
+
+    if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) {
+        for (i = 0; i < memheap_map->num_transports; i++) {
+            mkeys[i].va_base = __seg2base_va(seg);
+            MEMHEAP_VERBOSE(5, "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d key %llx base_va %llx",
+                    pe,
+                    i,
+                    (unsigned long long)mkeys[i].key,
+                    (unsigned long long)mkeys[i].va_base);
+        }
+        return OSHMEM_SUCCESS;
+    }
+
+    /* the lock is taken exactly once here and released on every exit path
+     * below; it protects memheap_oob.mkeys / mkeys_rcvd */
+    OPAL_THREAD_LOCK(&memheap_oob.lck);
+
+    memheap_oob.mkeys = mkeys;
+    memheap_oob.mkeys_rcvd = 0;
+
+    name.jobid = ORTE_PROC_MY_NAME->jobid;
+    name.vpid = pe;
+
+    msg = OBJ_NEW(opal_buffer_t);
+    if (!msg) {
+        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
+        MEMHEAP_ERROR("failed to get msg buffer");
+        return OSHMEM_ERROR;
+    }
+
+    cmd = MEMHEAP_RKEY_REQ;
+    opal_dss.pack(msg, &cmd, 1, OPAL_UINT8);
+    opal_dss.pack(msg, &seg, 1, OPAL_UINT32);
+
+    rc = orte_rml.send_buffer(&name, msg, OMPI_RML_TAG_SHMEM, 0);
+    if (0 >= rc) {
+        OBJ_RELEASE(msg);
+        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
+        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
+        return OSHMEM_ERROR;
+    }
+
+    MEMHEAP_VERBOSE(5,"message sent: %d bytes!", rc);
+
+    /* mkeys_rcvd is set to MEMHEAP_RKEY_RESP or MEMHEAP_RKEY_RESP_FAIL by
+     * the RML receive callback */
+    while (!memheap_oob.mkeys_rcvd) {
+        opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck);
+    }
+
+    if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) {
+        rc = OSHMEM_SUCCESS;
+    }
+    else {
+        MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe);
+        rc = OSHMEM_ERROR;
+    }
+
+    OBJ_RELEASE(msg);
+    OPAL_THREAD_UNLOCK(&memheap_oob.lck);
+    return rc;
+}
+
+#if 0 /* disable till we figure out double modex&grpcomm.bad problem */
+static void memheap_modex_mkey_exchange(void)
+{
+    /* disable till we figure out double modex&grpcomm.bad problem */
+    return;
+ if (!mca_memheap_buddy_use_modex) + return; + + if (OMPI_SUCCESS != ompi_modex_send(&mca_memheap_buddy_component.memheap_version, + memheap_map->mem_segs[HEAP_SEG_INDEX].mkeys, + sizeof(mca_spml_mkey_t) * memheap_map->num_transports)) { + MEMHEAP_VERBOSE(1, "FAILED to modex_send() my mkeys"); + return; + } + + if (OMPI_SUCCESS != orte_grpcomm.modex(0)) { + MEMHEAP_VERBOSE(1, "FAILED to do modex()"); + } +} +#endif + +#if 0 +static int memheap_modex_recv(int pe, mca_spml_mkey_t *mkeys, int num_transports) +{ + oshmem_proc_t *proc; + size_t size; + int rc; + mca_spml_mkey_t *new_mkeys; + int i; + uint64_t dummy_rva; + + proc = oshmem_proc_group_find(oshmem_group_all, pe); + + /* hack till we get modex 4 static memheap */ + /* force exchange for bss/data segments */ + mca_memheap_base_get_cached_mkey(pe, __seg2base_va(SYMB_SEG_INDEX), proc->transport_ids[0], &dummy_rva); + + rc = ompi_modex_recv(&mca_memheap_buddy_component.memheap_version, (ompi_proc_t *)proc, + (void**)&new_mkeys, &size); + + if (OMPI_SUCCESS != rc) + return OSHMEM_ERROR; + + if (size != num_transports * sizeof(mca_spml_mkey_t)) { + MEMHEAP_VERBOSE(1, "modex echanges size mismatch: wanted %d, got %d", + (int)(num_transports * sizeof(mca_spml_mkey_t)), + (int)size); + return OSHMEM_ERROR; + } + memcpy(mkeys, new_mkeys, num_transports * sizeof(mca_spml_mkey_t)); + free(new_mkeys); + + for (i = 0; i < num_transports; i++) { + if (mkeys[i].key == 0 && mkeys[i].va_base == 0) + continue; + /* don not try to attach to segments that are not connected to us */ + if (proc->transport_ids[0] != i) + continue; + MEMHEAP_VERBOSE(5, "pe: %d tr_id: %d key %llx base_va %llx", + pe, i, + (unsigned long long)mkeys[i].key, + (unsigned long long)mkeys[i].va_base); + memheap_attach_segment(&mkeys[i], i); + } + + return rc; +} +#endif + + +void mca_memheap_modex_recv_all(void) +{ + int i; + int j; + int nprocs, my_pe; + oshmem_proc_t *proc; + mca_spml_mkey_t *mkey; + uint64_t dummy_rva; + + if 
(!mca_memheap_buddy_use_modex) + return; + + /* init rkey cache */ + nprocs = oshmem_num_procs(); + my_pe = oshmem_my_proc_id(); + + /* do exchange via rml till we figure out problem with grpcomm.modex and barrier */ + for (i = 0; i < nprocs; i++) { + if (i == my_pe) + continue; + + proc = oshmem_proc_group_find(oshmem_group_all, i); + for (j = 0; j < memheap_map->n_segments; j++) + { + mkey = mca_memheap_base_get_cached_mkey( i, + memheap_map->mem_segs[j].start, + proc->transport_ids[0], + &dummy_rva); + if (!mkey) + { + MEMHEAP_ERROR("Failed to receive mkeys"); + oshmem_shmem_abort(-1); + } + } + + } + +#if 0 + for (i = 0; i < nprocs; i++) { + if (i == my_pe) + continue; + + memheap_map->mem_segs[HEAP_SEG_INDEX].mkeys_cache[i] = (mca_spml_mkey_t *)calloc(memheap_map->num_transports, sizeof(mca_spml_mkey_t)); + if (!memheap_map->mem_segs[HEAP_SEG_INDEX].mkeys_cache[i]) { + MEMHEAP_ERROR("Failed to allocated mkey cache memory"); + oshmem_shmem_abort(-1); + } + if (OSHMEM_SUCCESS != memheap_modex_recv(i, memheap_map->mem_segs[HEAP_SEG_INDEX].mkeys_cache[i], memheap_map->num_transports)) + oshmem_shmem_abort(-1); + } +#endif + /* + * There is an issue with orte_grpcomm.barrier usage as + * ess/pmi directs to use grpcomm/pmi in case slurm srun() call grpcomm/pmi calls PMI_Barrier() + * that is a function of external library. + * There is no opal_progress() in such way. As a result slow PEs send a request (MEMHEAP_RKEY_REQ) to + * fast PEs waiting on barrier and do not get a respond (MEMHEAP_RKEY_RESP). + * + * there are following ways to solve one: + * 1. calculate requests from remote PEs and do ORTE_PROGRESSED_WAIT waiting for expected value; + * 2. use shmem_barrier_all(); + * 3. rework pmi/barrier to use opal_progress(); + * 4. 
use orte_grpcomm.barrier carefully; + * + * It seems there is no need to use orte_grpcomm.barrier here + */ + + if ( memheap_map->mem_segs[HEAP_SEG_INDEX].shmid != MEMHEAP_SHM_INVALID) { + /* unfortunately we must do barrier here to assure that everyone are attached to our segment + * good thing that this code path only invoked on older linuxes (-mca shmalloc_use_hugepages 3|4) + * try to minimize damage here by waiting 5 seconds and doing progress + */ + shmem_barrier_all(); + /* keys exchanged, segments attached, now we can safely cleanup */ + if (memheap_map->mem_segs[HEAP_SEG_INDEX].type == MAP_SEGMENT_ALLOC_SHM) + { + shmctl(memheap_map->mem_segs[HEAP_SEG_INDEX].shmid, IPC_RMID, NULL); + } + } +} + + + +static inline uint64_t va2rva(unsigned long va, uint64_t local_base, uint64_t remote_base) +{ + return remote_base > local_base ? va + (remote_base - local_base) : va - (local_base - remote_base); +} + + +mca_spml_mkey_t * mca_memheap_base_get_cached_mkey(int pe, unsigned long va, int btl_id, uint64_t *rva) +{ + map_segment_t *s; + int rc; + mca_spml_mkey_t *mkey; + + MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p", pe, (void *)va); + s = __find_va(va); + if (NULL == s) + return NULL; + + if (!s->is_active) + return NULL; + + if (pe == oshmem_my_proc_id()) { + *rva = va; + MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (local) %lx %p", pe, (void *)va, + s->mkeys[btl_id].key, (void *)*rva); + return &s->mkeys[btl_id]; + } + + if (OPAL_LIKELY(s->mkeys_cache[pe])) { + mkey = &s->mkeys_cache[pe][btl_id]; + *rva = va2rva(va, s->start, mkey->va_base); + MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (cached) %lx %p", pe, (void *)va, mkey->key, (void *)*rva); + return mkey; + } + + s->mkeys_cache[pe] = (mca_spml_mkey_t *)calloc(memheap_map->num_transports, sizeof(mca_spml_mkey_t)); + if (!s->mkeys_cache[pe]) + return NULL; + + rc = memheap_oob_get_mkeys(pe, s - memheap_map->mem_segs, s->mkeys_cache[pe]); + if (OSHMEM_SUCCESS != rc) + return NULL; + + mkey = 
&s->mkeys_cache[pe][btl_id]; + *rva = va2rva(va, s->start, mkey->va_base); + + MEMHEAP_VERBOSE_FASTPATH(5, "rkey: pe=%d va=%p -> (remote lookup) %lx %p", pe, (void *)va, mkey->key, (void *)*rva); + return mkey; +} + + +mca_spml_mkey_t *mca_memheap_base_get_mkey(unsigned long va, int tr_id) +{ + map_segment_t *s; + + s = __find_va(va); + + return ( (s && s->is_active) ? &s->mkeys[tr_id] : NULL); +} + + +uint64_t mca_memheap_base_find_offset(int pe, int tr_id, unsigned long va, uint64_t rva) +{ + map_segment_t *s; + + s = __find_va(va); + + return ( (s && s->is_active) ? (rva - s->mkeys_cache[pe][tr_id].va_base) : 0 ); +} + + +int mca_memheap_base_is_symmetric_addr(unsigned long va) +{ + return ( __find_va(va) ? 1 : 0 ); +} + + +int mca_memheap_base_detect_addr_type(unsigned long va) +{ + int addr_type = ADDR_INVALID; + map_segment_t *s; + + s = __find_va(va); + + if (s) + { + if (s->type == MAP_SEGMENT_STATIC) + { + addr_type = ADDR_STATIC; + } + else if ( va >= (unsigned long)s->start && + va < (unsigned long)(s->start + mca_memheap.memheap_size) ) + { + addr_type = ADDR_USER; + } + else + { + assert( va >= (unsigned long)(s->start + mca_memheap.memheap_size) && + va < (unsigned long)s->end ); + addr_type = ADDR_PRIVATE; + } + } + + return addr_type; +} diff --git a/oshmem/mca/memheap/base/memheap_base_open.c b/oshmem/mca/memheap/base/memheap_base_open.c new file mode 100644 index 0000000000..2448237396 --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_open.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "oshmem_config.h" +#include + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + +#include "opal/mca/base/mca_base_param.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" + + +/* + * The following file was created by configure. 
It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. + */ + +#include "oshmem/mca/memheap/base/static-components.h" + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + int mca_memheap_base_shmalloc_use_hugepages = 5; +#else + int mca_memheap_base_shmalloc_use_hugepages = 1; +#endif /* MPAGE_ENABLE */ + +int mca_memheap_base_output = -1; +int mca_memheap_buddy_use_modex = 1; +int mca_memheap_base_mr_interleave_factor = 2; +char* mca_memheap_base_include = NULL; +char* mca_memheap_base_exclude = NULL; +opal_list_t mca_memheap_base_components_opened; +struct mca_memheap_base_module_t* mca_memheap_base_module_initialized = NULL; +int mca_memheap_base_already_opened = 0; +mca_memheap_map_t mca_memheap_base_map; + +/** + * Function for finding and opening either all MCA components, or the one + * that was specifically requested via a MCA parameter. + */ +int mca_memheap_base_open(void) +{ + int value = -1; + mca_memheap_base_already_opened = mca_memheap_base_already_opened + 1; + if( mca_memheap_base_already_opened > 1 ){ + return OSHMEM_SUCCESS; + } + + mca_memheap_base_output = opal_output_open(NULL); + mca_base_param_reg_int_name("memheap", + "base_verbose", + "Verbosity level of the MEMHEAP framework", + false, false, + 0, &value); + opal_output_set_verbosity(mca_memheap_base_output, value); + +#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) + mca_base_param_reg_int_name("shmalloc", + "use_hugepages", + "0|1|2|5 - disabled, enabled with fallback to mmap(), do not fallback to mmap(), enabled mpages(default)", + false, false, + mca_memheap_base_shmalloc_use_hugepages, &mca_memheap_base_shmalloc_use_hugepages); +#else + mca_base_param_reg_int_name("shmalloc", + "use_hugepages", + "0|1|2 - disabled, enabled(default) with fallback to mmap(), do not fallback to mmap()", + false, false, + mca_memheap_base_shmalloc_use_hugepages, &mca_memheap_base_shmalloc_use_hugepages); +#endif /* 
MPAGE_ENABLE */ + + mca_base_param_reg_int_name("shmalloc", + "use_modex", + "0|1 - disabled, enabled(default) use modex to facilitate memory registration exchange", + false, false, + 1, &mca_memheap_buddy_use_modex); + + mca_base_param_reg_int_name("memheap", + "mr_interleave_factor", + "2 - default, try to give at least N Gbytes spaces between mapped memheaps of other pes that are local to me", + false, false, + mca_memheap_base_mr_interleave_factor, &mca_memheap_base_mr_interleave_factor); + /* Open up all available components */ + if (OSHMEM_SUCCESS != + mca_base_components_open("memheap", mca_memheap_base_output, mca_memheap_base_static_components, + &mca_memheap_base_components_opened, true)) { + return OSHMEM_ERROR; + } + + /* register parameters */ + + mca_base_param_reg_string_name("memheap", NULL, + "Specify a specific memheap implementation to use", + false, false, NULL, &mca_memheap_base_include); + + if (NULL == mca_memheap_base_include) { + mca_memheap_base_include = getenv(SHMEM_HEAP_TYPE); + if (NULL == mca_memheap_base_include) + mca_memheap_base_include = strdup(""); + else + mca_memheap_base_include = strdup(mca_memheap_base_include); + } + + (void) mca_base_param_reg_string_name("memheap","base_exclude",NULL,false,false,NULL, &mca_memheap_base_exclude); + + memset(&mca_memheap_base_map, 0, sizeof(mca_memheap_base_map)); + mca_memheap_base_map.n_segments = 0; + mca_memheap_base_map.num_transports = 0; + + /* All done */ + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/memheap/base/memheap_base_register.c b/oshmem/mca/memheap/base/memheap_base_register.c new file mode 100644 index 0000000000..c276cc63ba --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_register.c @@ -0,0 +1,140 @@ +/* +* Copyright (c) 2012 Mellanox Technologies, Inc. +* All rights reserved. 
+* $COPYRIGHT$ +* +* Additional copyrights may follow +* +* $HEADER$ +*/ +#include "oshmem_config.h" + +#include "oshmem/proc/proc.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" + +#include + + +static int __dereg_segment(map_segment_t *s); +static int __reg_segment(map_segment_t *s, int *num_btl); + + +extern int mca_memheap_base_register(mca_memheap_map_t *memheap_map) +{ + int ret = OSHMEM_SUCCESS; + int i; + + for (i = 0; i < memheap_map->n_segments; i++) { + map_segment_t *s = &memheap_map->mem_segs[i]; + + MEMHEAP_VERBOSE(5, "register seg#%02d: 0x%llX - 0x%llX %llu bytes type=0x%X id=0x%X", + i, + (long long)s->start, + (long long)s->end, + (long long)(s->end - s->start), + s->type, + s->shmid + ); + ret = __reg_segment(s, &memheap_map->num_transports); + } + + return ret; +} + + +extern int mca_memheap_base_deregister(mca_memheap_map_t *memheap_map) +{ + int ret = OSHMEM_SUCCESS; + int i; + + for (i = 0; i < memheap_map->n_segments; i++) { + map_segment_t *s = &memheap_map->mem_segs[i]; + + if (!s->is_active) + continue; + + MEMHEAP_VERBOSE(5, "deregistering segment#%d: %llx - %llx %llu bytes", + i, + (long long)s->start, + (long long)s->end, + (long long)(s->end - s->start) + ); + ret = __dereg_segment(s); + } + + return ret; +} + + +static int __dereg_segment(map_segment_t *s) +{ + int rc = OSHMEM_SUCCESS; + int j; + int nprocs, my_pe; + + nprocs = oshmem_num_procs(); + my_pe = oshmem_my_proc_id(); + + MCA_SPML_CALL(deregister(s->mkeys)); + + if (s->mkeys_cache) { + for (j = 0; j < nprocs; j++) { + if (j == my_pe) + continue; + if (s->mkeys_cache[j]) + { + free(s->mkeys_cache[j]); + s->mkeys_cache[j] = NULL; + } + } + free(s->mkeys_cache); + s->mkeys_cache = NULL; + } + + s->is_active = 0; + + return rc; +} + + +static int __reg_segment(map_segment_t *s, int *num_btl) +{ + int rc = OSHMEM_SUCCESS; + int my_pe; + int nprocs; + + nprocs = oshmem_num_procs(); + my_pe = oshmem_my_proc_id(); + + s->mkeys_cache = 
(mca_spml_mkey_t **)calloc(nprocs, sizeof(mca_spml_mkey_t *)); + if (NULL == s->mkeys_cache) + { + MEMHEAP_ERROR("Failed to allocate memory for remote segments"); + rc = OSHMEM_ERROR; + } + + if (!rc) + { + s->mkeys = MCA_SPML_CALL(register((void *)(unsigned long)s->start, + s->end - s->start, + MEMHEAP_SHM_CODE(s->type, s->shmid), + num_btl)); + if (NULL == s->mkeys) + { + free(s->mkeys_cache); + s->mkeys_cache = NULL; + + MEMHEAP_ERROR("Failed to register segment"); + rc = OSHMEM_ERROR; + } + } + + if (OSHMEM_SUCCESS == rc) + { + s->mkeys_cache[my_pe] = s->mkeys; + s->is_active = 1; + } + + return rc; +} diff --git a/oshmem/mca/memheap/base/memheap_base_select.c b/oshmem/mca/memheap/base/memheap_base_select.c new file mode 100644 index 0000000000..bae862b6d0 --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_select.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "orte/util/show_help.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_component_repository.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "orte/mca/errmgr/errmgr.h" +#include "opal/runtime/opal.h" + + +mca_memheap_base_module_t mca_memheap; +/** + * Function for weeding out memheap components that shouldn't be executed. + * Implementation inspired by btl/base. + * + * Call the init function on all available components to find out if + * they want to run. Select all components that don't fail. Failing + * components will be closed and unloaded. The selected modules will + * be pointed to by mca_memheap_base_module_t. + */ + +static memheap_context_t* __memheap_create(void); + + +/** + * Choose to init one component with the highest priority. 
+ * If the include list if it is not empty choose a component that appear in the list. + * O/W choose the highest priority component not in the exclude list. + * Include and exclude lists may be given in the shmem launcher command line. + */ +int mca_memheap_base_select() +{ + int priority = 0; + int max_priority = 0; + opal_list_item_t *max_priority_item = NULL; + opal_list_item_t *item = NULL; + mca_base_component_list_item_t *cli = NULL; + mca_memheap_base_component_t *component = NULL; + mca_memheap_base_component_t *max_priority_component = NULL; + mca_memheap_base_module_t *module = NULL; + memheap_context_t *context = NULL; + + char** include = opal_argv_split(mca_memheap_base_include, ','); + char** exclude = opal_argv_split(mca_memheap_base_exclude, ','); + + context = __memheap_create(); + if (!context) + { + return OSHMEM_ERROR; + } + + item = opal_list_get_first(&mca_memheap_base_components_opened); + while (item != opal_list_get_end(&mca_memheap_base_components_opened)) { + + opal_list_item_t *next = opal_list_get_next(item); + cli = (mca_base_component_list_item_t *) item; + component = (mca_memheap_base_component_t *) cli->cli_component; + + + /* Verify if the component is in the include or the exclude list. 
*/ + /* If there is an include list - item must be in the list to be included */ + if ( NULL != include ) { + char** argv = include; + bool found = false; + while(argv && *argv) { + if(strcmp(component->memheap_version.mca_component_name,*argv) == 0) { + found = true; + break; + } + argv++; + } + /* If not in the list do not choose this component */ + if(found == false) { + item = next; + continue; + } + + /* Otherwise - check the exclude list to see if this item has been specifically excluded */ + } else if ( NULL != exclude ) { + char** argv = exclude; + bool found = false; + while(argv && *argv) { + if(strcmp(component->memheap_version.mca_component_name,*argv) == 0) { + found = true; + break; + } + argv++; + } + if(found == true) { + item = next; + continue; + } + } + + /* Verify that the component has an init function */ + if (NULL == component->memheap_init) { + MEMHEAP_VERBOSE(10,"select: no init function; for component %s. No component selected", + component->memheap_version.mca_component_name); + } else { + + MEMHEAP_VERBOSE(5,"select: component %s size : user %d private: %d", + component->memheap_version.mca_component_name, + (int)context->user_size, (int)context->private_size); + + /* Init the component in order to get its priority */ + module = component->memheap_init(context, &priority); + + /* If the component didn't initialize, remove it from the opened list, remove it from the component repository and return an error */ + if (NULL == module) { + MEMHEAP_VERBOSE(10,"select: init of component %s returned failure", + component->memheap_version.mca_component_name); + MEMHEAP_VERBOSE(10,"select: module %s unloaded", + component->memheap_version.mca_component_name); + + mca_base_component_repository_release((mca_base_component_t *) component); + } + /* Calculate memheap size in case it was not set during component initialization */ + module->memheap_size = context->user_size; + } + + + /* Init max priority component */ + if(NULL == max_priority_component) 
{ + max_priority_component = component; + max_priority_item = item; + mca_memheap_base_module_initialized = module; + max_priority = priority; + } + + /* Update max priority component if current component has greater priority */ + if(priority > max_priority) { + max_priority = priority; + max_priority_component = component; + max_priority_item = item; + mca_memheap_base_module_initialized = module; + } + item = next; + } + + opal_argv_free(include); + opal_argv_free(exclude); + + /* Verify that a component was selected */ + if(NULL == max_priority_component) { + MEMHEAP_VERBOSE(10,"select: no component selected"); + return OSHMEM_ERROR; + } + + /* Verify that some module was initialized */ + if(NULL == mca_memheap_base_module_initialized) { + orte_show_help("help-shmem-mca.txt", "find-available:none-found", true, "memheap"); + orte_errmgr.abort(1, NULL); + } + + MEMHEAP_VERBOSE(10,"SELECTED %s component %s", + max_priority_component->memheap_version.mca_type_name, + max_priority_component->memheap_version.mca_component_name); + + setenv(SHMEM_HEAP_TYPE, max_priority_component->memheap_version.mca_component_name, 1); + /* Clear open list */ + /* Close all remaining opened components except for the selected component */ + if (0 != opal_list_get_size(&mca_memheap_base_components_opened)) { + mca_base_components_close(mca_memheap_base_output, + &mca_memheap_base_components_opened, (mca_base_component_t*)max_priority_component); + } + + mca_memheap = *mca_memheap_base_module_initialized; + + return OSHMEM_SUCCESS; +} + +static size_t memheap_size(void) +{ + char *p; + unsigned long long factor; + int idx; + unsigned long long size; + + p = getenv(SHMEM_HEAP_SIZE); + if (!p) + return SIZE_IN_MEGA_BYTES(DEFAULT_SYMMETRIC_HEAP_SIZE); + + idx = strlen(p)-1; + if (p[idx] == 'k' || p[idx] == 'K') { + factor = 1024; + } + else if (p[idx] == 'm' || p[idx] == 'M') { + factor = 1024 * 1024; + } + else if (p[idx] == 'g' || p[idx] == 'G') { + factor = 1024 * 1024 * 1024; + } + else 
if (p[idx] == 't' || p[idx] == 'T') { + factor = 1024UL * 1024UL * 1024UL * 1024UL; + } + else + factor = 1; + + size = atoll(p); + if (size == 0) { + MEMHEAP_ERROR("Incorrect symmetric heap size %s. Using default heap size %d M\n", p, DEFAULT_SYMMETRIC_HEAP_SIZE); + return SIZE_IN_MEGA_BYTES(DEFAULT_SYMMETRIC_HEAP_SIZE); + } + return (size_t)memheap_align(size * factor); +} + +static memheap_context_t* __memheap_create(void) +{ + int rc = OSHMEM_SUCCESS; + static memheap_context_t context; + size_t user_size; + + user_size = memheap_size(); + if (user_size < MEMHEAP_BASE_MIN_SIZE) { + MEMHEAP_ERROR("Requested memheap size is less than minimal meamheap size (%llu < %llu)", + (unsigned long long)user_size, MEMHEAP_BASE_MIN_SIZE); + return NULL; + } + /* Inititialize symmetric area */ + if (OSHMEM_SUCCESS == rc) + { + rc = mca_memheap_base_alloc_init(&mca_memheap_base_map, + user_size + MEMHEAP_BASE_PRIVATE_SIZE); + } + + /* Inititialize static/global variables area */ + if (OSHMEM_SUCCESS == rc) + { + rc = mca_memheap_base_static_init(&mca_memheap_base_map); + } + + /* Memory Registration */ + if (OSHMEM_SUCCESS == rc) + { + rc = mca_memheap_base_register(&mca_memheap_base_map); + } + + /* Init OOB channel */ + if (OSHMEM_SUCCESS == rc) + { + rc = memheap_oob_init(&mca_memheap_base_map); + } + + if (OSHMEM_SUCCESS == rc) + { + context.user_size = user_size; + context.private_size = MEMHEAP_BASE_PRIVATE_SIZE; + context.user_base_addr = (void*)((unsigned char*)mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].start + 0); + context.private_base_addr = (void*)((unsigned char*)mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].start + context.user_size); + } + + return ( (OSHMEM_SUCCESS == rc) ? 
&context : NULL ); +} diff --git a/oshmem/mca/memheap/base/memheap_base_static.c b/oshmem/mca/memheap/base/memheap_base_static.c new file mode 100644 index 0000000000..57de1cb609 --- /dev/null +++ b/oshmem/mca/memheap/base/memheap_base_static.c @@ -0,0 +1,224 @@ +/* +* Copyright (c) 2012 Mellanox Technologies, Inc. +* All rights reserved. +* $COPYRIGHT$ +* +* Additional copyrights may follow +* +* $HEADER$ +*/ +#include "oshmem_config.h" + +#include "oshmem/proc/proc.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" + +#include + +struct map_segment_desc { + uint64_t start; + uint64_t end; + char perms[8]; + uint64_t offset; + char dev[8]; + uint64_t inode; + char pathname[MAXPATHLEN]; +}; + +typedef struct memheap_static_context +{ + struct + { + uint64_t start; + uint64_t end; + } mem_segs[MCA_MEMHEAP_MAX_SEGMENTS]; + int n_segments; +} memheap_static_context_t; + +static memheap_static_context_t memheap_context; + +static int __load_segments(void); +static int __check_perms(struct map_segment_desc *seg); +static int __check_address(struct map_segment_desc *seg); +static int __check_pathname(struct map_segment_desc *seg); + + +int mca_memheap_base_static_init(mca_memheap_map_t *map) +{ + /* read and parse segments from /proc/self/maps */ + int ret = OSHMEM_SUCCESS; + + assert(map); + assert(SYMB_SEG_INDEX <= map->n_segments); + + ret = __load_segments(); + + if (OSHMEM_SUCCESS == ret) + { + int i; + size_t total_mem; + + for (i = 0, total_mem = 0; i < memheap_context.n_segments; i++) + { + map_segment_t *s = &map->mem_segs[map->n_segments]; + + memset(s, 0, sizeof(*s)); + s->is_active = 0; + s->shmid = MEMHEAP_SHM_INVALID; + s->start = memheap_context.mem_segs[i].start; + s->end = memheap_context.mem_segs[i].end; + s->size = s->end - s->start; + s->type = MAP_SEGMENT_STATIC; + s->context = NULL; + map->n_segments++; + + total_mem += s->end - s->start; + } + MEMHEAP_VERBOSE(1, "Memheap static memory: %llu byte(s), %d segments", 
+ (unsigned long long)total_mem, map->n_segments); + } + + return ret; +} + + +void mca_memheap_base_static_exit(mca_memheap_map_t *map) +{ + assert(map); +} + + +static int __check_perms(struct map_segment_desc *seg) +{ + if (!strcmp(seg->perms, "rw-p") || !strcmp(seg->perms, "rwxp")) + return OSHMEM_SUCCESS; + + return OSHMEM_ERROR; +} + + +static int __check_address(struct map_segment_desc *seg) +{ + extern unsigned _end; + unsigned long data_end = (unsigned long)&_end; + + /** + * Sasha: + * SGI shmem only supports globals&static in main program. + * It does not support them in shared objects or in dlopen() + * (Clarified on PGAS 2011 tutorial) + * + * So ignored any maps that start higher then process _end + * FIXME: make sure we do not register symmetric heap twice + * if we decide to allow shared objects + */ + if (seg->start > data_end) { + MEMHEAP_VERBOSE(100, "skip segment: data _end < segment start (%llx < %llx)", + (unsigned long long)data_end, + (unsigned long long)seg->start + ); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + + +static int __check_pathname(struct map_segment_desc *seg) +{ + + return OSHMEM_SUCCESS; +#if 0 /* To press check coverity issue */ + char *p; + if ('\0' == seg->pathname[0]) + return OSHMEM_SUCCESS; + + if (0 == strncmp(seg->pathname, "/lib", 4)) + return OSHMEM_ERROR; + + if (0 == strncmp(seg->pathname, "/usr/lib", 8)) + return OSHMEM_ERROR; + + if (0 == strncmp(seg->pathname, "/dev", 4)) + return OSHMEM_ERROR; + + if (0 == strcmp(seg->pathname, "[stack]")) + return OSHMEM_ERROR; + + if (0 == strcmp(seg->pathname, "[vdso]")) + return OSHMEM_ERROR; + + if (0 == strcmp(seg->pathname, "[vsyscall]")) + return OSHMEM_ERROR; + + p = rindex(seg->pathname, '/'); + if (p) { + if (0 == strncmp(p+1, "libshmem.so", 11)) + return OSHMEM_ERROR; + + if (0 == strncmp(p+1, "libmpi.so", 9)) + return OSHMEM_ERROR; + + if (0 == strncmp(p+1, "libmca_common_sm.so", 19)) + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +#endif +} + + 
+static int __load_segments(void) +{ + FILE *fp; + char line[1024]; + struct map_segment_desc seg; + + memheap_context.n_segments = 0; + + fp = fopen("/proc/self/maps", "r"); + if (NULL == fp) { + MEMHEAP_ERROR("Failed to open /proc/self/maps"); + return OSHMEM_ERROR; + } + + while (NULL != fgets(line, sizeof(line), fp)) { + memset(&seg, 0, sizeof(seg)); + sscanf(line, "%llx-%llx %s %llx %s %llx %s", + (long long *)&seg.start, + (long long *)&seg.end, + seg.perms, + (long long *)&seg.offset, + seg.dev, + (long long *)&seg.inode, + seg.pathname); + + if (OSHMEM_ERROR == __check_address(&seg)) + continue; + + if (OSHMEM_ERROR == __check_pathname(&seg)) + continue; + + if (OSHMEM_ERROR == __check_perms(&seg)) + continue; + + MEMHEAP_VERBOSE(5, "add: %s", line); + if (MCA_MEMHEAP_MAX_SEGMENTS <= memheap_context.n_segments) { + MEMHEAP_ERROR("too many segments (max = %d): skip %s", + MCA_MEMHEAP_MAX_SEGMENTS, line); + continue; + } + if (memheap_context.n_segments > 0 && + seg.start == memheap_context.mem_segs[memheap_context.n_segments-1].end) { + MEMHEAP_VERBOSE(5, "Coalescing segment"); + memheap_context.mem_segs[memheap_context.n_segments-1].end = seg.end; + } + else { + memheap_context.mem_segs[memheap_context.n_segments].start = seg.start; + memheap_context.mem_segs[memheap_context.n_segments].end = seg.end; + memheap_context.n_segments ++; + } + } + + fclose(fp); + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/memheap/buddy/.windows b/oshmem/mca/memheap/buddy/.windows new file mode 100644 index 0000000000..efba552db4 --- /dev/null +++ b/oshmem/mca/memheap/buddy/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_dependencies=libshmem libopen-rte diff --git a/oshmem/mca/memheap/buddy/Makefile.am b/oshmem/mca/memheap/buddy/Makefile.am new file mode 100644 index 0000000000..f3d09b1a91 --- /dev/null +++ b/oshmem/mca/memheap/buddy/Makefile.am @@ -0,0 +1,41 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +buddy_sources = \ + memheap_buddy.c \ + memheap_buddy.h \ + memheap_buddy_component.c \ + memheap_buddy_component.h + +#if OMPI_BUILD_memheap_buddy_DSO +if MCA_BUILD_ompi_pml_ob1_DSO +component_noinst = +component_install = mca_memheap_buddy.la +else +component_noinst = libmca_memheap_buddy.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_memheap_buddy_la_SOURCES = $(buddy_sources) +mca_memheap_buddy_la_LDFLAGS = -module -avoid-version + +#noinst_LTLIBRARIES = $(lib) +noinst_LTLIBRARIES = $(component_noinst) +libmca_memheap_buddy_la_SOURCES = $(buddy_sources) +libmca_memheap_buddy_la_LDFLAGS = -module -avoid-version + + + diff --git a/oshmem/mca/memheap/buddy/configure.params b/oshmem/mca/memheap/buddy/configure.params new file mode 100644 index 0000000000..5a3f93008f --- /dev/null +++ b/oshmem/mca/memheap/buddy/configure.params @@ -0,0 +1,13 @@ +# -*- shell-script -*- +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/memheap/buddy/memheap_buddy.c b/oshmem/mca/memheap/buddy/memheap_buddy.c new file mode 100644 index 0000000000..1edbdc6052 --- /dev/null +++ b/oshmem/mca/memheap/buddy/memheap_buddy.c @@ -0,0 +1,693 @@ +/* Copyright (c) 2012 Mellanox Technologies, Inc. 
/*
 * Clear bit nr in the bitmap at addr.  The bitmap is addressed as an array
 * of 32-bit words: word nr / 32, bit nr % 32.
 *
 * WARNING: Non atomic version.
 *
 * The mask is built from an unsigned constant: with a signed int,
 * 1 << 31 is not representable and the shift is undefined behaviour.
 */
static inline void __clear_bit(unsigned long nr, volatile void * addr)
{
    unsigned int *word = ((unsigned int *) addr) + (nr >> 5);

    *word &= ~(1U << (nr & 31));
}
/*
 * Return the smallest order such that (1 << order) >= size, i.e. the
 * base-2 logarithm of size rounded up.  Sizes 0 and 1 both give order 0.
 */
static inline unsigned memheap_buddy_find_order(unsigned long size)
{
    /* Anything that is not an exact power of two needs one extra order. */
    unsigned exponent = (0 != (size & (size - 1))) ? 1 : 0;
    unsigned long remaining;

    /* Count how many times size can be halved before it reaches zero. */
    for (remaining = size >> 1; 0 != remaining; remaining >>= 1) {
        ++exponent;
    }

    return exponent;
}
result += bits_per_long(); + size -= bits_per_long(); + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (bits_per_long() - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + + +/** + * Initialize the Memory Heap + */ +int mca_memheap_buddy_module_init(memheap_context_t *context) +{ + if (!context || !context->user_size || !context->private_size) + { + return OSHMEM_ERR_BAD_PARAM; + } + + /* Construct a mutex object */ + OBJ_CONSTRUCT(&memheap_buddy.lock, opal_mutex_t); + + memheap_buddy.heap.max_order = memheap_log2(context->user_size); + memheap_buddy.heap.min_order = MEMHEAP_BASE_MIN_ORDER; + memheap_buddy.private_heap.max_order = memheap_log2(context->private_size); + memheap_buddy.private_heap.min_order = MEMHEAP_BASE_MIN_ORDER; + + if (context->user_size != (1ULL << memheap_buddy.heap.max_order)) { + MEMHEAP_VERBOSE(1, "Memheap rounded to the nearest power of two: requested %llu bytes, allocated %llu bytes", + (unsigned long long)context->user_size, 1ULL << memheap_buddy.heap.max_order); + } + + assert(context->private_size == (1ULL << memheap_buddy.private_heap.max_order)); + + memheap_buddy.heap.symmetric_heap = context->user_base_addr; + memheap_buddy.private_heap.symmetric_heap = context->private_base_addr; + + memheap_buddy.super.memheap_size = (1ULL << memheap_buddy.heap.max_order); + + MEMHEAP_VERBOSE(1, "symmetric heap memory (user+private): %llu bytes", + (unsigned long long)(context->user_size + context->private_size)); + + /* Initialize buddy allocator */ + if (OSHMEM_SUCCESS != buddy_init(&memheap_buddy)) { + MEMHEAP_ERROR("Failed to setup MEMHEAP buddy allocator"); + goto err; + } + + /* disable till we figure out double modex&grpcomm.bad problem */ +// memheap_modex_mkey_exchange(); + + return OSHMEM_SUCCESS; + + err: + mca_memheap_buddy_finalize(); + return OSHMEM_ERROR; +} + + +static int buddy_init(mca_memheap_buddy_module_t* 
buddy) +{ + unsigned long long total_size; + unsigned i; + unsigned long long s; + + /* Allocate and init Hashtable */ + memheap_buddy.heap.symmetric_heap_hashtable = OBJ_NEW(opal_hash_table_t); + if(NULL == memheap_buddy.heap.symmetric_heap_hashtable){ + MEMHEAP_ERROR("Opal failed to allocate hashtable object"); + goto err; + } + memheap_buddy.private_heap.symmetric_heap_hashtable = OBJ_NEW(opal_hash_table_t); + if(NULL == memheap_buddy.private_heap.symmetric_heap_hashtable){ + MEMHEAP_ERROR("Opal failed to allocate hashtable object"); + goto err; + } + + opal_hash_table_init(memheap_buddy.heap.symmetric_heap_hashtable, DEFAULT_HASHTABLE_SIZE); + opal_hash_table_init(memheap_buddy.private_heap.symmetric_heap_hashtable, DEFAULT_HASHTABLE_SIZE); + /* Init Buddy Allocator */ + buddy->heap.bits = (unsigned long**)calloc((buddy->heap.max_order + 1), sizeof(unsigned long *)); + buddy->private_heap.bits = (unsigned long**)calloc((buddy->private_heap.max_order + 1), sizeof(unsigned long *)); + buddy->heap.num_free = (unsigned int*)calloc((buddy->heap.max_order + 1), sizeof (unsigned int)); + buddy->private_heap.num_free = (unsigned int*)calloc((buddy->private_heap.max_order + 1), sizeof (unsigned int)); + if ((NULL == buddy->heap.bits) || (NULL == buddy->heap.num_free) || + (NULL == buddy->private_heap.bits) || (NULL == buddy->private_heap.num_free)){ + + MEMHEAP_ERROR("Failed to allocate buddy allocator"); + goto err; + } + + total_size = 0; + for (i = buddy->heap.min_order; i <= buddy->heap.max_order; ++i) { + s = BITS_TO_LONGS(1UL << (buddy->heap.max_order - i)); + MEMHEAP_VERBOSE(20, "%d: (order=%d) allocating %llu longs (sizeof long = %d)", + i, buddy->heap.max_order, s, (int)sizeof(unsigned long)); + total_size += s * sizeof(unsigned long); + buddy->heap.bits[i] = (unsigned long*)malloc(s * sizeof (unsigned long)); + if (NULL == buddy->heap.bits[i]){ + MEMHEAP_ERROR("Failed to allocate buddy->allocator"); + goto err; + } + bitmap_zero(buddy->heap.bits[i], 1UL << 
(buddy->heap.max_order - i)); + } + MEMHEAP_VERBOSE(5,"MEMHEAP metadata size = %llu bytes", total_size); + + total_size = 0; + for (i = buddy->private_heap.min_order; i <= buddy->private_heap.max_order; ++i) { + s = BITS_TO_LONGS(1UL << (buddy->private_heap.max_order - i)); + MEMHEAP_VERBOSE(20, "%d: (order=%d) allocating %llu longs (sizeof long = %d)", + i, buddy->private_heap.max_order, s, (int)sizeof(unsigned long)); + total_size += s * sizeof(unsigned long); + buddy->private_heap.bits[i] = (unsigned long*)malloc(s * sizeof (unsigned long)); + if (NULL == buddy->private_heap.bits[i]){ + MEMHEAP_ERROR("Failed to allocate buddy->allocator"); + goto err; + } + bitmap_zero(buddy->private_heap.bits[i], 1UL << (buddy->private_heap.max_order - i)); + } + MEMHEAP_VERBOSE(5,"private MEMHEAP metadata size = %llu bytes", total_size); + + set_bit(0, buddy->heap.bits[buddy->heap.max_order]); + set_bit(0, buddy->private_heap.bits[buddy->private_heap.max_order]); + buddy->heap.num_free[buddy->heap.max_order] = 1; + buddy->private_heap.num_free[buddy->private_heap.max_order] = 1; + + return OSHMEM_SUCCESS; + +err: + return OSHMEM_ERROR; +} + +static int buddy_cleanup(mca_memheap_buddy_module_t* buddy) +{ + unsigned int i; + + MEMHEAP_VERBOSE(5, "buddy cleanup"); + if(NULL == buddy){ + return OSHMEM_SUCCESS; + } + + for (i = 0; i <= buddy->heap.max_order; ++i){ + if(NULL != buddy->heap.bits && NULL != buddy->heap.bits[i]){ + free(buddy->heap.bits[i]); + } + } + + for (i = 0; i <= buddy->private_heap.max_order; ++i){ + if(NULL != buddy->private_heap.bits && NULL != buddy->private_heap.bits[i]){ + free(buddy->private_heap.bits[i]); + } + } + + if(NULL != buddy->heap.bits){ + free(buddy->heap.bits); + } + if(NULL != buddy->heap.num_free){ + free(buddy->heap.num_free); + } + + if(NULL != buddy->private_heap.bits){ + free(buddy->private_heap.bits); + } + if(NULL != buddy->private_heap.num_free){ + free(buddy->private_heap.num_free); + } + + OBJ_DESTRUCT(&buddy->lock); + return 
OSHMEM_SUCCESS; +} + +static int _buddy_alloc(unsigned order, uint32_t* seg, mca_memheap_buddy_heap_t *heap) +{ + uint32_t o; + uint32_t m; + + MEMHEAP_VERBOSE(20, "order=%d size=%d", order, 1<max_order; ++o) { + if (heap->num_free[o]) { + m = 1 << (heap->max_order - o); + *seg = find_first_bit(heap->bits[o], m); + MEMHEAP_VERBOSE(20, "found free bit: order=%d, bits=0x%lx m=%d, *seg=%d", o, heap->bits[o][0], m, *seg); + if (*seg < m) + goto found; + } + } + + OPAL_THREAD_UNLOCK(&memheap_buddy.lock); + return OSHMEM_ERROR; + +found: + clear_bit(*seg, heap->bits[o]); + --(heap->num_free[o]); + + while (o > order) { + --o; + *seg <<= 1; + set_bit(*seg ^ 1, heap->bits[o]); + ++(heap->num_free[o]); + } + + OPAL_THREAD_UNLOCK(&memheap_buddy.lock); + *seg <<= order; + + return OSHMEM_SUCCESS; +} + +static int _buddy_free(mca_memheap_buddy_module_t* buddy, uint32_t seg, unsigned order, mca_memheap_buddy_heap_t *heap) +{ + MEMHEAP_VERBOSE(20, "order=%d size=%d seg=%d", order, 1<>= order; + OPAL_THREAD_LOCK(&buddy->lock); + + while (test_bit(seg ^ 1, heap->bits[order])) { + clear_bit(seg ^ 1, heap->bits[order]); + --(heap->num_free[order]); + seg >>= 1; + ++order; + } + + set_bit(seg, heap->bits[order]); + ++(heap->num_free[order]); + OPAL_THREAD_UNLOCK(&buddy->lock); + return OSHMEM_SUCCESS; +} + +static int buddy_free(mca_memheap_buddy_module_t* buddy, uint32_t seg, unsigned order) +{ + return _buddy_free(buddy, seg, order, &buddy->heap); +} + +static int buddy_private_free(mca_memheap_buddy_module_t* buddy, uint32_t seg, unsigned order) +{ + return _buddy_free(buddy, seg, order, &buddy->private_heap); +} + +static int _do_alloc(uint32_t order, void **p_buff, mca_memheap_buddy_heap_t *heap) +{ + int rc; + unsigned long base; + uint32_t offset; + unsigned long addr; + + if (order < heap->min_order) + order = heap->min_order; + + *p_buff = 0; + if(order > heap->max_order) { + /* Test allocated size overflow */ + MEMHEAP_VERBOSE(5, "Allocation overflow of symmetric heap 
size"); + return OSHMEM_ERROR; + } + + base = (unsigned long)heap->symmetric_heap; + + if(OSHMEM_SUCCESS != _buddy_alloc(order, &offset, heap)){ + MEMHEAP_VERBOSE(5, "Buddy Allocator failed to return a base address"); + return OSHMEM_ERROR; + } + + /* Save the order of the allocated variable */ + addr = base + offset; + + rc = opal_hash_table_set_value_uint64(heap->symmetric_heap_hashtable, + addr, (void *)(unsigned long)order); + + if(OPAL_SUCCESS != rc) { + MEMHEAP_VERBOSE(5, "Failed to insert order to hashtable"); + goto alloc_error; + } + + *p_buff = (void*)addr; + /* no barrier because it is not required by spec! */ + return OSHMEM_SUCCESS; + +alloc_error: + _buddy_free(&memheap_buddy, offset, order, heap); + return OSHMEM_ERROR; +} + +static int do_alloc(uint32_t order, void **p_buff) +{ + return _do_alloc(order, p_buff, &(memheap_buddy.heap)); +} + +static int do_private_alloc(uint32_t order, void **p_buff) +{ + return _do_alloc(order, p_buff, &(memheap_buddy.private_heap)); +} + +/** + * Allocate size bytes on the symmetric heap. + * The allocated variable is aligned to its size. 
+ */ +int mca_memheap_buddy_alloc(size_t size, void** p_buff) +{ + + uint32_t order; + + order = memheap_buddy_find_order(size); + + return do_alloc(order, p_buff); +} + +int mca_memheap_buddy_private_alloc(size_t size, void** p_buff) +{ + uint32_t order; + int status = 0; + order = memheap_buddy_find_order(size); + + status = do_private_alloc(order, p_buff); + + MEMHEAP_VERBOSE(20, "private alloc addr: %p", *p_buff); + + return status; +} + +int mca_memheap_buddy_private_free(void* ptr) +{ + int rc; + uint32_t offset; + unsigned long addr; + unsigned long base; + void *order; + + if (0 == ptr) + { + return OSHMEM_SUCCESS; + } + + base = (unsigned long)memheap_buddy.private_heap.symmetric_heap; + addr = (unsigned long)ptr; + offset = addr - base; + + rc = opal_hash_table_get_value_uint64(memheap_buddy.private_heap.symmetric_heap_hashtable, addr, &order); + if(OPAL_SUCCESS != rc) { + return OSHMEM_ERROR; + } + + buddy_private_free(&memheap_buddy, offset, (unsigned)(unsigned long)order); + opal_hash_table_remove_value_uint64(memheap_buddy.private_heap.symmetric_heap_hashtable, addr); + + return OSHMEM_SUCCESS; +} + + +int mca_memheap_buddy_align(size_t align, size_t size, void **p_buff) +{ + uint32_t order; + + if (align == 0) { + *p_buff = 0; + return OSHMEM_ERROR; + } + + /* check that align is power of 2 */ + if (align & (align - 1)) { + *p_buff = 0; + return OSHMEM_ERROR; + } + + order = memheap_buddy_find_order(size); + if ((unsigned long)align > (1UL << order)) + order = memheap_buddy_find_order(align); + + return do_alloc(order, p_buff); +} + +int mca_memheap_buddy_realloc(size_t new_size, void *p_buff, void **p_new_buff) +{ + int rc; + uint32_t offset; + unsigned long addr; + unsigned long base; + void *order; + size_t old_size; + char *tmp_buf; + + /* equiv to alloc if old ptr is null */ + if (NULL == p_buff) + return mca_memheap_buddy_alloc(new_size, p_new_buff); + + base = (unsigned long)memheap_buddy.heap.symmetric_heap; + addr = (unsigned long)p_buff; + 
offset = addr - base; + + rc = opal_hash_table_get_value_uint64(memheap_buddy.heap.symmetric_heap_hashtable, addr, &order); + if(OPAL_SUCCESS != rc) { + *p_new_buff = NULL; + return OSHMEM_ERROR; + } + + /* equiv to free if new_size is 0 */ + if (0 == new_size) { + *p_new_buff = NULL; + return mca_memheap_buddy_free(p_buff); + } + + old_size = 1UL << (unsigned long)order; + + /* do nothing if new size is less then current size */ + if (new_size <= old_size) { + *p_new_buff = p_buff; + return OSHMEM_SUCCESS; + } + + if (new_size > (1UL << memheap_buddy.heap.max_order)) { + *p_new_buff = NULL; + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + if (old_size + new_size >= (1UL << memheap_buddy.heap.max_order)) { + /* copy via temporary buffer */ + + tmp_buf = (char *)malloc(old_size); + if (!tmp_buf) + return OSHMEM_ERR_OUT_OF_RESOURCE; + memcpy(tmp_buf, p_buff, old_size); + mca_memheap_buddy_free(p_buff); + } + else + tmp_buf = p_buff; + + /* alloc and copy data to new buffer, free old one */ + rc = mca_memheap_buddy_alloc(new_size, p_new_buff); + if (OSHMEM_SUCCESS != rc) { + *p_new_buff = NULL; + if (old_size + new_size >= (1UL << memheap_buddy.heap.max_order) && tmp_buf) + { + free(tmp_buf); + } + return rc; + } + + memcpy(*p_new_buff, tmp_buf, old_size); + + if (old_size + new_size < (1UL << memheap_buddy.heap.max_order)) + mca_memheap_buddy_free(p_buff); + else + if (tmp_buf) free(tmp_buf); + + return OSHMEM_SUCCESS; +} + +/* + * Free a variable allocated on the + * symmetric heap. 
+ */ +int mca_memheap_buddy_free(void* ptr) /* returns OSHMEM_ERROR when no order is recorded for ptr, OSHMEM_SUCCESS otherwise */ +{ + int rc; + uint32_t offset; + unsigned long addr; + unsigned long base; + void *order; /* allocation order, carried in the hashtable as a pointer-sized value */ + + base = (unsigned long)memheap_buddy.heap.symmetric_heap; + addr = (unsigned long)ptr; + offset = addr - base; /* distance of ptr from the heap base, narrowed to 32 bits */ + + rc = opal_hash_table_get_value_uint64(memheap_buddy.heap.symmetric_heap_hashtable, addr, &order); /* fetch the order stored for this address */ + if(OPAL_SUCCESS != rc) { + return OSHMEM_ERROR; + } + + buddy_free(&memheap_buddy, offset, (unsigned)(unsigned long)order); /* the status returned by buddy_free is not checked */ + opal_hash_table_remove_value_uint64(memheap_buddy.heap.symmetric_heap_hashtable, addr); + + return OSHMEM_SUCCESS; +} + + +int mca_memheap_buddy_finalize() /* releases both hashtables and the buddy bookkeeping; always returns OSHMEM_SUCCESS */ +{ + MEMHEAP_VERBOSE(5, "deregistering symmetric heap"); + + /* was not initialized - do nothing (a zero max_order is treated as the never-initialized state) */ + if (memheap_buddy.heap.max_order == 0) + return OSHMEM_SUCCESS; + + /* Destruct hashtable supporting shfree of symmetric heap variables */ + if (memheap_buddy.heap.symmetric_heap_hashtable) { + OBJ_RELEASE(memheap_buddy.heap.symmetric_heap_hashtable); + } + if (memheap_buddy.private_heap.symmetric_heap_hashtable) { + OBJ_RELEASE(memheap_buddy.private_heap.symmetric_heap_hashtable); + } + + buddy_cleanup(&memheap_buddy); /* NOTE(review): max_order is not reset afterwards, so a second call would repeat the cleanup; module_init calls finalize on its error path and component_close calls it again - confirm this cannot double free */ + + return OSHMEM_SUCCESS; +} + + +/** + * Return the base address of the symmetric heap. + */ + +static inline void* mca_memheap_buddy_get_symmetric_heap_base_addr(void) +{ + return memheap_buddy.heap.symmetric_heap; +} + +/** + * Return the last address in the symmetric heap. 
+ */ +static inline void* mca_memheap_buddy_get_symmetric_heap_last_addr(void) +{ + return (void*)((unsigned char*)(memheap_buddy.heap.symmetric_heap) + + (1ULL< +#include +#include + + +#define BITS_PER_BYTE 8 +#define __BITOPS_WORDSIZE 64 +#define DEFAULT_HASHTABLE_SIZE 100 + + + +#define BITOP_WORD(nr) ((nr) / bits_per_long()) +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(unsigned long)) +#define __BITOPS_WORDS(bits) (((bits)+__BITOPS_WORDSIZE-1)/__BITOPS_WORDSIZE) +#define clear_bit(x,y) __clear_bit((x), (y)) +#define set_bit(x,y) __set_bit((x), (y)) +#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) + +BEGIN_C_DECLS + + +struct mca_memheap_buddy_heap_t { + unsigned long **bits; /** Part of the buddy allocator */ + unsigned *num_free; /** Part of the buddy allocator */ + unsigned max_order; /** Log2 of Maximal heap size, part of the allocator */ + unsigned min_order; /** min alloc order */ + void* symmetric_heap; /** Symmetric Heap */ + opal_hash_table_t* symmetric_heap_hashtable; /** Pointer to the Symmetric heap used for moving on it */ +}; +typedef struct mca_memheap_buddy_heap_t mca_memheap_buddy_heap_t; + + +/* Structure for managing shmem symmetric heap */ +struct mca_memheap_buddy_module_t { + mca_memheap_base_module_t super; + + int priority; /** Module's Priority */ + mca_memheap_buddy_heap_t heap; + mca_memheap_buddy_heap_t private_heap; + opal_mutex_t lock; /** Part of the buddy allocator */ +}; +typedef struct mca_memheap_buddy_module_t mca_memheap_buddy_module_t; +OSHMEM_DECLSPEC extern mca_memheap_buddy_module_t memheap_buddy; + + +/* + * Buddy interface. + * Please pay attention to the new differences in the interface. 
+ */ +OSHMEM_DECLSPEC extern int mca_memheap_buddy_module_init(memheap_context_t *); /* (context) */ +OSHMEM_DECLSPEC extern int mca_memheap_buddy_alloc(size_t, void**); /* (size, p_buff) */ +OSHMEM_DECLSPEC extern int mca_memheap_buddy_realloc(size_t, void*, void **); /* (new_size, p_buff, p_new_buff) */ +OSHMEM_DECLSPEC extern int mca_memheap_buddy_align(size_t, size_t, void**); /* (align, size, p_buff) - the alignment comes first */ +OSHMEM_DECLSPEC extern int mca_memheap_buddy_free(void*); /* (ptr) */ +OSHMEM_DECLSPEC extern int mca_memheap_buddy_finalize(void); + +/* private alloc/free functions: same calling convention, served from the private heap */ +OSHMEM_DECLSPEC extern int mca_memheap_buddy_private_alloc(size_t, void**); /* (size, p_buff) */ +OSHMEM_DECLSPEC extern int mca_memheap_buddy_private_free(void*); /* (ptr) */ + + + + +/** + * static/global variables support. Consider making it a separate component + */ + + +END_C_DECLS + +#endif /* MCA_MEMHEAP_BUDDY_H */ diff --git a/oshmem/mca/memheap/buddy/memheap_buddy_component.c b/oshmem/mca/memheap/buddy/memheap_buddy_component.c new file mode 100644 index 0000000000..0b560fbcc3 --- /dev/null +++ b/oshmem/mca/memheap/buddy/memheap_buddy_component.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "oshmem_config.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/util/output.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/mca/memheap/buddy/memheap_buddy.h" +#include "memheap_buddy_component.h" + + +static int mca_memheap_buddy_component_close(void); +static mca_memheap_base_module_t* mca_memheap_buddy_component_init( memheap_context_t *, int * ); + + +static int __basic_open(void); + +mca_memheap_base_component_t mca_memheap_buddy_component = { /* descriptor of the buddy memheap component */ + { + MCA_MEMHEAP_BASE_VERSION_2_0_0, + + "buddy", /* MCA component name */ + OSHMEM_MAJOR_VERSION, /* MCA component major version */ + OSHMEM_MINOR_VERSION, /* MCA component minor version */ + OSHMEM_RELEASE_VERSION, /* MCA component release version */ + + __basic_open, + mca_memheap_buddy_component_close, + NULL + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + mca_memheap_buddy_component_init /* init entry point, defined below */ +}; + +/* Open component: nothing to set up, always returns OSHMEM_SUCCESS */ +static int __basic_open(void) +{ + return OSHMEM_SUCCESS; +} + +/* Initialize component: stores the module priority in *priority, runs mca_memheap_buddy_module_init(context), and returns the module or NULL on failure */ +mca_memheap_base_module_t* mca_memheap_buddy_component_init(memheap_context_t *context, int *priority) +{ + int rc; + + *priority = memheap_buddy.priority; /* written before init is attempted, so it is set even when NULL is returned */ + rc = mca_memheap_buddy_module_init(context); + if (OSHMEM_SUCCESS != rc) { + return NULL; + } + + return &(memheap_buddy.super); +} + +/* + * This function is automaticaly called from mca_base_components_close. + * It releases the component's allocated memory. 
+ */ +int mca_memheap_buddy_component_close() +{ + mca_memheap_buddy_finalize(); /* the status returned by finalize is discarded; close always reports success */ + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/memheap/buddy/memheap_buddy_component.h b/oshmem/mca/memheap/buddy/memheap_buddy_component.h new file mode 100644 index 0000000000..9c35f2b3c3 --- /dev/null +++ b/oshmem/mca/memheap/buddy/memheap_buddy_component.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_MEMHEAP_BUDDY_COMPONENT_H +#define MCA_MEMHEAP_BUDDY_COMPONENT_H + +BEGIN_C_DECLS + +/* + * MEMHEAP buddy component descriptor (defined in memheap_buddy_component.c). + */ +OSHMEM_MODULE_DECLSPEC extern mca_memheap_base_component_2_0_0_t mca_memheap_buddy_component; + +END_C_DECLS + +#endif diff --git a/oshmem/mca/memheap/configure.m4 b/oshmem/mca/memheap/configure.m4 new file mode 100644 index 0000000000..86d838d7e8 --- /dev/null +++ b/oshmem/mca/memheap/configure.m4 @@ -0,0 +1,19 @@ +# -*- shell-script -*- +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AC_DEFUN([MCA_oshmem_memheap_CONFIG],[ + # configure all the components + MCA_CONFIGURE_FRAMEWORK($1, $2, 1) + + # this is a direct callable component, so set that up. + MCA_SETUP_DIRECT_CALL($1, $2) +]) diff --git a/oshmem/mca/memheap/memheap.h b/oshmem/mca/memheap/memheap.h new file mode 100644 index 0000000000..93465012b2 --- /dev/null +++ b/oshmem/mca/memheap/memheap.h @@ -0,0 +1,147 @@ +/** + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_MEMHEAP_H +#define MCA_MEMHEAP_H +#include "opal/mca/mca.h" +#include "oshmem/constants.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/spml.h" + +#define DEFAULT_SYMMETRIC_HEAP_SIZE 256 /* presumably megabytes, to be fed to SIZE_IN_MEGA_BYTES - confirm at the use site */ +#define SIZE_IN_MEGA_BYTES(size_in_mb) ((size_in_mb) * 1024 * 1024) /* argument and result are parenthesized so the expansion stays correct inside larger expressions */ + +BEGIN_C_DECLS +struct mca_memheap_base_module_t; + + +typedef struct memheap_context /* memory regions handed to a memheap component at init time */ +{ + void* user_base_addr; /* start of the user symmetric heap */ + void* private_base_addr; /* start of the private heap */ + size_t user_size; /* size of the user heap in bytes */ + size_t private_size; /* size of the private heap in bytes */ +} memheap_context_t; + + +/** + * Component initialize + */ +typedef struct mca_memheap_base_module_t* (*mca_memheap_base_component_init_fn_t)(memheap_context_t *, int *priority); + +/* + * Symmetric heap allocation. Malloc like interface + */ +typedef int (*mca_memheap_base_module_alloc_fn_t)(size_t, void**); + +typedef int (*mca_memheap_base_module_memalign_fn_t)(size_t align, size_t size, void**); + +typedef int (*mca_memheap_base_module_realloc_fn_t)(size_t newsize, void *, void **); + +/* + * Symmetric heap free. + */ +typedef int (*mca_memheap_base_module_free_fn_t)(void*); + +/** + * Service functions + */ +typedef uint64_t (*mca_memheap_base_module_find_offset_fn_t)(int pe, int tr_id, unsigned long va, uint64_t rva); + + +/** + * @return mkey suitable to access pe via given transport id. rva is set to virtual address mapping of (va) + * on remote pe. + */ +typedef mca_spml_mkey_t * (*mca_memheap_base_module_get_cached_mkey_fn_t)(int pe, unsigned long va, int transport_id, uint64_t *rva); +typedef mca_spml_mkey_t * (*mca_memheap_base_module_get_local_mkey_fn_t)(unsigned long va, int transport_id); + +/* + * Symmetric heap destructor. + */ +typedef int (*mca_memheap_base_module_finalize_fn_t)(void); + +typedef int (*mca_memheap_base_is_memheap_addr_fn_t)(unsigned long va); + +/* get mkeys from all ranks */ +typedef void (*mca_memheap_base_mkey_exchange_fn_t)(void); + +/* + * memheap component descriptor. 
Contains component version, information and + * init functions + */ +struct mca_memheap_base_component_2_0_0_t{ + mca_base_component_t memheap_version; /**< version */ + mca_base_component_data_t memheap_data; /**< metadata */ + mca_memheap_base_component_init_fn_t memheap_init; /**. Therefore, some of the comments below do not apply + for this modified version. However, it is the intention to keep + differences to Doug Lea's original version minimal, hence the + comments were mostly left unchanged. + + ----------------------------------------------------------------------- + + This is a version (aka dlmalloc) of malloc/free/realloc written by + Doug Lea and released to the public domain, as explained at + http://creativecommons.org/licenses/publicdomain. Send questions, + comments, complaints, performance data, etc to dl@cs.oswego.edu + +* Version pre-2.8.4 Wed Mar 29 19:46:29 2006 (dl at gee) + + Note: There may be an updated version of this malloc obtainable at + ftp://gee.cs.oswego.edu/pub/misc/malloc.c + Check before installing! + +* Quickstart + + This library is all in one file to simplify the most common usage: + ftp it, compile it (-O3), and link it into another program. All of + the compile-time options default to reasonable values for use on + most platforms. You might later want to step through various + compile-time and dynamic tuning options. + + For convenience, an include file for code using this malloc is at: + ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.3.h + You don't really need this .h file unless you call functions not + defined in your system include files. The .h file contains only the + excerpts from this file needed for using this malloc on ANSI C/C++ + systems, so long as you haven't changed compile-time options about + naming and tuning parameters. If you do, then you can create your + own malloc.h that does include all settings by cutting at the point + indicated below. 
Note that you may already by default be using a C + library containing a malloc that is based on some version of this + malloc (for example in linux). You might still want to use the one + in this file to customize settings or to avoid overheads associated + with library versions. + +* Vital statistics: + + Supported pointer/size_t representation: 4 or 8 bytes + size_t MUST be an unsigned type of the same width as + pointers. (If you are using an ancient system that declares + size_t as a signed type, or need it to be a different width + than pointers, you can use a previous release of this malloc + (e.g. 2.7.2) supporting these.) + + Alignment: 8 bytes (default) + This suffices for nearly all current machines and C compilers. + However, you can define MALLOC_ALIGNMENT to be wider than this + if necessary (up to 128bytes), at the expense of using more space. + + Minimum overhead per allocated chunk: 4 or 8 bytes (if 4byte sizes) + 8 or 16 bytes (if 8byte sizes) + Each malloced chunk has a hidden word of overhead holding size + and status information, and additional cross-check word + if FOOTERS is defined. + + Minimum allocated size: 4-byte ptrs: 16 bytes (including overhead) + 8-byte ptrs: 32 bytes (including overhead) + + Even a request for zero bytes (i.e., malloc(0)) returns a + pointer to something of the minimum allocatable size. + The maximum overhead wastage (i.e., number of extra bytes + allocated than were requested in malloc) is less than or equal + to the minimum size, except for requests >= mmap_threshold that + are serviced via mmap(), where the worst case wastage is about + 32 bytes plus the remainder from a system page (the minimal + mmap unit); typically 4096 or 8192 bytes. 
+ + Security: static-safe; optionally more or less + The "security" of malloc refers to the ability of malicious + code to accentuate the effects of errors (for example, freeing + space that is not currently malloc'ed or overwriting past the + ends of chunks) in code that calls malloc. This malloc + guarantees not to modify any memory locations below the base of + heap, i.e., static variables, even in the presence of usage + errors. The routines additionally detect most improper frees + and reallocs. All this holds as long as the static bookkeeping + for malloc itself is not corrupted by some other means. This + is only one aspect of security -- these checks do not, and + cannot, detect all possible programming errors. + + If FOOTERS is defined nonzero, then each allocated chunk + carries an additional check word to verify that it was malloced + from its space. These check words are the same within each + execution of a program using malloc, but differ across + executions, so externally crafted fake chunks cannot be + freed. This improves security by rejecting frees/reallocs that + could corrupt heap memory, in addition to the checks preventing + writes to statics that are always on. This may further improve + security at the expense of time and space overhead. (Note that + FOOTERS may also be worth using with MSPACES.) + + By default detected errors cause the program to abort (calling + "abort()"). You can override this to instead proceed past + errors by defining PROCEED_ON_ERROR. In this case, a bad free + has no effect, and a malloc that encounters a bad address + caused by user overwrites will ignore the bad address by + dropping pointers and indices to all known memory. This may + be appropriate for programs that should continue if at all + possible in the face of programming errors, although they may + run out of memory because dropped memory is never reclaimed. 
+ + If you don't like either of these options, you can define + CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything + else. And if you are sure that your program using malloc has + no errors or vulnerabilities, you can define INSECURE to 1, + which might (or might not) provide a small performance improvement. + + Thread-safety: NOT thread-safe unless USE_LOCKS defined + When USE_LOCKS is defined, each public call to malloc, free, + etc is surrounded with either a pthread mutex or a win32 + spinlock (depending on WIN32). This is not especially fast, and + can be a major bottleneck. It is designed only to provide + minimal protection in concurrent environments, and to provide a + basis for extensions. If you are using malloc in a concurrent + program, consider instead using nedmalloc + (http://www.nedprod.com/programs/portable/nedmalloc/) or + ptmalloc (See http://www.malloc.de), which are derived + from versions of this malloc. + + System requirements: Any combination of MORECORE and/or MMAP/MUNMAP + This malloc can use unix sbrk or any emulation (invoked using + the CALL_MORECORE macro) and/or mmap/munmap or any emulation + (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system + memory. On most unix systems, it tends to work best if both + MORECORE and MMAP are enabled. On Win32, it uses emulations + based on VirtualAlloc. It also uses common C library functions + like memset. + + Compliance: I believe it is compliant with the Single Unix Specification + (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably + others as well. + +* Overview of algorithms + + This is not the fastest, most space-conserving, most portable, or + most tunable malloc ever written. However it is among the fastest + while also being among the most space-conserving, portable and + tunable. Consistent balance across these factors results in a good + general-purpose allocator for malloc-intensive programs. + + In most ways, this malloc is a best-fit allocator. 
Generally, it + chooses the best-fitting existing chunk for a request, with ties + broken in approximately least-recently-used order. (This strategy + normally maintains low fragmentation.) However, for requests less + than 256bytes, it deviates from best-fit when there is not an + exactly fitting available chunk by preferring to use space adjacent + to that used for the previous small request, as well as by breaking + ties in approximately most-recently-used order. (These enhance + locality of series of small allocations.) And for very large requests + (>= 256Kb by default), it relies on system memory mapping + facilities, if supported. (This helps avoid carrying around and + possibly fragmenting memory used only for large chunks.) + + All operations (except malloc_stats and mallinfo) have execution + times that are bounded by a constant factor of the number of bits in + a size_t, not counting any clearing in calloc or copying in realloc, + or actions surrounding MORECORE and MMAP that have times + proportional to the number of non-contiguous regions returned by + system allocation routines, which is often just 1. In real-time + applications, you can optionally suppress segment traversals using + NO_SEGMENT_TRAVERSAL, which assures bounded execution even when + system allocators return non-contiguous spaces, at the typical + expense of carrying around more memory and increased fragmentation. + + The implementation is not very modular and seriously overuses + macros. Perhaps someday all C compilers will do as good a job + inlining modular code as can now be done by brute-force expansion, + but now, enough of them seem not to. + + Some compilers issue a lot of warnings about code that is + dead/unreachable only on some platforms, and also about intentional + uses of negation on unsigned types. All known cases of each can be + ignored. 
+ + For a longer but out of date high-level description, see + http://gee.cs.oswego.edu/dl/html/malloc.html + +* MSPACES + If MSPACES is defined, then in addition to malloc, free, etc., + this file also defines mspace_malloc, mspace_free, etc. These + are versions of malloc routines that take an "mspace" argument + obtained using create_mspace, to control all internal bookkeeping. + If ONLY_MSPACES is defined, only these versions are compiled. + So if you would like to use this allocator for only some allocations, + and your system malloc for others, you can compile with + ONLY_MSPACES and then do something like... + static mspace mymspace = create_mspace(0,0); // for example + #define mymalloc(bytes) mspace_malloc(mymspace, bytes) + + (Note: If you only need one instance of an mspace, you can instead + use "USE_DL_PREFIX" to relabel the global malloc.) + + You can similarly create thread-local allocators by storing + mspaces as thread-locals. For example: + static __thread mspace tlms = 0; + void* tlmalloc(size_t bytes) { + if (tlms == 0) tlms = create_mspace(0, 0); + return mspace_malloc(tlms, bytes); + } + void tlfree(void* mem) { mspace_free(tlms, mem); } + + Unless FOOTERS is defined, each mspace is completely independent. + You cannot allocate from one and free to another (although + conformance is only weakly checked, so usage errors are not always + caught). If FOOTERS is defined, then each chunk carries around a tag + indicating its originating mspace, and frees are directed to their + originating spaces. + + ------------------------- Compile-time options --------------------------- + +Be careful in setting #define values for numerical constants of type +size_t. On some systems, literal values are not automatically extended +to size_t precision unless they are explicitly casted. You can also +use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below. 
+ +WIN32 default: defined if _WIN32 defined + Defining WIN32 sets up defaults for MS environment and compilers. + Otherwise defaults are for unix. + +MALLOC_ALIGNMENT default: (size_t)8 + Controls the minimum alignment for malloc'ed chunks. It must be a + power of two and at least 8, even on machines for which smaller + alignments would suffice. It may be defined as larger than this + though. Note however that code and data structures are optimized for + the case of 8-byte alignment. + +MSPACES default: 0 (false) + If true, compile in support for independent allocation spaces. + This is only supported if DL_HAVE_MMAP is true. + +ONLY_MSPACES default: 0 (false) + If true, only compile in mspace versions, not regular versions. + +USE_LOCKS default: 0 (false) + Causes each call to each public routine to be surrounded with + pthread or WIN32 mutex lock/unlock. (If set true, this can be + overridden on a per-mspace basis for mspace versions.) If set to a + non-zero value other than 1, locks are used, but their + implementation is left out, so lock functions must be supplied manually. + +USE_SPIN_LOCKS default: 1 iff USE_LOCKS and on x86 using gcc or MSC + If true, uses custom spin locks for locking. This is currently + supported only for x86 platforms using gcc or recent MS compilers. + Otherwise, posix locks or win32 critical sections are used. + +FOOTERS default: 0 + If true, provide extra checking and dispatching by placing + information in the footers of allocated chunks. This adds + space and time overhead. + +INSECURE default: 0 + If true, omit checks for usage errors and heap space overwrites. + +USE_DL_PREFIX default: NOT defined + Causes compiler to prefix all public routines with the string 'dl'. + This can be useful when you only want to use this malloc in one part + of a program, using your regular system malloc elsewhere. + +ABORT default: defined as abort() + Defines how to abort on failed checks. 
On most systems, a failed + check cannot die with an "assert" or even print an informative + message, because the underlying print routines in turn call malloc, + which will fail again. Generally, the best policy is to simply call + abort(). It's not very useful to do more than this because many + errors due to overwriting will show up as address faults (null, odd + addresses etc) rather than malloc-triggered checks, so will also + abort. Also, most compilers know that abort() does not return, so + can better optimize code conditionally calling it. + +PROCEED_ON_ERROR default: defined as 0 (false) + Controls whether detected bad addresses cause them to be bypassed + rather than aborting. If set, detected bad arguments to free and + realloc are ignored. And all bookkeeping information is zeroed out + upon a detected overwrite of freed heap space, thus losing the + ability to ever return it from malloc again, but enabling the + application to proceed. If PROCEED_ON_ERROR is defined, the + static variable malloc_corruption_error_count is compiled in + and can be examined to see if errors have occurred. This option + generates slower code than the default abort policy. + +DL_DEBUG default: NOT defined + The DL_DEBUG setting is mainly intended for people trying to modify + this code or diagnose problems when porting to new platforms. + However, it may also be able to better isolate user errors than just + using runtime checks. The assertions in the check routines spell + out in more detail the assumptions and invariants underlying the + algorithms. The checking is fairly extensive, and will slow down + execution noticeably. Calling malloc_stats or mallinfo with DL_DEBUG + set will attempt to check every non-mmapped allocated and free chunk + in the course of computing the summaries.
+ +ABORT_ON_ASSERT_FAILURE default: defined as 1 (true) + Debugging assertion failures can be nearly impossible if your + version of the assert macro causes malloc to be called, which will + lead to a cascade of further failures, blowing the runtime stack. + ABORT_ON_ASSERT_FAILURE causes assertion failures to call abort(), + which will usually make debugging easier. + +MALLOC_FAILURE_ACTION default: sets errno to ENOMEM, or no-op on win32 + The action to take before "return 0" when malloc fails to be able to + return memory because there is none available. + +HAVE_MORECORE default: 1 (true) unless win32 or ONLY_MSPACES + True if this system supports sbrk or an emulation of it. + +MORECORE default: sbrk + The name of the sbrk-style system routine to call to obtain more + memory. See below for guidance on writing custom MORECORE + functions. The type of the argument to sbrk/MORECORE varies across + systems. It cannot be size_t, because it supports negative + arguments, so it is normally the signed type of the same width as + size_t (sometimes declared as "intptr_t"). It doesn't much matter + though. Internally, we only call it with arguments less than half + the max value of a size_t, which should work across all reasonable + possibilities, although sometimes generating compiler warnings. See + near the end of this file for guidelines for creating a custom + version of MORECORE. + +MORECORE_CONTIGUOUS default: 1 (true) if HAVE_MORECORE + If true, take advantage of the fact that consecutive calls to MORECORE + with positive arguments always return contiguous increasing + addresses. This is true of unix sbrk. It does not hurt too much to + set it true anyway, since malloc copes with non-contiguities. + Setting it false when definitely non-contiguous saves time + and possibly wasted space it would take to discover this though. + +MORECORE_CANNOT_TRIM default: NOT defined + True if MORECORE cannot release space back to the system when given + negative arguments.
This is generally necessary only if you are + using a hand-crafted MORECORE function that cannot handle negative + arguments. + +NO_SEGMENT_TRAVERSAL default: 0 + If non-zero, suppresses traversals of memory segments + returned by either MORECORE or CALL_MMAP. This disables + merging of segments that are contiguous, and selectively + releasing them to the OS if unused, but bounds execution times. + +DL_HAVE_MMAP default: 1 (true) + True if this system supports mmap or an emulation of it. If so, and + HAVE_MORECORE is not true, MMAP is used for all system + allocation. If set and HAVE_MORECORE is true as well, MMAP is + primarily used to directly allocate very large blocks. It is also + used as a backup strategy in cases where MORECORE fails to provide + space from the system. Note: A single call to MUNMAP is assumed to be + able to unmap memory that may have been allocated using multiple calls + to MMAP, so long as they are adjacent. + +DL_HAVE_MREMAP default: 1 on linux, else 0 + If true, realloc() uses mremap() to re-allocate large blocks and + extend or shrink allocation spaces. + +MMAP_CLEARS default: 1 except on WINCE. + True if mmap clears memory so calloc doesn't need to. This is true + for standard unix mmap using /dev/zero and on WIN32 except for WINCE. + +USE_BUILTIN_FFS default: 0 (i.e., not used) + Causes malloc to use the builtin ffs() function to compute indices. + Some compilers may recognize and intrinsify ffs to be faster than the + supplied C version. Also, the case of x86 using gcc is special-cased + to an asm instruction, so is already as fast as it can be, and so + this setting has no effect. Similarly for Win32 under recent MS compilers. + (On most x86s, the asm version is only slightly faster than the C version.) + +malloc_getpagesize default: derive from system includes, or 4096. + The system page size. To the extent possible, this malloc manages + memory from the system in page-size units.
This may be (and + usually is) a function rather than a constant. This is ignored + if WIN32, where page size is determined using GetSystemInfo during + initialization. + +USE_DEV_RANDOM default: 0 (i.e., not used) + Causes malloc to use /dev/random to initialize secure magic seed for + stamping footers. Otherwise, the current time is used. + +NO_MALLINFO default: 0 + If true (non-zero), don't compile "mallinfo". This can be a simple way + of dealing with mismatches between system declarations and + those in this file. + +MALLINFO_FIELD_TYPE default: size_t + The type of the fields in the mallinfo struct. This was originally + defined as "int" in SVID etc, but is more usefully defined as + size_t. The value is used only if HAVE_USR_INCLUDE_MALLOC_H is not set. + +REALLOC_ZERO_BYTES_FREES default: not defined + This should be set if a call to realloc with zero bytes should + be the same as a call to free. Some people think it should. Otherwise, + since this malloc returns a unique pointer for malloc(0), so does + realloc(p, 0). + +LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H +LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H, LACKS_ERRNO_H +LACKS_STDLIB_H default: NOT defined unless on WIN32 + Define these if your system does not have these header files. + You might need to manually insert some of the declarations they provide. + +DEFAULT_GRANULARITY default: page size if MORECORE_CONTIGUOUS, + system_info.dwAllocationGranularity in WIN32, + otherwise 64K. + Also settable using mallopt(M_GRANULARITY, x) + The unit for allocating and deallocating memory from the system. On + most systems with contiguous MORECORE, there is no reason to + make this more than a page. However, systems with MMAP tend to + either require or encourage larger granularities. You can increase + this value to prevent system allocation functions from being called so + often, especially if they are slow. The value must be at least one + page and must be a power of two.
Setting to 0 causes initialization + to either page size or win32 region size. (Note: In previous + versions of malloc, the equivalent of this option was called + "TOP_PAD") + +DEFAULT_TRIM_THRESHOLD default: 2MB + Also settable using mallopt(M_TRIM_THRESHOLD, x) + The maximum amount of unused top-most memory to keep before + releasing via malloc_trim in free(). Automatic trimming is mainly + useful in long-lived programs using contiguous MORECORE. Because + trimming via sbrk can be slow on some systems, and can sometimes be + wasteful (in cases where programs immediately afterward allocate + more large chunks) the value should be high enough so that your + overall system performance would improve by releasing this much + memory. As a rough guide, you might set to a value close to the + average size of a process (program) running on your system. + Releasing this much memory would allow such a process to run in + memory. Generally, it is worth tuning trim thresholds when a + program undergoes phases where several large chunks are allocated + and released in ways that can reuse each other's storage, perhaps + mixed with phases where there are no such chunks at all. The trim + value must be greater than page size to have any useful effect. To + disable trimming completely, you can set to MAX_SIZE_T. Note that the trick + some people use of mallocing a huge space and then freeing it at + program startup, in an attempt to reserve system memory, doesn't + have the intended effect under automatic trimming, since that memory + will immediately be returned to the system. + +DEFAULT_MMAP_THRESHOLD default: 256K + Also settable using mallopt(M_MMAP_THRESHOLD, x) + The request size threshold for using MMAP to directly service a + request. Requests of at least this size that cannot be allocated + using already-existing space will be serviced via mmap. (If enough + normal freed space already exists it is used instead.) 
Using mmap + segregates relatively large chunks of memory so that they can be + individually obtained and released from the host system. A request + serviced through mmap is never reused by any other request (at least + not directly; the system may just so happen to remap successive + requests to the same locations). Segregating space in this way has + the benefits that: Mmapped space can always be individually released + back to the system, which helps keep the system level memory demands + of a long-lived program low. Also, mapped memory doesn't become + `locked' between other chunks, as can happen with normally allocated + chunks, which means that even trimming via malloc_trim would not + release them. However, it has the disadvantage that the space + cannot be reclaimed, consolidated, and then used to service later + requests, as happens with normal chunks. The advantages of mmap + nearly always outweigh disadvantages for "large" chunks, but the + value of "large" may vary across systems. The default is an + empirically derived value that works well in most systems. You can + disable mmap by setting to MAX_SIZE_T. + +MAX_RELEASE_CHECK_RATE default: 255 unless not DL_HAVE_MMAP + The number of consolidated frees between checks to release + unused segments when freeing. When using non-contiguous segments, + especially with multiple mspaces, checking only for topmost space + doesn't always suffice to trigger trimming. To compensate for this, + free() will, with a period of MAX_RELEASE_CHECK_RATE (or the + current number of segments, if greater) try to release unused + segments to the OS when freeing chunks that result in + consolidation. The best value for this parameter is a compromise + between slowing down frees with relatively costly checks that + rarely trigger versus holding on to unused memory. To effectively + disable, set to MAX_SIZE_T. This may lead to a very slight speed + improvement at the expense of carrying around more memory. 
+*/ + +#ifndef WIN32 +#ifdef _WIN32 +#define WIN32 1 +#endif /* _WIN32 */ +#endif /* WIN32 */ +#ifdef WIN32 +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#define DL_HAVE_MMAP 1 +#define HAVE_MORECORE 0 +#define LACKS_UNISTD_H +#define LACKS_SYS_PARAM_H +#define LACKS_SYS_MMAN_H +#define LACKS_STRING_H +#define LACKS_STRINGS_H +#define LACKS_SYS_TYPES_H +#define LACKS_ERRNO_H +#define MALLOC_FAILURE_ACTION +#ifdef _WIN32_WCE /* WINCE reportedly does not clear */ +#define MMAP_CLEARS 0 +#else +#define MMAP_CLEARS 1 +#endif /* _WIN32_WCE */ +#endif /* WIN32 */ + +#if defined(DARWIN) || defined(_DARWIN) +/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */ +#ifndef HAVE_MORECORE +#define HAVE_MORECORE 0 +#define DL_HAVE_MMAP 1 +#endif /* HAVE_MORECORE */ +#endif /* DARWIN */ + +#ifndef LACKS_SYS_TYPES_H +#include <sys/types.h> /* For size_t */ +#endif /* LACKS_SYS_TYPES_H */ + +/* The maximum possible size_t value has all bits set */ +#define MAX_SIZE_T (~(size_t)0) + +#ifndef ONLY_MSPACES +#define ONLY_MSPACES 0 +#endif /* ONLY_MSPACES */ +#ifndef MSPACES +#if ONLY_MSPACES +#define MSPACES 1 +#else /* ONLY_MSPACES */ +#define MSPACES 0 +#endif /* ONLY_MSPACES */ +#endif /* MSPACES */ +#ifndef MALLOC_ALIGNMENT +#define MALLOC_ALIGNMENT ((size_t)8U) +#endif /* MALLOC_ALIGNMENT */ +#ifndef FOOTERS +#define FOOTERS 0 +#endif /* FOOTERS */ +#ifndef ABORT +#define ABORT abort() +#endif /* ABORT */ +#ifndef ABORT_ON_ASSERT_FAILURE +#define ABORT_ON_ASSERT_FAILURE 1 +#endif /* ABORT_ON_ASSERT_FAILURE */ +#ifndef PROCEED_ON_ERROR +#define PROCEED_ON_ERROR 0 +#endif /* PROCEED_ON_ERROR */ +#ifndef USE_LOCKS +#define USE_LOCKS 0 +#endif /* USE_LOCKS */ +#ifndef USE_SPIN_LOCKS +#if USE_LOCKS && ((defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(_MSC_VER) && _MSC_VER>=1310)) +#define USE_SPIN_LOCKS 1 +#else +#define USE_SPIN_LOCKS 0 +#endif /* USE_LOCKS && ...
*/ +#endif /* USE_SPIN_LOCKS */ +#ifndef INSECURE +#define INSECURE 0 +#endif /* INSECURE */ +#ifndef DL_HAVE_MMAP +#define DL_HAVE_MMAP 1 +#endif /* DL_HAVE_MMAP */ +#ifndef MMAP_CLEARS +#define MMAP_CLEARS 1 +#endif /* MMAP_CLEARS */ +#ifndef DL_HAVE_MREMAP +#ifdef linux +#define DL_HAVE_MREMAP 1 +#else /* linux */ +#define DL_HAVE_MREMAP 0 +#endif /* linux */ +#endif /* DL_HAVE_MREMAP */ +#ifndef MALLOC_FAILURE_ACTION +#define MALLOC_FAILURE_ACTION errno = ENOMEM; +#endif /* MALLOC_FAILURE_ACTION */ +#ifndef HAVE_MORECORE +#if ONLY_MSPACES +#define HAVE_MORECORE 0 +#else /* ONLY_MSPACES */ +#define HAVE_MORECORE 1 +#endif /* ONLY_MSPACES */ +#endif /* HAVE_MORECORE */ +#if !HAVE_MORECORE +#define MORECORE_CONTIGUOUS 0 +#else /* !HAVE_MORECORE */ +#ifndef MORECORE +#define MORECORE sbrk +#endif /* MORECORE */ +#ifndef MORECORE_CONTIGUOUS +#define MORECORE_CONTIGUOUS 1 +#endif /* MORECORE_CONTIGUOUS */ +#endif /* HAVE_MORECORE */ +#ifndef DEFAULT_GRANULARITY +#if MORECORE_CONTIGUOUS +#define DEFAULT_GRANULARITY (0) /* 0 means to compute in init_mparams */ +#else /* MORECORE_CONTIGUOUS */ +#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U) +#endif /* MORECORE_CONTIGUOUS */ +#endif /* DEFAULT_GRANULARITY */ +#ifndef DEFAULT_TRIM_THRESHOLD +#ifndef MORECORE_CANNOT_TRIM +#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U) +#else /* MORECORE_CANNOT_TRIM */ +#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T +#endif /* MORECORE_CANNOT_TRIM */ +#endif /* DEFAULT_TRIM_THRESHOLD */ +#ifndef DEFAULT_MMAP_THRESHOLD +#if DL_HAVE_MMAP +#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U) +#else /* DL_HAVE_MMAP */ +#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T +#endif /* DL_HAVE_MMAP */ +#endif /* DEFAULT_MMAP_THRESHOLD */ +#ifndef MAX_RELEASE_CHECK_RATE +#if DL_HAVE_MMAP +#define MAX_RELEASE_CHECK_RATE 255 +#else +#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T +#endif /* DL_HAVE_MMAP */ +#endif /* MAX_RELEASE_CHECK_RATE */ +#ifndef USE_BUILTIN_FFS 
+#define USE_BUILTIN_FFS 0 +#endif /* USE_BUILTIN_FFS */ +#ifndef USE_DEV_RANDOM +#define USE_DEV_RANDOM 0 +#endif /* USE_DEV_RANDOM */ +#ifndef NO_MALLINFO +#define NO_MALLINFO 0 +#endif /* NO_MALLINFO */ +#ifndef MALLINFO_FIELD_TYPE +#define MALLINFO_FIELD_TYPE size_t +#endif /* MALLINFO_FIELD_TYPE */ +#ifndef NO_SEGMENT_TRAVERSAL +#define NO_SEGMENT_TRAVERSAL 0 +#endif /* NO_SEGMENT_TRAVERSAL */ + +/* + mallopt tuning options. SVID/XPG defines four standard parameter + numbers for mallopt, normally defined in malloc.h. None of these + are used in this malloc, so setting them has no effect. But this + malloc does support the following options. +*/ + +#define M_TRIM_THRESHOLD (-1) +#define M_GRANULARITY (-2) +#define M_MMAP_THRESHOLD (-3) + +/* ------------------------ Mallinfo declarations ------------------------ */ + +#if !NO_MALLINFO +/* + This version of malloc supports the standard SVID/XPG mallinfo + routine that returns a struct containing usage properties and + statistics. It should work on any system that has a + /usr/include/malloc.h defining struct mallinfo. The main + declaration needed is the mallinfo struct that is returned (by-copy) + by mallinfo(). The mallinfo struct contains a bunch of fields that + are not even meaningful in this version of malloc. These fields are + instead filled by mallinfo() with other numbers that might be of + interest. + + HAVE_USR_INCLUDE_MALLOC_H should be set if you have a + /usr/include/malloc.h file that includes a declaration of struct + mallinfo. If so, it is included; else a compliant version is + declared below. These must be precisely the same for mallinfo() to + work. The original SVID version of this struct, defined on most + systems with mallinfo, declares all fields as ints. But some others + define as unsigned long. If your system defines the fields using a + type of different width than listed here, you MUST #include your + system version and #define HAVE_USR_INCLUDE_MALLOC_H.
+*/ + +/* #define HAVE_USR_INCLUDE_MALLOC_H */ + +#ifdef HAVE_USR_INCLUDE_MALLOC_H +#include "/usr/include/malloc.h" +#else /* HAVE_USR_INCLUDE_MALLOC_H */ + +struct mallinfo { + MALLINFO_FIELD_TYPE arena; /* non-mmapped space allocated from system */ + MALLINFO_FIELD_TYPE ordblks; /* number of free chunks */ + MALLINFO_FIELD_TYPE smblks; /* always 0 */ + MALLINFO_FIELD_TYPE hblks; /* always 0 */ + MALLINFO_FIELD_TYPE hblkhd; /* space in mmapped regions */ + MALLINFO_FIELD_TYPE usmblks; /* maximum total allocated space */ + MALLINFO_FIELD_TYPE fsmblks; /* always 0 */ + MALLINFO_FIELD_TYPE uordblks; /* total allocated space */ + MALLINFO_FIELD_TYPE fordblks; /* total free space */ + MALLINFO_FIELD_TYPE keepcost; /* releasable (via malloc_trim) space */ +}; + +#endif /* HAVE_USR_INCLUDE_MALLOC_H */ +#endif /* NO_MALLINFO */ + +/* + Try to persuade compilers to inline. The most critical functions for + inlining are defined as macros, so these aren't used for them. +*/ + +#ifndef FORCEINLINE + #if defined(__GNUC__) +#define FORCEINLINE __inline __attribute__ ((always_inline)) + #elif defined(_MSC_VER) + #define FORCEINLINE __forceinline + #endif +#endif +#ifndef NOINLINE + #if defined(__GNUC__) + #define NOINLINE __attribute__ ((noinline)) + #elif defined(_MSC_VER) + #define NOINLINE __declspec(noinline) + #else + #define NOINLINE + #endif +#endif + +#ifdef __cplusplus +extern "C" { +#ifndef FORCEINLINE + #define FORCEINLINE inline +#endif +#endif /* __cplusplus */ +#ifndef FORCEINLINE + #define FORCEINLINE +#endif + +#if !ONLY_MSPACES + +/* ------------------- Declarations of public routines ------------------- */ + +#ifndef USE_DL_PREFIX +#define dlcalloc calloc +#define dlfree free +#define dlmalloc malloc +#define dlmemalign memalign +#define dlrealloc realloc +#define dlvalloc valloc +#define dlpvalloc pvalloc +#define dlmallinfo mallinfo +#define dlmallopt mallopt +#define dlmalloc_trim malloc_trim +#define dlmalloc_stats malloc_stats +#define 
dlmalloc_usable_size malloc_usable_size +#define dlmalloc_footprint malloc_footprint +#define dlmalloc_max_footprint malloc_max_footprint +#define dlindependent_calloc independent_calloc +#define dlindependent_comalloc independent_comalloc +#endif /* USE_DL_PREFIX */ + + +/* + malloc(size_t n) + Returns a pointer to a newly allocated chunk of at least n bytes, or + null if no space is available, in which case errno is set to ENOMEM + on ANSI C systems. + + If n is zero, malloc returns a minimum-sized chunk. (The minimum + size is 16 bytes on most 32bit systems, and 32 bytes on 64bit + systems.) Note that size_t is an unsigned type, so calls with + arguments that would be negative if signed are interpreted as + requests for huge amounts of space, which will often fail. The + maximum supported value of n differs across systems, but is in all + cases less than the maximum representable value of a size_t. +*/ +void* dlmalloc(size_t); + +/* + free(void* p) + Releases the chunk of memory pointed to by p, that had been previously + allocated using malloc or a related routine such as realloc. + It has no effect if p is null. If p was not malloced or already + freed, free(p) will by default cause the current program to abort. +*/ +void dlfree(void*); + +/* + calloc(size_t n_elements, size_t element_size); + Returns a pointer to n_elements * element_size bytes, with all locations + set to zero. +*/ +void* dlcalloc(size_t, size_t); + +/* + realloc(void* p, size_t n) + Returns a pointer to a chunk of size n that contains the same data + as does chunk p up to the minimum of (n, p's size) bytes, or null + if no space is available. + + The returned pointer may or may not be the same as p. The algorithm + prefers extending p in most cases when possible, otherwise it + employs the equivalent of a malloc-copy-free sequence. + + If p is null, realloc is equivalent to malloc. + + If space is not available, realloc returns null, errno is set (if on + ANSI) and p is NOT freed. 
+ + If n is for fewer bytes than already held by p, the newly unused + space is lopped off and freed if possible. realloc with a size + argument of zero (re)allocates a minimum-sized chunk. + + The old unix realloc convention of allowing the last-free'd chunk + to be used as an argument to realloc is not supported. +*/ + +void* dlrealloc(void*, size_t); + +/* + memalign(size_t alignment, size_t n); + Returns a pointer to a newly allocated chunk of n bytes, aligned + in accord with the alignment argument. + + The alignment argument should be a power of two. If the argument is + not a power of two, the nearest greater power is used. + 8-byte alignment is guaranteed by normal malloc calls, so don't + bother calling memalign with an argument of 8 or less. + + Overreliance on memalign is a sure way to fragment space. +*/ +void* dlmemalign(size_t, size_t); + +/* + valloc(size_t n); + Equivalent to memalign(pagesize, n), where pagesize is the page + size of the system. If the pagesize is unknown, 4096 is used. +*/ +void* dlvalloc(size_t); + +/* + mallopt(int parameter_number, int parameter_value) + Sets tunable parameters. The format is to provide a + (parameter-number, parameter-value) pair. mallopt then sets the + corresponding parameter to the argument value if it can (i.e., so + long as the value is meaningful), and returns 1 if successful else + 0. SVID/XPG/ANSI defines four standard param numbers for mallopt, + normally defined in malloc.h. None of these are used in this malloc, + so setting them has no effect. But this malloc also supports other + options in mallopt. See below for details. Briefly, supported + parameters are as follows (listed defaults are for "typical" + configurations).
+ + Symbol param # default allowed param values + M_TRIM_THRESHOLD -1 2*1024*1024 any (MAX_SIZE_T disables) + M_GRANULARITY -2 page size any power of 2 >= page size + M_MMAP_THRESHOLD -3 256*1024 any (or 0 if no MMAP support) +*/ +int dlmallopt(int, int); + +/* + malloc_footprint(); + Returns the number of bytes obtained from the system. The total + number of bytes allocated by malloc, realloc etc., is less than this + value. Unlike mallinfo, this function returns only a precomputed + result, so can be called frequently to monitor memory consumption. + Even if locks are otherwise defined, this function does not use them, + so results might not be up to date. +*/ +size_t dlmalloc_footprint(void); + +/* + malloc_max_footprint(); + Returns the maximum number of bytes obtained from the system. This + value will be greater than current footprint if deallocated space + has been reclaimed by the system. The peak number of bytes allocated + by malloc, realloc etc., is less than this value. Unlike mallinfo, + this function returns only a precomputed result, so can be called + frequently to monitor memory consumption. Even if locks are + otherwise defined, this function does not use them, so results might + not be up to date. +*/ +size_t dlmalloc_max_footprint(void); + +#if !NO_MALLINFO +/* + mallinfo() + Returns (by copy) a struct containing various summary statistics: + + arena: current total non-mmapped bytes allocated from system + ordblks: the number of free chunks + smblks: always zero. + hblks: current number of mmapped regions + hblkhd: total bytes held in mmapped regions + usmblks: the maximum total allocated space. This will be greater + than current total if trimming has occurred. + fsmblks: always zero + uordblks: current total allocated space (normal or mmapped) + fordblks: total free space + keepcost: the maximum number of bytes that could ideally be released + back to system via malloc_trim. ("ideally" means that + it ignores page restrictions etc.) 
+ + Because these fields are ints, but internal bookkeeping may + be kept as longs, the reported values may wrap around zero and + thus be inaccurate. +*/ +struct mallinfo dlmallinfo(void); +#endif /* NO_MALLINFO */ + +/* + independent_calloc(size_t n_elements, size_t element_size, void* chunks[]); + + independent_calloc is similar to calloc, but instead of returning a + single cleared space, it returns an array of pointers to n_elements + independent elements that can hold contents of size elem_size, each + of which starts out cleared, and can be independently freed, + realloc'ed etc. The elements are guaranteed to be adjacently + allocated (this is not guaranteed to occur with multiple callocs or + mallocs), which may also improve cache locality in some + applications. + + The "chunks" argument is optional (i.e., may be null, which is + probably the most typical usage). If it is null, the returned array + is itself dynamically allocated and should also be freed when it is + no longer needed. Otherwise, the chunks array must be of at least + n_elements in length. It is filled in with the pointers to the + chunks. + + In either case, independent_calloc returns this pointer array, or + null if the allocation failed. If n_elements is zero and "chunks" + is null, it returns a chunk representing an array with zero elements + (which should be freed if not wanted). + + Each element must be individually freed when it is no longer + needed. If you'd like to instead be able to free all at once, you + should instead use regular calloc and assign pointers into this + space to represent elements. (In this case though, you cannot + independently free elements.) + + independent_calloc simplifies and speeds up implementations of many + kinds of pools. It may also be useful when constructing large data + structures that initially have a fixed number of fixed-sized nodes, + but the number is not known at compile time, and some of the nodes + may later need to be freed. 
For example: + + struct Node { int item; struct Node* next; }; + + struct Node* build_list() { + struct Node** pool; + int n = read_number_of_nodes_needed(); + if (n <= 0) return 0; + pool = (struct Node**)(independent_calloc(n, sizeof(struct Node), 0)); + if (pool == 0) die(); + // organize into a linked list... + struct Node* first = pool[0]; + for (int i = 0; i < n-1; ++i) + pool[i]->next = pool[i+1]; + free(pool); // Can now free the array (or not, if it is needed later) + return first; + } +*/ +void** dlindependent_calloc(size_t, size_t, void**); + +/* + independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]); + + independent_comalloc allocates, all at once, a set of n_elements + chunks with sizes indicated in the "sizes" array. It returns + an array of pointers to these elements, each of which can be + independently freed, realloc'ed etc. The elements are guaranteed to + be adjacently allocated (this is not guaranteed to occur with + multiple callocs or mallocs), which may also improve cache locality + in some applications. + + The "chunks" argument is optional (i.e., may be null). If it is null + the returned array is itself dynamically allocated and should also + be freed when it is no longer needed. Otherwise, the chunks array + must be of at least n_elements in length. It is filled in with the + pointers to the chunks. + + In either case, independent_comalloc returns this pointer array, or + null if the allocation failed. If n_elements is zero and chunks is + null, it returns a chunk representing an array with zero elements + (which should be freed if not wanted). + + Each element must be individually freed when it is no longer + needed. If you'd like to instead be able to free all at once, you + should instead use a single regular malloc, and assign pointers at + particular offsets in the aggregate space. (In this case though, you + cannot independently free elements.)
+ + independent_comalloc differs from independent_calloc in that each + element may have a different size, and also that it does not + automatically clear elements. + + independent_comalloc can be used to speed up allocation in cases + where several structs or objects must always be allocated at the + same time. For example: + + struct Head { ... }; + struct Foot { ... }; + + void send_message(char* msg) { + int msglen = strlen(msg); + size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) }; + void* chunks[3]; + if (independent_comalloc(3, sizes, chunks) == 0) + die(); + struct Head* head = (struct Head*)(chunks[0]); + char* body = (char*)(chunks[1]); + struct Foot* foot = (struct Foot*)(chunks[2]); + // ... + } + + In general though, independent_comalloc is worth using only for + larger values of n_elements. For small values, you probably won't + detect enough difference from a series of malloc calls to bother. + + Overuse of independent_comalloc can increase overall memory usage, + since it cannot reuse existing noncontiguous small chunks that + might be available for some of the elements. +*/ +void** dlindependent_comalloc(size_t, size_t*, void**); + + +/* + pvalloc(size_t n); + Equivalent to valloc(minimum-page-that-holds(n)), that is, + round up n to nearest pagesize. + */ +void* dlpvalloc(size_t); + +/* + malloc_trim(size_t pad); + + If possible, gives memory back to the system (via negative arguments + to sbrk) if there is unused memory at the `high' end of the malloc + pool or in unused MMAP segments. You can call this after freeing + large blocks of memory to potentially reduce the system-level memory + requirements of a program. However, it cannot guarantee to reduce + memory. Under some allocation patterns, some large free blocks of + memory will be locked between two used chunks, so they cannot be + given back to the system. + + The `pad' argument to malloc_trim represents the amount of free + trailing space to leave untrimmed.
If this argument is zero, only + the minimum amount of memory to maintain internal data structures + will be left. Non-zero arguments can be supplied to maintain enough + trailing space to service future expected allocations without having + to re-obtain memory from the system. + + Malloc_trim returns 1 if it actually released any memory, else 0. +*/ +int dlmalloc_trim(size_t); + +/* + malloc_usable_size(void* p); + + Returns the number of bytes you can actually use in + an allocated chunk, which may be more than you requested (although + often not) due to alignment and minimum size constraints. + You can use this many bytes without worrying about + overwriting other allocated objects. This is not a particularly great + programming practice. malloc_usable_size can be more useful in + debugging and assertions, for example: + + p = malloc(n); + assert(malloc_usable_size(p) >= 256); +*/ +size_t dlmalloc_usable_size(void*); + +/* + malloc_stats(); + Prints on stderr the amount of space obtained from the system (both + via sbrk and mmap), the maximum amount (which may be more than + current if malloc_trim and/or munmap got called), and the current + number of bytes allocated via malloc (or realloc, etc) but not yet + freed. Note that this is the number of bytes allocated, not the + number requested. It will be larger than the number requested + because of alignment and bookkeeping overhead. Because it includes + alignment wastage as being in use, this figure may be greater than + zero even when no user-level chunks are allocated. + + The reported current and maximum system memory can be inaccurate if + a program makes other calls to system memory allocation functions + (normally sbrk) outside of malloc. + + malloc_stats prints only the most commonly interesting statistics. + More information can be obtained by calling mallinfo. 
+*/ +void dlmalloc_stats(void); + +#endif /* ONLY_MSPACES */ + +#if MSPACES + +/* + mspace is an opaque type representing an independent + region of space that supports mspace_malloc, etc. +*/ +typedef void* mspace; + +/* + create_mspace creates and returns a new independent space with the + given initial capacity, or, if 0, the default granularity size. It + returns null if there is no system memory available to create the + space. If argument locked is non-zero, the space uses a separate + lock to control access. The capacity of the space will grow + dynamically as needed to service mspace_malloc requests. You can + control the sizes of incremental increases of this space by + compiling with a different DEFAULT_GRANULARITY or dynamically + setting with mallopt(M_GRANULARITY, value). +*/ +mspace create_mspace(size_t capacity, int locked); + +/* + destroy_mspace destroys the given space, and attempts to return all + of its memory back to the system, returning the total number of + bytes freed. After destruction, the results of access to all memory + used by the space become undefined. +*/ +size_t destroy_mspace(mspace msp); + +/* + create_mspace_with_base uses the memory supplied as the initial base + of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this + space is used for bookkeeping, so the capacity must be at least this + large. (Otherwise 0 is returned.) When this initial space is + exhausted, additional memory will be obtained from the system. + Destroying this space will deallocate all additionally allocated + space (if possible) but not the initial base. +*/ +mspace create_mspace_with_base(void* base, size_t capacity, int locked); + +/* + mspace_malloc behaves as malloc, but operates within + the given space. +*/ +void* mspace_malloc(mspace msp, size_t bytes); + +/* + mspace_free behaves as free, but operates within + the given space. + + If compiled with FOOTERS==1, mspace_free is not actually needed. 
+ free may be called instead of mspace_free because freed chunks from + any space are handled by their originating spaces. +*/ +void mspace_free(mspace msp, void* mem); + +/* + mspace_realloc behaves as realloc, but operates within + the given space. + + If compiled with FOOTERS==1, mspace_realloc is not actually + needed. realloc may be called instead of mspace_realloc because + realloced chunks from any space are handled by their originating + spaces. +*/ +void* mspace_realloc(mspace msp, void* mem, size_t newsize); + +/* + mspace_calloc behaves as calloc, but operates within + the given space. +*/ +void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size); + +/* + mspace_memalign behaves as memalign, but operates within + the given space. +*/ +void* mspace_memalign(mspace msp, size_t alignment, size_t bytes); + +/* + mspace_independent_calloc behaves as independent_calloc, but + operates within the given space. +*/ +void** mspace_independent_calloc(mspace msp, size_t n_elements, + size_t elem_size, void* chunks[]); + +/* + mspace_independent_comalloc behaves as independent_comalloc, but + operates within the given space. +*/ +void** mspace_independent_comalloc(mspace msp, size_t n_elements, + size_t sizes[], void* chunks[]); + +/* + mspace_footprint() returns the number of bytes obtained from the + system for this space. +*/ +size_t mspace_footprint(mspace msp); + +/* + mspace_max_footprint() returns the peak number of bytes obtained from the + system for this space. +*/ +size_t mspace_max_footprint(mspace msp); + + +#if !NO_MALLINFO +/* + mspace_mallinfo behaves as mallinfo, but reports properties of + the given space. +*/ +struct mallinfo mspace_mallinfo(mspace msp); +#endif /* NO_MALLINFO */ + +/* + mspace_malloc_stats behaves as malloc_stats, but reports + properties of the given space. +*/ +void mspace_malloc_stats(mspace msp); + +/* + mspace_trim behaves as malloc_trim, but + operates within the given space. 
+*/ +int mspace_trim(mspace msp, size_t pad); + +/* + An alias for mallopt. +*/ +int mspace_mallopt(int, int); + +#endif /* MSPACES */ + +#ifdef __cplusplus +}; /* end of extern "C" */ +#endif /* __cplusplus */ + +/* + ======================================================================== + To make a fully customizable malloc.h header file, cut everything + above this line, put into file malloc.h, edit to suit, and #include it + on the next line, as well as in programs that use this malloc. + ======================================================================== +*/ + +/* #include "malloc.h" */ + +/*------------------------------ internal #includes ---------------------- */ + +#ifdef WIN32 +#pragma warning( disable : 4146 ) /* no "unsigned" warnings */ +#endif /* WIN32 */ + +#include <stdio.h> /* for printing in malloc_stats */ + +#ifndef LACKS_ERRNO_H +#include <errno.h> /* for MALLOC_FAILURE_ACTION */ +#endif /* LACKS_ERRNO_H */ +#if FOOTERS +#include <time.h> /* for magic initialization */ +#endif /* FOOTERS */ +#ifndef LACKS_STDLIB_H +#include <stdlib.h> /* for abort() */ +#endif /* LACKS_STDLIB_H */ +#ifdef DL_DEBUG +#if ABORT_ON_ASSERT_FAILURE +#define dl_assert(x) if(!(x)) ABORT +#else /* ABORT_ON_ASSERT_FAILURE */ +#include <assert.h> +#endif /* ABORT_ON_ASSERT_FAILURE */ +#else /* DL_DEBUG */ +#define dl_assert(x) assert(x) +#include <assert.h> /* for the assert() that dl_assert expands to */ +#endif /* DL_DEBUG */ +#ifndef LACKS_STRING_H +#include <string.h> /* for memset etc */ +#endif /* LACKS_STRING_H */ +#if USE_BUILTIN_FFS +#ifndef LACKS_STRINGS_H +#include <strings.h> /* for ffs */ +#endif /* LACKS_STRINGS_H */ +#endif /* USE_BUILTIN_FFS */ +#if DL_HAVE_MMAP +#ifndef LACKS_SYS_MMAN_H +#include <sys/mman.h> /* for mmap */ +#endif /* LACKS_SYS_MMAN_H */ +#ifndef LACKS_FCNTL_H +#include <fcntl.h> /* for open() and O_RDWR in the /dev/zero mmap fallback */ +#endif /* LACKS_FCNTL_H */ +#endif /* DL_HAVE_MMAP */ +#if HAVE_MORECORE +#ifndef LACKS_UNISTD_H +#include <unistd.h> /* for sbrk */ +#else /* LACKS_UNISTD_H */ +#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) +extern void* sbrk(ptrdiff_t); +#endif /* FreeBSD etc */ +#endif /* LACKS_UNISTD_H 
*/ +#endif /* HAVE_MORECORE */ + +/* Declarations for locking */ +#if USE_LOCKS +#ifndef WIN32 +#include <pthread.h> /* for pthread_t, pthread_self and the pthread_mutex_* calls used by the lock code */ +#if defined (__SVR4) && defined (__sun) /* solaris */ +#include <thread.h> /* for thr_yield */ +#endif /* solaris */ +#else +#ifndef _M_AMD64 +/* These are already defined on AMD64 builds */ +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ +LONG __cdecl _InterlockedCompareExchange(LPLONG volatile Dest, LONG Exchange, LONG Comp); +LONG __cdecl _InterlockedExchange(LPLONG volatile Target, LONG Value); +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* _M_AMD64 */ +#pragma intrinsic (_InterlockedCompareExchange) +#pragma intrinsic (_InterlockedExchange) +#define interlockedcompareexchange _InterlockedCompareExchange +#define interlockedexchange _InterlockedExchange +#endif /* Win32 */ +#endif /* USE_LOCKS */ + +/* Declarations for bit scanning on win32 */ +#if defined(_MSC_VER) && _MSC_VER>=1300 +#ifndef BitScanForward /* Try to avoid pulling in WinNT.h */ +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ +unsigned char _BitScanForward(unsigned long *index, unsigned long mask); +unsigned char _BitScanReverse(unsigned long *index, unsigned long mask); +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#define BitScanForward _BitScanForward +#define BitScanReverse _BitScanReverse +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse) +#endif /* BitScanForward */ +#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */ + +#ifndef WIN32 +#ifndef malloc_getpagesize +# ifdef _SC_PAGESIZE /* some SVR4 systems omit an underscore */ +# ifndef _SC_PAGE_SIZE +# define _SC_PAGE_SIZE _SC_PAGESIZE +# endif +# endif +# ifdef _SC_PAGE_SIZE +# define malloc_getpagesize sysconf(_SC_PAGE_SIZE) +# else +# if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE) + extern size_t getpagesize(); +# define malloc_getpagesize getpagesize() +# else +# ifdef WIN32 /* use supplied emulation of getpagesize */ +# define malloc_getpagesize getpagesize() +# else +# ifndef 
LACKS_SYS_PARAM_H +# include <sys/param.h> /* presumably the source of the page-size macros tested below */ +# endif +# ifdef EXEC_PAGESIZE +# define malloc_getpagesize EXEC_PAGESIZE +# else +# ifdef NBPG +# ifndef CLSIZE +# define malloc_getpagesize NBPG +# else +# define malloc_getpagesize (NBPG * CLSIZE) +# endif +# else +# ifdef NBPC +# define malloc_getpagesize NBPC +# else +# ifdef PAGESIZE +# define malloc_getpagesize PAGESIZE +# else /* just guess */ +# define malloc_getpagesize ((size_t)4096U) +# endif +# endif +# endif +# endif +# endif +# endif +# endif +#endif +#endif + + + +/* ------------------- size_t and alignment properties -------------------- */ + +/* The byte and bit size of a size_t */ +#define SIZE_T_SIZE (sizeof(size_t)) +#define SIZE_T_BITSIZE (sizeof(size_t) << 3) + +/* Some constants coerced to size_t */ +/* Annoying but necessary to avoid errors on some platforms */ +#define SIZE_T_ZERO ((size_t)0) +#define SIZE_T_ONE ((size_t)1) +#define SIZE_T_TWO ((size_t)2) +#define SIZE_T_FOUR ((size_t)4) +#define TWO_SIZE_T_SIZES (SIZE_T_SIZE<<1) +#define FOUR_SIZE_T_SIZES (SIZE_T_SIZE<<2) +#define SIX_SIZE_T_SIZES (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES) +#define HALF_MAX_SIZE_T (MAX_SIZE_T / 2U) + +/* The bit mask value corresponding to MALLOC_ALIGNMENT */ +#define CHUNK_ALIGN_MASK (MALLOC_ALIGNMENT - SIZE_T_ONE) + +/* True if address a has acceptable alignment */ +#define is_aligned(A) (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0) + +/* the number of bytes to offset an address to align it */ +#define align_offset(A)\ + ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\ + ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK)) + +/* -------------------------- MMAP preliminaries ------------------------- */ + +/* + If HAVE_MORECORE or DL_HAVE_MMAP are false, we just define calls and + checks to fail so compiler optimizer can delete code rather than + using so many "#if"s. 
+*/ + + +/* MORECORE and MMAP must return MFAIL on failure */ +#define MFAIL ((void*)(MAX_SIZE_T)) +#define CMFAIL ((char*)(MFAIL)) /* defined for convenience */ + +#if !DL_HAVE_MMAP +#define IS_MMAPPED_BIT (SIZE_T_ZERO) +#define USE_MMAP_BIT (SIZE_T_ZERO) +#define CALL_MMAP(s) MFAIL +#define CALL_MUNMAP(a, s) (-1) +#define DIRECT_MMAP(s) MFAIL + +#else /* DL_HAVE_MMAP */ +#define IS_MMAPPED_BIT (SIZE_T_ONE) +#define USE_MMAP_BIT (SIZE_T_ONE) + +#ifndef WIN32 +#define CALL_MUNMAP(a, s) munmap((a), (s)) +#define MMAP_PROT (PROT_READ|PROT_WRITE) +#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) +#define MAP_ANONYMOUS MAP_ANON +#endif /* MAP_ANON */ +#ifdef MAP_ANONYMOUS +#define MMAP_FLAGS (MAP_PRIVATE|MAP_ANONYMOUS) +#define CALL_MMAP(s) mmap(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0) +#else /* MAP_ANONYMOUS */ +/* + Nearly all versions of mmap support MAP_ANONYMOUS, so the following + is unlikely to be needed, but is supplied just in case. +*/ +#define MMAP_FLAGS (MAP_PRIVATE) +static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */ +#define CALL_MMAP(s) ((dev_zero_fd < 0) ? \ + (dev_zero_fd = open("/dev/zero", O_RDWR), \ + mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \ + mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) +#endif /* MAP_ANONYMOUS */ + +#define DIRECT_MMAP(s) CALL_MMAP(s) +#else /* WIN32 */ + +/* Win32 MMAP via VirtualAlloc */ +static FORCEINLINE void* win32mmap(size_t size) { + void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); + return (ptr != 0)? ptr: MFAIL; +} + +/* For direct MMAP, use MEM_TOP_DOWN to minimize interference */ +static FORCEINLINE void* win32direct_mmap(size_t size) { + void* ptr = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN, + PAGE_READWRITE); + return (ptr != 0)? 
ptr: MFAIL; +} + +/* + This function supports releasing coalesced segments: it walks the + range one region at a time (as reported by VirtualQuery) and releases + each region with VirtualFree. Returns 0 once the whole range has been + released, or -1 if a query or release fails or a region is not a + committed allocation that starts at the current address and fits in + the remaining size. +*/ +static FORCEINLINE int win32munmap(void* ptr, size_t size) { + MEMORY_BASIC_INFORMATION minfo; + char* cptr = (char*)ptr; + while (size) { + if (VirtualQuery(cptr, &minfo, sizeof(minfo)) == 0) + return -1; + if (minfo.BaseAddress != cptr || minfo.AllocationBase != cptr || + minfo.State != MEM_COMMIT || minfo.RegionSize > size) + return -1; + if (VirtualFree(cptr, 0, MEM_RELEASE) == 0) + return -1; + cptr += minfo.RegionSize; + size -= minfo.RegionSize; + } + return 0; +} + +#define CALL_MMAP(s) win32mmap(s) +#define CALL_MUNMAP(a, s) win32munmap((a), (s)) +#define DIRECT_MMAP(s) win32direct_mmap(s) +#endif /* WIN32 */ +#endif /* DL_HAVE_MMAP */ + +#if DL_HAVE_MMAP && DL_HAVE_MREMAP +#define CALL_MREMAP(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv)) +#else /* DL_HAVE_MMAP && DL_HAVE_MREMAP */ +#define CALL_MREMAP(addr, osz, nsz, mv) ((void)(addr),(void)(osz), \ + (void)(nsz), (void)(mv),MFAIL) +#endif /* DL_HAVE_MMAP && DL_HAVE_MREMAP */ + +#if HAVE_MORECORE +#define CALL_MORECORE(S) MORECORE(S) +#else /* HAVE_MORECORE */ +#define CALL_MORECORE(S) MFAIL +#endif /* HAVE_MORECORE */ + +/* mstate bit set if contiguous morecore disabled or failed */ +#define USE_NONCONTIGUOUS_BIT (4U) + +/* segment bit set in create_mspace_with_base */ +#define EXTERN_BIT (8U) + + +/* --------------------------- Lock preliminaries ------------------------ */ + +/* + When locks are defined, there are up to two global locks: + + * If HAVE_MORECORE, morecore_mutex protects sequences of calls to + MORECORE. In many cases sys_alloc requires two calls, that should + not be interleaved with calls by other threads. This does not + protect against direct calls to MORECORE by other threads not + using this lock, so there is still code to cope the best we can on + interference. + + * magic_init_mutex ensures that mparams.magic and other + unique mparams values are initialized only once. 
+ + To enable use in layered extensions, locks are reentrant. + + Because lock-protected regions generally have bounded times, we use + the supplied simple spinlocks in the custom versions for x86. + + If USE_LOCKS is > 1, the definitions of lock routines here are + bypassed, in which case you will need to define at least + INITIAL_LOCK, ACQUIRE_LOCK, RELEASE_LOCK, and + NULL_LOCK_INITIALIZER, and possibly TRY_LOCK and IS_LOCKED + (The latter two are not used in this malloc, but are + commonly needed in extensions.) +*/ + +#if USE_LOCKS == 1 + +#if USE_SPIN_LOCKS +#ifndef WIN32 +/* Custom pthread-style spin locks on x86 and x64 for gcc */ +struct pthread_mlock_t +{ + volatile pthread_t threadid; + volatile unsigned int c; + volatile unsigned int l; +}; +#define MLOCK_T struct pthread_mlock_t +#define CURRENT_THREAD pthread_self() +#define SPINS_PER_YIELD 63 +static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) { + if(CURRENT_THREAD==sl->threadid) + ++sl->c; + else { + int spins = 0; + for (;;) { + int ret; + __asm__ __volatile__ ("lock cmpxchgl %2,(%1)" : "=a" (ret) : "r" (&sl->l), "r" (1), "a" (0)); + if(!ret) { + dl_assert(!sl->threadid); + sl->threadid=CURRENT_THREAD; + sl->c=1; + break; + } + if ((++spins & SPINS_PER_YIELD) == 0) { +#if defined (__SVR4) && defined (__sun) /* solaris */ + thr_yield(); +#else +#ifdef linux + sched_yield(); +#else /* no-op yield on unknown systems */ + ; +#endif /* linux */ +#endif /* solaris */ + } + } + } + + return 0; +} + +static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) { + int ret; + dl_assert(CURRENT_THREAD==sl->threadid); + if (!--sl->c) { + sl->threadid=0; + __asm__ __volatile__ ("xchgl %2,(%1)" : "=r" (ret) : "r" (&sl->l), "0" (0)); + } +} + +static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) { + int ret; + __asm__ __volatile__ ("lock cmpxchgl %2,(%1)" : "=a" (ret) : "r" (&sl->l), "r" (1), "a" (0)); + if(!ret){ + dl_assert(!sl->threadid); + sl->threadid=CURRENT_THREAD; + sl->c=1; + return 1; + } + 
return 0; +} + +#define INITIAL_LOCK(sl) (memset((sl), 0, sizeof(MLOCK_T)), 0) +#define ACQUIRE_LOCK(sl) pthread_acquire_lock(sl) +#define RELEASE_LOCK(sl) pthread_release_lock(sl) +#define TRY_LOCK(sl) pthread_try_lock(sl) +#define IS_LOCKED(sl) ((sl)->l) + +static MLOCK_T magic_init_mutex = {0, 0, 0 }; +#if HAVE_MORECORE +static MLOCK_T morecore_mutex = {0, 0, 0 }; +#endif /* HAVE_MORECORE */ + +#else /* WIN32 */ +/* Custom win32-style spin locks on x86 and x64 for MSC */ +struct win32_mlock_t +{ + volatile long threadid; + volatile unsigned int c; + long l; +}; +#define MLOCK_T struct win32_mlock_t +#define CURRENT_THREAD GetCurrentThreadId() +#define SPINS_PER_YIELD 63 +static FORCEINLINE int win32_acquire_lock (MLOCK_T *sl) { + long mythreadid=CURRENT_THREAD; + if(mythreadid==sl->threadid) + ++sl->c; + else { + int spins = 0; + for (;;) { + if (!interlockedexchange(&sl->l, 1)) { + dl_assert(!sl->threadid); + sl->threadid=mythreadid; + sl->c=1; + break; + } + if ((++spins & SPINS_PER_YIELD) == 0) + SleepEx(0, FALSE); + } + } + return 0; +} + +static FORCEINLINE void win32_release_lock (MLOCK_T *sl) { + dl_assert(CURRENT_THREAD==sl->threadid); + if (!--sl->c) { + sl->threadid=0; + interlockedexchange (&sl->l, 0); + } +} + +static FORCEINLINE int win32_try_lock (MLOCK_T *sl) { + if (!interlockedexchange(&sl->l, 1)){ + dl_assert(!sl->threadid); + sl->threadid=CURRENT_THREAD; + sl->c=1; + return 1; + } + return 0; +} + +#define INITIAL_LOCK(sl) (memset(sl, 0, sizeof(MLOCK_T)), 0) +#define ACQUIRE_LOCK(sl) win32_acquire_lock(sl) +#define RELEASE_LOCK(sl) win32_release_lock(sl) +#define TRY_LOCK(sl) win32_try_lock(sl) +#define IS_LOCKED(sl) ((sl)->l) + +static MLOCK_T magic_init_mutex = {0, 0 }; +#if HAVE_MORECORE +static MLOCK_T morecore_mutex = {0, 0 }; +#endif /* HAVE_MORECORE */ + +#endif /* WIN32 */ +#else /* USE_SPIN_LOCKS */ + +#ifndef WIN32 +/* pthreads-based locks */ +struct pthread_mlock_t +{ + volatile unsigned int c; + pthread_mutex_t l; +}; +#define 
MLOCK_T struct pthread_mlock_t +#define CURRENT_THREAD pthread_self() +/* Block until sl is acquired, then bump its hold count. Returns 0 on success, 1 if pthread_mutex_lock failed. */ +static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) { + if(!pthread_mutex_lock(&(sl)->l)){ + sl->c++; + return 0; + } + return 1; +} + +/* Drop one hold on sl: decrement the hold count, then unlock the mutex. */ +static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) { + --sl->c; + pthread_mutex_unlock(&(sl)->l); +} + +/* Non-blocking acquire. Returns 1 if the lock was taken (hold count bumped), 0 otherwise. */ +static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) { + if(!pthread_mutex_trylock(&(sl)->l)){ + sl->c++; + return 1; + } + return 0; +} + +/* Initialize sl as a recursive mutex with a zero hold count. Returns 0 on success, 1 on failure. Once the attribute object has been initialized it is destroyed on every path. */ +static FORCEINLINE int pthread_init_lock (MLOCK_T *sl) { + pthread_mutexattr_t attr; + int failed; + sl->c=0; + if(pthread_mutexattr_init(&attr)) return 1; + failed = (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) || + pthread_mutex_init(&sl->l, &attr)); + pthread_mutexattr_destroy(&attr); /* release attr even when setup failed */ + return failed; +} + +/* Report whether sl is currently held, without changing its state. Returns 1 if the mutex cannot be taken right now, or if it could be taken but the hold count shows it was already held; returns 0 if it is free. Uses pthread_mutex_trylock directly so the hold count is never touched, and only unlocks a mutex this call actually locked. */ +static FORCEINLINE int pthread_islocked (MLOCK_T *sl) { + int held; + if(pthread_mutex_trylock(&(sl)->l)) + return 1; /* could not take it: treat as held */ + held = (sl->c != 0); /* safe to read: we own the mutex here */ + pthread_mutex_unlock(&(sl)->l); + return held; +} + +#define INITIAL_LOCK(sl) pthread_init_lock(sl) +#define ACQUIRE_LOCK(sl) pthread_acquire_lock(sl) +#define RELEASE_LOCK(sl) pthread_release_lock(sl) +#define TRY_LOCK(sl) pthread_try_lock(sl) +#define IS_LOCKED(sl) pthread_islocked(sl) + +static MLOCK_T magic_init_mutex = {0, PTHREAD_MUTEX_INITIALIZER }; +#if HAVE_MORECORE +static MLOCK_T morecore_mutex = {0, PTHREAD_MUTEX_INITIALIZER }; +#endif /* HAVE_MORECORE */ + +#else /* WIN32 */ +/* Win32 critical sections */ +/* INITIAL_LOCK and ACQUIRE_LOCK evaluate to 0 on success, like the pthread versions above. */ +#define MLOCK_T CRITICAL_SECTION +#define CURRENT_THREAD GetCurrentThreadId() +#define INITIAL_LOCK(s) (!InitializeCriticalSectionAndSpinCount((s), 4000)) +#define ACQUIRE_LOCK(s) ( (!((s))->DebugInfo ? 
INITIAL_LOCK((s)) : 0), EnterCriticalSection((s)), 0) +#define RELEASE_LOCK(s) ( LeaveCriticalSection((s)), 0 ) +#define TRY_LOCK(s) ( TryEnterCriticalSection((s)) ) +#define IS_LOCKED(s) ( (s)->LockCount >= 0 ) +#define NULL_LOCK_INITIALIZER +static MLOCK_T magic_init_mutex; +#if HAVE_MORECORE +static MLOCK_T morecore_mutex; +#endif /* HAVE_MORECORE */ +#endif /* WIN32 */ +#endif /* USE_SPIN_LOCKS */ +#endif /* USE_LOCKS == 1 */ + +/* ----------------------- User-defined locks ------------------------ */ + +#if USE_LOCKS > 1 +/* Define your own lock implementation here */ +/* #define INITIAL_LOCK(sl) ... */ +/* #define ACQUIRE_LOCK(sl) ... */ +/* #define RELEASE_LOCK(sl) ... */ +/* #define TRY_LOCK(sl) ... */ +/* #define IS_LOCKED(sl) ... */ +/* #define NULL_LOCK_INITIALIZER ... */ + +static MLOCK_T magic_init_mutex = NULL_LOCK_INITIALIZER; +#if HAVE_MORECORE +static MLOCK_T morecore_mutex = NULL_LOCK_INITIALIZER; +#endif /* HAVE_MORECORE */ +#endif /* USE_LOCKS > 1 */ + +/* ----------------------- Lock-based state ------------------------ */ + + +#if USE_LOCKS +#define USE_LOCK_BIT (2U) +#else /* USE_LOCKS */ +#define USE_LOCK_BIT (0U) +#define INITIAL_LOCK(l) +#endif /* USE_LOCKS */ + +#if USE_LOCKS && HAVE_MORECORE +#define ACQUIRE_MORECORE_LOCK() ACQUIRE_LOCK(&morecore_mutex); +#define RELEASE_MORECORE_LOCK() RELEASE_LOCK(&morecore_mutex); +#else /* USE_LOCKS && HAVE_MORECORE */ +#define ACQUIRE_MORECORE_LOCK() +#define RELEASE_MORECORE_LOCK() +#endif /* USE_LOCKS && HAVE_MORECORE */ + +#if USE_LOCKS +#define ACQUIRE_MAGIC_INIT_LOCK() ACQUIRE_LOCK(&magic_init_mutex); +#define RELEASE_MAGIC_INIT_LOCK() RELEASE_LOCK(&magic_init_mutex); +#else /* USE_LOCKS */ +#define ACQUIRE_MAGIC_INIT_LOCK() +#define RELEASE_MAGIC_INIT_LOCK() +#endif /* USE_LOCKS */ + + +/* ----------------------- Chunk representations ------------------------ */ + +/* + (The following includes lightly edited explanations by Colin Plumb.) 
+ + The malloc_chunk declaration below is misleading (but accurate and + necessary). It declares a "view" into memory allowing access to + necessary fields at known offsets from a given base. + + Chunks of memory are maintained using a `boundary tag' method as + originally described by Knuth. (See the paper by Paul Wilson + ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such + techniques.) Sizes of free chunks are stored both in the front of + each chunk and at the end. This makes consolidating fragmented + chunks into bigger chunks fast. The head fields also hold bits + representing whether chunks are free or in use. + + Here are some pictures to make it clearer. They are "exploded" to + show that the state of a chunk can be thought of as extending from + the high 31 bits of the head field of its header through the + prev_foot and PINUSE_BIT bit of the following chunk header. + + A chunk that's in use looks like: + + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of previous chunk (if P = 1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P| + | Size of this chunk 1| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + +- -+ + | | + +- -+ + | : + +- size - sizeof(size_t) available payload bytes -+ + : | + chunk-> +- -+ + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1| + | Size of next chunk (may or may not be in use) | +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + And if it's free, it looks like this: + + chunk-> +- -+ + | User payload (must be in use, or we would have merged!) 
| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P| + | Size of this chunk 0| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next pointer | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Prev pointer | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | : + +- size - sizeof(struct chunk) unused bytes -+ + : | + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of this chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0| + | Size of next chunk (must be in use, or we would have merged)| +-+ + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | : + +- User payload -+ + : | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |0| + +-+ + Note that since we always merge adjacent free chunks, the chunks + adjacent to a free chunk must be in use. + + Given a pointer to a chunk (which can be derived trivially from the + payload pointer) we can, in O(1) time, find out whether the adjacent + chunks are free, and if so, unlink them from the lists that they + are on and merge them with the current chunk. + + Chunks always begin on even word boundaries, so the mem portion + (which is returned to the user) is also on an even word boundary, and + thus at least double-word aligned. + + The P (PINUSE_BIT) bit, stored in the unused low-order bit of the + chunk size (which is always a multiple of two words), is an in-use + bit for the *previous* chunk. If that bit is *clear*, then the + word before the current chunk size contains the previous chunk + size, and can be used to find the front of the previous chunk. + The very first chunk allocated always has this bit set, preventing + access to non-existent (or non-owned) memory. 
If pinuse is set for + any given chunk, then you CANNOT determine the size of the + previous chunk, and might even get a memory addressing fault when + trying to do so. + + The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of + the chunk size, redundantly records whether the current chunk is + inuse. This redundancy enables usage checks within free and realloc, + and reduces indirection when freeing and consolidating chunks. + + Each freshly allocated chunk must have both cinuse and pinuse set. + That is, each allocated chunk borders either a previously allocated + and still in-use chunk, or the base of its memory arena. This is + ensured by making all allocations from the `lowest' part of any + found chunk. Further, no free chunk physically borders another one, + so each free chunk is known to be preceded and followed by either + inuse chunks or the ends of memory. + + Note that the `foot' of the current chunk is actually represented + as the prev_foot of the NEXT chunk. This makes it easier to + deal with alignments etc but can be very confusing when trying + to extend or adapt this code. + + The exceptions to all this are + + 1. The special chunk `top' is the top-most available chunk (i.e., + the one bordering the end of available memory). It is treated + specially. Top is never included in any bin, is used only if + no other chunk is available, and is released back to the + system if it is very large (see M_TRIM_THRESHOLD). In effect, + the top chunk is treated as larger (and thus less well + fitting) than any other available chunk. The top chunk + doesn't update its trailing size field since there is no next + contiguous chunk that would have to index off it. However, + space is still allocated for it (TOP_FOOT_SIZE) to enable + separation or merging when space is extended. + + 2. Chunks allocated via mmap, which have the lowest-order bit + (IS_MMAPPED_BIT) set in their prev_foot fields, and do not set + PINUSE_BIT in their head fields. 
Because they are allocated + one-by-one, each must carry its own prev_foot field, which is + also used to hold the offset this chunk has within its mmapped + region, which is needed to preserve alignment. Each mmapped + chunk is trailed by the first two fields of a fake next-chunk + for sake of usage checks. + +*/ + +struct malloc_chunk { + size_t prev_foot; /* Size of previous chunk (if free). */ + size_t head; /* Size and inuse bits. */ + struct malloc_chunk* fd; /* double links -- used only if free. */ + struct malloc_chunk* bk; +}; + +typedef struct malloc_chunk mchunk; +typedef struct malloc_chunk* mchunkptr; +typedef struct malloc_chunk* sbinptr; /* The type of bins of chunks */ +typedef unsigned int bindex_t; /* Described below */ +typedef unsigned int binmap_t; /* Described below */ +typedef unsigned int flag_t; /* The type of various bit flag sets */ + +/* ------------------- Chunks sizes and alignments ----------------------- */ + +#define MCHUNK_SIZE (sizeof(mchunk)) + +#if FOOTERS +#define CHUNK_OVERHEAD (TWO_SIZE_T_SIZES) +#else /* FOOTERS */ +#define CHUNK_OVERHEAD (SIZE_T_SIZE) +#endif /* FOOTERS */ + +/* MMapped chunks need a second word of overhead ... */ +#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES) +/* ... and additional padding for fake next-chunk at foot */ +#define MMAP_FOOT_PAD (FOUR_SIZE_T_SIZES) + +/* The smallest size we can malloc is an aligned minimal chunk */ +#define MIN_CHUNK_SIZE\ + ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK) + +/* conversion from malloc headers to user pointers, and back */ +#define chunk2mem(p) ((void*)((char*)(p) + TWO_SIZE_T_SIZES)) +#define mem2chunk(mem) ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES)) +/* chunk associated with aligned address A */ +#define align_as_chunk(A) (mchunkptr)((A) + align_offset(chunk2mem(A))) + +/* Bounds on request (not chunk) sizes. 
*/ +#define MAX_REQUEST ((-MIN_CHUNK_SIZE) << 2) +#define MIN_REQUEST (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE) + +/* pad request bytes into a usable size */ +#define pad_request(req) \ + (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK) + +/* pad request, checking for minimum (but not maximum) */ +#define request2size(req) \ + (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req)) + + +/* ------------------ Operations on head and foot fields ----------------- */ + +/* + The head field of a chunk is or'ed with PINUSE_BIT when previous + adjacent chunk in use, and or'ed with CINUSE_BIT if this chunk is in + use. If the chunk was obtained with mmap, the prev_foot field has + IS_MMAPPED_BIT set, otherwise holding the offset of the base of the + mmapped region to the base of the chunk. + + FLAG4_BIT is not used by this malloc, but might be useful in extensions. +*/ + +#define PINUSE_BIT (SIZE_T_ONE) +#define CINUSE_BIT (SIZE_T_TWO) +#define FLAG4_BIT (SIZE_T_FOUR) +#define INUSE_BITS (PINUSE_BIT|CINUSE_BIT) +#define FLAG_BITS (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT) + +/* Head value for fenceposts */ +#define FENCEPOST_HEAD (INUSE_BITS|SIZE_T_SIZE) + +/* extraction of fields from head words */ +#define cinuse(p) ((p)->head & CINUSE_BIT) +#define pinuse(p) ((p)->head & PINUSE_BIT) +#define chunksize(p) ((p)->head & ~(FLAG_BITS)) + +#define clear_pinuse(p) ((p)->head &= ~PINUSE_BIT) +#define clear_cinuse(p) ((p)->head &= ~CINUSE_BIT) + +/* Treat space at ptr +/- offset as a chunk */ +#define chunk_plus_offset(p, s) ((mchunkptr)(((char*)(p)) + (s))) +#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s))) + +/* Ptr to next or previous physical malloc_chunk. 
*/ +#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS))) +#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) )) + +/* extract next chunk's pinuse bit */ +#define next_pinuse(p) ((next_chunk(p)->head) & PINUSE_BIT) + +/* Get/set size at footer */ +#define get_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_foot) +#define set_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s)) + +/* Set size, pinuse bit, and foot */ +#define set_size_and_pinuse_of_free_chunk(p, s)\ + ((p)->head = (s|PINUSE_BIT), set_foot(p, s)) + +/* Set size, pinuse bit, foot, and clear next pinuse */ +#define set_free_with_pinuse(p, s, n)\ + (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s)) + +#define is_mmapped(p)\ + (!((p)->head & PINUSE_BIT) && ((p)->prev_foot & IS_MMAPPED_BIT)) + +/* Get the internal overhead associated with chunk p */ +#define overhead_for(p)\ + (is_mmapped(p)? MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD) + +/* Return true if malloced space is not necessarily cleared */ +#if MMAP_CLEARS +#define calloc_must_clear(p) (!is_mmapped(p)) +#else /* MMAP_CLEARS */ +#define calloc_must_clear(p) (1) +#endif /* MMAP_CLEARS */ + +/* ---------------------- Overlaid data structures ----------------------- */ + +/* + When chunks are not in use, they are treated as nodes of either + lists or trees. + + "Small" chunks are stored in circular doubly-linked lists, and look + like this: + + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of previous chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + `head:' | Size of chunk, in bytes |P| + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Forward pointer to next chunk in list | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Back pointer to previous chunk in list | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Unused space (may be 0 bytes long) . + . . + . 
| +nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + `foot:' | Size of chunk, in bytes | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Larger chunks are kept in a form of bitwise digital trees (aka + tries) keyed on chunksizes. Because malloc_tree_chunks are only for + free chunks greater than 256 bytes, their size doesn't impose any + constraints on user chunk sizes. Each node looks like: + + chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Size of previous chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + `head:' | Size of chunk, in bytes |P| + mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Forward pointer to next chunk of same size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Back pointer to previous chunk of same size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Pointer to left child (child[0]) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Pointer to right child (child[1]) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Pointer to parent | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | bin index of this chunk | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Unused space . + . | +nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + `foot:' | Size of chunk, in bytes | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Each tree holding treenodes is a tree of unique chunk sizes. Chunks + of the same size are arranged in a circularly-linked list, with only + the oldest chunk (the next to be used, in our FIFO ordering) + actually in the tree. (Tree members are distinguished by a non-null + parent pointer.) 
If a chunk with the same size as an existing node + is inserted, it is linked off the existing node using pointers that + work in the same way as fd/bk pointers of small chunks. + + Each tree contains a power of 2 sized range of chunk sizes (the + smallest is 0x100 <= x < 0x180), which is divided in half at each + tree level, with the chunks in the smaller half of the range (0x100 + <= x < 0x140 for the top node) in the left subtree and the larger + half (0x140 <= x < 0x180) in the right subtree. This is, of course, + done by inspecting individual bits. + + Using these rules, each node's left subtree contains all smaller + sizes than its right subtree. However, the node at the root of each + subtree has no particular ordering relationship to either. (The + dividing line between the subtree sizes is based on trie relation.) + If we remove the last chunk of a given size from the interior of the + tree, we need to replace it with a leaf node. The tree ordering + rules permit a node to be replaced by any leaf below it. + + The smallest chunk in a tree (a common operation in a best-fit + allocator) can be found by walking a path to the leftmost leaf in + the tree. Unlike a usual binary tree, where we follow left child + pointers until we reach a null, here we follow the right child + pointer any time the left one is null, until we reach a leaf with + both child pointers null. The smallest chunk in the tree will be + somewhere along that path. + + The worst case number of steps to add, find, or remove a node is + bounded by the number of bits differentiating chunks within + bins. Under current bin calculations, this ranges from 6 up to 21 + (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case + is of course much better. 
+*/ + +struct malloc_tree_chunk { + /* The first four fields must be compatible with malloc_chunk */ + size_t prev_foot; + size_t head; + struct malloc_tree_chunk* fd; + struct malloc_tree_chunk* bk; + + struct malloc_tree_chunk* child[2]; + struct malloc_tree_chunk* parent; + bindex_t index; +}; + +typedef struct malloc_tree_chunk tchunk; +typedef struct malloc_tree_chunk* tchunkptr; +typedef struct malloc_tree_chunk* tbinptr; /* The type of bins of trees */ + +/* A little helper macro for trees */ +#define leftmost_child(t) ((t)->child[0] != 0? (t)->child[0] : (t)->child[1]) + +/* ----------------------------- Segments -------------------------------- */ + +/* + Each malloc space may include non-contiguous segments, held in a + list headed by an embedded malloc_segment record representing the + top-most space. Segments also include flags holding properties of + the space. Large chunks that are directly allocated by mmap are not + included in this list. They are instead independently created and + destroyed without otherwise keeping track of them. + + Segment management mainly comes into play for spaces allocated by + MMAP. Any call to MMAP might or might not return memory that is + adjacent to an existing segment. MORECORE normally contiguously + extends the current space, so this space is almost always adjacent, + which is simpler and faster to deal with. (This is why MORECORE is + used preferentially to MMAP when both are available -- see + sys_alloc.) When allocating using MMAP, we don't use any of the + hinting mechanisms (inconsistently) supported in various + implementations of unix mmap, or distinguish reserving from + committing memory. Instead, we just ask for space, and exploit + contiguity when we get it. It is probably possible to do + better than this on some systems, but no general scheme seems + to be significantly better. 
+ + Management entails a simpler variant of the consolidation scheme + used for chunks to reduce fragmentation -- new adjacent memory is + normally prepended or appended to an existing segment. However, + there are limitations compared to chunk consolidation that mostly + reflect the fact that segment processing is relatively infrequent + (occurring only when getting memory from system) and that we + don't expect to have huge numbers of segments: + + * Segments are not indexed, so traversal requires linear scans. (It + would be possible to index these, but is not worth the extra + overhead and complexity for most programs on most platforms.) + * New segments are only appended to old ones when holding top-most + memory; if they cannot be prepended to others, they are held in + different segments. + + Except for the top-most segment of an mstate, each segment record + is kept at the tail of its segment. Segments are added by pushing + segment records onto the list headed by &mstate.seg for the + containing mstate. + + Segment flags control allocation/merge/deallocation policies: + * If EXTERN_BIT set, then we did not allocate this segment, + and so should not try to deallocate or merge with others. + (This currently holds only for the initial segment passed + into create_mspace_with_base.) + * If IS_MMAPPED_BIT set, the segment may be merged with + other surrounding mmapped segments and trimmed/de-allocated + using munmap. + * If neither bit is set, then the segment was obtained using + MORECORE so can be merged with surrounding MORECORE'd segments + and deallocated/trimmed using MORECORE with negative arguments. 
+*/ + +struct malloc_segment { + char* base; /* base address */ + size_t size; /* allocated size */ + struct malloc_segment* next; /* ptr to next segment */ + flag_t sflags; /* mmap and extern flag */ +}; + +#define is_mmapped_segment(S) ((S)->sflags & IS_MMAPPED_BIT) +#define is_extern_segment(S) ((S)->sflags & EXTERN_BIT) + +typedef struct malloc_segment msegment; +typedef struct malloc_segment* msegmentptr; + +/* ---------------------------- malloc_state ----------------------------- */ + +/* + A malloc_state holds all of the bookkeeping for a space. + The main fields are: + + Top + The topmost chunk of the currently active segment. Its size is + cached in topsize. The actual size of topmost space is + topsize+TOP_FOOT_SIZE, which includes space reserved for adding + fenceposts and segment records if necessary when getting more + space from the system. The size at which to autotrim top is + cached from mparams in trim_check, except that it is disabled if + an autotrim fails. + + Designated victim (dv) + This is the preferred chunk for servicing small requests that + don't have exact fits. It is normally the chunk split off most + recently to service another small request. Its size is cached in + dvsize. The link fields of this chunk are not maintained since it + is not kept in a bin. + + SmallBins + An array of bin headers for free chunks. These bins hold chunks + with sizes less than MIN_LARGE_SIZE bytes. Each bin contains + chunks of all the same size, spaced 8 bytes apart. To simplify + use in double-linked lists, each bin header acts as a malloc_chunk + pointing to the real first node, if it exists (else pointing to + itself). This avoids special-casing for headers. But to avoid + waste, we allocate only the fd/bk pointers of bins, and then use + repositioning tricks to treat these as the fields of a chunk. + + TreeBins + Treebins are pointers to the roots of trees holding a range of + sizes. 
There are 2 equally spaced treebins for each power of two + from TREEBIN_SHIFT to TREEBIN_SHIFT+16. The last bin holds anything + larger. + + Bin maps + There is one bit map for small bins ("smallmap") and one for + treebins ("treemap"). Each bin sets its bit when non-empty, and + clears the bit when empty. Bit operations are then used to avoid + bin-by-bin searching -- nearly all "search" is done without ever + looking at bins that won't be selected. The bit maps + conservatively use 32 bits per map word, even on 64-bit systems. + For a good description of some of the bit-based techniques used + here, see Henry S. Warren Jr's book "Hacker's Delight" (and + supplement at http://hackersdelight.org/). Many of these are + intended to reduce the branchiness of paths through malloc etc, as + well as to reduce the number of memory locations read or written. + + Segments + A list of segments headed by an embedded malloc_segment record + representing the initial space. + + Address check support + The least_addr field is the least address ever obtained from + MORECORE or MMAP. Attempted frees and reallocs of any address less + than this are trapped (unless INSECURE is defined). + + Magic tag + A cross-check field that should always hold the same value as mparams.magic. + + Flags + Bits recording whether to use MMAP, locks, or contiguous MORECORE + + Statistics + Each space keeps track of current and maximum system memory + obtained via MORECORE or MMAP. + + Trim support + Fields holding the amount of unused topmost memory that should trigger + trimming, and a counter to force periodic scanning to release unused + non-topmost segments. + + Locking + If USE_LOCKS is defined, the "mutex" lock is acquired and released + around every public call using this mspace. + + Extension support + A void* pointer and a size_t field that can be used to help implement + extensions to this malloc. 
+*/ + +/* Bin types, widths and sizes */ +#define NSMALLBINS (32U) +#define NTREEBINS (32U) +#define SMALLBIN_SHIFT (3U) +#define SMALLBIN_WIDTH (SIZE_T_ONE << SMALLBIN_SHIFT) +#define TREEBIN_SHIFT (8U) +#define MIN_LARGE_SIZE (SIZE_T_ONE << TREEBIN_SHIFT) +#define MAX_SMALL_SIZE (MIN_LARGE_SIZE - SIZE_T_ONE) +#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD) + +struct malloc_state { + binmap_t smallmap; + binmap_t treemap; + size_t dvsize; + size_t topsize; + char* least_addr; + mchunkptr dv; + mchunkptr top; + size_t trim_check; + size_t release_checks; + size_t magic; + mchunkptr smallbins[(NSMALLBINS+1)*2]; + tbinptr treebins[NTREEBINS]; + size_t footprint; + size_t max_footprint; + flag_t mflags; +#if USE_LOCKS + MLOCK_T mutex; /* locate lock among fields that rarely change */ +#endif /* USE_LOCKS */ + msegment seg; + void* extp; /* Unused but available for extensions */ + size_t exts; +}; + +typedef struct malloc_state* mstate; + +/* ------------- Global malloc_state and malloc_params ------------------- */ + +/* + malloc_params holds global properties, including those that can be + dynamically set using mallopt. There is a single instance, mparams, + initialized in init_mparams. 
+*/ + +struct malloc_params { + size_t magic; + size_t page_size; + size_t granularity; + size_t mmap_threshold; + size_t trim_threshold; + flag_t default_mflags; +}; + +static struct malloc_params mparams; + +#if !ONLY_MSPACES + +/* The global malloc_state used for all non-"mspace" calls */ +static struct malloc_state _gm_; +#define gm (&_gm_) +#define is_global(M) ((M) == &_gm_) + +#endif /* !ONLY_MSPACES */ + +#define is_initialized(M) ((M)->top != 0) + +/* -------------------------- system alloc setup ------------------------- */ + +/* Operations on mflags */ + +#define use_lock(M) ((M)->mflags & USE_LOCK_BIT) +#define enable_lock(M) ((M)->mflags |= USE_LOCK_BIT) +#define disable_lock(M) ((M)->mflags &= ~USE_LOCK_BIT) + +#define use_mmap(M) ((M)->mflags & USE_MMAP_BIT) +#define enable_mmap(M) ((M)->mflags |= USE_MMAP_BIT) +#define disable_mmap(M) ((M)->mflags &= ~USE_MMAP_BIT) + +#define use_noncontiguous(M) ((M)->mflags & USE_NONCONTIGUOUS_BIT) +#define disable_contiguous(M) ((M)->mflags |= USE_NONCONTIGUOUS_BIT) + +#define set_lock(M,L)\ + ((M)->mflags = (L)?\ + ((M)->mflags | USE_LOCK_BIT) :\ + ((M)->mflags & ~USE_LOCK_BIT)) + +/* page-align a size */ +#define page_align(S)\ + (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE)) + +/* granularity-align a size */ +#define granularity_align(S)\ + (((S) + (mparams.granularity - SIZE_T_ONE))\ + & ~(mparams.granularity - SIZE_T_ONE)) + + +/* For mmap, use granularity alignment on windows, else page-align */ +#ifdef WIN32 +#define mmap_align(S) granularity_align(S) +#else +#define mmap_align(S) page_align(S) +#endif + +#define is_page_aligned(S)\ + (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0) +#define is_granularity_aligned(S)\ + (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0) + +/* True if segment S holds address A, i.e. base <= A < base + size. */ +/* S is parenthesized so any pointer expression may be passed; note that */ +/* both S and A are evaluated more than once. */ +#define segment_holds(S, A)\ + ((char*)(A) >= (S)->base && (char*)(A) < (S)->base + (S)->size) + +/* Return segment holding given address */ +static 
msegmentptr segment_holding(mstate m, char* addr) { + msegmentptr sp = &m->seg; + for (;;) { + if (addr >= sp->base && addr < sp->base + sp->size) + return sp; + if ((sp = sp->next) == 0) + return 0; + } +} + +/* Return true if segment contains a segment link */ +static int has_segment_link(mstate m, msegmentptr ss) { + msegmentptr sp = &m->seg; + for (;;) { + if ((char*)sp >= ss->base && (char*)sp < ss->base + ss->size) + return 1; + if ((sp = sp->next) == 0) + return 0; + } +} + +#ifndef MORECORE_CANNOT_TRIM +#define should_trim(M,s) ((s) > (M)->trim_check) +#else /* MORECORE_CANNOT_TRIM */ +#define should_trim(M,s) (0) +#endif /* MORECORE_CANNOT_TRIM */ + +/* + TOP_FOOT_SIZE is padding at the end of a segment, including space + that may be needed to place segment records and fenceposts when new + noncontiguous segments are added. +*/ +#define TOP_FOOT_SIZE\ + (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE) + + +/* ------------------------------- Hooks -------------------------------- */ + +/* + PREACTION should be defined to return 0 on success, and nonzero on + failure. If you are not using locking, you can redefine these to do + anything you like. +*/ + +#if USE_LOCKS + +/* Ensure locks are initialized */ +#define GLOBALLY_INITIALIZE() (mparams.page_size == 0 && init_mparams()) + +#define PREACTION(M) ((GLOBALLY_INITIALIZE() || use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0) +#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); } +#else /* USE_LOCKS */ + +#ifndef PREACTION +#define PREACTION(M) (0) +#endif /* PREACTION */ + +#ifndef POSTACTION +#define POSTACTION(M) +#endif /* POSTACTION */ + +#endif /* USE_LOCKS */ + +/* + CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses. + USAGE_ERROR_ACTION is triggered on detected bad frees and + reallocs. The argument p is an address that might have triggered the + fault. 
It is ignored by the two predefined actions, but might be + useful in custom actions that try to help diagnose errors. +*/ + +#if PROCEED_ON_ERROR + +/* A count of the number of corruption errors causing resets */ +int malloc_corruption_error_count; + +/* default corruption action */ +static void reset_on_error(mstate m); + +#define CORRUPTION_ERROR_ACTION(m) reset_on_error(m) +#define USAGE_ERROR_ACTION(m, p) + +#else /* PROCEED_ON_ERROR */ + +#ifndef CORRUPTION_ERROR_ACTION +#define CORRUPTION_ERROR_ACTION(m) ABORT +#endif /* CORRUPTION_ERROR_ACTION */ + +#ifndef USAGE_ERROR_ACTION +#define USAGE_ERROR_ACTION(m,p) ABORT +#endif /* USAGE_ERROR_ACTION */ + +#endif /* PROCEED_ON_ERROR */ + +/* -------------------------- Debugging setup ---------------------------- */ + +#if ! DL_DEBUG + +#define check_free_chunk(M,P) +#define check_inuse_chunk(M,P) +#define check_malloced_chunk(M,P,N) +#define check_mmapped_chunk(M,P) +#define check_malloc_state(M) +#define check_top_chunk(M,P) + +#else /* DL_DEBUG */ +#define check_free_chunk(M,P) do_check_free_chunk(M,P) +#define check_inuse_chunk(M,P) do_check_inuse_chunk(M,P) +#define check_top_chunk(M,P) do_check_top_chunk(M,P) +#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N) +#define check_mmapped_chunk(M,P) do_check_mmapped_chunk(M,P) +#define check_malloc_state(M) do_check_malloc_state(M) + +static void do_check_any_chunk(mstate m, mchunkptr p); +static void do_check_top_chunk(mstate m, mchunkptr p); +static void do_check_mmapped_chunk(mstate m, mchunkptr p); +static void do_check_inuse_chunk(mstate m, mchunkptr p); +static void do_check_free_chunk(mstate m, mchunkptr p); +static void do_check_malloced_chunk(mstate m, void* mem, size_t s); +static void do_check_tree(mstate m, tchunkptr t); +static void do_check_treebin(mstate m, bindex_t i); +static void do_check_smallbin(mstate m, bindex_t i); +static void do_check_malloc_state(mstate m); +static int bin_find(mstate m, mchunkptr x); +static size_t 
traverse_and_check(mstate m); +#endif /* DL_DEBUG */ + +/* ---------------------------- Indexing Bins ---------------------------- */ + +#define is_small(s) (((s) >> SMALLBIN_SHIFT) < NSMALLBINS) +#define small_index(s) ((s) >> SMALLBIN_SHIFT) +#define small_index2size(i) ((i) << SMALLBIN_SHIFT) +#define MIN_SMALL_INDEX (small_index(MIN_CHUNK_SIZE)) + +/* addressing by index. See above about smallbin repositioning */ +#define smallbin_at(M, i) ((sbinptr)((char*)&((M)->smallbins[(i)<<1]))) +#define treebin_at(M,i) (&((M)->treebins[i])) + +/* assign tree index for size S to variable I */ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +/* X is kept as size_t so that a 64-bit size is range-checked at full width; */ +/* it is narrowed to the 32-bit bsrl operand Y only once X <= 0xFFFF is known. */ +#define compute_tree_index(S, I)\ +{\ + size_t X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int Y = (unsigned int)X;\ + unsigned int K;\ + __asm__("bsrl\t%1, %0\n\t" : "=r" (K) : "g" (Y));\ + I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ + }\ +} + +#elif defined(_MSC_VER) && _MSC_VER>=1300 +#define compute_tree_index(S, I)\ +{\ + size_t X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int K;\ + _BitScanReverse((DWORD *) &K, X);\ + I = (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\ + }\ +} +#else /* GNUC */ +#define compute_tree_index(S, I)\ +{\ + size_t X = S >> TREEBIN_SHIFT;\ + if (X == 0)\ + I = 0;\ + else if (X > 0xFFFF)\ + I = NTREEBINS-1;\ + else {\ + unsigned int Y = (unsigned int)X;\ + unsigned int N = ((Y - 0x100) >> 16) & 8;\ + unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\ + N += K;\ + N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\ + K = 14 - N + ((Y <<= K) >> 15);\ + I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\ + }\ +} +#endif /* GNUC */ + +/* Bit representing maximum resolved size in a treebin at i */ +#define bit_for_tree_index(i) \ + (i == NTREEBINS-1)? 
(SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2) + +/* Shift placing maximum resolved bit in a treebin at i as sign bit */ +#define leftshift_for_tree_index(i) \ + ((i == NTREEBINS-1)? 0 : \ + ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2))) + +/* The size of the smallest chunk held in bin with index i */ +#define minsize_for_tree_index(i) \ + ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) | \ + (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1))) + + +/* ------------------------ Operations on bin maps ----------------------- */ + +/* bit corresponding to given index */ +#define idx2bit(i) ((binmap_t)(1) << (i)) + +/* Mark/Clear bits with given index */ +#define mark_smallmap(M,i) ((M)->smallmap |= idx2bit(i)) +#define clear_smallmap(M,i) ((M)->smallmap &= ~idx2bit(i)) +#define smallmap_is_marked(M,i) ((M)->smallmap & idx2bit(i)) + +#define mark_treemap(M,i) ((M)->treemap |= idx2bit(i)) +#define clear_treemap(M,i) ((M)->treemap &= ~idx2bit(i)) +#define treemap_is_marked(M,i) ((M)->treemap & idx2bit(i)) + +/* index corresponding to given bit */ + +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define compute_bit2idx(X, I)\ +{\ + unsigned int J;\ + __asm__("bsfl\t%1, %0\n\t" : "=r" (J) : "g" (X));\ + I = (bindex_t)J;\ +} +#elif defined(_MSC_VER) && _MSC_VER>=1300 +#define compute_bit2idx(X, I)\ +{\ + unsigned int J;\ + _BitScanForward((DWORD *) &J, X);\ + I = (bindex_t)J;\ +} + +#else /* GNUC */ +#if USE_BUILTIN_FFS +#define compute_bit2idx(X, I) I = ffs(X)-1 + +#else /* USE_BUILTIN_FFS */ +#define compute_bit2idx(X, I)\ +{\ + unsigned int Y = X - 1;\ + unsigned int K = Y >> (16-4) & 16;\ + unsigned int N = K; Y >>= K;\ + N += K = Y >> (8-3) & 8; Y >>= K;\ + N += K = Y >> (4-2) & 4; Y >>= K;\ + N += K = Y >> (2-1) & 2; Y >>= K;\ + N += K = Y >> (1-0) & 1; Y >>= K;\ + I = (bindex_t)(N + Y);\ +} +#endif /* USE_BUILTIN_FFS */ +#endif /* GNUC */ + +/* isolate the least set bit of a bitmap */ +#define least_bit(x) 
((x) & -(x)) + +/* mask with all bits to left of least bit of x on */ +/* (x is parenthesized so compound expressions expand correctly) */ +#define left_bits(x) (((x)<<1) | -((x)<<1)) + +/* mask with all bits to left of or equal to least bit of x on */ +#define same_or_left_bits(x) ((x) | -(x)) + + +/* ----------------------- Runtime Check Support ------------------------- */ + +/* + For security, the main invariant is that malloc/free/etc never + writes to a static address other than malloc_state, unless static + malloc_state itself has been corrupted, which cannot occur via + malloc (because of these checks). In essence this means that we + believe all pointers, sizes, maps etc held in malloc_state, but + check all of those linked or offsetted from other embedded data + structures. These checks are interspersed with main code in a way + that tends to minimize their run-time cost. + + When FOOTERS is defined, in addition to range checking, we also + verify footer fields of inuse chunks, which can be used to guarantee + that the mstate controlling malloc/free is intact. This is a + streamlined version of the approach described by William Robertson + et al in "Run-time Detection of Heap-based Overflows" LISA'03 + http://www.usenix.org/events/lisa03/tech/robertson.html The footer + of an inuse chunk holds the xor of its mstate and a random seed, + that is checked upon calls to free() and realloc(). This is + (probabilistically) unguessable from outside the program, but can be + computed by any code successfully malloc'ing any chunk, so does not + itself provide protection against code that has already broken + security through some other means. Unlike Robertson et al, we + always dynamically check addresses of all offset chunks (previous, + next, etc). This turns out to be cheaper than relying on hashes. 
+*/ + +#if !INSECURE +/* Check if address a is at least as high as any from MORECORE or MMAP */ +#define ok_address(M, a) ((char*)(a) >= (M)->least_addr) +/* Check if address of next chunk n is higher than base chunk p */ +#define ok_next(p, n) ((char*)(p) < (char*)(n)) +/* Check if p has its cinuse bit on */ +#define ok_cinuse(p) cinuse(p) +/* Check if p has its pinuse bit on */ +#define ok_pinuse(p) pinuse(p) + +#else /* !INSECURE */ +#define ok_address(M, a) (1) +#define ok_next(b, n) (1) +#define ok_cinuse(p) (1) +#define ok_pinuse(p) (1) +#endif /* !INSECURE */ + +#if (FOOTERS && !INSECURE) +/* Check if (alleged) mstate m has expected magic field */ +/* Modified by sasha: also check that address is within memheap */ +#define ok_magic(M) ((char *)(M) >= (char *)gm->least_addr && (M)->magic == mparams.magic) +#else /* (FOOTERS && !INSECURE) */ +#define ok_magic(M) (1) +#endif /* (FOOTERS && !INSECURE) */ + + +/* In gcc, use __builtin_expect to minimize impact of checks */ +#if !INSECURE +#if defined(__GNUC__) && __GNUC__ >= 3 +#define RTCHECK(e) __builtin_expect(e, 1) +#else /* GNUC */ +#define RTCHECK(e) (e) +#endif /* GNUC */ +#else /* !INSECURE */ +#define RTCHECK(e) (1) +#endif /* !INSECURE */ + +/* macros to set up inuse chunks with or without footers */ + +#if !FOOTERS + +#define mark_inuse_foot(M,p,s) + +/* Set cinuse bit and pinuse bit of next chunk */ +#define set_inuse(M,p,s)\ + ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\ + ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT) + +/* Set cinuse and pinuse of this chunk and pinuse of next chunk */ +#define set_inuse_and_pinuse(M,p,s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT) + +/* Set size, cinuse and pinuse bit of this chunk */ +#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT)) + +#else /* FOOTERS */ + +/* Set foot of inuse chunk to be xor of mstate and seed */ +#define 
mark_inuse_foot(M,p,s)\ + (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic)) + +#define get_mstate_for(p)\ + ((mstate)(((mchunkptr)((char*)(p) +\ + (chunksize(p))))->prev_foot ^ mparams.magic)) + +#define set_inuse(M,p,s)\ + ((p)->head = (((p)->head & PINUSE_BIT)|s|CINUSE_BIT),\ + (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \ + mark_inuse_foot(M,p,s)) + +#define set_inuse_and_pinuse(M,p,s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\ + mark_inuse_foot(M,p,s)) + +#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\ + ((p)->head = (s|PINUSE_BIT|CINUSE_BIT),\ + mark_inuse_foot(M, p, s)) + +#endif /* !FOOTERS */ + +/* ---------------------------- setting mparams -------------------------- */ + +/* Initialize mparams on first call (page_size == 0 means not yet set up). */ +/* Later calls skip the setup; the function always returns 0. */ +static int init_mparams(void) { + if (mparams.page_size == 0) { + size_t s; + + mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD; + mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD; +#if MORECORE_CONTIGUOUS + mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT; +#else /* MORECORE_CONTIGUOUS */ + mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT; +#endif /* MORECORE_CONTIGUOUS */ + +#if (FOOTERS && !INSECURE) + { +#if USE_DEV_RANDOM + int fd; + int have_random = 0; + unsigned char buf[sizeof(size_t)]; + /* Try to use /dev/urandom, else fall back on using time */ + if ((fd = open("/dev/urandom", O_RDONLY)) >= 0) { + if (read(fd, buf, sizeof(buf)) == sizeof(buf)) { + s = *((size_t *) buf); + have_random = 1; + } + /* close on every path so a failed or short read does not leak fd */ + close(fd); + } + if (!have_random) +#endif /* USE_DEV_RANDOM */ + s = (size_t)(time(0) ^ (size_t)0x55555555U); + + s |= (size_t)8U; /* ensure nonzero */ + s &= ~(size_t)7U; /* improve chances of fault for bad values */ + + } +#else /* (FOOTERS && !INSECURE) */ + s = (size_t)0x58585858U; +#endif /* (FOOTERS && !INSECURE) */ + ACQUIRE_MAGIC_INIT_LOCK(); + if (mparams.magic == 0) { + mparams.magic = s; +#if !ONLY_MSPACES + /* Set up lock for main malloc area */ + INITIAL_LOCK(&gm->mutex); + 
gm->mflags = mparams.default_mflags; +#endif + } + RELEASE_MAGIC_INIT_LOCK(); + +#ifndef WIN32 + mparams.page_size = malloc_getpagesize; + mparams.granularity = ((DEFAULT_GRANULARITY != 0)? + DEFAULT_GRANULARITY : mparams.page_size); +#else /* WIN32 */ + { + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + mparams.page_size = system_info.dwPageSize; + mparams.granularity = system_info.dwAllocationGranularity; + } +#endif /* WIN32 */ + + /* Sanity-check configuration: + size_t must be unsigned and as wide as pointer type. + ints must be at least 4 bytes. + alignment must be at least 8. + Alignment, min chunk size, and page size must all be powers of 2. + */ + if ((sizeof(size_t) != sizeof(char*)) || + (MAX_SIZE_T < MIN_CHUNK_SIZE) || + (sizeof(int) < 4) || + (MALLOC_ALIGNMENT < (size_t)8U) || + ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) || + ((MCHUNK_SIZE & (MCHUNK_SIZE-SIZE_T_ONE)) != 0) || + ((mparams.granularity & (mparams.granularity-SIZE_T_ONE)) != 0) || + ((mparams.page_size & (mparams.page_size-SIZE_T_ONE)) != 0)) + ABORT; + } + return 0; +} + +/* support for mallopt */ +static int change_mparam(int param_number, int value) { + size_t val = (size_t)value; + init_mparams(); + switch(param_number) { + case M_TRIM_THRESHOLD: + mparams.trim_threshold = val; + return 1; + case M_GRANULARITY: + if (val >= mparams.page_size && ((val & (val-1)) == 0)) { + mparams.granularity = val; + return 1; + } + else + return 0; + case M_MMAP_THRESHOLD: + mparams.mmap_threshold = val; + return 1; + default: + return 0; + } +} + +#if DL_DEBUG +/* ------------------------- Debugging Support --------------------------- */ + +/* Check properties of any chunk, whether free, inuse, mmapped etc */ +static void do_check_any_chunk(mstate m, mchunkptr p) { + dl_assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + dl_assert(ok_address(m, p)); +} + +/* Check properties of top chunk */ +static void do_check_top_chunk(mstate m, mchunkptr p) { + 
msegmentptr sp = segment_holding(m, (char*)p); + size_t sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! */ + dl_assert(sp != 0); + dl_assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + dl_assert(ok_address(m, p)); + dl_assert(sz == m->topsize); + dl_assert(sz > 0); + dl_assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE); + dl_assert(pinuse(p)); + dl_assert(!pinuse(chunk_plus_offset(p, sz))); +} + +/* Check properties of (inuse) mmapped chunks */ +static void do_check_mmapped_chunk(mstate m, mchunkptr p) { + size_t sz = chunksize(p); + size_t len = (sz + (p->prev_foot & ~IS_MMAPPED_BIT) + MMAP_FOOT_PAD); + dl_assert(is_mmapped(p)); + dl_assert(use_mmap(m)); + dl_assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); + dl_assert(ok_address(m, p)); + dl_assert(!is_small(sz)); + dl_assert((len & (mparams.page_size-SIZE_T_ONE)) == 0); + dl_assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD); + dl_assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0); +} + +/* Check properties of inuse chunks */ +static void do_check_inuse_chunk(mstate m, mchunkptr p) { + do_check_any_chunk(m, p); + dl_assert(cinuse(p)); + dl_assert(next_pinuse(p)); + /* If not pinuse and not mmapped, previous chunk has OK offset */ + dl_assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p); + if (is_mmapped(p)) + do_check_mmapped_chunk(m, p); +} + +/* Check properties of free chunks: not in use, not mmapped, and (unless */ +/* the chunk is dv or top) correctly sized, footed and linked into its bin. */ +static void do_check_free_chunk(mstate m, mchunkptr p) { + size_t sz = chunksize(p); + mchunkptr next = chunk_plus_offset(p, sz); + do_check_any_chunk(m, p); + dl_assert(!cinuse(p)); + dl_assert(!next_pinuse(p)); + dl_assert(!is_mmapped(p)); + if (p != m->dv && p != m->top) { + if (sz >= MIN_CHUNK_SIZE) { + dl_assert((sz & CHUNK_ALIGN_MASK) == 0); + dl_assert(is_aligned(chunk2mem(p))); + dl_assert(next->prev_foot == sz); + dl_assert(pinuse(p)); + dl_assert(next == m->top || cinuse(next)); + dl_assert(p->fd->bk == p); + dl_assert(p->bk->fd == p); + 
} + else /* markers are always of size SIZE_T_SIZE */ + dl_assert(sz == SIZE_T_SIZE); + } +} + +/* Check properties of malloced chunks at the point they are malloced */ +static void do_check_malloced_chunk(mstate m, void* mem, size_t s) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); + size_t sz = p->head & ~(PINUSE_BIT|CINUSE_BIT); + do_check_inuse_chunk(m, p); + dl_assert((sz & CHUNK_ALIGN_MASK) == 0); + dl_assert(sz >= MIN_CHUNK_SIZE); + dl_assert(sz >= s); + /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */ + dl_assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE)); + } +} + +/* Check a tree and its subtrees: walks the ring of same-sized chunks at t */ +/* and recurses into the children of the one ring member that has a parent. */ +static void do_check_tree(mstate m, tchunkptr t) { + tchunkptr head = 0; + tchunkptr u = t; + bindex_t tindex = t->index; + size_t tsize = chunksize(t); + bindex_t idx; + compute_tree_index(tsize, idx); + dl_assert(tindex == idx); + dl_assert(tsize >= MIN_LARGE_SIZE); + dl_assert(tsize >= minsize_for_tree_index(idx)); + dl_assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1)))); + + do { /* traverse through chain of same-sized nodes */ + do_check_any_chunk(m, ((mchunkptr)u)); + dl_assert(u->index == tindex); + dl_assert(chunksize(u) == tsize); + dl_assert(!cinuse(u)); + dl_assert(!next_pinuse(u)); + dl_assert(u->fd->bk == u); + dl_assert(u->bk->fd == u); + if (u->parent == 0) { + dl_assert(u->child[0] == 0); + dl_assert(u->child[1] == 0); + } + else { + dl_assert(head == 0); /* only one node on chain has parent */ + head = u; + dl_assert(u->parent != u); + /* parent is either a tree node pointing at u or the treebin slot itself */ + dl_assert(u->parent->child[0] == u || + u->parent->child[1] == u || + *((tbinptr*)(u->parent)) == u); + if (u->child[0] != 0) { + dl_assert(u->child[0]->parent == u); + dl_assert(u->child[0] != u); + do_check_tree(m, u->child[0]); + } + if (u->child[1] != 0) { + dl_assert(u->child[1]->parent == u); + dl_assert(u->child[1] != u); + do_check_tree(m, u->child[1]); + } + if (u->child[0] != 0 && u->child[1] != 0) { + dl_assert(chunksize(u->child[0]) < 
chunksize(u->child[1])); + } + } + u = u->fd; + } while (u != t); + dl_assert(head != 0); +} + +/* Check all the chunks in a treebin. */ +static void do_check_treebin(mstate m, bindex_t i) { + tbinptr* tb = treebin_at(m, i); + tchunkptr t = *tb; + int empty = (m->treemap & (1U << i)) == 0; + if (t == 0) + dl_assert(empty); + if (!empty) + do_check_tree(m, t); +} + +/* Check all the chunks in a smallbin. */ +static void do_check_smallbin(mstate m, bindex_t i) { + sbinptr b = smallbin_at(m, i); + mchunkptr p = b->bk; + unsigned int empty = (m->smallmap & (1U << i)) == 0; + if (p == b) + dl_assert(empty); + if (!empty) { + for (; p != b; p = p->bk) { + size_t size = chunksize(p); + mchunkptr q; + /* each chunk claims to be free */ + do_check_free_chunk(m, p); + /* chunk belongs in bin */ + dl_assert(small_index(size) == i); + dl_assert(p->bk == b || chunksize(p->bk) == chunksize(p)); + /* chunk is followed by an inuse chunk */ + q = next_chunk(p); + if (q->head != FENCEPOST_HEAD) + do_check_inuse_chunk(m, q); + } + } +} + +/* Find x in a bin. Used in other check functions. 
*/
+static int bin_find(mstate m, mchunkptr x) { /* returns 1 if x is linked in the bin matching its size, else 0 */
+  size_t size = chunksize(x);
+  if (is_small(size)) {
+    bindex_t sidx = small_index(size);
+    sbinptr b = smallbin_at(m, sidx);
+    if (smallmap_is_marked(m, sidx)) {
+      mchunkptr p = b;
+      do { /* walk the circular fd list, starting at the bin header itself */
+        if (p == x)
+          return 1;
+      } while ((p = p->fd) != b);
+    }
+  }
+  else {
+    bindex_t tidx;
+    compute_tree_index(size, tidx);
+    if (treemap_is_marked(m, tidx)) {
+      tchunkptr t = *treebin_at(m, tidx);
+      size_t sizebits = size << leftshift_for_tree_index(tidx);
+      while (t != 0 && chunksize(t) != size) { /* descend by successive high bits of the shifted size */
+        t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+        sizebits <<= 1;
+      }
+      if (t != 0) {
+        tchunkptr u = t;
+        do { /* scan the chain of same-sized nodes */
+          if (u == (tchunkptr)x)
+            return 1;
+        } while ((u = u->fd) != t);
+      }
+    }
+  }
+  return 0;
+}
+
+/* Traverse each chunk and check it; return total bytes seen (chunk sizes plus top and TOP_FOOT_SIZE) */
+static size_t traverse_and_check(mstate m) {
+  size_t sum = 0;
+  if (is_initialized(m)) {
+    msegmentptr s = &m->seg;
+    sum += m->topsize + TOP_FOOT_SIZE;
+    while (s != 0) {
+      mchunkptr q = align_as_chunk(s->base);
+      mchunkptr lastq = 0;
+      dl_assert(pinuse(q));
+      while (segment_holds(s, q) &&
+             q != m->top && q->head != FENCEPOST_HEAD) {
+        sum += chunksize(q);
+        if (cinuse(q)) {
+          dl_assert(!bin_find(m, q));
+          do_check_inuse_chunk(m, q);
+        }
+        else {
+          dl_assert(q == m->dv || bin_find(m, q));
+          dl_assert(lastq == 0 || cinuse(lastq)); /* Not 2 consecutive free */
+          do_check_free_chunk(m, q);
+        }
+        lastq = q;
+        q = next_chunk(q);
+      }
+      s = s->next;
+    }
+  }
+  return sum;
+}
+
+/* Check all properties of malloc_state. 
*/
+static void do_check_malloc_state(mstate m) {
+  bindex_t i;
+  size_t total;
+  /* check bins */
+  for (i = 0; i < NSMALLBINS; ++i)
+    do_check_smallbin(m, i);
+  for (i = 0; i < NTREEBINS; ++i)
+    do_check_treebin(m, i);
+
+  if (m->dvsize != 0) { /* check dv chunk */
+    do_check_any_chunk(m, m->dv);
+    dl_assert(m->dvsize == chunksize(m->dv));
+    dl_assert(m->dvsize >= MIN_CHUNK_SIZE);
+    dl_assert(bin_find(m, m->dv) == 0);
+  }
+
+  if (m->top != 0) { /* check top chunk */
+    do_check_top_chunk(m, m->top);
+    /*dl_assert(m->topsize == chunksize(m->top)); redundant */
+    dl_assert(m->topsize > 0);
+    dl_assert(bin_find(m, m->top) == 0);
+  }
+
+  total = traverse_and_check(m);
+  dl_assert(total <= m->footprint);
+  dl_assert(m->footprint <= m->max_footprint);
+}
+#endif /* DL_DEBUG */
+
+/* ----------------------------- statistics ------------------------------ */
+/* Walk every segment of m and return the totals as a struct mallinfo (all zero if PREACTION fails or m is uninitialized) */
+#if !NO_MALLINFO
+static struct mallinfo internal_mallinfo(mstate m) {
+  struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  if (!PREACTION(m)) {
+    check_malloc_state(m);
+    if (is_initialized(m)) {
+      size_t nfree = SIZE_T_ONE; /* top always free */
+      size_t mfree = m->topsize + TOP_FOOT_SIZE;
+      size_t sum = mfree;
+      msegmentptr s = &m->seg;
+      while (s != 0) {
+        mchunkptr q = align_as_chunk(s->base);
+        while (segment_holds(s, q) &&
+               q != m->top && q->head != FENCEPOST_HEAD) {
+          size_t sz = chunksize(q);
+          sum += sz;
+          if (!cinuse(q)) {
+            mfree += sz;
+            ++nfree;
+          }
+          q = next_chunk(q);
+        }
+        s = s->next;
+      }
+
+      nm.arena = sum; /* bytes found by the segment walk, including top */
+      nm.ordblks = nfree; /* number of free chunks, counting top */
+      nm.hblkhd = m->footprint - sum; /* footprint not accounted for by the walk */
+      nm.usmblks = m->max_footprint;
+      nm.uordblks = m->footprint - mfree;
+      nm.fordblks = mfree;
+      nm.keepcost = m->topsize;
+    }
+
+    POSTACTION(m);
+  }
+  return nm;
+}
+#endif /* !NO_MALLINFO */
+/* Print max footprint, footprint and in-use byte counts of m to stderr (zeros if m is uninitialized) */
+static void internal_malloc_stats(mstate m) {
+  if (!PREACTION(m)) {
+    size_t maxfp = 0;
+    size_t fp = 0;
+    size_t used = 0;
+    check_malloc_state(m);
+    if (is_initialized(m)) {
+      msegmentptr s = &m->seg;
+      maxfp = m->max_footprint;
+      fp = m->footprint;
+      used = fp - (m->topsize + TOP_FOOT_SIZE);
+
+      while (s != 0) { /* subtract every free chunk from the in-use figure */
+        mchunkptr q = align_as_chunk(s->base);
+        while (segment_holds(s, q) &&
+               q != m->top && q->head != FENCEPOST_HEAD) {
+          if (!cinuse(q))
+            used -= chunksize(q);
+          q = next_chunk(q);
+        }
+        s = s->next;
+      }
+    }
+
+    fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(maxfp));
+    fprintf(stderr, "system bytes = %10lu\n", (unsigned long)(fp));
+    fprintf(stderr, "in use bytes = %10lu\n", (unsigned long)(used));
+
+    POSTACTION(m);
+  }
+}
+
+/* ----------------------- Operations on smallbins ----------------------- */
+
+/*
+  Various forms of linking and unlinking are defined as macros. Even
+  the ones for trees, which are very long but have very short typical
+  paths. This is ugly but reduces reliance on inlining support of
+  compilers.
+*/
+
+/* Link a free chunk P of size S into its smallbin, directly after the bin header */
+#define insert_small_chunk(M, P, S) {\
+  bindex_t IDX = small_index(S);\
+  mchunkptr B = smallbin_at(M, IDX);\
+  mchunkptr F = B;\
+  dl_assert(S >= MIN_CHUNK_SIZE);\
+  if (!smallmap_is_marked(M, IDX))\
+    mark_smallmap(M, IDX);\
+  else if (RTCHECK(ok_address(M, B->fd)))\
+    F = B->fd;\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+  B->fd = P;\
+  F->bk = P;\
+  P->fd = F;\
+  P->bk = B;\
+}
+
+/* Unlink a chunk from a smallbin, clearing the smallmap bit when its neighbours are one and the same */
+#define unlink_small_chunk(M, P, S) {\
+  mchunkptr F = P->fd;\
+  mchunkptr B = P->bk;\
+  bindex_t IDX = small_index(S);\
+  dl_assert(P != B);\
+  dl_assert(P != F);\
+  dl_assert(chunksize(P) == small_index2size(IDX));\
+  if (F == B)\
+    clear_smallmap(M, IDX);\
+  else if (RTCHECK((F == smallbin_at(M,IDX) || ok_address(M, F)) &&\
+                   (B == smallbin_at(M,IDX) || ok_address(M, B)))) {\
+    F->bk = B;\
+    B->fd = F;\
+  }\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+}
+
+/* Unlink the first chunk P from smallbin B (index IDX) */
+#define unlink_first_small_chunk(M, B, P, IDX) {\
+  mchunkptr F = P->fd;\
+  dl_assert(P != B);\
+  dl_assert(P != F);\
+  dl_assert(chunksize(P) == small_index2size(IDX));\
+  if (B == F)\
+    clear_smallmap(M, IDX);\
+  else if (RTCHECK(ok_address(M, F))) {\
+    B->fd = F;\
+    F->bk = B;\
+  }\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+}
+
+/* Replace dv node, binning the old one */
+/* Used only when dvsize known to be small */
+#define replace_dv(M, P, S) {\
+  size_t DVS = M->dvsize;\
+  if (DVS != 0) {\
+    mchunkptr DV = M->dv;\
+    dl_assert(is_small(DVS));\
+    insert_small_chunk(M, DV, DVS);\
+  }\
+  M->dvsize = S;\
+  M->dv = P;\
+}
+
+/* ------------------------- Operations on trees ------------------------- */
+
+/* Insert chunk X of size S into its treebin: as bin root, as a new leaf, or chained onto a same-sized node */
+#define insert_large_chunk(M, X, S) {\
+  tbinptr* H;\
+  bindex_t IDX;\
+  compute_tree_index(S, IDX);\
+  H = treebin_at(M, IDX);\
+  X->index = IDX;\
+  X->child[0] = X->child[1] = 0;\
+  if (!treemap_is_marked(M, IDX)) {\
+    mark_treemap(M, IDX);\
+    *H = X;\
+    X->parent = (tchunkptr)H;\
+    X->fd = X->bk = X;\
+  }\
+  else {\
+    tchunkptr T = *H;\
+    size_t K = S << leftshift_for_tree_index(IDX);\
+    for (;;) {\
+      if (chunksize(T) != S) {\
+        tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\
+        K <<= 1;\
+        if (*C != 0)\
+          T = *C;\
+        else if (RTCHECK(ok_address(M, C))) {\
+          *C = X;\
+          X->parent = T;\
+          X->fd = X->bk = X;\
+          break;\
+        }\
+        else {\
+          CORRUPTION_ERROR_ACTION(M);\
+          break;\
+        }\
+      }\
+      else {\
+        tchunkptr F = T->fd;\
+        if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\
+          T->fd = F->bk = X;\
+          X->fd = F;\
+          X->bk = T;\
+          X->parent = 0;\
+          break;\
+        }\
+        else {\
+          CORRUPTION_ERROR_ACTION(M);\
+          break;\
+        }\
+      }\
+    }\
+  }\
+}
+
+/*
+  Unlink steps:
+
+  1. If x is a chained node, unlink it from its same-sized fd/bk links
+     and choose its bk node as its replacement.
+  2. If x was the last node of its size, but not a leaf node, it must
+     be replaced with a leaf node (not merely one with an open left or
+     right), to make sure that lefts and rights of descendents
+     correspond properly to bit masks. We use the rightmost descendent
+     of x. 
We could use any other leaf, but this is easy to locate and
+     tends to counteract removal of leftmosts elsewhere, and so keeps
+     paths shorter than minimally guaranteed. This doesn't loop much
+     because on average a node in a tree is near the bottom.
+  3. If x is the base of a chain (i.e., has parent links) relink
+     x's parent and children to x's replacement (or null if none).
+*/
+
+#define unlink_large_chunk(M, X) {\
+  tchunkptr XP = X->parent;\
+  tchunkptr R;\
+  if (X->bk != X) {\
+    tchunkptr F = X->fd;\
+    R = X->bk;\
+    if (RTCHECK(ok_address(M, F))) {\
+      F->bk = R;\
+      R->fd = F;\
+    }\
+    else {\
+      CORRUPTION_ERROR_ACTION(M);\
+    }\
+  }\
+  else {\
+    tchunkptr* RP;\
+    if (((R = *(RP = &(X->child[1]))) != 0) ||\
+        ((R = *(RP = &(X->child[0]))) != 0)) {\
+      tchunkptr* CP;\
+      while ((*(CP = &(R->child[1])) != 0) ||\
+             (*(CP = &(R->child[0])) != 0)) {\
+        R = *(RP = CP);\
+      }\
+      if (RTCHECK(ok_address(M, RP)))\
+        *RP = 0;\
+      else {\
+        CORRUPTION_ERROR_ACTION(M);\
+      }\
+    }\
+  }\
+  if (XP != 0) {\
+    tbinptr* H = treebin_at(M, X->index);\
+    if (X == *H) {\
+      if ((*H = R) == 0) \
+        clear_treemap(M, X->index);\
+    }\
+    else if (RTCHECK(ok_address(M, XP))) {\
+      if (XP->child[0] == X) \
+        XP->child[0] = R;\
+      else \
+        XP->child[1] = R;\
+    }\
+    else\
+      CORRUPTION_ERROR_ACTION(M);\
+    if (R != 0) {\
+      if (RTCHECK(ok_address(M, R))) {\
+        tchunkptr C0, C1;\
+        R->parent = XP;\
+        if ((C0 = X->child[0]) != 0) {\
+          if (RTCHECK(ok_address(M, C0))) {\
+            R->child[0] = C0;\
+            C0->parent = R;\
+          }\
+          else\
+            CORRUPTION_ERROR_ACTION(M);\
+        }\
+        if ((C1 = X->child[1]) != 0) {\
+          if (RTCHECK(ok_address(M, C1))) {\
+            R->child[1] = C1;\
+            C1->parent = R;\
+          }\
+          else\
+            CORRUPTION_ERROR_ACTION(M);\
+        }\
+      }\
+      else\
+        CORRUPTION_ERROR_ACTION(M);\
+    }\
+  }\
+}
+
+/* Relays to large vs small bin operations. NOTE(review): both expand to a bare if/else, so use them only as a complete statement. */
+
+#define insert_chunk(M, P, S)\
+  if (is_small(S)) insert_small_chunk(M, P, S)\
+  else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); }
+
+#define unlink_chunk(M, P, S)\
+  if (is_small(S)) unlink_small_chunk(M, P, S)\
+  else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); }
+
+/* NOTE(review): some internal_free variants below carry their own ';' or expand to if/else -- call them inside braces. */
+/* Relays to internal calls to malloc/free from realloc, memalign etc */
+
+#if ONLY_MSPACES
+#define internal_malloc(m, b) mspace_malloc(m, b)
+#define internal_free(m, mem) mspace_free(m,mem);
+#else /* ONLY_MSPACES */
+#if MSPACES
+#define internal_malloc(m, b)\
+   (m == gm)? dlmalloc(b) : mspace_malloc(m, b)
+#define internal_free(m, mem)\
+   if (m == gm) dlfree(mem); else mspace_free(m,mem);
+#else /* MSPACES */
+#define internal_malloc(m, b) dlmalloc(b)
+#define internal_free(m, mem) dlfree(mem)
+#endif /* MSPACES */
+#endif /* ONLY_MSPACES */
+
+/* ----------------------- Direct-mmapping chunks ----------------------- */
+
+/*
+  Directly mmapped chunks are set up with an offset to the start of
+  the mmapped region stored in the prev_foot field of the chunk. This
+  allows reconstruction of the required argument to MUNMAP when freed,
+  and also allows adjustment of the returned chunk to meet alignment
+  requirements (especially in memalign). There is also enough space
+  allocated to hold a fake next chunk of size SIZE_T_SIZE to maintain
+  the PINUSE bit so frees can be checked. 
+*/ + +/* Malloc using mmap */ +static void* mmap_alloc(mstate m, size_t nb) { + size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK); + if (mmsize > nb) { /* Check for wrap around 0 */ + char* mm = (char*)(DIRECT_MMAP(mmsize)); + if (mm != CMFAIL) { + size_t offset = align_offset(chunk2mem(mm)); + size_t psize = mmsize - offset - MMAP_FOOT_PAD; + mchunkptr p = (mchunkptr)(mm + offset); + p->prev_foot = offset | IS_MMAPPED_BIT; + (p)->head = (psize|CINUSE_BIT); + mark_inuse_foot(m, p, psize); + chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD; + chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0; + + if (mm < m->least_addr) + m->least_addr = mm; + if ((m->footprint += mmsize) > m->max_footprint) + m->max_footprint = m->footprint; + dl_assert(is_aligned(chunk2mem(p))); + check_mmapped_chunk(m, p); + return chunk2mem(p); + } + } + return 0; +} + +/* Realloc using mmap */ +static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb) { + size_t oldsize = chunksize(oldp); + if (is_small(nb)) /* Can't shrink mmap regions below small size */ + return 0; + /* Keep old chunk if big enough but not too big */ + if (oldsize >= nb + SIZE_T_SIZE && + (oldsize - nb) <= (mparams.granularity << 1)) + return oldp; + else { + size_t offset = oldp->prev_foot & ~IS_MMAPPED_BIT; + size_t oldmmsize = oldsize + offset + MMAP_FOOT_PAD; + size_t newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK); + char* cp = (char*)CALL_MREMAP((char*)oldp - offset, + oldmmsize, newmmsize, 1); + if (cp != CMFAIL) { + mchunkptr newp = (mchunkptr)(cp + offset); + size_t psize = newmmsize - offset - MMAP_FOOT_PAD; + newp->head = (psize|CINUSE_BIT); + mark_inuse_foot(m, newp, psize); + chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD; + chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0; + + if (cp < m->least_addr) + m->least_addr = cp; + if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint) + m->max_footprint = m->footprint; + check_mmapped_chunk(m, newp); + 
return newp; + } + } + return 0; +} + +/* -------------------------- mspace management -------------------------- */ + +/* Initialize top chunk and its size */ +static void init_top(mstate m, mchunkptr p, size_t psize) { + /* Ensure alignment */ + size_t offset = align_offset(chunk2mem(p)); + p = (mchunkptr)((char*)p + offset); + psize -= offset; + + m->top = p; + m->topsize = psize; + p->head = psize | PINUSE_BIT; + /* set size of fake trailing chunk holding overhead space only once */ + chunk_plus_offset(p, psize)->head = TOP_FOOT_SIZE; + m->trim_check = mparams.trim_threshold; /* reset on each update */ +} + +/* Initialize bins for a new mstate that is otherwise zeroed out */ +static void init_bins(mstate m) { + /* Establish circular links for smallbins */ + bindex_t i; + for (i = 0; i < NSMALLBINS; ++i) { + sbinptr bin = smallbin_at(m,i); + bin->fd = bin->bk = bin; + } +} + +#if PROCEED_ON_ERROR + +/* default corruption action */ +static void reset_on_error(mstate m) { + int i; + ++malloc_corruption_error_count; + /* Reinitialize fields to forget about all memory */ + m->smallbins = m->treebins = 0; + m->dvsize = m->topsize = 0; + m->seg.base = 0; + m->seg.size = 0; + m->seg.next = 0; + m->top = m->dv = 0; + for (i = 0; i < NTREEBINS; ++i) + *treebin_at(m, i) = 0; + init_bins(m); +} +#endif /* PROCEED_ON_ERROR */ + +/* Allocate chunk and prepend remainder with chunk in successor base. 
*/
+static void* prepend_alloc(mstate m, char* newbase, char* oldbase,
+                           size_t nb) {
+  mchunkptr p = align_as_chunk(newbase);
+  mchunkptr oldfirst = align_as_chunk(oldbase);
+  size_t psize = (char*)oldfirst - (char*)p;
+  mchunkptr q = chunk_plus_offset(p, nb);
+  size_t qsize = psize - nb;
+  set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+
+  dl_assert((char*)oldfirst > (char*)q);
+  dl_assert(pinuse(oldfirst));
+  dl_assert(qsize >= MIN_CHUNK_SIZE);
+
+  /* consolidate remainder with first chunk of old base */
+  if (oldfirst == m->top) {
+    size_t tsize = m->topsize += qsize;
+    m->top = q;
+    q->head = tsize | PINUSE_BIT;
+    check_top_chunk(m, q);
+  }
+  else if (oldfirst == m->dv) {
+    size_t dsize = m->dvsize += qsize;
+    m->dv = q;
+    set_size_and_pinuse_of_free_chunk(q, dsize);
+  }
+  else {
+    if (!cinuse(oldfirst)) { /* absorb a free first chunk into the remainder */
+      size_t nsize = chunksize(oldfirst);
+      unlink_chunk(m, oldfirst, nsize);
+      oldfirst = chunk_plus_offset(oldfirst, nsize);
+      qsize += nsize;
+    }
+    set_free_with_pinuse(q, qsize, oldfirst);
+    insert_chunk(m, q, qsize);
+    check_free_chunk(m, q);
+  }
+
+  check_malloced_chunk(m, chunk2mem(p), nb);
+  return chunk2mem(p);
+}
+
+/* Add a segment to hold a new noncontiguous region; the old segment record is stored inside the old top */
+static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) {
+  /* Determine locations and sizes of segment, fenceposts, old top */
+  char* old_top = (char*)m->top;
+  msegmentptr oldsp = segment_holding(m, old_top);
+  char* old_end = oldsp->base + oldsp->size;
+  size_t ssize = pad_request(sizeof(struct malloc_segment));
+  char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  size_t offset = align_offset(chunk2mem(rawsp));
+  char* asp = rawsp + offset;
+  char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp;
+  mchunkptr sp = (mchunkptr)csp;
+  msegmentptr ss = (msegmentptr)(chunk2mem(sp));
+  mchunkptr tnext = chunk_plus_offset(sp, ssize);
+  mchunkptr p = tnext;
+  int nfences = 0;
+
+  /* reset top to new space */
+  init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+
+  /* Set up segment record */
+  dl_assert(is_aligned(ss));
+  set_size_and_pinuse_of_inuse_chunk(m, sp, ssize);
+  *ss = m->seg; /* Push current record */
+  m->seg.base = tbase;
+  m->seg.size = tsize;
+  m->seg.sflags = mmapped;
+  m->seg.next = ss;
+
+  /* Insert trailing fenceposts */
+  for (;;) {
+    mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE);
+    p->head = FENCEPOST_HEAD;
+    ++nfences;
+    if ((char*)(&(nextp->head)) < old_end)
+      p = nextp;
+    else
+      break;
+  }
+  dl_assert(nfences >= 2);
+
+  /* Insert the rest of old top into a bin as an ordinary free chunk */
+  if (csp != old_top) {
+    mchunkptr q = (mchunkptr)old_top;
+    size_t psize = csp - old_top;
+    mchunkptr tn = chunk_plus_offset(q, psize);
+    set_free_with_pinuse(q, psize, tn);
+    insert_chunk(m, q, psize);
+  }
+
+  check_top_chunk(m, m->top);
+}
+
+/* -------------------------- System allocation -------------------------- */
+
+/* Get memory from system using MORECORE or MMAP; returns the payload for an nb-byte chunk, or 0 on failure */
+static void* sys_alloc(mstate m, size_t nb) {
+  char* tbase = CMFAIL;
+  size_t tsize = 0;
+  flag_t mmap_flag = 0;
+
+  init_mparams();
+
+  /* Directly map large chunks */
+  if (use_mmap(m) && nb >= mparams.mmap_threshold) {
+    void* mem = mmap_alloc(m, nb);
+    if (mem != 0)
+      return mem;
+  }
+
+  /*
+    Try getting memory in any of three ways (in most-preferred to
+    least-preferred order):
+    1. A call to MORECORE that can normally contiguously extend memory.
+       (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE or
+       main space is mmapped or a previous contiguous call failed)
+    2. A call to MMAP new space (disabled if not DL_HAVE_MMAP).
+       Note that under the default settings, if MORECORE is unable to
+       fulfill a request, and DL_HAVE_MMAP is true, then mmap is
+       used as a noncontiguous system allocator. This is a useful backup
+       strategy for systems with holes in address spaces -- in this case
+       sbrk cannot contiguously expand the heap, but mmap may be able to
+       find space.
+    3. A call to MORECORE that cannot usually contiguously extend memory.
+       (disabled if not HAVE_MORECORE)
+  */
+
+  if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) {
+    char* br = CMFAIL;
+    msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top);
+    size_t asize = 0;
+    ACQUIRE_MORECORE_LOCK();
+
+    if (ss == 0) { /* First time through or recovery */
+      char* base = (char*)CALL_MORECORE(0);
+      if (base != CMFAIL) {
+        asize = granularity_align(nb + TOP_FOOT_SIZE + SIZE_T_ONE);
+        /* Adjust to end on a page boundary */
+        if (!is_page_aligned(base))
+          asize += (page_align((size_t)base) - (size_t)base);
+        /* Can't call MORECORE if size is negative when treated as signed */
+        if (asize < HALF_MAX_SIZE_T &&
+            (br = (char*)(CALL_MORECORE(asize))) == base) {
+          tbase = base;
+          tsize = asize;
+        }
+      }
+    }
+    else {
+      /* Subtract out existing available top space from MORECORE request. */
+      asize = granularity_align(nb - m->topsize + TOP_FOOT_SIZE + SIZE_T_ONE);
+      /* Use mem here only if it did contiguously extend old space */
+      if (asize < HALF_MAX_SIZE_T &&
+          (br = (char*)(CALL_MORECORE(asize))) == ss->base+ss->size) {
+        tbase = br;
+        tsize = asize;
+      }
+    }
+
+    if (tbase == CMFAIL) { /* Cope with partial failure */
+      if (br != CMFAIL) { /* Try to use/extend the space we did get */
+        if (asize < HALF_MAX_SIZE_T &&
+            asize < nb + TOP_FOOT_SIZE + SIZE_T_ONE) {
+          size_t esize = granularity_align(nb + TOP_FOOT_SIZE + SIZE_T_ONE - asize);
+          if (esize < HALF_MAX_SIZE_T) {
+            char* end = (char*)CALL_MORECORE(esize);
+            if (end != CMFAIL)
+              asize += esize;
+            else { /* Can't use; try to release */
+              (void) CALL_MORECORE(-asize);
+              br = CMFAIL;
+            }
+          }
+        }
+      }
+      if (br != CMFAIL) { /* Use the space we did get */
+        tbase = br;
+        tsize = asize;
+      }
+      else
+        disable_contiguous(m); /* Don't try contiguous path in the future */
+    }
+
+    RELEASE_MORECORE_LOCK();
+  }
+
+  if (DL_HAVE_MMAP && tbase == CMFAIL) { /* Try MMAP */
+    size_t req = nb + TOP_FOOT_SIZE + SIZE_T_ONE;
+    size_t rsize = granularity_align(req);
+    if (rsize > nb) { /* Fail if wraps around zero */
+      char* mp = (char*)(CALL_MMAP(rsize));
+      if (mp != CMFAIL) {
+        tbase = mp;
+        tsize = rsize;
+        mmap_flag = IS_MMAPPED_BIT;
+      }
+    }
+  }
+
+  if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */
+    size_t asize = granularity_align(nb + TOP_FOOT_SIZE + SIZE_T_ONE);
+    if (asize < HALF_MAX_SIZE_T) {
+      char* br = CMFAIL;
+      char* end = CMFAIL;
+      ACQUIRE_MORECORE_LOCK();
+      br = (char*)(CALL_MORECORE(asize));
+      end = (char*)(CALL_MORECORE(0));
+      RELEASE_MORECORE_LOCK();
+      if (br != CMFAIL && end != CMFAIL && br < end) {
+        size_t ssize = end - br;
+        if (ssize > nb + TOP_FOOT_SIZE) {
+          tbase = br;
+          tsize = ssize;
+        }
+      }
+    }
+  }
+
+  if (tbase != CMFAIL) {
+
+    if ((m->footprint += tsize) > m->max_footprint)
+      m->max_footprint = m->footprint;
+
+    if (!is_initialized(m)) { /* first-time initialization */
+      m->seg.base = m->least_addr = tbase;
+      m->seg.size = tsize;
+      m->seg.sflags = mmap_flag;
+      m->magic = mparams.magic;
+      m->release_checks = MAX_RELEASE_CHECK_RATE;
+      init_bins(m);
+#if !ONLY_MSPACES
+      if (is_global(m))
+        init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+      else
+#endif
+      {
+        /* Offset top by embedded malloc_state */
+        mchunkptr mn = next_chunk(mem2chunk(m));
+        init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE);
+      }
+    }
+
+    else {
+      /* Try to merge with an existing segment */
+      msegmentptr sp = &m->seg;
+      /* Only consider most recent segment if traversal suppressed */
+      while (sp != 0 && tbase != sp->base + sp->size)
+        sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
+      if (sp != 0 &&
+          !is_extern_segment(sp) &&
+          (sp->sflags & IS_MMAPPED_BIT) == mmap_flag &&
+          segment_holds(sp, m->top)) { /* append */
+        sp->size += tsize;
+        init_top(m, m->top, m->topsize + tsize);
+      }
+      else {
+        if (tbase < m->least_addr)
+          m->least_addr = tbase;
+        sp = &m->seg;
+        while (sp != 0 && sp->base != tbase + tsize) /* look for a segment starting right after the new space */
+          sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
+        if (sp != 0 &&
+            !is_extern_segment(sp) &&
+            (sp->sflags & IS_MMAPPED_BIT) == mmap_flag) {
+          char* oldbase = sp->base;
+          sp->base = tbase;
+          sp->size += tsize;
+          return prepend_alloc(m, tbase, oldbase, nb);
+        }
+        else
+          add_segment(m, tbase, tsize, mmap_flag);
+      }
+    }
+
+    if (nb < m->topsize) { /* Allocate from new or extended top space */
+      size_t rsize = m->topsize -= nb;
+      mchunkptr p = m->top;
+      mchunkptr r = m->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+      check_top_chunk(m, m->top);
+      check_malloced_chunk(m, chunk2mem(p), nb);
+      return chunk2mem(p);
+    }
+  }
+
+  MALLOC_FAILURE_ACTION;
+  return 0;
+}
+
+/* ----------------------- system deallocation -------------------------- */
+
+/* Unmap and unlink any mmapped segments that don't contain used chunks; returns the number of bytes released */
+static size_t release_unused_segments(mstate m) {
+  size_t released = 0;
+  int nsegs = 0;
+  msegmentptr pred = &m->seg;
+  msegmentptr sp = pred->next;
+  while (sp != 0) {
+    char* base = sp->base;
+    size_t size = sp->size;
+    msegmentptr next = sp->next;
+    ++nsegs;
+    if (is_mmapped_segment(sp) && !is_extern_segment(sp)) {
+      mchunkptr p = align_as_chunk(base);
+      size_t psize = chunksize(p);
+      /* Can unmap if first chunk holds entire segment and not pinned */
+      if (!cinuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) {
+        tchunkptr tp = (tchunkptr)p;
+        dl_assert(segment_holds(sp, (char*)sp));
+        if (p == m->dv) {
+          m->dv = 0;
+          m->dvsize = 0;
+        }
+        else {
+          unlink_large_chunk(m, tp);
+        }
+        if (CALL_MUNMAP(base, size) == 0) {
+          released += size;
+          m->footprint -= size;
+          /* unlink obsoleted record */
+          sp = pred;
+          sp->next = next;
+        }
+        else { /* back out if cannot unmap */
+          insert_large_chunk(m, tp, psize);
+        }
+      }
+    }
+    if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */
+      break;
+    pred = sp;
+    sp = next;
+  }
+  /* Reset check counter */
+  m->release_checks = ((nsegs > (int)MAX_RELEASE_CHECK_RATE)?
+                       (size_t)nsegs : MAX_RELEASE_CHECK_RATE);
+  return released;
+}
+/* Give unused top space beyond pad back to the system; returns 1 if any memory was released, else 0 */
+static int sys_trim(mstate m, size_t pad) {
+  size_t released = 0;
+  if (pad < MAX_REQUEST && is_initialized(m)) {
+    pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */
+
+    if (m->topsize > pad) {
+      /* Shrink top space in granularity-size units, keeping at least one */
+      size_t unit = mparams.granularity;
+      size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit -
+                      SIZE_T_ONE) * unit;
+      msegmentptr sp = segment_holding(m, (char*)m->top);
+
+      if (!is_extern_segment(sp)) {
+        if (is_mmapped_segment(sp)) {
+          if (DL_HAVE_MMAP &&
+              sp->size >= extra &&
+              !has_segment_link(m, sp)) { /* can't shrink if pinned */
+            size_t newsize = sp->size - extra;
+            /* Prefer mremap, fall back to munmap */
+            if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) ||
+                (CALL_MUNMAP(sp->base + newsize, extra) == 0)) {
+              released = extra;
+            }
+          }
+        }
+        else if (HAVE_MORECORE) {
+          if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */
+            extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit;
+          ACQUIRE_MORECORE_LOCK();
+          {
+            /* Make sure end of memory is where we last set it. */
+            char* old_br = (char*)(CALL_MORECORE(0));
+            if (old_br == sp->base + sp->size) {
+              char* rel_br = (char*)(CALL_MORECORE(-extra));
+              char* new_br = (char*)(CALL_MORECORE(0));
+              if (rel_br != CMFAIL && new_br < old_br)
+                released = old_br - new_br;
+            }
+          }
+          RELEASE_MORECORE_LOCK();
+        }
+      }
+
+      if (released != 0) {
+        sp->size -= released;
+        m->footprint -= released;
+        init_top(m, m->top, m->topsize - released);
+        check_top_chunk(m, m->top);
+      }
+    }
+
+    /* Unmap any unused mmapped segments */
+    if (DL_HAVE_MMAP)
+      released += release_unused_segments(m);
+
+    /* On failure, disable autotrim to avoid repeated failed future calls */
+    if (released == 0 && m->topsize > m->trim_check)
+      m->trim_check = MAX_SIZE_T;
+  }
+
+  return (released != 0)? 1 : 0;
+}
+
+/* ---------------------------- malloc support --------------------------- */
+
+/* allocate a large request from the best fitting chunk in a treebin; returns 0 if none fits or dv fits better */
+static void* tmalloc_large(mstate m, size_t nb) {
+  tchunkptr v = 0;
+  size_t rsize = -nb; /* Unsigned negation */
+  tchunkptr t;
+  bindex_t idx;
+  compute_tree_index(nb, idx);
+
+  if ((t = *treebin_at(m, idx)) != 0) {
+    /* Traverse tree for this bin looking for node with size == nb */
+    size_t sizebits = nb << leftshift_for_tree_index(idx);
+    tchunkptr rst = 0; /* The deepest untaken right subtree */
+    for (;;) {
+      tchunkptr rt;
+      size_t trem = chunksize(t) - nb;
+      if (trem < rsize) {
+        v = t;
+        if ((rsize = trem) == 0)
+          break;
+      }
+      rt = t->child[1];
+      t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+      if (rt != 0 && rt != t)
+        rst = rt;
+      if (t == 0) {
+        t = rst; /* set t to least subtree holding sizes > nb */
+        break;
+      }
+      sizebits <<= 1;
+    }
+  }
+
+  if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */
+    binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap;
+    if (leftbits != 0) {
+      bindex_t i;
+      binmap_t leastbit = least_bit(leftbits);
+      compute_bit2idx(leastbit, i);
+      t = *treebin_at(m, i);
+    }
+  }
+
+  while (t != 0) { /* find smallest of tree or subtree */
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+    t = leftmost_child(t);
+  }
+
+  /* If dv is a better fit, return 0 so malloc will use it */
+  if (v != 0 && rsize < (size_t)(m->dvsize - nb)) {
+    if (RTCHECK(ok_address(m, v))) { /* split */
+      mchunkptr r = chunk_plus_offset(v, nb);
+      dl_assert(chunksize(v) == rsize + nb);
+      if (RTCHECK(ok_next(v, r))) {
+        unlink_large_chunk(m, v);
+        if (rsize < MIN_CHUNK_SIZE) /* remainder too small to stand alone: hand out the whole chunk */
+          set_inuse_and_pinuse(m, v, (rsize + nb));
+        else {
+          set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+          set_size_and_pinuse_of_free_chunk(r, rsize);
+          insert_chunk(m, r, rsize);
+        }
+        return chunk2mem(v);
+      }
+    }
+    CORRUPTION_ERROR_ACTION(m);
+  }
+  return 0;
+}
+
+/* 
allocate a small request from the best fitting chunk in a treebin */ +static void* tmalloc_small(mstate m, size_t nb) { + tchunkptr t, v; + size_t rsize; + bindex_t i; + binmap_t leastbit = least_bit(m->treemap); + compute_bit2idx(leastbit, i); + + v = t = *treebin_at(m, i); + rsize = chunksize(t) - nb; + + while ((t = leftmost_child(t)) != 0) { + size_t trem = chunksize(t) - nb; + if (trem < rsize) { + rsize = trem; + v = t; + } + } + + if (RTCHECK(ok_address(m, v))) { + mchunkptr r = chunk_plus_offset(v, nb); + dl_assert(chunksize(v) == rsize + nb); + if (RTCHECK(ok_next(v, r))) { + unlink_large_chunk(m, v); + if (rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(m, v, (rsize + nb)); + else { + set_size_and_pinuse_of_inuse_chunk(m, v, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(m, r, rsize); + } + return chunk2mem(v); + } + } + + CORRUPTION_ERROR_ACTION(m); + return 0; +} + +/* --------------------------- realloc support --------------------------- */ + +static void* internal_realloc(mstate m, void* oldmem, size_t bytes) { + if (bytes >= MAX_REQUEST) { + MALLOC_FAILURE_ACTION; + return 0; + } + if (!PREACTION(m)) { + mchunkptr oldp = mem2chunk(oldmem); + size_t oldsize = chunksize(oldp); + mchunkptr next = chunk_plus_offset(oldp, oldsize); + mchunkptr newp = 0; + void* extra = 0; + + /* Try to either shrink or extend into top. 
Else malloc-copy-free */ + + if (RTCHECK(ok_address(m, oldp) && ok_cinuse(oldp) && + ok_next(oldp, next) && ok_pinuse(next))) { + size_t nb = request2size(bytes); + if (is_mmapped(oldp)) + newp = mmap_resize(m, oldp, nb); + else if (oldsize >= nb) { /* already big enough */ + size_t rsize = oldsize - nb; + newp = oldp; + if (rsize >= MIN_CHUNK_SIZE) { + mchunkptr remainder = chunk_plus_offset(newp, nb); + set_inuse(m, newp, nb); + set_inuse(m, remainder, rsize); + extra = chunk2mem(remainder); + } + } + else if (next == m->top && oldsize + m->topsize > nb) { + /* Expand into top */ + size_t newsize = oldsize + m->topsize; + size_t newtopsize = newsize - nb; + mchunkptr newtop = chunk_plus_offset(oldp, nb); + set_inuse(m, oldp, nb); + newtop->head = newtopsize |PINUSE_BIT; + m->top = newtop; + m->topsize = newtopsize; + newp = oldp; + } + } + else { + USAGE_ERROR_ACTION(m, oldmem); + POSTACTION(m); + return 0; + } + + POSTACTION(m); + + if (newp != 0) { + if (extra != 0) { + internal_free(m, extra); + } + check_inuse_chunk(m, newp); + return chunk2mem(newp); + } + else { + void* newmem = internal_malloc(m, bytes); + if (newmem != 0) { + size_t oc = oldsize - overhead_for(oldp); + memcpy(newmem, oldmem, (oc < bytes)? 
oc : bytes); + internal_free(m, oldmem); + } + return newmem; + } + } + return 0; +} + +/* --------------------------- memalign support -------------------------- */ + +static void* internal_memalign(mstate m, size_t alignment, size_t bytes) { + if (alignment <= MALLOC_ALIGNMENT) /* Can just use malloc */ + return internal_malloc(m, bytes); + if (alignment < MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */ + alignment = MIN_CHUNK_SIZE; + if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */ + size_t a = MALLOC_ALIGNMENT << 1; + while (a < alignment) a <<= 1; + alignment = a; + } + + if (bytes >= MAX_REQUEST - alignment) { + if (m != 0) { /* Test isn't needed but avoids compiler warning */ + MALLOC_FAILURE_ACTION; + } + } + else { + size_t nb = request2size(bytes); + size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD; + char* mem = (char*)internal_malloc(m, req); + if (mem != 0) { + void* leader = 0; + void* trailer = 0; + mchunkptr p = mem2chunk(mem); + + if (PREACTION(m)) return 0; + if ((((size_t)(mem)) % alignment) != 0) { /* misaligned */ + /* + Find an aligned spot inside chunk. Since we need to give + back leading space in a chunk of at least MIN_CHUNK_SIZE, if + the first calculation places us at a spot with less than + MIN_CHUNK_SIZE leader, we can move to the next aligned spot. + We've allocated enough total room so that this is always + possible. + */ + char* br = (char*)mem2chunk((size_t)(((size_t)(mem + + alignment - + SIZE_T_ONE)) & + -alignment)); + char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)? 
+ br : br+alignment; + mchunkptr newp = (mchunkptr)pos; + size_t leadsize = pos - (char*)(p); + size_t newsize = chunksize(p) - leadsize; + + if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */ + newp->prev_foot = p->prev_foot + leadsize; + newp->head = (newsize|CINUSE_BIT); + } + else { /* Otherwise, give back leader, use the rest */ + set_inuse(m, newp, newsize); + set_inuse(m, p, leadsize); + leader = chunk2mem(p); + } + p = newp; + } + + /* Give back spare room at the end */ + if (!is_mmapped(p)) { + size_t size = chunksize(p); + if (size > nb + MIN_CHUNK_SIZE) { + size_t remainder_size = size - nb; + mchunkptr remainder = chunk_plus_offset(p, nb); + set_inuse(m, p, nb); + set_inuse(m, remainder, remainder_size); + trailer = chunk2mem(remainder); + } + } + + assert (chunksize(p) >= nb); + dl_assert((((size_t)(chunk2mem(p))) % alignment) == 0); + check_inuse_chunk(m, p); + POSTACTION(m); + if (leader != 0) { + internal_free(m, leader); + } + if (trailer != 0) { + internal_free(m, trailer); + } + return chunk2mem(p); + } + } + return 0; +} + +/* ------------------------ comalloc/coalloc support --------------------- */ + +static void** ialloc(mstate m, + size_t n_elements, + size_t* sizes, + int opts, + void* chunks[]) { + /* + This provides common support for independent_X routines, handling + all of the combinations that can result. 
+ + The opts arg has: + bit 0 set if all elements are same size (using sizes[0]) + bit 1 set if elements should be zeroed + */ + + size_t element_size; /* chunksize of each element, if all same */ + size_t contents_size; /* total size of elements */ + size_t array_size; /* request size of pointer array */ + void* mem; /* malloced aggregate space */ + mchunkptr p; /* corresponding chunk */ + size_t remainder_size; /* remaining bytes while splitting */ + void** marray; /* either "chunks" or malloced ptr array */ + mchunkptr array_chunk; /* chunk for malloced ptr array */ + flag_t was_enabled; /* to disable mmap */ + size_t size; + size_t i; + + /* compute array length, if needed */ + if (chunks != 0) { + if (n_elements == 0) + return chunks; /* nothing to do */ + marray = chunks; + array_size = 0; + } + else { + /* if empty req, must still return chunk representing empty array */ + if (n_elements == 0) + return (void**)internal_malloc(m, 0); + marray = 0; + array_size = request2size(n_elements * (sizeof(void*))); + } + + /* compute total element size */ + if (opts & 0x1) { /* all-same-size */ + element_size = request2size(*sizes); + contents_size = n_elements * element_size; + } + else { /* add up all the sizes */ + element_size = 0; + contents_size = 0; + for (i = 0; i != n_elements; ++i) + contents_size += request2size(sizes[i]); + } + + size = contents_size + array_size; + + /* + Allocate the aggregate chunk. First disable direct-mmapping so + malloc won't use it, since we would not be able to later + free/realloc space internal to a segregated mmap region. 
+ */ + was_enabled = use_mmap(m); + disable_mmap(m); + mem = internal_malloc(m, size - CHUNK_OVERHEAD); + if (was_enabled) + enable_mmap(m); + if (mem == 0) + return 0; + + if (PREACTION(m)) return 0; + p = mem2chunk(mem); + remainder_size = chunksize(p); + + dl_assert(!is_mmapped(p)); + + if (opts & 0x2) { /* optionally clear the elements */ + memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size); + } + + /* If not provided, allocate the pointer array as final part of chunk */ + if (marray == 0) { + size_t array_chunk_size; + array_chunk = chunk_plus_offset(p, contents_size); + array_chunk_size = remainder_size - contents_size; + marray = (void**) (chunk2mem(array_chunk)); + set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size); + remainder_size = contents_size; + } + + /* split out elements */ + for (i = 0; ; ++i) { + marray[i] = chunk2mem(p); + if (i != n_elements-1) { + if (element_size != 0) + size = element_size; + else + size = request2size(sizes[i]); + remainder_size -= size; + set_size_and_pinuse_of_inuse_chunk(m, p, size); + p = chunk_plus_offset(p, size); + } + else { /* the final element absorbs any overallocation slop */ + set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size); + break; + } + } + +#if DL_DEBUG + if (marray != chunks) { + /* final element must have exactly exhausted chunk */ + if (element_size != 0) { + dl_assert(remainder_size == element_size); + } + else { + dl_assert(remainder_size == request2size(sizes[i])); + } + check_inuse_chunk(m, mem2chunk(marray)); + } + for (i = 0; i != n_elements; ++i) + check_inuse_chunk(m, mem2chunk(marray[i])); + +#endif /* DL_DEBUG */ + + POSTACTION(m); + return marray; +} + + +/* -------------------------- public routines ---------------------------- */ + +#if !ONLY_MSPACES + +void* dlmalloc(size_t bytes) { + /* + Basic algorithm: + If a small request (< 256 bytes minus per-chunk overhead): + 1. If one exists, use a remainderless chunk in associated smallbin. 
+ (Remainderless means that there are too few excess bytes to + represent as a chunk.) + 2. If it is big enough, use the dv chunk, which is normally the + chunk adjacent to the one used for the most recent small request. + 3. If one exists, split the smallest available chunk in a bin, + saving remainder in dv. + 4. If it is big enough, use the top chunk. + 5. If available, get memory from system and use it + Otherwise, for a large request: + 1. Find the smallest available binned chunk that fits, and use it + if it is better fitting than dv chunk, splitting if necessary. + 2. If better fitting than any binned chunk, use the dv chunk. + 3. If it is big enough, use the top chunk. + 4. If request size >= mmap threshold, try to directly mmap this chunk. + 5. If available, get memory from system and use it + + The ugly goto's here ensure that postaction occurs along all paths. + */ + + if (!PREACTION(gm)) { + void* mem; + size_t nb; + if (bytes <= MAX_SMALL_REQUEST) { + bindex_t idx; + binmap_t smallbits; + nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes); + idx = small_index(nb); + smallbits = gm->smallmap >> idx; + + if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. 
*/ + mchunkptr b, p; + idx += ~smallbits & 1; /* Uses next bin if idx empty */ + b = smallbin_at(gm, idx); + p = b->fd; + dl_assert(chunksize(p) == small_index2size(idx)); + unlink_first_small_chunk(gm, b, p, idx); + set_inuse_and_pinuse(gm, p, small_index2size(idx)); + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (nb > gm->dvsize) { + if (smallbits != 0) { /* Use chunk in next nonempty smallbin */ + mchunkptr b, p, r; + size_t rsize; + bindex_t i; + binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx)); + binmap_t leastbit = least_bit(leftbits); + compute_bit2idx(leastbit, i); + b = smallbin_at(gm, i); + p = b->fd; + dl_assert(chunksize(p) == small_index2size(i)); + unlink_first_small_chunk(gm, b, p, i); + rsize = small_index2size(i) - nb; + /* Fit here cannot be remainderless if 4byte sizes */ + if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(gm, p, small_index2size(i)); + else { + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + r = chunk_plus_offset(p, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(gm, r, rsize); + } + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) { + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + } + } + else if (bytes >= MAX_REQUEST) + nb = MAX_SIZE_T; /* Too big to allocate. 
Force failure (in sys alloc) */ + else { + nb = pad_request(bytes); + if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) { + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + } + + if (nb <= gm->dvsize) { + size_t rsize = gm->dvsize - nb; + mchunkptr p = gm->dv; + if (rsize >= MIN_CHUNK_SIZE) { /* split dv */ + mchunkptr r = gm->dv = chunk_plus_offset(p, nb); + gm->dvsize = rsize; + set_size_and_pinuse_of_free_chunk(r, rsize); + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + } + else { /* exhaust dv */ + size_t dvs = gm->dvsize; + gm->dvsize = 0; + gm->dv = 0; + set_inuse_and_pinuse(gm, p, dvs); + } + mem = chunk2mem(p); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + else if (nb < gm->topsize) { /* Split top */ + size_t rsize = gm->topsize -= nb; + mchunkptr p = gm->top; + mchunkptr r = gm->top = chunk_plus_offset(p, nb); + r->head = rsize | PINUSE_BIT; + set_size_and_pinuse_of_inuse_chunk(gm, p, nb); + mem = chunk2mem(p); + check_top_chunk(gm, gm->top); + check_malloced_chunk(gm, mem, nb); + goto postaction; + } + + mem = sys_alloc(gm, nb); + + postaction: + POSTACTION(gm); + return mem; + } + + return 0; +} + +void dlfree(void* mem) { + /* + Consolidate freed chunks with preceeding or succeeding bordering + free chunks, if they exist, and then place in a bin. Intermixed + with special cases for top, dv, mmapped chunks, and usage errors. 
+ */ + + if (mem != 0) { + mchunkptr p = mem2chunk(mem); +#if FOOTERS + mstate fm = get_mstate_for(p); + if (!ok_magic(fm)) { + USAGE_ERROR_ACTION(fm, p); + return; + } +#else /* FOOTERS */ +#define fm gm +#endif /* FOOTERS */ + if (!PREACTION(fm)) { + check_inuse_chunk(fm, p); + if (RTCHECK(ok_address(fm, p) && ok_cinuse(p))) { + size_t psize = chunksize(p); + mchunkptr next = chunk_plus_offset(p, psize); + if (!pinuse(p)) { + size_t prevsize = p->prev_foot; + if ((prevsize & IS_MMAPPED_BIT) != 0) { + prevsize &= ~IS_MMAPPED_BIT; + psize += prevsize + MMAP_FOOT_PAD; + if (CALL_MUNMAP((char*)p - prevsize, psize) == 0) + fm->footprint -= psize; + goto postaction; + } + else { + mchunkptr prev = chunk_minus_offset(p, prevsize); + psize += prevsize; + p = prev; + if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */ + if (p != fm->dv) { + unlink_chunk(fm, p, prevsize); + } + else if ((next->head & INUSE_BITS) == INUSE_BITS) { + fm->dvsize = psize; + set_free_with_pinuse(p, psize, next); + goto postaction; + } + } + else + goto erroraction; + } + } + + if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) { + if (!cinuse(next)) { /* consolidate forward */ + if (next == fm->top) { + size_t tsize = fm->topsize += psize; + fm->top = p; + p->head = tsize | PINUSE_BIT; + if (p == fm->dv) { + fm->dv = 0; + fm->dvsize = 0; + } + if (should_trim(fm, tsize)) + sys_trim(fm, 0); + goto postaction; + } + else if (next == fm->dv) { + size_t dsize = fm->dvsize += psize; + fm->dv = p; + set_size_and_pinuse_of_free_chunk(p, dsize); + goto postaction; + } + else { + size_t nsize = chunksize(next); + psize += nsize; + unlink_chunk(fm, next, nsize); + set_size_and_pinuse_of_free_chunk(p, psize); + if (p == fm->dv) { + fm->dvsize = psize; + goto postaction; + } + } + } + else + set_free_with_pinuse(p, psize, next); + + if (is_small(psize)) { + insert_small_chunk(fm, p, psize); + check_free_chunk(fm, p); + } + else { + tchunkptr tp = (tchunkptr)p; + insert_large_chunk(fm, tp, psize); 
+ check_free_chunk(fm, p); + if (--fm->release_checks == 0) + release_unused_segments(fm); + } + goto postaction; + } + } + erroraction: + USAGE_ERROR_ACTION(fm, p); + postaction: + POSTACTION(fm); + } + } +#if !FOOTERS +#undef fm +#endif /* FOOTERS */ +} + +void* dlcalloc(size_t n_elements, size_t elem_size) { + void* mem; + size_t req = 0; + if (n_elements != 0) { + req = n_elements * elem_size; + if (((n_elements | elem_size) & ~(size_t)0xffff) && + (req / n_elements != elem_size)) + req = MAX_SIZE_T; /* force downstream failure on overflow */ + } + mem = dlmalloc(req); + if (mem != 0 && calloc_must_clear(mem2chunk(mem))) + memset(mem, 0, req); + return mem; +} + +void* dlrealloc(void* oldmem, size_t bytes) { + // printf("oldmem=%p bytes=%d\n", oldmem, (int)bytes); + if (oldmem == 0) + return dlmalloc(bytes); +#ifdef REALLOC_ZERO_BYTES_FREES + if (bytes == 0) { + dlfree(oldmem); + return 0; + } +#endif /* REALLOC_ZERO_BYTES_FREES */ + else { +#if ! FOOTERS + mstate m = gm; +#else /* FOOTERS */ +//printf("checking state\n"); + mstate m = get_mstate_for(mem2chunk(oldmem)); + // mchunkptr p = mem2chunk(oldmem); +//printf("checking state m=%p gm=%p least_addr=%p p=%p, head=%x size=%d pp=%x\n", m, gm, gm->least_addr, p, (unsigned)p->head, (int)chunksize(p), + // (unsigned)(((mchunkptr)((char*)(p) +(chunksize(p))))->prev_foot) + // ); + if (!ok_magic(m)) { +//printf("checking state - oops\n"); + USAGE_ERROR_ACTION(m, oldmem); + return 0; + } +//printf("checking state OK\n"); +#endif /* FOOTERS */ +//printf("to internal realloc m=%p gm=%p, mparams.magic=%x oldmem=%p bytes=%d\n", m, gm, (unsigned)mparams.magic, oldmem, (int)bytes); + return internal_realloc(m, oldmem, bytes); + } +} + +void* dlmemalign(size_t alignment, size_t bytes) { + return internal_memalign(gm, alignment, bytes); +} + +void** dlindependent_calloc(size_t n_elements, size_t elem_size, + void* chunks[]) { + size_t sz = elem_size; /* serves as 1-element array */ + return ialloc(gm, n_elements, &sz, 3, 
chunks); +} + +void** dlindependent_comalloc(size_t n_elements, size_t sizes[], + void* chunks[]) { + return ialloc(gm, n_elements, sizes, 0, chunks); +} + +void* dlvalloc(size_t bytes) { + size_t pagesz; + init_mparams(); + pagesz = mparams.page_size; + return dlmemalign(pagesz, bytes); +} + +void* dlpvalloc(size_t bytes) { + size_t pagesz; + init_mparams(); + pagesz = mparams.page_size; + return dlmemalign(pagesz, (bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE)); +} + +int dlmalloc_trim(size_t pad) { + int result = 0; + if (!PREACTION(gm)) { + result = sys_trim(gm, pad); + POSTACTION(gm); + } + return result; +} + +size_t dlmalloc_footprint(void) { + return gm->footprint; +} + +size_t dlmalloc_max_footprint(void) { + return gm->max_footprint; +} + +#if !NO_MALLINFO +struct mallinfo dlmallinfo(void) { + return internal_mallinfo(gm); +} +#endif /* NO_MALLINFO */ + +void dlmalloc_stats() { + internal_malloc_stats(gm); +} + +size_t dlmalloc_usable_size(void* mem) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); + if (cinuse(p)) + return chunksize(p) - overhead_for(p); + } + return 0; +} + +int dlmallopt(int param_number, int value) { + return change_mparam(param_number, value); +} + +#endif /* !ONLY_MSPACES */ + +/* ----------------------------- user mspaces ---------------------------- */ + +#if MSPACES + +static mstate init_user_mstate(char* tbase, size_t tsize) { + size_t msize = pad_request(sizeof(struct malloc_state)); + mchunkptr mn; + mchunkptr msp = align_as_chunk(tbase); + mstate m = (mstate)(chunk2mem(msp)); + memset(m, 0, msize); + INITIAL_LOCK(&m->mutex); + msp->head = (msize|PINUSE_BIT|CINUSE_BIT); + m->seg.base = m->least_addr = tbase; + m->seg.size = m->footprint = m->max_footprint = tsize; + m->magic = mparams.magic; + m->release_checks = MAX_RELEASE_CHECK_RATE; + m->mflags = mparams.default_mflags; + m->extp = 0; + m->exts = 0; + disable_contiguous(m); + init_bins(m); + mn = next_chunk(mem2chunk(m)); + init_top(m, mn, (size_t)((tbase + 
tsize) - (char*)mn) - TOP_FOOT_SIZE); + check_top_chunk(m, m->top); + return m; +} + +mspace create_mspace(size_t capacity, int locked) { + mstate m = 0; + size_t msize = pad_request(sizeof(struct malloc_state)); + init_mparams(); /* Ensure pagesize etc initialized */ + + if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { + size_t rs = ((capacity == 0)? mparams.granularity : + (capacity + TOP_FOOT_SIZE + msize)); + size_t tsize = granularity_align(rs); + char* tbase = (char*)(CALL_MMAP(tsize)); + if (tbase != CMFAIL) { + m = init_user_mstate(tbase, tsize); + m->seg.sflags = IS_MMAPPED_BIT; + set_lock(m, locked); + } + } + return (mspace)m; +} + +mspace create_mspace_with_base(void* base, size_t capacity, int locked) { + mstate m = 0; + size_t msize = pad_request(sizeof(struct malloc_state)); + init_mparams(); /* Ensure pagesize etc initialized */ + + if (capacity > msize + TOP_FOOT_SIZE && + capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { + m = init_user_mstate((char*)base, capacity); + m->seg.sflags = EXTERN_BIT; + set_lock(m, locked); + } + return (mspace)m; +} + +size_t destroy_mspace(mspace msp) { + size_t freed = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + msegmentptr sp = &ms->seg; + while (sp != 0) { + char* base = sp->base; + size_t size = sp->size; + flag_t flag = sp->sflags; + sp = sp->next; + if ((flag & IS_MMAPPED_BIT) && !(flag & EXTERN_BIT) && + CALL_MUNMAP(base, size) == 0) + freed += size; + } + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return freed; +} + +/* + mspace versions of routines are near-clones of the global + versions. This is not so nice but better than the alternatives. +*/ + + +void* mspace_malloc(mspace msp, size_t bytes) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + if (!PREACTION(ms)) { + void* mem; + size_t nb; + if (bytes <= MAX_SMALL_REQUEST) { + bindex_t idx; + binmap_t smallbits; + nb = (bytes < MIN_REQUEST)? 
MIN_CHUNK_SIZE : pad_request(bytes); + idx = small_index(nb); + smallbits = ms->smallmap >> idx; + + if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */ + mchunkptr b, p; + idx += ~smallbits & 1; /* Uses next bin if idx empty */ + b = smallbin_at(ms, idx); + p = b->fd; + dl_assert(chunksize(p) == small_index2size(idx)); + unlink_first_small_chunk(ms, b, p, idx); + set_inuse_and_pinuse(ms, p, small_index2size(idx)); + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (nb > ms->dvsize) { + if (smallbits != 0) { /* Use chunk in next nonempty smallbin */ + mchunkptr b, p, r; + size_t rsize; + bindex_t i; + binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx)); + binmap_t leastbit = least_bit(leftbits); + compute_bit2idx(leastbit, i); + b = smallbin_at(ms, i); + p = b->fd; + dl_assert(chunksize(p) == small_index2size(i)); + unlink_first_small_chunk(ms, b, p, i); + rsize = small_index2size(i) - nb; + /* Fit here cannot be remainderless if 4byte sizes */ + if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE) + set_inuse_and_pinuse(ms, p, small_index2size(i)); + else { + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + r = chunk_plus_offset(p, nb); + set_size_and_pinuse_of_free_chunk(r, rsize); + replace_dv(ms, r, rsize); + } + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) { + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + } + } + else if (bytes >= MAX_REQUEST) + nb = MAX_SIZE_T; /* Too big to allocate. 
Force failure (in sys alloc) */ + else { + nb = pad_request(bytes); + if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) { + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + } + + if (nb <= ms->dvsize) { + size_t rsize = ms->dvsize - nb; + mchunkptr p = ms->dv; + if (rsize >= MIN_CHUNK_SIZE) { /* split dv */ + mchunkptr r = ms->dv = chunk_plus_offset(p, nb); + ms->dvsize = rsize; + set_size_and_pinuse_of_free_chunk(r, rsize); + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + } + else { /* exhaust dv */ + size_t dvs = ms->dvsize; + ms->dvsize = 0; + ms->dv = 0; + set_inuse_and_pinuse(ms, p, dvs); + } + mem = chunk2mem(p); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + else if (nb < ms->topsize) { /* Split top */ + size_t rsize = ms->topsize -= nb; + mchunkptr p = ms->top; + mchunkptr r = ms->top = chunk_plus_offset(p, nb); + r->head = rsize | PINUSE_BIT; + set_size_and_pinuse_of_inuse_chunk(ms, p, nb); + mem = chunk2mem(p); + check_top_chunk(ms, ms->top); + check_malloced_chunk(ms, mem, nb); + goto postaction; + } + + mem = sys_alloc(ms, nb); + + postaction: + POSTACTION(ms); + return mem; + } + + return 0; +} + +void mspace_free(mspace msp, void* mem) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); +#if FOOTERS + mstate fm = get_mstate_for(p); +#else /* FOOTERS */ + mstate fm = (mstate)msp; +#endif /* FOOTERS */ + if (!ok_magic(fm)) { + USAGE_ERROR_ACTION(fm, p); + return; + } + if (!PREACTION(fm)) { + check_inuse_chunk(fm, p); + if (RTCHECK(ok_address(fm, p) && ok_cinuse(p))) { + size_t psize = chunksize(p); + mchunkptr next = chunk_plus_offset(p, psize); + if (!pinuse(p)) { + size_t prevsize = p->prev_foot; + if ((prevsize & IS_MMAPPED_BIT) != 0) { + prevsize &= ~IS_MMAPPED_BIT; + psize += prevsize + MMAP_FOOT_PAD; + if (CALL_MUNMAP((char*)p - prevsize, psize) == 0) + fm->footprint -= psize; + goto postaction; + } + else { + mchunkptr prev = chunk_minus_offset(p, prevsize); + psize += prevsize; + p = prev; + if 
(RTCHECK(ok_address(fm, prev))) { /* consolidate backward */ + if (p != fm->dv) { + unlink_chunk(fm, p, prevsize); + } + else if ((next->head & INUSE_BITS) == INUSE_BITS) { + fm->dvsize = psize; + set_free_with_pinuse(p, psize, next); + goto postaction; + } + } + else + goto erroraction; + } + } + + if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) { + if (!cinuse(next)) { /* consolidate forward */ + if (next == fm->top) { + size_t tsize = fm->topsize += psize; + fm->top = p; + p->head = tsize | PINUSE_BIT; + if (p == fm->dv) { + fm->dv = 0; + fm->dvsize = 0; + } + if (should_trim(fm, tsize)) + sys_trim(fm, 0); + goto postaction; + } + else if (next == fm->dv) { + size_t dsize = fm->dvsize += psize; + fm->dv = p; + set_size_and_pinuse_of_free_chunk(p, dsize); + goto postaction; + } + else { + size_t nsize = chunksize(next); + psize += nsize; + unlink_chunk(fm, next, nsize); + set_size_and_pinuse_of_free_chunk(p, psize); + if (p == fm->dv) { + fm->dvsize = psize; + goto postaction; + } + } + } + else + set_free_with_pinuse(p, psize, next); + + if (is_small(psize)) { + insert_small_chunk(fm, p, psize); + check_free_chunk(fm, p); + } + else { + tchunkptr tp = (tchunkptr)p; + insert_large_chunk(fm, tp, psize); + check_free_chunk(fm, p); + if (--fm->release_checks == 0) + release_unused_segments(fm); + } + goto postaction; + } + } + erroraction: + USAGE_ERROR_ACTION(fm, p); + postaction: + POSTACTION(fm); + } + } +} + +void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) { + void* mem; + size_t req = 0; + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + if (n_elements != 0) { + req = n_elements * elem_size; + if (((n_elements | elem_size) & ~(size_t)0xffff) && + (req / n_elements != elem_size)) + req = MAX_SIZE_T; /* force downstream failure on overflow */ + } + mem = internal_malloc(ms, req); + if (mem != 0 && calloc_must_clear(mem2chunk(mem))) + memset(mem, 0, req); + return mem; +} + +void* 
mspace_realloc(mspace msp, void* oldmem, size_t bytes) { + if (oldmem == 0) + return mspace_malloc(msp, bytes); +#ifdef REALLOC_ZERO_BYTES_FREES + if (bytes == 0) { + mspace_free(msp, oldmem); + return 0; + } +#endif /* REALLOC_ZERO_BYTES_FREES */ + else { +#if FOOTERS + mchunkptr p = mem2chunk(oldmem); + mstate ms = get_mstate_for(p); +#else /* FOOTERS */ + mstate ms = (mstate)msp; +#endif /* FOOTERS */ + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return internal_realloc(ms, oldmem, bytes); + } +} + +void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return internal_memalign(ms, alignment, bytes); +} + +void** mspace_independent_calloc(mspace msp, size_t n_elements, + size_t elem_size, void* chunks[]) { + size_t sz = elem_size; /* serves as 1-element array */ + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return ialloc(ms, n_elements, &sz, 3, chunks); +} + +void** mspace_independent_comalloc(mspace msp, size_t n_elements, + size_t sizes[], void* chunks[]) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + return 0; + } + return ialloc(ms, n_elements, sizes, 0, chunks); +} + +int mspace_trim(mspace msp, size_t pad) { + int result = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + if (!PREACTION(ms)) { + result = sys_trim(ms, pad); + POSTACTION(ms); + } + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + +void mspace_malloc_stats(mspace msp) { + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + internal_malloc_stats(ms); + } + else { + USAGE_ERROR_ACTION(ms,ms); + } +} + +size_t mspace_footprint(mspace msp) { + size_t result = 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + result = ms->footprint; + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + + +size_t mspace_max_footprint(mspace msp) { + size_t result 
= 0; + mstate ms = (mstate)msp; + if (ok_magic(ms)) { + result = ms->max_footprint; + } + else { + USAGE_ERROR_ACTION(ms,ms); + } + return result; +} + + +#if !NO_MALLINFO +struct mallinfo mspace_mallinfo(mspace msp) { + mstate ms = (mstate)msp; + if (!ok_magic(ms)) { + USAGE_ERROR_ACTION(ms,ms); + } + return internal_mallinfo(ms); +} +#endif /* NO_MALLINFO */ + +size_t mspace_usable_size(void* mem) { + if (mem != 0) { + mchunkptr p = mem2chunk(mem); + if (cinuse(p)) + return chunksize(p) - overhead_for(p); + } + return 0; +} + +int mspace_mallopt(int param_number, int value) { + return change_mparam(param_number, value); +} + +#endif /* MSPACES */ + +/* -------------------- Alternative MORECORE functions ------------------- */ + +/* + Guidelines for creating a custom version of MORECORE: + + * For best performance, MORECORE should allocate in multiples of pagesize. + * MORECORE may allocate more memory than requested. (Or even less, + but this will usually result in a malloc failure.) + * MORECORE must not allocate memory when given argument zero, but + instead return one past the end address of memory from previous + nonzero call. + * For best performance, consecutive calls to MORECORE with positive + arguments should return increasing addresses, indicating that + space has been contiguously extended. + * Even though consecutive calls to MORECORE need not return contiguous + addresses, it must be OK for malloc'ed chunks to span multiple + regions in those cases where they do happen to be contiguous. + * MORECORE need not handle negative arguments -- it may instead + just return MFAIL when given negative arguments. + Negative arguments are always multiples of pagesize. MORECORE + must not misinterpret negative args as large positive unsigned + args. You can suppress all such calls from even occurring by defining + MORECORE_CANNOT_TRIM, + + As an example alternative MORECORE, here is a custom allocator + kindly contributed for pre-OSX macOS. 
It uses virtually but not + necessarily physically contiguous non-paged memory (locked in, + present and won't get swapped out). You can use it by uncommenting + this section, adding some #includes, and setting up the appropriate + defines above: + + #define MORECORE osMoreCore + + There is also a shutdown routine that should somehow be called for + cleanup upon program exit. + + #define MAX_POOL_ENTRIES 100 + #define MINIMUM_MORECORE_SIZE (64 * 1024U) + static int next_os_pool; + void *our_os_pools[MAX_POOL_ENTRIES]; + + void *osMoreCore(int size) + { + void *ptr = 0; + static void *sbrk_top = 0; + + if (size > 0) + { + if (size < MINIMUM_MORECORE_SIZE) + size = MINIMUM_MORECORE_SIZE; + if (CurrentExecutionLevel() == kTaskLevel) + ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0); + if (ptr == 0) + { + return (void *) MFAIL; + } + // save ptrs so they can be freed during cleanup + our_os_pools[next_os_pool] = ptr; + next_os_pool++; + ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK); + sbrk_top = (char *) ptr + size; + return ptr; + } + else if (size < 0) + { + // we don't currently support shrink behavior + return (void *) MFAIL; + } + else + { + return sbrk_top; + } + } + + // cleanup any allocated memory pools + // called as last thing before shutting down driver + + void osCleanupMem(void) + { + void **ptr; + + for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++) + if (*ptr) + { + PoolDeallocate(*ptr); + *ptr = 0; + } + } + +*/ + + +/* ----------------------------------------------------------------------- +History: + V2.8.4 (not yet released) + * Fix bad error check in mspace_footprint + * Adaptations for ptmalloc, courtesy of Wolfram Gloger. 
+ * Reentrant spin locks, courtesy of Earl Chew and others + * Win32 improvements, courtesy of Niall Douglas and Earl Chew + * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options + * Various small adjustments to reduce warnings on some compilers + * Extension hook in malloc_state + + V2.8.3 Thu Sep 22 11:16:32 2005 Doug Lea (dl at gee) + * Add max_footprint functions + * Ensure all appropriate literals are size_t + * Fix conditional compilation problem for some #define settings + * Avoid concatenating segments with the one provided + in create_mspace_with_base + * Rename some variables to avoid compiler shadowing warnings + * Use explicit lock initialization. + * Better handling of sbrk interference. + * Simplify and fix segment insertion, trimming and mspace_destroy + * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x + * Thanks especially to Dennis Flanagan for help on these. + + V2.8.2 Sun Jun 12 16:01:10 2005 Doug Lea (dl at gee) + * Fix memalign brace error. + + V2.8.1 Wed Jun 8 16:11:46 2005 Doug Lea (dl at gee) + * Fix improper #endif nesting in C++ + * Add explicit casts needed for C++ + + V2.8.0 Mon May 30 14:09:02 2005 Doug Lea (dl at gee) + * Use trees for large bins + * Support mspaces + * Use segments to unify sbrk-based and mmap-based system allocation, + removing need for emulation on most platforms without sbrk. + * Default safety checks + * Optional footer checks. Thanks to William Robertson for the idea. + * Internal code refactoring + * Incorporate suggestions and platform-specific changes. + Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas, + Aaron Bachmann, Emery Berger, and others. + * Speed up non-fastbin processing enough to remove fastbins. + * Remove useless cfree() to avoid conflicts with other apps. + * Remove internal memcpy, memset. Compilers handle builtins better. + * Remove some options that no one ever used and rename others. 
+ + V2.7.2 Sat Aug 17 09:07:30 2002 Doug Lea (dl at gee) + * Fix malloc_state bitmap array misdeclaration + + V2.7.1 Thu Jul 25 10:58:03 2002 Doug Lea (dl at gee) + * Allow tuning of FIRST_SORTED_BIN_SIZE + * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte. + * Better detection and support for non-contiguousness of MORECORE. + Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger + * Bypass most of malloc if no frees. Thanks to Emery Berger. + * Fix freeing of old top non-contiguous chunk in sysmalloc. + * Raised default trim and map thresholds to 256K. + * Fix mmap-related #defines. Thanks to Lubos Lunak. + * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield. + * Branch-free bin calculation + * Default trim and mmap thresholds now 256K. + + V2.7.0 Sun Mar 11 14:14:06 2001 Doug Lea (dl at gee) + * Introduce independent_comalloc and independent_calloc. + Thanks to Michael Pachos for motivation and help. + * Make optional .h file available + * Allow > 2GB requests on 32bit systems. + * new WIN32 sbrk, mmap, munmap, lock code from <Walter@GeNeSys-e.de>. + Thanks also to Andreas Mueller <a.mueller at paradatec.de>, + and Anonymous. + * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for + helping test this.) + * memalign: check alignment arg + * realloc: don't try to shift chunks backwards, since this + leads to more fragmentation in some programs and doesn't + seem to help in any others. + * Collect all cases in malloc requiring system memory into sysmalloc + * Use mmap as backup to sbrk + * Place all internal state in malloc_state + * Introduce fastbins (although similar to 2.5.1) + * Many minor tunings and cosmetic improvements + * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK + * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS + Thanks to Tony E. Bennett and others. + * Include errno.h to support default failure action.
+ + V2.6.6 Sun Dec 5 07:42:19 1999 Doug Lea (dl at gee) + * return null for negative arguments + * Added several WIN32 cleanups from Martin C. Fong + * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h' + (e.g. WIN32 platforms) + * Cleanup header file inclusion for WIN32 platforms + * Cleanup code to avoid Microsoft Visual C++ compiler complaints + * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing + memory allocation routines + * Set 'malloc_getpagesize' for WIN32 platforms (needs more work) + * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to + usage of 'assert' in non-WIN32 code + * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to + avoid infinite loop + * Always call 'fREe()' rather than 'free()' + + V2.6.5 Wed Jun 17 15:57:31 1998 Doug Lea (dl at gee) + * Fixed ordering problem with boundary-stamping + + V2.6.3 Sun May 19 08:17:58 1996 Doug Lea (dl at gee) + * Added pvalloc, as recommended by H.J. Liu + * Added 64bit pointer support mainly from Wolfram Gloger + * Added anonymously donated WIN32 sbrk emulation + * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen + * malloc_extend_top: fix mask error that caused wastage after + foreign sbrks + * Add linux mremap support code from HJ Liu + + V2.6.2 Tue Dec 5 06:52:55 1995 Doug Lea (dl at gee) + * Integrated most documentation with the code. + * Add support for mmap, with help from + Wolfram Gloger (Gloger@lrz.uni-muenchen.de). + * Use last_remainder in more cases. + * Pack bins using idea from colin@nyx10.cs.du.edu + * Use ordered bins instead of best-fit threshold + * Eliminate block-local decls to simplify tracing and debugging. + * Support another case of realloc via move into top + * Fix error occurring when initial sbrk_base not word-aligned. + * Rely on page size for units instead of SBRK_UNIT to + avoid surprises about sbrk alignment conventions. + * Add mallinfo, mallopt. 
Thanks to Raymond Nijssen + (raymond@es.ele.tue.nl) for the suggestion. + * Add `pad' argument to malloc_trim and top_pad mallopt parameter. + * More precautions for cases where other routines call sbrk, + courtesy of Wolfram Gloger (Gloger@lrz.uni-muenchen.de). + * Added macros etc., allowing use in linux libc from + H.J. Lu (hjl@gnu.ai.mit.edu) + * Inverted this history list + + V2.6.1 Sat Dec 2 14:10:57 1995 Doug Lea (dl at gee) + * Re-tuned and fixed to behave more nicely with V2.6.0 changes. + * Removed all preallocation code since under current scheme + the work required to undo bad preallocations exceeds + the work saved in good cases for most test programs. + * No longer use return list or unconsolidated bins since + no scheme using them consistently outperforms those that don't + given above changes. + * Use best fit for very large chunks to prevent some worst-cases. + * Added some support for debugging + + V2.6.0 Sat Nov 4 07:05:23 1995 Doug Lea (dl at gee) + * Removed footers when chunks are in use. Thanks to + Paul Wilson (wilson@cs.texas.edu) for the suggestion. + + V2.5.4 Wed Nov 1 07:54:51 1995 Doug Lea (dl at gee) + * Added malloc_trim, with help from Wolfram Gloger + (wmglo@Dent.MED.Uni-Muenchen.DE). 
+ + V2.5.3 Tue Apr 26 10:16:01 1994 Doug Lea (dl at g) + + V2.5.2 Tue Apr 5 16:20:40 1994 Doug Lea (dl at g) + * realloc: try to expand in both directions + * malloc: swap order of clean-bin strategy; + * realloc: only conditionally expand backwards + * Try not to scavenge used bins + * Use bin counts as a guide to preallocation + * Occasionally bin return list chunks in first scan + * Add a few optimizations from colin@nyx10.cs.du.edu + + V2.5.1 Sat Aug 14 15:40:43 1993 Doug Lea (dl at g) + * faster bin computation & slightly different binning + * merged all consolidations to one part of malloc proper + (eliminating old malloc_find_space & malloc_clean_bin) + * Scan 2 returns chunks (not just 1) + * Propagate failure in realloc if malloc returns 0 + * Add stuff to allow compilation on non-ANSI compilers + from kpv@research.att.com + + V2.5 Sat Aug 7 07:41:59 1993 Doug Lea (dl at g.oswego.edu) + * removed potential for odd address access in prev_chunk + * removed dependency on getpagesize.h + * misc cosmetics and a bit more internal documentation + * anticosmetics: mangled names in macros to evade debugger strangeness + * tested on sparc, hp-700, dec-mips, rs6000 + with gcc & native cc (hp, dec only) allowing + Detlefs & Zorn comparison study (in SIGPLAN Notices.) + + Trial version Fri Aug 28 13:14:29 1992 Doug Lea (dl at g.oswego.edu) + * Based loosely on libg++-1.2X malloc. (It retains some of the overall + structure of old version, but most details differ.) 
+ +*/ + + diff --git a/oshmem/mca/memheap/ptmalloc/malloc_defs.h b/oshmem/mca/memheap/ptmalloc/malloc_defs.h new file mode 100644 index 0000000000..cd08a1b7bb --- /dev/null +++ b/oshmem/mca/memheap/ptmalloc/malloc_defs.h @@ -0,0 +1,32 @@ +#ifndef _MALLOC_DEFS_H +#define _MALLOC_DEFS_H + +#include "oshmem/runtime/runtime.h" + +/* See malloc.c for detailed parameter description */ +#define USE_SPIN_LOCKS 0 +#define USE_DL_PREFIX +#define ABORT oshmem_shmem_abort(-2) +//#define ABORT abort() +#define MORECORE mca_memheap_ptmalloc_sbrk +#define MORECORE_CANNOT_TRIM +#define DL_HAVE_MMAP 0 +#define DL_HAVE_MREMAP 0 +#define malloc_getpagesize mca_memheap_ptmalloc_getpagesize() +#define REALLOC_ZERO_BYTES_FREES +#define ABORT_ON_ASSERT_FAILURE 1 +/* next two are useful for debugging */ +#define DL_DEBUG 0 +#define FOOTERS 0 +/* print error if *alloc() is called with incorrect params */ +#define USAGE_ERROR_ACTION(m, p) do { printf("PTMALLOC: USAGE ERROR DETECTED: m=%p ptr=%p\n", m, p); } while (0) + +int mca_memheap_ptmalloc_getpagesize(void); +void *mca_memheap_ptmalloc_sbrk(size_t size); + +void* dlmalloc(size_t); +void dlfree(void*); +void* dlrealloc(void*, size_t); +void* dlmemalign(size_t, size_t); + +#endif diff --git a/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.c b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.c new file mode 100644 index 0000000000..1e190f3a4f --- /dev/null +++ b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.c @@ -0,0 +1,184 @@ +/* Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h" +#include "oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h" +#include "oshmem/mca/memheap/base/base.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "opal/class/opal_hash_table.h" +#include "opal/class/opal_object.h" +#include "orte/util/name_fns.h" + + +mca_memheap_ptmalloc_module_t memheap_ptmalloc = { + { + &mca_memheap_ptmalloc_component, + mca_memheap_ptmalloc_finalize, + mca_memheap_ptmalloc_alloc, + mca_memheap_ptmalloc_align, + mca_memheap_ptmalloc_realloc, + mca_memheap_ptmalloc_free, + + mca_memheap_ptmalloc_alloc, + mca_memheap_ptmalloc_free, + + mca_memheap_base_get_cached_mkey, + mca_memheap_base_get_mkey, + mca_memheap_base_find_offset, + mca_memheap_base_is_symmetric_addr, + mca_memheap_modex_recv_all, + + 0 + }, + 100 /* priority */ +}; + + +/* Memory Heap ptmalloc Implementation */ +/** + * Initialize the Memory Heap + * + * Constructs the allocator lock and records the heap layout taken from + * the context: base address, zero bytes handed out so far, total size + * (user + private) and the largest single allocation (user size). + * + * Returns OSHMEM_ERR_BAD_PARAM when context is NULL or when either the + * user or the private size is zero; OSHMEM_SUCCESS otherwise. + */ +int mca_memheap_ptmalloc_module_init(memheap_context_t *context) +{ + if (!context || !context->user_size || !context->private_size) { + return OSHMEM_ERR_BAD_PARAM; + } + + /* Construct a mutex object */ + OBJ_CONSTRUCT(&memheap_ptmalloc.lock, opal_mutex_t); + memheap_ptmalloc.base = context->user_base_addr; + memheap_ptmalloc.cur_size = 0; + memheap_ptmalloc.max_size = context->user_size + context->private_size; + memheap_ptmalloc.max_alloc_size = context->user_size; + + MEMHEAP_VERBOSE(1, "symmetric heap memory (user+private): %llu bytes", + (unsigned long long)(context->user_size + context->private_size)); + + /* disable till we figure out double modex&grpcomm.bad problem */ + // memheap_modex_mkey_exchange(); + + return OSHMEM_SUCCESS; + +} + + +/** + * Allocate size bytes on the symmetric heap. + * The allocated variable is aligned to its size. + * NOTE(review): the alignment remark looks inherited from another + * allocator; this implementation simply calls dlmalloc(size) -- confirm.
+ */ +int mca_memheap_ptmalloc_alloc(size_t size, void** p_buff) +{ + if (size > memheap_ptmalloc.max_alloc_size) { + *p_buff = 0; + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + OPAL_THREAD_LOCK(&memheap_ptmalloc.lock); + *p_buff = dlmalloc(size); + OPAL_THREAD_UNLOCK(&memheap_ptmalloc.lock); + + if (NULL == *p_buff) + return OSHMEM_ERROR; + + return OSHMEM_SUCCESS; +} + + +int mca_memheap_ptmalloc_align(size_t align, size_t size, void **p_buff) +{ + if (size > memheap_ptmalloc.max_alloc_size) { + *p_buff = 0; + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + if (align == 0) { + *p_buff = 0; + return OSHMEM_ERROR; + } + + /* check that align is power of 2 */ + if (align & (align - 1)) { + *p_buff = 0; + return OSHMEM_ERROR; + } + + OPAL_THREAD_LOCK(&memheap_ptmalloc.lock); + *p_buff = dlmemalign(align, size); + OPAL_THREAD_UNLOCK(&memheap_ptmalloc.lock); + + if (NULL == *p_buff) + return OSHMEM_ERROR; + + return OSHMEM_SUCCESS; +} + +int mca_memheap_ptmalloc_realloc(size_t new_size, void *p_buff, void **p_new_buff) +{ + if (new_size > memheap_ptmalloc.max_alloc_size) { + *p_new_buff = 0; + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + OPAL_THREAD_LOCK(&memheap_ptmalloc.lock); + *p_new_buff = dlrealloc(p_buff, new_size); + OPAL_THREAD_UNLOCK(&memheap_ptmalloc.lock); + + if (!*p_new_buff) + return OSHMEM_ERR_OUT_OF_RESOURCE; + + return OSHMEM_SUCCESS; +} + +/* + * Free a variable allocated on the + * symmetric heap. 
+ */ +int mca_memheap_ptmalloc_free(void* ptr) +{ + OPAL_THREAD_LOCK(&memheap_ptmalloc.lock); + dlfree(ptr); + OPAL_THREAD_UNLOCK(&memheap_ptmalloc.lock); + return OSHMEM_SUCCESS; +} + + +/* + * Finalize the module: only emits a verbose message. + */ +int mca_memheap_ptmalloc_finalize(void) +{ + MEMHEAP_VERBOSE(5, "deregistering symmetric heap"); + return OSHMEM_SUCCESS; +} + +/* + * Page size reported to dlmalloc through malloc_getpagesize: fixed 2 MiB. + */ +int mca_memheap_ptmalloc_getpagesize(void) +{ + return 2 * 1024 * 1024; +} + +/* must be same as in malloc.c */ +#define PTMALLOC_MAX_SIZE_T (~(size_t)0) +#define PTMALLOC_MFAIL ((void*)(PTMALLOC_MAX_SIZE_T)) + +/* + * sbrk-like hook used by dlmalloc as MORECORE. + * + * Hands out the next size bytes of the region that starts at + * memheap_ptmalloc.base and advances cur_size accordingly. + * + * Returns the start of the new portion, or PTMALLOC_MFAIL when the + * request does not fit into the remaining max_size - cur_size bytes. + */ +void *mca_memheap_ptmalloc_sbrk(size_t size) +{ + char *ret; + + /* Compare against the remaining space: cur_size + size could wrap + * around for a huge size and wrongly pass a sum-based check. + * cur_size never exceeds max_size, so the subtraction cannot underflow. */ + if (size > memheap_ptmalloc.max_size - memheap_ptmalloc.cur_size) { + return PTMALLOC_MFAIL; + } + + ret = (char *)memheap_ptmalloc.base + memheap_ptmalloc.cur_size; + memheap_ptmalloc.cur_size += size; + + return ret; +} + + + diff --git a/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h new file mode 100644 index 0000000000..7786a268e9 --- /dev/null +++ b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h @@ -0,0 +1,75 @@ +/** + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * ptmalloc-based implementation of the symmetric heap (memheap) module + */ +#ifndef MCA_MEMHEAP_PTMALLOC_H +#define MCA_MEMHEAP_PTMALLOC_H + +#include "oshmem_config.h" +#include "opal/mca/mca.h" +#include "opal/class/opal_list.h" +#include "opal/threads/mutex.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/mca/spml/spml.h" +#include "opal/class/opal_hash_table.h" +#include "ompi/mca/btl/btl.h" +#include +#include +#include + + +BEGIN_C_DECLS + +#include "malloc_defs.h" +/* + * At the moment we use only dlmalloc part of the ptmalloc3. Thread safety is implemented by using locks on + * alloc operations. Since all shmem alloc ops are collectives, malloc performance is not a problem. 
So it makes + * sense to use a simpler algorithm. + * + * Heap is allocated in one chunk, and we implement our own sbrk-like function that serves portions of the memory + * to malloc. + * + * At the moment we do not support growing/returning heap based memory to OS. + */ + +/* Structure for managing shmem symmetric heap */ +struct mca_memheap_ptmalloc_module_t { + mca_memheap_base_module_t super; + int priority; /** Module's Priority */ + void *base; /** Start of the heap region served by the sbrk-like hook */ + size_t cur_size; /** Bytes of the region already handed out to malloc */ + size_t max_size; /** Total size of the region (user + private) */ + size_t max_alloc_size; /** Largest single request accepted (user size) */ + opal_mutex_t lock; /** Serializes the dl* allocator calls */ +}; + +typedef struct mca_memheap_ptmalloc_module_t mca_memheap_ptmalloc_module_t; +OSHMEM_DECLSPEC extern mca_memheap_ptmalloc_module_t memheap_ptmalloc; + + +/* + * ptmalloc interface. + * Each call returns an OSHMEM_* status code; allocated buffers are + * returned through the void** argument. + */ +OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_module_init(memheap_context_t *); +OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_alloc(size_t, void**); +OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_realloc(size_t, void*, void **); +OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_align(size_t, size_t, void**); +OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_free(void*); +OSHMEM_DECLSPEC extern int mca_memheap_ptmalloc_finalize(void); + + + +END_C_DECLS + +#endif /* MCA_MEMHEAP_PTMALLOC_H */ diff --git a/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.c b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.c new file mode 100644 index 0000000000..f05c6901a8 --- /dev/null +++ b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "oshmem_config.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/util/output.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/mca/memheap/ptmalloc/memheap_ptmalloc.h" +#include "memheap_ptmalloc_component.h" + + +static int mca_memheap_ptmalloc_component_close(void); +static mca_memheap_base_module_t* mca_memheap_ptmalloc_component_init( memheap_context_t *, int * ); + + +static int __basic_open(void); + +mca_memheap_base_component_t mca_memheap_ptmalloc_component = { + { + MCA_MEMHEAP_BASE_VERSION_2_0_0, + + "ptmalloc", /* MCA component name */ + OSHMEM_MAJOR_VERSION, /* MCA component major version */ + OSHMEM_MINOR_VERSION, /* MCA component minor version */ + OSHMEM_RELEASE_VERSION, /* MCA component release version */ + + __basic_open, + mca_memheap_ptmalloc_component_close, + NULL + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + mca_memheap_ptmalloc_component_init +}; + +/* Open component: nothing to set up, always succeeds */ +static int __basic_open(void) +{ + return OSHMEM_SUCCESS; +} + +/* Initialize component: report the module priority through *priority, initialize the module from context, and return its base module (NULL if module initialization fails) */ +mca_memheap_base_module_t* mca_memheap_ptmalloc_component_init(memheap_context_t *context, int *priority) +{ + int rc; + + *priority = memheap_ptmalloc.priority; + rc = mca_memheap_ptmalloc_module_init(context); + if (OSHMEM_SUCCESS != rc) { + return NULL; + } + + return &(memheap_ptmalloc.super); +} + +/* + * This function is automatically called from mca_base_components_close. + * It finalizes the ptmalloc memheap module.
+ */ +int mca_memheap_ptmalloc_component_close() +{ + mca_memheap_ptmalloc_finalize(); + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h new file mode 100644 index 0000000000..f05939f646 --- /dev/null +++ b/oshmem/mca/memheap/ptmalloc/memheap_ptmalloc_component.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_MEMHEAP_PTMALLOC_COMPONENT_H +#define MCA_MEMHEAP_PTMALLOC_COMPONENT_H + +BEGIN_C_DECLS + +/* + * MEMHEAP module functions. + */ +OSHMEM_MODULE_DECLSPEC extern mca_memheap_base_component_2_0_0_t mca_memheap_ptmalloc_component; + +END_C_DECLS + +#endif diff --git a/oshmem/mca/scoll/Makefile.am b/oshmem/mca/scoll/Makefile.am new file mode 100644 index 0000000000..1c9d730467 --- /dev/null +++ b/oshmem/mca/scoll/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_scoll.la +libmca_scoll_la_SOURCES = + +# header setup +nobase_oshmem_HEADERS = +nobase_nodist_oshmem_HEADERS = + +# local files +headers = scoll.h +libmca_scoll_la_SOURCES += $(headers) $(nodist_headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +nobase_oshmem_HEADERS += $(headers) +nobase_nodist_oshmem_HEADERS += $(nodist_headers) +oshmemdir = $(includedir)/oshmem/oshmem/mca/scoll +else +oshmemdir = $(includedir) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/oshmem/mca/scoll/base/Makefile.am b/oshmem/mca/scoll/base/Makefile.am new file mode 100644 index 0000000000..985a7caef2 --- /dev/null +++ b/oshmem/mca/scoll/base/Makefile.am @@ -0,0 +1,20 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +headers += \ + base/base.h + +libmca_scoll_la_SOURCES += \ + base/scoll_base_close.c \ + base/scoll_base_available.c \ + base/scoll_base_select.c \ + base/scoll_base_open.c diff --git a/oshmem/mca/scoll/base/base.h b/oshmem/mca/scoll/base/base.h new file mode 100644 index 0000000000..5848569599 --- /dev/null +++ b/oshmem/mca/scoll/base/base.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#ifndef MCA_SCOLL_BASE_H +#define MCA_SCOLL_BASE_H + +#include "oshmem_config.h" + +#include "oshmem/mca/memheap/memheap.h" +#include "opal/class/opal_list.h" + + +/* + * Global functions for MCA overall collective open and close + */ + +BEGIN_C_DECLS + +/** + * Initialize the coll MCA framework + * + * @retval OSHMEM_SUCCESS Upon success + * @retval OSHMEM_ERROR Upon failure + * + * This must be the first function invoked in the coll MCA + * framework. It initializes the coll MCA framework, finds and + * opens coll components, etc. + * + * This function is invoked during oshmem_shmem_init() and during the + * initialization of the special case of the laminfo command. + * + * This function fills in the internal global variable + * mca_scoll_base_components_opened, which is a list of all coll components + * that were successfully opened. This variable should \em only be + * used by other coll base functions -- it is not considered a + * public interface member -- and is only mentioned here for + * completeness. + */ +OSHMEM_DECLSPEC int mca_scoll_base_open(void); + +/** + * Create list of available coll components. + * + * @param enable_progress_threads Passed unchanged to the scoll_init() + * query of every opened component. + * @param enable_threads Passed unchanged to the scoll_init() + * query of every opened component. + * + * @retval OSHMEM_SUCCESS If one or more coll components are available. + * @retval OSHMEM_ERROR If no coll components are found to be available. + * + * This function is invoked during oshmem_shmem_init() to query all + * successfully opened coll components and create a list of all + * available coll components. + * + * This function traverses the (internal global variable) + * mca_scoll_base_components_opened list and queries each component to see + * if it ever might want to run during this SHMEM process. 
It creates + * another internal global variable list named + * mca_scoll_base_components_available, consisting of a list of components + * that are available for selection when communicators are created. + * This variable should \em only be used by other coll base + * functions -- it is not considered a public interface member -- + * and is only mentioned here for completeness. + */ +OSHMEM_DECLSPEC int mca_scoll_base_find_available(bool enable_progress_threads, + bool enable_threads); + + +/** + * Select an available component for a new communicator. + * + * @param group Group that the component will be selected for. + * NOTE(review): the description below still says + * communicator; presumably it means this group -- confirm. + * + * @return OSHMEM_SUCCESS Upon success. + * @return OSHMEM_ERROR Upon failure. + * + * This function is invoked when a new communicator is created and a + * coll component needs to be selected for it. It should be invoked + * near the end of the communicator creation process such that + * almost everything else is functional on the communicator (e.g., + * point-to-point communication). + * + * Note that new communicators may be created as a result of + * invoking this function. Specifically: this function is called in + * the depths of communicator creation, but during the execution of + * this function, new communicators may be created, and therefore + * communicator creation functions may be re-entered (albeit with + * different arguments). + */ +OSHMEM_DECLSPEC int mca_scoll_base_select(struct oshmem_group_t *group); + +/** + * Finalize a coll component on a specific communicator. + * + * @param group The group whose coll component is being finalized. + * + * @retval OSHMEM_SUCCESS Always. + * + * This function is invoked near the beginning of the destruction of + * a communicator. It finalizes the coll component associated with the + * communicator (e.g., allowing the component to clean up and free any + * resources allocated for that communicator). 
Note that similar to + * mca_coll_base_group_select(), as result of this function, other + * communicators may also be destroyed. + */ +int mca_scoll_base_group_unselect(struct oshmem_group_t *group); + +/** + * Shut down the coll MCA framework. + * + * @retval OSHMEM_SUCCESS Always + * + * This function shuts down everything in the coll MCA framework, + * and is called during oshmem_shmem_finalize(). + * + * It must be the last function invoked on the coll MCA framework. + */ +OSHMEM_DECLSPEC int mca_scoll_base_close(void); + + +/* + * Globals + */ + + +/** + * Special synchronization array to do barrier all. + */ +OSHMEM_DECLSPEC extern long* mca_scoll_sync_array; + +OSHMEM_DECLSPEC int mca_scoll_enable(void) ; + +OSHMEM_DECLSPEC void mca_scoll_disable(void); + + +/** + * SCOLL framework debugging stream ID used with opal_output() and + * opal_output_verbose(). + */ +OSHMEM_DECLSPEC extern int mca_scoll_base_output; + +/** + * Indicator as to whether the list of opened coll components is valid or + * not. + */ +extern bool mca_scoll_base_components_opened_valid; + +/** + * List of all opened components; created when the coll framework is + * initialized and destroyed when we reduce the list to all available + * coll components. + */ +OSHMEM_DECLSPEC extern opal_list_t mca_scoll_base_components_opened; + +/** + * Indicator as to whether the list of available coll components is valid + * or not. + */ +extern bool mca_scoll_base_components_available_valid; + +/** + * List of all available components; created by reducing the list of open + * components to all those who indicate that they may run during this + * process. + */ +extern opal_list_t mca_scoll_base_components_available; + +/* ******************************************************************** */ +#ifdef __BASE_FILE__ +#define __SCOLL_FILE__ __BASE_FILE__ +#else +#define __SCOLL_FILE__ __FILE__ +#endif + +#define SCOLL_VERBOSE(level, format, ...) 
\ + opal_output_verbose(level, mca_scoll_base_output, "%s:%d - %s() " format, \ + __SCOLL_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define SCOLL_ERROR(format, ... ) \ + opal_output_verbose(0, mca_scoll_base_output, "Error: %s:%d - %s() " format, \ + __SCOLL_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +END_C_DECLS + +#endif /* MCA_SCOLL_BASE_H */ diff --git a/oshmem/mca/scoll/base/scoll_base_available.c b/oshmem/mca/scoll/base/scoll_base_available.c new file mode 100644 index 0000000000..89165d987e --- /dev/null +++ b/oshmem/mca/scoll/base/scoll_base_available.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include + +#include "oshmem_config.h" + +#include "orte/util/show_help.h" + +#include "opal/class/opal_list.h" +#include "opal/util/output.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_component_repository.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" + + +/* + * Global variables + */ +bool mca_scoll_base_components_available_valid = false; +opal_list_t mca_scoll_base_components_available; + + +/* + * Private functions + */ +static int init_query(const mca_base_component_t * ls, + mca_base_component_priority_list_item_t * entry, + bool enable_progress_threads, + bool enable_threads); + +/* + * Scan down the list of successfully opened components and query each of + * them (the opened list will be one or more components. If the user + * requested a specific component, it will be the only component in the + * opened list). Create and populate the available list of all + * components who indicate that they want to be considered for selection. + * Close all components who do not want to be considered for selection, + * and destroy the opened list. 
+ * + * Also find the basic component while we're doing all of this, and save + * it in a global variable so that we can find it easily later (e.g., + * during scope selection). + */ +int mca_scoll_base_find_available(bool enable_progress_threads, + bool enable_threads) +{ + bool found = false; + mca_base_component_priority_list_item_t *entry; + opal_list_item_t *p; + const mca_base_component_t *component; + + /* Initialize the list */ + + OBJ_CONSTRUCT(&mca_scoll_base_components_available, opal_list_t); + mca_scoll_base_components_available_valid = true; + + /* The list of components that we should check has already been + established in mca_coll_base_open. */ + + for (found = false, + p = opal_list_remove_first(&mca_scoll_base_components_opened); + p != NULL; + p = opal_list_remove_first(&mca_scoll_base_components_opened)) { + component = ((mca_base_component_list_item_t *) p)->cli_component; + + /* Call a subroutine to do the work, because the component may + represent different versions of the coll MCA. */ + + entry = OBJ_NEW(mca_base_component_priority_list_item_t); + entry->super.cli_component = component; + entry->cpli_priority = 0; + if (OSHMEM_SUCCESS == init_query(component, entry, + enable_progress_threads, + enable_threads)) { + opal_list_append(&mca_scoll_base_components_available, + (opal_list_item_t *) entry); + found = true; + } else { + + /* If the component doesn't want to run, then close it. + It's already had its close() method invoked; now close + it out of the DSO repository (if it's there). */ + + mca_base_component_repository_release(component); + OBJ_RELEASE(entry); + } + + /* Free the entry from the "opened" list */ + + OBJ_RELEASE(p); + } + + /* The opened list is now no longer useful and we can free it */ + + OBJ_DESTRUCT(&mca_scoll_base_components_opened); + mca_scoll_base_components_opened_valid = false; + + /* If we have no collective components available, it's an error. + Thanks for playing! 
*/ + + if (!found) { + /* Need to free all items in the list */ + OBJ_DESTRUCT(&mca_scoll_base_components_available); + mca_scoll_base_components_available_valid = false; + SCOLL_VERBOSE(10,"scoll:find_available: no scoll components available!"); + return OSHMEM_ERROR; + } + + /* All done */ + + return OSHMEM_SUCCESS; +} + + +/* + * Query a component, see if it wants to run at all. If it does, save + * some information. If it doesn't, close it. + */ +static int init_query(const mca_base_component_t * component, + mca_base_component_priority_list_item_t * entry, + bool enable_progress_threads, bool enable_threads) +{ + int ret; + + SCOLL_VERBOSE(10,"coll:find_available: querying coll component %s", + component->mca_component_name); + + /* This component has already been successfully opened. So now + query it. */ + + if (1 == component->mca_type_major_version && + 0 == component->mca_type_minor_version && + 0 == component->mca_type_release_version) { + + mca_scoll_base_component_t *scoll = + (mca_scoll_base_component_t *) component; + + ret = scoll->scoll_init(enable_progress_threads, + enable_threads); + } else { + /* Unrecognized coll API version */ + + SCOLL_VERBOSE(10,"scoll:find_available: unrecognized scoll API version (%d.%d.%d, ignored)", + component->mca_type_major_version, + component->mca_type_minor_version, + component->mca_type_release_version); + return OSHMEM_ERROR; + } + + /* Query done -- look at the return value to see what happened */ + + if (OSHMEM_SUCCESS != ret) { + SCOLL_VERBOSE(10,"scoll:find_available: scoll component %s is not available", + component->mca_component_name); + if (NULL != component->mca_close_component) { + component->mca_close_component(); + } + } else { + SCOLL_VERBOSE(10,"scoll:find_available: scoll component %s is available", + component->mca_component_name); + } + + /* All done */ + + return ret; +} diff --git a/oshmem/mca/scoll/base/scoll_base_close.c b/oshmem/mca/scoll/base/scoll_base_close.c new file mode 100644 index 
0000000000..ab059b1771 --- /dev/null +++ b/oshmem/mca/scoll/base/scoll_base_close.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + +#include "oshmem_config.h" + +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" + + +int mca_scoll_base_close(void) +{ + /* Close all components that are still open. This may be the opened + * list (if we're in ompi_info), or it may be the available list (if + * we're anywhere else). */ + + if (mca_scoll_base_components_opened_valid) { + mca_base_components_close(mca_scoll_base_output, + &mca_scoll_base_components_opened, NULL); + OBJ_DESTRUCT(&mca_scoll_base_components_opened); + mca_scoll_base_components_opened_valid = false; + } else if (mca_scoll_base_components_available_valid) { + mca_base_components_close(mca_scoll_base_output, + &mca_scoll_base_components_available, + NULL); + OBJ_DESTRUCT(&mca_scoll_base_components_available); + mca_scoll_base_components_available_valid = false; + } + + /* This call should be done after memheap close */ + mca_scoll_disable(); + + /* All done */ + + return OSHMEM_SUCCESS; +} + + +void mca_scoll_disable(void) +{ + if (mca_scoll_sync_array) + { + void* ptr = (void*)mca_scoll_sync_array; + + MCA_MEMHEAP_CALL(private_free(ptr)); + mca_scoll_sync_array = NULL; + } +} diff --git a/oshmem/mca/scoll/base/scoll_base_open.c b/oshmem/mca/scoll/base/scoll_base_open.c new file mode 100644 index 0000000000..bd879ae90a --- /dev/null +++ b/oshmem/mca/scoll/base/scoll_base_open.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include + +#include "oshmem_config.h" + +#include "oshmem/constants.h" + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" + +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" + +/* + * The following file was created by configure. It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. + */ +#include "oshmem/mca/scoll/base/static-components.h" + + +/* + * Global variables; most of which are loaded by back-ends of MCA + * variables + */ +long* mca_scoll_sync_array = NULL; + +int mca_scoll_base_output = -1; + +bool mca_scoll_base_components_opened_valid = false; +opal_list_t mca_scoll_base_components_opened; + +OBJ_CLASS_INSTANCE(mca_scoll_base_module_t, opal_object_t, NULL, NULL); + +/* + * Function for finding and opening either all MCA components, or the one + * that was specifically requested via a MCA parameter. 
+ */ +int mca_scoll_base_open(void) +{ + /* Open an output stream for this framework */ + int value = -1; + + mca_scoll_base_output = opal_output_open(NULL); + mca_base_param_reg_int_name("scoll_base","verbose", + "Verbose level of the shmem scoll component",false,false,0,&value); + opal_output_set_verbosity(mca_scoll_base_output, value); + + /* Open up all available components */ + if (OSHMEM_SUCCESS != + mca_base_components_open("scoll", mca_scoll_base_output, + mca_scoll_base_static_components, + &mca_scoll_base_components_opened, true)) { + return OSHMEM_ERROR; + } + mca_scoll_base_components_opened_valid = true; + + /* All done */ + + return OSHMEM_SUCCESS; +} + +/* Allocate and fill mca_scoll_sync_array on first use, then select scoll modules for oshmem_group_all and oshmem_group_self; returns the first error encountered */ +int mca_scoll_enable(void) +{ + int ret = OSHMEM_SUCCESS; + + if (!mca_scoll_sync_array) + { + void* ptr = (void*)mca_scoll_sync_array; + int i = 0; + + MCA_MEMHEAP_CALL(private_alloc((_SHMEM_BARRIER_SYNC_SIZE * sizeof(*mca_scoll_sync_array)), &ptr)); + mca_scoll_sync_array = ptr; + if (NULL == mca_scoll_sync_array) return OSHMEM_ERROR; /* allocation left the pointer NULL: report failure instead of writing through it below */ + for ( i = 0; i < _SHMEM_BARRIER_SYNC_SIZE; i++ ) + { + mca_scoll_sync_array[i] = _SHMEM_SYNC_VALUE; + } + } + + /* Note: it is done to support FCA only and we need to consider possibility to + * find a way w/o this ugly hack + */ + if (OSHMEM_SUCCESS != (ret = mca_scoll_base_select(oshmem_group_all))) + { + return ret; + } + if (OSHMEM_SUCCESS != (ret = mca_scoll_base_select(oshmem_group_self))) + { + return ret; + } + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/scoll/base/scoll_base_select.c b/oshmem/mca/scoll/base/scoll_base_select.c new file mode 100644 index 0000000000..cb8d2615ea --- /dev/null +++ b/oshmem/mca/scoll/base/scoll_base_select.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include +#include +#include + +#include "oshmem/constants.h" + +#include "opal/class/opal_list.h" +#include "opal/util/output.h" +#include "orte/util/show_help.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_component_repository.h" + +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" +#include "oshmem/proc/proc.h" +#include "oshmem/runtime/runtime.h" + + +/* + * Local types + */ +struct avail_coll_t { + opal_list_item_t super; + + int ac_priority; + mca_scoll_base_module_1_0_0_t *ac_module; +}; +typedef struct avail_coll_t avail_coll_t; + + +/* + * Local functions + */ +static opal_list_t *check_components(opal_list_t * components, + oshmem_group_t * group); +static int check_one_component(oshmem_group_t * group, + const mca_base_component_t * component, + mca_scoll_base_module_1_0_0_t ** module); + +static int query(const mca_base_component_t * component, + oshmem_group_t * group, int *priority, + mca_scoll_base_module_1_0_0_t ** module); + +static int query_1_0_0(const mca_scoll_base_component_1_0_0_t * + scoll_component, oshmem_group_t * group, + int *priority, + mca_scoll_base_module_1_0_0_t ** module); + +static int scoll_null_barrier(struct oshmem_group_t *group, long *pSync, int alg) +{ + if (oshmem_proc_group_is_member(group)) { + SCOLL_ERROR("internal error"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static int scoll_null_broadcast(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync, int alg) +{ + if (oshmem_proc_group_is_member(group)) { + SCOLL_ERROR("internal error"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static int scoll_null_collect(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync, bool 
nlong_type, int alg) +{ + if (oshmem_proc_group_is_member(group)) { + SCOLL_ERROR("internal error"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static int scoll_null_reduce(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk, int alg) +{ + if (oshmem_proc_group_is_member(group)) { + SCOLL_ERROR("internal error"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +/* + * Stuff for the OBJ interface + */ +static OBJ_CLASS_INSTANCE(avail_coll_t, opal_list_item_t, NULL, NULL); + + +#define COPY(module, group, func) \ + do { \ + if (NULL != module->scoll_ ## func) { \ + if (NULL != group->g_scoll.scoll_ ## func ## _module) { \ + OBJ_RELEASE(group->g_scoll.scoll_ ## func ## _module); \ + } \ + group->g_scoll.scoll_ ## func = module->scoll_ ## func; \ + group->g_scoll.scoll_ ## func ## _module = module; \ + OBJ_RETAIN(module); \ + } \ + } while (0) + +#define CLOSE(group, func) \ + do { \ + if (NULL != group->g_scoll.scoll_ ## func ## _module) { \ + OBJ_RELEASE(group->g_scoll.scoll_ ## func ## _module); \ + group->g_scoll.scoll_## func = NULL; \ + group->g_scoll.scoll_## func ## _module = NULL; \ + } \ + } while (0) + +int mca_scoll_base_group_unselect(struct oshmem_group_t * group) +{ + /* + * scoll close() is called before group destructors, so + * only close the group's collectives while the scoll + * components are still valid + * + * there is a memory leak here, because not doing close means + * that we are leaving objects with dangling ref counts + */ + SCOLL_VERBOSE(10,"scoll:base:group_unselect: group: %d", + group->id); + if (mca_scoll_base_components_opened_valid || + mca_scoll_base_components_available_valid ){ + CLOSE(group, reduce); + CLOSE(group, barrier); + CLOSE(group, broadcast); + CLOSE(group, collect); + } + /* All done */ + return OSHMEM_SUCCESS; +} +/* + * This function is called at the initialization time of every + *
group. It is used to select which coll component will be + * active for a given group. + */ +int mca_scoll_base_select(struct oshmem_group_t *group) +{ + opal_list_t *selectable; + opal_list_item_t *item; + int ret; + /* Announce */ + SCOLL_VERBOSE(10,"scoll:base:group_select: new group: %d", + group->id); + mca_scoll_base_group_unselect(group); + memset(&group->g_scoll, 0, sizeof(mca_scoll_base_group_scoll_t)); + if (!oshmem_proc_group_is_member(group)) { + group->g_scoll.scoll_barrier = scoll_null_barrier; + group->g_scoll.scoll_broadcast = scoll_null_broadcast; + group->g_scoll.scoll_reduce = scoll_null_reduce; + group->g_scoll.scoll_collect = scoll_null_collect; + return OSHMEM_SUCCESS; + } + SCOLL_VERBOSE(10,"scoll:base:group_select: Checking all available modules"); + selectable = check_components(&mca_scoll_base_components_available, group); + + /* Upon return from the above, the modules list will contain the + list of modules that returned (priority >= 0). If we have no + collective modules available, then print error and return. */ + if (NULL == selectable) { + /* There's no modules available */ + return OSHMEM_ERROR; + } + + /* do the selection loop */ + for (item = opal_list_remove_first(selectable); + NULL != item; item = opal_list_remove_first(selectable)) + { + avail_coll_t *avail = (avail_coll_t *)item; + ret = avail->ac_module->scoll_module_enable(avail->ac_module, group); + if (OSHMEM_SUCCESS != ret) { + mca_scoll_base_group_unselect(group); + } + else { + COPY(avail->ac_module, group, broadcast); + COPY(avail->ac_module, group, collect); + COPY(avail->ac_module, group, reduce); + COPY(avail->ac_module, group, barrier); + } + OBJ_RELEASE(avail->ac_module); + OBJ_RELEASE(avail); + } + + + /* Done with the list from the check_components() call so release it. 
*/ + OBJ_RELEASE(selectable); + if ((NULL == group->g_scoll.scoll_barrier) || + (NULL == group->g_scoll.scoll_broadcast) || + (NULL == group->g_scoll.scoll_collect) || + (NULL == group->g_scoll.scoll_reduce)) { + mca_scoll_base_group_unselect(group); + return OSHMEM_ERR_NOT_FOUND; + } + + return OSHMEM_SUCCESS; +} + + +/* + * For each module in the list, check and see if it wants to run, and + * do the resulting priority comparison. Make a list of modules to be + * only those who returned that they want to run, and put them in + * priority order. + */ +static opal_list_t *check_components(opal_list_t * components, + oshmem_group_t * group) +{ + int priority; + const mca_base_component_t *component; + opal_list_item_t *item, *item2; + mca_scoll_base_module_1_0_0_t *module; + opal_list_t *selectable; + avail_coll_t *avail, *avail2; + + /* Make a list of the components that query successfully */ + selectable = OBJ_NEW(opal_list_t); + + /* Scan through the list of components. This nested loop is + O(N^2), but we should never have too many components, so this + *hopefully* shouldn't matter... */ + + for (item = opal_list_get_first(components); + ((item != opal_list_get_end(components)) && (item != NULL)); + item = opal_list_get_next(item)) { + component = ((mca_base_component_priority_list_item_t *) + item)->super.cli_component; + + priority = check_one_component(group, component, &module); + if (priority >= 0) { + + /* We have a component that indicated that it wants to run + by giving us a module */ + avail = OBJ_NEW(avail_coll_t); + avail->ac_priority = priority; + avail->ac_module = module; + + /* Put this item on the list in priority order (lowest + priority first). Should it go first? 
*/ + for (item2 = opal_list_get_first(selectable); + item2 != opal_list_get_end(selectable); + item2 = opal_list_get_next(item2)) { + avail2 = (avail_coll_t *) item2; + if (avail->ac_priority < avail2->ac_priority) { + opal_list_insert_pos(selectable, + item2, + (opal_list_item_t *) avail); + break; + } + } + + if (opal_list_get_end(selectable) == item2) { + opal_list_append(selectable, + (opal_list_item_t *) avail); + } + } + } + /*TODO: copy over any of the pointers */ + + /* If we didn't find any available components, return an error */ + if (0 == opal_list_get_size(selectable)) { + OBJ_RELEASE(selectable); + return NULL; + } + + /* All done */ + return selectable; +} + + +/* + * Check a single component + */ +static int check_one_component(oshmem_group_t * group, + const mca_base_component_t * component, + mca_scoll_base_module_1_0_0_t ** module) +{ + int err; + int priority = -1; + + err = query(component, group, &priority, module); + + if (OSHMEM_SUCCESS == err) { + priority = (priority < 100) ? 
priority : 100; + SCOLL_VERBOSE(10,"scoll:base:group_select: component available: %s, priority: %d", + component->mca_component_name, priority); + + } else { + priority = -1; + SCOLL_VERBOSE(10,"scoll:base:group_select: component not available: %s", + component->mca_component_name); + } + + return priority; +} + + +/************************************************************************** + * Query functions + **************************************************************************/ + +/* + * Take any version of a coll module, query it, and return the right + * module struct + */ +static int query(const mca_base_component_t * component, + oshmem_group_t * group, + int *priority, mca_scoll_base_module_1_0_0_t ** module) +{ + *module = NULL; + if (1 == component->mca_type_major_version && + 0 == component->mca_type_minor_version && + 0 == component->mca_type_release_version) { + const mca_scoll_base_component_1_0_0_t *coll100 = + (mca_scoll_base_component_1_0_0_t *) component; + + return query_1_0_0(coll100, group, priority, module); + } + + /* Unknown coll API version -- return error */ + + return OSHMEM_ERROR; +} + + +static int query_1_0_0(const mca_scoll_base_component_1_0_0_t * component, + oshmem_group_t * group, int *priority, + mca_scoll_base_module_1_0_0_t ** module) +{ + mca_scoll_base_module_1_0_0_t *ret; + + /* There's currently no need for conversion */ + + ret = component->scoll_query(group, priority); + if (NULL != ret) { + *module = ret; + return OSHMEM_SUCCESS; + } + + return OSHMEM_ERROR; +} diff --git a/oshmem/mca/scoll/basic/.windows b/oshmem/mca/scoll/basic/.windows new file mode 100644 index 0000000000..104768dd6a --- /dev/null +++ b/oshmem/mca/scoll/basic/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libshmem diff --git a/oshmem/mca/scoll/basic/Makefile.am b/oshmem/mca/scoll/basic/Makefile.am new file mode 100644 index 0000000000..41cee45d2e --- /dev/null +++ b/oshmem/mca/scoll/basic/Makefile.am @@ -0,0 +1,44 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +AM_CFLAGS = $(OSHMEM_CFLAGS) + +sources = \ + scoll_basic.h \ + scoll_basic_module.c \ + scoll_basic_component.c \ + scoll_basic_barrier.c \ + scoll_basic_broadcast.c \ + scoll_basic_collect.c \ + scoll_basic_reduce.c + + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). + +if MCA_BUILD_oshmem_scoll_basic_DSO +component_noinst = +component_install = mca_scoll_basic.la +else +component_noinst = libmca_scoll_basic.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_scoll_basic_la_SOURCES = $(sources) +mca_scoll_basic_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_scoll_basic_la_SOURCES =$(sources) +libmca_scoll_basic_la_LDFLAGS = -module -avoid-version diff --git a/oshmem/mca/scoll/basic/configure.params b/oshmem/mca/scoll/basic/configure.params new file mode 100644 index 0000000000..5a3f93008f --- /dev/null +++ b/oshmem/mca/scoll/basic/configure.params @@ -0,0 +1,13 @@ +# -*- shell-script -*- +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/scoll/basic/scoll_basic.h b/oshmem/mca/scoll/basic/scoll_basic.h new file mode 100644 index 0000000000..529bcda4a1 --- /dev/null +++ b/oshmem/mca/scoll/basic/scoll_basic.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_SCOLL_BASIC_H +#define MCA_SCOLL_BASIC_H + +#include "oshmem_config.h" + +#include "opal/mca/mca.h" +#include "oshmem/mca/scoll/scoll.h" + +BEGIN_C_DECLS + +/* Globally exported variables */ + +OSHMEM_MODULE_DECLSPEC extern mca_scoll_base_component_1_0_0_t + mca_scoll_basic_component; + +extern int mca_scoll_basic_priority_param; +OSHMEM_DECLSPEC extern int mca_scoll_basic_param_barrier_algorithm; +extern int mca_scoll_basic_param_broadcast_algorithm; +extern int mca_scoll_basic_param_collect_algorithm; +extern int mca_scoll_basic_param_reduce_algorithm; + +/* API functions */ + +int mca_scoll_basic_init(bool enable_progress_threads, + bool enable_threads); +mca_scoll_base_module_t* + mca_scoll_basic_query(struct oshmem_group_t *group, + int *priority); + +enum { + SHMEM_SYNC_INIT = _SHMEM_SYNC_VALUE, + SHMEM_SYNC_WAIT = -2, + SHMEM_SYNC_RUN = -3, + SHMEM_SYNC_READY= -4, +}; + +int mca_scoll_basic_barrier(struct oshmem_group_t *group, long *pSync, int alg); +int mca_scoll_basic_broadcast(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync, int alg); +int mca_scoll_basic_collect(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync, bool nlong_type, int alg); +int mca_scoll_basic_reduce(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk, int alg); + + +static inline unsigned int 
scoll_log2(unsigned long val) +{ + unsigned int count = 0; + + while(val > 0) + { + val = val >> 1; + count++; + } + + return count > 0 ? count-1: 0; +} + +struct mca_scoll_basic_module_t { + mca_scoll_base_module_t super; +}; +typedef struct mca_scoll_basic_module_t mca_scoll_basic_module_t; +OBJ_CLASS_DECLARATION(mca_scoll_basic_module_t); + +END_C_DECLS + +#endif /* MCA_SCOLL_BASIC_H */ diff --git a/oshmem/mca/scoll/basic/scoll_basic_barrier.c b/oshmem/mca/scoll/basic/scoll_basic_barrier.c new file mode 100644 index 0000000000..431b95eef2 --- /dev/null +++ b/oshmem/mca/scoll/basic/scoll_basic_barrier.c @@ -0,0 +1,601 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include +#include + +#include "orte/mca/grpcomm/grpcomm.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/scoll/scoll.h" +#include "oshmem/mca/scoll/base/base.h" +#include "oshmem/proc/proc.h" +#include "scoll_basic.h" + + +static int __algorithm_central_counter(struct oshmem_group_t *group, long *pSync); +static int __algorithm_tournament(struct oshmem_group_t *group, long *pSync); +static int __algorithm_recursive_doubling(struct oshmem_group_t *group, long *pSync); +static int __algorithm_dissemination(struct oshmem_group_t *group, long *pSync); +static int __algorithm_basic(struct oshmem_group_t *group, long *pSync); +static int __algorithm_adaptive(struct oshmem_group_t *group, long *pSync); + + +int mca_scoll_basic_barrier(struct oshmem_group_t *group, long *pSync, int alg) +{ + int rc = OSHMEM_SUCCESS; + + /* Arguments validation */ + if (!group) + { + SCOLL_ERROR("Active set (group) of PE is not defined"); + rc = OSHMEM_ERR_BAD_PARAM; + } + + if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group)) + { + if (pSync) + { + alg = ( alg == SCOLL_DEFAULT_ALG ? 
mca_scoll_basic_param_barrier_algorithm : alg); + switch(alg) + { + case SCOLL_ALG_BARRIER_CENTRAL_COUNTER: + { + rc = __algorithm_central_counter(group, pSync); + break; + } + case SCOLL_ALG_BARRIER_TOURNAMENT: + { + rc = __algorithm_tournament(group, pSync); + break; + } + case SCOLL_ALG_BARRIER_RECURSIVE_DOUBLING: + { + rc = __algorithm_recursive_doubling(group, pSync); + break; + } + case SCOLL_ALG_BARRIER_DISSEMINATION: + { + rc = __algorithm_dissemination(group, pSync); + break; + } + case SCOLL_ALG_BARRIER_BASIC: + { + rc = __algorithm_basic(group, pSync); + break; + } + case SCOLL_ALG_BARRIER_ADAPTIVE: + { + rc = __algorithm_adaptive(group, pSync); + break; + } + default: + { + rc = __algorithm_recursive_doubling(group, pSync); + } + } + } + else + { + SCOLL_ERROR("Incorrect argument pSync"); + rc = OSHMEM_ERR_BAD_PARAM; + } + } + + return rc; +} + + +/* + This algorithm is quite simple and straightforward. Because of its obvious simplicity and + the easy proof of its correctness it is implemented quite often. The root PE polls its peers until + each of them has reached the barrier state. When all PEs are ready the root signals them to go ahead. + Outlay: + NP-1 competing network transfers are needed to implement the counter + The memory usage is constant (1 byte) per node.
+*/ +static int __algorithm_central_counter(struct oshmem_group_t *group, long *pSync) +{ + int rc = OSHMEM_SUCCESS; + long value = SHMEM_SYNC_INIT; + int root_id = 0; + int PE_root = oshmem_proc_pe(group->proc_array[root_id]); + int i = 0; + + SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Central Counter", group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + /* Set current state as WAIT */ + pSync[0] = SHMEM_SYNC_WAIT; + + /* Root processes synchronization */ + if (PE_root == group->my_pe) + { + int pe_cur = 0; + long wait_pe_count = 0; + int* wait_pe_array = NULL; + + wait_pe_array = malloc(sizeof(*wait_pe_array) * group->proc_count); + if (wait_pe_array) + { + SCOLL_VERBOSE(14, "[#%d] PE is the root", group->my_pe); + + wait_pe_count = group->proc_count; + for (i = 0; i < group->proc_count; i++) + { + wait_pe_array[i] = oshmem_proc_pe(group->proc_array[i]); + } + wait_pe_array[root_id] = OSHMEM_PE_INVALID; /* the root does not wait for itself */ + wait_pe_count--; + + while (wait_pe_count && (rc == OSHMEM_SUCCESS)) /* stop polling once a get fails: the inner loop is then skipped and the counter could never reach zero */ + { + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) + { + pe_cur = wait_pe_array[i]; + if (pe_cur != OSHMEM_PE_INVALID) + { + rc = MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, pe_cur)); + if ( (rc == OSHMEM_SUCCESS) && (value == SHMEM_SYNC_WAIT) ) + { + wait_pe_array[i] = OSHMEM_PE_INVALID; + wait_pe_count--; + SCOLL_VERBOSE(14, "[#%d] PE#%d is ready (wait list counter: %d)", group->my_pe, pe_cur, (int)wait_pe_count); + } + } + } + } + + SCOLL_VERBOSE(14, "[#%d] PE signals to all", group->my_pe); + value = SHMEM_SYNC_RUN; + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) + { + pe_cur = oshmem_proc_pe(group->proc_array[i]); + if (pe_cur != PE_root) + { + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, pe_cur)); + } + } + + free(wait_pe_array); + } + else + { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + + /* Possibly this is unnecessary...
+ But imagine the scenario when you have 2 sequential barriers and the root PE is the fastest one. + The root could leave the first barrier and in the second barrier it could get SHMEM_SYNC_WAIT value on + remote node before the remote node receives its SHMEM_SYNC_RUN value in the first barrier + */ + /* TODO: actually it must be quiet */ + MCA_SPML_CALL(fence()); + } + /* Wait for RUN signal */ + else + { + SCOLL_VERBOSE(14, "[#%d] PE waits for a signal from root", group->my_pe); + + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } + + /* Restore initial values */ + SCOLL_VERBOSE(12, "[#%d] Restore special synchronization array", group->my_pe); + for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) + { + pSync[i] = _SHMEM_SYNC_VALUE; + } + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + + +/* + The Tournament Barrier, proposed by Hensgen, Finkel and Manber is mostly suitable for shared memory + multiprocessors because it benefits from several caching mechanisms. + The algorithm is similar to a tournament game. In each round two + nodes play against each other. The winner is known in advance and waits until the loser arrives. The + winners play against each other in the next round. The overall winner (the champion) notifies all others + about the end of the barrier. + Outlay: + The game scales with log2(NP) and uses 1 byte of memory.
+*/ +static int __algorithm_tournament(struct oshmem_group_t *group, long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int exit_flag = group->proc_count - 1; + long value = SHMEM_SYNC_INIT; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int i = 0; + + SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Tournament", group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + /* Set current state as WAIT */ + pSync[0] = SHMEM_SYNC_WAIT; + + while (exit_flag && (rc == OSHMEM_SUCCESS)) + { + /* Define a peer for competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + /* Do not have peer for tournament */ + if (peer_id >= group->proc_count) continue; + + if ( my_id < peer_id ) + { + pSync[0] = peer_id; + value = my_id; + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } + else + { + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. 
+ */ + do + { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != my_id); + + SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe); + value = peer_id; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#else + SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe); + do + { + rc = MCA_ATOMIC_CALL(cswap((void*)pSync, (void*)&value, (const void*)&my_id, (const void*)&peer_id, sizeof(value), peer_pe)); + } while (value != my_id); +#endif + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + break; + } + } + + /* Restore initial values */ + SCOLL_VERBOSE(12, "[#%d] Restore special synchronization array", group->my_pe); + for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) + { + pSync[i] = _SHMEM_SYNC_VALUE; + } + + /* Send result to all PE in group */ + if ( (my_id == 0) && (rc == OSHMEM_SUCCESS)) + { + SCOLL_VERBOSE(14, "[#%d] signals to all", group->my_pe); + + value = SHMEM_SYNC_RUN; + for (peer_id = 1; (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS); peer_id++) + { + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + } + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + + +/* + Pairwise Exchange With Recursive Doubling. + Rinku Gupta, Vinod Tipparaju, Jarek Nieplocha, and Dhabaleswar Panda. Efficient Barrier + using Remote Memory Operations on VIA-Based Clusters. In 2002 IEEE International + Conference on Cluster Computing (CLUSTER 2002), page 83. IEEE Computer Society, 2002. + Outlay: + The algorithm uses a maximum of log2(NP) + 2 network writes and P bytes memory per node.
+*/ +static int __algorithm_recursive_doubling(struct oshmem_group_t *group, long *pSync) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int floor2_proc = 0; + int exit_flag = 0; + long value = SHMEM_SYNC_INIT; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int i = 0; + + floor2_proc = 1; + i = group->proc_count; + i >>= 1; + while (i) + { + i >>= 1; + floor2_proc <<= 1; + } + + SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Recursive Doubling", group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld floor2_proc = %d", group->my_pe, pSync[0], floor2_proc); + + if (my_id >= floor2_proc) + { + /* I am in extra group, my partner is node (my_id-y) in basic group */ + peer_id = my_id - floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, "[#%d] is extra and signal to #%d", group->my_pe, peer_pe); + value = SHMEM_SYNC_WAIT; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + + SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + /* Restore initial values */ + SCOLL_VERBOSE(12, "[#%d] Restore special synchronization array", group->my_pe); + for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) + { + pSync[i] = _SHMEM_SYNC_VALUE; + } + } + else + { + /* Wait for a peer from extra group */ + if ((group->proc_count - floor2_proc) > my_id) + { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, "[#%d] wait a signal from #%d", group->my_pe, peer_pe); + value = SHMEM_SYNC_WAIT; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } + + /* Pairwise exchange */ + exit_flag = floor2_proc - 1; + pSync[0] = round; + while (exit_flag && (rc == OSHMEM_SUCCESS)) + { + /* Define a peer for 
competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. + */ + do + { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != (round - 1)); + + SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe); + value = round; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#else + SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe); + { + long cond = round - 1; + do + { + rc = MCA_ATOMIC_CALL(cswap((void*)pSync, (void*)&value, (const void*)&cond, (const void*)&round, sizeof(value), peer_pe)); + } while (value != (round-1)); + } +#endif + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = round; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_GE, (void*)&value, SHMEM_LONG)); + } + + /* Restore initial values */ + SCOLL_VERBOSE(12, "[#%d] Restore special synchronization array", group->my_pe); + for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++) + { + pSync[i] = _SHMEM_SYNC_VALUE; + } + + /* Notify a peer from extra group */ + if ((group->proc_count - floor2_proc) > my_id) + { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + } + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + + +/* + The Dissemination Barrier, introduced by Hensgen, Finkel and Manber in 1988.
+   The algorithm is mostly an improvement of the Butterfly Barrier for non power of two processor counts.
+   It uses the same pairwise synchronization but with other partners.
+   Outlay:
+   The game scales with log2(NP) and uses 1 byte of memory.
+
+   Steps performed in round r, for r = 0 .. log2_proc (inclusive):
+     1. the peer is the group member at index (my_id + 2^r) % proc_count;
+     2. the peer's pSync[0] is polled with get() until it reads r;
+     3. r + 1 is written into the peer's pSync[0] with put();
+     4. this PE waits until its own pSync[0] is >= r + 1.
+   Before returning, the first _SHMEM_BARRIER_SYNC_SIZE entries of pSync are
+   reset to _SHMEM_SYNC_VALUE.
+
+   group - active set taking part in the barrier
+   pSync - synchronization array; only pSync[0] carries the round counter
+   Returns rc as set by the last wait() call executed.
+*/
+static int __algorithm_dissemination(struct oshmem_group_t *group, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    int round = 0;
+    int log2_proc = 0;
+    long value = SHMEM_SYNC_INIT;
+    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
+    int peer_id = 0;
+    int peer_pe = 0;
+    int i = 0;
+
+    log2_proc = scoll_log2((unsigned long)group->proc_count);
+
+    SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Dissemination", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld floor2_proc = %d", group->my_pe, pSync[0], log2_proc);
+
+    pSync[0] = round; /* round is 0 here: this is the value peers poll for before signalling round 0 */
+    for (round = 0; (round <= log2_proc) && (rc == OSHMEM_SUCCESS); round++)
+    {
+        /* Define a peer to send signal */
+        peer_id = (my_id + (1 << round)) % group->proc_count;
+
+        peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+#if 1 /* It is ugly implementation of compare and swap operation
+         Usage of this hack does not give performance improvement but
+         it is expected that shmem_long_cswap() will make it faster.
+
+         NOTE(review): the status returned by get() is discarded, so a failing
+         get() cannot end the polling loop; the status of the put() below is
+         overwritten by the wait() that follows it.
+       */
+        do
+        {
+            MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+        } while (value != round);
+
+        SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe);
+        value = round + 1;
+        rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+#endif
+
+        SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
+        value = round + 1;
+        rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_GE, (void*)&value, SHMEM_LONG));
+    }
+
+    /* Restore initial values */
+    SCOLL_VERBOSE(12, "[#%d] Restore special synchronization array", group->my_pe);
+    for (i = 0; pSync && (i < _SHMEM_BARRIER_SYNC_SIZE); i++)
+    {
+        pSync[i] = _SHMEM_SYNC_VALUE;
+    }
+
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    return rc;
+}
+
+/*
+   Barrier built on zero-length send/recv messages instead of pSync.
+   The first member of the group (proc_array[0]) is the root.
+   Every other PE sends an empty message to the root and then blocks in recv()
+   until the root answers. The root first receives one message per non-root
+   member (from any source) and then sends one empty message to each of them.
+
+   group - active set taking part in the barrier
+   pSync - not used by this algorithm
+   Returns OSHMEM_SUCCESS or the first failing send/recv status.
+*/
+static int __algorithm_basic(struct oshmem_group_t *group, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    int root_id = 0;
+    int PE_root = oshmem_proc_pe(group->proc_array[root_id]);
+    int i = 0;
+
+    SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Basic", group->my_pe);
+
+    if (PE_root != group->my_pe)
+    {
+        /* Non-root: announce arrival, then wait for the release message */
+        rc = MCA_SPML_CALL(send(NULL, 0, PE_root, MCA_SPML_BASE_PUT_STANDARD));
+        if (OSHMEM_SUCCESS != rc) {
+            return rc;
+        }
+
+        rc = MCA_SPML_CALL(recv(NULL, 0, PE_root));
+        if (OSHMEM_SUCCESS != rc) {
+            return rc;
+        }
+    }
+
+    /* The root collects and broadcasts the messages. */
+
+    else
+    {
+        int pe_cur = 0;
+
+        /* Collect phase: one arrival message per non-root member, in any order */
+        for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++)
+        {
+            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            if (pe_cur != PE_root)
+            {
+                rc = MCA_SPML_CALL(recv(NULL, 0, SHMEM_ANY_SOURCE));
+            }
+            if (OSHMEM_SUCCESS != rc) {
+                return rc;
+            }
+        }
+
+        /* Release phase: one message to every non-root member */
+        for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++)
+        {
+            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            if (pe_cur != PE_root)
+            {
+                rc = MCA_SPML_CALL(send(NULL, 0, pe_cur, MCA_SPML_BASE_PUT_STANDARD));
+            }
+            if (OSHMEM_SUCCESS != rc) {
+                return rc;
+            }
+        }
+    }
+
+    return rc;
+}
+/*
+   Chooses a barrier algorithm at call time: __algorithm_basic when every
+   examined member is on the local node or when the group has fewer than 32
+   members, __algorithm_recursive_doubling otherwise.
+   Returns the status of the selected algorithm.
+*/
+static int __algorithm_adaptive(struct oshmem_group_t *group, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    bool local_peers_only = true;
+
+    SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Adaptive", group->my_pe);
+
+    /* check if we have only local peers */
+    {
+        int i = 0;
+
+        for (i = 0; i < group->proc_count; i++)
+        {
+            if (i == group->id) continue; /* NOTE(review): the loop index is compared with group->id,
+                                             while the sibling algorithms obtain this PE's index with
+                                             oshmem_proc_group_find_id(); confirm that group->id really
+                                             is this PE's position in proc_array */
+
+            if ( !OPAL_PROC_ON_LOCAL_NODE(group->proc_array[i]->proc_flags) )
+            {
+                local_peers_only = false;
+                break;
+            }
+        }
+    }
+
+    /* Select algorithm we use:
+     * use send/recv way for group in the same node and for np < 32
+     * otherwise use put/get way
+     */
+    if (local_peers_only || (group->proc_count < 32) )
+    {
+        rc = __algorithm_basic(group, pSync);
+    }
+    else
+    {
+        rc = __algorithm_recursive_doubling(group, pSync);
+    }
+
+    return rc;
+}
diff --git a/oshmem/mca/scoll/basic/scoll_basic_broadcast.c b/oshmem/mca/scoll/basic/scoll_basic_broadcast.c
new file mode 100644
index 0000000000..dfd778dbc4
--- /dev/null
+++ b/oshmem/mca/scoll/basic/scoll_basic_broadcast.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include /* NOTE(review): header name is missing in this copy of the patch */
+#include /* NOTE(review): header name is missing in this copy of the patch */
+
+#include "orte/mca/grpcomm/grpcomm.h"
+
+#include "opal/util/bit_ops.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "scoll_basic.h"
+
+
+static int __algorithm_central_counter(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync);
+static int __algorithm_binomial_tree(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync);
+
+
+/*
+   Entry point of the basic broadcast.
+
+   group   - active set; NULL is rejected with OSHMEM_ERR_BAD_PARAM
+   PE_root - PE whose source buffer is distributed
+   target  - destination buffer
+   source  - data to distribute (read on the root)
+   nlong   - payload size in bytes
+   pSync   - synchronization array; NULL is rejected with OSHMEM_ERR_BAD_PARAM
+   alg     - algorithm id, or SCOLL_DEFAULT_ALG to use
+             mca_scoll_basic_param_broadcast_algorithm
+
+   A PE that is not a member of the group returns OSHMEM_SUCCESS without doing
+   anything. For members, the first _SHMEM_BCAST_SYNC_SIZE entries of pSync are
+   reset to _SHMEM_SYNC_VALUE before returning.
+*/
+int mca_scoll_basic_broadcast(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync, int alg)
+{
+    int rc = OSHMEM_SUCCESS;
+    int slot = 0;
+
+    /* A missing active set is a caller error */
+    if (!group)
+    {
+        SCOLL_ERROR("Active set (group) of PE is not defined");
+        return OSHMEM_ERR_BAD_PARAM;
+    }
+
+    /* PEs outside of the active set have nothing to do */
+    if (!oshmem_proc_group_is_member(group))
+    {
+        return OSHMEM_SUCCESS;
+    }
+
+    if (!pSync)
+    {
+        SCOLL_ERROR("Incorrect argument pSync");
+        rc = OSHMEM_ERR_BAD_PARAM;
+    }
+    else
+    {
+        /* SCOLL_DEFAULT_ALG defers to the configured component parameter */
+        if (alg == SCOLL_DEFAULT_ALG)
+        {
+            alg = mca_scoll_basic_param_broadcast_algorithm;
+        }
+
+        /* Anything but an explicit central counter request uses the binomial tree */
+        if (alg == SCOLL_ALG_BROADCAST_CENTRAL_COUNTER)
+        {
+            rc = __algorithm_central_counter(group, PE_root, target, source, nlong, pSync);
+        }
+        else
+        {
+            rc = __algorithm_binomial_tree(group, PE_root, target, source, nlong, pSync);
+        }
+    }
+
+    /* Restore initial values */
+    SCOLL_VERBOSE(12, "[#%d] Restore special synchronization array", group->my_pe);
+    if (pSync)
+    {
+        for (slot = 0; slot < _SHMEM_BCAST_SYNC_SIZE; slot++)
+        {
+            pSync[slot] = _SHMEM_SYNC_VALUE;
+        }
+    }
+
+    return rc;
+}
+
+
+/*
+   This algorithm is quite simple and straightforward. But because of its obvious simplicity and
+   the naive proof of correctness it is implemented quite often. The root sends data to all.
+   Outlay:
+   NP-1 competing network transfers are needed to implement the counter
+   The memory usage is constant (1 byte) per node.
+
+   The root put()s nlong bytes from source into target on every other member
+   of the group; afterwards every member enters the group's barrier, which is
+   called on (pSync + 1). Returns the first failing put() status or the status
+   of the barrier.
+*/
+static int __algorithm_central_counter(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+
+    SCOLL_VERBOSE(12, "[#%d] Broadcast algorithm: Central Counter", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld root = #%d", group->my_pe, pSync[0], PE_root);
+
+    /* Only the root pushes data */
+    if (PE_root == group->my_pe)
+    {
+        int member = 0;
+
+        SCOLL_VERBOSE(14, "[#%d] send data to all PE in the group", group->my_pe);
+        for (member = 0; member < group->proc_count; member++)
+        {
+            int dst_pe = oshmem_proc_pe(group->proc_array[member]);
+
+            /* The root already owns the data */
+            if (dst_pe == PE_root)
+            {
+                continue;
+            }
+
+            SCOLL_VERBOSE(15, "[#%d] send data to #%d", group->my_pe, dst_pe);
+            rc = MCA_SPML_CALL(put(target, nlong, (void *)source, dst_pe));
+            if (rc != OSHMEM_SUCCESS)
+            {
+                break;
+            }
+        }
+    }
+
+    if (rc != OSHMEM_SUCCESS)
+    {
+        return rc;
+    }
+
+    /* Wait for operation completion to set needed size */
+    SCOLL_VERBOSE(14, "[#%d] Wait for operation completion", group->my_pe);
+    return group->g_scoll.scoll_barrier(group, (pSync + 1), SCOLL_DEFAULT_ALG);
+}
+
+
+/*
+   The Binomial Spanning Tree algorithm.
+   Outlay:
+   The game scales with log2(NP) and uses 1 byte of memory.
+
+   Group indices are rotated so that the root becomes virtual rank 0
+   (vrank = (my_id - root_id) mod proc_count).
+
+   Receive side (vrank > 0): the PE sets pSync[0] to SHMEM_SYNC_READY and waits
+   until its parent overwrites it; the value written by the parent is the
+   payload size in bytes and replaces nlong.
+
+   Send side: for every bit position above hibit (the value returned by
+   opal_hibit(vrank, dim), presumably the highest set bit of vrank) the child
+   vrank | mask is served, if it exists:
+     1. poll the child's pSync[0] until it reads SHMEM_SYNC_READY
+        (the fetched value is stored in the local pSync[0]);
+     2. put() the payload (source on the root, target elsewhere);
+     3. fence();
+     4. put() the payload size into the child's pSync[0].
+
+   Any failing SPML call stops the traversal and its status is returned; a
+   child is never signalled after a failed data transfer.
+*/
+static int __algorithm_binomial_tree(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    long value = SHMEM_SYNC_INIT;
+    int root_id = oshmem_proc_group_find_id(group, PE_root);
+    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
+    int peer_id = 0;
+    int peer_pe = 0;
+    int vrank;
+    int dim = opal_cube_dim(group->proc_count);
+    int hibit;
+    int mask;
+    int i = 0;
+
+    SCOLL_VERBOSE(12, "[#%d] Broadcast algorithm: Tree", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld root = #%d", group->my_pe, pSync[0], PE_root);
+
+    vrank = (my_id + group->proc_count - root_id) % group->proc_count;
+    hibit = opal_hibit(vrank, dim);
+
+    SCOLL_VERBOSE(15, "[#%d] dim = %d vrank = %d hibit = %d", group->my_pe, dim, vrank, hibit);
+
+    dim--;
+
+    pSync[0] = SHMEM_SYNC_READY;
+    /* Receive data from parent in the tree. */
+    if (vrank > 0)
+    {
+        value = SHMEM_SYNC_READY;
+
+        SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe);
+        rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
+        /* A negative value is not a valid size: keep waiting, but stop as soon as wait() fails */
+        while ((OSHMEM_SUCCESS == rc) && ((value = pSync[0]) < 0))
+        {
+            SCOLL_VERBOSE(14, "[#%d] Broadcast size is a negative value (%li)\n", group->my_pe, pSync[0]);
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
+        }
+        if (OSHMEM_SUCCESS != rc)
+        {
+            return rc;
+        }
+        nlong = (size_t)pSync[0];
+    }
+
+    /* Send data to the children. */
+    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1)
+    {
+        peer_id = vrank | mask;
+
+        if (peer_id < group->proc_count)
+        {
+            /* Wait for the child to be ready to receive (pSync must have the initial value) */
+            peer_id = (peer_id + root_id) % group->proc_count;
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+            SCOLL_VERBOSE(14, "[#%d] check remote pe is ready to receive #%d", group->my_pe, peer_pe);
+            do {
+                rc = MCA_SPML_CALL(get((void*)pSync, sizeof(long), (void*)pSync, peer_pe));
+            } while ((OSHMEM_SUCCESS == rc) && (pSync[0] != SHMEM_SYNC_READY));
+            if (OSHMEM_SUCCESS != rc)
+            {
+                /* The child's state is unknown: do not push data to it */
+                break;
+            }
+
+            SCOLL_VERBOSE(14, "[#%d] send data to #%d", group->my_pe, peer_pe);
+            rc = MCA_SPML_CALL(put(target, nlong, (my_id == root_id ? (void *)source : target), peer_pe));
+            if (OSHMEM_SUCCESS != rc)
+            {
+                /* Do not announce data that was not delivered */
+                break;
+            }
+
+            /* Order the payload before the size that signals its arrival */
+            MCA_SPML_CALL(fence());
+
+            SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe);
+            value = nlong;
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+            if (OSHMEM_SUCCESS != rc)
+            {
+                break;
+            }
+        }
+    }
+
+    return rc;
+}
diff --git a/oshmem/mca/scoll/basic/scoll_basic_collect.c b/oshmem/mca/scoll/basic/scoll_basic_collect.c
new file mode 100644
index 0000000000..a5c445c072
--- /dev/null
+++ b/oshmem/mca/scoll/basic/scoll_basic_collect.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include /* NOTE(review): header name is missing in this copy of the patch */
+#include /* NOTE(review): header name is missing in this copy of the patch */
+
+#include "orte/mca/grpcomm/grpcomm.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "scoll_basic.h"
+
+
+static int __algorithm_central_collector(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync);
+static int __algorithm_f_central_counter(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync);
+static int __algorithm_f_tournament(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync);
+static int __algorithm_f_recursive_doubling(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync);
+static int __algorithm_f_ring(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync);
+
+
+/*
+   Entry point of the basic collect.
+
+   group      - active set; NULL is rejected with OSHMEM_ERR_BAD_PARAM
+   target     - buffer receiving the concatenated contributions
+   source     - this PE's contribution
+   nlong      - size of this PE's contribution in bytes
+   pSync      - synchronization array; NULL is rejected with OSHMEM_ERR_BAD_PARAM,
+                because every algorithm below reads and writes pSync[0]
+   nlong_type - true: one of the "identical size" algorithms is selected by alg;
+                false: __algorithm_central_collector is used regardless of alg
+   alg        - algorithm id, or SCOLL_DEFAULT_ALG to use
+                mca_scoll_basic_param_collect_algorithm
+
+   A PE that is not a member of the group returns OSHMEM_SUCCESS without doing
+   anything. For members, the first _SHMEM_COLLECT_SYNC_SIZE entries of pSync
+   are reset to _SHMEM_SYNC_VALUE before returning.
+*/
+int mca_scoll_basic_collect(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync, bool nlong_type, int alg)
+{
+    int rc = OSHMEM_SUCCESS;
+
+    /* Arguments validation */
+    if (!group)
+    {
+        SCOLL_ERROR("Active set (group) of PE is not defined");
+        rc = OSHMEM_ERR_BAD_PARAM;
+    }
+
+    /* Check if this PE is part of the group */
+    if ((rc == OSHMEM_SUCCESS) && oshmem_proc_group_is_member(group))
+    {
+        int i = 0;
+
+        if (!pSync)
+        {
+            /* Same validation as the broadcast and reduce entry points */
+            SCOLL_ERROR("Incorrect argument pSync");
+            rc = OSHMEM_ERR_BAD_PARAM;
+        }
+        else if (nlong_type)
+        {
+            alg = ( alg == SCOLL_DEFAULT_ALG ? mca_scoll_basic_param_collect_algorithm : alg);
+            switch(alg)
+            {
+            case SCOLL_ALG_COLLECT_CENTRAL_COUNTER:
+                {
+                    rc = __algorithm_f_central_counter(group, target, source, nlong, pSync);
+                    break;
+                }
+            case SCOLL_ALG_COLLECT_TOURNAMENT:
+                {
+                    rc = __algorithm_f_tournament(group, target, source, nlong, pSync);
+                    break;
+                }
+            case SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING:
+                {
+                    rc = __algorithm_f_recursive_doubling(group, target, source, nlong, pSync);
+                    break;
+                }
+            case SCOLL_ALG_COLLECT_RING:
+                {
+                    rc = __algorithm_f_ring(group, target, source, nlong, pSync);
+                    break;
+                }
+            default:
+                {
+                    /* Unknown ids fall back to the central counter */
+                    rc = __algorithm_f_central_counter(group, target, source, nlong, pSync);
+                }
+            }
+        }
+        else
+        {
+            rc = __algorithm_central_collector(group, target, source, nlong, pSync);
+        }
+
+        /* Restore initial values */
+        SCOLL_VERBOSE(12, "[#%d] Restore special synchronization array", group->my_pe);
+        for (i = 0; pSync && (i < _SHMEM_COLLECT_SYNC_SIZE); i++)
+        {
+            pSync[i] = _SHMEM_SYNC_VALUE;
+        }
+    }
+
+    return rc;
+}
+
+
+/*
+   This algorithm is quite simple and straightforward for PEs with identical data size.
+   One node gathers data from peers and sends the final result to them.
+   Outlay:
+   NP-1 competing network transfers are needed.
+
+   The first member of the group (proc_array[0]) is the root. It copies its own
+   source block into slot 0 of target and get()s nlong bytes of source from
+   every other member into slot i, where i is the member's index in proc_array.
+   The result (proc_count * nlong bytes) is then distributed with the group's
+   scoll_broadcast, which is called on (pSync + 1).
+*/
+static int __algorithm_f_central_counter(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    int i = 0;
+    int PE_root = oshmem_proc_pe(group->proc_array[0]);
+
+    SCOLL_VERBOSE(12, "[#%d] Collect algorithm: Central Counter (identical size)", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    if (PE_root == group->my_pe)
+    {
+        int pe_cur = 0;
+
+        memcpy((void*)((unsigned char*)target + 0 * nlong), (void *)source, nlong);
+
+        SCOLL_VERBOSE(14, "[#%d] Gather data from all PEs in the group", group->my_pe);
+        for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++)
+        {
+            /* Get PE ID of a peer from the group */
+            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+
+            if (pe_cur == group->my_pe) continue;
+
+            SCOLL_VERBOSE(14, "[#%d] Gather data (%d bytes) from #%d", group->my_pe, (int)nlong, pe_cur);
+
+            /* Get data from the current peer */
+            rc = MCA_SPML_CALL(get((void *)source, nlong, (void*)((unsigned char*)target + i * nlong), pe_cur));
+        }
+    }
+
+    /* Send result to all PE in group */
+    if (rc == OSHMEM_SUCCESS)
+    {
+        SCOLL_VERBOSE(14, "[#%d] Broadcast from the root #%d", group->my_pe, PE_root);
+        rc = group->g_scoll.scoll_broadcast(group, PE_root, target, target, group->proc_count * nlong, (pSync + 1), SCOLL_DEFAULT_ALG);
+    }
+
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    return rc;
+}
+
+/*
+   Tournament collect for contributions of identical size.
+
+   In every round a PE is paired with index my_id ^ (1 << round); pairs whose
+   partner index is outside of the group are skipped.
+     - The lower index of a pair stores the partner's index in its pSync[0] and
+       waits until pSync[0] equals its own index.
+     - The higher index polls the partner's pSync[0] until it reads its own
+       index, put()s (1 << (round - 1)) * nlong bytes starting at its own slot
+       of target, fences, writes the partner's index into the partner's
+       pSync[0], and then leaves the loop after waiting for SHMEM_SYNC_RUN.
+   Index 0 finally writes SHMEM_SYNC_RUN into pSync[0] of all other members and
+   the gathered buffer is broadcast from proc_array[0] on (pSync + 1).
+
+   NOTE(review): in the higher-index branch the status of get() is discarded
+   and the statuses of both put() calls are overwritten by later calls.
+*/
+static int __algorithm_f_tournament(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    int round = 0;
+    int exit_flag = group->proc_count - 1;
+    long value = SHMEM_SYNC_INIT;
+    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
+    int peer_id = 0;
+    int peer_pe = 0;
+    int PE_root = oshmem_proc_pe(group->proc_array[0]);
+
+    SCOLL_VERBOSE(12, "[#%d] Collect algorithm: Tournament (identical size)", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    /* Set current state as WAIT */
+    pSync[0] = SHMEM_SYNC_WAIT;
+
+    /* Copy data to itself */
+    memcpy((void*)((unsigned char*)target + my_id * nlong), (void *)source, nlong);
+
+    while (exit_flag && (rc == OSHMEM_SUCCESS))
+    {
+        /* Define a peer for competition */
+        peer_id = my_id ^ (1 << round);
+
+        /* Update exit condition and round counter */
+        exit_flag >>= 1;
+        round++;
+
+        /* Do not have peer for tournament */
+        if (peer_id >= group->proc_count) continue;
+
+        if ( my_id < peer_id )
+        {
+            /* Winner of the pair: publish the expected sender, then wait for its completion mark */
+            pSync[0] = peer_id;
+            value = my_id;
+
+            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
+        }
+        else
+        {
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+#if 1 /* It is ugly implementation of compare and swap operation
+         Usage of this hack does not give performance improvement but
+         it is expected that shmem_long_cswap() will make it faster.
+       */
+            do
+            {
+                MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+            } while (value != my_id);
+
+            SCOLL_VERBOSE(14, "[#%d] round = %d send data to #%d", group->my_pe, round, peer_pe);
+            rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + my_id * nlong), (1 << (round - 1)) * nlong, (void*)((unsigned char*)target + my_id * nlong), peer_pe));
+
+            MCA_SPML_CALL(fence());
+
+            SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe);
+            value = peer_id;
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+#endif
+            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
+            value = SHMEM_SYNC_RUN;
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
+
+            break;
+        }
+    }
+
+    /* Send result to all PE in group */
+    if ( (my_id == 0) && (rc == OSHMEM_SUCCESS) )
+    {
+        SCOLL_VERBOSE(14, "[#%d] signals to all", group->my_pe);
+
+        value = SHMEM_SYNC_RUN;
+        for (peer_id = 1; (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS); peer_id++)
+        {
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+        }
+    }
+
+    /* Send result to all PE in group */
+    if ( rc == OSHMEM_SUCCESS )
+    {
+        SCOLL_VERBOSE(14, "[#%d] Broadcast from the root #%d", group->my_pe, PE_root);
+        rc = group->g_scoll.scoll_broadcast(group, PE_root, target, target, group->proc_count * nlong, (pSync + 1), SCOLL_DEFAULT_ALG);
+    }
+
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    return rc;
+}
+
+/*
+   Ring collect for contributions of identical size.
+
+   Every PE always sends to the member at index (my_id + 1) % proc_count.
+   proc_count - 1 rounds are executed; in round i the PE
+     1. put()s the nlong-byte block at slot data_index of target into the same
+        slot on its successor and fences,
+     2. writes i into the successor's pSync[0],
+     3. moves data_index one slot back (wrapping to proc_count - 1),
+     4. waits for its own pSync[0]: in round 0 until it differs from
+        _SHMEM_SYNC_VALUE, in later rounds until it is >= i.
+
+   NOTE(review): the status of the data put() is overwritten by the put() that
+   writes the round number.
+*/
+static int __algorithm_f_ring(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    int i = 0;
+    long value = SHMEM_SYNC_INIT;
+    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
+    int data_index = 0;
+    int peer_id = 0;
+    int peer_pe = 0;
+
+    SCOLL_VERBOSE(12, "[#%d] Collect algorithm: Ring (identical size)", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    peer_id = (my_id + 1) % group->proc_count;
+    peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+    memcpy((void*)((unsigned char*)target + my_id * nlong), (void *)source, nlong);
+    data_index = my_id;
+
+    for (i = 0; (i < (group->proc_count - 1)) && (rc == OSHMEM_SUCCESS); i++)
+    {
+        SCOLL_VERBOSE(14, "[#%d] round = %d send data to #%d by index = %d", group->my_pe, i, peer_pe, data_index);
+        rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + data_index * nlong), nlong, (void*)((unsigned char*)target + data_index * nlong), peer_pe));
+
+        MCA_SPML_CALL(fence());
+
+        SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, i, peer_pe);
+        value = i;
+        rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+
+        data_index = (data_index ? (data_index - 1) : (group->proc_count - 1));
+
+        SCOLL_VERBOSE(14, "[#%d] round = %d wait for data by index = %d", group->my_pe, i, data_index);
+        if ( i == 0 )
+        {
+            value = _SHMEM_SYNC_VALUE;
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
+        }
+        else
+        {
+            value = i;
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_GE, (void*)&value, SHMEM_LONG));
+        }
+    }
+
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    return rc;
+}
+
+/*
+   Recursive doubling collect for contributions of identical size.
+
+   floor2_proc is the largest power of two that is <= proc_count.
+     - "Extra" PEs (my_id >= floor2_proc) put() their source block into slot
+       my_id of target on every other member, fence, write SHMEM_SYNC_RUN into
+       pSync[0] of their partner my_id - floor2_proc, and wait until their own
+       pSync[0] equals SHMEM_SYNC_RUN.
+     - The other PEs first wait for their extra partner (if any), then run the
+       pairwise exchange with partner my_id ^ (1 << round): poll the partner's
+       pSync[0] until it reads the current round number, put() the
+       (1 << (round - 1)) * nlong bytes starting at slot data_index, fence,
+       signal SHMEM_SYNC_RUN and wait for the same signal. Finally the complete
+       target buffer is put() to the extra partner, followed by SHMEM_SYNC_RUN.
+
+   NOTE(review): the statuses of get() and of the data put() calls are not
+   checked; rc reflects only the last put()/wait() of each step.
+*/
+static int __algorithm_f_recursive_doubling(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    int round = 0;
+    int floor2_proc = 0;
+    int exit_flag = 0;
+    long value = SHMEM_SYNC_INIT;
+    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
+    int data_index = 0;
+    int peer_id = 0;
+    int peer_pe = 0;
+    int i = 0;
+
+    floor2_proc = 1;
+    i = group->proc_count;
+    i >>= 1;
+    while (i)
+    {
+        i >>= 1;
+        floor2_proc <<= 1;
+    }
+
+    SCOLL_VERBOSE(12, "[#%d] Collect algorithm: Recursive Doubling (identical size)", group->my_pe);
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld floor2_proc = %d", group->my_pe, pSync[0], floor2_proc);
+
+    memcpy((void*)((unsigned char*)target + my_id * nlong), (void *)source, nlong);
+    data_index = my_id;
+
+    if (my_id >= floor2_proc)
+    {
+        int pe_cur = 0;
+
+        /* I am in extra group, my partner is node (my_id-y) in basic group */
+        peer_id = my_id - floor2_proc;
+        peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+        for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++)
+        {
+            if ( i == my_id ) continue;
+
+            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+
+            SCOLL_VERBOSE(14, "[#%d] is extra send data to #%d", group->my_pe, pe_cur);
+            rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + data_index * nlong), nlong, (void *)source, pe_cur));
+        }
+
+        MCA_SPML_CALL(fence());
+
+        SCOLL_VERBOSE(14, "[#%d] is extra and signal to #%d", group->my_pe, peer_pe);
+        value = SHMEM_SYNC_RUN;
+        rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+
+        SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe);
+        value = SHMEM_SYNC_RUN;
+        rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
+    }
+    else
+    {
+        /* Wait for a peer from extra group */
+        if ((group->proc_count - floor2_proc) > my_id)
+        {
+            /* I am in basic group, my partner is node (my_id+y) in extra group */
+            peer_id = my_id + floor2_proc;
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+            SCOLL_VERBOSE(14, "[#%d] wait a signal from #%d", group->my_pe, peer_pe);
+            value = SHMEM_SYNC_RUN;
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
+        }
+
+        /* Pairwise exchange */
+        exit_flag = floor2_proc - 1;
+        pSync[0] = round;
+        while (exit_flag && (rc == OSHMEM_SUCCESS))
+        {
+            /* Define a peer for competition */
+            peer_id = my_id ^ (1 << round);
+
+            /* Update exit condition and round counter */
+            exit_flag >>= 1;
+            round++;
+
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+#if 1 /* It is ugly implementation of compare and swap operation
+         Usage of this hack does not give performance improvement but
+         it is expected that shmem_long_cswap() will make it faster.
+       */
+            do
+            {
+                MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+            } while (value != (round - 1));
+
+            SCOLL_VERBOSE(14, "[#%d] round = %d send data to #%d by index = %d", group->my_pe, round, peer_pe, data_index);
+            rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + data_index * nlong), (1 << (round - 1)) * nlong, (void*)((unsigned char*)target + data_index * nlong), peer_pe));
+
+            MCA_SPML_CALL(fence());
+
+            data_index = (my_id / (1 << round)) * (1 << round);
+
+            SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe);
+            value = SHMEM_SYNC_RUN;
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+#endif
+
+            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
+            value = SHMEM_SYNC_RUN;
+            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
+
+            pSync[0] = round;
+        }
+
+        /* Notify a peer from extra group */
+        if ((group->proc_count - floor2_proc) > my_id)
+        {
+            /* I am in basic group, my partner is node (my_id+y) in extra group */
+            peer_id = my_id + floor2_proc;
+            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+
+            SCOLL_VERBOSE(14, "[#%d] is extra send data to #%d", group->my_pe, peer_pe);
+            rc = MCA_SPML_CALL(put(target, group->proc_count * nlong, target, peer_pe));
+
+            MCA_SPML_CALL(fence());
+
+            SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe);
+            value = SHMEM_SYNC_RUN;
+            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
+        }
+    }
+
+    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
+
+    return rc;
+}
+
+
+/*
+   This algorithm is quite simple and straightforward. It allows different data sizes on PEs.
+   One node gathers data from peers and sends the final result to them.
+   Outlay:
+   NP-1 competing network transfers are needed.
+
+   Every PE publishes its own contribution size in pSync[0]. The root
+   (proc_array[0]) repeatedly reads pSync[0] of every member whose size is still
+   unknown until all sizes are positive and differ from _SHMEM_SYNC_VALUE, then
+   get()s each contribution into target at the running offset. The gathered
+   buffer is distributed with the group's scoll_broadcast on (pSync + 1).
+
+   The size polling stops as soon as a get() fails, and the failure is returned.
+   NOTE(review): a member contributing 0 bytes never satisfies (value > 0), so
+   the root keeps polling for it; confirm that callers never pass nlong == 0.
+*/
+static int __algorithm_central_collector(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync)
+{
+    int rc = OSHMEM_SUCCESS;
+    size_t offset = 0;
+    int i = 0;
+    int PE_root = oshmem_proc_pe(group->proc_array[0]);
+
+    SCOLL_VERBOSE(12, "[#%d] Collect algorithm: Central Counter (vary size)", group->my_pe);
+
+    /* Set own data size */
+    pSync[0] = nlong;
+
+    if (PE_root == group->my_pe)
+    {
+        long value = 0;
+        int pe_cur = 0;
+        long wait_pe_count = 0;
+        size_t* wait_pe_array = NULL;
+
+        /* One size slot per member; 0 means "size not received yet" */
+        wait_pe_count = group->proc_count;
+        wait_pe_array = calloc((size_t)wait_pe_count, sizeof(*wait_pe_array));
+        if (wait_pe_array)
+        {
+            wait_pe_array[0] = nlong;
+            wait_pe_count--;
+
+            /* Stop polling on the first failed get(): no further size can be trusted */
+            while (wait_pe_count && (rc == OSHMEM_SUCCESS))
+            {
+                SCOLL_VERBOSE(14, "[#%d] Gather data size info from all PEs in the group", group->my_pe);
+                for (i = 1; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++)
+                {
+                    if (wait_pe_array[i] == 0)
+                    {
+                        pe_cur = oshmem_proc_pe(group->proc_array[i]);
+                        value = 0;
+                        rc = MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, pe_cur));
+                        if ( (rc == OSHMEM_SUCCESS) && (value != _SHMEM_SYNC_VALUE) && (value > 0) )
+                        {
+                            wait_pe_array[i] = (size_t)value;
+                            wait_pe_count--;
+                            SCOLL_VERBOSE(14, "Got source data size as %d from #%d (wait list counter: %d)", (int)value, pe_cur, (int)wait_pe_count);
+                        }
+                    }
+                }
+            }
+
+            /* The root's own block always goes first */
+            memcpy((void*)((unsigned char*)target + 0 * nlong), (void *)source, nlong);
+            offset += nlong;
+
+            for (i = 1; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++)
+            {
+                /* Get PE ID of a peer from the group */
+                pe_cur = oshmem_proc_pe(group->proc_array[i]);
+
+                /* Get data from the current peer */
+                rc = MCA_SPML_CALL(get((void *)source, wait_pe_array[i], (void*)((unsigned char*)target + offset), pe_cur));
+
+                SCOLL_VERBOSE(14, "Got %d bytes of data from #%d (offset: %d)", (int)wait_pe_array[i], pe_cur, (int)offset);
+
+                offset += wait_pe_array[i];
+            }
+
+            free(wait_pe_array);
+        }
+        else
+        {
+            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
+        }
+    }
+
+    /* Send result to all PE in group */
+    if (rc == OSHMEM_SUCCESS)
+    {
+        SCOLL_VERBOSE(14, "[#%d] Broadcast from the root #%d", group->my_pe, PE_root);
+        rc = group->g_scoll.scoll_broadcast(group, PE_root, target, target, offset, (pSync + 1), SCOLL_DEFAULT_ALG);
+    }
+
+    return rc;
+}
diff --git a/oshmem/mca/scoll/basic/scoll_basic_component.c b/oshmem/mca/scoll/basic/scoll_basic_component.c
new file mode 100644
index 0000000000..ccba761f21
--- /dev/null
+++ b/oshmem/mca/scoll/basic/scoll_basic_component.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "scoll_basic.h"
+
+
+/*
+ * Public string showing the scoll basic component version number
+ */
+const char *mca_scoll_basic_component_version_string =
+  "Open SHMEM basic collective MCA component version " OSHMEM_VERSION;
+
+/*
+ * Global variable
+ *
+ * mca_scoll_basic_priority_param holds the value returned when the "priority"
+ * parameter is registered; the *_algorithm variables are the defaults used
+ * when a collective is called with SCOLL_DEFAULT_ALG.
+ */
+int mca_scoll_basic_priority_param = -1;
+int mca_scoll_basic_param_barrier_algorithm = SCOLL_ALG_BARRIER_ADAPTIVE;
+int mca_scoll_basic_param_broadcast_algorithm = SCOLL_ALG_BROADCAST_BINOMIAL;
+int mca_scoll_basic_param_collect_algorithm = SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING;
+int mca_scoll_basic_param_reduce_algorithm = SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING;
+
+/*
+ * Local function
+ */
+static int __basic_open(void);
+
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+
+mca_scoll_base_component_t mca_scoll_basic_component = {
+
+    /* First, the mca_component_t struct containing meta information
+       about the component itself */
+
+    {
+        MCA_SCOLL_BASE_VERSION_2_0_0,
+
+        /* Component name and version */
+        "basic",
+        OSHMEM_MAJOR_VERSION,
+        OSHMEM_MINOR_VERSION,
+        OSHMEM_RELEASE_VERSION,
+
+        /* Component open and close functions */
+        __basic_open,
+        NULL
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    /* Initialization / querying functions */
+
+    mca_scoll_basic_init,
+    mca_scoll_basic_query
+};
+
+
+/*
+ * Component open function: registers the "priority" parameter (default 75)
+ * and one algorithm-selection parameter per collective ("barrier_alg",
+ * "broadcast_alg", "collect_alg", "reduce_alg"). Each selection parameter uses
+ * the current value of the matching mca_scoll_basic_param_*_algorithm variable
+ * as its default and passes the address of that variable as storage.
+ * Always returns OSHMEM_SUCCESS.
+ */
+static int __basic_open(void)
+{
+    char help_msg[200];
+    mca_base_component_t *comp = &mca_scoll_basic_component.scoll_version;
+    int default_value = 75;
+    int param_value = default_value;
+
+    /* The value returned by the registration is kept so that
+       mca_scoll_basic_query() can look the priority up later */
+    mca_scoll_basic_priority_param =
+        mca_base_param_reg_int(comp, "priority", NULL, false, false, default_value, &param_value);
+
+    /* help_msg is reused for every parameter; snprintf() bounds each write */
+    snprintf( help_msg, sizeof(help_msg),
+        "Algoritm selection for Barrier (%d - Central Counter, %d - Tournament, %d - Recursive Doubling, %d - Dissemination, %d - Basic, %d - Adaptive)",
+        SCOLL_ALG_BARRIER_CENTRAL_COUNTER,
+        SCOLL_ALG_BARRIER_TOURNAMENT,
+        SCOLL_ALG_BARRIER_RECURSIVE_DOUBLING,
+        SCOLL_ALG_BARRIER_DISSEMINATION,
+        SCOLL_ALG_BARRIER_BASIC,
+        SCOLL_ALG_BARRIER_ADAPTIVE);
+    mca_base_param_reg_int(comp, "barrier_alg",
+        (const char *)help_msg,
+        false, false,
+        mca_scoll_basic_param_barrier_algorithm, &mca_scoll_basic_param_barrier_algorithm);
+
+    snprintf( help_msg, sizeof(help_msg),
+        "Algoritm selection for Broadcast (%d - Central Counter, %d - Binomial)",
+        SCOLL_ALG_BROADCAST_CENTRAL_COUNTER,
+        SCOLL_ALG_BROADCAST_BINOMIAL);
+    mca_base_param_reg_int(comp, "broadcast_alg",
+        (const char *)help_msg,
+        false, false,
+        mca_scoll_basic_param_broadcast_algorithm, &mca_scoll_basic_param_broadcast_algorithm);
+
+    snprintf( help_msg, sizeof(help_msg),
+        "Algoritm selection for Collect (%d - Central Counter, %d - Tournament, %d - Recursive Doubling, %d - Ring)",
+        SCOLL_ALG_COLLECT_CENTRAL_COUNTER,
+        SCOLL_ALG_COLLECT_TOURNAMENT,
+        SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING,
+        SCOLL_ALG_COLLECT_RING);
+    mca_base_param_reg_int(comp, "collect_alg",
+        (const char *)help_msg,
+        false, false,
+        mca_scoll_basic_param_collect_algorithm, &mca_scoll_basic_param_collect_algorithm);
+
+    snprintf( help_msg, sizeof(help_msg),
+        "Algoritm selection for Reduce (%d - Central Counter, %d - Tournament, %d - Recursive Doubling %d - Linear %d - Log)",
+        SCOLL_ALG_REDUCE_CENTRAL_COUNTER,
+        SCOLL_ALG_REDUCE_TOURNAMENT,
+        SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING,
+        SCOLL_ALG_REDUCE_LEGACY_LINEAR,
+        SCOLL_ALG_REDUCE_LEGACY_LOG);
+    mca_base_param_reg_int(comp, "reduce_alg",
+        (const char *)help_msg,
+        false, false,
+        mca_scoll_basic_param_reduce_algorithm, &mca_scoll_basic_param_reduce_algorithm);
+
+    return OSHMEM_SUCCESS;
+}
+
+
+OBJ_CLASS_INSTANCE(mca_scoll_basic_module_t,
+                   mca_scoll_base_module_t,
+                   NULL, NULL);
diff --git a/oshmem/mca/scoll/basic/scoll_basic_module.c b/oshmem/mca/scoll/basic/scoll_basic_module.c
new file mode 100644
index 0000000000..8a021c7910
--- /dev/null
+++ b/oshmem/mca/scoll/basic/scoll_basic_module.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include /* NOTE(review): header name is missing in this copy of the patch */
+
+#include "opal/mca/base/mca_base_param.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "scoll_basic.h"
+
+
+/*
+ * Initial query function that is invoked during initialization, allowing
+ * this module to indicate what level of thread support it provides.
+ * Both arguments are ignored; the function always returns OSHMEM_SUCCESS.
+ */
+int mca_scoll_basic_init(bool enable_progress_threads,
+                         bool enable_threads)
+{
+    /* Nothing to do */
+    return OSHMEM_SUCCESS;
+}
+
+
+/*
+ * Invoked when there's a new communicator that has been created.
+ * Look at the communicator and decide which set of functions and
+ * priority we want to return.
+ */
+static int mca_scoll_basic_enable(mca_scoll_base_module_t *module,
+                                  struct oshmem_group_t *comm)
+{
+    /* No per-group setup is required by this component */
+    return OSHMEM_SUCCESS;
+}
+
+
+/*
+ * Looks up the registered priority and, on success, creates a module whose
+ * collective entry points are the basic implementations.
+ *
+ * group    - not used
+ * priority - receives the value of the "priority" parameter
+ *
+ * Returns the new module, or NULL when the lookup or the allocation fails.
+ */
+mca_scoll_base_module_t *
+mca_scoll_basic_query(struct oshmem_group_t *group,
+                      int *priority)
+{
+    mca_scoll_basic_module_t *basic_module = NULL;
+
+    if (OSHMEM_SUCCESS !=
+        mca_base_param_lookup_int(mca_scoll_basic_priority_param,
+                                  priority))
+    {
+        return NULL;
+    }
+
+    basic_module = OBJ_NEW(mca_scoll_basic_module_t);
+    if (!basic_module)
+    {
+        return NULL;
+    }
+
+    /* Publish the function table of this component */
+    basic_module->super.scoll_barrier = mca_scoll_basic_barrier;
+    basic_module->super.scoll_broadcast = mca_scoll_basic_broadcast;
+    basic_module->super.scoll_collect = mca_scoll_basic_collect;
+    basic_module->super.scoll_reduce = mca_scoll_basic_reduce;
+    basic_module->super.scoll_module_enable = mca_scoll_basic_enable;
+
+    return &(basic_module->super);
+}
diff --git a/oshmem/mca/scoll/basic/scoll_basic_reduce.c b/oshmem/mca/scoll/basic/scoll_basic_reduce.c
new file mode 100644
index 0000000000..c7cc11c449
--- /dev/null
+++ b/oshmem/mca/scoll/basic/scoll_basic_reduce.c
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include /* NOTE(review): header name is missing in this copy of the patch */
+#include /* NOTE(review): header name is missing in this copy of the patch */
+
+#include "opal/util/bit_ops.h"
+
+#include "oshmem/constants.h"
+#include "oshmem/op/op.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "scoll_basic.h"
+
+
+static int __algorithm_central_counter(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk);
+static int __algorithm_tournament(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk);
+static int __algorithm_recursive_doubling(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk);
+static int __algorithm_linear(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk);
+static int __algorithm_log(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk);
+
+
+/*
+   Entry point of the basic reduction.
+
+   group  - active set; NULL is rejected with OSHMEM_ERR_BAD_PARAM
+   op     - reduction operation handed to the selected algorithm
+   target - buffer receiving the reduced result
+   source - this PE's input
+   nlong  - size of the input in bytes
+   pSync  - synchronization array; NULL is rejected with OSHMEM_ERR_BAD_PARAM
+   pWrk   - work buffer handed to the selected algorithm
+   alg    - algorithm id, or SCOLL_DEFAULT_ALG to use
+            mca_scoll_basic_param_reduce_algorithm
+
+   A PE that is not a member of the group returns OSHMEM_SUCCESS without doing
+   anything. For members, the first _SHMEM_REDUCE_SYNC_SIZE entries of pSync are
+   reset to _SHMEM_SYNC_VALUE before returning.
+*/
+int mca_scoll_basic_reduce(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk, int alg)
+{
+    int rc = OSHMEM_SUCCESS;
+    int slot = 0;
+
+    /* A missing active set is a caller error */
+    if (!group)
+    {
+        SCOLL_ERROR("Active set (group) of PE is not defined");
+        return OSHMEM_ERR_BAD_PARAM;
+    }
+
+    /* PEs outside of the active set have nothing to do */
+    if (!oshmem_proc_group_is_member(group))
+    {
+        return OSHMEM_SUCCESS;
+    }
+
+    if (!pSync)
+    {
+        SCOLL_ERROR("Incorrect argument pSync");
+        rc = OSHMEM_ERR_BAD_PARAM;
+    }
+    else
+    {
+        /* SCOLL_DEFAULT_ALG defers to the configured component parameter */
+        if (alg == SCOLL_DEFAULT_ALG)
+        {
+            alg = mca_scoll_basic_param_reduce_algorithm;
+        }
+
+        switch (alg)
+        {
+        case SCOLL_ALG_REDUCE_TOURNAMENT:
+            rc = __algorithm_tournament(group, op, target, source, nlong, pSync, pWrk);
+            break;
+        case SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING:
+            rc = __algorithm_recursive_doubling(group, op, target, source, nlong, pSync, pWrk);
+            break;
+        case SCOLL_ALG_REDUCE_LEGACY_LINEAR:
+            rc = __algorithm_linear(group, op, target, source, nlong, pSync, pWrk);
+            break;
+        case SCOLL_ALG_REDUCE_LEGACY_LOG:
+            rc = __algorithm_log(group, op, target, source, nlong, pSync, pWrk);
+            break;
+        case SCOLL_ALG_REDUCE_CENTRAL_COUNTER:
+        default:
+            /* Unknown ids are served by the central counter as well */
+            rc = __algorithm_central_counter(group, op, target, source, nlong, pSync, pWrk);
+            break;
+        }
+    }
+
+    /* Restore initial values */
+    SCOLL_VERBOSE(12, "PE#%d Restore special synchronization array", group->my_pe);
+    if (pSync)
+    {
+        for (slot = 0; slot < _SHMEM_REDUCE_SYNC_SIZE; slot++)
+        {
+            pSync[slot] = _SHMEM_SYNC_VALUE;
+        }
+    }
+
+    return rc;
+}
+
+
+/*
+   This algorithm is quite simple and straightforward for PEs with identical data size.
+   One node gathers data from peers and sends the final result to them.
+   Outlay:
+   NP-1 competing network transfers are needed.
+*/ +static int __algorithm_central_counter(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int i = 0; + int PE_root = oshmem_proc_pe(group->proc_array[0]); + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Central Counter", group->my_pe); + + if (PE_root == group->my_pe) + { + int pe_cur = 0; + void *target_cur = NULL; + + target_cur = malloc(nlong); + if (target_cur) + { + memcpy(target, (void *)source, nlong); + + SCOLL_VERBOSE(14, "[#%d] Gather data from all PEs in the group", group->my_pe); + for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) + { + /* Get PE ID of a peer from the group */ + pe_cur = oshmem_proc_pe(group->proc_array[i]); + + if (pe_cur == group->my_pe) continue; + + SCOLL_VERBOSE(14, "[#%d] Gather data (%d bytes) from #%d", group->my_pe, (int)nlong, pe_cur); + + /* Clean up temporary buffer */ + memset(target_cur, 0, nlong); + + /* Get data from the current peer */ + rc = MCA_SPML_CALL(get((void *)source, nlong, target_cur, pe_cur)); + + /* Do reduction operation */ + if (rc == OSHMEM_SUCCESS) + { + op->o_func.c_fn(target_cur, target, nlong / op->dt_size); + } + } + + free(target_cur); + } + else + { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) + { + SCOLL_VERBOSE(14, "[#%d] Broadcast from the root #%d", group->my_pe, PE_root); + rc = group->g_scoll.scoll_broadcast(group, PE_root, target, target, nlong, (pSync + 1), SCOLL_DEFAULT_ALG); + } + + return rc; +} + + +static int __algorithm_tournament(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int exit_flag = group->proc_count - 1; + long value = SHMEM_SYNC_INIT; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + void *target_cur = 
NULL; + int PE_root = oshmem_proc_pe(group->proc_array[0]); + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Tournament", group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + /* Set current state as WAIT */ + pSync[0] = SHMEM_SYNC_WAIT; + + target_cur = malloc(nlong); + if (target_cur) + { + memcpy(target_cur, (void *)source, nlong); + } + else + { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + while (exit_flag && (rc == OSHMEM_SUCCESS)) + { + /* Define a peer for competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + /* Do not have peer for tournament */ + if (peer_id >= group->proc_count) continue; + + if ( my_id < peer_id ) + { + pSync[0] = peer_id; + value = my_id; + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + /* Do reduction operation */ + if (rc == OSHMEM_SUCCESS) + { + op->o_func.c_fn(target, target_cur, nlong / op->dt_size); + } + } + else + { + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. 
+ */ + do + { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != my_id); + + SCOLL_VERBOSE(14, "[#%d] round = %d send data to #%d", group->my_pe, round, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe); + value = peer_id; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#endif + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + break; + } + } + + /* Send result to all PE in group */ + if ( (my_id == 0) && (rc == OSHMEM_SUCCESS) ) + { + SCOLL_VERBOSE(14, "[#%d] signals to all", group->my_pe); + + memcpy(target, target_cur, nlong); + + value = SHMEM_SYNC_RUN; + for (peer_id = 1; (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS); peer_id++) + { + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + } + + /* Send result to all PE in group */ + if ( rc == OSHMEM_SUCCESS ) + { + SCOLL_VERBOSE(14, "[#%d] Broadcast from the root #%d", group->my_pe, PE_root); + rc = group->g_scoll.scoll_broadcast(group, PE_root, target, target, nlong, (pSync + 1), SCOLL_DEFAULT_ALG); + } + + free(target_cur); + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + + +static int __algorithm_recursive_doubling(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int round = 0; + int floor2_proc = 0; + int exit_flag = 0; + long value = SHMEM_SYNC_INIT; + void *target_cur = NULL; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int i = 0; + + floor2_proc = 1; + i = 
group->proc_count; + i >>= 1; + while (i) + { + i >>= 1; + floor2_proc <<= 1; + } + + target_cur = malloc(nlong); + if (target_cur) + { + memcpy(target_cur, (void *)source, nlong); + } + else + { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Recursive Doubling", group->my_pe); + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld floor2_proc = %d", group->my_pe, pSync[0], floor2_proc); + + if (my_id >= floor2_proc) + { + /* I am in extra group, my partner is node (my_id-y) in basic group */ + peer_id = my_id - floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + /* Special procedure is needed in case target and source are the same */ + if (source == target) + { + SCOLL_VERBOSE(14, "[#%d] wait for peer #%d is ready", group->my_pe, peer_pe); + value = SHMEM_SYNC_WAIT; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } + + SCOLL_VERBOSE(14, "[#%d] is extra send data to #%d", group->my_pe, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, "[#%d] is extra and signal to #%d", group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + + SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + } + else + { + /* Wait for a peer from extra group */ + if ((group->proc_count - floor2_proc) > my_id) + { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + /* Special procedure is needed in case target and source are the same */ + if (source == target) + { + SCOLL_VERBOSE(14, "[#%d] signal to #%d that I am ready", group->my_pe, peer_pe); + value = SHMEM_SYNC_WAIT; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); 
+ } + + SCOLL_VERBOSE(14, "[#%d] wait a signal from #%d", group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + /* Do reduction operation */ + if (rc == OSHMEM_SUCCESS) + { + op->o_func.c_fn(target, target_cur, nlong / op->dt_size); + } + } + + /* Pairwise exchange */ + exit_flag = floor2_proc - 1; + pSync[0] = round; + while (exit_flag && (rc == OSHMEM_SUCCESS)) + { + /* Define a peer for competition */ + peer_id = my_id ^ (1 << round); + + /* Update exit condition and round counter */ + exit_flag >>= 1; + round++; + + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + +#if 1 /* It is ugly implementation of compare and swap operation + Usage of this hack does not give performance improvement but + it is expected that shmem_long_cswap() will make it faster. + */ + do + { + MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } while (value != (round - 1)); + + SCOLL_VERBOSE(14, "[#%d] round = %d send data to #%d", group->my_pe, round, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, "[#%d] round = %d signals to #%d", group->my_pe, round, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); +#endif + + SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG)); + + /* Do reduction operation */ + if (rc == OSHMEM_SUCCESS) + { + op->o_func.c_fn(target, target_cur, nlong / op->dt_size); + } + + pSync[0] = round; + } + + memcpy(target, target_cur, nlong); + + /* Notify a peer from extra group */ + if ((group->proc_count - floor2_proc) > my_id) + { + /* I am in basic group, my partner is node (my_id+y) in extra group */ + peer_id = my_id + floor2_proc; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + 
SCOLL_VERBOSE(14, "[#%d] is extra send data to #%d", group->my_pe, peer_pe); + rc = MCA_SPML_CALL(put(target, nlong, target_cur, peer_pe)); + + MCA_SPML_CALL(fence()); + + SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe); + value = SHMEM_SYNC_RUN; + rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); + } + } + + free(target_cur); + + SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]); + + return rc; +} + + +static int __algorithm_linear(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int i, rank, size; + char *free_buffer = NULL; + char *pml_buffer = NULL; + char *inbuf; + int peer_id = 0; + int peer_pe = 0; + + /* Initialize */ + rank = group->my_pe; + size = group->proc_count; + int root_id = size - 1; + int root_pe = oshmem_proc_pe(group->proc_array[root_id]); + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Basic", group->my_pe); + + /* If not root, send data to the root. */ + + if (rank != root_pe) { + rc = MCA_SPML_CALL(send((void*)source, nlong, root_pe, MCA_SPML_BASE_PUT_STANDARD)); + } + else { + + /* for reducing buffer allocation lengths.... */ + + if (size > 1) { + free_buffer = (char*)malloc(nlong); + if (NULL == free_buffer) { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + pml_buffer = free_buffer; + } + + /* Initialize the receive buffer. */ + + if (root_id == (size - 1)) { + memcpy(target, (void *)source, nlong); + } else { + peer_id = size - 1; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + rc = MCA_SPML_CALL(recv(target, nlong, peer_pe)); + } + if (OSHMEM_SUCCESS != rc) { + if (NULL != free_buffer) { + free(free_buffer); + } + return rc; + } + + /* Loop receiving and calling reduction function (C or Fortran). 
*/ + + for (i = size - 2; i >= 0; --i) { + if (root_id == i) { + inbuf = (char*)source; + } else { + peer_id = i; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + rc = MCA_SPML_CALL(recv(pml_buffer, nlong, peer_pe)); + if (OSHMEM_SUCCESS != rc) { + if (NULL != free_buffer) { + free(free_buffer); + } + return rc; + } + + inbuf = pml_buffer; + } + + /* Perform the reduction */ + op->o_func.c_fn(inbuf, target, nlong / op->dt_size); + } + + if (NULL != free_buffer) { + free(free_buffer); + } + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) + { + SCOLL_VERBOSE(14, "[#%d] Broadcast from the root #%d", group->my_pe, root_pe); + rc = group->g_scoll.scoll_broadcast(group, root_pe, target, target, nlong, (pSync + 1), SCOLL_DEFAULT_ALG); + } + + /* All done */ + return rc; +} + + +static int __algorithm_log(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk) +{ + int rc = OSHMEM_SUCCESS; + int i, size, rank, vrank; + int mask; + void *sbuf = (void*)source; + void *rbuf = target; + char *free_buffer = NULL; + char *free_rbuf = NULL; + char *pml_buffer = NULL; + char *snd_buffer = NULL; + char *rcv_buffer = (char*)rbuf; + int my_id = oshmem_proc_group_find_id(group, group->my_pe); + int peer_id = 0; + int peer_pe = 0; + int root_id = 0; + int root_pe = oshmem_proc_pe(group->proc_array[root_id]); + int dim = 0; + + /* Initialize */ + rank = group->my_pe; + size = group->proc_count; + dim = opal_cube_dim(group->proc_count); + vrank = (my_id + size - root_id) % size; + + SCOLL_VERBOSE(12, "[#%d] Reduce algorithm: Log", group->my_pe); + + /* Allocate the incoming and resulting message buffers. See lengthy + * rationale above. */ + + free_buffer = (char*)malloc(nlong); + if (NULL == free_buffer) { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + pml_buffer = free_buffer; + rcv_buffer = pml_buffer; + + /* Allocate sendbuf in case the MPI_IN_PLACE option has been used. 
See lengthy + * rationale above. */ + + snd_buffer = (char*)sbuf; + + if (my_id != root_id && 0 == (vrank & 1)) { + /* root is the only one required to provide a valid rbuf. + * Assume rbuf is invalid for all other ranks, so fix it up + * here to be valid on all non-leaf ranks */ + free_rbuf = (char*)malloc(nlong); + if (NULL == free_rbuf) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + goto cleanup_and_return; + } + rbuf = free_rbuf; + } + + /* Loop over cube dimensions. High processes send to low ones in the + * dimension. */ + + for (i = 0, mask = 1; i < dim; ++i, mask <<= 1) { + + /* A high-proc sends to low-proc and stops. */ + if (vrank & mask) { + peer_id = vrank & ~mask; + peer_id = (peer_id + root_id) % size; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + rc = MCA_SPML_CALL(send((void*)snd_buffer, nlong, peer_pe, MCA_SPML_BASE_PUT_STANDARD)); + if (OSHMEM_SUCCESS != rc) { + goto cleanup_and_return; + } + snd_buffer = (char*)rbuf; + break; + } + + /* A low-proc receives, reduces, and moves to a higher + * dimension. */ + + else { + peer_id = vrank | mask; + if (peer_id >= size) { + continue; + } + peer_id = (peer_id + root_id) % size; + peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); + + /* Most of the time (all except the first one for commutative + * operations) we receive in the user provided buffer + * (rbuf). But the exception is here to allow us to dont have + * to copy from the sbuf to a temporary location. If the + * operation is commutative we dont care in which order we + * apply the operation, so for the first time we can receive + * the data in the pml_buffer and then apply to operation + * between this buffer and the user provided data. */ + + rc = MCA_SPML_CALL(recv(rcv_buffer, nlong, peer_pe)); + if (OSHMEM_SUCCESS != rc) { + goto cleanup_and_return; + } + /* Perform the operation. 
The target is always the user + * provided buffer We do the operation only if we receive it + * not in the user buffer */ + if (snd_buffer != sbuf) { + /* the target buffer is the locally allocated one */ + op->o_func.c_fn(rcv_buffer, pml_buffer, nlong / op->dt_size); + } else { + /* If we're commutative, we don't care about the order of + * operations and we can just reduce the operations now. + * If we are not commutative, we have to copy the send + * buffer into a temp buffer (pml_buffer) and then reduce + * what we just received against it. */ + { + op->o_func.c_fn(sbuf, pml_buffer, nlong / op->dt_size); + } + /* now we have to send the buffer containing the computed data */ + snd_buffer = pml_buffer; + /* starting from now we always receive in the user + * provided buffer */ + rcv_buffer = (char*)rbuf; + } + } + } + + /* Get the result to the root if needed. */ + rc = OSHMEM_SUCCESS; + if (0 == vrank) { + if (root_id == my_id) { + memcpy(rbuf, snd_buffer, nlong); + } else { + rc = MCA_SPML_CALL(send((void*)snd_buffer, nlong, root_pe, MCA_SPML_BASE_PUT_STANDARD)); + } + } else if (my_id == root_id) { + rc = MCA_SPML_CALL(recv(rcv_buffer, nlong, root_pe)); + if (rcv_buffer != rbuf) { + op->o_func.c_fn(rcv_buffer, rbuf, nlong / op->dt_size); + } + } + + cleanup_and_return: + if (NULL != free_buffer) { + free(free_buffer); + } + if (NULL != free_rbuf) { + free(free_rbuf); + } + + /* Send result to all PE in group */ + if (rc == OSHMEM_SUCCESS) + { + SCOLL_VERBOSE(14, "[#%d] Broadcast from the root #%d", group->my_pe, root_pe); + rc = group->g_scoll.scoll_broadcast(group, root_pe, target, target, nlong, (pSync + 1), SCOLL_DEFAULT_ALG); + } + + /* All done */ + return rc; +} diff --git a/oshmem/mca/scoll/fca/.windows b/oshmem/mca/scoll/fca/.windows new file mode 100644 index 0000000000..104768dd6a --- /dev/null +++ b/oshmem/mca/scoll/fca/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libshmem diff --git a/oshmem/mca/scoll/fca/Makefile.am b/oshmem/mca/scoll/fca/Makefile.am new file mode 100644 index 0000000000..fc9f772975 --- /dev/null +++ b/oshmem/mca/scoll/fca/Makefile.am @@ -0,0 +1,39 @@ +# -*- shell-script -*- +# +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# +# Use the scoll_fca_* values that this component's own configure.m4 AC_SUBSTs. +AM_CPPFLAGS = $(OSHMEM_CFLAGS) $(scoll_fca_CPPFLAGS) -DCOLL_FCA_HOME=\"$(scoll_fca_HOME)\" -I$(scoll_fca_HOME)/include/fca -I$(scoll_fca_HOME)/include/fca_core +scoll_fca_sources = \ + scoll_fca.h \ + scoll_fca_debug.h \ + scoll_fca_api.h \ + scoll_fca_module.c \ + scoll_fca_component.c \ + scoll_fca_ops.c +if MCA_BUILD_oshmem_scoll_fca_DSO +component_noinst = +component_install = mca_scoll_fca.la +else +component_noinst = libmca_scoll_fca.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_scoll_fca_la_SOURCES = $(scoll_fca_sources) +mca_scoll_fca_la_LIBADD = $(scoll_fca_LIBS) +mca_scoll_fca_la_LDFLAGS = -module -avoid-version $(scoll_fca_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_scoll_fca_la_SOURCES =$(scoll_fca_sources) +libmca_scoll_fca_la_LIBADD = $(scoll_fca_LIBS) +libmca_scoll_fca_la_LDFLAGS = -module -avoid-version $(scoll_fca_LDFLAGS) diff --git a/oshmem/mca/scoll/fca/configure.m4 b/oshmem/mca/scoll/fca/configure.m4 new file mode 100644 index 0000000000..6c7c94496c --- /dev/null +++ b/oshmem/mca/scoll/fca/configure.m4 @@ -0,0 +1,39 @@ +# -*- shell-script -*- +# +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +# MCA_oshmem_scoll_fca_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_oshmem_scoll_fca_CONFIG],[ + AC_CONFIG_FILES([oshmem/mca/scoll/fca/Makefile]) + + OMPI_CHECK_FCA([scoll_fca], + [scoll_fca_happy="yes"], + [scoll_fca_happy="no"]) + + AS_IF([test "$scoll_fca_happy" = "yes"], + [scoll_fca_WRAPPER_EXTRA_LDFLAGS="$scoll_fca_LDFLAGS" + scoll_fca_CPPFLAGS="$scoll_fca_CPPFLAGS" + scoll_fca_WRAPPER_EXTRA_CPPFLAGS="$scoll_fca_CPPFLAGS" + scoll_fca_WRAPPER_EXTRA_LIBS="$scoll_fca_LIBS" + $1], + [$2]) + + # substitute in the things needed to build fca + AC_SUBST([scoll_fca_CFLAGS]) + AC_SUBST([scoll_fca_CPPFLAGS]) + AC_SUBST([scoll_fca_LDFLAGS]) + AC_SUBST([scoll_fca_LIBS]) + AC_SUBST(scoll_fca_HOME, "$ompi_check_fca_dir") +])dnl + diff --git a/oshmem/mca/scoll/fca/configure.params b/oshmem/mca/scoll/fca/configure.params new file mode 100644 index 0000000000..5a3f93008f --- /dev/null +++ b/oshmem/mca/scoll/fca/configure.params @@ -0,0 +1,13 @@ +# -*- shell-script -*- +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/scoll/fca/scoll_fca.h b/oshmem/mca/scoll/fca/scoll_fca.h new file mode 100644 index 0000000000..0fb6cb276b --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca.h @@ -0,0 +1,126 @@ +/** + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * */ + +#ifndef MCA_SCOLL_FCA_H +#define MCA_SCOLL_FCA_H +#include "oshmem_config.h" +#include "oshmem/constants.h" +#include "shmem.h" +#include "opal/mca/mca.h" +#include "oshmem/mca/scoll/scoll.h" +#include "scoll_fca_api.h" +#include "scoll_fca_debug.h" + +#ifdef OMPI_PROC_FLAG_LOCAL +#define FCA_IS_LOCAL_PROCESS(n) ((n) & OMPI_PROC_FLAG_LOCAL) +#else +#define FCA_IS_LOCAL_PROCESS(n) OPAL_PROC_ON_LOCAL_NODE(n) +#endif + +BEGIN_C_DECLS +struct mca_scoll_fca_component_t { + /** Base coll component */ + mca_scoll_base_component_1_0_0_t super; + + /** MCA parameter: Priority of this component */ + int fca_priority; + + /** MCA parameter: Verbose level of this component */ + int fca_verbose; + +// /** MCA parameter: Comm_mLid */ +// char *fca_comm_mlid; +// +// /** MCA parameter: Comm_mGid */ +// char *fca_comm_mgid; +// +// /** MCA parameter: FCA_Mlid */ +// char *fca_fmm_mlid; +// + /** MCA parameter: Path to fca spec file */ + char* fca_spec_file; + + /** MCA parameter: FCA device */ + char* fca_dev; + + /** MCA parameter: Enable FCA */ + int fca_enable; + + /** MCA parameter: Enable FCA Barrier */ + int fca_enable_barrier; + + /** MCA parameter: Enable FCA Bcast */ + int fca_enable_bcast; + + /** MCA parameter: Enable FCA Allreduce */ + int fca_enable_allreduce; + + /** MCA parameter: Enable FCA Allgather */ + int fca_enable_allgather; + + /** MCA parameter: Enable FCA Allgatherv */ + int fca_enable_allgatherv; + + /** MCA parameter: FCA NP */ + int fca_np; + + /* FCA global stuff */ + fca_t *fca_context; /* FCA context handle */ + // mca_coll_fca_dtype_info_t fca_dtypes[FCA_DT_MAX_PREDEFINED]; /* FCA dtype translation */ + // mca_coll_fca_op_info_t fca_reduce_ops[FCA_MAX_OPS]; /* FCA op translation */ + + /*These vars are used as symmetric objects during __fca_comm_new. 
The proper amount of memory + is allocated only once during fca_comm_query*/ + int *ret; + int *rcounts; + void *my_info_exchangeable; + void *fca_comm_desc_exchangeable; +}; +typedef struct mca_scoll_fca_component_t mca_scoll_fca_component_t; + +OSHMEM_MODULE_DECLSPEC extern mca_scoll_fca_component_t mca_scoll_fca_component; + +struct mca_scoll_fca_module_t { + mca_scoll_base_module_t super; + struct oshmem_group_t *comm; + int rank; + int local_proc_idx; + int num_local_procs; + int *local_ranks; + fca_comm_t *fca_comm; + fca_comm_desc_t fca_comm_desc; + fca_comm_caps_t fca_comm_caps; + + + /* Saved handlers - for fallback */ + mca_scoll_base_module_barrier_fn_t previous_barrier; + mca_scoll_base_module_t *previous_barrier_module; + mca_scoll_base_module_broadcast_fn_t previous_broadcast; + mca_scoll_base_module_t *previous_broadcast_module; + mca_scoll_base_module_collect_fn_t previous_collect; + mca_scoll_base_module_t *previous_collect_module; + mca_scoll_base_module_reduce_fn_t previous_reduce; + mca_scoll_base_module_t *previous_reduce_module; +}; +typedef struct mca_scoll_fca_module_t mca_scoll_fca_module_t; +OBJ_CLASS_DECLARATION(mca_scoll_fca_module_t); + +/* API functions */ +int mca_scoll_fca_init_query(bool enable_progress_threads, bool enable_mpi_threads); +mca_scoll_base_module_t *mca_scoll_fca_comm_query(struct oshmem_group_t *comm, int *priority); +int mca_scoll_fca_get_fca_lib(struct oshmem_group_t *comm); + +int mca_scoll_fca_barrier(struct oshmem_group_t *group, long *pSync, int algorithm_type); +int mca_scoll_fca_broadcast(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync, int algorithm_type); +int mca_scoll_fca_collect(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync, bool nlong_type, int algorithm_type); +int mca_scoll_fca_reduce(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void 
*pWrk, int algorithm_type); +OBJ_CLASS_DECLARATION(mca_scoll_fca_module_t); +END_C_DECLS +#endif diff --git a/oshmem/mca/scoll/fca/scoll_fca_api.h b/oshmem/mca/scoll/fca/scoll_fca_api.h new file mode 100644 index 0000000000..7afe0b0516 --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_api.h @@ -0,0 +1,76 @@ +/* +* Copyright (c) 2012 Mellanox Technologies, Inc. +* All rights reserved. +* $COPYRIGHT$ +* +* Additional copyrights may follow +* +* $HEADER$ +*/ +#include "oshmem_config.h" + +#include +#include +#include + +#ifndef FCA_API +#define OSHMEM_FCA_VERSION 12 +#else +#define OSHMEM_FCA_VERSION FCA_API +#endif + +/* + * * FCA API compatibility layer. + * * MPI build must define an FCA version macro. + * */ + +#define OSHMEM_FCA_BARRIER 1 +#define OSHMEM_FCA_BCAST 1 +#define OSHMEM_FCA_ALLREDUCE 1 + + +#if OSHMEM_FCA_VERSION == 12 + +#define OSHMEM_FCA_ALLGATHER 0 +#define OSHMEM_FCA_ALLGATHERV 0 /* off for this API version; must exist because fca_register() uses it as a parameter default */ +#define FCA_API_ABI_MAJOR 1 +#define FCA_API_ABI_MINOR 2 +#define FCA_MAJOR_BIT 24ul +#define FCA_MINOR_BIT 16ul +#define EUSESHMEM 287 + +static inline int mca_scoll_fca_comm_init(fca_t *fca_context, int rank, int comm_size, + int local_proc_idx, int num_local_procs, + fca_comm_desc_t *comm_desc, + fca_comm_t **fca_comm) +{ + return fca_comm_init(fca_context, local_proc_idx, num_local_procs, + comm_size, comm_desc, fca_comm); +} +#elif OSHMEM_FCA_VERSION >= 20 + +#define OSHMEM_FCA_ALLGATHER 1 +#define OSHMEM_FCA_ALLGATHERV 1 + +#define OSHMEM_FCA_PROGRESS 1 +#define EUSESHMEM 287 + +static inline int mca_scoll_fca_comm_init(fca_t *fca_context, int rank, int comm_size, + int local_proc_idx, int num_local_procs, + fca_comm_desc_t *comm_desc, + fca_comm_t **fca_comm) +{ + fca_comm_init_spec_t spec; + + spec.rank = rank; + spec.size = comm_size; + spec.desc = *comm_desc; + spec.proc_idx = local_proc_idx; + spec.num_procs = num_local_procs; + return fca_comm_init(fca_context, &spec, fca_comm); +} +#else + +#error "FCA API version is unsupported" + +#endif diff --git 
a/oshmem/mca/scoll/fca/scoll_fca_component.c b/oshmem/mca/scoll/fca/scoll_fca_component.c new file mode 100644 index 0000000000..e6c9a794de --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_component.c @@ -0,0 +1,272 @@ +/** + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * */ +#define _GNU_SOURCE +#include + +#include +#include + +#include "scoll_fca.h" + +#include "opal/runtime/opal_progress.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/memheap/memheap.h" +/* + * * Public string showing the coll ompi_fca component version number + * */ +const char *mca_scoll_fca_component_version_string = +"Open SHMEM FCA collective MCA component version " OSHMEM_VERSION; + +/* + * * Global variable + * */ +int mca_scoll_fca_output = -1; + +/* + * * Instantiate the public struct with all of our public information + * * and pointers to our public functions in it + * */ +static int fca_open(void); +static int fca_close(void); +static int fca_register(void); + +mca_scoll_fca_component_t mca_scoll_fca_component = { + + /* First, the mca_component_t struct containing meta information + * about the component itfca */ + { + { + MCA_SCOLL_BASE_VERSION_2_0_0, + + /* Component name and version */ + "fca", + OSHMEM_MAJOR_VERSION, + OSHMEM_MINOR_VERSION, + OSHMEM_RELEASE_VERSION, + + /* Component open and close functions */ + fca_open, + fca_close, + NULL, + fca_register + }, + { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + + /* Initialization / querying functions */ + + mca_scoll_fca_init_query, + mca_scoll_fca_comm_query, + } +}; + +#define FCA_API_CLEAR_MICRO(__x) ((__x>>FCA_MINOR_BIT)<>FCA_MAJOR_BIT); + minor = (fca_ver>>FCA_MINOR_BIT) & 0xf; + sprintf(x, "%ld%ld", major, minor); + detected_ver = atol(x); + + if (detected_ver != OSHMEM_FCA_VERSION) { + FCA_ERROR("Unsupported FCA version: %s, please update FCA to v%d, now v%ld", + 
fca_get_version_string(), + OSHMEM_FCA_VERSION, fca_ver); + return OSHMEM_ERROR; + } + + spec = fca_parse_spec_file(mca_scoll_fca_component.fca_spec_file); + if (!spec) { + FCA_ERROR("Failed to parse FCA spec file `%s'", mca_scoll_fca_component.fca_spec_file); + return OSHMEM_ERROR; + } + spec->job_id = oshmem_proc_local()->proc_name.jobid; + spec->rank_id = oshmem_proc_pe(oshmem_proc_local()); + spec->progress.func = mca_scoll_fca_progress_cb; + spec->progress.arg = NULL; + + ret = fca_init(spec, &mca_scoll_fca_component.fca_context); + fca_free_init_spec(spec); /* spec is only needed by fca_init(); free it here so the failure path below does not leak it */ + if (ret < 0) { + FCA_ERROR("Failed to initialize FCA: %s", fca_strerror(ret)); + return OSHMEM_ERROR; + } + // mca_scoll_fca_init_fca_translations(); + + opal_progress_register(mca_scoll_fca_mpi_progress_cb); + return OSHMEM_SUCCESS; +} + +static void mca_scoll_fca_close_fca_lib(void) +{ + opal_progress_unregister(mca_scoll_fca_mpi_progress_cb); /* mirrors the registration done after fca_init() */ + fca_cleanup(mca_scoll_fca_component.fca_context); + mca_scoll_fca_component.fca_context = NULL; +} + +static int fca_register(void) +{ + mca_base_component_t *c; + + FCA_VERBOSE(2, "==>"); + + c = &mca_scoll_fca_component.super.scoll_version; + + mca_base_param_reg_int(c, "priority", + "Priority of the fca coll component", + false, false, + 80, + &mca_scoll_fca_component.fca_priority); + + mca_base_param_reg_int(c, "verbose", + "Verbose level of the fca coll component", + false, false, + 0, + &mca_scoll_fca_component.fca_verbose); + + mca_base_param_reg_int(c, "enable", + "[1|0|] Enable/Disable Fabric Collective Accelerator", + false, false, + 1, + &mca_scoll_fca_component.fca_enable); + + mca_base_param_reg_string(c, "spec_file", + "Path to the FCA configuration file fca_mpi_spec.ini", + false, false, + ""COLL_FCA_HOME"/etc/fca_mpi_spec.ini", + &mca_scoll_fca_component.fca_spec_file); + + mca_base_param_reg_int(c, "np", + "[integer] Minimal allowed job's NP to activate FCA", + false, false, + 64, + &mca_scoll_fca_component.fca_np); + + 
mca_base_param_reg_int(c, "enable_barrier", + "[1|0|] Enable/Disable FCA Barrier support", + false, false, + OSHMEM_FCA_BARRIER, + &mca_scoll_fca_component.fca_enable_barrier); + + mca_base_param_reg_int(c, "enable_bcast", + "[1|0|] Enable/Disable FCA Bcast support", + false, false, + OSHMEM_FCA_BCAST, + &mca_scoll_fca_component.fca_enable_bcast); + + mca_base_param_reg_int(c, "enable_allreduce", + "[1|0|] Enable/Disable FCA Allreduce support", + false, false, + OSHMEM_FCA_ALLREDUCE, + &mca_scoll_fca_component.fca_enable_allreduce); + + mca_base_param_reg_int(c, "enable_allgather", + "[1|0|] Enable/Disable FCA Allgather support", + false, false, + OSHMEM_FCA_ALLGATHER, + &mca_scoll_fca_component.fca_enable_allgather); + + mca_base_param_reg_int(c, "enable_allgatherv", + "[1|0|] Enable/Disable FCA Allgatherv support", + false, false, + OSHMEM_FCA_ALLGATHERV, + &mca_scoll_fca_component.fca_enable_allgatherv); + return OSHMEM_SUCCESS; +} + +static int fca_open(void) +{ + FCA_VERBOSE(2, "==>"); + + + mca_scoll_fca_output = opal_output_open(NULL); + opal_output_set_verbosity(mca_scoll_fca_output, mca_scoll_fca_component.fca_verbose); + mca_scoll_fca_component.fca_context = NULL; + mca_scoll_fca_component.ret = NULL; + mca_scoll_fca_component.rcounts = NULL; + mca_scoll_fca_component.fca_comm_desc_exchangeable = NULL; + mca_scoll_fca_component.my_info_exchangeable = NULL; + return OSHMEM_SUCCESS; +} + +static int fca_close(void) +{ + FCA_VERBOSE(2, "==>"); + + if (!mca_scoll_fca_component.fca_context) + return OSHMEM_SUCCESS; + + mca_scoll_fca_close_fca_lib(); + + if (NULL != mca_scoll_fca_component.ret) + MCA_MEMHEAP_CALL(private_free(mca_scoll_fca_component.ret)); + + if (NULL != mca_scoll_fca_component.rcounts) + MCA_MEMHEAP_CALL(private_free(mca_scoll_fca_component.rcounts)); + + if (NULL != mca_scoll_fca_component.fca_comm_desc_exchangeable) + MCA_MEMHEAP_CALL(private_free(mca_scoll_fca_component.fca_comm_desc_exchangeable)); + + if (NULL != 
mca_scoll_fca_component.my_info_exchangeable) + MCA_MEMHEAP_CALL(private_free(mca_scoll_fca_component.my_info_exchangeable)); + return OSHMEM_SUCCESS; +} + diff --git a/oshmem/mca/scoll/fca/scoll_fca_debug.h b/oshmem/mca/scoll/fca/scoll_fca_debug.h new file mode 100644 index 0000000000..9f82308cb4 --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_debug.h @@ -0,0 +1,36 @@ +/** + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * */ + +#ifndef MCA_SCOLL_FCA_DEBUG_H +#define MCA_SCOLL_FCA_DEBUG_H +#pragma GCC system_header + +#ifdef __BASE_FILE__ +#define __FCA_FILE__ __BASE_FILE__ +#else +#define __FCA_FILE__ __FILE__ +#endif + +#define FCA_VERBOSE(level, format, ...) \ + opal_output_verbose(level, mca_scoll_fca_output, "%s:%d - %s() " format, \ + __FCA_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define FCA_ERROR(format, ... ) \ + opal_output_verbose(0, mca_scoll_fca_output, "Error: %s:%d - %s() " format, \ + __FCA_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + + +#define FCA_MODULE_VERBOSE(fca_module, level, format, ...) \ + FCA_VERBOSE(level, "[%p:%d] " format, (void*)(fca_module)->comm, (fca_module)->rank, ## __VA_ARGS__) + +extern int mca_scoll_fca_output; + +#endif + diff --git a/oshmem/mca/scoll/fca/scoll_fca_module.c b/oshmem/mca/scoll/fca/scoll_fca_module.c new file mode 100644 index 0000000000..b38cfbc335 --- /dev/null +++ b/oshmem/mca/scoll/fca/scoll_fca_module.c @@ -0,0 +1,555 @@ +/** + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ * */
+#include "oshmem_config.h"
+#include "scoll_fca.h"
+#include "opal/mca/base/mca_base_param.h"
+/* NOTE(review): the names of the two system headers below were lost from the
+ * patch text.  <stdlib.h> and <string.h> are what this file needs for
+ * calloc()/malloc()/free() and memcpy()/memset(); confirm against upstream. */
+#include <stdlib.h>
+#include <string.h>
+#include "oshmem/constants.h"
+#include "oshmem/mca/scoll/scoll.h"
+#include "oshmem/mca/scoll/base/base.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/memheap/memheap.h"
+#include "oshmem/runtime/runtime.h"
+
+/* Index, inside a group's proc_array, of the member that acts as root. */
+static const int root_id = 0;
+
+#define __INTERNAL_BARRIER_FROM_SCOLL_BASIC 1
+
+/*
+ * Synchronize every member of the module's group while FCA is being set up.
+ *
+ * With __INTERNAL_BARRIER_FROM_SCOLL_BASIC set (the default) the call is
+ * forwarded to the barrier handler saved in previous_barrier; otherwise a
+ * send/recv barrier rooted at root_id is compiled in.
+ *
+ * Returns the status reported by the underlying barrier.
+ */
+static int __internal_barrier(mca_scoll_fca_module_t *fca_module)
+{
+#if !__INTERNAL_BARRIER_FROM_SCOLL_BASIC
+    struct oshmem_group_t *group = fca_module->comm;
+    int rc = OSHMEM_SUCCESS;
+    int PE_root = oshmem_proc_pe(group->proc_array[root_id]);
+    int i = 0;
+
+    if (PE_root != group->my_pe)
+    {
+        /* Non-root: notify the root, then wait for its release message. */
+        rc = MCA_SPML_CALL(send(NULL, 0, PE_root, MCA_SPML_BASE_PUT_STANDARD));
+        if (OSHMEM_SUCCESS != rc) {
+            return rc;
+        }
+
+        rc = MCA_SPML_CALL(recv(NULL, 0, PE_root));
+        if (OSHMEM_SUCCESS != rc) {
+            return rc;
+        }
+    }
+
+    /* The root collects and broadcasts the messages. */
+
+    else
+    {
+        int pe_cur = 0;
+
+        for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++)
+        {
+            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            if (pe_cur != PE_root)
+            {
+                rc = MCA_SPML_CALL(recv(NULL, 0, SHMEM_ANY_SOURCE));
+            }
+            if (OSHMEM_SUCCESS != rc) {
+                return rc;
+            }
+        }
+
+        for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++)
+        {
+            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            if (pe_cur != PE_root)
+            {
+                rc = MCA_SPML_CALL(send(NULL, 0, pe_cur, MCA_SPML_BASE_PUT_STANDARD));
+            }
+            if (OSHMEM_SUCCESS != rc) {
+                return rc;
+            }
+        }
+    }
+
+    return rc;
+#else
+    long pSync = _SHMEM_SYNC_VALUE;
+    /* We use the 4th barrier algorithm from scoll/basic.  It does not use
+     * pSync, so an ordinary long is passed only to satisfy the function
+     * signature. */
+    return fca_module->previous_barrier(fca_module->comm, &pSync, SCOLL_ALG_BARRIER_BASIC);
+#endif
+}
+
+/*
+ * Initial query function of the component.  It accepts any threading
+ * configuration and always reports success.
+ */
+int mca_scoll_fca_init_query(bool enable_progress_threads,
+                             bool enable_mpi_threads)
+{
+    return OSHMEM_SUCCESS;
+}
+
+/*
+ * Scan the first `size` members of `group`.
+ *
+ * *local_peers receives the number of members flagged as local by
+ * FCA_IS_LOCAL_PROCESS().  Returns 1 if at least one member is not local,
+ * 0 otherwise.
+ */
+static int have_remote_peers(struct oshmem_group_t *group, size_t size, int *local_peers)
+{
+    struct oshmem_proc_t *proc;
+    size_t i;
+    int ret;
+
+    *local_peers = 0;
+    ret = 0;
+    for (i = 0; i < size; ++i) {
+        proc = group->proc_array[i];
+        if (FCA_IS_LOCAL_PROCESS(proc->proc_flags)) {
+            ++*local_peers;
+        } else {
+            ret = 1;
+        }
+    }
+    return ret;
+}
+
+
+
+/**
+ * Fills local rank information in fca_module: the number of local processes,
+ * this process' index among them, and the list of group indices that are
+ * local.  Returns OSHMEM_ERROR if the list cannot be allocated.
+ */
+static int __get_local_ranks(mca_scoll_fca_module_t *fca_module)
+{
+    struct oshmem_group_t *comm = fca_module->comm;
+    oshmem_proc_t* proc;
+    int i, rank;
+
+    /* Count the local ranks */
+    fca_module->num_local_procs = 0;
+    for (rank = 0; rank < comm->proc_count; ++rank) {
+        proc = comm->proc_array[rank];
+        if (FCA_IS_LOCAL_PROCESS(proc->proc_flags)) {
+            if (proc->proc_name.vpid == (uint32_t)fca_module->rank) {
+                fca_module->local_proc_idx = fca_module->num_local_procs;
+            }
+            ++fca_module->num_local_procs;
+        }
+    }
+    /* Make a list of local ranks */
+    fca_module->local_ranks = calloc(fca_module->num_local_procs,
+                                     sizeof *fca_module->local_ranks);
+    if (!fca_module->local_ranks) {
+        FCA_ERROR("Failed to allocate memory for %d local ranks",
+                  fca_module->num_local_procs);
+        return OSHMEM_ERROR;
+    }
+
+    i = 0;
+    for (rank = 0; rank < comm->proc_count; ++rank) {
+        proc = comm->proc_array[rank];
+        if (FCA_IS_LOCAL_PROCESS(proc->proc_flags)) {
+            fca_module->local_ranks[i++] = rank;
+        }
+    }
+
+    FCA_MODULE_VERBOSE(fca_module, 3, "i am %d/%d", fca_module->local_proc_idx,
+                       fca_module->num_local_procs);
+
+    return OSHMEM_SUCCESS;
+}
+
+
+/*
+ * Create the FCA communicator descriptor for the module's group.
+ *
+ * Node managers (local_proc_idx == 0) obtain their rank_info from FCA.  The
+ * root gathers every member's rank_info size and data through the component's
+ * exchange buffers, calls fca_comm_new(), and then both the result code and
+ * the new descriptor are fetched from the root by all other members.
+ *
+ * Returns OSHMEM_SUCCESS, or OSHMEM_ERROR when rank_info cannot be obtained,
+ * a gather buffer cannot be allocated on the root, or fca_comm_new() fails.
+ * An early error return on one PE leaves the others waiting in a barrier;
+ * the caller in this file aborts the job on any error.
+ */
+static int __fca_comm_new(mca_scoll_fca_module_t *fca_module)
+{
+    struct oshmem_group_t *comm = fca_module->comm;
+    fca_comm_new_spec_t spec;
+    int info_size = 0, all_info_size = 0;
+    void *all_info = NULL, *my_info = NULL;
+    int *disps = NULL;
+    int i;
+    const int root_pe = oshmem_proc_pe(comm->proc_array[root_id]);
+    const int my_id = oshmem_proc_group_find_id(comm, comm->my_pe);
+
+    /* call fca_get_rank_info() on node managers only */
+    if (fca_module->local_proc_idx == 0)
+    {
+        my_info = fca_get_rank_info(mca_scoll_fca_component.fca_context,
+                                    &info_size);
+        if (!my_info) {
+            FCA_ERROR("fca_get_rank_info returned NULL");
+            return OSHMEM_ERROR;
+        }
+
+    } else {
+        info_size = 0;
+    }
+
+    FCA_MODULE_VERBOSE(fca_module, 1, "Info size: %d", info_size);
+
+    /* -1 marks "not yet received"; the root waits for each slot to change. */
+    for (i = 0; i < comm->proc_count; i++)
+    {
+        mca_scoll_fca_component.rcounts[i] = -1;
+    }
+    __internal_barrier(fca_module);
+    MCA_SPML_CALL(put((void *)&mca_scoll_fca_component.rcounts[my_id],
+                      (size_t)sizeof(info_size), (void *)&info_size, root_pe));
+
+    if (root_pe == comm->my_pe)
+    {
+        int value = -1;
+        for (i = 0; i < comm->proc_count; i++)
+        {
+            MCA_SPML_CALL(wait((void *)&mca_scoll_fca_component.rcounts[i],
+                               SHMEM_CMP_NE, &value, SHMEM_INT));
+        }
+    }
+
+
+    /* Allocate buffer for gathering rank information on rank0 */
+    if (root_pe == comm->my_pe) {
+        all_info_size = 0;
+        disps = calloc(comm->proc_count, sizeof *disps);
+        if (!disps) {
+            FCA_ERROR("Failed to allocate memory for %d displacements",
+                      comm->proc_count);
+            goto err_free;
+        }
+        for (i = 0; i < comm->proc_count; ++i) {
+            disps[i] = all_info_size;
+            all_info_size += mca_scoll_fca_component.rcounts[i];
+        }
+        FCA_MODULE_VERBOSE(fca_module, 1, "Total rank_info size: %d", all_info_size);
+        all_info = malloc(all_info_size);
+        /* malloc(0) may legitimately return NULL, so only a non-empty
+         * request counts as a failure. */
+        if (!all_info && all_info_size > 0) {
+            FCA_ERROR("Failed to allocate %d bytes for rank_info", all_info_size);
+            goto err_free;
+        }
+        if (all_info) {
+            memset(all_info, 0, all_info_size);
+        }
+    }
+
+    if (my_info)
+    {
+        /* NOTE(review): my_info_exchangeable is allocated outside this
+         * function; nothing here verifies that info_size fits into it. */
+        memcpy(mca_scoll_fca_component.my_info_exchangeable, my_info, info_size);
+        /* The local copy is not used past this point; releasing it here
+         * keeps it from leaking on the error return further down. */
+        fca_free_rank_info(my_info);
+        my_info = NULL;
+    }
+    __internal_barrier(fca_module);
+    if (root_pe == comm->my_pe)
+    {
+        for (i = 0; i < comm->proc_count; i++)
+        {
+            if (mca_scoll_fca_component.rcounts[i] > 0)
+            {
+                MCA_SPML_CALL(get((void *)mca_scoll_fca_component.my_info_exchangeable,
+                                  mca_scoll_fca_component.rcounts[i],
+                                  (void*)(((char*)all_info)+disps[i]),
+                                  comm->proc_array[i]->proc_name.vpid));
+            }
+        }
+    }
+
+
+    /* Rank0 calls fca_comm_new() and fills the fca_comm_spec field */
+    if (root_pe == comm->my_pe) {
+        spec.rank_info = all_info;
+        spec.is_comm_world = (comm == oshmem_group_all);
+        spec.rank_count = 0;
+        for (i = 0; i < comm->proc_count; ++i) {
+            FCA_MODULE_VERBOSE(fca_module, 1, "rcounts[%d]=%d disps[%d]=%d",
+                               i, mca_scoll_fca_component.rcounts[i], i, disps[i]);
+            if (mca_scoll_fca_component.rcounts[i] > 0)
+                ++spec.rank_count;
+        }
+
+        FCA_MODULE_VERBOSE(fca_module, 1, "starting fca_comm_new(), rank_count: %d",
+                           spec.rank_count);
+
+        *mca_scoll_fca_component.ret = fca_comm_new(mca_scoll_fca_component.fca_context,
+                                                    &spec, &fca_module->fca_comm_desc);
+
+        free(disps);
+        free(all_info);
+        disps = NULL;
+        all_info = NULL;
+    }
+
+    __internal_barrier(fca_module);
+
+    if (root_pe != comm->my_pe)
+    {
+        MCA_SPML_CALL(get((void *)mca_scoll_fca_component.ret, sizeof(int),
+                          (void *)mca_scoll_fca_component.ret, root_pe));
+    }
+
+    /* Examine comm_new return value */
+    __internal_barrier(fca_module);
+    if (*mca_scoll_fca_component.ret < 0)
+    {
+        FCA_ERROR("rank %i: COMM_NEW failed: %s", fca_module->rank,
+                  fca_strerror(*mca_scoll_fca_component.ret));
+        return OSHMEM_ERROR;
+    }
+
+    /* Publish the descriptor on the root and let everyone else fetch it. */
+    if (root_pe == comm->my_pe)
+    {
+        memcpy(mca_scoll_fca_component.fca_comm_desc_exchangeable,
+               &fca_module->fca_comm_desc,
+               sizeof(fca_module->fca_comm_desc));
+    }
+
+    __internal_barrier(fca_module);
+    if (root_pe != comm->my_pe)
+    {
+        MCA_SPML_CALL(get((void *)mca_scoll_fca_component.fca_comm_desc_exchangeable,
+                          sizeof(fca_module->fca_comm_desc),
+                          (void *)&fca_module->fca_comm_desc,
+                          root_pe));
+    }
+
+    __internal_barrier(fca_module);
+
+    FCA_MODULE_VERBOSE(fca_module, 1, "Received FCA communicator spec, comm_id %d",
+                       fca_module->fca_comm_desc.comm_id);
+    return OSHMEM_SUCCESS;
+
+err_free:
+    free(disps);
+    free(all_info);
+    if (my_info) {
+        fca_free_rank_info(my_info);
+    }
+    return OSHMEM_ERROR;
+}
+
+/*
+ * Build the FCA communicator for the module: create the descriptor, run
+ * COMM_INIT for this member and read the communicator capabilities.
+ * Returns OSHMEM_SUCCESS or an error code.
+ */
+static int __create_fca_comm(mca_scoll_fca_module_t *fca_module)
+{
+    int comm_size;
+    int rc, ret;
+
+    rc = __fca_comm_new(fca_module);
+    if (rc != OSHMEM_SUCCESS)
+        return rc;
+
+    /* allocate comm_init_spec */
+    FCA_MODULE_VERBOSE(fca_module, 1, "Starting COMM_INIT comm_id %d proc_idx %d num_procs %d",
+                       fca_module->fca_comm_desc.comm_id, fca_module->local_proc_idx,
+                       fca_module->num_local_procs);
+
+    comm_size = fca_module->comm->proc_count;
+    ret = mca_scoll_fca_comm_init(mca_scoll_fca_component.fca_context,
+                                  oshmem_proc_group_find_id(fca_module->comm, fca_module->rank), comm_size,
+                                  fca_module->local_proc_idx, fca_module->num_local_procs,
+                                  &fca_module->fca_comm_desc, &fca_module->fca_comm);
+    if (ret < 0) {
+        FCA_ERROR("COMM_INIT failed: %s", fca_strerror(ret));
+        return OSHMEM_ERROR;
+    }
+
+    /* get communicator capabilities */
+    ret = fca_comm_get_caps(fca_module->fca_comm,
+                            &fca_module->fca_comm_caps);
+    if (ret < 0) {
+        FCA_ERROR("GET_COMM_CAPS failed: %s", fca_strerror(ret));
+        return OSHMEM_ERROR;
+    }
+
+    /* by this point every rank in the communicator is set up */
+    FCA_MODULE_VERBOSE(fca_module, 1, "Initialized FCA communicator, comm_id %d",
+                       fca_module->fca_comm_desc.comm_id);
+
+    return OSHMEM_SUCCESS;
+}
+
+/*
+ * Destroy the module's FCA communicator.  The root additionally ends the
+ * communicator on the FCA side while the FCA context is still open.
+ */
+static void __destroy_fca_comm(mca_scoll_fca_module_t *fca_module)
+{
+    int ret;
+    struct oshmem_group_t *comm = fca_module->comm;
+    const int root_pe = oshmem_proc_pe(comm->proc_array[root_id]);
+
+    fca_comm_destroy(fca_module->fca_comm);
+    if (comm->my_pe == root_pe && mca_scoll_fca_component.fca_context) {
+        ret = fca_comm_end(mca_scoll_fca_component.fca_context,
+                           fca_module->fca_comm_desc.comm_id);
+        if (ret < 0) {
+            FCA_ERROR("COMM_END failed: %s", fca_strerror(ret));
+        }
+    }
+
+    FCA_MODULE_VERBOSE(fca_module, 1, "Destroyed FCA communicator, comm_id %d",
+                       fca_module->fca_comm_desc.comm_id);
+}
+
+/*
+ * Remember the handler (and its module) currently installed on `comm` for
+ * one collective and take a reference on that module.  The fields are only
+ * written once the handler is known to be usable, so the destructor never
+ * releases a module that was not retained here.
+ */
+#define FCA_SAVE_PREV_SCOLL_API(__api) do {\
+    if (!comm->g_scoll.scoll_ ## __api || !comm->g_scoll.scoll_ ## __api ## _module) {\
+        FCA_VERBOSE(1, "no underlying " # __api"; disqualifying myself");\
+        return OSHMEM_ERROR;\
+    }\
+    fca_module->previous_ ## __api = comm->g_scoll.scoll_ ## __api;\
+    fca_module->previous_ ## __api ## _module = comm->g_scoll.scoll_ ## __api ## _module;\
+    OBJ_RETAIN(fca_module->previous_ ## __api ## _module);\
+} while(0)
+
+/*
+ * Save the collectives installed before FCA so they can serve as fallbacks.
+ * Returns OSHMEM_ERROR if any of them is missing.
+ */
+static int __save_coll_handlers(mca_scoll_fca_module_t *fca_module)
+{
+    struct oshmem_group_t *comm = fca_module->comm;
+
+    FCA_SAVE_PREV_SCOLL_API(barrier);
+    FCA_SAVE_PREV_SCOLL_API(broadcast);
+    FCA_SAVE_PREV_SCOLL_API(collect);
+    FCA_SAVE_PREV_SCOLL_API(reduce);
+
+    return OSHMEM_SUCCESS;
+}
+
+/*
+ * Initialize module on the communicator.
+ *
+ * Any failure aborts the whole job: other PEs may already have enabled FCA,
+ * and mixing frameworks for the same collective would leave the application
+ * in an inconsistent state.
+ */
+static int mca_scoll_fca_module_enable(mca_scoll_base_module_t *module,
+                                       struct oshmem_group_t *comm)
+{
+    mca_scoll_fca_module_t *fca_module = (mca_scoll_fca_module_t*) module;
+    int rc;
+
+    fca_module->comm = comm;
+    fca_module->rank = comm->my_pe;
+
+    rc = mca_scoll_fca_get_fca_lib(comm);
+    if (rc != OSHMEM_SUCCESS)
+        goto exit_fatal;
+
+    rc = __save_coll_handlers(fca_module);
+    if (rc != OSHMEM_SUCCESS)
+        goto exit_fatal;
+
+    rc = __get_local_ranks(fca_module);
+    if (rc != OSHMEM_SUCCESS)
+        goto exit_fatal;
+
+    rc = __create_fca_comm(fca_module);
+    if (rc != OSHMEM_SUCCESS)
+        goto exit_fatal;
+
+    FCA_MODULE_VERBOSE(fca_module, 1, "FCA Module initialized");
+    return OSHMEM_SUCCESS;
+
+exit_fatal:
+    /* It is possible that other pe(s) successfully enabled fca,
+     * so different frameworks would be used for collective ops.
+     */
+    FCA_ERROR("FCA module enable failed - aborting to prevent inconsistent application state");
+    oshmem_shmem_abort(-1);
+    return OSHMEM_ERROR;
+}
+
+
+/* Reset the module's bookkeeping to the "nothing acquired" state. */
+static void mca_scoll_fca_module_clear(mca_scoll_fca_module_t *fca_module)
+{
+    fca_module->num_local_procs = 0;
+    fca_module->local_ranks = NULL;
+    fca_module->fca_comm = NULL;
+
+    fca_module->previous_barrier = NULL;
+    fca_module->previous_broadcast = NULL;
+    fca_module->previous_collect = NULL;
+    fca_module->previous_reduce = NULL;
+
+    /* Cleared as well so the destructor can tell which modules were retained. */
+    fca_module->previous_barrier_module = NULL;
+    fca_module->previous_broadcast_module = NULL;
+    fca_module->previous_collect_module = NULL;
+    fca_module->previous_reduce_module = NULL;
+}
+
+static void mca_scoll_fca_module_construct(mca_scoll_fca_module_t *fca_module)
+{
+    FCA_VERBOSE(5, "==>");
+    mca_scoll_fca_module_clear(fca_module);
+}
+
+/*
+ * Drop the references taken on the fallback modules, destroy the FCA
+ * communicator if one was created, and free the local rank list.  A module
+ * that was constructed but never (or only partially) enabled is handled:
+ * only the references that were actually taken are released.
+ */
+static void mca_scoll_fca_module_destruct(mca_scoll_fca_module_t *fca_module)
+{
+    FCA_VERBOSE(5, "==>");
+    if (NULL != fca_module->previous_barrier_module)
+        OBJ_RELEASE(fca_module->previous_barrier_module);
+    if (NULL != fca_module->previous_broadcast_module)
+        OBJ_RELEASE(fca_module->previous_broadcast_module);
+    if (NULL != fca_module->previous_collect_module)
+        OBJ_RELEASE(fca_module->previous_collect_module);
+    if (NULL != fca_module->previous_reduce_module)
+        OBJ_RELEASE(fca_module->previous_reduce_module);
+    if (fca_module->fca_comm)
+        __destroy_fca_comm(fca_module);
+    free(fca_module->local_ranks);
+    mca_scoll_fca_module_clear(fca_module);
+}
+
+
+/*
+ * * Invoked when there's a new communicator that has been created.
+ * * Look at the communicator and decide which set of functions and
+ * * priority we want to return.
+ * */
+/*
+ * Returns the FCA module for `comm` with *priority set to the configured
+ * component priority, or NULL with *priority == 0 when FCA does not apply:
+ * disabled by the user, no memheap component, fewer members than the "np"
+ * threshold or than 2, or all members on one node.
+ *
+ * The first call that gets past the enable/memheap checks also allocates the
+ * component-wide exchange buffers from the memheap.  Failure to allocate
+ * them, or to create the module object, aborts the job.
+ */
+mca_scoll_base_module_t *
+mca_scoll_fca_comm_query(struct oshmem_group_t *comm, int *priority)
+{
+    mca_scoll_base_module_t *module;
+    int size = comm->proc_count;
+    int local_peers = 0;
+
+    mca_scoll_fca_module_t *fca_module;
+
+    *priority = 0;
+    module = NULL;
+
+    if (!mca_scoll_fca_component.fca_enable) {
+        FCA_VERBOSE(20,"FCA is disable on user request => exiting");
+        goto exit;
+    }
+
+    if (mca_memheap.memheap_component == NULL)
+    {
+        FCA_VERBOSE(20,"No memheap => exiting");
+        goto exit;
+    }
+
+    if (NULL == mca_scoll_fca_component.ret){
+        MCA_MEMHEAP_CALL(private_alloc(sizeof(int),(void **)&mca_scoll_fca_component.ret));
+        MCA_MEMHEAP_CALL(private_alloc(oshmem_group_all->proc_count*sizeof(*mca_scoll_fca_component.rcounts),
+                                       (void **)&mca_scoll_fca_component.rcounts ));
+        /* NOTE(review): 20 bytes is a fixed guess for the rank_info size;
+         * verify that it covers what fca_get_rank_info() can return. */
+        MCA_MEMHEAP_CALL(private_alloc(/*info_size*/20,&mca_scoll_fca_component.my_info_exchangeable));
+        MCA_MEMHEAP_CALL(private_alloc(sizeof(fca_comm_desc_t), &mca_scoll_fca_component.fca_comm_desc_exchangeable));
+
+        /* These buffers are written through unconditionally once a module is
+         * enabled, so a missing one must not go unnoticed.  Assumes
+         * private_alloc leaves the pointer NULL on failure - TODO confirm. */
+        if (NULL == mca_scoll_fca_component.ret ||
+            NULL == mca_scoll_fca_component.rcounts ||
+            NULL == mca_scoll_fca_component.my_info_exchangeable ||
+            NULL == mca_scoll_fca_component.fca_comm_desc_exchangeable) {
+            FCA_ERROR("Failed to allocate FCA exchange buffers from memheap");
+            goto exit_fatal;
+        }
+    }
+    if (size < mca_scoll_fca_component.fca_np) {
+        FCA_VERBOSE(20,"size(%d) < fca_np(%d)", size, mca_scoll_fca_component.fca_np);
+        goto exit;
+    }
+
+    if (size < 2) {
+        FCA_VERBOSE(20,"size(%d) < 2", size);
+        goto exit;
+    }
+
+
+    if (!have_remote_peers(comm, size, &local_peers) /* || OMPI_COMM_IS_INTER(comm)*/) {
+        FCA_VERBOSE(1,"all peers in group are on the same node, fca disabled\n");
+        goto exit;
+    }
+
+    fca_module = OBJ_NEW(mca_scoll_fca_module_t);
+    if (!fca_module) {
+        goto exit_fatal;
+    }
+
+    /* Each collective is offered only if its enable_* parameter is set. */
+    fca_module->super.scoll_module_enable = mca_scoll_fca_module_enable;
+    fca_module->super.scoll_collect = mca_scoll_fca_component.fca_enable_allgather? mca_scoll_fca_collect : NULL;
+    fca_module->super.scoll_reduce = mca_scoll_fca_component.fca_enable_allreduce? mca_scoll_fca_reduce : NULL;
+    fca_module->super.scoll_barrier = mca_scoll_fca_component.fca_enable_barrier? mca_scoll_fca_barrier : NULL;
+    fca_module->super.scoll_broadcast = mca_scoll_fca_component.fca_enable_bcast? mca_scoll_fca_broadcast : NULL;
+
+    *priority = mca_scoll_fca_component.fca_priority;
+    module = &fca_module->super;
+
+exit:
+    FCA_VERBOSE(4, "Query FCA module for comm %p size %d rank %d local_peers=%d: priority=%d %s",
+                (void *)comm, size, comm->my_pe, local_peers,
+                *priority, module ? "enabled" : "disabled");
+    return module;
+
+exit_fatal:
+    /* It is possible that other pe(s) successfully initialized fca,
+     * so different frameworks would be used for collective ops.
+     */
+    FCA_ERROR("FCA module query failed - aborting");
+    oshmem_shmem_abort(-1);
+    return NULL;
+}
+
+OBJ_CLASS_INSTANCE(mca_scoll_fca_module_t,
+                   mca_scoll_base_module_t,
+                   mca_scoll_fca_module_construct,
+                   mca_scoll_fca_module_destruct);
+
diff --git a/oshmem/mca/scoll/fca/scoll_fca_ops.c b/oshmem/mca/scoll/fca/scoll_fca_ops.c
new file mode 100644
index 0000000000..4a274222dc
--- /dev/null
+++ b/oshmem/mca/scoll/fca/scoll_fca_ops.c
@@ -0,0 +1,240 @@
+/*
+* Copyright (c) 2012 Mellanox Technologies, Inc.
+* All rights reserved.
+* $COPYRIGHT$
+*
+* Additional copyrights may follow
+*
+* $HEADER$
+*/
+#include "oshmem_config.h"
+#include "oshmem/constants.h"
+#include "scoll_fca.h"
+/* NOTE(review): the name of the system header below was lost from the patch
+ * text.  <stdlib.h> is what this file needs for malloc()/free(); confirm
+ * against upstream. */
+#include <stdlib.h>
+#include "oshmem/proc/proc.h"
+#include "oshmem/op/op.h"
+
+/*
+ * Barrier over `group` through FCA.  If FCA answers -EUSESHMEM the barrier
+ * that was installed before FCA is used instead; any other FCA failure
+ * yields OSHMEM_ERROR.  `alg` is not consulted.
+ */
+int mca_scoll_fca_barrier(struct oshmem_group_t *group, long *pSync, int alg)
+{
+    mca_scoll_fca_module_t *fca_module = ( mca_scoll_fca_module_t *)group->g_scoll.scoll_barrier_module;
+    int ret;
+
+    FCA_VERBOSE(5,"Using FCA Barrier");
+    ret = fca_do_barrier(fca_module->fca_comm);
+    if (ret < 0) {
+        if (ret == -EUSESHMEM) {
+            FCA_VERBOSE(5,"FCA Barrier failed, using original barrier");
+            goto orig_barrier;
+        }
+        FCA_ERROR("Barrier failed: %s", fca_strerror(ret));
+        return OSHMEM_ERROR;
+    }
+    return OSHMEM_SUCCESS;
+orig_barrier:
+    return fca_module->previous_barrier(group, pSync, SCOLL_DEFAULT_ALG);
+}
+
+/*
+ * Broadcast `nlong` bytes from PE_root through FCA.  The root sends from
+ * `source`, everyone else receives into `target`.  Payloads larger than the
+ * communicator's max_payload, and an -EUSESHMEM answer from FCA, are handed
+ * to the previously installed broadcast.  `alg` is not consulted.
+ */
+int mca_scoll_fca_broadcast(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync, int alg)
+{
+    mca_scoll_fca_module_t *fca_module = ( mca_scoll_fca_module_t *)group->g_scoll.scoll_broadcast_module;
+    fca_bcast_spec_t spec;
+    int ret;
+
+    FCA_VERBOSE(5,"rank %i, DOING FCA BCAST\n", group->my_pe);
+    spec.root = oshmem_proc_group_find_id(group,PE_root);
+    if (group->my_pe == PE_root)
+        spec.buf = (void *)source;
+    else
+        spec.buf = target;
+    spec.size = nlong;
+    if (spec.size > fca_module->fca_comm_caps.max_payload) {
+        FCA_VERBOSE(5, "Unsupported bcast operation size %d, using fallback",
+                    spec.size);
+        goto orig_bcast;
+    }
+    ret = fca_do_bcast(fca_module->fca_comm, &spec);
+    if (ret < 0) {
+        if (ret == -EUSESHMEM) {
+            FCA_VERBOSE(5,"FCA Broadcast failed, using original Broadcast");
+            goto orig_bcast;
+        }
+        FCA_ERROR("Bcast failed: %s", fca_strerror(ret));
+        return OSHMEM_ERROR;
+    }
+    return OSHMEM_SUCCESS;
+orig_bcast:
+    return fca_module->previous_broadcast(group, PE_root, target, source, nlong, pSync, SCOLL_DEFAULT_ALG);
+}
+
+/*
+ * Collect over `group`.
+ *
+ * nlong_type == true : every member contributes `nlong` bytes (allgather).
+ * nlong_type == false: contributions differ in size; the sizes are exchanged
+ *                      first with a fixed-size collect, then the data is
+ *                      gathered with allgatherv.
+ *
+ * Without OSHMEM_FCA_ALLGATHER, or when FCA answers -EUSESHMEM, the
+ * previously installed collect is used.  `alg` is not consulted.
+ * Returns OSHMEM_SUCCESS or an error code; all temporary buffers are
+ * released on every path.
+ */
+int mca_scoll_fca_collect(struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync, bool nlong_type, int alg)
+{
+    mca_scoll_fca_module_t *fca_module = ( mca_scoll_fca_module_t *)group->g_scoll.scoll_collect_module;
+
+    FCA_VERBOSE(5,"rank %i, DOING FCA_COLLECT, nlong_type = %i\n",group->my_pe,(int)nlong_type);
+#if OSHMEM_FCA_ALLGATHER
+    if (nlong_type == true){
+        fca_gather_spec_t spec = {0,};
+        int ret;
+        spec.size = (int)nlong;
+        spec.sbuf = (void *)source;
+        spec.rbuf = target;
+        ret = fca_do_allgather(fca_module->fca_comm, &spec);
+        if (ret < 0) {
+            if (ret == -EUSESHMEM) {
+                FCA_VERBOSE(5,"FCA Fcollect(allgather) failed, using original Fcollect");
+                goto orig_collect;
+            }
+            FCA_ERROR("Fcollect(allgather) failed: %s", fca_strerror(ret));
+            return OSHMEM_ERROR;
+        }
+        return OSHMEM_SUCCESS;
+    }
+    else
+    {
+        fca_gatherv_spec_t spec;
+        size_t *sendcounts;
+        int i;
+        int rc;      /* status of the size exchange (OSHMEM_* code) */
+        int ret = 0; /* status of fca_do_allgatherv (FCA code) */
+
+        spec.sendsize = (int)nlong;
+        spec.sbuf = (void *)source;
+        spec.rbuf = target;
+
+        /* Heap buffers instead of alloca(): their size grows with the group
+         * and an allocation failure must be detectable. */
+        sendcounts = malloc(group->proc_count * sizeof(*sendcounts));
+        spec.recvsizes = malloc(group->proc_count * sizeof(*spec.recvsizes));
+        spec.displs = malloc(group->proc_count * sizeof(*spec.displs));
+
+        if (!sendcounts || !spec.recvsizes || !spec.displs) {
+            FCA_ERROR("Failed to allocate memory for %d receive counts",
+                      group->proc_count);
+            rc = OSHMEM_ERROR;
+        } else {
+            /* Let every member learn how much each of the others sends. */
+            rc = mca_scoll_fca_collect(group, sendcounts, (void *)&nlong,
+                                       sizeof(size_t), pSync, true, SCOLL_DEFAULT_ALG);
+        }
+
+        if (rc == OSHMEM_SUCCESS) {
+            for (i = 0; i < group->proc_count; i++){
+                spec.recvsizes[i] = (int)sendcounts[i];
+            }
+            /* Displacements are the running sum of the receive sizes. */
+            spec.displs[0] = 0;
+            for (i = 1; i < group->proc_count; i++){
+                spec.displs[i] = spec.displs[i-1]+spec.recvsizes[i-1];
+            }
+            ret = fca_do_allgatherv(fca_module->fca_comm, &spec);
+        }
+
+        free(spec.displs);
+        free(spec.recvsizes);
+        free(sendcounts);
+
+        if (rc != OSHMEM_SUCCESS) {
+            return rc;
+        }
+        if (ret < 0){
+            if (ret == -EUSESHMEM) {
+                FCA_VERBOSE(5,"FCA Collect(allgatherv) failed, using original Collect");
+                goto orig_collect;
+            }
+            FCA_ERROR("Collect(allgatherv) failed: %s", fca_strerror(ret));
+            return OSHMEM_ERROR;
+        }
+        return OSHMEM_SUCCESS;
+    }
+orig_collect:
+#endif
+    return fca_module->previous_collect(group, target, source, nlong, pSync, nlong_type, SCOLL_DEFAULT_ALG);
+}
+
+/* Data type codes handed to FCA reductions, and the "not supported" marker. */
+#define FCA_DTYPE_8_SIGNED 1
+#define FCA_DTYPE_16_SIGNED 2
+#define FCA_DTYPE_32_SIGNED 3
+#define FCA_DTYPE_64_SIGNED 4
+#define FCA_DTYPE_32_FLOAT 9
+#define FCA_DTYPE_64_FLOAT 10
+#define UNSUPPORTED_OP -1
+
+/* True when the operation works on float, double or long double data. */
+static bool if_floating_type(oshmem_op_t *op)
+{
+    return (op->dt == OSHMEM_OP_TYPE_FLOAT) ||
+           (op->dt == OSHMEM_OP_TYPE_DOUBLE) ||
+           (op->dt == OSHMEM_OP_TYPE_LDOUBLE);
+}
+
+/*
+ * Map the operation's element type to an FCA_DTYPE_* code from its size in
+ * bits and whether it is floating point.  Complex types, 8/16-bit floating
+ * types and any other width give UNSUPPORTED_OP.
+ */
+static int shmem_dtype_to_fca_dtype(oshmem_op_t *op)
+{
+    if ((op->dt == OSHMEM_OP_TYPE_FCOMPLEX) ||
+        (op->dt == OSHMEM_OP_TYPE_DCOMPLEX)){
+        return UNSUPPORTED_OP;
+    }
+    switch(op->dt_size*8){
+    case 64:
+        return if_floating_type(op) ? FCA_DTYPE_64_FLOAT : FCA_DTYPE_64_SIGNED;
+    case 32:
+        return if_floating_type(op) ? FCA_DTYPE_32_FLOAT : FCA_DTYPE_32_SIGNED;
+    case 16:
+        if (OPAL_UNLIKELY(if_floating_type(op)))
+            return UNSUPPORTED_OP;
+        return FCA_DTYPE_16_SIGNED;
+    case 8:
+        if (OPAL_UNLIKELY(if_floating_type(op)))
+            return UNSUPPORTED_OP;
+        return FCA_DTYPE_8_SIGNED;
+    default:
+        return UNSUPPORTED_OP;
+    }
+}
+
+/* Map the SHMEM reduction operator to its FCA_OP_* code, or UNSUPPORTED_OP. */
+static int shmem_op_to_fca_op(oshmem_op_t *op)
+{
+    switch(op->op){
+    case OSHMEM_OP_AND:
+        return FCA_OP_BAND;
+    case OSHMEM_OP_OR:
+        return FCA_OP_BOR;
+    case OSHMEM_OP_XOR:
+        return FCA_OP_BXOR;
+    case OSHMEM_OP_MAX:
+        return FCA_OP_MAX;
+    case OSHMEM_OP_MIN:
+        return FCA_OP_MIN;
+    case OSHMEM_OP_SUM:
+        return FCA_OP_SUM;
+    case OSHMEM_OP_PROD:
+        return FCA_OP_PROD;
+    default:
+        return UNSUPPORTED_OP;
+    }
+}
+
+/*
+ * Reduce `nlong` bytes of `source` into `target` over `group` with an FCA
+ * all-reduce.  Element types or operators FCA cannot handle, and an
+ * -EUSESHMEM answer from FCA, are handed to the previously installed reduce.
+ * `alg` is not consulted.
+ */
+int mca_scoll_fca_reduce(struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk, int alg)
+{
+    mca_scoll_fca_module_t *fca_module = ( mca_scoll_fca_module_t *)group->g_scoll.scoll_reduce_module;
+    int fca_dtype;
+    int fca_op;
+    int ret;
+    fca_reduce_spec_t spec;
+
+    FCA_VERBOSE(5,"rank %i, DOING FCA_REDUCE\n",group->my_pe);
+    if ( (fca_dtype = shmem_dtype_to_fca_dtype(op)) < 0){
+        FCA_VERBOSE(5,"SHMEM_DATA_TYPE = %i is unsupported in the current version of FCA library; using original reduce",op->dt);
+        goto orig_reduce;
+    }
+    if ( (fca_op = shmem_op_to_fca_op(op)) < 0){
+        FCA_VERBOSE(5,"SHMEM_OPERATION_TYPE = %i is unsupported; using original reduce",op->op);
+        goto orig_reduce;
+    }
+    spec.sbuf = (void *)source;
+    spec.rbuf = target;
+    spec.dtype = (enum fca_reduce_dtype_t)fca_dtype;
+    spec.op = (enum fca_reduce_op_t)fca_op;
+    /* FCA wants an element count, nlong is a byte count. */
+    spec.length = (int)(nlong/op->dt_size);
+    ret = fca_do_all_reduce(fca_module->fca_comm, &spec);
+    if (ret < 0) {
+        if (ret == -EUSESHMEM) {
+            FCA_VERBOSE(5,"FCA Reduce(allreduce) failed, using original Reduce");
+            goto orig_reduce;
+        }
+        FCA_ERROR("Reduce (allreduce) failed: %s", fca_strerror(ret));
+        return OSHMEM_ERROR;
+    }
+    return OSHMEM_SUCCESS;
+orig_reduce:
+    return fca_module->previous_reduce(group, op, target, source, nlong, pSync, pWrk, SCOLL_DEFAULT_ALG);
+}
diff --git a/oshmem/mca/scoll/scoll.h b/oshmem/mca/scoll/scoll.h
new file mode 100644
index 0000000000..b7044211f4
--- /dev/null
+++ b/oshmem/mca/scoll/scoll.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Collective Communication Interface + * + */ + +#ifndef OSHMEM_MCA_SCOLL_H +#define OSHMEM_MCA_SCOLL_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "opal/util/output.h" +#include "mpi.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" + + +BEGIN_C_DECLS + + +/* ******************************************************************** */ + +struct oshmem_group_t; +struct oshmem_op_t; + + +/* ******************************************************************** */ + + +typedef int (*mca_scoll_base_component_init_fn_t) + (bool enable_progress_threads, bool enable_threads); + +typedef struct mca_scoll_base_module_1_0_0_t* (*mca_scoll_base_component_query_fn_t) + (struct oshmem_group_t *group, int *priority); + + +/* ******************************************************************** */ + + +/** + * Collective component interface + * + * Component interface for the collective framework. A public + * instance of this structure, called + * mca_scoll_[component_name]_component, must exist in any collective + * component. + */ +struct mca_scoll_base_component_1_0_0_t { + /** Base component description */ + mca_base_component_t scoll_version; + /** Base component data block */ + mca_base_component_data_t scoll_data; + + /** Component initialization function */ + mca_scoll_base_component_init_fn_t scoll_init; + mca_scoll_base_component_query_fn_t scoll_query; +}; +typedef struct mca_scoll_base_component_1_0_0_t mca_scoll_base_component_1_0_0_t; + +/** Per guidance in mca.h, use the unversioned struct name if you just + want to always keep up with the most recent version of the + interface. */ +typedef struct mca_scoll_base_component_1_0_0_t mca_scoll_base_component_t; + + +/** + * Collective module interface + * + * Module interface to the Collective framework. 
Modules are + * reference counted based on the number of functions from the module + * used on the communicator. There is at most one module per component + * on a given communicator, and there can be many component modules on + * a given communicator. + * + * @note The collective framework and the + * communicator functionality only stores a pointer to the module + * function, so the component is free to create a structure that + * inherits from this one for use as the module structure. + */ +typedef int +(*mca_scoll_base_module_enable_1_0_0_fn_t)(struct mca_scoll_base_module_1_0_0_t* module, + struct oshmem_group_t *comm); +typedef int (*mca_scoll_base_module_ft_event_fn_t) (int state); + +#define SCOLL_DEFAULT_ALG (-1) + +#define SCOLL_ALG_BARRIER_CENTRAL_COUNTER 0 +#define SCOLL_ALG_BARRIER_TOURNAMENT 1 +#define SCOLL_ALG_BARRIER_RECURSIVE_DOUBLING 2 +#define SCOLL_ALG_BARRIER_DISSEMINATION 3 +#define SCOLL_ALG_BARRIER_BASIC 4 +#define SCOLL_ALG_BARRIER_ADAPTIVE 5 + +#define SCOLL_ALG_BROADCAST_CENTRAL_COUNTER 0 +#define SCOLL_ALG_BROADCAST_BINOMIAL 1 + +#define SCOLL_ALG_COLLECT_CENTRAL_COUNTER 0 +#define SCOLL_ALG_COLLECT_TOURNAMENT 1 +#define SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING 2 +#define SCOLL_ALG_COLLECT_RING 3 + +#define SCOLL_ALG_REDUCE_CENTRAL_COUNTER 0 +#define SCOLL_ALG_REDUCE_TOURNAMENT 1 +#define SCOLL_ALG_REDUCE_RECURSIVE_DOUBLING 2 +#define SCOLL_ALG_REDUCE_LEGACY_LINEAR 3 /* Based linear algorithm from OMPI coll:basic */ +#define SCOLL_ALG_REDUCE_LEGACY_LOG 4 /* Based log algorithm from OMPI coll:basic */ + +typedef int (*mca_scoll_base_module_barrier_fn_t) + (struct oshmem_group_t *group, long *pSync, int alg); +typedef int (*mca_scoll_base_module_broadcast_fn_t) + (struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync, int alg); +typedef int (*mca_scoll_base_module_collect_fn_t) + (struct oshmem_group_t *group, void *target, const void *source, size_t nlong, long *pSync, bool nlong_type, int 
alg); +typedef int (*mca_scoll_base_module_reduce_fn_t) + (struct oshmem_group_t *group, struct oshmem_op_t *op, void *target, const void *source, size_t nlong, long *pSync, void *pWrk, int alg); + +struct mca_scoll_base_module_1_0_0_t { + /** Collective modules all inherit from opal_object */ + opal_object_t super; + + /* Collective function pointers */ + mca_scoll_base_module_barrier_fn_t scoll_barrier; + mca_scoll_base_module_broadcast_fn_t scoll_broadcast; + mca_scoll_base_module_collect_fn_t scoll_collect; + mca_scoll_base_module_reduce_fn_t scoll_reduce; + mca_scoll_base_module_enable_1_0_0_fn_t scoll_module_enable; +}; +typedef struct mca_scoll_base_module_1_0_0_t mca_scoll_base_module_1_0_0_t; + +/** Per guidance in mca.h, use the unversioned struct name if you just + want to always keep up with the most recent version of the + interface. */ +typedef struct mca_scoll_base_module_1_0_0_t mca_scoll_base_module_t; +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_scoll_base_module_t); + + +/* ******************************************************************** */ + + +/* + * Macro for use in components that are of type coll + */ +#define MCA_SCOLL_BASE_VERSION_2_0_0 \ + MCA_BASE_VERSION_2_0_0, \ + "scoll", 1, 0, 0 + + +/* ******************************************************************** */ +/* + * Collectives group cache structure + * + * Collectives group cache structure, used to find functions to + * implement collective algorithms and their associated modules. 
+ */ +struct mca_scoll_base_group_scoll_t { + mca_scoll_base_module_barrier_fn_t scoll_barrier; + mca_scoll_base_module_1_0_0_t *scoll_barrier_module; + mca_scoll_base_module_broadcast_fn_t scoll_broadcast; + mca_scoll_base_module_1_0_0_t *scoll_broadcast_module; + mca_scoll_base_module_collect_fn_t scoll_collect; + mca_scoll_base_module_1_0_0_t *scoll_collect_module; + mca_scoll_base_module_reduce_fn_t scoll_reduce; + mca_scoll_base_module_1_0_0_t *scoll_reduce_module; +}; +typedef struct mca_scoll_base_group_scoll_t mca_scoll_base_group_scoll_t; +END_C_DECLS + +#endif /* OSHMEM_MCA_SCOLL_H */ diff --git a/oshmem/mca/spml/Makefile.am b/oshmem/mca/spml/Makefile.am new file mode 100644 index 0000000000..ae8484320a --- /dev/null +++ b/oshmem/mca/spml/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_spml.la +libmca_spml_la_SOURCES = + +# header setup +nobase_oshmem_HEADERS = +nobase_nodist_oshmem_HEADERS = + +# local files +headers = spml.h +libmca_spml_la_SOURCES += $(headers) $(nodist_headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +nobase_oshmem_HEADERS += $(headers) +nobase_nodist_oshmem_HEADERS += $(nodist_headers) +oshmemdir = $(includedir)/oshmem/oshmem/mca/spml +else +oshmemdir = $(includedir) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/oshmem/mca/spml/base/Makefile.am b/oshmem/mca/spml/base/Makefile.am new file mode 100644 index 0000000000..ef8c2740d5 --- /dev/null +++ b/oshmem/mca/spml/base/Makefile.am @@ -0,0 +1,29 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CFLAGS = $(OSHMEM_CFLAGS) $(btl_sm_CPPFLAGS) + +headers += \ + base/base.h \ + base/spml_base_request.h \ + base/spml_base_request_dbg.h \ + base/spml_base_getreq.h \ + base/spml_base_atomicreq.h \ + base/spml_base_putreq.h + +libmca_spml_la_SOURCES += \ + base/spml_base_close.c \ + base/spml_base_open.c \ + base/spml_base_select.c \ + base/spml_base_request.c \ + base/spml_base_atomicreq.c \ + base/spml_base_getreq.c \ + base/spml_base_putreq.c \ + base/spml_base.c diff --git a/oshmem/mca/spml/base/base.h b/oshmem/mca/spml/base/base.h new file mode 100644 index 0000000000..610c5ddf79 --- /dev/null +++ b/oshmem/mca/spml/base/base.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_SPML_BASE_H +#define MCA_SPML_BASE_H + +#include "oshmem_config.h" +#include "opal/mca/mca.h" +#include "oshmem/mca/spml/spml.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_pointer_array.h" + +/* + * Global functions for the PML + */ + +BEGIN_C_DECLS + +/* + * This is the base priority for a SPML wrapper component + * If there exists more than one then it is undefined + * which one is picked. 
+ */ +#define SPML_SELECT_WRAPPER_PRIORITY -128 + +OSHMEM_DECLSPEC int mca_spml_base_open(void); +OSHMEM_DECLSPEC int mca_spml_base_close(void); +OSHMEM_DECLSPEC int mca_spml_base_finalize(void); +OSHMEM_DECLSPEC int mca_spml_base_select(bool, bool); + +/* share in modex the name of the selected component */ +OSHMEM_DECLSPEC int mca_spml_base_spml_selected(const char *name); + +/* TODO: Re-write for spml */ +/* verify that all new procs are using the currently selected component */ +OSHMEM_DECLSPEC int mca_spml_base_spml_check_selected(const char *my_spml, + oshmem_proc_t **procs, + size_t nprocs); + + +OSHMEM_DECLSPEC int mca_spml_base_wait(void* addr, int cmp, void* value, int datatype); +OSHMEM_DECLSPEC int mca_spml_base_wait_nb(void* handle); +OSHMEM_DECLSPEC int mca_spml_base_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys); +/* + * Globals + */ +OSHMEM_DECLSPEC extern int mca_spml_base_output; +OSHMEM_DECLSPEC extern opal_list_t mca_spml_base_components_available; +OSHMEM_DECLSPEC extern mca_spml_base_component_t mca_spml_base_selected_component; +OSHMEM_DECLSPEC extern mca_spml_base_module_t mca_spml; +OSHMEM_DECLSPEC extern opal_pointer_array_t mca_spml_base_spml; +/*----------------------------------------------------------------------------------*/ +/*logger macros*/ + +#ifdef __BASE_FILE__ +#define __SPML_FILE__ __BASE_FILE__ +#else +#define __SPML_FILE__ __FILE__ +#endif + +#define SPML_VERBOSE(level, format, ...) \ + opal_output_verbose(level, mca_spml_base_output, "%s:%d - %s() " format, \ + __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + +#define SPML_ERROR(format, ... 
) \ + opal_output_verbose(0, mca_spml_base_output, "Error: %s:%d - %s() " format, \ + __SPML_FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) + + +END_C_DECLS + +#endif /* MCA_SPML_BASE_H */ diff --git a/oshmem/mca/spml/base/spml_base.c b/oshmem/mca/spml/base/spml_base.c new file mode 100644 index 0000000000..9083f76962 --- /dev/null +++ b/oshmem/mca/spml/base/spml_base.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include "ompi/mca/bml/base/base.h" +#include "opal/datatype/opal_convertor.h" +#include "orte/include/orte/types.h" +#include "orte/runtime/orte_globals.h" +#include "oshmem/mca/spml/yoda/spml_yoda.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/mca/spml/yoda/spml_yoda_putreq.h" +#include "oshmem/mca/spml/yoda/spml_yoda_getreq.h" +#include "ompi/mca/btl/btl.h" + + +#define SPML_BASE_DO_CMP(res, addr, op, val) \ + switch((op)) { \ + case SHMEM_CMP_EQ: \ + res = *(addr) == (val) ? 1 : 0; \ + break; \ + case SHMEM_CMP_NE: \ + res = *(addr) != (val) ? 1 : 0; \ + break; \ + case SHMEM_CMP_GT: \ + res = *(addr) > (val) ? 1 : 0; \ + break; \ + case SHMEM_CMP_LE: \ + res = *(addr) <= (val) ? 1 : 0; \ + break; \ + case SHMEM_CMP_LT: \ + res = *(addr) < (val) ? 1: 0; \ + break; \ + case SHMEM_CMP_GE: \ + res = *(addr) >= (val) ? 1 : 0; \ + break; \ + } + +#define SPML_BASE_DO_WAIT(cond, val, addr, op) \ + do { \ + SPML_BASE_DO_CMP(cond, val,addr,op); \ + opal_progress(); \ + } while (cond == 0) ; + + +/** + * Wait for data delivery. + * Pool on a variable given in addr until it is not equal to value. 
+ *
+ * More precisely: the calling PE spins until the comparison
+ * (*addr <cmp> *value) evaluates to true.  The comparison value is read
+ * from @p value once, before polling starts.  opal_progress() is invoked
+ * on every iteration (including the first one, even when the condition
+ * already holds) so that communication can advance while waiting.
+ *
+ * @param addr     (IN) Address of the variable to poll.
+ * @param cmp      (IN) Comparison operator: SHMEM_CMP_EQ, SHMEM_CMP_NE,
+ *                      SHMEM_CMP_GT, SHMEM_CMP_LE, SHMEM_CMP_LT or
+ *                      SHMEM_CMP_GE.
+ * @param value    (IN) Pointer to the value @p addr is compared against.
+ * @param datatype (IN) Type of the polled variable: SHMEM_INT, SHMEM_SHORT,
+ *                      SHMEM_LONG, SHMEM_LLONG, SHMEM_FINT, SHMEM_FINT4 or
+ *                      SHMEM_FINT8.
+ *
+ * @return OSHMEM_SUCCESS once the condition holds; OSHMEM_ERROR if a pointer
+ *         is NULL or the operator / datatype is not supported.
+ */
+int mca_spml_base_wait(void* addr, int cmp, void* value, int datatype)
+{
+    int *int_addr, int_value;
+    long *long_addr, long_value;
+    short *short_addr, short_value;
+    long long *longlong_addr, longlong_value;
+    ompi_fortran_integer_t *fint_addr, fint_value;
+    ompi_fortran_integer4_t *fint4_addr, fint4_value;
+    ompi_fortran_integer8_t *fint8_addr, fint8_value;
+    int res = 0;
+
+    /* Both pointers are dereferenced below; fail cleanly instead of crashing */
+    if (NULL == addr || NULL == value) {
+        SPML_ERROR("NULL address or value pointer");
+        return OSHMEM_ERROR;
+    }
+
+    /* SPML_BASE_DO_CMP has no default branch: with an unknown operator the
+     * result flag would never be set and the wait loop would spin forever,
+     * so reject such operators before polling.
+     */
+    switch (cmp) {
+    case SHMEM_CMP_EQ:
+    case SHMEM_CMP_NE:
+    case SHMEM_CMP_GT:
+    case SHMEM_CMP_LE:
+    case SHMEM_CMP_LT:
+    case SHMEM_CMP_GE:
+        break;
+    default:
+        SPML_ERROR("unsupported comparison operator %d", cmp);
+        return OSHMEM_ERROR;
+    }
+
+    switch (datatype) {
+
+    /* Int */
+    case SHMEM_INT:
+        int_value = *(int*)value;
+        int_addr = (int*)addr;
+        SPML_BASE_DO_WAIT(res, int_addr, cmp, int_value);
+        break;
+
+    /* Short */
+    case SHMEM_SHORT:
+        short_value = *(short*)value;
+        short_addr = (short*)addr;
+        SPML_BASE_DO_WAIT(res, short_addr, cmp, short_value);
+        break;
+
+    /* Long */
+    case SHMEM_LONG:
+        long_value = *(long*)value;
+        long_addr = (long*)addr;
+        SPML_BASE_DO_WAIT(res, long_addr, cmp, long_value);
+        break;
+
+    /* Long-Long */
+    case SHMEM_LLONG:
+        longlong_value = *(long long*)value;
+        longlong_addr = (long long*)addr;
+        SPML_BASE_DO_WAIT(res, longlong_addr, cmp, longlong_value);
+        break;
+
+    /* C equivalent of Fortran integer type */
+    case SHMEM_FINT:
+        fint_value = *(ompi_fortran_integer_t *)value;
+        fint_addr = (ompi_fortran_integer_t *)addr;
+        SPML_BASE_DO_WAIT(res, fint_addr, cmp, fint_value);
+        break;
+
+    /* C equivalent of Fortran int4 type */
+    case SHMEM_FINT4:
+        fint4_value = *(ompi_fortran_integer4_t *)value;
+        fint4_addr = (ompi_fortran_integer4_t *)addr;
+        SPML_BASE_DO_WAIT(res, fint4_addr, cmp, fint4_value);
+        break;
+
+    /* C equivalent of Fortran int8 type */
+    case SHMEM_FINT8:
+        fint8_value = *(ompi_fortran_integer8_t *)value;
+        fint8_addr = (ompi_fortran_integer8_t *)addr;
+        SPML_BASE_DO_WAIT(res, fint8_addr, cmp, fint8_value);
+        break;
+
+    /* Previously an unknown datatype returned success without waiting */
+    default:
+        SPML_ERROR("unsupported datatype %d", datatype);
+        return OSHMEM_ERROR;
+    }
+
+    return OSHMEM_SUCCESS;
+}
+
+
+/**
+ * Waits for completion of a non-blocking put or get issued by the calling PE.
+ * This function waits for completion of a single non-blocking transfer issued by + * shmem_put_nb() or shmem_get_nb() (or related functions) when called with the + * address of a completion handle. + * Completion of the call to shmem_wait_nb() ensures that a non-blocking transfer has + * completed. The source buffer may then be reused. + */ +int mca_spml_base_wait_nb(void* handle) +{ + /* TODO fence is a gag for more accurate code + * Use shmem_quiet() (or a function calling shmem_quiet()) or + * shmem_wait_nb() to force completion of transfers for non-blocking operations. + */ + MCA_SPML_CALL(fence()); + + return OSHMEM_SUCCESS; +} + + +int mca_spml_base_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys) +{ + return OSHMEM_ERROR; +} + diff --git a/oshmem/mca/spml/base/spml_base_atomicreq.c b/oshmem/mca/spml/base/spml_base_atomicreq.c new file mode 100644 index 0000000000..d023947d90 --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_atomicreq.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/spml_base_atomicreq.h" + + +static void mca_spml_base_atomic_request_construct(mca_spml_base_atomic_request_t*); +static void mca_spml_base_atomic_request_destruct(mca_spml_base_atomic_request_t*); + + +OBJ_CLASS_INSTANCE( + mca_spml_base_atomic_request_t, + mca_spml_base_request_t, + mca_spml_base_atomic_request_construct, + mca_spml_base_atomic_request_destruct +); + + +static void mca_spml_base_atomic_request_construct(mca_spml_base_atomic_request_t* request) +{ + /* no need to reinit for every atomic -- never changes */ + request->req_base.req_type = MCA_SPML_REQUEST_ATOMIC_CAS; + OBJ_CONSTRUCT(&request->req_base.req_convertor, opal_convertor_t); +} + + +static void mca_spml_base_atomic_request_destruct(mca_spml_base_atomic_request_t* request) +{ + /* For each request the convertor get cleaned after each message + * (in the base _FINI macro). Therefore, as the convertor is a static object + * we don't have to call OBJ_DESTRUCT here. + */ +} + diff --git a/oshmem/mca/spml/base/spml_base_atomicreq.h b/oshmem/mca/spml/base/spml_base_atomicreq.h new file mode 100644 index 0000000000..86595e7905 --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_atomicreq.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_SPML_BASE_ATOMIC_REQUEST_H +#define MCA_SPML_BASE_ATOMIC_REQUEST_H + +#include "oshmem_config.h" +#include "oshmem/mca/spml/base/spml_base_request.h" +#include "ompi/peruse/peruse-internal.h" + +BEGIN_C_DECLS + +/** + * Base type for atomic requests. 
+ */ +struct mca_spml_base_atomic_request_t { + mca_spml_base_request_t req_base; /**< base request */ + size_t req_bytes_packed; /**< size of virtual heap memory variable operated on */ +}; +typedef struct mca_spml_base_atomic_request_t mca_spml_base_atomic_request_t; + +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_spml_base_atomic_request_t); + +/** + * Initialize an atomic request with call parameters. + * + * @param request (IN) Atomic request. + * @param addr (IN) User buffer. + * @param count (IN) Number of bytes. + * @param src (IN) Source rank w/in the communicator. + * @param comm (IN) Communicator. + * @param persistent (IN) Is this a persistent request. + */ + +#define MCA_SPML_BASE_ATOMIC_REQUEST_INIT( \ + request, \ + addr, \ + count, \ + src, \ + comm, \ + persistent) \ +{ \ + /* increment reference count on communicator */ \ + OBJ_RETAIN(comm); \ + \ + OSHMEM_REQUEST_INIT(&(request)->req_base.req_oshmem, persistent); \ + (request)->req_base.req_oshmem.req_shmem_object.comm = comm; \ + (request)->req_bytes_packed = 0; \ + (request)->req_base.req_addr = addr; \ + (request)->req_base.req_count = count; \ + (request)->req_base.req_peer = src; \ + (request)->req_base.req_comm = comm; \ + (request)->req_base.req_proc = NULL; \ + (request)->req_base.req_sequence = 0; \ + /* What about req_type ? */ \ + (request)->req_base.req_spml_complete = OPAL_INT_TO_BOOL(persistent); \ + (request)->req_base.req_free_called = false; \ +} +/** + * + * + */ +#define MCA_SPML_BASE_ATOMIC_START( request ) \ + do { \ + (request)->req_spml_complete = false; \ + \ + (request)->req_oshmem.req_status.SHMEM_SOURCE = SHMEM_ANY_SOURCE; \ + (request)->req_oshmem.req_status.SHMEM_ERROR = OSHMEM_SUCCESS; \ + (request)->req_oshmem.req_status._count = 0; \ + (request)->req_oshmem.req_status._cancelled = 0; \ + \ + (request)->req_oshmem.req_complete = false; \ + (request)->req_oshmem.req_state = OSHMEM_REQUEST_ACTIVE; \ + } while (0) + +/** + * Return a atomic request. 
Handle the release of the communicator and the + * attached datatype. + * + * @param request (IN) Get request. + */ +#define MCA_SPML_BASE_ATOMIC_REQUEST_FINI( request ) \ + do { \ + OSHMEM_REQUEST_FINI(&(request)->req_base.req_oshmem); \ + OBJ_RELEASE( (request)->req_base.req_comm); \ + opal_convertor_cleanup( &((request)->req_base.req_convertor) ); \ + } while (0) + +END_C_DECLS + +#endif + diff --git a/oshmem/mca/spml/base/spml_base_close.c b/oshmem/mca/spml/base/spml_base_close.c new file mode 100644 index 0000000000..c3d068cefc --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_close.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include + +#include "oshmem/constants.h" +#include "opal/mca/mca.h" /* TODO: remove redefined in spml.h*/ +#include "opal/mca/base/base.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/mca/spml/base/spml_base_request.h" +#include "opal/runtime/opal_progress.h" + + +int mca_spml_base_finalize(void) +{ + if (NULL != mca_spml_base_selected_component.spmlm_finalize) { + return mca_spml_base_selected_component.spmlm_finalize(); + } + return OSHMEM_SUCCESS; +} + + +int mca_spml_base_close(void) +{ + int i; + /* turn off the progress code for the spml */ + /*TODO: Irit Restore */ + /*if( NULL != mca_spml.spml_progress ) { + opal_progress_unregister(mca_spml.spml_progress); + } */ + + /*TODO: Remove*/ + /* Blatently ignore the return code (what would we do to recover, + * anyway? This module is going away, so errors don't matter anymore) + */ + + /** + * Destruct the send and receive queues. The ompi_free_list_t destructor + * will return the memory to the mpool, so this has to be done before the + * mpool get released by the SPML close function. 
+ */ + OBJ_DESTRUCT(&mca_spml_base_put_requests); + OBJ_DESTRUCT(&mca_spml_base_get_requests); + + /* TODO: Restore mca_spml.spml_progress = mca_spml_base_progress;*/ + + for( i = 0; i < opal_pointer_array_get_size(&mca_spml_base_spml); i++) { + char * tmp_val; + tmp_val = (char *)opal_pointer_array_get_item(&mca_spml_base_spml, i); + if( NULL == tmp_val) { + continue; + } + free(tmp_val); + } + OBJ_DESTRUCT(&mca_spml_base_spml); + + mca_base_components_close(mca_spml_base_output, + &mca_spml_base_components_available, NULL); + + /* All done */ + + return OSHMEM_SUCCESS; +} + diff --git a/oshmem/mca/spml/base/spml_base_getreq.c b/oshmem/mca/spml/base/spml_base_getreq.c new file mode 100644 index 0000000000..55c8227896 --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_getreq.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/spml_base_getreq.h" + + +static void mca_spml_base_get_request_construct(mca_spml_base_get_request_t*); +static void mca_spml_base_get_request_destruct(mca_spml_base_get_request_t*); + + +OBJ_CLASS_INSTANCE( + mca_spml_base_get_request_t, + mca_spml_base_request_t, + mca_spml_base_get_request_construct, + mca_spml_base_get_request_destruct +); + + +static void mca_spml_base_get_request_construct(mca_spml_base_get_request_t* request) +{ + /* no need to reinit for every get -- never changes */ + request->req_base.req_type = MCA_SPML_REQUEST_GET; + OBJ_CONSTRUCT(&request->req_base.req_convertor, opal_convertor_t); +} + + +static void mca_spml_base_get_request_destruct(mca_spml_base_get_request_t* request) +{ + /* For each request the convertor get cleaned after each message + * (in the base _FINI macro). 
Therefore, as the convertor is a static object + * we don't have to call OBJ_DESTRUCT here. + */ +} + diff --git a/oshmem/mca/spml/base/spml_base_getreq.h b/oshmem/mca/spml/base/spml_base_getreq.h new file mode 100644 index 0000000000..f182365d0a --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_getreq.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_SPML_BASE_GET_REQUEST_H +#define MCA_SPML_BASE_GET_REQUEST_H + +#include "oshmem_config.h" +#include "oshmem/mca/spml/base/spml_base_request.h" +#include "ompi/peruse/peruse-internal.h" + +BEGIN_C_DECLS + +/** + * Base type for get requests. + */ +struct mca_spml_base_get_request_t { + mca_spml_base_request_t req_base; /**< base request */ + void *req_addr; /**< pointer to recv buffer on the local PE - not necessarily an application buffer */ + size_t req_bytes_packed; /**< size of message being read */ +}; +typedef struct mca_spml_base_get_request_t mca_spml_base_get_request_t; +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_spml_base_get_request_t); + + + +/** + * Initialize a get request. + * + * @param request (IN) Pointer to the Get request. + * @param addr (IN) User buffer. + * @param count (IN) Number of bytes. + * @param peer (IN) rank w/in the communicator where the data is read from. + * @param mode (IN) Get Mode. + * @param persistent (IN) Is this a persistent request. 
+ * @param convertor_flags(IN) + */ +#define MCA_SPML_BASE_GET_REQUEST_INIT( request, \ + addr, \ + count, \ + peer, \ + persistent) \ + { \ + OSHMEM_REQUEST_INIT(&(request)->req_base.req_oshmem, persistent); \ + (request)->req_addr = addr; \ + (request)->req_base.req_addr = addr; \ + (request)->req_base.req_count = count; \ + (request)->req_base.req_peer = (int32_t)peer; \ + (request)->req_base.req_spml_complete = OPAL_INT_TO_BOOL(persistent); \ + (request)->req_base.req_free_called = false; \ + (request)->req_base.req_oshmem.req_status._cancelled = 0; \ + (request)->req_bytes_packed = 0; \ +} + + + +/** + * + * + */ +#define MCA_SPML_BASE_GET_START( request ) \ + do { \ + (request)->req_spml_complete = false; \ + \ + (request)->req_oshmem.req_status.SHMEM_SOURCE = SHMEM_ANY_SOURCE; \ + (request)->req_oshmem.req_status.SHMEM_ERROR = OSHMEM_SUCCESS; \ + (request)->req_oshmem.req_status._count = 0; \ + (request)->req_oshmem.req_status._cancelled = 0; \ + \ + (request)->req_oshmem.req_complete = false; \ + (request)->req_oshmem.req_state = OSHMEM_REQUEST_ACTIVE; \ + } while (0) + +/** + * Return a Get request. Handle the release of the communicator and the + * attached datatype. + * + * @param request (IN) Get request. + */ +#define MCA_SPML_BASE_GET_REQUEST_FINI( request ) \ + do { \ + OSHMEM_REQUEST_FINI(&(request)->req_base.req_oshmem); \ + opal_convertor_cleanup( &((request)->req_base.req_convertor) ); \ + } while (0) + +END_C_DECLS + +#endif + diff --git a/oshmem/mca/spml/base/spml_base_open.c b/oshmem/mca/spml/base/spml_base_open.c new file mode 100644 index 0000000000..30d5d0bb5f --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_open.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "oshmem_config.h" +#include + +#ifdef HAVE_STRING_H +#include +#endif +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNIST_H */ +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + + +#include "oshmem/constants.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/mca/spml/base/spml_base_request.h" + +/* + * The following file was created by configure. It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. + */ + +#include "oshmem/mca/spml/base/static-components.h" +/* irit +int mca_spml_base_progress(void) +{ + return OSHMEM_SUCCESS; +} +*/ +#define xstringify(spml) #spml +#define stringify(spml) xstringify(spml) + +/* + * Global variables + */ +int mca_spml_base_output = 0; +int mca_spml_base_verbose = -1; +mca_spml_base_module_t mca_spml; + + +opal_list_t mca_spml_base_components_available; +mca_spml_base_component_t mca_spml_base_selected_component; +opal_pointer_array_t mca_spml_base_spml; + +/** + * Function for finding and opening either all MCA components, or the one + * that was specifically requested via a MCA parameter. + */ +int mca_spml_base_open(void) +{ + + int value = -1; + +/* TODO: Consider restoring FT (fault tolerance) */ +/* Irit removed temporarily. +#if OPAL_ENABLE_FT == 1 + char* wrapper_spml = NULL; +#endif +Irit*/ + + /* + * Register some MCA parameters + */ + /* Debugging/Verbose output */ + + + mca_base_param_reg_int_name("spml", + "base_verbose", + "Verbosity level of the SPML framework", + false, false, + 0, &value); + + mca_spml_base_output = opal_output_open(NULL); + opal_output_set_verbosity(mca_spml_base_output, value); + + /** + * Construct the send and receive request queues. There are 2 reasons to do it + * here. 
First, as they are globals it's better to construct them in one common + * place. Second, in order to be able to allow the external debuggers to show + * their content, they should get constructed as soon as possible once the MPI + * process is started. + */ + OBJ_CONSTRUCT(&mca_spml_base_put_requests, ompi_free_list_t); + OBJ_CONSTRUCT(&mca_spml_base_get_requests, ompi_free_list_t); + OBJ_CONSTRUCT(&mca_spml_base_spml, opal_pointer_array_t); + + /* Open up all available components */ + + if (OSHMEM_SUCCESS != + mca_base_components_open("spml", mca_spml_base_output, mca_spml_base_static_components, + &mca_spml_base_components_available, + !MCA_oshmem_spml_DIRECT_CALL)) { /*TODO: Irit change to MCA_oshmem_spml_DIRECT_CALL*/ + SPML_ERROR("Spml failed to open base component\n"); + return OSHMEM_ERROR; + } + + /* Set a sentinel in case we don't select any components (e.g., + * ompi_info) */ + + mca_spml_base_selected_component.spmlm_finalize = NULL; + + + /** + * Right now our selection of BTLs is completely broken. If we have + * multiple SPMLs that use BTLs than we will open all BTLs several times, leading to + * undefined behaviors. The simplest solution, at least until we + * figure out the correct way to do it, is to force a default SPML that + * uses BTLs and any other SPMLs that do not in the mca_spml_base_spml array. 
+ */ + +/*TODO: Irit change to MCA_oshmem_spml_DIRECT_CALL*/ +#if MCA_oshmem_spml_DIRECT_CALL + opal_pointer_array_add(&mca_spml_base_spml, + stringify(MCA_oshmem_spml_DIRECT_CALL_COMPONENT)); +#else + { + /* Specify a SPML as a parameter */ + char* default_spml = NULL; + + mca_base_param_reg_string_name("spml", NULL, + "Specify a specific SPML to use", + false, false, "", &default_spml); + + if( (0 == strlen(default_spml)) || (default_spml[0] == '^') ) { +#ifdef OSHMEM_HAS_IKRIT + opal_pointer_array_add(&mca_spml_base_spml, strdup("ikrit")); +#else + opal_pointer_array_add(&mca_spml_base_spml, strdup("yoda")); +#endif + } else { + opal_pointer_array_add(&mca_spml_base_spml, default_spml); + } + } +/*TODO: Consider restoring FT */ +/*#if OPAL_ENABLE_FT == 1*/ + /* + *Which SPML Wrapper component to use, if any + * - NULL or "" = No wrapper + * - ow. select that specific wrapper component + */ +/* Irit removed temporarily + mca_base_param_reg_string_name("spml", "wrapper", + "Use a Wrapper component around the selected SPML component", + false, false, + NULL, &wrapper_spml); + if( NULL != wrapper_spml ) { + opal_pointer_array_add(&mca_spml_base_spml, wrapper_spml); + } +#endif +Irit */ +#endif + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/spml/base/spml_base_putreq.c b/oshmem/mca/spml/base/spml_base_putreq.c new file mode 100644 index 0000000000..0f0f175016 --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_putreq.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "oshmem_config.h"
+#include <string.h>
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/spml_base_putreq.h"
+
+static void mca_spml_base_put_request_construct(mca_spml_base_put_request_t* req);
+static void mca_spml_base_put_request_destruct(mca_spml_base_put_request_t* req);
+
+
+/* Class descriptor for put requests: derives from mca_spml_base_request_t
+ * and uses the constructor / destructor defined below.
+ */
+OBJ_CLASS_INSTANCE(
+    mca_spml_base_put_request_t,
+    mca_spml_base_request_t,
+    mca_spml_base_put_request_construct,
+    mca_spml_base_put_request_destruct
+);
+
+
+/**
+ * Constructor for a put request.
+ *
+ * @param request (IN/OUT) Put request being constructed.
+ */
+static void mca_spml_base_put_request_construct(mca_spml_base_put_request_t* request)
+{
+    /* no need to reinit for every send -- never changes */
+    request->req_base.req_type = MCA_SPML_REQUEST_PUT;
+    /* MCA_SPML_BASE_PUT_REQUEST_FINI runs opal_convertor_cleanup() on the
+     * embedded convertor, so it has to be a constructed object; the get and
+     * atomic request constructors do the same.
+     */
+    OBJ_CONSTRUCT(&request->req_base.req_convertor, opal_convertor_t);
+}
+
+/**
+ * Destructor for a put request; intentionally does nothing.
+ *
+ * @param req (IN) Put request being destroyed.
+ */
+static void mca_spml_base_put_request_destruct(mca_spml_base_put_request_t* req)
+{
+    /* For each request the convertor get cleaned after each message
+     * (in the base _FINI macro). Therefore, as the convertor is a static object
+     * we don't have to call OBJ_DESTRUCT here.
+     */
+}
+
diff --git a/oshmem/mca/spml/base/spml_base_putreq.h b/oshmem/mca/spml/base/spml_base_putreq.h
new file mode 100644
index 0000000000..1a51c536a3
--- /dev/null
+++ b/oshmem/mca/spml/base/spml_base_putreq.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_SPML_BASE_PUT_REQUEST_H +#define MCA_SPML_BASE_PUT_REQUEST_H + +#include "oshmem_config.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/spml_base_request.h" +#include "ompi/peruse/peruse-internal.h" + +BEGIN_C_DECLS + +/** + * Base type for send requests + */ +struct mca_spml_base_put_request_t { + mca_spml_base_request_t req_base; /**< base request type - common data structure for use by wait/test */ + void *req_addr; /**< pointer to send buffer - may not be application buffer */ + size_t req_bytes_packed; /**< packed size of a message given the datatype and count */ +}; +typedef struct mca_spml_base_put_request_t mca_spml_base_put_request_t; + +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION( mca_spml_base_put_request_t ); + +/** + * Initialize a send request with call parameters. + * + * @param request (IN) Send request + * @param addr (IN) User buffer + * @param count (IN) Number of bytes. + * @param peer (IN) Destination rank + * @param comm (IN) Communicator + * @param mode (IN) Send mode (STANDARD,BUFFERED,SYNCHRONOUS,READY) + * @param persistent (IN) Is request persistent. + * @param convertor_flags (IN) Flags to pass to convertor + * + * Perform a any one-time initialization. Note that per-use initialization + * is done in the send request start routine. 
+ */ + +#define MCA_SPML_BASE_PUT_REQUEST_INIT( request, \ + addr, \ + count, \ + peer, \ + persistent) \ + { \ + OSHMEM_REQUEST_INIT(&(request)->req_base.req_oshmem, persistent); \ + (request)->req_addr = addr; \ + (request)->req_base.req_addr = addr; \ + (request)->req_base.req_count = count; \ + (request)->req_base.req_peer = (int32_t)peer; \ + (request)->req_base.req_spml_complete = OPAL_INT_TO_BOOL(persistent); \ + (request)->req_base.req_free_called = false; \ + (request)->req_base.req_oshmem.req_status._cancelled = 0; \ + (request)->req_bytes_packed = 0; \ + \ + } + + + +/** + * Mark the request as started from the SPML base point of view. + * + * @param request (IN) The put request. + */ + +#define MCA_SPML_BASE_PUT_START( request ) \ + do { \ + (request)->req_spml_complete = false; \ + (request)->req_oshmem.req_complete = false; \ + (request)->req_oshmem.req_state = OSHMEM_REQUEST_ACTIVE; \ + (request)->req_oshmem.req_status._cancelled = 0; \ + } while (0) + +/** + * Release the ref counts on the communicator and datatype. + * + * @param request (IN) The put request. + */ + +#define MCA_SPML_BASE_PUT_REQUEST_FINI( request ) \ + do { \ + OSHMEM_REQUEST_FINI(&(request)->req_base.req_oshmem); \ + opal_convertor_cleanup( &((request)->req_base.req_convertor) ); \ + } while (0) + + +END_C_DECLS + +#endif + diff --git a/oshmem/mca/spml/base/spml_base_request.c b/oshmem/mca/spml/base/spml_base_request.c new file mode 100644 index 0000000000..de37935a16 --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_request.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/spml_base_request.h" + +/** + * If you wonder why these 2 freelists are declared here read the comment + * in the spml_base_request.h file. 
+ */ +ompi_free_list_t mca_spml_base_put_requests/* = {{{0}}}*/; +ompi_free_list_t mca_spml_base_get_requests /*= {{{0}}}*/; +ompi_free_list_t mca_spml_base_atomic_requests = {{{0}}}; + +static void mca_spml_base_request_construct(mca_spml_base_request_t* req) +{ + req->req_oshmem.req_type = OSHMEM_REQUEST_SPML; +} + +static void mca_spml_base_request_destruct(mca_spml_base_request_t* req) +{ +} + +OBJ_CLASS_INSTANCE( + mca_spml_base_request_t, + oshmem_request_t, + mca_spml_base_request_construct, + mca_spml_base_request_destruct +); + diff --git a/oshmem/mca/spml/base/spml_base_request.h b/oshmem/mca/spml/base/spml_base_request.h new file mode 100644 index 0000000000..e68df69cd8 --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_request.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_SPML_BASE_REQUEST_H +#define MCA_SPML_BASE_REQUEST_H + +#include "oshmem_config.h" +#include "oshmem/request/request.h" /* TODO: define */ + +#include "opal/datatype/opal_convertor.h" + +#include "ompi/class/ompi_free_list.h" +#include "ompi/mca/pml/ob1/pml_ob1_comm.h" + + +BEGIN_C_DECLS + +/** + * External list for the requests. They are declared as lists of + * the basic request type, which will allow all SPML to overload + * the list. Beware these free lists have to be initialized + * directly by the SPML who win the SPML election. + */ +OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_put_requests; +OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_get_requests; +OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_send_requests; +OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_recv_requests; +OSHMEM_DECLSPEC extern ompi_free_list_t mca_spml_base_atomic_requests; + +/* TODO: Consider to add requests lists + * 1. List of Non blocking requests with NULL handle. + * 2. 
List of Non blocking request with Non-NULL handle. + * 3. List of non completed puts (for small msgs). + */ + +/** + * Types of one sided requests. + */ +typedef enum { + MCA_SPML_REQUEST_NULL, + MCA_SPML_REQUEST_PUT, /* Put request */ + MCA_SPML_REQUEST_GET, /* Get Request */ + MCA_SPML_REQUEST_SEND, /* Send Request */ + MCA_SPML_REQUEST_RECV, /* Receive Request */ + MCA_SPML_REQUEST_ATOMIC_CAS, /* Atomic Compare-And-Swap request */ + MCA_SPML_REQUEST_ATOMIC_FAAD /* Atomic Fatch-And-Add request */ +} mca_spml_base_request_type_t; + + +/** + * Base type for SPML one sided requests + */ +struct mca_spml_base_request_t { + + oshmem_request_t req_oshmem; /**< base request */ + volatile bool req_spml_complete; /**< flag indicating if the one sided layer is done with this request */ + mca_spml_base_request_type_t req_type; /**< SHMEM request type */ + volatile bool req_free_called; /**< flag indicating if the user has freed this request */ + opal_convertor_t req_convertor; /**< always need the convertor */ + + void *req_addr; /**< pointer to application buffer */ + size_t req_count; /**< count of user datatype elements */ /* TODO: Need to remove since we are going to remove datatype*/ + int32_t req_peer; /**< peer process - rank of process executing the parallel program */ + oshmem_proc_t* req_proc; /**< peer process */ + uint64_t req_sequence; /**< sequence number for shmem one sided ordering */ +}; +typedef struct mca_spml_base_request_t mca_spml_base_request_t; + +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(mca_spml_base_request_t); + +END_C_DECLS + +#endif + diff --git a/oshmem/mca/spml/base/spml_base_request_dbg.h b/oshmem/mca/spml/base/spml_base_request_dbg.h new file mode 100644 index 0000000000..9ae3907ddf --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_request_dbg.h @@ -0,0 +1,30 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef MCA_SPML_BASE_REQUEST_DBG_H +#define MCA_SPML_BASE_REQUEST_DBG_H + + +/** + * Type of request. + */ +typedef enum { + MCA_SPML_REQUEST_NULL, + MCA_SPML_REQUEST_PUT, /* Added */ + MCA_SPML_REQUEST_GET, /* Added */ + MCA_SPML_REQUEST_ATOMIC_CAS, /* Added */ + MCA_SPML_REQUEST_ATOMIC_FAAD /* Added */ + /*MCA_SPML_REQUEST_SEND, + MCA_SPML_REQUEST_RECV, + MCA_SPML_REQUEST_IPROBE, + MCA_SPML_REQUEST_PROBE*/ +} mca_spml_base_request_type_t; + +#endif /* MCA_SPML_BASE_REQUEST_DBG_H */ diff --git a/oshmem/mca/spml/base/spml_base_select.c b/oshmem/mca/spml/base/spml_base_select.c new file mode 100644 index 0000000000..0a64ff4093 --- /dev/null +++ b/oshmem/mca/spml/base/spml_base_select.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#ifdef HAVE_STRING_H +#include +#endif +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/mca/memheap/buddy/memheap_buddy.h" +#include "opal/class/opal_list.h" /*TODO: included in spml/base/base.h remove this include */ +#include "opal/util/output.h" +#include "orte/util/show_help.h" +#include "opal/runtime/opal_progress.h" +/* TODO: remove included in spml.h #include "opal/mca/mca.h" */ +#include "opal/mca/base/base.h" +#include "opal/runtime/opal.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "oshmem/constants.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/proc/proc.h" + +typedef struct opened_component_t { + opal_list_item_t super; + mca_spml_base_component_t *om_component; +} opened_component_t; + +/* TODO: Restore modex. + static bool modex_reqd=false; +*/ + + +/** + * Function for selecting one component from all those that are + * available. 
+ * + * Call the init function on all available components and get their + * priorities. Select the component with the highest priority. All + * other components will be closed and unloaded. The selected component + * will have all of its function pointers saved and returned to the + * caller. + */ +int mca_spml_base_select(bool enable_progress_threads, + bool enable_mpi_threads) +{ + int i, priority = 0, best_priority = 0, num_spml = 0; + int round = 0; + opal_list_item_t *item = NULL; + mca_base_component_list_item_t *cli = NULL; + mca_spml_base_component_t *component = NULL, *best_component = NULL; + mca_spml_base_module_t *module = NULL, *best_module = NULL; + opal_list_t opened; + opened_component_t *om = NULL; + bool found_spml; +/* TODO:Irit Consider restoring FT +#if OPAL_ENABLE_FT == 1 + mca_spml_base_component_t *wrapper_component = NULL; + mca_spml_base_module_t *wrapper_module = NULL; + int wrapper_priority = -1; +#endif +Irit */ + + /* Traverse the list of available components; call their init + functions. 
*/ + + best_priority = -1; + best_component = NULL; + module = NULL; + OBJ_CONSTRUCT(&opened, opal_list_t); + for (round = 0; (round < 2) && (NULL == best_component); round++) { + for (item = opal_list_get_first(&mca_spml_base_components_available); + ((opal_list_get_end(&mca_spml_base_components_available) != item) && (item != NULL)); + item = opal_list_get_next(item) ) { + cli = (mca_base_component_list_item_t *) item; + component = (mca_spml_base_component_t *) cli->cli_component; + + /* if there is an include list - item must be in the list to be included */ + if (0 == round) { + found_spml = false; + for( i = 0; i < opal_pointer_array_get_size(&mca_spml_base_spml); i++) { + char * tmp_val = NULL; + tmp_val = (char *) opal_pointer_array_get_item(&mca_spml_base_spml, i); + if( NULL == tmp_val) { + continue; + } + if(0 == strncmp(component->spmlm_version.mca_component_name, + tmp_val, strlen(component->spmlm_version.mca_component_name)) ) { + found_spml = true; + break; + } + } + } + else { + found_spml = true; + } + + if(!found_spml && opal_pointer_array_get_size(&mca_spml_base_spml)) { + SPML_VERBOSE( 10,"select: component %s not in the include list", + component->spmlm_version.mca_component_name ); + + continue; + } + + /* if there is no init function - ignore it */ + if (NULL == component->spmlm_init) { + SPML_VERBOSE( 10,"select: no init function; ignoring component %s", + component->spmlm_version.mca_component_name ); + continue; + } + + /* this is a spml that could be considered */ + num_spml++; + + /* Init component to get its priority */ + SPML_VERBOSE( 10,"select: initializing %s component %s", + component->spmlm_version.mca_type_name, + component->spmlm_version.mca_component_name ); + priority = best_priority; + module = component->spmlm_init(&priority, enable_progress_threads, + enable_mpi_threads); + if (NULL == module) { + SPML_VERBOSE( 10,"select: init returned failure for component %s", + component->spmlm_version.mca_component_name ); + continue; 
+ } + + SPML_VERBOSE( 10,"select: init returned priority %d", priority ); + + + /* TODO: Consider restoring FT */ + /* Irit + #if OPAL_ENABLE_FT == 1 */ + /* Determine if this is the wrapper component */ + /* if( priority <= SPML_SELECT_WRAPPER_PRIORITY) { + SPML_VERBOSE( 10,"spml:select: Wrapper Component: Component %s was determined to be a Wrapper SPML with priority %d", + component->pmlm_version.mca_component_name, priority ); + wrapper_priority = priority; + wrapper_component = component; + wrapper_module = module; + continue; + }*/ + /* Otherwise determine if this is the best component */ + /* else + #endif + Irit */ + + if (priority > best_priority) { + best_priority = priority; + best_component = component; + best_module = module; + } + + om = (opened_component_t*)malloc(sizeof(opened_component_t)); + if (NULL == om) { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + OBJ_CONSTRUCT(om, opal_list_item_t); + om->om_component = component; + opal_list_append(&opened, (opal_list_item_t*) om); + } + + /* Sasha: don't think that we need this code, but it can still be useful for debugging */ + if ((0 == round) && (NULL == best_component)) { + num_spml = 0; + for( i = 0; i < opal_pointer_array_get_size(&mca_spml_base_spml); i++) { + char * tmp_val = NULL; + tmp_val = (char *) opal_pointer_array_get_item(&mca_spml_base_spml, i); + if( NULL == tmp_val) { + continue; + } + SPML_VERBOSE(1, "SPML %s cannot be selected", tmp_val); + } + } + } + + /* Finished querying all components. Check for the bozo case. */ + + if( NULL == best_component ) { + orte_show_help("help-shmem-mca.txt", "find-available:none-found", true, "spml"); + for( i = 0; i < opal_pointer_array_get_size(&mca_spml_base_spml); i++) { + char * tmp_val = NULL; + tmp_val = (char *) opal_pointer_array_get_item(&mca_spml_base_spml, i); + if( NULL == tmp_val) { + continue; + } + orte_errmgr.abort(1, "SPML %s cannot be selected", tmp_val); + } + if(0 == i) { + orte_errmgr.abort(2, "No spml component available. 
This shouldn't happen."); + } + } + + SPML_VERBOSE( 10,"selected %s best priority %d\n", + best_component->spmlm_version.mca_component_name, best_priority); + + +/* TODO: restore. Irit removed temporarily */ + /* if more than one SPML could be considered, then we still need the + * modex since we cannot know which one will be selected on all procs + */ +/* if (1 < num_spml) { + modex_reqd = true; + } +*/ + + + /* Finalize all non-selected components */ +/*TODO: IRIT suggestion why not replace with mca_base_components_close with best_component and wrapper (in case of FT) at skip? */ + for (item = opal_list_remove_first(&opened); + NULL != item; + item = opal_list_remove_first(&opened)) { + om = (opened_component_t *) item; + + if (om->om_component != best_component +/* TODO: Consider restoring FT */ +/* Irit +#if OPAL_ENABLE_FT == 1 + && om->om_component != wrapper_component +#endif +Irit */ + ) { + /* Finalize */ + + if (NULL != om->om_component->spmlm_finalize) { + + /* Blatently ignore the return code (what would we do to + recover, anyway? 
This component is going away, so errors + don't matter anymore) */ + + om->om_component->spmlm_finalize(); + SPML_VERBOSE(10, "select: component %s not selected / finalized", + om->om_component->spmlm_version.mca_component_name); + } + } + OBJ_DESTRUCT( om ); + free(om); + } + OBJ_DESTRUCT( &opened ); +/*TODO: Irit end of suggestion */ + +/* TODO: Consider restoring FT*/ +#if 0 +#if OPAL_ENABLE_FT == 1 + /* Remove the wrapper component from the mca_spml_base_components_available list + * so we don't unload it prematurely in the next call + */ + if( NULL != wrapper_component ) { + for (item = opal_list_get_first(&mca_spml_base_components_available); + item != opal_list_get_end(&mca_spml_base_components_available); + item = opal_list_get_next(item) ) { + cli = (mca_base_component_list_item_t *) item; + component = (mca_spml_base_component_t *) cli->cli_component; + + if( component == wrapper_component ) { + opal_list_remove_item(&mca_spml_base_components_available, item); + } + } + } +#endif +#endif + + /* Save the winner */ + + mca_spml_base_selected_component = *best_component; + mca_spml = *best_module; + SPML_VERBOSE( 10, "select: component %s selected", + mca_spml_base_selected_component.spmlm_version.mca_component_name ); + + /* This base function closes, unloads, and removes from the + available list all unselected components. The available list will + contain only the selected component. 
*/ + + mca_base_components_close(mca_spml_base_output, + &mca_spml_base_components_available, + (mca_base_component_t *) best_component); + +/* TODO: Consider restoring FT */ +#if 0 +#if OPAL_ENABLE_FT == 1 + /* If we have a wrapper then initalize it */ + if( NULL != wrapper_component ) { + priority = SPML_SELECT_WRAPPER_PRIORITY; + SPML_VERBOSE( 10,"spml:select: Wrapping: Component %s [%d] is being wrapped by component %s [%d]", + mca_spml_base_selected_component.spmlm_version.mca_component_name, + best_priority, + wrapper_component->spmlm_version.mca_component_name, + wrapper_priority ); + + /* Ask the wrapper commponent to wrap around the currently + * selected component. Indicated by the priority value provided + * this will cause the wrapper to do something different this time around + */ + module = wrapper_component->spmlm_init(&priority, + enable_progress_threads, + enable_mpi_threads); + /* Replace with the wrapper */ + best_component = wrapper_component; + mca_spml_base_selected_component = *best_component; + best_module = module; + mca_spml = *best_module; + } +#endif +#endif + /* register the winner's callback */ + /*if( NULL != mca_spml.spml_progress ) { + opal_progress_register(mca_spml.spml_progress); + }*/ + + +/* TODO: Restore. Irit Disabled temporarily. */ + /* register winner in the modex */ +/* if (modex_reqd && 0 == ORTE_PROC_MY_NAME->vpid) { + mca_spml_base_spml_selected(best_component->spmlm_version.mca_component_name); + } +*/ + + /* All done */ + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/spml/configure.m4 b/oshmem/mca/spml/configure.m4 new file mode 100644 index 0000000000..080a5f41e4 --- /dev/null +++ b/oshmem/mca/spml/configure.m4 @@ -0,0 +1,19 @@ +# -*- shell-script -*- +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. 
+# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AC_DEFUN([MCA_oshmem_spml_CONFIG],[ + # configure all the components + MCA_CONFIGURE_FRAMEWORK($1, $2, 1) + + # this is a direct callable component, so set that up. + MCA_SETUP_DIRECT_CALL($1, $2) +]) diff --git a/oshmem/mca/spml/ikrit/.windows b/oshmem/mca/spml/ikrit/.windows new file mode 100644 index 0000000000..4e9e484624 --- /dev/null +++ b/oshmem/mca/spml/ikrit/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_dependencies=libshmem diff --git a/oshmem/mca/spml/ikrit/Makefile.am b/oshmem/mca/spml/ikrit/Makefile.am new file mode 100644 index 0000000000..c264240363 --- /dev/null +++ b/oshmem/mca/spml/ikrit/Makefile.am @@ -0,0 +1,42 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = \ + help-shmem-spml-ikrit.txt + +AM_CFLAGS = $(OSHMEM_CFLAGS) +AM_CPPFLAGS = $(spml_ikrit_CPPFLAGS) + +ikrit_sources = \ + spml_ikrit.c \ + spml_ikrit.h \ + spml_ikrit_component.c \ + spml_ikrit_component.h + +if MCA_BUILD_oshmem_spml_ikrit_DSO +component_noinst = +component_install = mca_spml_ikrit.la +else +component_noinst = libmca_spml_ikrit.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_spml_ikrit_la_SOURCES = $(ikrit_sources) +mca_spml_ikrit_la_LIBADD = $(spml_ikrit_LIBS) +mca_spml_ikrit_la_LDFLAGS = -module -avoid-version $(spml_ikrit_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_spml_ikrit_la_SOURCES = $(ikrit_sources) +libmca_spml_ikrit_la_LIBADD = $(spml_ikrit_LIBS) +libmca_spml_ikrit_la_LDFLAGS = -module -avoid-version $(spml_ikrit_LDFLAGS) + diff --git a/oshmem/mca/spml/ikrit/configure.m4 b/oshmem/mca/spml/ikrit/configure.m4 new 
file mode 100644 index 0000000000..9ada989a67 --- /dev/null +++ b/oshmem/mca/spml/ikrit/configure.m4 @@ -0,0 +1,35 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_oshmem_spml_ikrit_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_oshmem_spml_ikrit_CONFIG],[ + AC_CONFIG_FILES([oshmem/mca/spml/ikrit/Makefile]) + + OMPI_CHECK_MXM([spml_ikrit], + [AC_DEFINE([OSHMEM_HAS_IKRIT], [1], [mxm support is available]) + spml_ikrit_happy="yes"], + [spml_ikrit_happy="no"]) + + AS_IF([test "$spml_ikrit_happy" = "yes"], + [spml_ikrit_WRAPPER_EXTRA_LDFLAGS="$spml_ikrit_LDFLAGS" + spml_ikrit_WRAPPER_EXTRA_LIBS="$spml_ikrit_LIBS" + $1], + [$2]) + + + # substitute in the things needed to build mxm + AC_SUBST([spml_ikrit_CFLAGS]) + AC_SUBST([spml_ikrit_CPPFLAGS]) + AC_SUBST([spml_ikrit_LDFLAGS]) + AC_SUBST([spml_ikrit_LIBS]) +])dnl + diff --git a/oshmem/mca/spml/ikrit/configure.params b/oshmem/mca/spml/ikrit/configure.params new file mode 100644 index 0000000000..7bc1905bb2 --- /dev/null +++ b/oshmem/mca/spml/ikrit/configure.params @@ -0,0 +1,14 @@ +# -*- shell-script -*- +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/spml/ikrit/help-shmem-spml-ikrit.txt b/oshmem/mca/spml/ikrit/help-shmem-spml-ikrit.txt new file mode 100644 index 0000000000..c85baeee0f --- /dev/null +++ b/oshmem/mca/spml/ikrit/help-shmem-spml-ikrit.txt @@ -0,0 +1,68 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +[no uuid present] +Error obtaining unique transport key from ORTE (orte_precondition_transports %s +the environment).
+ + Local host: %s + +[unable to create endpoint] +MXM was unable to create an endpoint. Please make sure that the network link is +active on the node and the hardware is functioning. + + Error: %s + +[unable to extract endpoint ib address] +MXM was unable to read IB settings for endpoint + + Error: %s + +[unable to extract endpoint local address] +MXM was unable to read shmem settings for endpoint + + Error: %s + +[mxm mq create] +Failed to create MQ for endpoint + + Error: %s + +[errors during mxm_progress] + +Error %s occurred in attempting to make network progress (mxm_progress). + + +[mxm init] +Initialization of MXM library failed. + + Error: %s + +[error polling network] +Error %s occurred in attempting to make network progress (mxm_mq_ipeek). + +[error posting receive] +Unable to post application receive buffer + + Error: %s + Buffer: %p + Length: %d + +[error posting send] +Unable to post application send buffer + + Error: %s + +[error while waiting in send] +Error while waiting in send + + Error: %s + \ No newline at end of file diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c new file mode 100644 index 0000000000..c696d20849 --- /dev/null +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -0,0 +1,1343 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#define _GNU_SOURCE +#include + +#include +#include +#include + + +#include "oshmem_config.h" +#include "opal/datatype/opal_convertor.h" +#include "orte/include/orte/types.h" +#include "orte/runtime/orte_globals.h" +#include "oshmem/mca/spml/ikrit/spml_ikrit.h" +#include "oshmem/include/shmem.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/mca/spml/base/spml_base_putreq.h" +#include "oshmem/runtime/runtime.h" +#include "orte/util/show_help.h" +#include "ompi/runtime/ompi_module_exchange.h" + +#include "oshmem/mca/spml/ikrit/spml_ikrit_component.h" +#include "ompi/communicator/communicator.h" /*TODO: ompi_communicator_t */ +#include "ompi/patterns/comm/coll_ops.h" /*TODO: comm_bcast_pml */ + +/* use zcopy for put/get via sysv shared memory */ +#define SPML_IKRIT_USE_SHM_ZCOPY +//#define SPML_IKRIT_DEBUG_PUT + +typedef struct spml_ikrit_am_hdr { + uint64_t va; +} spml_ikrit_am_hdr_t; + +struct mca_spml_ikrit_put_request { + mca_spml_base_put_request_t req_put; + mxm_send_req_t mxm_req; + int pe; + mxm_req_buffer_t iov[2]; + spml_ikrit_am_hdr_t am_pkt; +}; + +typedef struct mca_spml_ikrit_put_request mca_spml_ikrit_put_request_t; +OBJ_CLASS_DECLARATION(mca_spml_ikrit_put_request_t); + +static int spml_ikrit_get_ep_address(spml_ikrit_mxm_ep_conn_info_t *ep_info, mxm_ptl_id_t ptlid) +{ + size_t addrlen; + mxm_error_t err; + + addrlen = sizeof(ep_info->ptl_addr[ptlid]); + err = mxm_ep_address(mca_spml_ikrit.mxm_ep, ptlid, + (struct sockaddr *) &ep_info->ptl_addr[ptlid], &addrlen); + if (MXM_OK != err) { + orte_show_help("help-shmem-spml-ikrit.txt", (MXM_PTL_RDMA == ptlid) ? "unable to extract endpoint ib address" + : "unable to extract endpoint local address", true, mxm_error_string(err)); + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +} + +static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req) +{ + while (!mxm_req_test(req))
+ opal_progress(); +} + +static int mca_spml_ikrit_put_request_free(struct oshmem_request_t** request) +{ + mca_spml_ikrit_put_request_t *put_req = *(mca_spml_ikrit_put_request_t **)request; + + assert(false == put_req->req_put.req_base.req_free_called); + OPAL_THREAD_LOCK(&oshmem_request_lock); + put_req->req_put.req_base.req_free_called = true; + OMPI_FREE_LIST_RETURN( &mca_spml_base_put_requests, + (ompi_free_list_item_t*)put_req); + OPAL_THREAD_UNLOCK(&oshmem_request_lock); + + *request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/ + + return OSHMEM_SUCCESS; +} + +static int mca_spml_ikrit_put_request_cancel(struct oshmem_request_t * request, int complete) +{ + return OSHMEM_SUCCESS; +} + +static void mca_spml_ikrit_put_request_construct(mca_spml_ikrit_put_request_t* req) +{ + req->req_put.req_base.req_type = MCA_SPML_REQUEST_PUT; + req->req_put.req_base.req_oshmem.req_free = mca_spml_ikrit_put_request_free; + req->req_put.req_base.req_oshmem.req_cancel = mca_spml_ikrit_put_request_cancel; +} + +static void mca_spml_ikrit_put_request_destruct(mca_spml_ikrit_put_request_t* req) +{ +} + +OBJ_CLASS_INSTANCE( mca_spml_ikrit_put_request_t, + mca_spml_base_put_request_t, + mca_spml_ikrit_put_request_construct, + mca_spml_ikrit_put_request_destruct ); + + +struct mca_spml_ikrit_get_request { + mca_spml_base_get_request_t req_get; + mxm_send_req_t mxm_req; +}; + +typedef struct mca_spml_ikrit_get_request mca_spml_ikrit_get_request_t; +OBJ_CLASS_DECLARATION(mca_spml_ikrit_get_request_t); + +static int mca_spml_ikrit_get_request_free(struct oshmem_request_t** request) +{ + mca_spml_ikrit_get_request_t *get_req = *(mca_spml_ikrit_get_request_t **)request; + + assert(false == get_req->req_get.req_base.req_free_called); + OPAL_THREAD_LOCK(&oshmem_request_lock); + get_req->req_get.req_base.req_free_called = true; + OMPI_FREE_LIST_RETURN( &mca_spml_base_get_requests, + (ompi_free_list_item_t*)get_req); + OPAL_THREAD_UNLOCK(&oshmem_request_lock); + + *request = 
SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/ + + return OSHMEM_SUCCESS; +} + +static int mca_spml_ikrit_get_request_cancel(struct oshmem_request_t * request, int complete) +{ + return OSHMEM_SUCCESS; +} + +static void mca_spml_ikrit_get_request_construct(mca_spml_ikrit_get_request_t* req) +{ + req->req_get.req_base.req_type = MCA_SPML_REQUEST_GET; + req->req_get.req_base.req_oshmem.req_free = mca_spml_ikrit_get_request_free; + req->req_get.req_base.req_oshmem.req_cancel = mca_spml_ikrit_get_request_cancel; +} + +static void mca_spml_ikrit_get_request_destruct(mca_spml_ikrit_get_request_t* req) +{ +} + +OBJ_CLASS_INSTANCE( mca_spml_ikrit_get_request_t, + mca_spml_base_get_request_t, + mca_spml_ikrit_get_request_construct, + mca_spml_ikrit_get_request_destruct ); + +int mca_spml_ikrit_put_simple(void* dst_addr, size_t size, void* src_addr, int dst); + +static void mxm_setup_relays(oshmem_proc_t **procs, size_t nprocs); + +mca_spml_ikrit_t mca_spml_ikrit = { + { + /* Init mca_spml_base_module_t */ + mca_spml_ikrit_add_procs, + mca_spml_ikrit_del_procs, + mca_spml_ikrit_enable, + mca_spml_ikrit_register, + mca_spml_ikrit_deregister, + mca_spml_ikrit_oob_get_mkeys, + mca_spml_ikrit_put, + mca_spml_ikrit_put_nb, + mca_spml_ikrit_get, + mca_spml_ikrit_recv , + mca_spml_ikrit_send, + mca_spml_base_wait, + mca_spml_base_wait_nb, + mca_spml_ikrit_fence + } +}; + +void mca_spml_ikrit_dump_stats(void); +void mca_spml_ikrit_dump_stats() +{ + int i, num_procs = oshmem_num_procs(); + char sbuf[1024]; + FILE *fp = fmemopen(sbuf, sizeof(sbuf), "w+"); + + if (NULL == fp) { return; } + /* sbuf is only NUL-terminated once the memory stream has been flushed */ + for (i = 0; i < num_procs; i++) { + mxm_print_conn_state(mca_spml_ikrit.mxm_peers[i]->mxm_conn, MXM_STATE_DETAIL_LEVEL_DATA, "", fp); + fflush(fp); + printf("=========== pe:%d conn:%p stats:\n %s==================\n", i, mca_spml_ikrit.mxm_peers[i]->mxm_conn, sbuf); + rewind(fp); + } + fclose(fp); +} + +static inline mca_spml_ikrit_put_request_t *alloc_put_req(void) +{ + mca_spml_ikrit_put_request_t *req; +
ompi_free_list_item_t* item; + int rc; + + rc = OSHMEM_SUCCESS; + OMPI_FREE_LIST_WAIT(&mca_spml_base_put_requests, item, rc); + if (OMPI_SUCCESS != rc) + return NULL; + + req = (mca_spml_ikrit_put_request_t *)item; + req->req_put.req_base.req_free_called = false; + req->req_put.req_base.req_oshmem.req_complete = false; + + return req; +} + +static inline mca_spml_ikrit_get_request_t *alloc_get_req(void) +{ + mca_spml_ikrit_get_request_t *req; + ompi_free_list_item_t* item; + int rc; + + rc = OSHMEM_SUCCESS; + OMPI_FREE_LIST_WAIT(&mca_spml_base_get_requests, item, rc); + if (OMPI_SUCCESS != rc) + return NULL; + + req = (mca_spml_ikrit_get_request_t *)item; + req->req_get.req_base.req_free_called = false; + req->req_get.req_base.req_oshmem.req_complete = false; + + return req; +} + +int mca_spml_ikrit_enable(bool enable) +{ + SPML_VERBOSE(50, "*** ikrit ENABLED ****"); + ompi_free_list_init_new( &mca_spml_base_put_requests, + sizeof(mca_spml_ikrit_put_request_t), + opal_cache_line_size, + OBJ_CLASS(mca_spml_ikrit_put_request_t), + 0,opal_cache_line_size, + mca_spml_ikrit.free_list_num, + mca_spml_ikrit.free_list_max, + mca_spml_ikrit.free_list_inc, + NULL ); + + ompi_free_list_init_new( &mca_spml_base_get_requests, + sizeof(mca_spml_ikrit_get_request_t), + opal_cache_line_size, + OBJ_CLASS(mca_spml_ikrit_get_request_t), + 0,opal_cache_line_size, + mca_spml_ikrit.free_list_num, + mca_spml_ikrit.free_list_max, + mca_spml_ikrit.free_list_inc, + NULL ); + + return OSHMEM_SUCCESS; +} + +static int create_ptl_idx(int dst_pe) +{ + oshmem_proc_t *proc; + + proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); + + proc->transport_ids = (char *)malloc(MXM_PTL_LAST * sizeof(char)); + if (!proc->transport_ids) + return OSHMEM_ERROR; + + proc->num_transports = 1; + if (oshmem_my_proc_id() == dst_pe) + proc->transport_ids[0] = MXM_PTL_SELF; + else + proc->transport_ids[0] = MXM_PTL_RDMA; + return OSHMEM_SUCCESS; +} + +static void destroy_ptl_idx(int dst_pe) +{ + oshmem_proc_t 
*proc; + + proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); + if (proc->transport_ids) + free(proc->transport_ids); +} + +static void mxm_peer_construct(mxm_peer_t *p) +{ + p->pe = -1; + p->n_active_puts = 0; + p->need_fence = 0; + p->pe_relay = -1; + p->n_slaves = 0; +} + +static void mxm_peer_destruct(mxm_peer_t *p) +{ + /* may be we need to remove item from list */ +} + +OBJ_CLASS_INSTANCE( + mxm_peer_t, + opal_list_item_t, + mxm_peer_construct, + mxm_peer_destruct + ); + + +int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs) +{ + size_t i; + opal_list_item_t *item; + + if (mca_spml_ikrit.mxm_ep) { + mxm_ep_destroy(mca_spml_ikrit.mxm_ep); + mca_spml_ikrit.mxm_ep = 0; + } + + while (NULL != (item = opal_list_remove_first(&mca_spml_ikrit.active_peers))) {}; + OBJ_DESTRUCT(&mca_spml_ikrit.active_peers); + + for (i = 0; i < nprocs; i++) { + destroy_ptl_idx(i); + if (mca_spml_ikrit.mxm_peers && mca_spml_ikrit.mxm_peers[i]) { + //mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn); + OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]); + } + } + free(mca_spml_ikrit.mxm_peers); + mca_spml_ikrit.mxm_peers = NULL; + + return OSHMEM_SUCCESS; +} + +int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs) +{ + spml_ikrit_mxm_ep_conn_info_t* ep_info; + mxm_conn_req_t *conn_reqs; + mxm_error_t err; + size_t i; + int rc = OSHMEM_ERROR; + oshmem_proc_t *proc_self; + int my_rank = oshmem_my_proc_id(); + int timeout; + + OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t); + /* Allocate connection requests */ + conn_reqs = malloc(nprocs * sizeof(mxm_conn_req_t)); + ep_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t)); + if (NULL == conn_reqs || NULL == ep_info) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + goto bail; + } + memset(conn_reqs, 0x0, nprocs * sizeof(mxm_conn_req_t)); + memset(ep_info, 0x0, nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t)); + + mca_spml_ikrit.mxm_peers = (mxm_peer_t **)calloc(nprocs, sizeof(*(mca_spml_ikrit.mxm_peers))); + if (NULL == mca_spml_ikrit.mxm_peers) { + rc
= OSHMEM_ERR_OUT_OF_RESOURCE; + goto bail; + } + + int *ranks_in_comm = (int *) malloc(nprocs * sizeof(int)); + if (NULL == ranks_in_comm) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + goto bail; + } + for (i = 0; i < nprocs; ++i) { + ranks_in_comm[i] = i; + } + + if (OSHMEM_SUCCESS != spml_ikrit_get_ep_address(ep_info + my_rank, MXM_PTL_SELF) || + OSHMEM_SUCCESS != spml_ikrit_get_ep_address(ep_info + my_rank, MXM_PTL_RDMA)) { + /* rc is still OSHMEM_ERROR here; go through bail so conn_reqs/ep_info are freed */ + free(ranks_in_comm); + goto bail; + } + + opal_progress_register(spml_ikrit_progress); + for (i = 0; i < nprocs; ++i) { + comm_bcast_pml(ep_info + i, i, sizeof(spml_ikrit_mxm_ep_conn_info_t), + MPI_BYTE, my_rank, nprocs, + ranks_in_comm, (ompi_communicator_t *)&ompi_mpi_comm_world); + } + + free(ranks_in_comm); + + /* Get the EP connection requests for all the processes from modex */ + for (i = 0; i < nprocs; ++i) { + mca_spml_ikrit.mxm_peers[i] = OBJ_NEW(mxm_peer_t); + if (NULL == mca_spml_ikrit.mxm_peers[i]) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + goto bail; + } + mca_spml_ikrit.mxm_peers[i]->pe = i; + + conn_reqs[i].ptl_addr[MXM_PTL_SELF] = (struct sockaddr *)&ep_info[i].ptl_addr[MXM_PTL_SELF]; + conn_reqs[i].ptl_addr[MXM_PTL_SHM] = NULL; + conn_reqs[i].ptl_addr[MXM_PTL_RDMA] = (struct sockaddr *)&ep_info[i].ptl_addr[MXM_PTL_RDMA]; + } + + /* Connect to remote peers */ + if (mxm_get_version() < MXM_VERSION(1,5)) { + timeout = 1000; + } else { + timeout = -1; + } + err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, conn_reqs, nprocs, timeout); + if (MXM_OK != err) { + SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); + for (i = 0; i < nprocs; ++i) { + if (MXM_OK != conn_reqs[i].error) { + SPML_ERROR("MXM EP connect to %s error: %s\n", procs[i]->proc_hostname, + mxm_error_string(conn_reqs[i].error)); + } + } + rc = OSHMEM_ERR_CONNECTION_FAILED; + goto bail; + } + + /* Save returned connections */ + for (i = 0; i < nprocs; ++i) { + mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn; + if (OSHMEM_SUCCESS != create_ptl_idx(i)) + goto bail; + +
//printf("proc=%d name=%s jobid = %u vpid = %u flags = %x\n", (int)i, procs[i]->proc_hostname, procs[i]->proc_name.jobid, procs[i]->proc_name.vpid, procs[i]->proc_flags); + mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]); + } + + if (ep_info) + free(ep_info); + if (conn_reqs) + free(conn_reqs); + + proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank); + /* identify local processes and change transport to SHM */ + for (i = 0; i < nprocs; i++) { + if (procs[i]->proc_name.jobid != proc_self->proc_name.jobid || + !OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) { + continue; + } + if (procs[i] == proc_self) + continue; +#ifdef SPML_IKRIT_USE_SHM_ZCOPY + procs[i]->transport_ids[0] = MXM_PTL_SHM; + procs[i]->transport_ids[1] = MXM_PTL_RDMA; + procs[i]->num_transports = 2; +#endif + } + + mxm_setup_relays(procs, nprocs); + + SPML_VERBOSE(50, "*** ADDED PROCS ***"); + return OSHMEM_SUCCESS; + +bail: + if (ep_info) + free(ep_info); + if (conn_reqs) + free(conn_reqs); + SPML_ERROR("add procs FAILED rc=%d", rc); + return rc; + +} + +mca_spml_mkey_t *mca_spml_ikrit_register(void* addr, size_t size, uint64_t shmid, int *count) +{ + //mxm_error_t err; + int i; + mca_spml_mkey_t *mkeys; + + *count = 0; + mkeys = (mca_spml_mkey_t *)calloc(1, MXM_PTL_LAST * sizeof(*mkeys)); + if(!mkeys){ + return NULL; + } + + for (i = 0; i < MXM_PTL_LAST; i++) { + switch(i) { + case MXM_PTL_SHM: + if ((int)MEMHEAP_SHM_GET_ID(shmid) != MEMHEAP_SHM_INVALID) { + mkeys[i].key = shmid; + mkeys[i].va_base = 0; + } + else { + mkeys[i].key = 0; + mkeys[i].va_base = (unsigned long)addr; + } + mkeys[i].spml_context = 0; + break; + case MXM_PTL_SELF: + mkeys[i].key = 0; + mkeys[i].spml_context = 0; + mkeys[i].va_base = (unsigned long)addr; + break; + case MXM_PTL_RDMA: +#if MXM_API < MXM_VERSION(1,5) + mkeys[i].ib.lkey = mkeys[i].ib.rkey = MXM_MKEY_NONE; +#else + mkeys[i].ib.lkey = mkeys[i].ib.rkey = 0; +#endif + mkeys[i].spml_context = 0; +#if 0 + /* don't register memheap if zcopy 
support is not enabled */ + err = mxm_reg_mr(mca_spml_ikrit.mxm_ep, MXM_PTL_RDMA, addr, size, &mkeys[i].ib.lkey, &mkeys[i].ib.rkey); + if (MXM_OK != err) { + SPML_VERBOSE(1, "failed to register memory: %s", mxm_error_string(err)); + goto err; + } +#endif + mkeys[i].va_base = (unsigned long)addr; + //mkeys[i].spml_context = (void *)(unsigned long)size; + break; + + default: + SPML_ERROR("unsupported PTL: %d", i); + goto err; + } + SPML_VERBOSE(5,"rank %d ptl %d rkey %x lkey %x key %llx address 0x%llX len %llu shmid 0x%X|0x%X", + oshmem_proc_local_proc->proc_name.vpid, + i, + mkeys[i].ib.rkey, + mkeys[i].ib.lkey, + (unsigned long long)mkeys[i].key, + (unsigned long long)mkeys[i].va_base, + (unsigned long long)size, + MEMHEAP_SHM_GET_TYPE(shmid), MEMHEAP_SHM_GET_ID(shmid) + ); + + } + *count = MXM_PTL_LAST; + + return mkeys; + +err: + mca_spml_ikrit_deregister(mkeys); + return NULL; +} + +int mca_spml_ikrit_deregister(mca_spml_mkey_t *mkeys) +{ + int i; + + if (!mkeys) + return OSHMEM_SUCCESS; + + for (i = 0; i < MXM_PTL_LAST; i++) { + switch(i) { + case MXM_PTL_SELF: + case MXM_PTL_SHM: + break; + case MXM_PTL_RDMA: + /* dereg memory */ + if (!mkeys[i].spml_context) + break; +#if MXM_API < MXM_VERSION(1,5) + mxm_dereg_mr(mca_spml_ikrit.mxm_ep, MXM_PTL_RDMA, + (void *)mkeys[i].va_base, + (unsigned long)mkeys[i].spml_context + ); +#endif + break; + } + } + return OSHMEM_SUCCESS; + +} + +static inline int get_ptl_id(int dst) +{ + oshmem_proc_t *proc; + + /* get endpoint and btl */ + proc = oshmem_proc_group_all(dst); + if (!proc) { + SPML_ERROR("Can not find destination proc for pe=%d", dst); + oshmem_shmem_abort(-1); + return -1; + } + return proc->transport_ids[0]; +} + +int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys) +{ + int ptl; + + ptl = get_ptl_id(pe); + if (ptl < 0) + return OSHMEM_ERROR; + + if (ptl != MXM_PTL_RDMA) + return OSHMEM_ERROR; + + if (seg > 1) + return OSHMEM_ERROR; + +#if MXM_API < MXM_VERSION(1,5) + 
mkeys[ptl].ib.rkey = MXM_MKEY_NONE; +#endif + + return OSHMEM_SUCCESS; +} + + +static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, void *src_addr, size_t size, void *dst_addr, int src) +{ + /* shmem spec states that get() operations are blocking. So it is enough + to have single mxm request. Also we count on mxm doing copy */ + uint64_t rva; + mca_spml_mkey_t *r_mkey; + int ptl_id; + + ptl_id = get_ptl_id(src); + /* already tried to send via shm and failed. go via rdma */ + if (ptl_id == MXM_PTL_SHM) + ptl_id = MXM_PTL_RDMA; + + /** + * Get the address to the remote rkey. + **/ + r_mkey = mca_memheap.memheap_get_cached_mkey(src, (unsigned long)src_addr, ptl_id, &rva); + if (!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", src, src_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + SPML_VERBOSE(100, "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. src_rva=%p, src_rkey=0x%lx", + src, ptl_id, src_addr, dst_addr, (int)size, (void *)rva, r_mkey->key); + + + /* mxm does not really cares for get lkey */ + sreq->base.mq = mca_spml_ikrit.mxm_mq; + sreq->base.conn = mca_spml_ikrit.mxm_peers[src]->mxm_conn; + sreq->base.data_type = MXM_REQ_DATA_BUFFER; + sreq->base.data.buffer.ptr = dst_addr; + sreq->base.data.buffer.length = size; +#if MXM_API < MXM_VERSION(1,5) + sreq->base.data.buffer.mkey = MXM_MKEY_NONE; + sreq->op.mem.remote_mkey = r_mkey->ib.rkey; +#else + sreq->base.data.buffer.memh = NULL; + sreq->op.mem.remote_memh = NULL; +#endif + sreq->opcode = MXM_REQ_OP_GET; + sreq->op.mem.remote_vaddr = (intptr_t)rva; + sreq->base.state = MXM_REQ_NEW; + + return OSHMEM_SUCCESS; +} + +static inline int mca_spml_ikrit_get_shm(void *src_addr, size_t size, void *dst_addr, int src) +{ + int ptl_id; + uint64_t rva; + mca_spml_mkey_t *r_mkey; + + ptl_id = get_ptl_id(src); + /** + * Get the address to the remote rkey. 
+ **/ + if (ptl_id != MXM_PTL_SHM) + return OSHMEM_ERROR; + + r_mkey = mca_memheap.memheap_get_cached_mkey(src, (unsigned long)src_addr, ptl_id, &rva); + if (!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", src, src_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + if (OPAL_UNLIKELY(!mca_memheap.memheap_is_symmetric_addr((unsigned long)src_addr) || (unsigned long)src_addr == rva)) + return OSHMEM_ERROR; + + SPML_VERBOSE(100, "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, src_rkey=0x%lx", + src, src_addr, dst_addr, (int)size, (void *)rva, r_mkey->key); + memcpy(dst_addr, (void *)(unsigned long)rva, size); + opal_progress(); + return OSHMEM_SUCCESS; +} + +int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src) +{ + mxm_send_req_t sreq; + + if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src)) + return OSHMEM_SUCCESS; + + if (OSHMEM_SUCCESS != mca_spml_ikrit_get_helper(&sreq, src_addr, size, dst_addr, src)) { + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + sreq.base.flags = MXM_REQ_FLAG_BLOCKING; + sreq.base.completed_cb = NULL; + + mxm_req_send(&sreq); + opal_progress(); + mca_spml_irkit_req_wait(&sreq.base); + + if (MXM_OK != sreq.base.error) { + SPML_ERROR("get request failed: %s - aborting", mxm_error_string(sreq.base.error)); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static inline void get_completion_cb(void *ctx) +{ + mca_spml_ikrit_get_request_t *get_req = (mca_spml_ikrit_get_request_t *)ctx; + + OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, -1); + get_req->req_get.req_base.req_spml_complete = true; + get_req->req_get.req_base.req_oshmem.req_status.SHMEM_ERROR = OSHMEM_SUCCESS; + oshmem_request_complete( &get_req->req_get.req_base.req_oshmem, 1); + oshmem_request_free( (oshmem_request_t**)&get_req ); +} + +/* extension. 
used 4 fence implementation b4 fence was added to mxm */ +int mca_spml_ikrit_get_async(void *src_addr, size_t size, void *dst_addr, int src) +{ + mca_spml_ikrit_get_request_t *get_req; + + if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src)) + return OSHMEM_SUCCESS; + + get_req = alloc_get_req(); + if (NULL == get_req) { + SPML_ERROR("out of get requests - aborting"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + if (OSHMEM_SUCCESS != mca_spml_ikrit_get_helper(&get_req->mxm_req, src_addr, size, dst_addr, src)) { + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + get_req->mxm_req.base.flags = 0; + get_req->mxm_req.base.completed_cb = get_completion_cb; + get_req->mxm_req.base.context = get_req; + OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1); + + mxm_req_send(&get_req->mxm_req); + + if (MXM_OK != get_req->mxm_req.base.error) { + SPML_ERROR("get request failed: %s - aborting", mxm_error_string(get_req->mxm_req.base.error)); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +static inline void fence_completion_cb(void *ctx) +{ + mca_spml_ikrit_get_request_t *fence_req = (mca_spml_ikrit_get_request_t *)ctx; + + OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, -1); + fence_req->req_get.req_base.req_spml_complete = true; + fence_req->req_get.req_base.req_oshmem.req_status.SHMEM_ERROR = OSHMEM_SUCCESS; + oshmem_request_complete( &fence_req->req_get.req_base.req_oshmem, 1); + oshmem_request_free( (oshmem_request_t**)&fence_req ); +} + +static int mca_spml_ikrit_mxm_fence(int dst) +{ + mca_spml_ikrit_get_request_t *fence_req; + + + fence_req = alloc_get_req(); + if (NULL == fence_req) { + SPML_ERROR("out of get requests - aborting"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; + fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + fence_req->mxm_req.opcode = MXM_REQ_OP_FENCE; + fence_req->mxm_req.base.state = 
MXM_REQ_NEW; + fence_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC; + fence_req->mxm_req.base.completed_cb = fence_completion_cb; + fence_req->mxm_req.base.context = fence_req; + OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, 1); + + mxm_req_send(&fence_req->mxm_req); + return OSHMEM_SUCCESS; +} + +static inline void put_completion_cb(void *ctx) +{ + mca_spml_ikrit_put_request_t *put_req = (mca_spml_ikrit_put_request_t *)ctx; + mxm_peer_t *peer; + + OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1); + peer = mca_spml_ikrit.mxm_peers[put_req->pe]; + + /* this was last put in progress. Remove peer from the list so that we do not need explicit fence */ +#ifdef SPML_IKRIT_DEBUG_PUT + if (peer) { + if (peer->n_active_puts <= 0) { + /* actually this can happen because fence forces ref count to 0 while puts still may be in flight */ + SPML_VERBOSE(1, "pe %d n_active_puts %d", put_req->pe, peer->n_active_puts); + } + } + + if (put_req->mxm_req.base.state != MXM_REQ_COMPLETED) + SPML_ERROR("oops: pe %d uncompleted request state %d", put_req->pe, put_req->mxm_req.base.state); +#endif + + if (0 < peer->n_active_puts) { + peer->n_active_puts--; + if (0 == peer->n_active_puts && (put_req->mxm_req.base.flags & MXM_REQ_FLAG_SEND_SYNC)) { + //SPML_VERBOSE(20, "removed pe %d from active list", put_req->pe); + opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->super); + peer->need_fence = 0; + } + } + + put_req->req_put.req_base.req_spml_complete = true; + put_req->req_put.req_base.req_oshmem.req_status.SHMEM_ERROR = OSHMEM_SUCCESS; + oshmem_request_complete( &put_req->req_put.req_base.req_oshmem, 1); + oshmem_request_free( (oshmem_request_t**)&put_req ); +} + +/** + * TODO: using put request as handle is not good. 
+ */ +static inline int mca_spml_ikrit_put_internal(void* dst_addr, size_t size, void* src_addr, int dst, void **handle, int zcopy) +{ + uint64_t rva; + mca_spml_ikrit_put_request_t *put_req; + int ptl_id; + mca_spml_mkey_t *l_mkey, *r_mkey; + uint32_t lkey; + static int count; + int need_progress = 0; + + ptl_id = get_ptl_id(dst); + /* Get rkey of remote PE (dst proc) which must be on memheap */ + r_mkey = mca_memheap.memheap_get_cached_mkey(dst, (unsigned long)dst_addr, ptl_id, &rva); + if(!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + +#ifdef SPML_IKRIT_DEBUG_PUT + + SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, dst_rkey=0x%lx", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key); +#endif + if (ptl_id == MXM_PTL_SHM) { + + if (OPAL_LIKELY(mca_memheap.memheap_is_symmetric_addr((unsigned long)dst_addr) && (unsigned long)dst_addr != rva)) { + memcpy((void *)(unsigned long)rva, src_addr, size); + // call progress as often as we would have with regular put + if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) + mxm_progress(mca_spml_ikrit.mxm_context); + return OSHMEM_SUCCESS; + } + // segment not mapped - fallback to rmda + ptl_id = MXM_PTL_RDMA; + r_mkey = mca_memheap.memheap_get_cached_mkey(dst, (unsigned long)dst_addr, ptl_id, &rva); + if(!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + } + +#ifdef SPML_IKRIT_DEBUG_PUT + SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, dst_rkey=0x%lx", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key); +#endif + + l_mkey = mca_memheap.memheap_get_local_mkey((unsigned long)src_addr, ptl_id); + if (zcopy == 0 || !l_mkey) { + /* local memory is not registered - pass proper flag to mxm */ +#if MXM_API < MXM_VERSION(1,5) + lkey = MXM_MKEY_NONE; +#else + lkey = 0; +#endif + } + else { + lkey = l_mkey->ib.lkey; + } + + put_req = alloc_put_req(); + if (NULL == put_req) { + SPML_ERROR("out of put requests - aborting"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + if (handle) + *handle = put_req; + + /* fill out request */ + put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; + /* request immediate responce if we are getting low on send buffers. We only get responce from remote on ack timeout. + * Also request explicit ack once in a while */ + if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || + (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { + put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC; + need_progress = 1; + } else { + put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_LAZY|MXM_REQ_FLAG_SEND_SYNC; + } + + put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; + put_req->mxm_req.base.data.buffer.ptr = src_addr; + put_req->mxm_req.base.data.buffer.length = size; + put_req->mxm_req.base.completed_cb = put_completion_cb; + put_req->mxm_req.base.context = put_req; + put_req->mxm_req.opcode = MXM_REQ_OP_PUT; + put_req->mxm_req.op.mem.remote_vaddr = (intptr_t)rva; + put_req->mxm_req.base.state = MXM_REQ_NEW; + put_req->pe = dst; + +#if MXM_API < MXM_VERSION(1,5) + put_req->mxm_req.base.data.buffer.mkey = lkey; + put_req->mxm_req.op.mem.remote_mkey = r_mkey->ib.rkey; +#else + put_req->mxm_req.base.data.buffer.memh = NULL; + put_req->mxm_req.op.mem.remote_memh = NULL; +#endif + + if 
(mca_spml_ikrit.mxm_peers[dst]->pe_relay >= 0 && + mca_memheap_base_detect_addr_type((unsigned long)dst_addr) == ADDR_USER ) { + put_req->mxm_req.op.am.hid = 0; + put_req->mxm_req.op.am.imm_data = dst; + put_req->pe = mca_spml_ikrit.mxm_peers[dst]->pe_relay; + put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[put_req->pe]->mxm_conn; + put_req->mxm_req.opcode = MXM_REQ_OP_AM; + + /* set up iov */ + put_req->mxm_req.base.data_type = MXM_REQ_DATA_IOV; + put_req->mxm_req.base.data.iov.count = 2; + put_req->mxm_req.base.data.iov.vector = put_req->iov; + + put_req->iov[0].ptr = &put_req->am_pkt.va; + put_req->iov[0].length = sizeof(uint64_t); + put_req->am_pkt.va = (uint64_t)rva; + + put_req->iov[1].ptr = src_addr; + put_req->iov[1].length = size; + +#if MXM_API < MXM_VERSION(1,5) + put_req->iov[0].mkey = MXM_MKEY_NONE; + put_req->iov[1].mkey = lkey; +#else + put_req->iov[0].memh = NULL; + put_req->iov[1].memh = NULL; +#endif + } + + OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1); + if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { + //SPML_VERBOSE(20, "added pe %d to active list", dst); + opal_list_append(&mca_spml_ikrit.active_peers, &mca_spml_ikrit.mxm_peers[dst]->super); + mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; + } + + mca_spml_ikrit.mxm_peers[dst]->n_active_puts++; + + mxm_req_send(&put_req->mxm_req); + + if (MXM_OK != put_req->mxm_req.base.error) { + OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1); + SPML_ERROR("put request %p failed: %s - aborting", put_req, mxm_error_string(put_req->mxm_req.base.error)); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + //put_completion_cb(put_req); + if (need_progress) + mxm_progress(mca_spml_ikrit.mxm_context); + + return OSHMEM_SUCCESS; +} + +/* simple buffered put implementation. NOT IN USE + * Problems: + * - slighly worse performance than impl based on non buffered put + * - fence complexity is O(n_active_connections) instead of O(n_connections_with_outstanding_puts). 
+ * Later is bounded by the network RTT & mxm ack timer. + */ +int mca_spml_ikrit_put_simple(void* dst_addr, size_t size, void* src_addr, int dst) +{ + uint64_t rva; + mxm_send_req_t mxm_req; + mxm_wait_t wait; + int ptl_id; + mca_spml_mkey_t *r_mkey; + static int count; + + ptl_id = get_ptl_id(dst); + /* Get rkey of remote PE (dst proc) which must be on memheap */ + r_mkey = mca_memheap.memheap_get_cached_mkey(dst, (unsigned long)dst_addr, ptl_id, &rva); + if(!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + +#ifdef SPML_IKRIT_DEBUG_PUT + + SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, dst_rkey=0x%lx", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key); +#endif + if (ptl_id == MXM_PTL_SHM) { + + if (OPAL_LIKELY(mca_memheap.memheap_is_symmetric_addr((unsigned long)dst_addr) && (unsigned long)dst_addr != rva)) { + memcpy((void *)(unsigned long)rva, src_addr, size); + // call progress as often as we would have with regular put + if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) + mxm_progress(mca_spml_ikrit.mxm_context); + return OSHMEM_SUCCESS; + } + // segment not mapped - fallback to rmda + ptl_id = MXM_PTL_RDMA; + r_mkey = mca_memheap.memheap_get_cached_mkey(dst, (unsigned long)dst_addr, ptl_id, &rva); + if(!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + } + +#ifdef SPML_IKRIT_DEBUG_PUT + SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, dst_rkey=0x%lx", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key); +#endif + + + /* fill out request */ + mxm_req.base.mq = mca_spml_ikrit.mxm_mq; + mxm_req.base.flags = MXM_REQ_FLAG_BLOCKING; + mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; + mxm_req.base.data.buffer.ptr = src_addr; + mxm_req.base.data.buffer.length = size; + mxm_req.base.completed_cb = 0; + mxm_req.base.context = 0; + mxm_req.opcode = MXM_REQ_OP_PUT; + mxm_req.op.mem.remote_vaddr = (intptr_t)rva; + mxm_req.base.state = MXM_REQ_NEW; + mxm_req.base.error = MXM_OK; + +#if MXM_API < MXM_VERSION(1,5) + mxm_req.base.data.buffer.mkey = MXM_MKEY_NONE; + mxm_req.op.mem.remote_mkey = MXM_MKEY_NONE; +#else + mxm_req.base.data.buffer.memh = NULL; + mxm_req.op.mem.remote_memh = NULL; +#endif + + + if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { + //SPML_VERBOSE(20, "added pe %d to active list", dst); + opal_list_append(&mca_spml_ikrit.active_peers, &mca_spml_ikrit.mxm_peers[dst]->super); + mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; + } + + mxm_req_send(&mxm_req); + if (MXM_OK != mxm_req.base.error) { + SPML_ERROR("put request failed: %s(%d) - aborting", mxm_error_string(mxm_req.base.error), mxm_req.base.error); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + + + wait.req = &mxm_req.base; + wait.state = (mxm_req_state_t)(MXM_REQ_SENT|MXM_REQ_COMPLETED); + wait.progress_cb = NULL; + wait.progress_arg = NULL; + mxm_wait(&wait); + + return OSHMEM_SUCCESS; +} + +int mca_spml_ikrit_put_nb(void* dst_addr, size_t size, void* src_addr, int dst, void **handle) +{ + int err; + err = mca_spml_ikrit_put_internal(dst_addr, size, src_addr, dst, handle, 1); + if (OSHMEM_SUCCESS != err) { + SPML_ERROR("put failed - aborting"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + return OSHMEM_SUCCESS; +} + +int mca_spml_ikrit_put(void* dst_addr, size_t size, void* src_addr, int dst) +{ + int err; + 
mca_spml_ikrit_put_request_t *put_req; + mxm_wait_t wait; + + put_req = 0; + err = mca_spml_ikrit_put_internal(dst_addr, size, src_addr, dst, (void **)&put_req, 0); + if (OSHMEM_SUCCESS != err) { + SPML_ERROR("put failed - aborting"); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + if (!put_req) + return OSHMEM_SUCCESS; + + wait.req = &put_req->mxm_req.base; + wait.state = (mxm_req_state_t)(MXM_REQ_SENT|MXM_REQ_COMPLETED); + wait.progress_cb = NULL; + wait.progress_arg = NULL; + mxm_wait(&wait); + + return OSHMEM_SUCCESS; +} + +static void mxm_relay_handler(mxm_conn_h conn, mxm_imm_t imm, void *data, size_t len, size_t offset, int is_lf) +{ + uint64_t va, rva; + char *pkt_data; + mca_spml_mkey_t *r_mkey; + int ptl_id; + mxm_peer_t *peer; + + //printf("relay req recvd conn=%p data=%p dst=%d len=%d offset=%d is_lf=%d - aborting\n", + // conn, data, (int)imm, (int)len, (int)offset, is_lf); + + if (offset == 0) { + va = *(uint64_t *)data; + pkt_data = (char *)data + sizeof(va); + len -= sizeof(va); + if (!is_lf) { + // we expect more fragments: save destination virtual address + peer = mxm_conn_ctx_get(conn); + peer->dst_va = va; + } + } + else { + // next fragment: use saved va and offset to compute va + pkt_data = data; + peer = mxm_conn_ctx_get(conn); + va = peer->dst_va + offset - sizeof(va); + } + + ptl_id = get_ptl_id(imm); + if (ptl_id != MXM_PTL_SHM) { + SPML_ERROR("relay req to non local PE recvd dst=%d va=%llx len=%d - aborting", (int)imm, (unsigned long long)va, (int)len); + oshmem_shmem_abort(-1); + return; + } + + /* Get rkey of remote PE (dst proc) which must be on memheap */ + r_mkey = mca_memheap.memheap_get_cached_mkey(imm, va, ptl_id, &rva); + if(!r_mkey) { + SPML_ERROR("relay to PE=%d: %p is not address of shared variable", imm, (void *)va); + oshmem_shmem_abort(-1); + return; + } + + memcpy((void *)(unsigned long)rva, pkt_data, len); +} + +static void mxm_setup_relays(oshmem_proc_t **procs, size_t nprocs) +{ + size_t i; + //long i; + 
opal_hash_table_t h; + int pe_relay; + int ret; + int r_i, r; + + if (mca_spml_ikrit.n_relays <= 0) + return; + + OBJ_CONSTRUCT(&h, opal_hash_table_t); + opal_hash_table_init(&h, 128); + + /* lowest rank on node will be used to relay to everyone on that node */ + for (i = 0; i < nprocs; i++) { + //for (i = nprocs-1; i >= 0; i--) { + if (OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) + continue; + + ret = opal_hash_table_get_value_ptr(&h, procs[i]->proc_hostname, strlen(procs[i]->proc_hostname), (void **)&pe_relay); + if (ret != OPAL_SUCCESS) { + opal_hash_table_set_value_ptr(&h, procs[i]->proc_hostname, strlen(procs[i]->proc_hostname), (void *)i); + mca_spml_ikrit.mxm_peers[i]->n_relays = 1; + mca_spml_ikrit.mxm_peers[i]->pe_relays[0] = i; + continue; + } +#if 0 + if (mca_spml_ikrit.mxm_peers[pe_relay]->n_slaves >= 15) { + opal_hash_table_set_value_ptr(&h, procs[i]->proc_hostname, strlen(procs[i]->proc_hostname), (void *)i); + } + else { + mca_spml_ikrit.mxm_peers[pe_relay]->n_slaves++; + mca_spml_ikrit.mxm_peers[i]->pe_relay = pe_relay; +// printf("dst %d relay %d\n", (int)i, pe_relay); + } +#endif + /* first allocate relays */ + if (mca_spml_ikrit.mxm_peers[pe_relay]->n_relays < mca_spml_ikrit.n_relays) { + //printf("r_i=%d assigned relay %d\n", mca_spml_ikrit.mxm_peers[pe_relay]->n_relays, (int)i); + mca_spml_ikrit.mxm_peers[pe_relay]->pe_relays[mca_spml_ikrit.mxm_peers[pe_relay]->n_relays] = i; + mca_spml_ikrit.mxm_peers[pe_relay]->n_relays++; + continue; + } + + /* now assign slave to relay */ + r_i = mca_spml_ikrit.mxm_peers[pe_relay]->n_relays - 1; + while (r_i >= 0) { + r = mca_spml_ikrit.mxm_peers[pe_relay]->pe_relays[r_i]; + if (mca_spml_ikrit.mxm_peers[r]->n_slaves >= 1) { + r_i--; + continue; + } + mca_spml_ikrit.mxm_peers[r]->n_slaves++; + mca_spml_ikrit.mxm_peers[i]->pe_relay = r; + //printf("dst %d relay %d r_idx %d master %d\n", (int)i, r, r_i, pe_relay); + break; + + } + //mca_spml_ikrit.mxm_peers[pe_relay]->n_relays = r_i; + } + + 
OBJ_DESTRUCT(&h); + mxm_set_am_handler(mca_spml_ikrit.mxm_context, 0, mxm_relay_handler, MXM_AM_FLAG_THREAD_SAFE); +} + +int mca_spml_ikrit_fence(void) +{ + mxm_peer_t *peer; + opal_list_item_t *item; + + + SPML_VERBOSE(20, "Into fence with %d active puts on %d pes", + mca_spml_ikrit.n_active_puts, + (int)opal_list_get_size(&mca_spml_ikrit.active_peers) + ); + + /* puts(unless are send sync) are completed by remote side lazily. That is either when remote decides to + * ack window which can take hundreds of ms. So speed things up by doing fence */ + while (NULL != (item = opal_list_remove_first(&mca_spml_ikrit.active_peers))) { + peer = (mxm_peer_t *)item; + peer->n_active_puts = 0; + peer->need_fence = 0; + mca_spml_ikrit_mxm_fence(peer->pe); + } + + while (0 < mca_spml_ikrit.n_mxm_fences) { + oshmem_request_wait_any_completion(); + } + + SPML_VERBOSE(20, "fence completed"); + return OSHMEM_SUCCESS; +} + + + +/* blocking receive */ +int mca_spml_ikrit_recv(void* buf, size_t size, int src) +{ + mxm_recv_req_t req; + char dummy_buf[1]; + + /* tag mask 0 matches any tag */ + SPML_VERBOSE(100, "want to recv from src %d, size %d buf %p", src, (int)size, buf); + req.tag = src == SHMEM_ANY_SOURCE ? 0 : src; + req.tag_mask = src == SHMEM_ANY_SOURCE ? 0 : 0xFFFFFFFF; + + req.base.state = MXM_REQ_NEW; + req.base.mq = mca_spml_ikrit.mxm_mq; + req.base.conn = NULL; + req.base.flags = MXM_REQ_FLAG_BLOCKING; + req.base.completed_cb = NULL; + + req.base.data_type = MXM_REQ_DATA_BUFFER; + req.base.data.buffer.ptr = buf == NULL ? dummy_buf : buf; + req.base.data.buffer.length = size == 0 ? 
sizeof(dummy_buf) : size; +#if MXM_API < MXM_VERSION(1,5) + req.base.data.buffer.mkey = MXM_MKEY_NONE; +#else + req.base.data.buffer.memh = NULL; +#endif + + mxm_req_recv(&req); + mca_spml_irkit_req_wait(&req.base); + if (req.base.error != MXM_OK) { + return OSHMEM_ERROR; + } + SPML_VERBOSE(100, "recvd from tag %d len %d", req.completion.sender_tag, + (int)req.completion.actual_len); + + return OSHMEM_SUCCESS; +} + + +/* for now only do blocking copy send */ +int mca_spml_ikrit_send(void* buf, size_t size, int dst, mca_spml_base_put_mode_t mode) +{ + mxm_send_req_t req; + char dummy_buf[1]; + + SPML_VERBOSE(100, "sending %p size %d to %d, mode %d", buf, (int)size, dst, (int)mode); + req.opcode = MXM_REQ_OP_SEND; + // FIXME: doing tagging like this can cause a conflict with MPI mtl + req.op.send.tag = oshmem_my_proc_id(); + + req.base.state = MXM_REQ_NEW; + req.base.mq = mca_spml_ikrit.mxm_mq; + req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + req.base.flags = MXM_REQ_FLAG_BLOCKING; + req.base.completed_cb = NULL; + + req.base.data_type = MXM_REQ_DATA_BUFFER; + req.base.data.buffer.ptr = buf == NULL ? dummy_buf : buf; + req.base.data.buffer.length = size == 0 ? sizeof(dummy_buf) : size; +#if MXM_API < MXM_VERSION(1,5) + req.base.data.buffer.mkey = MXM_MKEY_NONE; +#else + req.base.data.buffer.memh = NULL; +#endif + + mxm_req_send(&req); + mca_spml_irkit_req_wait(&req.base); + if (req.base.error != MXM_OK) { + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +} diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h new file mode 100644 index 0000000000..79fb441bc9 --- /dev/null +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_SPML_UD_MXM_H +#define MCA_SPML_UD_MXM_H + +#include "oshmem_config.h" +#include "oshmem/request/request.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/spml_base_putreq.h" +#include "oshmem/proc/proc.h" +#include "oshmem/mca/spml/base/spml_base_request.h" +#include "oshmem/mca/spml/base/spml_base_getreq.h" + +#include "ompi/mca/bml/base/base.h" +#include "ompi/class/ompi_free_list.h" +#include "opal/class/opal_list.h" + +#include "orte/runtime/orte_globals.h" + +#include +#include +#include + +#ifndef MXM_VERSION +#define MXM_VERSION(major, minor) (((major)<= 0, data will be send to pe_relay which will forward it to destination pe */ + int pe_relay; + uint64_t dst_va; /* virtual address on the final destination */ + int n_slaves; + int pe_relays[16]; + int n_relays; +}; + +typedef struct mxm_peer mxm_peer_t; +OBJ_CLASS_DECLARATION(mxm_peer_t); + +struct mca_spml_ikrit_t { + mca_spml_base_module_t super; + + mxm_h mxm_context; + mxm_ep_h mxm_ep; + mxm_mq_h mxm_mq; + mxm_peer_t **mxm_peers; + + uint32_t n_active_puts; + uint32_t n_active_gets; + uint32_t n_mxm_fences; + + int priority; /* component priority */ + int free_list_num; /* initial size of free list */ + int free_list_max; /* maximum size of free list */ + int free_list_inc; /* number of elements to grow free list */ + + bool enabled; + opal_list_t active_peers; + int n_relays; /* number of procs/node serving as relays */ +}; + +typedef struct mca_spml_ikrit_t mca_spml_ikrit_t; + +typedef struct spml_ikrit_mxm_ep_conn_info_t { + struct sockaddr_storage ptl_addr[MXM_PTL_LAST]; +} spml_ikrit_mxm_ep_conn_info_t; + + +extern mca_spml_ikrit_t mca_spml_ikrit; + +extern int mca_spml_ikrit_enable( bool enable ); +extern int mca_spml_ikrit_get(void* dst_addr, size_t size, void* src_addr, int src); +/* extension. 
used 4 fence implementation b4 fence was added to mxm */ +extern int mca_spml_ikrit_get_async(void *src_addr, size_t size, void *dst_addr, int src); + +extern int mca_spml_ikrit_put(void* dst_addr, size_t size, void* src_addr, int dst); +extern int mca_spml_ikrit_put_nb(void* dst_addr, size_t size, void* src_addr, int dst, void **handle); + +extern int mca_spml_ikrit_recv(void* buf, size_t size, int src); +extern int mca_spml_ikrit_send(void* buf, size_t size, int dst, mca_spml_base_put_mode_t mode); + +extern mca_spml_mkey_t *mca_spml_ikrit_register(void* addr, size_t size, uint64_t shmid, int *count); +extern int mca_spml_ikrit_deregister(mca_spml_mkey_t *mkeys); +extern int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys); + +extern int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs); +extern int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs); +extern int mca_spml_ikrit_fence(void); +extern int spml_ikrit_progress(void); + +static inline oshmem_proc_t *mca_spml_ikrit_proc_find(int dst) +{ + orte_process_name_t name; + + name.jobid = ORTE_PROC_MY_NAME->jobid; + name.vpid = dst; + return oshmem_proc_find(&name); +} + + +END_C_DECLS + +#endif + diff --git a/oshmem/mca/spml/ikrit/spml_ikrit_component.c b/oshmem/mca/spml/ikrit/spml_ikrit_component.c new file mode 100644 index 0000000000..8a5c9fced3 --- /dev/null +++ b/oshmem/mca/spml/ikrit/spml_ikrit_component.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#define _GNU_SOURCE +#include + +#include +#include + + +#include "oshmem_config.h" +#include "orte/util/show_help.h" +#include "shmem.h" +#include "oshmem/runtime/params.h" +#include "oshmem/mca/spml/spml.h" +#include "oshmem/mca/spml/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "spml_ikrit_component.h" +#include "oshmem/mca/spml/ikrit/spml_ikrit.h" + +#include "orte/util/show_help.h" +#include "ompi/runtime/ompi_module_exchange.h" + + +static int mca_spml_ikrit_component_open(void); +static int mca_spml_ikrit_component_close(void); +static mca_spml_base_module_t* +mca_spml_ikrit_component_init( int* priority, bool enable_progress_threads, + bool enable_mpi_threads ); +static int mca_spml_ikrit_component_fini(void); +mca_spml_base_component_2_0_0_t mca_spml_ikrit_component = { + + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + + { + MCA_SPML_BASE_VERSION_2_0_0, + + "ikrit", /* MCA component name */ + OSHMEM_MAJOR_VERSION, /* MCA component major version */ + OSHMEM_MINOR_VERSION, /* MCA component minor version */ + OSHMEM_RELEASE_VERSION, /* MCA component release version */ + mca_spml_ikrit_component_open, /* component open */ + mca_spml_ikrit_component_close /* component close */ + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + mca_spml_ikrit_component_init, /* component init */ + mca_spml_ikrit_component_fini /* component finalize */ + +}; + + +static inline int mca_spml_ikrit_param_register_int( + const char* param_name, + int default_value, + const char *help_msg) +{ + int param_value; + + param_value = default_value; + mca_base_param_reg_int( + &mca_spml_ikrit_component.spmlm_version, + param_name, + help_msg, + false, false, + default_value, ¶m_value); + + return param_value; +} + + +int spml_ikrit_progress(void) +{ + mxm_error_t err; + + err = 
mxm_progress(mca_spml_ikrit.mxm_context); + if ((MXM_OK != err) && (MXM_ERR_NO_PROGRESS != err) ) { + orte_show_help("help-spml-ikrit.txt", "errors during mxm_progress", true, mxm_error_string(err)); + } + return 1; +} + +/* Component open: check the MXM library version, register the MCA parameters, disqualify the component when the job is smaller than the "np" parameter, then create the MXM context and message queue. */ +static int mca_spml_ikrit_component_open(void) +{ + mxm_error_t err; + int np; + unsigned long cur_ver; + + cur_ver = mxm_get_version(); + if (cur_ver != MXM_API) { + char *str; + if (asprintf(&str, "SHMEM was compiled with MXM version %d.%d but " + "version %lu.%lu detected.", MXM_VERNO_MAJOR, + MXM_VERNO_MINOR, (cur_ver >> MXM_MAJOR_BIT)& 0xff, + (cur_ver >> MXM_MINOR_BIT) & 0xff)>0) { + orte_show_help("help-spml-ikrit.txt", "mxm init", true, str); + + free(str); + } + return OSHMEM_ERROR; + } + + + mca_spml_ikrit.free_list_num = + mca_spml_ikrit_param_register_int("free_list_num", 1024, 0); + mca_spml_ikrit.free_list_max = + mca_spml_ikrit_param_register_int("free_list_max", 1024, 0); + mca_spml_ikrit.free_list_inc = + mca_spml_ikrit_param_register_int("free_list_inc", 16, 0); + mca_spml_ikrit.priority = + mca_spml_ikrit_param_register_int("priority", 20, "[integer] ikrit priority"); + + mca_spml_ikrit.n_relays = + mca_spml_ikrit_param_register_int("use_relays", -1, "[integer] First N ranks on host will receive and forward put messages to other ranks running on it. 
Can be used to as work around Sandy Bridge far socket problem"); + + np = mca_spml_ikrit_param_register_int("np", 128, "[integer] Minimal allowed job's NP to activate ikrit"); + if (oshmem_num_procs() < np) { + SPML_VERBOSE(1, "Not enough ranks (%d<%d), disqualifying spml/ikrit", oshmem_num_procs(), np); + return OSHMEM_ERR_NOT_AVAILABLE; + } + /* The context options are read only after the NP check, so the early return above has nothing to release. */ +#if MXM_API < MXM_VERSION(1,5) + mxm_context_opts_t mxm_opts; + + mxm_fill_context_opts(&mxm_opts); + /* only enable rdma and self ptls */ + mxm_opts.ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA)); + + +#else + mxm_context_opts_t *mxm_opts; + + err = mxm_config_read_context_opts(&mxm_opts); + if (MXM_OK != err) { /* check the result before mxm_opts is dereferenced */ + SPML_ERROR("Failed to parse MXM configuration"); + return OSHMEM_ERROR; + } + mxm_opts->ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA)); +#endif + + +#if MXM_API < MXM_VERSION(1,5) + err = mxm_init(&mxm_opts, &mca_spml_ikrit.mxm_context); +#else + err = mxm_init(mxm_opts, &mca_spml_ikrit.mxm_context); + mxm_config_free(mxm_opts); +#endif + + if (MXM_OK != err) { + if (MXM_ERR_NO_DEVICE == err) { + SPML_VERBOSE(1, "No supported device found, disqualifying spml/ikrit"); + } else { + orte_show_help("help-spml-ikrit.txt", "mxm init", true, + mxm_error_string(err)); + } + return OSHMEM_ERR_NOT_AVAILABLE; + } + + err = mxm_mq_create(mca_spml_ikrit.mxm_context, MXM_SHMEM_MQ_ID, &mca_spml_ikrit.mxm_mq); + if (MXM_OK != err) { + orte_show_help("help-spml-ikrit.txt", "mxm mq create", true, mxm_error_string(err)); + return OSHMEM_ERROR; + } + + + return OSHMEM_SUCCESS; +} + + +static int mca_spml_ikrit_component_close(void) +{ + if (mca_spml_ikrit.mxm_context) + mxm_cleanup(mca_spml_ikrit.mxm_context); + mca_spml_ikrit.mxm_context = NULL; + return OSHMEM_SUCCESS; +} + +static int spml_ikrit_mxm_init(void) +{ + mxm_error_t err; + mxm_ep_opts_t *p_ep_opts; + +#if MXM_API < MXM_VERSION(1,5) + mxm_ep_opts_t ep_opt; + struct sockaddr_mxm_local_proc sa_bind_self; + struct sockaddr_mxm_ib_local sa_bind_rdma; + + p_ep_opts = &ep_opt; + /* Setup the endpoint options and local addresses to bind to. 
*/ + mxm_fill_ep_opts(&ep_opt); + + sa_bind_self.sa_family = AF_MXM_LOCAL_PROC; + sa_bind_self.context_id = 0; + sa_bind_self.process_id = oshmem_proc_local()->proc_name.vpid; + + sa_bind_rdma.sa_family = AF_MXM_IB_LOCAL; + sa_bind_rdma.lid = 0; + sa_bind_rdma.pkey = 0; + sa_bind_rdma.qp_num = 0; + sa_bind_rdma.sl = 0; + + ep_opt.ptl_bind_addr[MXM_PTL_SELF] = (struct sockaddr*)&sa_bind_self; + ep_opt.ptl_bind_addr[MXM_PTL_RDMA] = (struct sockaddr*)&sa_bind_rdma; + +#else + err = mxm_config_read_ep_opts(&p_ep_opts); + if (err != MXM_OK) { + SPML_ERROR("Failed to parse MXM configuration"); + return OSHMEM_ERROR; + } + + /* Only relevant for SHM PTL - ignore */ + p_ep_opts->job_id = 0; + p_ep_opts->local_rank = 0; + p_ep_opts->num_local_procs = 0; + +#endif + p_ep_opts->rdma.drain_cq = 1; + + /* Open MXM endpoint */ + err = mxm_ep_create(mca_spml_ikrit.mxm_context, p_ep_opts, &mca_spml_ikrit.mxm_ep); + /* The options are not used past this point: release them before the error check so the failure path does not leak them. */ +#if MXM_API >= MXM_VERSION(1,5) + mxm_config_free(p_ep_opts); +#endif + if (MXM_OK != err) { + orte_show_help("help-spml-ikrit.txt", "unable to create endpoint", true, + mxm_error_string(err)); + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +} + +static mca_spml_base_module_t* +mca_spml_ikrit_component_init( int* priority, + bool enable_progress_threads, + bool enable_mpi_threads ) +{ + SPML_VERBOSE( 10, + "in ikrit, my priority is %d\n", mca_spml_ikrit.priority); + + if((*priority) > mca_spml_ikrit.priority) { + *priority = mca_spml_ikrit.priority; + return NULL; + } + *priority = mca_spml_ikrit.priority; + + if (OSHMEM_SUCCESS != spml_ikrit_mxm_init()) + return NULL; + + mca_spml_ikrit.n_active_puts = 0; + mca_spml_ikrit.n_active_gets = 0; + mca_spml_ikrit.n_mxm_fences = 0; + SPML_VERBOSE(50, "*** ikrit initialized ****"); + return &mca_spml_ikrit.super; +} + +static int mca_spml_ikrit_component_fini(void) +{ + opal_progress_unregister(spml_ikrit_progress); + if (NULL != mca_spml_ikrit.mxm_ep) + { + mxm_ep_destroy(mca_spml_ikrit.mxm_ep); + } + return 
OSHMEM_SUCCESS; +} + diff --git a/oshmem/mca/spml/ikrit/spml_ikrit_component.h b/oshmem/mca/spml/ikrit/spml_ikrit_component.h new file mode 100644 index 0000000000..82803df047 --- /dev/null +++ b/oshmem/mca/spml/ikrit/spml_ikrit_component.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_SPML_IKRIT_COMPONENT_H +#define MCA_SPML_IKRIT_COMPONENT_H + +BEGIN_C_DECLS + +/* + * The ikrit SPML component instance. + */ +OSHMEM_MODULE_DECLSPEC extern mca_spml_base_component_2_0_0_t mca_spml_ikrit_component; +END_C_DECLS + +#endif /* MCA_SPML_IKRIT_COMPONENT_H */ diff --git a/oshmem/mca/spml/spml.h b/oshmem/mca/spml/spml.h new file mode 100644 index 0000000000..734e33b0f3 --- /dev/null +++ b/oshmem/mca/spml/spml.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_SPML_H +#define MCA_SPML_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "opal/mca/mca.h" +#include "oshmem/proc/proc.h" +#include "ompi/mca/btl/btl.h" + + +BEGIN_C_DECLS + +/* + * SPML component types + */ + +/** + * MCA->SPML Called by MCA framework to initialize the component. + * + * @param priority (OUT) Relative priority or ranking used by MCA to + * select a component. + * + * @param enable_progress_threads (IN) Whether this component is + * allowed to run a hidden/progress thread or not. + * + * @param enable_mpi_threads (IN) Whether support for multiple MPI + * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which + * indicates whether multiple threads may invoke this component + * simultaneously or not. 
+ */ +typedef enum { + MCA_SPML_BASE_PUT_SYNCHRONOUS, + MCA_SPML_BASE_PUT_COMPLETE, + MCA_SPML_BASE_PUT_BUFFERED, + MCA_SPML_BASE_PUT_READY, + MCA_SPML_BASE_PUT_STANDARD, + MCA_SPML_BASE_PUT_SIZE +} mca_spml_base_put_mode_t; + +typedef struct mca_spml_base_module_1_0_0_t * (*mca_spml_base_component_init_fn_t)( + int *priority, + bool enable_progress_threads, + bool enable_mpi_threads); + +typedef int (*mca_spml_base_component_finalize_fn_t)(void); + +/** + * SPML component version and interface functions. + */ +struct mca_spml_base_component_2_0_0_t { + mca_base_component_t spmlm_version; + mca_base_component_data_t spmlm_data; + mca_spml_base_component_init_fn_t spmlm_init; + mca_spml_base_component_finalize_fn_t spmlm_finalize; +}; +typedef struct mca_spml_base_component_2_0_0_t mca_spml_base_component_2_0_0_t; +typedef mca_spml_base_component_2_0_0_t mca_spml_base_component_t; + + +/** + * MCA management functions. + */ +/** + * memory key + */ +typedef struct mca_spml_mkey { + union { + struct { + uint32_t rkey; + uint32_t lkey; + } ib; + uint64_t key; + }; + uint64_t va_base; + void *spml_context; // spml module can attach internal structures here +} mca_spml_mkey_t; + +/** + * Downcall from MCA layer to enable the SPML/BTLs. + * + * @param enable Enable/Disable SPML forwarding + * @return OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_enable_fn_t)( + bool enable + ); + + + +/** + * Waits for an int variable to change on the local PE. + * Blocked until the variable is not equal to value. + * NOTE(review): the cmp and datatype arguments are not described here - confirm their semantics against the implementation. + * @param addr Address of the variable to poll on. + * @param value The value to poll on. Poll until the value held in addr is different than value. + * @return OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_wait_fn_t)(void* addr, int cmp, void* value, int datatype); + +/** + * Register (pin) a buffer of 'size' bytes starting at address addr + * + * @param addr base address of the registered buffer. 
+ * @param size the size of the buffer to be registered. + * @param shmid sysv segment id + * @param count number of internal transports (btls) that registered memory + * @return array of mkeys (one mkey per "btl") or NULL on failure + * + */ +typedef mca_spml_mkey_t * (*mca_spml_base_module_register_fn_t)(void *addr, size_t size, uint64_t shmid, int *count); + +/** + * deregister memory pinned by register() + */ +typedef int (*mca_spml_base_module_deregister_fn_t)(mca_spml_mkey_t *mkeys); + +/** + * try to fill up mkeys that can be used to reach remote pe. + * @param pe remote pe + * @param seg 0 - symmetric heap, 1 - static data, everything else are static data in .so + * @param mkeys mkeys array + * + * @return OSHMEM_SUCCESS if keys are found + */ +typedef int (*mca_spml_base_module_oob_get_mkeys_fn_t)(int pe, uint32_t seg, mca_spml_mkey_t *mkeys); + +/** + * For each proc setup a data structure that indicates the BTLs + * that can be used to reach the destination. + * + * @param procs A list of all procs participating in the parallel application. + * @param nprocs The number of procs in the parallel application. + * @return OSHMEM_SUCCESS or failure status. + * + */ +typedef int (*mca_spml_base_module_add_procs_fn_t)(oshmem_proc_t** procs, size_t nprocs); +typedef int (*mca_spml_base_module_del_procs_fn_t)(oshmem_proc_t** procs, size_t nprocs); + +/** + * Transfer data to a remote pe. + * + * @param dst_addr The address in the remote PE of the object being written. + * @param size The number of bytes to be written. + * @param src_addr An address on the local PE holding the value to be written. + * @param dst The remote PE to be written to. + * @return OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_put_fn_t)(void *dst_addr, size_t size, void *src_addr, int dst); + + +/** + * These routines provide the means for copying contiguous data to another PE without + * blocking the caller. 
These routines return before the data has been delivered to the + * remote PE. + * + * @param dst_addr The address in the remote PE of the object being written. + * @param size The number of bytes to be written. + * @param src_addr An address on the local PE holding the value to be written. + * @param dst The remote PE to be written to. + * @param handle The address of a handle to be passed to shmem_wait_nb() or + * shmem_test_nb() to wait or poll for the completion of the transfer. + * @return OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_put_nb_fn_t)(void *dst_addr, size_t size, void *src_addr, int dst, void **handle); + + +/** + * Blocking data transfer from remote PE. + * Read data from remote PE. + * + * @param dst_addr - The address on the local PE, to write the result of the get operation to. + * @param size - The number of bytes to be read. + * @param src_addr - The address on the remote PE, to read from. + * @param src - The ID of the remote PE. + * @return - OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_get_fn_t)(void *dst_addr, size_t size, void *src_addr, int src); + + +/** + * Post a receive and wait for completion. + * + * @param buf (IN) User buffer. + * @param count (IN) The number of bytes to be received. + * @param src (IN) The ID of the remote PE. + * @return OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_recv_fn_t)( + void *buf, + size_t count, + int src +); + + +/** + * Post a send request and wait for completion. + * + * @param buf (IN) User buffer. + * @param count (IN) The number of bytes to be sent. + * @param dst (IN) The ID of the remote PE. + * @param mode (IN) Send mode (STANDARD,BUFFERED,SYNCHRONOUS,READY) + * @return OSHMEM_SUCCESS or failure status. 
+ */ +typedef int (*mca_spml_base_module_send_fn_t)( + void *buf, + size_t count, + int dst, + mca_spml_base_put_mode_t mode +); + + +/** + * Wait for completion of all outstanding put() requests. + * + * @return - OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_fence_fn_t)(void); + + +/** + * Waits for completion of a non-blocking put or get issued by the calling PE. + * + * @return - OSHMEM_SUCCESS or failure status. + */ +typedef int (*mca_spml_base_module_wait_nb_fn_t)(void*); + + +/** + * SPML module instance: one function pointer per SPML operation. + */ +struct mca_spml_base_module_1_0_0_t { + + mca_spml_base_module_add_procs_fn_t spml_add_procs; + mca_spml_base_module_del_procs_fn_t spml_del_procs; + + mca_spml_base_module_enable_fn_t spml_enable; + mca_spml_base_module_register_fn_t spml_register; + mca_spml_base_module_deregister_fn_t spml_deregister; + mca_spml_base_module_oob_get_mkeys_fn_t spml_oob_get_mkeys; + + mca_spml_base_module_put_fn_t spml_put; + mca_spml_base_module_put_nb_fn_t spml_put_nb; + mca_spml_base_module_get_fn_t spml_get; + + mca_spml_base_module_recv_fn_t spml_recv; + mca_spml_base_module_send_fn_t spml_send; + + mca_spml_base_module_wait_fn_t spml_wait; + mca_spml_base_module_wait_nb_fn_t spml_wait_nb; + mca_spml_base_module_fence_fn_t spml_fence; +}; + +typedef struct mca_spml_base_module_1_0_0_t mca_spml_base_module_1_0_0_t; +typedef mca_spml_base_module_1_0_0_t mca_spml_base_module_t; + +/* + * Macro for use in components that are of type spml + */ +#define MCA_SPML_BASE_VERSION_2_0_0 \ + MCA_BASE_VERSION_2_0_0, \ +"spml", 2, 0, 0 + +/* + * MCA_SPML_CALL(a): a direct call into the component named by MCA_oshmem_spml_DIRECT_CALL_COMPONENT when MCA_oshmem_spml_DIRECT_CALL is set, otherwise a call through the mca_spml module struct + */ +/*TODO: Irit - change to MCA_oshmem_spml_DIRECT_CALL_HEADER*/ +#if MCA_oshmem_spml_DIRECT_CALL + +#include MCA_oshmem_spml_DIRECT_CALL_HEADER + +#define MCA_SPML_CALL_STAMP(a, b) mca_spml_ ## a ## _ ## b +#define MCA_SPML_CALL_EXPANDER(a, b) MCA_SPML_CALL_STAMP(a,b) +#define MCA_SPML_CALL(a) MCA_SPML_CALL_EXPANDER(MCA_oshmem_spml_DIRECT_CALL_COMPONENT, a) + 
+#else +#define MCA_SPML_CALL(a) mca_spml.spml_ ## a +#endif + +OSHMEM_DECLSPEC extern mca_spml_base_module_t mca_spml; + +END_C_DECLS +#endif /* MCA_SPML_H */ diff --git a/oshmem/mca/spml/yoda/.windows b/oshmem/mca/spml/yoda/.windows new file mode 100644 index 0000000000..4e9e484624 --- /dev/null +++ b/oshmem/mca/spml/yoda/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_dependencies=libshmem diff --git a/oshmem/mca/spml/yoda/Makefile.am b/oshmem/mca/spml/yoda/Makefile.am new file mode 100644 index 0000000000..c8d49cbb60 --- /dev/null +++ b/oshmem/mca/spml/yoda/Makefile.am @@ -0,0 +1,45 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = \ + help-shmem-spml-yoda.txt + +EXTRA_DIST = post_configure.sh + +AM_CFLAGS = $(OSHMEM_CFLAGS) $(btl_sm_CPPFLAGS) + +yoda_sources = \ + spml_yoda.c \ + spml_yoda.h \ + spml_yoda_component.c \ + spml_yoda_component.h \ + spml_yoda_rdmafrag.h \ + spml_yoda_putreq.c \ + spml_yoda_putreq.h \ + spml_yoda_getreq.c \ + spml_yoda_getreq.h + +if MCA_BUILD_ompi_pml_ob1_DSO +component_noinst = +component_install = mca_spml_yoda.la +else +component_noinst = libmca_spml_yoda.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_spml_yoda_la_SOURCES = $(yoda_sources) +mca_spml_yoda_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_spml_yoda_la_SOURCES = $(yoda_sources) +libmca_spml_yoda_la_LDFLAGS = -module -avoid-version diff --git a/oshmem/mca/spml/yoda/configure.params b/oshmem/mca/spml/yoda/configure.params new file mode 100644 index 0000000000..7bc1905bb2 --- /dev/null +++ b/oshmem/mca/spml/yoda/configure.params @@ -0,0 +1,14 @@ +# -*- shell-script -*- +# +# 
Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/oshmem/mca/spml/yoda/help-shmem-spml-yoda.txt b/oshmem/mca/spml/yoda/help-shmem-spml-yoda.txt new file mode 100644 index 0000000000..53d7d7168c --- /dev/null +++ b/oshmem/mca/spml/yoda/help-shmem-spml-yoda.txt @@ -0,0 +1,21 @@ +# -*- text -*- +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +[eager_limit_too_small] +The "eager limit" MCA parameter in the %s BTL was set to a value which +is too low for Open SHMEM to function properly. Please re-run your job +with a higher eager limit value for this BTL; the exact MCA parameter +name and its corresponding minimum value is shown below. + + Local host: %s + BTL name: %s + BTL eager limit value: %d (set via btl_%s_eager_limit) + BTL eager limit minimum: %d + MCA parameter name: btl_%s_eager_limit diff --git a/oshmem/mca/spml/yoda/post_configure.sh b/oshmem/mca/spml/yoda/post_configure.sh new file mode 100644 index 0000000000..da90f9cdca --- /dev/null +++ b/oshmem/mca/spml/yoda/post_configure.sh @@ -0,0 +1,4 @@ +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved +# $COPYRIGHT$ +DIRECT_CALL_HEADER="oshmem/mca/spml/yoda/spml_yoda.h" diff --git a/oshmem/mca/spml/yoda/spml_yoda.c b/oshmem/mca/spml/yoda/spml_yoda.c new file mode 100644 index 0000000000..68c219912f --- /dev/null +++ b/oshmem/mca/spml/yoda/spml_yoda.c @@ -0,0 +1,1171 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include "orte/include/orte/types.h" +#include "orte/runtime/orte_globals.h" + +#include "opal/datatype/opal_convertor.h" + +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/mca/btl/openib/btl_openib_endpoint.h" +#include "ompi/mca/btl/sm/btl_sm_frag.h" + +#include "oshmem/proc/proc.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/mca/spml/spml.h" + +#include "spml_yoda.h" +#include "spml_yoda_putreq.h" +#include "spml_yoda_getreq.h" +#ifdef HAVE_UNISTD_H +#include +#endif +#define ILLEGAL_ORDER -1 +#include "oshmem/runtime/runtime.h" +mca_spml_yoda_module_t mca_spml_yoda = { + { + /* Init mca_spml_base_module_t */ + mca_spml_yoda_add_procs, + mca_spml_yoda_del_procs, + mca_spml_yoda_enable, + mca_spml_yoda_register, + mca_spml_yoda_deregister, + mca_spml_base_oob_get_mkeys, + mca_spml_yoda_put, + mca_spml_yoda_put_nb, + mca_spml_yoda_get, + mca_spml_yoda_recv, + mca_spml_yoda_send, + mca_spml_base_wait, + mca_spml_base_wait_nb, + mca_spml_yoda_fence + } +}; + +#ifdef OSHMEM_WAIT_COMPLETION_DEBUG +char **op_type_dbg;//[OSHMEM_WAIT_COMPLETION_DEBUG][20]; +char **btl_name_dbg;//[OSHMEM_WAIT_COMPLETION_DEBUG][20]; +int pe_dst_dbg[OSHMEM_WAIT_COMPLETION_DEBUG]; +int msg_length_dbg[OSHMEM_WAIT_COMPLETION_DEBUG]; +uint64_t src_dbg[OSHMEM_WAIT_COMPLETION_DEBUG]; +uint64_t dst_dbg[OSHMEM_WAIT_COMPLETION_DEBUG]; +static void save_dbg_history(char op_type[], char btl_name[], int pe_dst, int msg_length, void *src, void *dst) +{ + int i; + for (i=0; ibtl->btl_free(ybtl->btl, mkeys[i].spml_context); + } + } + free(mkeys); + + return OSHMEM_SUCCESS; +} + +mca_spml_mkey_t *mca_spml_yoda_register( + void* addr, + size_t size, + uint64_t shmid, + int *count + ) +{ + int i; + mca_btl_base_descriptor_t* des = 
NULL; + const opal_datatype_t *datatype = &opal_datatype_wchar; + opal_convertor_t convertor; + mca_spml_mkey_t *mkeys; + struct yoda_btl *ybtl; + oshmem_proc_t *proc_self; + + SPML_VERBOSE(10, "address %p len %llu", addr, (unsigned long long)size); + *count = 0; + /* make sure everything is initialized to 0 */ + mkeys = (mca_spml_mkey_t *)calloc(1, mca_spml_yoda.n_btls * sizeof(*mkeys)); + if(!mkeys){ + return NULL; + } + + proc_self = oshmem_proc_group_find(oshmem_group_all, oshmem_my_proc_id()); + /* create convertor */ + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + + /* Register proc memory in every rdma BTL. */ + for(i = 0; i < mca_spml_yoda.n_btls; i++) { + + ybtl = &mca_spml_yoda.btl_type_map[i]; + if (!ybtl->use_cnt) { + SPML_VERBOSE(10, "%s: present but not in use. SKIP registration", btl_type2str(ybtl->btl_type)); + continue; + } + + /* initialize convertor */ + opal_convertor_copy_and_prepare_for_recv(proc_self->proc_convertor, + datatype, + size, + addr, + 0, + &convertor); + + switch (ybtl->btl_type) { + case YODA_BTL_SM: + /* shadow sm btl */ + if ((int)MEMHEAP_SHM_GET_ID(shmid) != MEMHEAP_SHM_INVALID) { + mkeys[i].key = shmid; + mkeys[i].spml_context = 0; + mkeys[i].va_base = 0; /* memory must be shmat'ed localy upon reception of rkey */ + } + else { + des = ybtl->btl->btl_prepare_src(ybtl->btl, + 0, + NULL, &convertor, MCA_BTL_NO_ORDER, + 0, &size, 0); + if (NULL == des) { + SPML_ERROR("%s: failed to register memory. ", btl_type2str(ybtl->btl_type)); + goto err; + } + + mkeys[i].key = ((mca_btl_sm_segment_t *)des->des_src)->key; + mkeys[i].spml_context = des; + mkeys[i].va_base = (unsigned long)addr; + } + break; + + case YODA_BTL_OPENIB: + des = ybtl->btl->btl_prepare_dst(ybtl->btl, + 0, + NULL, &convertor, MCA_BTL_NO_ORDER, + 0, &size, 0); + if (NULL == des) { + SPML_ERROR("%s: failed to register memory. 
", btl_type2str(ybtl->btl_type)); + goto err; + } + + mkeys[i].ib.rkey = ((mca_btl_openib_segment_t *)des->des_dst)->key; + mkeys[i].ib.lkey = ((mca_btl_openib_segment_t *)des->des_dst)->lkey; + mkeys[i].spml_context = des; + mkeys[i].va_base = (unsigned long)addr; + break; + + case YODA_BTL_SELF: + SPML_VERBOSE(10, "self btl - doing nothing"); + mkeys[i].key = 0; + mkeys[i].spml_context = 0; + mkeys[i].va_base = (unsigned long)addr; + break; + + default: + SPML_ERROR("unsupported btl: %d\n", ybtl->btl_type); + goto err; + } + + SPML_VERBOSE(5,"rank %d btl %s rkey %x lkey %x key %llx address 0x%llX len %llu shmid 0x%X|0x%X", + oshmem_proc_local_proc->proc_name.vpid, + btl_type2str(ybtl->btl_type), + mkeys[i].ib.rkey, + mkeys[i].ib.lkey, + (unsigned long long)mkeys[i].key, + (unsigned long long)mkeys[i].va_base, + (unsigned long long)size, + MEMHEAP_SHM_GET_TYPE(shmid), MEMHEAP_SHM_GET_ID(shmid) + ); + } + OBJ_DESTRUCT(&convertor); + *count = mca_spml_yoda.n_btls; + return mkeys; + +err: + mca_spml_yoda_deregister(mkeys); + return NULL; +} + + + +/* + * For each proc setup a datastructure that indicates the BTLs + * that can be used to reach the destination. + */ +static void mca_spml_yoda_error_handler( + struct mca_btl_base_module_t* btl, int32_t flags, + ompi_proc_t* errproc, char* btlinfo ) { + oshmem_shmem_abort(-1); +} + +/* make global btl list&map */ +static int create_btl_list(void) +{ + int btl_id; + char *btl_name; + int size; + opal_list_item_t *item; + mca_btl_base_selected_module_t *btl_sm; + int i; + + size = opal_list_get_size(&mca_btl_base_modules_initialized); + if (0 >= size) { + SPML_ERROR("no btl(s) available"); + return OSHMEM_ERROR; + } + SPML_VERBOSE(50, "found %d capable btls", size); + + mca_spml_yoda.btl_type_map = (struct yoda_btl *)calloc(size, sizeof(struct yoda_btl)); + if (! 
mca_spml_yoda.btl_type_map) + return OSHMEM_ERROR; + + mca_spml_yoda.n_btls = 0; + for (i = 0, item = opal_list_get_first(&mca_btl_base_modules_initialized) ; + item != opal_list_get_end(&mca_btl_base_modules_initialized) ; + item = opal_list_get_next(item), i++) { + + btl_sm = (mca_btl_base_selected_module_t *)item; + btl_name = btl_sm->btl_component->btl_version.mca_component_name; + btl_id = btl_name_to_id(btl_name); + + SPML_VERBOSE(50, "found btl (%s) btl_id=%d", btl_name, btl_id); + if (YODA_BTL_UNKNOWN == btl_id) { + SPML_VERBOSE(5, "unknown btl: %s btl_id=%d", btl_name, btl_id); + continue; + } + mca_spml_yoda.btl_type_map[mca_spml_yoda.n_btls].btl = btl_sm->btl_module; + mca_spml_yoda.btl_type_map[mca_spml_yoda.n_btls].btl_type = btl_id; + mca_spml_yoda.n_btls++; + } + + if (0 == mca_spml_yoda.n_btls) { + SPML_ERROR("can not find any suitable btl"); + return OSHMEM_ERROR; + } + + return OSHMEM_SUCCESS; +} + +static int _find_btl_id(mca_bml_base_btl_t *bml_btl) +{ + int i; + + for (i = 0; i < mca_spml_yoda.n_btls; i++) { + if (mca_spml_yoda.btl_type_map[i].btl == bml_btl->btl) + return i; + } + return -1; +} + +/* for each proc create transport ids which are indexes into global + * btl list&map + */ +static int create_btl_idx(int dst_pe) +{ + oshmem_proc_t *proc; + int btl_id; + mca_bml_base_endpoint_t* endpoint; + mca_bml_base_btl_t* bml_btl = 0; + int i, size; + mca_bml_base_btl_array_t *btl_array; + int sm_index = -1; + + proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); + endpoint = (mca_bml_base_endpoint_t*)proc->proc_bml; + assert(endpoint); + size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_rdma); + + if (0 >= size) { + int is_sm_btl = 0; + //Possibly this is SM BTL with KNEM disabled? Then we should use send based get/put + /* + This hack is necessary for the case when KNEM is not available. 
+ In this case we still want to use send/recv of SM BTL for put and get + but SM BTL is not in the rdma list anymore + */ + size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_eager); + if (size > 0) { + int btl_id = -1; + for (sm_index = 0; sm_index < size; sm_index++) { + bml_btl = mca_bml_base_btl_array_get_index(btl_array, sm_index); + btl_id = _find_btl_id(bml_btl); + is_sm_btl = (btl_id != -1) && ((mca_spml_yoda.btl_type_map[btl_id].btl_type == YODA_BTL_SM) || (mca_spml_yoda.btl_type_map[btl_id].btl_type == YODA_BTL_OPENIB)); /* btl_id is an index into btl_type_map, so compare the mapped BTL type, not the index */ + if (is_sm_btl) { + size = 1; + break; + } + } + } + if (!is_sm_btl) { + SPML_ERROR("no RDMA capable transport for dest pe=%d", dst_pe); + return OSHMEM_ERROR; + } + } + + proc->transport_ids = (char *)malloc(size * sizeof(char)); + if (!proc->transport_ids) + return OSHMEM_ERROR; + + proc->num_transports = size; + + for (i = 0; i < size; i++) { + bml_btl = mca_bml_base_btl_array_get_index(btl_array, (sm_index>=0)?(sm_index):(i)); + btl_id = _find_btl_id(bml_btl); + SPML_VERBOSE(50, "dst_pe(%d) use btl (%s) btl_id=%d", + dst_pe, + bml_btl->btl->btl_component->btl_version.mca_component_name, btl_id); + if (btl_id < 0) { + SPML_ERROR("unknown btl: dst_pe(%d) use btl (%s) btl_id=%d", + dst_pe, + bml_btl->btl->btl_component->btl_version.mca_component_name, btl_id); + return OSHMEM_ERROR; + } + proc->transport_ids[i] = btl_id; + mca_spml_yoda.btl_type_map[btl_id].use_cnt++; + } + return OSHMEM_SUCCESS; +} + +static int destroy_btl_list(void) +{ + free(mca_spml_yoda.btl_type_map); /* free(NULL) is a no-op */ + mca_spml_yoda.btl_type_map = NULL; + + return OSHMEM_SUCCESS; +} + + +static int destroy_btl_idx(int dst_pe) +{ + oshmem_proc_t *proc; + + proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); + free(proc->transport_ids); /* free(NULL) is a no-op */ + proc->transport_ids = NULL; + + return OSHMEM_SUCCESS; +} + +int mca_spml_yoda_add_procs(oshmem_proc_t** procs, size_t nprocs) +{ + opal_bitmap_t reachable; + int rc; + size_t i; + + if(nprocs == 0) + return OSHMEM_SUCCESS; + + /* we don't have any endpoint data we need 
to cache on the + * oshmem_proc_t, so set proc_pml to NULL */ + for (i = 0 ; i < nprocs ; ++i) { + procs[i]->proc_pml = NULL; + } + + OBJ_CONSTRUCT(&reachable, opal_bitmap_t); + rc = opal_bitmap_init(&reachable, (int)nprocs); + if(OSHMEM_SUCCESS != rc) + goto cleanup_and_return; /* reachable is already constructed and must be destructed */ + + rc = mca_bml.bml_add_procs( nprocs, + (ompi_proc_t**)procs, + &reachable ); + + if(OSHMEM_SUCCESS != rc){ + SPML_ERROR("SPML YODA: shmem error\n"); + goto cleanup_and_return; + } + + rc = mca_bml.bml_register_error(mca_spml_yoda_error_handler); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + /* create btl index and map */ + rc = create_btl_list(); + if(OSHMEM_SUCCESS != rc) + goto cleanup_and_return; + + for (i = 0; i < nprocs; i++) { + rc = create_btl_idx(i); + if (OSHMEM_SUCCESS != rc) + goto cleanup_and_return; + } + +cleanup_and_return: + OBJ_DESTRUCT(&reachable); + + return rc; +} + + +int mca_spml_yoda_del_procs(oshmem_proc_t** procs, size_t nprocs) +{ + size_t i; + + mca_bml.bml_del_procs(nprocs, (ompi_proc_t**)procs); + for (i = 0; i < nprocs; i++) { + destroy_btl_idx(i); + } + destroy_btl_list(); + + return OSHMEM_SUCCESS; +} + +static inline mca_bml_base_btl_t *get_next_btl(int dst, int *btl_id) +{ + mca_bml_base_endpoint_t* endpoint; + mca_bml_base_btl_t* bml_btl; + oshmem_proc_t *proc; + mca_bml_base_btl_array_t *btl_array = 0; + int size = 0; + int sm_index = 0; + + /* get endpoint and btl */ + proc = oshmem_proc_group_all(dst); + if (!proc) { + SPML_ERROR("Can not find destination proc for pe=%d", dst); + return NULL; + } + + endpoint = (mca_bml_base_endpoint_t*)proc->proc_bml; + if (!endpoint) { + SPML_ERROR("pe=%d proc has no endpoint", dst); + return NULL; + } + + /* At the moment always return first transport */ + size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_rdma); + + if (0 >= size) { + int is_sm_btl = 0; + //Possibly this is SM BTL with KNEM disabled? 
Then we should use send based get/put + /* + This hack is necessary for the case when KNEM is not available. + In this case we still want to use send/recv of SM BTL for put and get + but SM BTL is not in the rdma list anymore + */ + size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_eager); + if (size > 0) { + int btl_id = -1; + for (sm_index = 0; sm_index < size; sm_index++) { + bml_btl = mca_bml_base_btl_array_get_index(btl_array, sm_index); + btl_id = _find_btl_id(bml_btl); + is_sm_btl = (btl_id != -1) && ((mca_spml_yoda.btl_type_map[btl_id].btl_type == YODA_BTL_SM) || (mca_spml_yoda.btl_type_map[btl_id].btl_type == YODA_BTL_OPENIB)); /* btl_id is an index into btl_type_map, so compare the mapped BTL type, not the index */ + if (is_sm_btl) { + size = 1; + break; + } + } + } + if (!is_sm_btl) { + SPML_ERROR("no RDMA capable transport for dest pe=%d", dst); + oshmem_shmem_abort(-1); + } + } + + bml_btl = mca_bml_base_btl_array_get_index(btl_array, sm_index); + *btl_id = proc->transport_ids[0]; +#if 0 + assert(*btl_id >= 0 && *btl_id < YODA_BTL_MAX); + SPML_VERBOSE(100, "pe=%d reachable via btl %s %d", dst, + bml_btl->btl->btl_component->btl_version.mca_component_name, *btl_id); +#endif + return bml_btl; +} + +static inline void calc_nfrags(mca_bml_base_btl_t* bml_btl, size_t size, unsigned *frag_size, int *nfrags) +{ + *frag_size = bml_btl->btl->btl_max_send_size; + *nfrags = 1+(size-1)/(*frag_size); +} + + + +static inline int mca_spml_yoda_put_internal(void *dst_addr, size_t size, void *src_addr, int dst, int is_nb) +{ + int rc = OSHMEM_SUCCESS; + mca_spml_yoda_put_request_t *putreq = NULL; + mca_bml_base_btl_t* bml_btl; + mca_btl_base_descriptor_t* des = NULL; + mca_btl_base_segment_t* segment; + mca_spml_yoda_rdma_frag_t* frag; + int nfrags, need_copy; + int i; + unsigned ncopied = 0; + unsigned frag_size = 0; + char *p_src, *p_dst; + + uint64_t rva; + uint64_t offset = 0; + + mca_spml_mkey_t *r_mkey; + uint64_t rkey; + mca_spml_mkey_t *l_mkey; + uint32_t lkey; + int btl_id = 0; + struct yoda_btl *ybtl; + + + /* find bml_btl and its global btl_id */ + bml_btl = 
get_next_btl(dst, &btl_id); + if (!bml_btl) oshmem_shmem_abort(-1); + + /* Get rkey of remote PE (dst proc) which must be on memheap */ + r_mkey = mca_memheap.memheap_get_cached_mkey(dst, (unsigned long)dst_addr, btl_id, &rva); + if(!r_mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); + oshmem_shmem_abort(-1); + return OSHMEM_ERROR; + } + +#if 0 + SPML_VERBOSE(100, "put: pe:%d dst=%p <- src: %p sz=%d. dst_rva=%p, dst_rkey=0x%lx", + dst, dst_addr, src_addr, (int)size, (void *)rva, r_mkey->key); +#endif + + + ybtl = &mca_spml_yoda.btl_type_map[btl_id]; + + /* check if source is on memheap. If not: + - copy it to temporary space. Temporary buffer will be allocated from BML. + - check if we need to split source in part and send it with several + put operations + */ + + + nfrags = 1; + need_copy = 0; + rkey = 0; + lkey = 0; + + switch(ybtl->btl_type) { + case YODA_BTL_OPENIB: + l_mkey = mca_memheap.memheap_get_local_mkey((unsigned long)src_addr, btl_id); + if ((!is_nb && size > (size_t)mca_btl_openib_rdma_inline_size(bml_btl)) || l_mkey == 0) { + need_copy = 1; + calc_nfrags(bml_btl, size, &frag_size, &nfrags); + } + else + lkey = l_mkey->ib.lkey; + rkey = r_mkey->ib.rkey; + break; + + case YODA_BTL_SM: + /* check if we doing put into shm attached segment and if so + * just do memcpy + */ + if (OPAL_LIKELY(mca_memheap.memheap_is_symmetric_addr((unsigned long)dst_addr) && (unsigned long)dst_addr != rva)) { + // todo: may need to run opal progress from time to time + memcpy((void *)(unsigned long)rva, src_addr, size); + return OSHMEM_SUCCESS; + } + + offset = mca_memheap.memheap_find_offset(dst, btl_id, (unsigned long)dst_addr, rva); + rkey = r_mkey->key; + + /* TODO: + * 1. KNEM does not support blocking mode (need_copy = 1) (see OSHMEM_SM_PUT_SYNC_MODE to enable it) + * 2. 
SM(w/o KNEM) does not support non-blocking mode (need_copy = 0) + */ + if ((size < mca_spml_yoda.knem_threshold) || (!mca_spml_yoda.use_knem)) { + need_copy = 1; + calc_nfrags(bml_btl, size, &frag_size, &nfrags); + } + break; + + case YODA_BTL_SELF: + /* self btl ignores rkey/lkey */ + break; + default: + SPML_ERROR("btl %d bad btl type %d", btl_id, ybtl->btl_type); + oshmem_shmem_abort(-1); + } + + + p_src = (char*)src_addr; + p_dst = (char*)(unsigned long)rva; //dst_addr; + + for (i = 0; i < nfrags; i++) { + + /* Allocating send request from free list */ + putreq = mca_spml_yoda_putreq_alloc(dst); + frag = &putreq->put_frag; + + /* ToDo: + * - allocate buffer + * - memcopy data if needed + * - call to put with SEND descriptor. + */ + + ncopied = i < nfrags - 1 ? frag_size : (char *)src_addr + size - p_src; + + /* allocate buffer */ + mca_bml_base_alloc(bml_btl, &des, + MCA_BTL_NO_ORDER, + need_copy ? ncopied : 0, /* hack: allocate dummy segment if we are sending from symmetric heap */ + MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + + if (OPAL_UNLIKELY(!des)) { + SPML_ERROR("shmem OOM error need %d bytes", ncopied); + SPML_ERROR("src=%p nfrags = %d need_copy = %d frag_size=%d", src_addr, nfrags, need_copy, frag_size); + oshmem_shmem_abort(-1); + } + + assert(NULL != des); + assert(NULL != des->des_src); + + if (need_copy) { + segment = des->des_src; + memcpy((IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval), p_src, ncopied); + p_src += ncopied; + frag->btl_seg = 0; + } + else { + frag->btl_seg = des->des_src; + frag->rdma_segs[1].base_seg.seg_addr.pval = (void*)p_src; + frag->rdma_segs[1].base_seg.seg_len = ncopied; + + /* Get lkey value of the symmetric heap */ + if (ybtl->btl_type == YODA_BTL_OPENIB) + { + frag->rdma_segs[1].openib_seg.lkey = lkey; + } + des->des_src = &frag->rdma_segs[1].base_seg; + } + + frag->rdma_segs[0].base_seg.seg_addr.lval = (uintptr_t)p_dst; + frag->rdma_segs[0].base_seg.seg_len = ncopied; + if (ybtl->btl_type == YODA_BTL_SM) + { + 
frag->rdma_segs[0].sm_seg.key = rkey; + } + else if (ybtl->btl_type == YODA_BTL_OPENIB) + { + frag->rdma_segs[0].openib_seg.key = (uint32_t)rkey; + } + des->des_dst = &frag->rdma_segs[0].base_seg; + frag->rdma_req = putreq; + + p_dst += ncopied; + + des->des_cbdata = frag; + des->des_cbfunc = mca_spml_yoda_put_completion; + + OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, 1); + /* put the data to remote side */ + if (OPAL_UNLIKELY(YODA_BTL_SM == ybtl->btl_type)) { + /* in the case of SM BTL we may use both: PUT via KNEM and SEND via shared memory fifos. + * Choosing depending on the threshold */ + if ((size < mca_spml_yoda.knem_threshold) || (!mca_spml_yoda.use_knem)) { + /* + * This ugly hack is done to support following configuration as: + * OSHMEM + SM => put/get for small messages using send() + */ + rc = mca_bml_base_send(bml_btl, des, BTL_SM_HDR_TYPE_PUT_AS_SEND); + if (1 == rc) rc = OSHMEM_SUCCESS; + } + else { + ((mca_btl_sm_segment_t *) des->des_src)->key = offset ; //des_src is not used in SM KNEM put, so we pass offset through key64 field + rc = mca_bml_base_put(bml_btl, des); + } + } + else { + rc = mca_bml_base_put(bml_btl, des); + } + +#ifdef OSHMEM_WAIT_COMPLETION_DEBUG + if (YODA_BTL_SM == ybtl->btl_type) { + oshmem_request_cond.puts_counter_sm++; + } + else if (YODA_BTL_OPENIB == ybtl->btl_type) { + oshmem_request_cond.puts_counter_openib++; + } + save_dbg_history("PUT", btl_type2str(ybtl->btl_type), dst, size, src_addr, dst_addr); + //condition_dbg_init(); +#endif + if( OPAL_UNLIKELY(OSHMEM_SUCCESS != rc) ) { + if(OSHMEM_ERR_OUT_OF_RESOURCE == rc) { + /* No free resources, Block on completion here */ + SPML_ERROR("shmem error: OSHMEM_ERR_OUT_OF_RESOURCE"); + oshmem_request_wait_completion(&putreq->req_put.req_base.req_oshmem); + } else { + SPML_ERROR("shmem error"); + } + /* exit with errro */ + SPML_ERROR("shmem error: ret = %i, send_pe = %i, dest_pe = %i",rc, oshmem_my_proc_id(),dst); + oshmem_shmem_abort(-1); + rc = OSHMEM_ERROR; + } + } + + 
return rc; +} + +int mca_spml_yoda_put(void *dst_addr, size_t size, void *src_addr, int dst) +{ + + return mca_spml_yoda_put_internal(dst_addr, size, src_addr, dst, 0); +} + +int mca_spml_yoda_put_nb(void* dst_addr, size_t size, void* src_addr, int dst, void **handle) +{ + UNREFERENCED_PARAMETER(handle); + + return mca_spml_yoda_put_internal(dst_addr, size, src_addr, dst, 1); +} + + +int mca_spml_yoda_fence(void) +{ + + while (0 < mca_spml_yoda.n_active_puts) { + oshmem_request_wait_any_completion(); + } + return OSHMEM_SUCCESS; +} + + + + +int mca_spml_yoda_enable(bool enable) +{ + if( false == enable ) { + return OSHMEM_SUCCESS; + } + + OBJ_CONSTRUCT(&mca_spml_yoda.lock, opal_mutex_t); + + /** + *If we get here this is the SPML who get selected for the run. We + * should get ownership for the put and get requests list, and + * initialize them with the size of our own requests. + */ + + ompi_free_list_init_new( &mca_spml_base_put_requests, + sizeof(mca_spml_yoda_put_request_t), + opal_cache_line_size, + OBJ_CLASS(mca_spml_yoda_put_request_t), + 0,opal_cache_line_size, + mca_spml_yoda.free_list_num, + mca_spml_yoda.free_list_max, + mca_spml_yoda.free_list_inc, + NULL ); + + ompi_free_list_init_new( &mca_spml_base_get_requests, + sizeof(mca_spml_yoda_get_request_t), + opal_cache_line_size, + OBJ_CLASS(mca_spml_yoda_get_request_t), + 0,opal_cache_line_size, + mca_spml_yoda.free_list_num, + mca_spml_yoda.free_list_max, + mca_spml_yoda.free_list_inc, + NULL ); + + mca_spml_yoda.enabled = true; + +#ifdef OSHMEM_WAIT_COMPLETION_DEBUG + condition_dbg_init(); +#endif + +return OSHMEM_SUCCESS; +} + + + +/** + * shmem_get reads data from a remote address + * in the symmetric heap via RDMA READ. + * Get operation: + * 1. Get the rkey to the remote address. + * 2. Allocate a get request. + * 3. Allocated a temporary pre-registered buffer + * to copy the data to. + * 4. Init the request descriptor with remote side + * data and local side data. + * 5. 
Read the remote buffer to a pre-registered
+ *    buffer on the local PE using RDMA READ.
+ * 6. Copy the received data to dst_addr if an
+ *    intermediate pre-registered buffer was used.
+ * 7. Clear the request and return.
+ *
+ * src_addr - address on remote pe.
+ * size - the amount of bytes to be read.
+ * dst_addr - address on the local pe.
+ * src - the pe of remote process.
+ *
+ * Returns OSHMEM_SUCCESS once every fragment has completed.  Unrecoverable
+ * conditions (no transport, address outside the symmetric heap, descriptor
+ * allocation failure, BTL error) are reported through oshmem_shmem_abort().
+ */
+int mca_spml_yoda_get(
+                void* src_addr,
+                size_t size,
+                void* dst_addr,
+                int src)
+{
+    int rc = OSHMEM_SUCCESS;
+    uint64_t rkey;
+    uint32_t lkey;
+    mca_spml_mkey_t *r_mkey, *l_mkey;
+    uint64_t rva;
+    uint64_t offset = 0;
+    unsigned ncopied = 0;
+    unsigned frag_size = 0;
+    char *p_src, *p_dst;
+    int i;
+    int nfrags, need_copy;
+    mca_spml_yoda_get_request_t* getreq = NULL;
+    mca_bml_base_btl_t* bml_btl = NULL;
+    mca_btl_base_descriptor_t* des = NULL;
+    mca_spml_yoda_rdma_frag_t* frag = NULL;
+    struct mca_spml_yoda_getreq_parent get_holder;
+    struct yoda_btl *ybtl;
+    int btl_id = 0;
+
+    /* find bml_btl and its global btl_id */
+    bml_btl = get_next_btl(src, &btl_id);
+    if (!bml_btl) oshmem_shmem_abort(-1);
+
+    /**
+     * Get the remote memory key and the remote virtual address (rva)
+     * that correspond to src_addr on PE src.
+     */
+    r_mkey = mca_memheap.memheap_get_cached_mkey(src, (unsigned long)src_addr, btl_id, &rva);
+    if (!r_mkey) {
+        SPML_ERROR("pe=%d: %p is not address of shared variable", src, src_addr);
+        oshmem_shmem_abort(-1);
+        /* rc is still OSHMEM_SUCCESS here; report the failure as put does */
+        return OSHMEM_ERROR;
+    }
+#if 0
+    SPML_VERBOSE(100, "get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, src_rkey=0x%lx",
+                 src, src_addr, dst_addr, (int)size, (void *)rva, r_mkey->key);
+#endif
+
+    /* check if dest is on memheap. If not:
+       - read into temporary space. Temporary buffer will be allocated from BML.
+       - check if we need to split the transfer in parts and fetch it with
+         several get operations
+    */
+
+    /* TODO: unify with the same code in put */
+    ybtl = &mca_spml_yoda.btl_type_map[btl_id];
+
+    nfrags = 1;
+    need_copy = 0;
+    rkey = 0;
+    lkey = 0;
+
+    switch (ybtl->btl_type) {
+    case YODA_BTL_OPENIB:
+        /* no local key means dst_addr is not registered: bounce through a BTL buffer */
+        l_mkey = mca_memheap.memheap_get_local_mkey((unsigned long)dst_addr, btl_id);
+        if (!l_mkey) {
+            need_copy = 1;
+            calc_nfrags(bml_btl, size, &frag_size, &nfrags);
+        }
+        else
+            lkey = l_mkey->ib.lkey;
+        rkey = r_mkey->ib.rkey;
+        break;
+
+    case YODA_BTL_SM:
+        /* check if we are doing a get from a shm attached segment and if so
+         * just do memcpy.
+         */
+        if (OPAL_LIKELY(mca_memheap.memheap_is_symmetric_addr((unsigned long)src_addr) && (unsigned long)src_addr != rva)) {
+            memcpy(dst_addr, (void *)(unsigned long)rva, size);
+            /* must call progress here to avoid deadlock. Scenario:
+             * pe1 polls pe2 via shm get. pe2 tries to get static variable from node one, which goes to sm btl
+             * In this case pe2 is stuck forever because pe1 never calls opal_progress.
+             * May be we do not need to call progress on every get() here but rather once in a while.
+             */
+            opal_progress();
+            return OSHMEM_SUCCESS;
+        }
+        offset = mca_memheap.memheap_find_offset(src, btl_id, (unsigned long)src_addr,rva);
+        rkey = r_mkey->key;
+        /* below the KNEM threshold, or without KNEM, the get is emulated with send */
+        if ((size < mca_spml_yoda.knem_threshold) || (!mca_spml_yoda.use_knem)) {
+            need_copy = 1;
+            calc_nfrags(bml_btl, size, &frag_size, &nfrags);
+        }
+        break;
+
+    case YODA_BTL_SELF:
+        /* self btl ignores rkey/lkey */
+        break;
+    default:
+        SPML_ERROR("btl %d bad btl type %d", btl_id, ybtl->btl_type);
+        oshmem_shmem_abort(-1);
+    }
+
+    p_src = (char*)(unsigned long)rva; //src_addr;
+    p_dst = (char*)dst_addr;
+    get_holder.active_count = 0;
+
+    for ( i = 0; i < nfrags; i++) {
+        /**
+         * Allocating a get request from a pre-allocated
+         * and pre-registered free list.
+         */
+        getreq = mca_spml_yoda_getreq_alloc(src);
+        assert(getreq);
+        frag = &getreq->get_frag;
+
+        getreq->parent = &get_holder;
+
+        /* every fragment but the last is frag_size bytes; the last takes the rest */
+        ncopied = i < nfrags - 1 ? frag_size : (char *)dst_addr + size - p_dst;
+
+        /**
+         * Allocate a temporary buffer on the local PE.
+         * The local buffer will store the data read
+         * from the remote address.
+         */
+        mca_bml_base_alloc( bml_btl, &des,
+                            MCA_BTL_NO_ORDER,
+                            need_copy ? ncopied : 0, /* hack */
+                            MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
+
+        if (OPAL_UNLIKELY(!des)) {
+            /* the asserts below are compiled out when assertions are disabled */
+            SPML_ERROR("shmem OOM error need %u bytes", ncopied);
+            oshmem_shmem_abort(-1);
+        }
+
+        assert(NULL != des);
+        assert(NULL != des->des_src);
+
+        /**
+         * A Hack: swap as btl allocates only des_src and not des_dst
+         * TODO: undo hack in completion callback
+         * Undo Hack in handle_wc under openib/btl_openib_component.c.
+         */
+        des->des_dst = des->des_src;
+        des->des_src = NULL;
+
+        if (!need_copy) {
+            /* read straight into the user buffer; remember the BTL segment so
+             * the completion callback can put it back */
+            frag->btl_seg = des->des_dst;
+            frag->rdma_segs[1].base_seg.seg_addr.pval = (void*)p_dst;
+            frag->rdma_segs[1].base_seg.seg_len = ncopied;
+
+            if (ybtl->btl_type == YODA_BTL_OPENIB)
+            {
+                frag->rdma_segs[1].openib_seg.lkey = lkey;
+            }
+            des->des_dst = &frag->rdma_segs[1].base_seg;
+
+            getreq->p_dst = 0;
+        }
+        else {
+            frag->btl_seg = 0;
+            /*
+             * Init the lkey value
+             * Get the lkey of the intermediate buffer pre-allocated on the
+             * source descriptor (pointed by destination descriptor).
+             */
+            ((mca_btl_openib_segment_t *)des->des_dst)->lkey = ((mca_btl_openib_segment_t *)des->des_dst)->key;
+            /* non-NULL p_dst tells the completion callback to copy out */
+            getreq->p_dst = (uint64_t*)p_dst;
+        }
+
+        /**
+         * Initialize the remote data fragment
+         * with remote address data required for
+         * executing RDMA READ from a remote buffer.
+         */
+        frag->rdma_segs[0].base_seg.seg_addr.lval = (uintptr_t)p_src;
+        frag->rdma_segs[0].base_seg.seg_len = ncopied;
+
+        if (ybtl->btl_type == YODA_BTL_SM)
+        {
+            frag->rdma_segs[0].sm_seg.key = rkey;
+        }
+        else if (ybtl->btl_type == YODA_BTL_OPENIB)
+        {
+            frag->rdma_segs[0].openib_seg.key = (uint32_t)rkey;
+        }
+        des->des_src = &frag->rdma_segs[0].base_seg;
+
+        frag->rdma_req = getreq;
+
+        /**
+         * Init remote side descriptor.
+         */
+        des->des_src_cnt = 1;
+        des->des_cbdata = frag;
+        des->des_cbfunc = mca_spml_yoda_get_completion;
+
+
+        /**
+         * Get the data from remote side
+         * using RDMA READ.
+         */
+        if (YODA_BTL_SM == ybtl->btl_type) {
+            ((mca_btl_sm_segment_t *)des->des_dst)->key = offset ; //des_dst is not used in SM KNEM get, so we pass offset through key64 field
+            if ((size < mca_spml_yoda.knem_threshold) || (!mca_spml_yoda.use_knem)) {
+                /*
+                 * This ugly hack is done to support following configuration as:
+                 * OSHMEM + SM => put/get for small messages using send()
+                 */
+                rc = mca_bml_base_send(bml_btl, des, BTL_SM_HDR_TYPE_GET_AS_SEND);
+                if (1 == rc) rc = OSHMEM_SUCCESS;
+            }
+            else {
+                des->des_flags |= MCA_BTL_DES_FLAGS_SHMEM_REQUEST;
+                rc = mca_bml_base_get(bml_btl, des);
+            }
+        }
+        else {
+            des->des_flags |= MCA_BTL_DES_FLAGS_SHMEM_REQUEST;
+            rc = mca_bml_base_get(bml_btl, des);
+        }
+
+#ifdef OSHMEM_WAIT_COMPLETION_DEBUG
+        save_dbg_history("GET", btl_type2str(ybtl->btl_type), src, size, dst_addr, src_addr);
+#endif
+
+        if( OPAL_UNLIKELY(OSHMEM_SUCCESS != rc) ) {
+            if(OSHMEM_ERR_OUT_OF_RESOURCE == rc) {
+                /* No free resources, Block on completion here */
+                oshmem_request_wait_completion(&getreq->req_get.req_base.req_oshmem);
+                return OSHMEM_SUCCESS;
+            } else {
+                SPML_ERROR("oshmem_get: error %d", rc);
+                oshmem_shmem_abort(-1);
+                /*
+                   ORTE_ERROR_LOG(rc);
+                   orte_errmgr.abort(-1, NULL);
+                 */
+
+                return rc;
+            }
+        }
+        p_dst += ncopied;
+        p_src += ncopied;
+        /* the completion callback decrements this counter per fragment */
+        OPAL_THREAD_ADD32(&get_holder.active_count, 1);
+    }
+
+    /* revisit if we really need this for self and sm */
+    if (YODA_BTL_SM == ybtl->btl_type || YODA_BTL_SELF == ybtl->btl_type)
+        opal_progress();
+
+    /* Wait for completion on request */
+    while (get_holder.active_count > 0)
+        oshmem_request_wait_completion(&getreq->req_get.req_base.req_oshmem);
+
+    return rc;
+}
+
+
+/* Blocking byte-stream send of size bytes to PE dst, delegated to the
+ * selected PML on the world communicator with tag 0. */
+int mca_spml_yoda_send(void* buf, size_t size, int dst, mca_spml_base_put_mode_t sendmode)
+{
+    int rc = OSHMEM_SUCCESS;
+
+    rc = MCA_PML_CALL(send(buf,
+                           size,
+                           &(ompi_mpi_unsigned_char.dt),
+                           dst,
+                           0,
+                           (mca_pml_base_send_mode_t)sendmode,
+                           &(ompi_mpi_comm_world.comm)));
+
+    return rc;
+}
+
+
+/* Blocking receive of size bytes from PE src; the counterpart of
+ * mca_spml_yoda_send() (same communicator, same tag, status ignored). */
+int mca_spml_yoda_recv(void* buf, size_t size, int src)
+{
+    int rc = OSHMEM_SUCCESS;
+
+    rc = MCA_PML_CALL(recv(buf,
+                           size,
+                           &(ompi_mpi_unsigned_char.dt),
+                           src,
+                           0,
+                           &(ompi_mpi_comm_world.comm),
+                           NULL));
+
+    return rc;
+}
+
diff --git a/oshmem/mca/spml/yoda/spml_yoda.h b/oshmem/mca/spml/yoda/spml_yoda.h
new file mode 100644
index 0000000000..575c7e10ec
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ *  @file
+ */
+
+#ifndef MCA_SPML_YODA_H
+#define MCA_SPML_YODA_H
+
+#include "oshmem_config.h"
+#include "oshmem/request/request.h"
+#include "oshmem/mca/spml/spml.h"
+#include "oshmem/mca/spml/base/spml_base_putreq.h"
+#include "oshmem/proc/proc.h"
+#include "oshmem/mca/spml/base/spml_base_request.h"
+#include "oshmem/mca/spml/base/spml_base_getreq.h"
+
+#include "orte/runtime/orte_globals.h"
+
+#include "ompi/mca/bml/base/base.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/class/ompi_free_list.h"
+
+BEGIN_C_DECLS
+
+/**
+ * YODA SPML module
+ */
+
+enum {                          /* transport classes stored in yoda_btl.btl_type */
+    YODA_BTL_UNKNOWN = -1,
+    YODA_BTL_SELF = 0,
+    YODA_BTL_SM,
+    YODA_BTL_OPENIB,
+    YODA_BTL_MAX
+};
+
+struct yoda_btl {               /* one entry of mca_spml_yoda.btl_type_map */
+    mca_btl_base_module_t *btl;
+    int btl_type;               /* one of the YODA_BTL_* constants */
+    int use_cnt;
+};
+
+struct mca_spml_yoda_t {
+    mca_spml_base_module_t super;
+
+    int priority;
+    int free_list_num;  /* initial size of free list */
+    int free_list_max;  /* maximum size of free list */
+    int free_list_inc;  /* number of elements to grow free list */
+
+    /* lock queue access */
+    opal_mutex_t lock;
+
+    /* free lists */
+    ompi_free_list_t rdma_frags;
+    /* number of outstanding put requests */
+    uint32_t n_active_puts;
+    bool enabled;
+    struct yoda_btl *btl_type_map;  /* indexed by the btl_id used in put/get */
+    int n_btls;
+    int use_knem;                   /* SM BTL "component_use_knem_value" parameter */
+    unsigned int knem_threshold;    /* SM transfers below this size go through send */
+};
+
+typedef struct mca_spml_yoda_t mca_spml_yoda_module_t;
+
+extern mca_spml_yoda_module_t mca_spml_yoda;
+
+extern int mca_spml_yoda_enable( bool enable );
+extern int mca_spml_yoda_get(void* src_addr, size_t size, void* dst_addr, int src);
+extern int mca_spml_yoda_put(void* dst_addr, size_t size, void* src_addr, int dst);
+extern int mca_spml_yoda_put_nb(void* dst_addr, size_t size, void* src_addr, int dst, void **handle);
+extern int mca_spml_yoda_recv(void* buf, size_t size, int src);
+extern int mca_spml_yoda_send(void* buf, size_t size, int dst, mca_spml_base_put_mode_t mode);
+extern mca_spml_mkey_t *mca_spml_yoda_register(void* addr, size_t size, uint64_t shmid, int *count);
+extern int mca_spml_yoda_deregister(mca_spml_mkey_t *mkeys);
+extern int mca_spml_yoda_add_procs(oshmem_proc_t** procs, size_t nprocs);
+extern int mca_spml_yoda_del_procs(oshmem_proc_t** procs, size_t nprocs);
+extern int mca_spml_yoda_fence(void);
+
+#ifdef OSHMEM_WAIT_COMPLETION_DEBUG
+extern void condition_dbg_init(void);
+extern void condition_dbg_finalize(void);
+#endif
+
+END_C_DECLS
+
+#endif
+
diff --git a/oshmem/mca/spml/yoda/spml_yoda_component.c b/oshmem/mca/spml/yoda/spml_yoda_component.c
new file mode 100644
index 0000000000..9c3122020f
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_component.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "oshmem_config.h"
+#include "oshmem/runtime/params.h"
+#include "oshmem/mca/spml/spml.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "ompi/mca/bml/base/base.h"
+#include "spml_yoda_component.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_putreq.h"
+#include "oshmem/mca/spml/yoda/spml_yoda.h"
+
+/* Component entry points; each is defined after the descriptor below. */
+static int mca_spml_yoda_component_open(void);
+static int mca_spml_yoda_component_close(void);
+static mca_spml_base_module_t*
+mca_spml_yoda_component_init( int* priority, bool enable_progress_threads,
+                              bool enable_mpi_threads );
+static int mca_spml_yoda_component_fini(void);
+mca_spml_base_component_2_0_0_t mca_spml_yoda_component = {
+
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        MCA_SPML_BASE_VERSION_2_0_0,
+
+        "yoda",                        /* MCA component name */
+        OSHMEM_MAJOR_VERSION,          /* MCA component major version */
+        OSHMEM_MINOR_VERSION,          /* MCA component minor version */
+        OSHMEM_RELEASE_VERSION,        /* MCA component release version */
+        mca_spml_yoda_component_open,  /* component open */
+        mca_spml_yoda_component_close  /* component close */
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    mca_spml_yoda_component_init,  /* component init */
+    mca_spml_yoda_component_fini   /* component finalize */
+
+};
+
+/* Register one integer MCA parameter of this component and return its value. */
+static inline int mca_spml_yoda_param_register_int(
+    const char *param_name,
+    int default_value,
+    const char *help_msg)
+{
+    int param_value;
+
+    param_value = default_value;
+    mca_base_param_reg_int(
+        &mca_spml_yoda_component.spmlm_version,
+        param_name,
+        help_msg,
+        false, false,
+        default_value, &param_value);
+
+    return param_value;
+}
+
+/* Open: register yoda parameters, read the SM BTL KNEM settings, open BML. */
+static int mca_spml_yoda_component_open(void)
+{
+    int value = 0;  /* keeps knem_threshold defined if the lookup below fails */
+    mca_spml_yoda.free_list_num =
+        mca_spml_yoda_param_register_int("free_list_num", 1024, 0);
+    mca_spml_yoda.free_list_max =
+        mca_spml_yoda_param_register_int("free_list_max", 1024, 0);
+    mca_spml_yoda.free_list_inc =
+        mca_spml_yoda_param_register_int("free_list_inc", 16, 0);
+    mca_spml_yoda.priority =
+        mca_spml_yoda_param_register_int("priority", 20, "[integer] yoda priority");
+
+    mca_base_param_lookup_int(mca_base_param_find("btl", "sm", "component_use_knem_value"), &mca_spml_yoda.use_knem);
+    mca_base_param_lookup_int(mca_base_param_find("btl", "sm", "knem_threshold"), &value);
+    mca_spml_yoda.knem_threshold = (unsigned int)value;
+    return mca_bml_base_open();
+}
+
+/* Close: close the BML framework that component open opened. */
+static int mca_spml_yoda_component_close(void)
+{
+    int rc;
+    if (OSHMEM_SUCCESS != (rc = mca_bml_base_close())) {
+        return rc;
+    }
+    return OSHMEM_SUCCESS;
+}
+
+/* Init: report our priority, start BML if needed, return the yoda module. */
+static mca_spml_base_module_t*
+mca_spml_yoda_component_init( int* priority,
+                              bool enable_progress_threads,
+                              bool enable_mpi_threads )
+{
+    SPML_VERBOSE( 10,
+                  "in yoda, my priority is %d\n", mca_spml_yoda.priority);
+
+    /* The incoming *priority is overwritten unconditionally, so comparing
+     * it with our own priority afterwards could never decline selection;
+     * only a BML start failure below makes this function return NULL. */
+    *priority = mca_spml_yoda.priority;
+
+    /* We use BML/BTL and need to start it */
+    if (!mca_bml_base_inited())
+    {
+        SPML_VERBOSE(10, "starting bml\n");
+        if ( OSHMEM_SUCCESS != mca_bml_base_init( enable_progress_threads, enable_mpi_threads) )
+        {
+            return NULL;
+        }
+    }
+
+    mca_spml_yoda.n_active_puts = 0;
+
+    return &mca_spml_yoda.super;
+}
+
+static int mca_spml_yoda_component_fini(void)
+{
+    int rc;
+
+    /* Shutdown BML */
+    /* NOTE(review): the early return below leaves mca_spml_yoda.lock undestructed */
+    if(OMPI_SUCCESS != (rc = mca_bml.bml_finalize()))
+        return rc;
+
+    OBJ_DESTRUCT(&mca_spml_yoda.lock);
+#ifdef OSHMEM_WAIT_COMPLETION_DEBUG
+    condition_dbg_finalize();
+#endif
+
+    return OSHMEM_SUCCESS;
+}
+
diff --git a/oshmem/mca/spml/yoda/spml_yoda_component.h b/oshmem/mca/spml/yoda/spml_yoda_component.h
new file mode 100644
index 0000000000..3c3c4673bd
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_component.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_SPML_YODA_COMPONENT_H +#define MCA_SPML_YODA_COMPONENT_H + +BEGIN_C_DECLS + +/* + * SPML module functions. + */ +OSHMEM_MODULE_DECLSPEC extern mca_spml_base_component_2_0_0_t mca_spml_yoda_component; +END_C_DECLS + +#endif diff --git a/oshmem/mca/spml/yoda/spml_yoda_getreq.c b/oshmem/mca/spml/yoda/spml_yoda_getreq.c new file mode 100644 index 0000000000..10c3837d85 --- /dev/null +++ b/oshmem/mca/spml/yoda/spml_yoda_getreq.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "oshmem_config.h" +#include "opal/prefetch.h" +#include "oshmem/constants.h" +#include "oshmem/mca/spml/spml.h" +#include "ompi/mca/btl/btl.h" +#include "orte/mca/errmgr/errmgr.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/bml/base/base.h" +#include "oshmem/mca/spml/yoda/spml_yoda.h" +#include "oshmem/mca/spml/yoda/spml_yoda_putreq.h" +#include "oshmem/mca/spml/yoda/spml_yoda_getreq.h" +#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h" + + +/* + * The free call mark the final stage in a request life-cycle. Starting from this + * point the request is completed at both SPML and user level, and can be used + * for others one sided communications. Therefore, in the case of the YODA SPML it should + * be added to the free request list. 
+ */
+static int mca_spml_yoda_get_request_free(struct oshmem_request_t** request)
+{
+    mca_spml_yoda_get_request_t *req = *(mca_spml_yoda_get_request_t **)request;
+
+    assert( !req->req_get.req_base.req_free_called );
+
+    /* flag and recycle the request under the global request lock */
+    OPAL_THREAD_LOCK(&oshmem_request_lock);
+
+    req->req_get.req_base.req_free_called = true;
+    OMPI_FREE_LIST_RETURN( &mca_spml_base_get_requests,
+                           (ompi_free_list_item_t *)req );
+    OPAL_THREAD_UNLOCK(&oshmem_request_lock);
+
+    /* the caller's handle no longer refers to a live request */
+    *request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/
+    return OSHMEM_SUCCESS;
+}
+
+static int mca_spml_yoda_get_request_cancel(struct oshmem_request_t* request, int complete)
+{
+    /* get requests are never cancelled: nothing to do, report success */
+    return OSHMEM_SUCCESS;
+}
+
+static void mca_spml_yoda_get_request_construct(mca_spml_yoda_get_request_t* req)
+{
+    req->req_get.req_base.req_oshmem.req_cancel = mca_spml_yoda_get_request_cancel;
+    req->req_get.req_base.req_oshmem.req_free = mca_spml_yoda_get_request_free;
+    req->req_get.req_base.req_type = MCA_SPML_REQUEST_GET;
+}
+
+static void mca_spml_yoda_get_request_destruct(mca_spml_yoda_get_request_t* req)
+{
+}
+
+OBJ_CLASS_INSTANCE( mca_spml_yoda_get_request_t,
+                    mca_spml_base_get_request_t,
+                    mca_spml_yoda_get_request_construct,
+                    mca_spml_yoda_get_request_destruct );
+
+/* BTL completion callback for one get fragment issued by mca_spml_yoda_get(). */
+void mca_spml_yoda_get_completion( mca_btl_base_module_t* btl,
+                                   struct mca_btl_base_endpoint_t* ep,
+                                   struct mca_btl_base_descriptor_t* des,
+                                   int status )
+{
+    mca_bml_base_btl_t *owner_btl = (mca_bml_base_btl_t *)des->des_context;
+    mca_spml_yoda_rdma_frag_t *rdma_frag = (mca_spml_yoda_rdma_frag_t *)des->des_cbdata;
+    mca_spml_yoda_get_request_t *req = (mca_spml_yoda_get_request_t *)rdma_frag->rdma_req;
+
+    /* shmem has no way to propagate errors, so a failed get is fatal */
+    if( OPAL_UNLIKELY(OSHMEM_SUCCESS != status) ) {
+        /* cry&die */
+        SPML_ERROR("FATAL get completion error");
+        abort();
+    }
+    /* a non-NULL p_dst means the data sits in an intermediate buffer */
+    if (NULL != req->p_dst) {
+        memcpy(req->p_dst, des->des_dst->seg_addr.pval, des->des_dst->seg_len);
+    }
+
+    /* put back the segment the BTL allocated, if it was swapped out */
+    if (NULL != rdma_frag->btl_seg) {
+        des->des_dst = rdma_frag->btl_seg;
+    }
+
+    if (NULL != req->parent) {
+        OPAL_THREAD_ADD32(&req->parent->active_count, -1);
+    }
+
+    req->req_get.req_base.req_spml_complete = true;
+    oshmem_request_complete(&req->req_get.req_base.req_oshmem, 1);
+    oshmem_request_free( (oshmem_request_t**)&req );
+
+    /* undo the src/dst swap done when the get was issued, then release */
+    des->des_src = des->des_dst;
+    des->des_dst = 0;
+    mca_bml_base_free(owner_btl, des);
+}
+
diff --git a/oshmem/mca/spml/yoda/spml_yoda_getreq.h b/oshmem/mca/spml/yoda/spml_yoda_getreq.h
new file mode 100644
index 0000000000..306926f296
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_getreq.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OSHMEM_SPML_YODA_GET_REQUEST_H
+#define OSHMEM_SPML_YODA_GET_REQUEST_H
+
+#include "ompi/mca/btl/btl.h"
+#include "oshmem/mca/spml/base/spml_base_putreq.h"
+#include "ompi/mca/mpool/base/base.h"
+#include "ompi/mca/pml/ob1/pml_ob1_comm.h"
+#include "ompi/mca/bml/bml.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h"
+#include "oshmem/mca/spml/yoda/spml_yoda.h"
+#include "orte/runtime/orte_globals.h"
+#include "oshmem/mca/spml/base/spml_base_getreq.h"
+
+BEGIN_C_DECLS
+
+struct mca_spml_yoda_getreq_parent {
+    uint32_t active_count;  /* fragments of one get still in flight */
+};
+
+struct mca_spml_yoda_get_request_t {
+    mca_spml_base_get_request_t req_get;
+    uint64_t *p_dst;        /* copy-out target, or 0 when no copy is needed */
+    struct mca_spml_yoda_getreq_parent *parent;
+    mca_spml_yoda_rdma_frag_t get_frag;
+};
+
+typedef struct mca_spml_yoda_get_request_t mca_spml_yoda_get_request_t;
+OBJ_CLASS_DECLARATION(mca_spml_yoda_get_request_t);
+/* Take a get request off the base free list and clear its state flags. */
+static inline mca_spml_yoda_get_request_t *mca_spml_yoda_getreq_alloc(int dst)
+{
+    mca_spml_yoda_get_request_t *req;
+    ompi_free_list_item_t *list_item;
+    int wait_rc;  /* written by OMPI_FREE_LIST_WAIT, not examined */
+
+    OMPI_FREE_LIST_WAIT(&mca_spml_base_get_requests, list_item, wait_rc);
+    req = (mca_spml_yoda_get_request_t *)list_item;
+    assert(NULL != req);
+    req->req_get.req_base.req_oshmem.req_complete = false;
+    req->req_get.req_base.req_free_called = false;
+
+    return req;
+}
+
+void mca_spml_yoda_get_completion( mca_btl_base_module_t* btl,
+                                   struct mca_btl_base_endpoint_t* ep,
+                                   struct mca_btl_base_descriptor_t* des,
+                                   int status );
+
+END_C_DECLS
+#endif  /* OSHMEM_SPML_YODA_GET_REQUEST_H */
diff --git a/oshmem/mca/spml/yoda/spml_yoda_putreq.c b/oshmem/mca/spml/yoda/spml_yoda_putreq.c
new file mode 100644
index 0000000000..f1f3a1edb6
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_putreq.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+#include "oshmem_config.h"
+#include "opal/prefetch.h"
+#include "oshmem/constants.h"
+#include "oshmem/mca/spml/spml.h"
+#include "ompi/mca/btl/btl.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "ompi/mca/mpool/mpool.h"
+#include "ompi/mca/bml/base/base.h"
+#include "oshmem/mca/spml/yoda/spml_yoda.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_putreq.h"
+#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h"
+#include "oshmem/runtime/runtime.h"
+/*
+ * Freeing is the last step of a request's life cycle: by then the request
+ * is complete at both the SPML and the user level, so the object can be
+ * reused for another transfer.  The YODA SPML recycles it by returning it
+ * to the base put-request free list.
+ */
+static int mca_spml_yoda_put_request_free(struct oshmem_request_t** request)
+{
+    mca_spml_yoda_put_request_t *req = *(mca_spml_yoda_put_request_t **)request;
+
+    assert( !req->req_put.req_base.req_free_called );
+
+    OPAL_THREAD_LOCK(&oshmem_request_lock);
+    req->req_put.req_base.req_free_called = true;
+    OMPI_FREE_LIST_RETURN( &mca_spml_base_put_requests,
+                           (ompi_free_list_item_t *)req );
+    OPAL_THREAD_UNLOCK(&oshmem_request_lock);
+
+    *request = SHMEM_REQUEST_NULL;
+    return OSHMEM_SUCCESS;
+}
+
+static int mca_spml_yoda_put_request_cancel(struct oshmem_request_t* request, int complete)
+{
+    /* put requests are never cancelled: nothing to do, report success */
+    return OSHMEM_SUCCESS;
+}
+
+static void mca_spml_yoda_put_request_construct(mca_spml_yoda_put_request_t* req)
+{
+    req->req_put.req_base.req_oshmem.req_cancel = mca_spml_yoda_put_request_cancel;
+    req->req_put.req_base.req_oshmem.req_free = mca_spml_yoda_put_request_free;
+    req->req_put.req_base.req_type = MCA_SPML_REQUEST_PUT;
+}
+
+static void mca_spml_yoda_put_request_destruct(mca_spml_yoda_put_request_t* req)
+{
+}
+
+OBJ_CLASS_INSTANCE( mca_spml_yoda_put_request_t,
+                    mca_spml_base_put_request_t,
+                    mca_spml_yoda_put_request_construct,
+                    mca_spml_yoda_put_request_destruct );
+
+
+/* BTL completion callback for one put fragment. */
+void mca_spml_yoda_put_completion( mca_btl_base_module_t* btl,
+                                   struct mca_btl_base_endpoint_t* ep,
+                                   struct mca_btl_base_descriptor_t* des,
+                                   int status )
+{
+    mca_bml_base_btl_t *owner_btl = (mca_bml_base_btl_t *)des->des_context;
+    mca_spml_yoda_rdma_frag_t *rdma_frag = (mca_spml_yoda_rdma_frag_t *)des->des_cbdata;
+    mca_spml_yoda_put_request_t *req = (mca_spml_yoda_put_request_t *)rdma_frag->rdma_req;
+
+    /* one put fewer in flight; mca_spml_yoda_fence() polls this counter */
+    OPAL_THREAD_ADD32(&mca_spml_yoda.n_active_puts, -1);
+    /* a failed put cannot be reported to the caller, so it is fatal */
+    if( OPAL_UNLIKELY(OSHMEM_SUCCESS != status) ) {
+        /* no way to propagate errors. die */
+        SPML_ERROR("FATAL put completion error");
+        oshmem_shmem_abort(-1);
+    }
+
+    /* put back the segment the BTL allocated, if it was swapped out */
+    if (NULL != rdma_frag->btl_seg) {
+        des->des_src = rdma_frag->btl_seg;
+    }
+
+    req->req_put.req_base.req_spml_complete = true;
+    oshmem_request_complete(&req->req_put.req_base.req_oshmem, 1);
+    oshmem_request_free( (oshmem_request_t**)&req );
+    mca_bml_base_free(owner_btl, des);
+
+#ifdef OSHMEM_WAIT_COMPLETION_DEBUG
+    if (!strcmp(owner_btl->btl->btl_component->btl_version.mca_component_name,"sm")){
+        oshmem_request_cond.puts_counter_sm--;
+    }
+    else if (!strcmp(owner_btl->btl->btl_component->btl_version.mca_component_name,"openib")){
+        oshmem_request_cond.puts_counter_openib--;
+    }
+#endif
+}
+
diff --git a/oshmem/mca/spml/yoda/spml_yoda_putreq.h b/oshmem/mca/spml/yoda/spml_yoda_putreq.h
new file mode 100644
index 0000000000..0cdf105732
--- /dev/null
+++ b/oshmem/mca/spml/yoda/spml_yoda_putreq.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ *                    All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSHMEM_SPML_YODA_PUT_REQUEST_H +#define OSHMEM_SPML_YODA_PUT_REQUEST_H + +#include "ompi/mca/btl/btl.h" +#include "oshmem/mca/spml/base/base.h" +#include "oshmem/mca/spml/base/spml_base_putreq.h" +#include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/bml/bml.h" +#include "oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h" +#include "oshmem/mca/spml/yoda/spml_yoda.h" +#include "orte/runtime/orte_globals.h" + +BEGIN_C_DECLS + +struct mca_spml_yoda_put_request_t { + mca_spml_base_put_request_t req_put; + mca_spml_yoda_rdma_frag_t put_frag; +}; + +typedef struct mca_spml_yoda_put_request_t mca_spml_yoda_put_request_t; + +OBJ_CLASS_DECLARATION(mca_spml_yoda_put_request_t); + +static inline mca_spml_yoda_put_request_t *mca_spml_yoda_putreq_alloc(int dst) +{ + ompi_free_list_item_t *item; + int rc; + mca_spml_yoda_put_request_t *putreq; + + OMPI_FREE_LIST_WAIT(&mca_spml_base_put_requests, item, rc); + putreq = (mca_spml_yoda_put_request_t*)item; + assert(putreq); + putreq->req_put.req_base.req_free_called = false; + putreq->req_put.req_base.req_oshmem.req_complete = false; + + return putreq; +} + + +void mca_spml_yoda_put_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ); + + +END_C_DECLS + +#endif /* OSHMEM_SPML_YODA_PUT_REQUEST_H */ diff --git a/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h b/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h new file mode 100644 index 0000000000..4f98485a88 --- /dev/null +++ b/oshmem/mca/spml/yoda/spml_yoda_rdmafrag.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_SPML_YODA_RDMAFRAG_H +#define MCA_SPML_YODA_RDMAFRAG_H + +#include "ompi/mca/btl/btl.h" +#include "opal/types.h" +#include "opal/util/arch.h" +#include "oshmem/proc/proc.h" +#include "ompi/mca/btl/openib/btl_openib_endpoint.h" +#include "ompi/mca/btl/sm/btl_sm_frag.h" + + +BEGIN_C_DECLS + +typedef enum { + MCA_SPML_YODA_RDMA_PUT, + MCA_SPML_YODA_RDMA_GET +} mca_spml_yoda_rdma_state_t; + +typedef union mca_spml_yoda_segment_t { + mca_btl_base_segment_t base_seg; + mca_btl_sm_segment_t sm_seg; + mca_btl_openib_segment_t openib_seg; +} mca_spml_yoda_segment_t; + +struct mca_spml_yoda_rdma_frag_t { + mca_spml_yoda_segment_t rdma_segs[2]; + mca_btl_base_segment_t *btl_seg; /* save pointer to btl allocated descriptor segment */ + void *rdma_req; +}; + +typedef struct mca_spml_yoda_rdma_frag_t mca_spml_yoda_rdma_frag_t; +END_C_DECLS +#endif + diff --git a/oshmem/op/Makefile.am b/oshmem/op/Makefile.am new file mode 100644 index 0000000000..e663a4356c --- /dev/null +++ b/oshmem/op/Makefile.am @@ -0,0 +1,19 @@ +# +# Copyright (c) 2012 Mellanox Technologies, Inc. +# All rights reserved. +# # $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# +# This makefile.am does not stand on its own - it is included from oshmem/Makefile.am + + +headers += \ + op/op.h + +libshmem_la_SOURCES += \ + op/op.c + diff --git a/oshmem/op/op.c b/oshmem/op/op.c new file mode 100644 index 0000000000..2e636c7ce7 --- /dev/null +++ b/oshmem/op/op.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include + +#include "orte/runtime/orte_globals.h" + +#include "opal/datatype/opal_datatype_internal.h" +#include "opal/class/opal_pointer_array.h" + +#include "oshmem/constants.h" +#include "oshmem/op/op.h" + + +/* + * Table for op handle conversion + */ +opal_pointer_array_t oshmem_op_array; + + +/* + * Class information + */ +static void oshmem_op_construct(oshmem_op_t *object); +static void oshmem_op_destruct(oshmem_op_t *object); + + +/* + * Class instance + */ +OBJ_CLASS_INSTANCE(oshmem_op_t, opal_object_t, + oshmem_op_construct, oshmem_op_destruct); + + +/* + * Intrinsic Operation objects + */ +/* Bitwise AND */ +oshmem_op_t* oshmem_op_and_short = NULL; +oshmem_op_t* oshmem_op_and_int = NULL; +oshmem_op_t* oshmem_op_and_long = NULL; +oshmem_op_t* oshmem_op_and_longlong = NULL; +oshmem_op_t* oshmem_op_and_fint4 = NULL; +oshmem_op_t* oshmem_op_and_fint8 = NULL; + +/* Bitwise OR */ +oshmem_op_t* oshmem_op_or_short = NULL; +oshmem_op_t* oshmem_op_or_int = NULL; +oshmem_op_t* oshmem_op_or_long = NULL; +oshmem_op_t* oshmem_op_or_longlong = NULL; +oshmem_op_t* oshmem_op_or_fint4 = NULL; +oshmem_op_t* oshmem_op_or_fint8 = NULL; + +/* Bitwise XOR */ +oshmem_op_t* oshmem_op_xor_short = NULL; +oshmem_op_t* oshmem_op_xor_int = NULL; +oshmem_op_t* oshmem_op_xor_long = NULL; +oshmem_op_t* oshmem_op_xor_longlong = NULL; +oshmem_op_t* oshmem_op_xor_fint4 = NULL; +oshmem_op_t* oshmem_op_xor_fint8 = NULL; +//oshmem_op_t* oshmem_op_xor_fcomp4 = NULL; +//oshmem_op_t* oshmem_op_xor_fcomp8 = NULL; + +/* MAX */ +oshmem_op_t* oshmem_op_max_short = NULL; +oshmem_op_t* oshmem_op_max_int = NULL; +oshmem_op_t* oshmem_op_max_long = NULL; +oshmem_op_t* oshmem_op_max_longlong = NULL; +oshmem_op_t* oshmem_op_max_float = NULL; +oshmem_op_t* oshmem_op_max_double = NULL; +oshmem_op_t* oshmem_op_max_longdouble = NULL; +oshmem_op_t* oshmem_op_max_fint4 = NULL; +oshmem_op_t* 
oshmem_op_max_fint8 = NULL; +oshmem_op_t* oshmem_op_max_freal4 = NULL; +oshmem_op_t* oshmem_op_max_freal8 = NULL; +oshmem_op_t* oshmem_op_max_freal16 = NULL; + +/* MIN */ +oshmem_op_t* oshmem_op_min_short = NULL; +oshmem_op_t* oshmem_op_min_int = NULL; +oshmem_op_t* oshmem_op_min_long = NULL; +oshmem_op_t* oshmem_op_min_longlong = NULL; +oshmem_op_t* oshmem_op_min_float = NULL; +oshmem_op_t* oshmem_op_min_double = NULL; +oshmem_op_t* oshmem_op_min_longdouble = NULL; +oshmem_op_t* oshmem_op_min_fint4 = NULL; +oshmem_op_t* oshmem_op_min_fint8 = NULL; +oshmem_op_t* oshmem_op_min_freal4 = NULL; +oshmem_op_t* oshmem_op_min_freal8 = NULL; +oshmem_op_t* oshmem_op_min_freal16 = NULL; + +/* SUM */ +oshmem_op_t* oshmem_op_sum_short = NULL; +oshmem_op_t* oshmem_op_sum_int = NULL; +oshmem_op_t* oshmem_op_sum_long = NULL; +oshmem_op_t* oshmem_op_sum_longlong = NULL; +oshmem_op_t* oshmem_op_sum_float = NULL; +oshmem_op_t* oshmem_op_sum_double = NULL; +oshmem_op_t* oshmem_op_sum_longdouble = NULL; +oshmem_op_t* oshmem_op_sum_complexf = NULL; +oshmem_op_t* oshmem_op_sum_complexd = NULL; +oshmem_op_t* oshmem_op_sum_fint4 = NULL; +oshmem_op_t* oshmem_op_sum_fint8 = NULL; +oshmem_op_t* oshmem_op_sum_freal4 = NULL; +oshmem_op_t* oshmem_op_sum_freal8 = NULL; +oshmem_op_t* oshmem_op_sum_freal16 = NULL; + +/* PROD */ +oshmem_op_t* oshmem_op_prod_short = NULL; +oshmem_op_t* oshmem_op_prod_int = NULL; +oshmem_op_t* oshmem_op_prod_long = NULL; +oshmem_op_t* oshmem_op_prod_longlong = NULL; +oshmem_op_t* oshmem_op_prod_float = NULL; +oshmem_op_t* oshmem_op_prod_double = NULL; +oshmem_op_t* oshmem_op_prod_longdouble = NULL; +oshmem_op_t* oshmem_op_prod_complexf = NULL; +oshmem_op_t* oshmem_op_prod_complexd = NULL; +oshmem_op_t* oshmem_op_prod_fint4 = NULL; +oshmem_op_t* oshmem_op_prod_fint8 = NULL; +oshmem_op_t* oshmem_op_prod_freal4 = NULL; +oshmem_op_t* oshmem_op_prod_freal8 = NULL; +oshmem_op_t* oshmem_op_prod_freal16 = NULL; + + +#define FUNC_OP_CREATE(name, type_name, type, calc) \ + void 
oshmem_op_##name##_##type_name##_func(void *in, void *out, int count); \ + void oshmem_op_##name##_##type_name##_func(void *in, void *out, int count) \ + { \ + int i; \ + type *a = (type *) in; \ + type *b = (type *) out; \ + for (i = 0; i < count; ++i) { \ + *(b) = calc(*(b), *(a)); \ + ++b; \ + ++a; \ + } \ + } + +#define OBJ_OP_CREATE(name, type_name, type, op_id, dt_id) \ + oshmem_op_##name##_##type_name = OBJ_NEW(oshmem_op_t); \ + if (oshmem_op_##name##_##type_name) \ + { \ + oshmem_op_##name##_##type_name->op = op_id; \ + oshmem_op_##name##_##type_name->dt = dt_id; \ + oshmem_op_##name##_##type_name->dt_size = sizeof(type); \ + oshmem_op_##name##_##type_name->o_func.c_fn = oshmem_op_##name##_##type_name##_func; \ + } \ + +/* Bitwise AND */ +#define __and_op(a, b) ((a) & (b)) +FUNC_OP_CREATE(and, short, short, __and_op); +FUNC_OP_CREATE(and, int, int, __and_op); +FUNC_OP_CREATE(and, long, long, __and_op); +FUNC_OP_CREATE(and, longlong, long long, __and_op); +FUNC_OP_CREATE(and, fint4, ompi_fortran_integer4_t, __and_op); +FUNC_OP_CREATE(and, fint8, ompi_fortran_integer8_t, __and_op); + +/* Bitwise OR */ +#define __or_op(a, b) ((a) | (b)) +FUNC_OP_CREATE(or, short, short, __or_op); +FUNC_OP_CREATE(or, int, int, __or_op); +FUNC_OP_CREATE(or, long, long, __or_op); +FUNC_OP_CREATE(or, longlong, long long, __or_op); +FUNC_OP_CREATE(or, fint4, ompi_fortran_integer4_t, __or_op); +FUNC_OP_CREATE(or, fint8, ompi_fortran_integer8_t, __or_op); + +/* Bitwise XOR */ +#define __xor_op(a, b) ((a) ^ (b)) +FUNC_OP_CREATE(xor, short, short, __xor_op); +FUNC_OP_CREATE(xor, int, int, __xor_op); +FUNC_OP_CREATE(xor, long, long, __xor_op); +FUNC_OP_CREATE(xor, longlong, long long, __xor_op); +FUNC_OP_CREATE(xor, fint4, ompi_fortran_integer4_t, __xor_op); +FUNC_OP_CREATE(xor, fint8, ompi_fortran_integer8_t, __xor_op); +//FUNC_OP_CREATE(xor, fcomp4, ompi_fortran_complex4_t, __xor_op); +//FUNC_OP_CREATE(xor, fcomp8, ompi_fortran_complex8_t, __xor_op); + +/* MAX */ +#define __max_op(a, 
b) ((a) > (b) ? (a) : (b)) +FUNC_OP_CREATE(max, short, short, __max_op); +FUNC_OP_CREATE(max, int, int, __max_op); +FUNC_OP_CREATE(max, long, long, __max_op); +FUNC_OP_CREATE(max, longlong, long long, __max_op); +FUNC_OP_CREATE(max, float, float, __max_op); +FUNC_OP_CREATE(max, double, double, __max_op); +FUNC_OP_CREATE(max, longdouble, long double, __max_op); +FUNC_OP_CREATE(max, fint4, ompi_fortran_integer4_t, __max_op); +FUNC_OP_CREATE(max, fint8, ompi_fortran_integer8_t, __max_op); +FUNC_OP_CREATE(max, freal4, ompi_fortran_real4_t, __max_op); +FUNC_OP_CREATE(max, freal8, ompi_fortran_real8_t, __max_op); +FUNC_OP_CREATE(max, freal16, ompi_fortran_real16_t, __max_op); + +/* MIN */ +#define __min_op(a, b) ((a) < (b) ? (a) : (b)) +FUNC_OP_CREATE(min, short, short, __min_op); +FUNC_OP_CREATE(min, int, int, __min_op); +FUNC_OP_CREATE(min, long, long, __min_op); +FUNC_OP_CREATE(min, longlong, long long, __min_op); +FUNC_OP_CREATE(min, float, float, __min_op); +FUNC_OP_CREATE(min, double, double, __min_op); +FUNC_OP_CREATE(min, longdouble, long double, __min_op); +FUNC_OP_CREATE(min, fint4, ompi_fortran_integer4_t, __min_op); +FUNC_OP_CREATE(min, fint8, ompi_fortran_integer8_t, __min_op); +FUNC_OP_CREATE(min, freal4, ompi_fortran_real4_t, __min_op); +FUNC_OP_CREATE(min, freal8, ompi_fortran_real8_t, __min_op); +FUNC_OP_CREATE(min, freal16, ompi_fortran_real16_t, __min_op); + +/* SUM */ +#define __sum_op(a, b) ((a) + (b)) +FUNC_OP_CREATE(sum, short, short, __sum_op); +FUNC_OP_CREATE(sum, int, int, __sum_op); +FUNC_OP_CREATE(sum, long, long, __sum_op); +FUNC_OP_CREATE(sum, longlong, long long, __sum_op); +FUNC_OP_CREATE(sum, float, float, __sum_op); +FUNC_OP_CREATE(sum, double, double, __sum_op); +FUNC_OP_CREATE(sum, longdouble, long double, __sum_op); +FUNC_OP_CREATE(sum, complexf, float complex, __sum_op); +FUNC_OP_CREATE(sum, complexd, double complex, __sum_op); +FUNC_OP_CREATE(sum, fint4, ompi_fortran_integer4_t, __sum_op); +FUNC_OP_CREATE(sum, fint8, 
ompi_fortran_integer8_t, __sum_op); +FUNC_OP_CREATE(sum, freal4, ompi_fortran_real4_t, __sum_op); +FUNC_OP_CREATE(sum, freal8, ompi_fortran_real8_t, __sum_op); +FUNC_OP_CREATE(sum, freal16, ompi_fortran_real16_t, __sum_op); + +/* PROD */ +#define __prod_op(a, b) ((a) * (b)) +FUNC_OP_CREATE(prod, short, short, __prod_op); +FUNC_OP_CREATE(prod, int, int, __prod_op); +FUNC_OP_CREATE(prod, long, long, __prod_op); +FUNC_OP_CREATE(prod, longlong, long long, __prod_op); +FUNC_OP_CREATE(prod, float, float, __prod_op); +FUNC_OP_CREATE(prod, double, double, __prod_op); +FUNC_OP_CREATE(prod, longdouble, long double, __prod_op); +FUNC_OP_CREATE(prod, complexf, float complex, __prod_op); +FUNC_OP_CREATE(prod, complexd, double complex, __prod_op); +FUNC_OP_CREATE(prod, fint4, ompi_fortran_integer4_t, __prod_op); +FUNC_OP_CREATE(prod, fint8, ompi_fortran_integer8_t, __prod_op); +FUNC_OP_CREATE(prod, freal4, ompi_fortran_real4_t, __prod_op); +FUNC_OP_CREATE(prod, freal8, ompi_fortran_real8_t, __prod_op); +FUNC_OP_CREATE(prod, freal16, ompi_fortran_real16_t, __prod_op); + + +int oshmem_op_init(void) +{ + + /* Setup operation array */ + OBJ_CONSTRUCT(&oshmem_op_array, opal_pointer_array_t); + if( OPAL_SUCCESS != opal_pointer_array_init(&oshmem_op_array, 0, + ORTE_GLOBAL_ARRAY_MAX_SIZE, 1) ) { + return OSHMEM_ERROR; + } + + /* Bitwise AND */ + OBJ_OP_CREATE(and, short, short, OSHMEM_OP_AND, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(and, int, int, OSHMEM_OP_AND, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(and, long, long, OSHMEM_OP_AND, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(and, longlong, long long, OSHMEM_OP_AND, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(and, fint4, ompi_fortran_integer4_t, OSHMEM_OP_AND, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(and, fint8, ompi_fortran_integer8_t, OSHMEM_OP_AND, OSHMEM_OP_TYPE_FINT8); + + /* Bitwise OR */ + OBJ_OP_CREATE(or, short, short, OSHMEM_OP_OR, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(or, int, int, OSHMEM_OP_OR, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(or, long, 
long, OSHMEM_OP_OR, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(or, longlong, long long, OSHMEM_OP_OR, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(or, fint4, ompi_fortran_integer4_t, OSHMEM_OP_OR, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(or, fint8, ompi_fortran_integer8_t, OSHMEM_OP_OR, OSHMEM_OP_TYPE_FINT8); + + /* Bitwise XOR */ + OBJ_OP_CREATE(xor, short, short, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(xor, int, int, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(xor, long, long, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(xor, longlong, long long, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(xor, fint4, ompi_fortran_integer4_t, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(xor, fint8, ompi_fortran_integer8_t, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_FINT8); +// OBJ_OP_CREATE(xor, fcomp4, ompi_fortran_comp4_t, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_FCOMP4); +// OBJ_OP_CREATE(xor, fcomp8, ompi_fortran_comp8_t, OSHMEM_OP_XOR, OSHMEM_OP_TYPE_FCOMP8); + + /* MAX */ + OBJ_OP_CREATE(max, short, short, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(max, int, int, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(max, long, long, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(max, longlong, long long, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(max, float, float, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FLOAT); + OBJ_OP_CREATE(max, double, double, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_DOUBLE); + OBJ_OP_CREATE(max, longdouble, long double, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_LDOUBLE); + OBJ_OP_CREATE(max, fint4, ompi_fortran_integer4_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(max, fint8, ompi_fortran_integer8_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FINT8); + OBJ_OP_CREATE(max, freal4, ompi_fortran_real4_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FREAL4); + OBJ_OP_CREATE(max, freal8, ompi_fortran_real8_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FREAL8); + OBJ_OP_CREATE(max, freal16, ompi_fortran_real16_t, OSHMEM_OP_MAX, OSHMEM_OP_TYPE_FREAL16); + + /* MIN */ + 
OBJ_OP_CREATE(min, short, short, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(min, int, int, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(min, long, long, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(min, longlong, long long, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(min, float, float, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FLOAT); + OBJ_OP_CREATE(min, double, double, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_DOUBLE); + OBJ_OP_CREATE(min, longdouble, long double, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_LDOUBLE); + OBJ_OP_CREATE(min, fint4, ompi_fortran_integer4_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(min, fint8, ompi_fortran_integer8_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FINT8); + OBJ_OP_CREATE(min, freal4, ompi_fortran_real4_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FREAL4); + OBJ_OP_CREATE(min, freal8, ompi_fortran_real8_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FREAL8); + OBJ_OP_CREATE(min, freal16, ompi_fortran_real16_t, OSHMEM_OP_MIN, OSHMEM_OP_TYPE_FREAL16); + + /* SUM */ + OBJ_OP_CREATE(sum, short, short, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(sum, int, int, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(sum, long, long, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(sum, longlong, long long, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(sum, float, float, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FLOAT); + OBJ_OP_CREATE(sum, double, double, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_DOUBLE); + OBJ_OP_CREATE(sum, longdouble, long double, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_LDOUBLE); + OBJ_OP_CREATE(sum, complexf, float complex, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FCOMPLEX); + OBJ_OP_CREATE(sum, complexd, double complex, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_DCOMPLEX); + OBJ_OP_CREATE(sum, fint4, ompi_fortran_integer4_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(sum, fint8, ompi_fortran_integer8_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FINT8); + OBJ_OP_CREATE(sum, freal4, ompi_fortran_real4_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FREAL4); + OBJ_OP_CREATE(sum, freal8, 
ompi_fortran_real8_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FREAL8); + OBJ_OP_CREATE(sum, freal16, ompi_fortran_real16_t, OSHMEM_OP_SUM, OSHMEM_OP_TYPE_FREAL16); + + /* PROD */ + OBJ_OP_CREATE(prod, short, short, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_SHORT); + OBJ_OP_CREATE(prod, int, int, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_INT); + OBJ_OP_CREATE(prod, long, long, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_LONG); + OBJ_OP_CREATE(prod, longlong, long long, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_LLONG); + OBJ_OP_CREATE(prod, float, float, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FLOAT); + OBJ_OP_CREATE(prod, double, double, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_DOUBLE); + OBJ_OP_CREATE(prod, longdouble, long double, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_LDOUBLE); + OBJ_OP_CREATE(prod, complexf, float complex, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FCOMPLEX); + OBJ_OP_CREATE(prod, complexd, double complex, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_DCOMPLEX); + OBJ_OP_CREATE(prod, fint4, ompi_fortran_integer4_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FINT4); + OBJ_OP_CREATE(prod, fint8, ompi_fortran_integer8_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FINT8); + OBJ_OP_CREATE(prod, freal4, ompi_fortran_real4_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FREAL4); + OBJ_OP_CREATE(prod, freal8, ompi_fortran_real8_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FREAL8); + OBJ_OP_CREATE(prod, freal16, ompi_fortran_real16_t, OSHMEM_OP_PROD, OSHMEM_OP_TYPE_FREAL16); + + return OSHMEM_SUCCESS; +} + + +int oshmem_op_finalize(void) +{ + int max, i; + oshmem_op_t *op; + + /* Check whether we have some left */ + max = opal_pointer_array_get_size(&oshmem_op_array); + for ( i = 0; i < max; i++ ) + { + op = (oshmem_op_t *)opal_pointer_array_get_item(&oshmem_op_array, i); + if ( NULL != op ) + { + OBJ_RELEASE(op); + } + } + + OBJ_DESTRUCT (&oshmem_op_array); + + return OSHMEM_SUCCESS; +} + + +/************************************************************************** + * + * Static functions + * + **************************************************************************/ + + +/* + * Op constructor + */ 
+static void oshmem_op_construct(oshmem_op_t *object) +{ + object->id = opal_pointer_array_add(&oshmem_op_array, object); +} + + +/* + * Op destructor + */ +static void oshmem_op_destruct(oshmem_op_t *object) +{ + if (NULL != opal_pointer_array_get_item(&oshmem_op_array, object->id)) + { + opal_pointer_array_set_item(&oshmem_op_array, object->id, NULL); + } +} diff --git a/oshmem/op/op.h b/oshmem/op/op.h new file mode 100644 index 0000000000..1e25a80f79 --- /dev/null +++ b/oshmem/op/op.h @@ -0,0 +1,206 @@ +/* +* Copyright (c) 2012 Mellanox Technologies, Inc. +* All rights reserved. +* $COPYRIGHT$ +* +* Additional copyrights may follow +* +* $HEADER$ +*/ +#ifndef OSHMEM_OP_H +#define OSHMEM_OP_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "oshmem/mca/scoll/scoll.h" + +#include "opal/class/opal_list.h" +#include "opal/dss/dss_types.h" + +#include "orte/types.h" + + +BEGIN_C_DECLS + +/* ******************************************************************** */ + +/** + * Corresponding to the types that we can reduce over. 
+ */
+enum {
+    OSHMEM_OP_TYPE_SHORT,     /** C integer: short */
+    OSHMEM_OP_TYPE_INT,       /** C integer: int */
+    OSHMEM_OP_TYPE_LONG,      /** C integer: long */
+    OSHMEM_OP_TYPE_LLONG,     /** C integer: long long */
+
+    OSHMEM_OP_TYPE_FLOAT,     /** Floating point: float */
+    OSHMEM_OP_TYPE_DOUBLE,    /** Floating point: double */
+    OSHMEM_OP_TYPE_LDOUBLE,   /** Floating point: long double */
+
+    OSHMEM_OP_TYPE_FCOMPLEX,  /** Complex: float */
+    OSHMEM_OP_TYPE_DCOMPLEX,  /** Complex: double */
+
+    OSHMEM_OP_TYPE_FINT4,     /** Fortran integer: int4 */
+    OSHMEM_OP_TYPE_FINT8,     /** Fortran integer: int8 */
+    OSHMEM_OP_TYPE_FREAL4,    /** Fortran real: real4 */
+    OSHMEM_OP_TYPE_FREAL8,    /** Fortran real: real8 */
+    OSHMEM_OP_TYPE_FREAL16,   /** Fortran real: real16 */
+//    OSHMEM_OP_TYPE_FCOMP4,    /** Fortran complex4 */
+//    OSHMEM_OP_TYPE_FCOMP8,    /** Fortran complex8 */
+
+    /** Maximum type */
+    OSHMEM_OP_TYPE_NUMBER
+};
+
+
+/**
+ * Supported reduce operations.
+ */
+enum {
+    OSHMEM_OP_AND,   /** AND */
+    OSHMEM_OP_OR,    /** OR */
+    OSHMEM_OP_XOR,   /** XOR */
+    OSHMEM_OP_MAX,   /** MAX */
+    OSHMEM_OP_MIN,   /** MIN */
+    OSHMEM_OP_SUM,   /** SUM */
+    OSHMEM_OP_PROD,  /** PROD */
+
+    /** Maximum operation */
+    OSHMEM_OP_NUMBER
+};
+
+
+/**
+ * Signature of a C reduction kernel: (input buffer, in/out buffer,
+ * element count).
+ */
+typedef void (oshmem_op_c_handler_fn_t)(void *, void *, int);
+
+typedef struct oshmem_op_t oshmem_op_t;
+
+/**
+ * Back-end type of OSHMEM reduction operations
+ */
+struct oshmem_op_t {
+    opal_object_t base;
+    int id;             /**< index in global array */
+    int op;             /**< operation type */
+    int dt;             /**< datatype */
+    size_t dt_size;     /**< datatype size */
+    union {
+        /** C handler function pointer */
+        oshmem_op_c_handler_fn_t *c_fn;
+    } o_func;
+};
+OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(oshmem_op_t);
+
+/* Bitwise AND */
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_short;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_int;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_long;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_longlong;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_fint4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_and_fint8;
+
+/* Bitwise OR */
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_short;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_int;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_long;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_longlong;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_fint4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_or_fint8;
+
+/* Bitwise XOR */
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_short;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_int;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_long;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_longlong;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_fint4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_fint8;
+//OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_fcomp4;
+//OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_xor_fcomp8;
+
+/* MAX */
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_short;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_int;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_long;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_longlong;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_float;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_double;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_longdouble;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_fint4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_fint8;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_freal4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_freal8;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_max_freal16;
+
+/* MIN */
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_short;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_int;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_long;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_longlong;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_float;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_double;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_longdouble;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_fint4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_fint8;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_freal4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_freal8;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_min_freal16;
+
+/* SUM */
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_short;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_int;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_long;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_longlong;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_float;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_double;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_longdouble;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_complexf;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_complexd;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_fint4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_fint8;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_freal4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_freal8;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_sum_freal16;
+
+
+/* PROD */
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_short;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_int;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_long;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_longlong;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_float;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_double;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_longdouble;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_complexf;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_complexd;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_fint4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_fint8;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_freal4;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_freal8;
+OSHMEM_DECLSPEC extern oshmem_op_t* oshmem_op_prod_freal16;
+
+/**
+ * Initialize the op interface.
+ *
+ * @returns OSHMEM_SUCCESS Upon success
+ * @returns OSHMEM_ERROR Otherwise
+ *
+ * Invoked from oshmem_shmem_init(); sets up the op interface, creates
+ * the predefined operations.
+ */
+int oshmem_op_init(void);
+
+
+/**
+ * Finalize the op interface.
+ *
+ * @returns OSHMEM_SUCCESS Always
+ *
+ * Invoked from oshmem_shmem_finalize(); tears down the op interface.
+ */
+int oshmem_op_finalize(void);
+
+
+END_C_DECLS
+
+#endif /* OSHMEM_OP_H */
diff --git a/oshmem/proc/Makefile.am b/oshmem/proc/Makefile.am
new file mode 100644
index 0000000000..2331e4f629
--- /dev/null
+++ b/oshmem/proc/Makefile.am
@@ -0,0 +1,21 @@
+#
+# Copyright (c) 2012 Mellanox Technologies, Inc.
+# All rights reserved.
+# # $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+#
+# This makefile.am does not stand on its own - it is included from oshmem/Makefile.am
+
+
+headers += \
+    proc/proc.h \
+    proc/proc_group_cache.h
+
+libshmem_la_SOURCES += \
+    proc/proc.c \
+    proc/proc_group_cache.c
+
diff --git a/oshmem/proc/proc.c b/oshmem/proc/proc.c
new file mode 100644
index 0000000000..c1b453bab0
--- /dev/null
+++ b/oshmem/proc/proc.c
@@ -0,0 +1,785 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc.
+ * All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" +#include "oshmem/proc/proc.h" +#include "oshmem/constants.h" +#include "oshmem/runtime/runtime.h" +#include "oshmem/mca/scoll/base/base.h" + +#include "orte/util/proc_info.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/util/proc_info.h" +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/db/db_types.h" + +#include "opal/datatype/opal_convertor.h" +#include "opal/threads/mutex.h" +#include "opal/dss/dss.h" +#include "opal/util/arch.h" +#include "opal/class/opal_list.h" + +#include "ompi/communicator/communicator.h" /*TODO: ompi_communicator_t */ +#include "ompi/patterns/comm/coll_ops.h" /*TODO: comm_bcast_pml */ + + +opal_convertor_t* oshmem_shmem_local_convertor = NULL; + +opal_list_t oshmem_proc_list; +static opal_mutex_t oshmem_proc_lock; +oshmem_proc_t* oshmem_proc_local_proc = NULL; + +static void oshmem_proc_construct(oshmem_proc_t* proc); +static void oshmem_proc_destruct(oshmem_proc_t* proc); + + +OBJ_CLASS_INSTANCE( + oshmem_proc_t, + opal_list_item_t, + oshmem_proc_construct, + oshmem_proc_destruct +); + + +void oshmem_proc_construct(oshmem_proc_t* proc) +{ + proc->proc_bml = NULL; + proc->proc_pml = NULL; + + /* By default all processors are supposedly having the same architecture as me. Thus, + * by default we run in a homogeneous environment. Later, when the RTE can tell us + * the arch of the remote nodes, we will have to set the convertors to the correct + * architecture. 
+ */ + proc->proc_arch = opal_local_arch; + proc->proc_convertor = oshmem_shmem_local_convertor; + OBJ_RETAIN( oshmem_shmem_local_convertor ); + + proc->proc_flags = 0; + proc->num_transports = 0; + + /* initialize this pointer to NULL */ + proc->proc_hostname = NULL; +} + + +void oshmem_proc_destruct(oshmem_proc_t* proc) +{ + /* As all the convertors are created with OBJ_NEW we can just call OBJ_RELEASE. All, except + * the local convertor, will get destroyed at some point here. If the reference count is correct + * the local convertor (who has the reference count increased in the datatype) will not get + * destroyed here. It will be destroyed later when the ompi_datatype_finalize is called. + */ + OBJ_RELEASE( proc->proc_convertor ); + + /* DO NOT FREE THE HOSTNAME FIELD AS THIS POINTS + * TO AN AREA ALLOCATED/FREE'D ELSEWHERE + */ + OPAL_THREAD_LOCK(&oshmem_proc_lock); + opal_list_remove_item(&oshmem_proc_list, (opal_list_item_t*)proc); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); +} + + +int oshmem_proc_init(void) +{ + orte_vpid_t i; + + OBJ_CONSTRUCT(&oshmem_proc_list, opal_list_t); + OBJ_CONSTRUCT(&oshmem_proc_lock, opal_mutex_t); + oshmem_shmem_local_convertor = opal_convertor_create( opal_local_arch, 0 ); + + size_t ompi_num_procs; + ompi_proc_t **ompi_procs = ompi_proc_world(&ompi_num_procs); + /* create proc structures and find self */ + for( i = 0; i < orte_process_info.num_procs; i++ ) { + oshmem_proc_t *proc = OBJ_NEW(oshmem_proc_t); + opal_list_append(&oshmem_proc_list, (opal_list_item_t*)proc); + + proc->proc_name.jobid = ompi_procs[i]->proc_name.jobid; + proc->proc_name.vpid = ompi_procs[i]->proc_name.vpid; + proc->proc_arch = ompi_procs[i]->proc_arch; + proc->proc_flags = ompi_procs[i]->proc_flags; + proc->proc_hostname = ompi_procs[i]->proc_hostname; + + if (i == ORTE_PROC_MY_NAME->vpid) { + oshmem_proc_local_proc = proc; + } + } + + if (ompi_procs) + free(ompi_procs); + + return OSHMEM_SUCCESS; +} + + +/* in some cases, all PE procs are required to 
do a modex so they + * can (at the least) exchange their architecture. Since we cannot + * know in advance if this was required, we provide a separate function + * to set the arch (instead of doing it inside of oshmem_proc_init) that + * can be called after the modex completes in oshmem_shmem_init. Thus, we + * know that - regardless of how the arch is known, whether via modex + * or dropped in from a local daemon - the arch can be set correctly + * at this time + */ +int oshmem_proc_set_arch(void) +{ + oshmem_proc_t *proc = NULL; + opal_list_item_t *item = NULL; + int ret = OSHMEM_SUCCESS; + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + + for( item = opal_list_get_first(&oshmem_proc_list); + item != opal_list_get_end(&oshmem_proc_list); + item = opal_list_get_next(item)) { + proc = (oshmem_proc_t*)item; + + if (proc->proc_name.vpid != ORTE_PROC_MY_NAME->vpid) { + // if arch is different than mine, create a new convertor for this proc + if (proc->proc_arch != opal_local_arch) { +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + OBJ_RELEASE(proc->proc_convertor); + proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0); +#else + orte_show_help("help-shmem-runtime.txt", + "heterogeneous-support-unavailable", + true, orte_process_info.nodename, + proc->proc_hostname == NULL ? "" : + proc->proc_hostname); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return OSHMEM_ERR_NOT_SUPPORTED; +#endif + } + } + } + + /* Set predefined groups */ + ret = oshmem_proc_group_init(); + + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + return ret; +} + + +int oshmem_proc_finalize (void) +{ + opal_list_item_t *item; + + /* Destroy all groups */ + oshmem_proc_group_finalize(); + + /* remove all items from list and destroy them. Since we cannot know + * the reference count of the procs for certain, it is possible that + * a single OBJ_RELEASE won't drive the count to zero, and hence will + * not release the memory. Accordingly, we cycle through the list here, + * calling release on each item. 
+ * + * This will cycle until it forces the reference count of each item + * to zero, thus causing the destructor to run - which will remove + * the item from the list! + * + * We cannot do this under the thread lock as the destructor will + * call it when removing the item from the list. However, this function + * is ONLY called from MPI_Finalize, and all threads are prohibited from + * calling an MPI function once ANY thread has called MPI_Finalize. Of + * course, multiple threads are allowed to call MPI_Finalize, so this + * function may get called multiple times by various threads. We believe + * it is thread safe to do so...though it may not -appear- to be so + * without walking through the entire list/destructor sequence. + */ + while (opal_list_get_end(&oshmem_proc_list) != (item = opal_list_get_first(&oshmem_proc_list))) { + OBJ_RELEASE(item); + } + OBJ_RELEASE( oshmem_shmem_local_convertor ); + /* now destruct the list and thread lock */ + OBJ_DESTRUCT(&oshmem_proc_list); + OBJ_DESTRUCT(&oshmem_proc_lock); + + return OSHMEM_SUCCESS; +} + +oshmem_proc_t** oshmem_proc_world(size_t *size) +{ + oshmem_proc_t **procs; + oshmem_proc_t *proc; + size_t count = 0; + orte_ns_cmp_bitmask_t mask; + orte_process_name_t my_name; + + /* check bozo case */ + if (NULL == oshmem_proc_local_proc) { + return NULL; + } + mask = ORTE_NS_CMP_JOBID; + my_name = oshmem_proc_local_proc->proc_name; + + /* First count how many match this jobid */ + OPAL_THREAD_LOCK(&oshmem_proc_lock); + for (proc = (oshmem_proc_t*)opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*)opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*)opal_list_get_next(proc)) { + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proc->proc_name, &my_name)) { + ++count; + } + } + + /* allocate an array */ + procs = (oshmem_proc_t**) malloc(count * sizeof(oshmem_proc_t*)); + if (NULL == procs) { + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return NULL; + } + + /* now save only the procs that 
match this jobid */ + count = 0; + for (proc = (oshmem_proc_t*)opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*)opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*)opal_list_get_next(proc)) { + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proc->proc_name, &my_name)) { + /* DO NOT RETAIN THIS OBJECT - the reference count on this + * object will be adjusted by external callers. The intent + * here is to allow the reference count to drop to zero if + * the app no longer desires to communicate with this proc. + * For example, the proc may call comm_disconnect on all + * communicators involving this proc. In such cases, we want + * the proc object to be removed from the list. By not incrementing + * the reference count here, we allow this to occur. + * + * We don't implement that yet, but we are still safe for now as + * the OBJ_NEW in oshmem_proc_init owns the initial reference + * count which cannot be released until oshmem_proc_finalize is + * called. + */ + procs[count++] = proc; + } + } + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + *size = count; + return procs; +} + + +oshmem_proc_t** oshmem_proc_all(size_t* size) +{ + oshmem_proc_t **procs = + (oshmem_proc_t**) malloc(opal_list_get_size(&oshmem_proc_list) * sizeof(oshmem_proc_t*)); + oshmem_proc_t *proc; + size_t count = 0; + + if (NULL == procs) { + return NULL; + } + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + for(proc = (oshmem_proc_t*)opal_list_get_first(&oshmem_proc_list); + ((proc != (oshmem_proc_t*)opal_list_get_end(&oshmem_proc_list)) && (proc != NULL)); + proc = (oshmem_proc_t*)opal_list_get_next(proc)) { + /* We know this isn't consistent with the behavior in oshmem_proc_world, + * but we are leaving the RETAIN for now because the code using this function + * assumes that the results need to be released when done. 
It will + * be cleaned up later as the "fix" will impact other places in + * the code + */ + OBJ_RETAIN(proc); + procs[count++] = proc; + } + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + *size = count; + + return procs; +} + + +oshmem_proc_t** oshmem_proc_self(size_t* size) +{ + oshmem_proc_t **procs = (oshmem_proc_t**) malloc(sizeof(oshmem_proc_t*)); + if (NULL == procs) { + return NULL; + } + /* We know this isn't consistent with the behavior in oshmem_proc_world, + * but we are leaving the RETAIN for now because the code using this function + * assumes that the results need to be released when done. It will + * be cleaned up later as the "fix" will impact other places in + * the code + */ + OBJ_RETAIN(oshmem_proc_local_proc); + + *procs = oshmem_proc_local_proc; + *size = 1; + return procs; +} + +oshmem_proc_t * oshmem_proc_find ( const orte_process_name_t * name ) +{ + oshmem_proc_t *proc, *rproc=NULL; + orte_ns_cmp_bitmask_t mask; + + /* return the proc-struct which matches this jobid+process id */ + mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; + OPAL_THREAD_LOCK(&oshmem_proc_lock); + for(proc = (oshmem_proc_t*)opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*)opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*)opal_list_get_next(proc)) { + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proc->proc_name, name)) { + rproc = proc; + break; + } + } + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + return rproc; +} + + +int oshmem_proc_refresh(void) { + oshmem_proc_t *proc = NULL; + opal_list_item_t *item = NULL; + orte_vpid_t i = 0; + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + + int* ranks_in_comm; + ranks_in_comm = (int *) malloc(orte_process_info.num_procs * sizeof(int)); + for (i = 0; i < orte_process_info.num_procs; ++i) { + ranks_in_comm[i] = i; + } + + + for( item = opal_list_get_first(&oshmem_proc_list), i = 0; + item != opal_list_get_end(&oshmem_proc_list); + item = opal_list_get_next(item), ++i ) { + proc = (oshmem_proc_t*)item; 
+ + /* Does not change: proc->proc_name.vpid */ + proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; + + /* Make sure to clear the local flag before we set it below */ + proc->proc_flags = 0; + + proc->proc_arch = opal_local_arch; + + comm_bcast_pml(&proc->proc_arch, i, sizeof(uint32_t), + MPI_BYTE, ORTE_PROC_MY_NAME->vpid, orte_process_info.num_procs, + ranks_in_comm, (ompi_communicator_t *)&ompi_mpi_comm_world); + + int hostname_length = strlen(orte_process_info.nodename); + comm_bcast_pml(&hostname_length, i, sizeof(int), + MPI_BYTE, ORTE_PROC_MY_NAME->vpid, orte_process_info.num_procs, + ranks_in_comm, (ompi_communicator_t *)&ompi_mpi_comm_world); + + if (proc->proc_hostname) + free(proc->proc_hostname); + + proc->proc_hostname = (i == ORTE_PROC_MY_NAME->vpid ? + strdup(orte_process_info.nodename) : (char *) malloc (hostname_length)); + + comm_bcast_pml(proc->proc_hostname, i, hostname_length, + MPI_BYTE, ORTE_PROC_MY_NAME->vpid, orte_process_info.num_procs, + ranks_in_comm, (ompi_communicator_t *)&ompi_mpi_comm_world); + + if (i == ORTE_PROC_MY_NAME->vpid) { + oshmem_proc_local_proc = proc; + } else { + /* if arch is different than mine, create a new convertor for this proc */ + if (proc->proc_arch != opal_local_arch) { +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + OBJ_RELEASE(proc->proc_convertor); + proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0); +#else + orte_show_help("help-shmem-runtime.txt", + "heterogeneous-support-unavailable", + true, orte_process_info.nodename, + proc->proc_hostname == NULL ? 
"" : + proc->proc_hostname); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return OSHMEM_ERR_NOT_SUPPORTED; +#endif + } + } + } + + if (ranks_in_comm) + free(ranks_in_comm); + + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + return OSHMEM_SUCCESS; +} + +int +oshmem_proc_pack(oshmem_proc_t **proclist, int proclistsize, opal_buffer_t* buf) +{ + int i, rc; + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + + /* cycle through the provided array, packing the OSHMEM level + * data for each proc. This data may or may not be included + * in any subsequent modex operation, so we include it here + * to ensure completion of a connect/accept handshake. See + * the ompi/mca/dpm framework for an example of where and how + * this info is used. + * + * Eventually, we will review the procedures that call this + * function to see if duplication of communication can be + * reduced. For now, just go ahead and pack the info so it + * can be sent. + */ + for (i=0; iproc_name), 1, ORTE_NAME); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return rc; + } + rc = opal_dss.pack(buf, &(proclist[i]->proc_arch), 1, OPAL_UINT32); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return rc; + } + rc = opal_dss.pack(buf, &(proclist[i]->proc_hostname), 1, OPAL_STRING); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return rc; + } + } + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return OSHMEM_SUCCESS; +} + +static oshmem_proc_t * +oshmem_proc_find_and_add(const orte_process_name_t * name, bool* isnew) +{ + oshmem_proc_t *proc, *rproc = NULL; + orte_ns_cmp_bitmask_t mask; + + /* return the proc-struct which matches this jobid+process id */ + mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; + OPAL_THREAD_LOCK(&oshmem_proc_lock); + for(proc = (oshmem_proc_t*)opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*)opal_list_get_end(&oshmem_proc_list); + proc = 
(oshmem_proc_t*)opal_list_get_next(proc)) { + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proc->proc_name, name)) { + rproc = proc; + *isnew = false; + break; + } + } + + /* if we didn't find this proc in the list, create a new + * proc_t and append it to the list + */ + if (NULL == rproc) { + *isnew = true; + rproc = OBJ_NEW(oshmem_proc_t); + if (NULL != rproc) { + opal_list_append(&oshmem_proc_list, (opal_list_item_t*)rproc); + rproc->proc_name = *name; + } + /* caller had better fill in the rest of the proc, or there's + going to be pain later... */ + } + + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + + return rproc; +} + + +int +oshmem_proc_unpack(opal_buffer_t* buf, + int proclistsize, oshmem_proc_t ***proclist, + int *newproclistsize, oshmem_proc_t ***newproclist) +{ + int i; + size_t newprocs_len = 0; + oshmem_proc_t **plist=NULL, **newprocs = NULL; + + /* do not free plist *ever*, since it is used in the remote group + structure of a communicator */ + plist = (oshmem_proc_t **) calloc (proclistsize, sizeof (oshmem_proc_t *)); + if ( NULL == plist ) { + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + /* free this on the way out */ + newprocs = (oshmem_proc_t **) calloc (proclistsize, sizeof (oshmem_proc_t *)); + if (NULL == newprocs) { + free(plist); + return OSHMEM_ERR_OUT_OF_RESOURCE; + } + + /* cycle through the array of provided procs and unpack + * their info - as packed by oshmem_proc_pack + */ + for ( i=0; iproc_arch = new_arch; + /* if arch is different than mine, create a new convertor for this proc */ + if (plist[i]->proc_arch != opal_local_arch) { +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + OBJ_RELEASE(plist[i]->proc_convertor); + plist[i]->proc_convertor = opal_convertor_create(plist[i]->proc_arch, 0); +#else + orte_show_help("help-shmem-runtime.txt", + "heterogeneous-support-unavailable", + true, orte_process_info.nodename, + new_hostname == NULL ? 
"" : + new_hostname); + free(plist); + free(newprocs); + return OSHMEM_ERR_NOT_SUPPORTED; +#endif + } + if (0 == strcmp(oshmem_proc_local_proc->proc_hostname,new_hostname)) { + plist[i]->proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER); + } + + /* Save the hostname */ + plist[i]->proc_hostname = new_hostname; + + /* eventually, we will update the orte/mca/ess framework's data + * to contain the info for the new proc. For now, we ignore + * this step since the MPI layer already has all the info + * it requires + */ + } + } + + if (NULL != newproclistsize) *newproclistsize = newprocs_len; + if (NULL != newproclist) { + *newproclist = newprocs; + } else if (newprocs != NULL) { + free(newprocs); + } + + *proclist = plist; + return OSHMEM_SUCCESS; +} + + +opal_pointer_array_t oshmem_group_array; + +oshmem_group_t* oshmem_group_all = NULL; +oshmem_group_t* oshmem_group_self = NULL; +oshmem_group_t* oshmem_group_null = NULL; + + +OBJ_CLASS_INSTANCE(oshmem_group_t, opal_object_t, NULL, NULL); + + +OSHMEM_DECLSPEC int oshmem_proc_group_init(void) +{ + + /* Setup communicator array */ + OBJ_CONSTRUCT(&oshmem_group_array, opal_pointer_array_t); + if( OPAL_SUCCESS != opal_pointer_array_init(&oshmem_group_array, 0, + ORTE_GLOBAL_ARRAY_MAX_SIZE, 1) ) { + return OSHMEM_ERROR; + } + + /* Setup SHMEM_GROUP_ALL */ + if (NULL == (oshmem_group_all = oshmem_proc_group_create(0, 1, opal_list_get_size(&oshmem_proc_list)))) { + oshmem_proc_group_destroy(oshmem_group_all); + return OSHMEM_ERROR; + } + + /* Setup SHMEM_GROUP_SELF */ + if (NULL == (oshmem_group_self = oshmem_proc_group_create(oshmem_proc_local()->proc_name.vpid, 0, 1))) { + oshmem_proc_group_destroy(oshmem_group_self); + return OSHMEM_ERROR; + } + + /* Setup SHMEM_GROUP_NULL */ + oshmem_group_null = NULL; + + return OSHMEM_SUCCESS; +} + + +OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void) +{ + int max, i; + oshmem_group_t *group; + + /* Check whether we have some left */ + max = 
opal_pointer_array_get_size(&oshmem_group_array); + for ( i = 0; i < max; i++ ) + { + group = (oshmem_group_t *)opal_pointer_array_get_item(&oshmem_group_array, i); + if ( NULL != group ) + { + /* Group has not been freed before finalize */ + oshmem_proc_group_destroy(group); + } + } + + OBJ_DESTRUCT (&oshmem_group_array); + + return OSHMEM_SUCCESS; +} + + +OSHMEM_DECLSPEC oshmem_group_t* oshmem_proc_group_create(int pe_start, int pe_stride, size_t pe_size) +{ + oshmem_group_t* group = NULL; + + group = OBJ_NEW(oshmem_group_t); + + if (group) + { + int cur_pe = 0; + int count_pe = 0; + oshmem_proc_t** proc_array = NULL; + oshmem_proc_t* proc = NULL; + + OPAL_THREAD_LOCK(&oshmem_proc_lock); + + + /* allocate the member array; NOTE(review): on failure NULL is returned without releasing 'group' */ + proc_array = (oshmem_proc_t**) malloc(pe_size * sizeof(oshmem_proc_t*)); + if (NULL == proc_array) { + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return NULL; + } + + group->my_pe = oshmem_proc_local()->proc_name.vpid; + group->is_member = 0; + /* walk the proc list by position: starting at pe_start keep every pe_stride-th entry (every entry when pe_stride is 0) until pe_size procs are collected */ + for (proc = (oshmem_proc_t*)opal_list_get_first(&oshmem_proc_list); + proc != (oshmem_proc_t*)opal_list_get_end(&oshmem_proc_list); + proc = (oshmem_proc_t*)opal_list_get_next(proc)) { + if (count_pe >= (int)pe_size) + { + break; + } + else if ((cur_pe >= pe_start) && ((pe_stride == 0) || (((cur_pe - pe_start) % pe_stride) == 0))) + { + proc_array[count_pe++] = proc; + if (oshmem_proc_pe(proc) == group->my_pe) + group->is_member = 1; + } + cur_pe++; + } + group->proc_array = proc_array; + group->proc_count = (int)count_pe; + + /* Prepare peers list: one orte_namelist_t (jobid + vpid) per selected proc */ + OBJ_CONSTRUCT(&(group->peer_list), opal_list_t); + { + int i = 0; + orte_namelist_t *peer = NULL; + + for (i = 0; i < group->proc_count; i++) + { + peer = OBJ_NEW(orte_namelist_t); + peer->name.jobid = group->proc_array[i]->proc_name.jobid; + peer->name.vpid = group->proc_array[i]->proc_name.vpid; + opal_list_append(&(group->peer_list), &peer->super); + } + } + group->id = 
opal_pointer_array_add(&oshmem_group_array, group); + + memset(&group->g_scoll, 0, sizeof(mca_scoll_base_group_scoll_t)); + + if (OSHMEM_SUCCESS != mca_scoll_base_select(group)){ + opal_output(0,"Error: No collective modules are available: group is not created, returning NULL"); + oshmem_proc_group_destroy(group); + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + return NULL; + } + OPAL_THREAD_UNLOCK(&oshmem_proc_lock); + } + + return group; +} + + +OSHMEM_DECLSPEC void oshmem_proc_group_destroy(oshmem_group_t* group) +{ + if (group) + { + mca_scoll_base_group_unselect(group); + + /* Destroy proc array */ + if (group->proc_array) + { + free(group->proc_array); + } + + /* Destroy peer list */ + { + opal_list_item_t *item; + + while (NULL != (item = opal_list_remove_first(&(group->peer_list)))) + { + /* destruct the item (we constructed it), then free the memory chunk */ + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&(group->peer_list)); + } + + /* reset the oshmem_group_array entry - make sure that the + * entry is in the table */ + if (NULL != opal_pointer_array_get_item(&oshmem_group_array, group->id)) + { + opal_pointer_array_set_item(&oshmem_group_array, group->id, NULL); + } + + OBJ_RELEASE(group); + } +} diff --git a/oshmem/proc/proc.h b/oshmem/proc/proc.h new file mode 100644 index 0000000000..0506c92ac6 --- /dev/null +++ b/oshmem/proc/proc.h @@ -0,0 +1,505 @@ +/* +* Copyright (c) 2012 Mellanox Technologies, Inc. +* All rights reserved. 
+* $COPYRIGHT$ +* +* Additional copyrights may follow +* +* $HEADER$ +*/ +#ifndef OSHMEM_PROC_PROC_H +#define OSHMEM_PROC_PROC_H + +#include "oshmem_config.h" +#include "oshmem/types.h" +#include "oshmem/constants.h" + +#include "oshmem/mca/scoll/scoll.h" + + +#include "opal/class/opal_list.h" +#include "opal/dss/dss_types.h" +#include "opal/mca/hwloc/hwloc.h" + +#include "orte/types.h" +#include "orte/runtime/orte_globals.h" +#include "ompi/mca/bml/bml.h" + +BEGIN_C_DECLS + +/* ******************************************************************** */ + +struct oshmem_group_t; + + +#define OSHMEM_PE_INVALID (-1) + + +/** + * Remote Open SHMEM process structure + * + * Remote Open SHMEM process structure. Each process contains exactly + * one oshmem_proc_t structure for each remote process it knows about. + */ +struct oshmem_proc_t { + /** allow proc to be placed on a list */ + opal_list_item_t super; + /** this process' name */ + orte_process_name_t proc_name; + /** PML specific proc data */ + struct mca_pml_endpoint_t* proc_pml; + /** BML specific proc data */ + struct mca_bml_base_endpoint_t* proc_bml; + /** architecture of this process */ + uint32_t proc_arch; + /** flags for this proc */ + opal_hwloc_locality_t proc_flags; + /** Base convertor for the proc described by this process */ + struct opal_convertor_t* proc_convertor; + /** A pointer to the name of this host - data is + * actually stored in the RTE + */ + char* proc_hostname; + + /* + * All transport channels are globally ordered. 
+ * pe(s) can talk to each other via a subset of transports + * these hold the indexes of each transport into the global array + * proc -> id, where id can be btl id in yoda or mxm ptl id + * in ikrit + * spml is supposed to fill this during add_procs() + **/ + int num_transports; + char *transport_ids; +}; + +typedef struct oshmem_proc_t oshmem_proc_t; +OBJ_CLASS_DECLARATION(oshmem_proc_t); + + +/** + * Group of Open SHMEM processes structure + * + * Set of processes used in collective operations. + */ +struct oshmem_group_t { + opal_object_t base; + int id; /**< index in the global group array (oshmem_group_array) */ + int my_pe; /**< PE (vpid) of the local process */ + int proc_count; /**< number of processes in group */ + int is_member; /* nonzero if my_pe is part of the group and so participates in collectives */ + struct oshmem_proc_t **proc_array; /**< list of pointers to oshmem_proc_t structures + for each process in the group */ + opal_list_t peer_list; + + /* Collectives module interface and data */ + mca_scoll_base_group_scoll_t g_scoll; +}; +typedef struct oshmem_group_t oshmem_group_t; +OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(oshmem_group_t); + +OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_all; +OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_self; +OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_null; + + +/** + * @private + * + * Pointer to the oshmem_proc_t structure for the local process + * + * Pointer to the oshmem_proc_t structure for the local process. + * + * @note This pointer is declared here to allow inline functions + * within this header file to access the local process quickly. + * Please use oshmem_proc_local() instead. + */ +OSHMEM_DECLSPEC extern oshmem_proc_t* oshmem_proc_local_proc; + + +/* ******************************************************************** */ + + +/** + * Initialize the OSHMEM process subsystem + * + * Initialize the Open SHMEM process subsystem. This function will + * query the run-time environment and build a list of the proc + * instances in the current pe set. 
The local information not + * easily determined by the run-time ahead of time (architecture and + * hostname) will be published during this call. + * + * @note While an oshmem_proc_t will exist with mostly valid information + * for each process in the pe set at the conclusion of this + * call, some information will not be immediately available. This + * includes the architecture and hostname, which will be available by + * the conclusion of the stage gate. + * + * @retval OSHMEM_SUCCESS System successfully initialized + * @retval OSHMEM_ERROR Initialization failed due to unspecified error + */ +OSHMEM_DECLSPEC int oshmem_proc_init(void); + +/** + * Set the arch of each proc in the oshmem_proc_list + * + * In some environments, SHMEM procs are required to exchange their + * arch via a modex operation during mpi_init. In other environments, + * the arch is determined by other mechanisms and provided to the + * proc directly. To support both mechanisms, we provide a separate + * function to set the arch of the procs -after- the modex operation + * has completed in mpi_init. It also creates the predefined groups. + * + * @retval OSHMEM_SUCCESS Archs set and predefined groups created + * @retval OSHMEM_ERR_NOT_SUPPORTED A peer's arch differs and heterogeneous support is not compiled in (errors from oshmem_proc_group_init() are returned as-is) + */ +OSHMEM_DECLSPEC int oshmem_proc_set_arch(void); + +/** + * Finalize the OSHMEM Process subsystem + * + * Finalize the Open SHMEM process subsystem. This function will + * release all memory created during the life of the application, + * including all oshmem_proc_t structures. + * + * @retval OSHMEM_SUCCESS System successfully finalized + */ +OSHMEM_DECLSPEC int oshmem_proc_finalize(void); + + +/** + * Returns the list of proc instances associated with this job. + * + * Returns the list of proc instances associated with this job. Given + * the current association between a job and a pe set, this + * function provides the process instances for the current + * pe set. 
+ * + * @note The reference count of each process in the array is + * NOT incremented - the caller is responsible for ensuring the + * correctness of the reference count once they are done with + * the array. + * + * @param[out] size Set to the number of processes in the returned array (not written when NULL is returned) + * + * @return Allocated array of pointers to proc instances in the current + * pe set, or NULL if there is an internal failure. + */ +OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_world(size_t* size); + + +/** + * Returns the list of all known proc instances. + * + * Returns the list of all known proc instances, including those in + * other pe sets. It is possible that we may no longer be + * connected to some of the procs returned (in the SHMEM sense of the + * word connected). In a strictly SHMEM-1 application, this function + * will return the same information as oshmem_proc_world(). + * + * @note The reference count of each process in the array is + * incremented and the caller is responsible for releasing each + * process in the array, as well as freeing the array. + * + * @param[out] size Set to the number of processes in the returned array (not written when NULL is returned) + * + * @return Allocated array of pointers to proc instances in the current + * known universe, or NULL if there is an internal failure. + */ +OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_all(size_t* size); + + +/** + * Returns a list of the local process + * + * Returns a list containing the local process (and only the local + * process). Has calling semantics similar to oshmem_proc_world() and + * oshmem_proc_all(). + * + * @note The reference count of each process in the array is + * incremented and the caller is responsible for releasing each + * process in the array, as well as freeing the array. + * + * @param[out] size Set to 1, the number of processes in the returned array (not written when NULL is returned) + * + * @return Allocated one-element array holding the pointer to the local + * proc instance, or NULL if the allocation fails. 
+ */ +OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_self(size_t* size); + + +/** + * Returns a pointer to the local process + * + * Returns a pointer to the local process. Unlike oshmem_proc_self(), + * the reference count on the local proc instance is not modified by + * this function. + * + * @return Pointer to the local process structure + */ +static inline oshmem_proc_t* oshmem_proc_local(void) +{ + return oshmem_proc_local_proc; +} + + +/** + * Returns the proc instance for a given name + * + * Returns the proc instance for the specified process name. The + * reference count for the proc instance is not incremented by this + * function. + * + * @param[in] name The process name to look for + * + * @return Pointer to the process instance for \c name +*/ +OSHMEM_DECLSPEC oshmem_proc_t * oshmem_proc_find ( const orte_process_name_t* name ); + +/** + * Pack proc list into portable buffer + * + * This function takes a list of oshmem_proc_t pointers (e.g. as given + * in groups) and returns a orte buffer containing all information + * needed to add the proc to a remote list. This includes the ORTE + * process name, the architecture, and the hostname. Ordering is + * maintained. The buffer is packed to be sent to a remote node with + * different architecture (endian or word size). The buffer can be + * dss unloaded to be sent using SHMEM or send using rml_send_packed(). + * + * @param[in] proclist List of process pointers + * @param[in] proclistsize Length of the proclist array + * @param[in,out] buf An orte_buffer containing the packed names. 
+ * The buffer must be constructed but empty when + * passed to this function + * @retval OSHMEM_SUCCESS Success + * @retval (other) Error code returned by the failing opal_dss.pack() call + */ +OSHMEM_DECLSPEC int oshmem_proc_pack(oshmem_proc_t **proclist, int proclistsize, + opal_buffer_t *buf); + + +/** + * Unpack a portable buffer of procs + * + * This function unpacks a packed list of oshmem_proc_t structures and + * returns the ordered list of proc structures. If the given proc is + * already "known", the architecture and hostname information in the + * buffer is ignored. If the proc is "new" to this process, it will + * be added to the global list of known procs, with information + * provided in the buffer. The lookup actions are always entirely + * local. The proclist returned is a list of pointers to all procs in + * the buffer, whether they were previously known or are new to this + * process. + * + * @note In previous versions of this function, the PML's add_procs() + * function was called for any new processes discovered as a result of + * this operation. That is no longer the case -- the caller must use + * the newproclist information to call add_procs() if necessary. + * + * @note The reference count for procs created as a result of this + * operation will be set to 1. Existing procs will not have their + * reference count changed. The reference count of a proc at the + * return of this function is the same regardless of whether NULL is + * provided for newproclist. The user is responsible for freeing the + * newproclist array. + * + * @param[in] buf orte_buffer containing the packed names + * @param[in] proclistsize number of expected proc pointers + * @param[out] proclist list of process pointers (array allocated by this function) + * @param[out] newproclistsize Number of new procs added as a result + * of the unpack operation. NULL may be + * provided if information is not needed. + * @param[out] newproclist List of new procs added as a result of + * the unpack operation. 
NULL may be + * provided if information is not needed. + * + * Return value: + * OSHMEM_SUCCESS on success + * an OSHMEM error code otherwise (e.g. OSHMEM_ERR_OUT_OF_RESOURCE, OSHMEM_ERR_NOT_SUPPORTED) + */ +OSHMEM_DECLSPEC int oshmem_proc_unpack(opal_buffer_t *buf, + int proclistsize, oshmem_proc_t ***proclist, + int *newproclistsize, oshmem_proc_t ***newproclist); + +/** + * Refresh the OSHMEM process subsystem + * + * Refresh the Open SHMEM process subsystem. This function will update + * the list of proc instances in the current pe set with + * data from the run-time environment. + * + * @note This is primarily used when restarting a process and thus + * needing to update the jobid and node name. + * + * @retval OSHMEM_SUCCESS System successfully refreshed + * @retval OSHMEM_ERR_NOT_SUPPORTED A peer's arch differs and heterogeneous support is not compiled in + */ +OSHMEM_DECLSPEC int oshmem_proc_refresh(void); + + +static inline int oshmem_proc_pe(oshmem_proc_t *proc) +{ + return (proc ? (int)proc->proc_name.vpid : -1); +} + + +/** + * Initialize the OSHMEM process predefined groups + * + * Initialize the Open SHMEM process predefined groups. This function + * sets up the global group array and creates oshmem_group_all (every + * proc in oshmem_proc_list) and oshmem_group_self (the local proc + * only); oshmem_group_null is set to NULL. It is called from + * oshmem_proc_set_arch(). + * + * @note This is primarily used once during SHMEM setup. + * + * @retval OSHMEM_SUCCESS System successfully initialized + * @retval OSHMEM_ERROR Initialization failed due to unspecified error + */ +OSHMEM_DECLSPEC int oshmem_proc_group_init(void); + + +/** + * Finalize the OSHMEM process predefined groups + * + * Finalize the Open SHMEM process groups. This function destroys + * every group still registered in the global group array, including + * the predefined ones, and then destructs the array itself. It is + * called from oshmem_proc_finalize() before the proc list is + * released.
+ * + * @note This is primarily used once during SHMEM teardown. + * + * @retval OSHMEM_SUCCESS Groups successfully finalized + * @retval OSHMEM_ERROR Not returned by the current implementation + */ +OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void); + + +/** + * Create processes group. + * + * Builds a group from the procs in the global proc list selected by the arguments. + * + * @param[in] pe_start The lowest PE in the active set. + * @param[in] pe_stride The stride between consecutive PEs in the active set; it is applied + * as a plain (linear) stride, and 0 selects consecutive PEs. + * @param[in] pe_size The number of PEs in the active set. + * + * @return Pointer to the newly created group, or NULL if an allocation + * fails or no collective module can be selected for the group. + */ +OSHMEM_DECLSPEC oshmem_group_t* oshmem_proc_group_create(int pe_start, int pe_stride, size_t pe_size); + + +/** + * Destroy processes group. + * A NULL group is accepted and ignored. + */ +OSHMEM_DECLSPEC void oshmem_proc_group_destroy(oshmem_group_t* group); + +static inline oshmem_proc_t *oshmem_proc_group_all(int pe) +{ + return oshmem_group_all->proc_array[pe]; +} + + +static inline oshmem_proc_t* oshmem_proc_group_find (oshmem_group_t* group, int pe) +{ + int i = 0; + oshmem_proc_t* proc = NULL; + + if (OPAL_LIKELY(group)) + { + if (OPAL_LIKELY(group == oshmem_group_all)) + { + /* To improve performance use direct index. 
It is feature of oshmem_group_all */ + proc = group->proc_array[pe]; + } + else + { + for (i = 0; i < group->proc_count; i++) + { + if (pe == oshmem_proc_pe(group->proc_array[i])) + { + proc = group->proc_array[i]; + break; + } + } + } + } + else + { + orte_process_name_t name; + + name.jobid = ORTE_PROC_MY_NAME->jobid; + name.vpid = pe; + proc = oshmem_proc_find(&name); + } + + return proc; +} + + +static inline int oshmem_proc_group_find_id (oshmem_group_t* group, int pe) +{ + int i = 0; + int id = -1; + + if (group) + { + for (i = 0; i < group->proc_count; i++) + { + if (pe == oshmem_proc_pe(group->proc_array[i])) + { + id = i; + break; + } + } + } + + return id; +} + +static inline int oshmem_proc_group_is_member(oshmem_group_t *group) +{ + return group->is_member; +} + + +static inline int oshmem_num_procs(void) +{ + extern opal_list_t oshmem_proc_list; + + if (!oshmem_group_all) + return opal_list_get_size(&oshmem_proc_list); + + return oshmem_group_all->proc_count; +} + +static inline int oshmem_my_proc_id(void) +{ + return oshmem_group_self->my_pe; +} + +static inline int oshmem_get_transport_id(int pe) +{ + oshmem_proc_t *proc; + + proc = oshmem_proc_group_find(oshmem_group_all, pe); + + return (int)proc->transport_ids[0]; +} + +static inline int oshmem_get_transport_count(int pe) +{ + oshmem_proc_t *proc; + proc = oshmem_proc_group_find(oshmem_group_all, pe); + return proc->num_transports; +} + +END_C_DECLS + +#endif /* OSHMEM_PROC_PROC_H */ diff --git a/oshmem/proc/proc_group_cache.c b/oshmem/proc/proc_group_cache.c new file mode 100644 index 0000000000..bc9baa75d3 --- /dev/null +++ b/oshmem/proc/proc_group_cache.c @@ -0,0 +1,102 @@ +/* +* Copyright (c) 2012 Mellanox Technologies, Inc. +* All rights reserved. 
+* $COPYRIGHT$ +* +* Additional copyrights may follow +* +* $HEADER$ +*/ +#include "oshmem/proc/proc_group_cache.h" +#include "oshmem/constants.h" +#include "opal/mca/base/mca_base_param.h" +#include "oshmem/runtime/runtime.h" + +OBJ_CLASS_INSTANCE(oshmem_group_cache_t, opal_object_t, NULL, NULL); +opal_list_t oshmem_group_cache_list; +unsigned int oshmem_group_cache_size; +oshmem_group_t* find_group_in_cache(int PE_start, int logPE_stride, int PE_size) +{ + int cache_look_up_id[3] = {PE_start,logPE_stride,PE_size}; + opal_list_item_t *item; + if (opal_list_is_empty(&oshmem_group_cache_list)) + { + return NULL; + } + + for (item = opal_list_get_first(&oshmem_group_cache_list); + item != opal_list_get_end(&oshmem_group_cache_list); + item = opal_list_get_next(item)) + { + if (!memcmp(((oshmem_group_cache_t *)item)->cache_id,cache_look_up_id,3*sizeof(int))) +