diff --git a/oshmem/mca/atomic/mxm/atomic_mxm.h b/oshmem/mca/atomic/mxm/atomic_mxm.h index 8d06cb3fcf..64478b2b66 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm.h +++ b/oshmem/mca/atomic/mxm/atomic_mxm.h @@ -19,6 +19,7 @@ /* This component does uses SPML:IKRIT */ #include "oshmem/mca/spml/ikrit/spml_ikrit.h" +#include "oshmem/runtime/runtime.h" BEGIN_C_DECLS @@ -60,16 +61,76 @@ struct mca_atomic_mxm_module_t { typedef struct mca_atomic_mxm_module_t mca_atomic_mxm_module_t; OBJ_CLASS_DECLARATION(mca_atomic_mxm_module_t); + +static inline uint8_t mca_atomic_mxm_order(size_t nlong) +{ + if (OPAL_LIKELY(8 == nlong)) { + return 3; + } + + if (OPAL_LIKELY(4 == nlong)) { + return 2; + } + + if (2 == nlong) { + return 1; + } + + if (1 == nlong) { + return 0; + } + + ATOMIC_ERROR("Type size must be 1/2/4 or 8 bytes."); + oshmem_shmem_abort(-1); + return OSHMEM_ERR_BAD_PARAM; +} + +static inline void mca_atomic_mxm_req_init(mxm_send_req_t *sreq, int pe, void *target, size_t nlong) +{ + uint8_t nlong_order; + void *remote_addr; + mxm_mem_key_t *mkey; + + nlong_order = mca_atomic_mxm_order(nlong); + + mkey = mca_spml_ikrit_get_mkey(pe, target, MXM_PTL_RDMA, &remote_addr); + + /* mxm request init */ + sreq->base.state = MXM_REQ_NEW; + sreq->base.mq = mca_atomic_mxm_spml_self->mxm_mq; + sreq->base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe].mxm_hw_rdma_conn; + sreq->base.completed_cb = NULL; + sreq->base.data_type = MXM_REQ_DATA_BUFFER; + + sreq->base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; + sreq->base.data.buffer.length = nlong; + + sreq->op.atomic.remote_vaddr = (uintptr_t) remote_addr; + sreq->op.atomic.remote_mkey = mkey; + sreq->op.atomic.order = nlong_order; + + sreq->flags = 0; +} + +static inline void mca_atomic_mxm_post(mxm_send_req_t *sreq) +{ + mxm_error_t mxm_err; + + mxm_err = mxm_req_send(sreq); + if (OPAL_UNLIKELY(MXM_OK != mxm_err)) { + ATOMIC_ERROR("mxm_req_send failed, mxm_error = %d", + mxm_err); + oshmem_shmem_abort(-1); + } + + mxm_req_wait(&sreq->base); + if (OPAL_UNLIKELY(MXM_OK != sreq->base.error)) { + ATOMIC_ERROR("mxm_req_wait got non MXM_OK error: %d", + sreq->base.error); + oshmem_shmem_abort(-1); + } +} + END_C_DECLS -#if MXM_API >= MXM_VERSION(2,0) -static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { - - if (0 == mkey->len) { - return &mxm_empty_mem_key; - } - return (mxm_mem_key_t *)mkey->u.data; -} -#endif - #endif /* MCA_ATOMIC_MXM_H */ diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c index 8e56a1014a..bb6c675a03 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c @@ -31,96 +31,20 @@ int mca_atomic_mxm_cswap(void *target, size_t nlong, int pe) { - unsigned my_pe; - uint8_t nlong_order; - void *remote_addr; - int ptl_id; mxm_send_req_t sreq; - mxm_error_t mxm_err; - sshmem_mkey_t *r_mkey; - my_pe = oshmem_my_proc_id(); - ptl_id = -1; - mxm_err = MXM_OK; + mca_atomic_mxm_req_init(&sreq, pe, target, nlong); - switch (nlong) { - case 1: - nlong_order = 0; - break; - case 2: - nlong_order = 1; - break; - case 4: - nlong_order = 2; - break; - case 8: - nlong_order = 3; - break; - default: - ATOMIC_ERROR("[#%d] Type size must be 1/2/4 or 8 bytes.", my_pe); - oshmem_shmem_abort(-1); - return OSHMEM_ERR_BAD_PARAM; - } - - ptl_id = OSHMEM_PROC_DATA(oshmem_proc_group_all(pe))->transport_ids[0]; - if (MXM_PTL_SHM == ptl_id) { - ptl_id = MXM_PTL_RDMA; - } - r_mkey = mca_memheap_base_get_cached_mkey(pe, target, ptl_id, &remote_addr); - if (!r_mkey) { - ATOMIC_ERROR("[#%d] %p is not address of symmetric variable", - my_pe, target); - oshmem_shmem_abort(-1); - return OSHMEM_ERR_BAD_PARAM; - } - - /* mxm request init */ - sreq.base.state = MXM_REQ_NEW; - sreq.base.mq = mca_atomic_mxm_spml_self->mxm_mq; - sreq.base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe]->mxm_hw_rdma_conn; - sreq.base.completed_cb = NULL; - sreq.base.data_type = MXM_REQ_DATA_BUFFER; - - /* set data */ sreq.base.data.buffer.ptr = (void *) value; - sreq.base.data.buffer.length = nlong; - sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; - - sreq.op.atomic.remote_vaddr = (uintptr_t) remote_addr; -#if MXM_API < MXM_VERSION(2,0) - sreq.base.flags = 0; - sreq.op.atomic.remote_memh = MXM_INVALID_MEM_HANDLE; -#else - sreq.flags = 0; - sreq.op.atomic.remote_mkey = to_mxm_mkey(r_mkey); -#endif - sreq.op.atomic.order = nlong_order; - if (NULL == cond) { sreq.opcode = MXM_REQ_OP_ATOMIC_SWAP; } else { -#if MXM_API < MXM_VERSION(2,0) - memcpy(&sreq.op.atomic.value8, cond, nlong); -#else memcpy(&sreq.op.atomic.value, cond, nlong); -#endif sreq.opcode = MXM_REQ_OP_ATOMIC_CSWAP; } - if (MXM_OK != (mxm_err = mxm_req_send(&sreq))) { - ATOMIC_ERROR("[#%d] mxm_req_send failed, mxm_error = %d", - my_pe, mxm_err); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + mca_atomic_mxm_post(&sreq); - mxm_req_wait(&sreq.base); - if (MXM_OK != sreq.base.error) { - ATOMIC_ERROR("[#%d] mxm_req_wait got non MXM_OK error: %d", - my_pe, sreq.base.error); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } memcpy(prev, value, nlong); return OSHMEM_SUCCESS; diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c index 2c2accd322..54676ceace 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c @@ -32,106 +32,20 @@ int mca_atomic_mxm_fadd(void *target, int pe, struct oshmem_op_t *op) { - unsigned my_pe; - uint8_t nlong_order; - void *remote_addr; - int ptl_id; mxm_send_req_t sreq; - mxm_error_t mxm_err; - sshmem_mkey_t *r_mkey; static char dummy_buf[8]; - my_pe = oshmem_my_proc_id(); - ptl_id = -1; - mxm_err = MXM_OK; + mca_atomic_mxm_req_init(&sreq, pe, target, nlong); - switch (nlong) { - case 1: - nlong_order = 0; - break; - case 2: - nlong_order = 1; - break; - case 4: - nlong_order = 2; - break; - case 8: - nlong_order = 3; - break; - default: - ATOMIC_ERROR("[#%d] Type size must be 1/2/4 or 8 bytes.", my_pe); - oshmem_shmem_abort(-1); - return OSHMEM_ERR_BAD_PARAM; - } - - ptl_id = OSHMEM_PROC_DATA(oshmem_proc_group_all(pe))->transport_ids[0]; - if (MXM_PTL_SHM == ptl_id) { - ptl_id = MXM_PTL_RDMA; - } - r_mkey = mca_memheap_base_get_cached_mkey(pe, target, ptl_id, &remote_addr); - if (!r_mkey) { - ATOMIC_ERROR("[#%d] %p is not address of symmetric variable", - my_pe, target); - oshmem_shmem_abort(-1); - return OSHMEM_ERR_BAD_PARAM; - } - - /* mxm request init */ - sreq.base.state = MXM_REQ_NEW; - sreq.base.mq = mca_atomic_mxm_spml_self->mxm_mq; - sreq.base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe]->mxm_hw_rdma_conn; - sreq.base.completed_cb = NULL; - sreq.base.data_type = MXM_REQ_DATA_BUFFER; - - sreq.op.atomic.remote_vaddr = (uintptr_t) remote_addr; -#if MXM_API < MXM_VERSION(2,0) - sreq.op.atomic.remote_memh = MXM_INVALID_MEM_HANDLE; - memcpy(&sreq.op.atomic.value8, value, nlong); -#else - sreq.op.atomic.remote_mkey = to_mxm_mkey(r_mkey); memcpy(&sreq.op.atomic.value, value, nlong); -#endif - sreq.op.atomic.order = nlong_order; - - /* Do we need atomic 'add' or atomic 'fetch and add'? */ + sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; if (NULL == prev) { sreq.base.data.buffer.ptr = dummy_buf; - sreq.base.data.buffer.length = nlong; - sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; -#if MXM_API < MXM_VERSION(2,0) - sreq.base.flags = MXM_REQ_FLAG_SEND_SYNC; - sreq.opcode = MXM_REQ_OP_ATOMIC_ADD; -#else - sreq.flags = 0; - sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; -#endif } else { sreq.base.data.buffer.ptr = prev; - sreq.base.data.buffer.length = nlong; - sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; -#if MXM_API < MXM_VERSION(2,0) - sreq.base.flags = 0; -#else - sreq.flags = 0; -#endif - - sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; } - if (MXM_OK != (mxm_err = mxm_req_send(&sreq))) { - ATOMIC_ERROR("[#%d] mxm_req_send failed, mxm_error = %d", - my_pe, mxm_err); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } - - mxm_req_wait(&sreq.base); - if (MXM_OK != sreq.base.error) { - ATOMIC_ERROR("[#%d] mxm_req_wait got non MXM_OK error: %d", - my_pe, sreq.base.error); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + mca_atomic_mxm_post(&sreq); return OSHMEM_SUCCESS; } diff --git a/oshmem/mca/memheap/base/base.h b/oshmem/mca/memheap/base/base.h index 34e92346d5..c82e03da5a 100644 --- a/oshmem/mca/memheap/base/base.h +++ b/oshmem/mca/memheap/base/base.h @@ -44,9 +44,13 @@ extern char* mca_memheap_base_exclude; extern int mca_memheap_base_already_opened; extern int mca_memheap_base_key_exchange; -#define MCA_MEMHEAP_MAX_SEGMENTS 256 +#define MCA_MEMHEAP_MAX_SEGMENTS 4 #define HEAP_SEG_INDEX 0 #define SYMB_SEG_INDEX 1 +#define MCA_MEMHEAP_SEG_COUNT (SYMB_SEG_INDEX+1) + +#define MEMHEAP_SEG_INVALID 0xFFFF + typedef struct mca_memheap_map { map_segment_t mem_segs[MCA_MEMHEAP_MAX_SEGMENTS]; /* TODO: change into pointer array */ @@ -158,32 +162,29 @@ extern int mca_memheap_seg_cmp(const void *k, const void *v); extern mca_memheap_map_t* memheap_map; -static inline map_segment_t *memheap_find_va(const void* va) +static inline int map_segment_is_va_in(map_base_segment_t *s, void *va) { - map_segment_t *s; + return (va >= s->va_base && va < s->va_end); +} - if (OPAL_LIKELY((uintptr_t)va >= (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].seg_base_addr && - (uintptr_t)va < (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].end)) { - s = &memheap_map->mem_segs[HEAP_SEG_INDEX]; - } else { - s = bsearch(va, - &memheap_map->mem_segs[SYMB_SEG_INDEX], - memheap_map->n_segments - 1, - sizeof(*s), - mca_memheap_seg_cmp); - } +static inline map_segment_t *memheap_find_seg(int segno) +{ + return &mca_memheap_base_map.mem_segs[segno]; +} -#if MEMHEAP_BASE_DEBUG == 1 - if (s) { - MEMHEAP_VERBOSE(5, "match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p", - s - memheap_map->mem_segs, - (long long)s->seg_base_addr, - (long long)s->end, - (long long)(s->end - s->seg_base_addr), - (void *)va); +static inline int memheap_is_va_in_segment(void *va, int segno) +{ + return map_segment_is_va_in(&memheap_find_seg(segno)->super, va); +} + +static inline int memheap_find_segnum(void *va) +{ + if (OPAL_LIKELY(memheap_is_va_in_segment(va, SYMB_SEG_INDEX))) { + return SYMB_SEG_INDEX; + } else if (memheap_is_va_in_segment(va, HEAP_SEG_INDEX)) { + return HEAP_SEG_INDEX; } -#endif - return s; + return MEMHEAP_SEG_INVALID; } static inline void* memheap_va2rva(void* va, void* local_base, void* remote_base) @@ -193,6 +194,62 @@ static inline void* memheap_va2rva(void* va, void* local_base, void* remote_base (uintptr_t)va - ((uintptr_t)local_base - (uintptr_t)remote_base)); } +static inline void *map_segment_va2rva(mkey_segment_t *seg, void *va) +{ + return memheap_va2rva(va, seg->super.va_base, seg->rva_base); +} + +static inline map_base_segment_t *map_segment_find_va(map_base_segment_t *segs, size_t elem_size, void *va) +{ + map_base_segment_t *rseg; + + rseg = (map_base_segment_t *)((char *)segs + elem_size * HEAP_SEG_INDEX); + if (OPAL_LIKELY(map_segment_is_va_in(rseg, va))) { + return rseg; + } + + rseg = (map_base_segment_t *)((char *)segs + elem_size * SYMB_SEG_INDEX); + if (OPAL_LIKELY(map_segment_is_va_in(rseg, va))) { + return rseg; + } + + return NULL; +} + +void mkey_segment_init(mkey_segment_t *seg, sshmem_mkey_t *mkey, uint32_t segno); + +static inline map_segment_t *memheap_find_va(void* va) +{ + map_segment_t *s; + + /* most probably there will be only two segments: heap and global data */ + if (OPAL_LIKELY(memheap_is_va_in_segment(va, SYMB_SEG_INDEX))) { + s = &memheap_map->mem_segs[SYMB_SEG_INDEX]; + } else if (memheap_is_va_in_segment(va, HEAP_SEG_INDEX)) { + s = &memheap_map->mem_segs[HEAP_SEG_INDEX]; + } else if (memheap_map->n_segments - 2 > 0) { + s = bsearch(va, + &memheap_map->mem_segs[SYMB_SEG_INDEX+1], + memheap_map->n_segments - 2, + sizeof(*s), + mca_memheap_seg_cmp); + } else { + s = NULL; + } + +#if MEMHEAP_BASE_DEBUG == 1 + if (s) { + MEMHEAP_VERBOSE(5, "match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p", + s - memheap_map->mem_segs, + (long long)s->super.va_base, + (long long)s->super.va_end, + (long long)(s->super.va_end - s->super.va_base), + (void *)va); + } +#endif + return s; +} + static inline sshmem_mkey_t *mca_memheap_base_get_cached_mkey(int pe, void* va, int btl_id, @@ -218,7 +275,7 @@ static inline sshmem_mkey_t *mca_memheap_base_get_cached_mkey(int pe, if (OPAL_LIKELY(s->mkeys_cache[pe])) { mkey = &s->mkeys_cache[pe][btl_id]; - *rva = memheap_va2rva(va, s->seg_base_addr, mkey->va_base); + *rva = memheap_va2rva(va, s->super.va_base, mkey->va_base); MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (cached) %lx %p", pe, (void *)va, mkey->u.key, (void *)*rva); return mkey; } @@ -226,6 +283,16 @@ static inline sshmem_mkey_t *mca_memheap_base_get_cached_mkey(int pe, return mca_memheap_base_get_cached_mkey_slow(s, pe, va, btl_id, rva); } +static inline int mca_memheap_base_num_transports(void) +{ + return memheap_map->num_transports; +} + +static inline void* mca_memheap_seg2base_va(int seg) +{ + return memheap_map->mem_segs[seg].super.va_base; +} + END_C_DECLS #endif /* MCA_MEMHEAP_BASE_H */ diff --git a/oshmem/mca/memheap/base/memheap_base_frame.c b/oshmem/mca/memheap/base/memheap_base_frame.c index dc9c51b25b..578b4eda72 100644 --- a/oshmem/mca/memheap/base/memheap_base_frame.c +++ b/oshmem/mca/memheap/base/memheap_base_frame.c @@ -38,7 +38,7 @@ char* mca_memheap_base_exclude = NULL; opal_list_t mca_memheap_base_components_opened = {{0}}; struct mca_memheap_base_module_t* mca_memheap_base_module_initialized = NULL; int mca_memheap_base_already_opened = 0; -mca_memheap_map_t mca_memheap_base_map = {{{0}}}; +mca_memheap_map_t mca_memheap_base_map; static int mca_memheap_base_register(mca_base_register_flag_t flags) { diff --git a/oshmem/mca/memheap/base/memheap_base_mkey.c b/oshmem/mca/memheap/base/memheap_base_mkey.c index 5e230f3aef..563fcd0bb9 100644 --- a/oshmem/mca/memheap/base/memheap_base_mkey.c +++ b/oshmem/mca/memheap/base/memheap_base_mkey.c @@ -49,10 +49,12 @@ typedef struct oob_comm_request { struct oob_comm { opal_mutex_t lck; opal_condition_t cond; + uint32_t segno; sshmem_mkey_t *mkeys; int mkeys_rcvd; oob_comm_request_t req_pool[MEMHEAP_RECV_REQS_MAX]; opal_list_t req_list; + int is_inited; }; mca_memheap_map_t* memheap_map = NULL; @@ -68,61 +70,36 @@ static int memheap_oob_get_mkeys(int pe, uint32_t va_seg_num, sshmem_mkey_t *mkey); -static inline void* mca_memheap_seg2base_va(int seg) -{ - return memheap_map->mem_segs[seg].seg_base_addr; -} - int mca_memheap_seg_cmp(const void *k, const void *v) { uintptr_t va = (uintptr_t) k; map_segment_t *s = (map_segment_t *) v; - if (va < (uintptr_t)s->seg_base_addr) + if (va < (uintptr_t)s->super.va_base) return -1; - if (va >= (uintptr_t)s->end) + if (va >= (uintptr_t)s->super.va_end) return 1; return 0; } -/** - * @param all_trs - * 0 - pack mkeys for transports to given pe - * 1 - pack mkeys for ALL possible transports. value of pe is ignored - */ -static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg, int all_trs) +static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg) { - ompi_proc_t *proc; - int i, n, tr_id; + int i, n; sshmem_mkey_t *mkey; - /* go over all transports to remote pe and pack mkeys */ - if (!all_trs) { - n = oshmem_get_transport_count(pe); - proc = oshmem_proc_group_find(oshmem_group_all, pe); - } - else { - proc = NULL; - n = memheap_map->num_transports; - } - + /* go over all transports and pack mkeys */ + n = memheap_map->num_transports; opal_dss.pack(msg, &n, 1, OPAL_UINT32); MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe); for (i = 0; i < n; i++) { - if (!all_trs) { - tr_id = OSHMEM_PROC_DATA(proc)->transport_ids[i]; - } - else { - tr_id = i; - } - mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), tr_id); + mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), i); if (!mkey) { MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey", - seg, tr_id); + seg, i); return OSHMEM_ERROR; } - opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32); + opal_dss.pack(msg, &i, 1, OPAL_UINT32); opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64); if (0 == mkey->va_base) { opal_dss.pack(msg, &mkey->u.key, 1, OPAL_UINT64); @@ -134,7 +111,7 @@ static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg, int all_trs) } MEMHEAP_VERBOSE(5, "seg#%d tr_id: %d %s", - seg, tr_id, mca_spml_base_mkey2str(mkey)); + seg, i, mca_spml_base_mkey2str(mkey)); } return OSHMEM_SUCCESS; } @@ -202,10 +179,10 @@ static void unpack_remote_mkeys(opal_buffer_t *msg, int remote_pe) } cnt = memheap_oob.mkeys[tr_id].len; opal_dss.unpack(msg, memheap_oob.mkeys[tr_id].u.data, &cnt, OPAL_BYTE); - MCA_SPML_CALL(rmkey_unpack(&memheap_oob.mkeys[tr_id], remote_pe)); } else { memheap_oob.mkeys[tr_id].u.key = MAP_SEGMENT_SHM_INVALID; } + MCA_SPML_CALL(rmkey_unpack(&memheap_oob.mkeys[tr_id], memheap_oob.segno, remote_pe, tr_id)); } MEMHEAP_VERBOSE(5, @@ -249,7 +226,7 @@ static void do_recv(int source_pe, opal_buffer_t* buffer) msg_type = MEMHEAP_RKEY_RESP; opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8); - if (OSHMEM_SUCCESS != pack_local_mkeys(msg, source_pe, seg, 0)) { + if (OSHMEM_SUCCESS != pack_local_mkeys(msg, source_pe, seg)) { OBJ_RELEASE(msg); goto send_fail; } @@ -435,6 +412,7 @@ int memheap_oob_init(mca_memheap_map_t *map) } opal_progress_register(oshmem_mkey_recv_cb); + memheap_oob.is_inited = 1; return rc; } @@ -444,6 +422,10 @@ void memheap_oob_destruct(void) int i; oob_comm_request_t *r; + if (!memheap_oob.is_inited) { + return; + } + opal_progress_unregister(oshmem_mkey_recv_cb); for (i = 0; i < MEMHEAP_RECV_REQS_MAX; i++) { @@ -455,6 +437,7 @@ void memheap_oob_destruct(void) OBJ_DESTRUCT(&memheap_oob.req_list); OBJ_DESTRUCT(&memheap_oob.lck); OBJ_DESTRUCT(&memheap_oob.cond); + memheap_oob.is_inited = 0; } static int send_buffer(int pe, opal_buffer_t *msg) @@ -481,7 +464,6 @@ static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) { for (i = 0; i < memheap_map->num_transports; i++) { - mkeys[i].va_base = mca_memheap_seg2base_va(seg); MEMHEAP_VERBOSE(5, "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d %s", pe, @@ -494,6 +476,7 @@ static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) OPAL_THREAD_LOCK(&memheap_oob.lck); memheap_oob.mkeys = mkeys; + memheap_oob.segno = seg; memheap_oob.mkeys_rcvd = 0; msg = OBJ_NEW(opal_buffer_t); @@ -585,7 +568,7 @@ void mca_memheap_modex_recv_all(void) } for (j = 0; j < memheap_map->n_segments; j++) { - pack_local_mkeys(msg, 0, j, 1); + pack_local_mkeys(msg, 0, j); } /* we assume here that int32_t returned by opal_dss.unload @@ -661,6 +644,7 @@ void mca_memheap_modex_recv_all(void) } } memheap_oob.mkeys = s->mkeys_cache[i]; + memheap_oob.segno = j; unpack_remote_mkeys(msg, i); } } @@ -699,6 +683,10 @@ sshmem_mkey_t * mca_memheap_base_get_cached_mkey_slow(map_segment_t *s, int rc; sshmem_mkey_t *mkey; + if (!memheap_oob.is_inited) { + return NULL; + } + s->mkeys_cache[pe] = (sshmem_mkey_t *) calloc(memheap_map->num_transports, sizeof(sshmem_mkey_t)); if (!s->mkeys_cache[pe]) @@ -711,7 +699,7 @@ sshmem_mkey_t * mca_memheap_base_get_cached_mkey_slow(map_segment_t *s, return NULL ; mkey = &s->mkeys_cache[pe][btl_id]; - *rva = memheap_va2rva(va, s->seg_base_addr, mkey->va_base); + *rva = memheap_va2rva(va, s->super.va_base, mkey->va_base); MEMHEAP_VERBOSE_FASTPATH(5, "rkey: pe=%d va=%p -> (remote lookup) %lx %p", pe, (void *)va, mkey->u.key, (void *)*rva); return mkey; @@ -737,7 +725,7 @@ uint64_t mca_memheap_base_find_offset(int pe, s = memheap_find_va(va); if (my_pe == pe) { - return (uintptr_t)va - (uintptr_t)s->seg_base_addr; + return (uintptr_t)va - (uintptr_t)s->super.va_base; } else { return ((s && MAP_SEGMENT_IS_VALID(s)) ? ((uintptr_t)rva - (uintptr_t)(s->mkeys_cache[pe][tr_id].va_base)) : 0); @@ -746,7 +734,7 @@ uint64_t mca_memheap_base_find_offset(int pe, int mca_memheap_base_is_symmetric_addr(const void* va) { - return (memheap_find_va(va) ? 1 : 0); + return (memheap_find_va((void *)va) ? 1 : 0); } int mca_memheap_base_detect_addr_type(void* va) @@ -759,14 +747,31 @@ int mca_memheap_base_detect_addr_type(void* va) if (s) { if (s->type == MAP_SEGMENT_STATIC) { addr_type = ADDR_STATIC; - } else if ((uintptr_t)va >= (uintptr_t) s->seg_base_addr - && (uintptr_t)va < (uintptr_t) ((uintptr_t)s->seg_base_addr + mca_memheap.memheap_size)) { + } else if ((uintptr_t)va >= (uintptr_t) s->super.va_base + && (uintptr_t)va < (uintptr_t) ((uintptr_t)s->super.va_base + mca_memheap.memheap_size)) { addr_type = ADDR_USER; } else { - assert( (uintptr_t)va >= (uintptr_t) ((uintptr_t)s->seg_base_addr + mca_memheap.memheap_size) && (uintptr_t)va < (uintptr_t)s->end); + assert( (uintptr_t)va >= (uintptr_t) ((uintptr_t)s->super.va_base + mca_memheap.memheap_size) && (uintptr_t)va < (uintptr_t)s->super.va_end); addr_type = ADDR_PRIVATE; } } return addr_type; } + +void mkey_segment_init(mkey_segment_t *seg, sshmem_mkey_t *mkey, uint32_t segno) +{ + map_segment_t *s; + + if (segno >= MCA_MEMHEAP_SEG_COUNT) { + return; + } + + s = memheap_find_seg(segno); + assert(NULL != s); + + seg->super.va_base = s->super.va_base; + seg->super.va_end = s->super.va_end; + seg->rva_base = mkey->va_base; +} + diff --git a/oshmem/mca/memheap/base/memheap_base_register.c b/oshmem/mca/memheap/base/memheap_base_register.c index 18da1790f5..ea742b2eb5 100644 --- a/oshmem/mca/memheap/base/memheap_base_register.c +++ b/oshmem/mca/memheap/base/memheap_base_register.c @@ -32,9 +32,9 @@ int mca_memheap_base_reg(mca_memheap_map_t *memheap_map) MEMHEAP_VERBOSE(5, "register seg#%02d: 0x%p - 0x%p %llu bytes type=0x%X id=0x%X", i, - s->seg_base_addr, - s->end, - (long long)((uintptr_t)s->end - (uintptr_t)s->seg_base_addr), + s->super.va_base, + s->super.va_end, + (long long)((uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base), s->type, s->seg_id); ret = _reg_segment(s, &memheap_map->num_transports); @@ -60,9 +60,9 @@ int mca_memheap_base_dereg(mca_memheap_map_t *memheap_map) MEMHEAP_VERBOSE(5, "deregistering segment#%d: %p - %p %llu bytes", i, - s->seg_base_addr, - s->end, - (long long)((uintptr_t)s->end - (uintptr_t)s->seg_base_addr)); + s->super.va_base, + s->super.va_end, + (long long)((uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base)); (void)_dereg_segment(s); } @@ -120,8 +120,8 @@ static int _reg_segment(map_segment_t *s, int *num_btl) } if (!rc) { - s->mkeys = MCA_SPML_CALL(register((void *)(unsigned long)s->seg_base_addr, - (uintptr_t)s->end - (uintptr_t)s->seg_base_addr, + s->mkeys = MCA_SPML_CALL(register((void *)(unsigned long)s->super.va_base, + (uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base, s->seg_id, num_btl)); if (NULL == s->mkeys) { diff --git a/oshmem/mca/memheap/base/memheap_base_select.c b/oshmem/mca/memheap/base/memheap_base_select.c index 95e5eb01f7..b1a52e7a7b 100644 --- a/oshmem/mca/memheap/base/memheap_base_select.c +++ b/oshmem/mca/memheap/base/memheap_base_select.c @@ -218,10 +218,10 @@ static memheap_context_t* _memheap_create(void) context.user_size = user_size; context.private_size = MEMHEAP_BASE_PRIVATE_SIZE; context.user_base_addr = - (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].seg_base_addr + (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].super.va_base + 0); context.private_base_addr = - (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].seg_base_addr + (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].super.va_base + context.user_size); } diff --git a/oshmem/mca/memheap/base/memheap_base_static.c b/oshmem/mca/memheap/base/memheap_base_static.c index ff0a43b7be..edbb11aa31 100644 --- a/oshmem/mca/memheap/base/memheap_base_static.c +++ b/oshmem/mca/memheap/base/memheap_base_static.c @@ -63,13 +63,13 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map) memset(s, 0, sizeof(*s)); MAP_SEGMENT_RESET_FLAGS(s); s->seg_id = MAP_SEGMENT_SHM_INVALID; - s->seg_base_addr = memheap_context.mem_segs[i].start; - s->end = memheap_context.mem_segs[i].end; - s->seg_size = ((uintptr_t)s->end - (uintptr_t)s->seg_base_addr); + s->super.va_base = memheap_context.mem_segs[i].start; + s->super.va_end = memheap_context.mem_segs[i].end; + s->seg_size = ((uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base); s->type = MAP_SEGMENT_STATIC; map->n_segments++; - total_mem += ((uintptr_t)s->end - (uintptr_t)s->seg_base_addr); + total_mem += ((uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base); } MEMHEAP_VERBOSE(1, "Memheap static memory: %llu byte(s), %d segments", diff --git a/oshmem/mca/spml/base/base.h b/oshmem/mca/spml/base/base.h index af2ad32f43..a0fd613d4d 100644 --- a/oshmem/mca/spml/base/base.h +++ b/oshmem/mca/spml/base/base.h @@ -71,7 +71,7 @@ OSHMEM_DECLSPEC int mca_spml_base_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys); -OSHMEM_DECLSPEC void mca_spml_base_rmkey_unpack(sshmem_mkey_t *mkey, int pe); +OSHMEM_DECLSPEC void mca_spml_base_rmkey_unpack(sshmem_mkey_t *mkey, uint32_t seg, int pe, int tr_id); OSHMEM_DECLSPEC void mca_spml_base_rmkey_free(sshmem_mkey_t *mkey); OSHMEM_DECLSPEC int mca_spml_base_put_nb(void *dst_addr, size_t size, @@ -104,6 +104,8 @@ OSHMEM_DECLSPEC extern mca_base_framework_t oshmem_spml_base_framework; #define SPML_VERBOSE(level, ...) #endif +#define SPML_VERBOSE_FASTPATH(level, ...) + #define SPML_ERROR(...) \ oshmem_output(oshmem_spml_base_framework.framework_output, \ "Error %s:%d - %s()", __SPML_FILE__, __LINE__, __func__, __VA_ARGS__) diff --git a/oshmem/mca/spml/base/spml_base.c b/oshmem/mca/spml/base/spml_base.c index f43db019b8..bdaf013438 100644 --- a/oshmem/mca/spml/base/spml_base.c +++ b/oshmem/mca/spml/base/spml_base.c @@ -153,12 +153,12 @@ int mca_spml_base_wait_nb(void* handle) return OSHMEM_SUCCESS; } -int mca_spml_base_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) +int mca_spml_base_oob_get_mkeys(int pe, uint32_t segno, sshmem_mkey_t *mkeys) { return OSHMEM_ERROR; } -void mca_spml_base_rmkey_unpack(sshmem_mkey_t *mkey, int pe) +void mca_spml_base_rmkey_unpack(sshmem_mkey_t *mkey, uint32_t segno, int pe, int tr_id) { } diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index 3e5ecc5313..1374ceb455 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -34,6 +34,7 @@ #include "oshmem/mca/spml/base/spml_base_putreq.h" #include "oshmem/runtime/runtime.h" #include "orte/util/show_help.h" +#include "oshmem/mca/sshmem/sshmem.h" #include "oshmem/mca/spml/ikrit/spml_ikrit_component.h" @@ -55,44 +56,25 @@ do { \ } \ } while(0) -typedef struct spml_ikrit_am_hdr { - uint64_t va; -} spml_ikrit_am_hdr_t; +static int mca_spml_ikrit_get_async(void *src_addr, + size_t size, + void *dst_addr, + int src); struct mca_spml_ikrit_put_request { - mca_spml_base_put_request_t req_put; - mxm_send_req_t mxm_req; - int pe; - mxm_req_buffer_t iov[2]; - spml_ikrit_am_hdr_t am_pkt; + opal_free_list_item_t link; /* must be a first member */ + mxm_send_req_t mxm_req; + int pe; }; typedef struct mca_spml_ikrit_put_request mca_spml_ikrit_put_request_t; -OBJ_CLASS_DECLARATION(mca_spml_ikrit_put_request_t); -#if MXM_API < MXM_VERSION(2,0) -static int spml_ikrit_get_ep_address(spml_ikrit_mxm_ep_conn_info_t *ep_info, - mxm_ptl_id_t ptlid) + +static inline int get_ptl_id(int dst) { - size_t addrlen; - mxm_error_t err; - - addrlen = sizeof(ep_info->addr.ptl_addr[ptlid]); - err = mxm_ep_address(mca_spml_ikrit.mxm_ep, - ptlid, - (struct sockaddr *) &ep_info->addr.ptl_addr[ptlid], - &addrlen); - if (MXM_OK != err) { - orte_show_help("help-oshmem-spml-ikrit.txt", - "unable to get endpoint address", - true, - mxm_error_string(err)); - return OSHMEM_ERROR; - } - - return OSHMEM_SUCCESS; + return mca_spml_ikrit.mxm_peers[dst].ptl_id; } -#else + static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { if (0 == mkey->len) { @@ -100,8 +82,6 @@ static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { } return (mxm_mem_key_t *)mkey->u.data; } -#endif - static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req) { @@ -114,101 +94,63 @@ static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req) } while (!mxm_req_test(req)); } -static int mca_spml_ikrit_put_request_free(struct oshmem_request_t** request) +static inline void free_put_req(mca_spml_ikrit_put_request_t *put_req) { - mca_spml_ikrit_put_request_t *put_req = - *(mca_spml_ikrit_put_request_t **) request; - - OPAL_THREAD_LOCK(&oshmem_request_lock); - assert(false == put_req->req_put.req_base.req_free_called); - put_req->req_put.req_base.req_free_called = true; opal_free_list_return (&mca_spml_base_put_requests, (opal_free_list_item_t*)put_req); opal_memchecker_base_mem_noaccess(put_req, sizeof(*put_req)); - OPAL_THREAD_UNLOCK(&oshmem_request_lock); - - *request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/ - - return OSHMEM_SUCCESS; } -static int mca_spml_ikrit_put_request_cancel(struct oshmem_request_t * request, - int complete) +static inline mca_spml_ikrit_put_request_t *alloc_put_req(void) { - return OSHMEM_SUCCESS; + mca_spml_ikrit_put_request_t *req; + opal_free_list_item_t* item; + + item = opal_free_list_wait (&mca_spml_base_put_requests); + assert(item != NULL); + + req = (mca_spml_ikrit_put_request_t *) item; + opal_memchecker_base_mem_undefined(req, sizeof(*req)); + + return req; } -static void mca_spml_ikrit_put_request_construct(mca_spml_ikrit_put_request_t* req) -{ - req->req_put.req_base.req_type = MCA_SPML_REQUEST_PUT; - req->req_put.req_base.req_oshmem.req_free = mca_spml_ikrit_put_request_free; - req->req_put.req_base.req_oshmem.req_cancel = - mca_spml_ikrit_put_request_cancel; -} - -static void mca_spml_ikrit_put_request_destruct(mca_spml_ikrit_put_request_t* req) -{ -} - -OBJ_CLASS_INSTANCE( mca_spml_ikrit_put_request_t, - mca_spml_base_put_request_t, - mca_spml_ikrit_put_request_construct, - mca_spml_ikrit_put_request_destruct); struct mca_spml_ikrit_get_request { - mca_spml_base_get_request_t req_get; - mxm_send_req_t mxm_req; + opal_free_list_item_t link; /* must be a first member */ + mxm_send_req_t mxm_req; }; typedef struct mca_spml_ikrit_get_request mca_spml_ikrit_get_request_t; -OBJ_CLASS_DECLARATION(mca_spml_ikrit_get_request_t); -static int mca_spml_ikrit_get_request_free(struct oshmem_request_t** request) +static inline void free_get_req(mca_spml_ikrit_get_request_t *get_req) { - mca_spml_ikrit_get_request_t *get_req = - *(mca_spml_ikrit_get_request_t **) request; - - OPAL_THREAD_LOCK(&oshmem_request_lock); - assert(false == get_req->req_get.req_base.req_free_called); - get_req->req_get.req_base.req_free_called = true; opal_free_list_return (&mca_spml_base_get_requests, (opal_free_list_item_t*)get_req); opal_memchecker_base_mem_noaccess(get_req, sizeof(*get_req)); - OPAL_THREAD_UNLOCK(&oshmem_request_lock); - - *request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/ - - return OSHMEM_SUCCESS; } -static int mca_spml_ikrit_get_request_cancel(struct oshmem_request_t * request, - int complete) +static inline mca_spml_ikrit_get_request_t *alloc_get_req(void) { - return OSHMEM_SUCCESS; + mca_spml_ikrit_get_request_t *req; + opal_free_list_item_t* item; + + item = opal_free_list_wait (&mca_spml_base_get_requests); + assert(item != NULL); + + req = (mca_spml_ikrit_get_request_t *) item; + opal_memchecker_base_mem_undefined(req, sizeof(*req)); + return req; } -static void mca_spml_ikrit_get_request_construct(mca_spml_ikrit_get_request_t* req) -{ - req->req_get.req_base.req_type = MCA_SPML_REQUEST_GET; - req->req_get.req_base.req_oshmem.req_free = mca_spml_ikrit_get_request_free; - req->req_get.req_base.req_oshmem.req_cancel = - mca_spml_ikrit_get_request_cancel; -} - -static void mca_spml_ikrit_get_request_destruct(mca_spml_ikrit_get_request_t* req) -{ -} - -OBJ_CLASS_INSTANCE( mca_spml_ikrit_get_request_t, - mca_spml_base_get_request_t, - mca_spml_ikrit_get_request_construct, - mca_spml_ikrit_get_request_destruct); int mca_spml_ikrit_put_simple(void* dst_addr, size_t size, void* src_addr, int dst); +static void mca_spml_ikrit_cache_mkeys(sshmem_mkey_t *, uint32_t seg, int remote_pe, int tr_id); + mca_spml_ikrit_t mca_spml_ikrit = { { /* Init mca_spml_base_module_t */ @@ -227,73 +169,60 @@ mca_spml_ikrit_t mca_spml_ikrit = { mca_spml_base_wait, mca_spml_base_wait_nb, mca_spml_ikrit_fence, - mca_spml_base_rmkey_unpack, + mca_spml_ikrit_cache_mkeys, mca_spml_base_rmkey_free, (void*)&mca_spml_ikrit } }; -#if MXM_API < MXM_VERSION(2,0) -void mca_spml_ikrit_dump_stats(void); -void mca_spml_ikrit_dump_stats() +static void mca_spml_ikrit_cache_mkeys(sshmem_mkey_t *mkey, uint32_t seg, int dst_pe, int tr_id) { - int num_procs; - int i; - char sbuf[1024]; - FILE *fp; + mxm_peer_t *peer; - fp = fmemopen(sbuf, sizeof(sbuf), "rw"); - num_procs = oshmem_num_procs(); - for (i = 0; i < num_procs; i++) { - mxm_print_conn_state(mca_spml_ikrit.mxm_peers[i]->mxm_conn, - MXM_STATE_DETAIL_LEVEL_DATA, - "", - fp); - printf("=========== pe:%d conn:%p stats:\n %s==================\n", - i, - mca_spml_ikrit.mxm_peers[i]->mxm_conn, - sbuf); - rewind(fp); + if (MXM_PTL_RDMA != tr_id) { + return; } - fclose(fp); -} -#endif -static inline mca_spml_ikrit_put_request_t *alloc_put_req(void) -{ - mca_spml_ikrit_put_request_t *req; - opal_free_list_item_t* item; + peer = &mca_spml_ikrit.mxm_peers[dst_pe]; + mkey_segment_init(&peer->mkeys[seg].super, mkey, seg); - item = opal_free_list_wait (&mca_spml_base_put_requests); - - req = (mca_spml_ikrit_put_request_t *) item; - opal_memchecker_base_mem_undefined(req, sizeof(*req)); - opal_memchecker_base_mem_defined(&req->req_put.req_base, - sizeof(req->req_put.req_base)); - - req->req_put.req_base.req_free_called = false; - req->req_put.req_base.req_oshmem.req_complete = false; - - return req; + if (0 != mkey->len) { + memcpy(&peer->mkeys[seg].key, mkey->u.data, mkey->len); + } else { + memcpy(&peer->mkeys[seg].key, &mxm_empty_mem_key, sizeof(mxm_empty_mem_key)); + } } -static inline mca_spml_ikrit_get_request_t *alloc_get_req(void) +mxm_mem_key_t *mca_spml_ikrit_get_mkey_slow(int pe, void *va, int ptl_id, void **rva) { - mca_spml_ikrit_get_request_t *req; - opal_free_list_item_t* item; + sshmem_mkey_t *mkey; - item = opal_free_list_wait (&mca_spml_base_get_requests); +retry: + mkey = mca_memheap_base_get_cached_mkey(pe, va, ptl_id, rva); + if (NULL == mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", pe, va); + oshmem_shmem_abort(-1); + return NULL; + } - req = (mca_spml_ikrit_get_request_t *) item; - opal_memchecker_base_mem_undefined(req, sizeof(*req)); - opal_memchecker_base_mem_defined(&req->req_get.req_base, - sizeof(req->req_get.req_base)); + if (MXM_PTL_SHM == ptl_id) { + if (mca_memheap_base_can_local_copy(mkey, va)) { + return NULL; + } - req->req_get.req_base.req_free_called = false; - req->req_get.req_base.req_oshmem.req_complete = false; + /* if dst addr is on memheap and local copy is not allowed + * disable direct shm transport + */ + if (memheap_is_va_in_segment(va, HEAP_SEG_INDEX)) { + mca_spml_ikrit.mxm_peers[pe].ptl_id = MXM_PTL_RDMA; + } + /* going via mxm must always work */ + ptl_id = MXM_PTL_RDMA; + goto retry; + } - return req; + return to_mxm_mkey(mkey); } int mca_spml_ikrit_enable(bool enable) @@ -306,7 +235,7 @@ int mca_spml_ikrit_enable(bool enable) opal_free_list_init (&mca_spml_base_put_requests, sizeof(mca_spml_ikrit_put_request_t), opal_cache_line_size, - OBJ_CLASS(mca_spml_ikrit_put_request_t), + OBJ_CLASS(opal_free_list_item_t), 0, opal_cache_line_size, mca_spml_ikrit.free_list_num, @@ -317,7 +246,7 @@ int mca_spml_ikrit_enable(bool enable) opal_free_list_init (&mca_spml_base_get_requests, sizeof(mca_spml_ikrit_get_request_t), opal_cache_line_size, - OBJ_CLASS(mca_spml_ikrit_get_request_t), + OBJ_CLASS(opal_free_list_item_t), 0, opal_cache_line_size, mca_spml_ikrit.free_list_num, @@ -330,63 +259,28 @@ int mca_spml_ikrit_enable(bool enable) return OSHMEM_SUCCESS; } -static int create_ptl_idx(int dst_pe) -{ - ompi_proc_t *proc; - - proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); - - OSHMEM_PROC_DATA(proc)->transport_ids = (char *) malloc(MXM_PTL_LAST * sizeof(char)); - if (NULL == OSHMEM_PROC_DATA(proc)->transport_ids) - return OSHMEM_ERROR; - - OSHMEM_PROC_DATA(proc)->num_transports = 1; -#if MXM_API < MXM_VERSION(2,0) - if (oshmem_my_proc_id() == dst_pe) - OSHMEM_PROC_DATA(proc)->transport_ids[0] = MXM_PTL_SELF; - else -#endif - OSHMEM_PROC_DATA(proc)->transport_ids[0] = MXM_PTL_RDMA; - return OSHMEM_SUCCESS; -} - -static void destroy_ptl_idx(int dst_pe) -{ - ompi_proc_t *proc; - - proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); - if (NULL != OSHMEM_PROC_DATA(proc)->transport_ids) - free(OSHMEM_PROC_DATA(proc)->transport_ids); -} - static void mxm_peer_construct(mxm_peer_t *p) { - p->pe = -1; p->n_active_puts = 0; - p->need_fence = 0; + p->need_fence = 0; + p->ptl_id = MXM_PTL_RDMA; + OBJ_CONSTRUCT(&p->link, opal_list_item_t); } static void mxm_peer_destruct(mxm_peer_t *p) { - /* may be we need to remov item from list */ + OBJ_DESTRUCT(&p->link); } -OBJ_CLASS_INSTANCE( mxm_peer_t, - opal_list_item_t, - mxm_peer_construct, - mxm_peer_destruct); - int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs) { size_t i, n; int my_rank = oshmem_my_proc_id(); oshmem_shmem_barrier(); -#if MXM_API >= MXM_VERSION(2,0) if (mca_spml_ikrit.bulk_disconnect) { mxm_ep_powerdown(mca_spml_ikrit.mxm_ep); } -#endif while (NULL != opal_list_remove_first(&mca_spml_ikrit.active_peers)) { }; @@ -394,13 +288,12 @@ int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs) for (n = 0; n < nprocs; n++) { i = (my_rank + n) % nprocs; - mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn); + mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i].mxm_conn); if (mca_spml_ikrit.hw_rdma_channel) { - assert(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i]->mxm_conn); - mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn); + assert(mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i].mxm_conn); + mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn); } - destroy_ptl_idx(i); - OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]); + mxm_peer_destruct(&mca_spml_ikrit.mxm_peers[i]); } free(mca_spml_ikrit.mxm_peers); @@ -411,13 +304,8 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) { spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL; spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL; - spml_ikrit_mxm_ep_conn_info_t my_ep_info = {{0}}; -#if MXM_API < MXM_VERSION(2,0) - mxm_conn_req_t *conn_reqs; - int timeout; -#else + spml_ikrit_mxm_ep_conn_info_t my_ep_info; size_t mxm_addr_len = MXM_MAX_ADDR_LEN; -#endif mxm_error_t err; size_t i, n; int rc = OSHMEM_ERROR; @@ -426,14 +314,6 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t); /* Allocate connection requests */ -#if MXM_API < MXM_VERSION(2,0) - conn_reqs = malloc(nprocs * sizeof(mxm_conn_req_t)); - if (NULL == conn_reqs) { - rc = OSHMEM_ERR_OUT_OF_RESOURCE; - goto bail; - } - memset(conn_reqs, 0x0, sizeof(mxm_conn_req_t)); -#endif ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs); if (NULL == ep_info) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; @@ -448,25 +328,14 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) } } - mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs - * sizeof(*(mca_spml_ikrit.mxm_peers))); + mca_spml_ikrit.mxm_peers = (mxm_peer_t *) calloc(nprocs , sizeof(mxm_peer_t)); if (NULL == mca_spml_ikrit.mxm_peers) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } -#if MXM_API < MXM_VERSION(2,0) - if (OSHMEM_SUCCESS - != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_SELF)) { - rc = OSHMEM_ERROR; - goto bail; - } - if (OSHMEM_SUCCESS - != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_RDMA)) { - rc = OSHMEM_ERROR; - goto bail; - } -#else + memset(&my_ep_info, 0, sizeof(my_ep_info)); + if (mca_spml_ikrit.hw_rdma_channel) { err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); if (MXM_OK != err) { @@ -485,7 +354,7 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) rc = OSHMEM_ERROR; goto bail; } -#endif + oshmem_shmem_allgather(&my_ep_info, ep_info, sizeof(spml_ikrit_mxm_ep_conn_info_t)); @@ -497,86 +366,35 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) /* mxm 2.0 keeps its connections on a list. Make sure * that list have different order on every rank */ i = (my_rank + n) % nprocs; - mca_spml_ikrit.mxm_peers[i] = OBJ_NEW(mxm_peer_t); - if (NULL == mca_spml_ikrit.mxm_peers[i]) { - rc = OSHMEM_ERR_OUT_OF_RESOURCE; - goto bail; - } - mca_spml_ikrit.mxm_peers[i]->pe = i; + mxm_peer_construct(&mca_spml_ikrit.mxm_peers[i]); -#if MXM_API < MXM_VERSION(2,0) - conn_reqs[i].ptl_addr[MXM_PTL_SELF] = - (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_SELF]; - conn_reqs[i].ptl_addr[MXM_PTL_SHM] = NULL; - conn_reqs[i].ptl_addr[MXM_PTL_RDMA] = - (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_RDMA]; -#else - err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_conn); + err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } - if (OSHMEM_SUCCESS != create_ptl_idx(i)) - goto bail; - mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i]->mxm_conn, mca_spml_ikrit.mxm_peers[i]); + mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i].mxm_conn, &mca_spml_ikrit.mxm_peers[i]); if (mca_spml_ikrit.hw_rdma_channel) { - err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn); + err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } } else { - mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i]->mxm_conn; + mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i].mxm_conn; } -#endif } -#if MXM_API < MXM_VERSION(2,0) - /* Connect to remote peers */ - if (mxm_get_version() < MXM_VERSION(1,5)) { - timeout = 1000; - } else { - timeout = -1; - } - err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, conn_reqs, nprocs, timeout); - if (MXM_OK != err) { - SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); - for (i = 0; i < nprocs; ++i) { - if (MXM_OK != conn_reqs[i].error) { - SPML_ERROR("MXM EP connect to %s error: %s\n", - procs[i]->proc_hostname, mxm_error_string(conn_reqs[i].error)); - } - } - rc = OSHMEM_ERR_CONNECTION_FAILED; - goto bail; - } - - /* Save returned connections */ - for (i = 0; i < nprocs; ++i) { - mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn; - if (OSHMEM_SUCCESS != create_ptl_idx(i)) { - rc = OSHMEM_ERR_CONNECTION_FAILED; - goto bail; - } - - mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]); - } - - if (conn_reqs) - free(conn_reqs); -#endif if (ep_info) free(ep_info); if (ep_hw_rdma_info) free(ep_hw_rdma_info); -#if MXM_API >= MXM_VERSION(2,0) if (mca_spml_ikrit.bulk_connect) { /* Need a barrier to ensure remote peers already created connection */ oshmem_shmem_barrier(); mxm_ep_wireup(mca_spml_ikrit.mxm_ep); } -#endif proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank); /* identify local processes and change transport to SHM */ @@ -588,20 +406,14 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) if (procs[i] == proc_self) continue; - /* use zcopy for put/get via sysv shared memory */ - OSHMEM_PROC_DATA(procs[i])->transport_ids[0] = MXM_PTL_SHM; - OSHMEM_PROC_DATA(procs[i])->transport_ids[1] = MXM_PTL_RDMA; - OSHMEM_PROC_DATA(procs[i])->num_transports = 2; + /* use zcopy for put/get via sysv shared memory with fallback to RDMA */ + mca_spml_ikrit.mxm_peers[i].ptl_id = MXM_PTL_SHM; } SPML_VERBOSE(50, "*** ADDED PROCS ***"); return OSHMEM_SUCCESS; bail: -#if MXM_API < MXM_VERSION(2,0) - if (conn_reqs) - free(conn_reqs); -#endif if (ep_info) free(ep_info); if (ep_hw_rdma_info) @@ -619,10 +431,9 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, { int i; sshmem_mkey_t *mkeys; -#if MXM_API >= MXM_VERSION(2,0) mxm_error_t err; mxm_mem_key_t *m_key; -#endif + int my_rank = oshmem_my_proc_id(); *count = 0; mkeys = (sshmem_mkey_t *) calloc(1, MXM_PTL_LAST * sizeof(*mkeys)); @@ -643,19 +454,10 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, } mkeys[i].spml_context = 0; break; -#if MXM_API < MXM_VERSION(2,0) - case MXM_PTL_SELF: - mkeys[i].len = 0; - mkeys[i].spml_context = 0; - mkeys[i].va_base = addr; - break; -#endif case MXM_PTL_RDMA: mkeys[i].va_base = addr; mkeys[i].spml_context = 0; -#if MXM_API < MXM_VERSION(2,0) - mkeys[i].len = 0; -#else + if (mca_spml_ikrit.ud_only) { mkeys[i].len = 0; break; @@ -681,7 +483,6 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, SPML_ERROR("Failed to get memory key: %s", mxm_error_string(err)); goto error_out; } -#endif break; default: @@ -690,9 +491,10 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, } SPML_VERBOSE(5, "rank %d ptl %d addr %p size %llu %s", - oshmem_proc_pe(oshmem_proc_local()), i, addr, (unsigned long long)size, + my_rank, i, addr, (unsigned long long)size, mca_spml_base_mkey2str(&mkeys[i])); + mca_spml_ikrit_cache_mkeys(&mkeys[i], memheap_find_segnum(addr), my_rank, i); } *count = MXM_PTL_LAST; @@ -714,16 +516,12 @@ int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys) for (i = 0; i < MXM_PTL_LAST; i++) { switch (i) { -#if MXM_API < MXM_VERSION(2,0) - case MXM_PTL_SELF: -#endif case MXM_PTL_SHM: break; case MXM_PTL_RDMA: /* dereg memory */ if (!mkeys[i].spml_context) break; -#if MXM_API >= MXM_VERSION(2,0) mxm_mem_unmap(mca_spml_ikrit.mxm_context, (void *)mkeys[i].va_base, (unsigned long)mkeys[i].spml_context, @@ -731,7 +529,6 @@ int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys) if (0 < mkeys[i].len) { free(mkeys[i].u.data); } -#endif break; } } @@ -741,23 +538,10 @@ int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys) } -static inline int get_ptl_id(int dst) -{ - ompi_proc_t *proc; - - /* get endpoint and btl */ - proc = oshmem_proc_group_all(dst); - if (!proc) { - SPML_ERROR("Can not find destination proc for pe=%d", dst); - oshmem_shmem_abort(-1); - return -1; - } - return OSHMEM_PROC_DATA(proc)->transport_ids[0]; -} - int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) { int ptl; + ptl = get_ptl_id(pe); if (ptl < 0) return OSHMEM_ERROR; @@ -765,71 +549,45 @@ int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) if (ptl != MXM_PTL_RDMA) return OSHMEM_ERROR; -#if MXM_API < MXM_VERSION(2,0) - if (seg > 1) - return OSHMEM_ERROR; - - mkeys[ptl].len = 0; - mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; - return OSHMEM_SUCCESS; -#else /* we are actually registering memory in 2.0 and later. * So can only skip mkey exchange when ud is the only transport */ if (mca_spml_ikrit.ud_only) { - mkeys[ptl].len = 0; - mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; + /* assumes that remote has the same va_base as we do */ + mkeys[ptl].len = 0; + mkeys[ptl].va_base = mca_memheap_seg2base_va(seg); + mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; + mca_spml_ikrit_cache_mkeys(&mkeys[ptl], seg, pe, ptl); return OSHMEM_SUCCESS; } return OSHMEM_ERROR; -#endif } -static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, - void *src_addr, - size_t size, - void *dst_addr, - int src) +static inline int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, + void *src_addr, + size_t size, + void *dst_addr, + int src) { /* shmem spec states that get() operations are blocking. So it is enough to have single mxm request. Also we count on mxm doing copy */ void *rva; - sshmem_mkey_t *r_mkey; - int ptl_id; + mxm_mem_key_t *mkey; - ptl_id = get_ptl_id(src); - /* already tried to send via shm and failed. go via rdma */ - if (ptl_id == MXM_PTL_SHM) - ptl_id = MXM_PTL_RDMA; + mkey = mca_spml_ikrit_get_mkey(src, src_addr, MXM_PTL_RDMA, &rva); - /** - * Get the address to the remote rkey. - **/ - r_mkey = mca_memheap_base_get_cached_mkey(src, src_addr, ptl_id, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - src, src_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } - - SPML_VERBOSE(100, - "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", - src, ptl_id, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); + SPML_VERBOSE_FASTPATH(100, + "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. src_rva=%p", + src, MXM_PTL_RDMA, src_addr, dst_addr, (int)size, (void *)rva); /* mxm does not really cares for get lkey */ sreq->base.mq = mca_spml_ikrit.mxm_mq; - sreq->base.conn = mca_spml_ikrit.mxm_peers[src]->mxm_conn; + sreq->base.conn = mca_spml_ikrit.mxm_peers[src].mxm_conn; sreq->base.data_type = MXM_REQ_DATA_BUFFER; sreq->base.data.buffer.ptr = dst_addr; sreq->base.data.buffer.length = size; -#if MXM_API < MXM_VERSION(2,0) - sreq->base.data.buffer.memh = NULL; - sreq->op.mem.remote_memh = NULL; -#else - sreq->op.mem.remote_mkey = to_mxm_mkey(r_mkey); -#endif + sreq->op.mem.remote_mkey = mkey; sreq->opcode = MXM_REQ_OP_GET; sreq->op.mem.remote_vaddr = (intptr_t) rva; sreq->base.state = MXM_REQ_NEW; @@ -844,7 +602,6 @@ static inline int mca_spml_ikrit_get_shm(void *src_addr, { int ptl_id; void *rva; - sshmem_mkey_t *r_mkey; ptl_id = get_ptl_id(src); /** @@ -853,20 +610,13 @@ static inline int mca_spml_ikrit_get_shm(void *src_addr, if (ptl_id != MXM_PTL_SHM) return OSHMEM_ERROR; - r_mkey = mca_memheap_base_get_cached_mkey(src, src_addr, ptl_id, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - src, src_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } - - if (!mca_memheap_base_can_local_copy(r_mkey, src_addr)) + if (NULL != mca_spml_ikrit_get_mkey(src, src_addr, MXM_PTL_SHM, &rva)) return OSHMEM_ERROR; - SPML_VERBOSE(100, - "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", - src, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); + SPML_VERBOSE_FASTPATH(100, + "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p", + src, src_addr, dst_addr, (int)size, (void *)rva); + memcpy(dst_addr, (void *) (unsigned long) rva, size); opal_progress(); return OSHMEM_SUCCESS; @@ -922,18 +672,13 @@ static inline void get_completion_cb(void *ctx) mca_spml_ikrit_get_request_t *get_req = (mca_spml_ikrit_get_request_t *) ctx; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, -1); - get_req->req_get.req_base.req_spml_complete = true; - get_req->req_get.req_base.req_oshmem.req_status.SHMEM_ERROR = - OSHMEM_SUCCESS; - oshmem_request_complete(&get_req->req_get.req_base.req_oshmem, 1); - oshmem_request_free((oshmem_request_t**) &get_req); + free_get_req(get_req); } -/* extension. used 4 fence implementation b4 fence was added to mxm */ -int mca_spml_ikrit_get_async(void *src_addr, - size_t size, - void *dst_addr, - int src) +static inline int mca_spml_ikrit_get_async(void *src_addr, + size_t size, + void *dst_addr, + int src) { mca_spml_ikrit_get_request_t *get_req; @@ -941,27 +686,17 @@ int mca_spml_ikrit_get_async(void *src_addr, return OSHMEM_SUCCESS; get_req = alloc_get_req(); - if (NULL == get_req) { - SPML_ERROR("out of get requests - aborting"); + + if (OSHMEM_SUCCESS != mca_spml_ikrit_get_helper(&get_req->mxm_req, + src_addr, + size, + dst_addr, + src)) { oshmem_shmem_abort(-1); return OSHMEM_ERROR; } - if (OSHMEM_SUCCESS - != mca_spml_ikrit_get_helper(&get_req->mxm_req, - src_addr, - size, - dst_addr, - src)) { - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } - -#if MXM_API < MXM_VERSION(2,0) - get_req->mxm_req.base.flags = 0; -#else get_req->mxm_req.flags = 0; -#endif get_req->mxm_req.base.completed_cb = get_completion_cb; get_req->mxm_req.base.context = get_req; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1); @@ -977,11 +712,7 @@ static inline void fence_completion_cb(void *ctx) (mca_spml_ikrit_get_request_t *) ctx; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, -1); - fence_req->req_get.req_base.req_spml_complete = true; - fence_req->req_get.req_base.req_oshmem.req_status.SHMEM_ERROR = - OSHMEM_SUCCESS; - oshmem_request_complete(&fence_req->req_get.req_base.req_oshmem, 1); - oshmem_request_free((oshmem_request_t**) &fence_req); + free_get_req(fence_req); } static int mca_spml_ikrit_mxm_fence(int dst) @@ -989,18 +720,9 @@ static int mca_spml_ikrit_mxm_fence(int dst) mca_spml_ikrit_get_request_t *fence_req; fence_req = alloc_get_req(); - if (NULL == fence_req) { - SPML_ERROR("out of get requests - aborting"); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; - fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; -#if MXM_API < MXM_VERSION(2,0) - fence_req->mxm_req.opcode = MXM_REQ_OP_FENCE; - fence_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC; -#else + fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; fence_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; fence_req->mxm_req.flags = MXM_REQ_SEND_FLAG_FENCE; fence_req->mxm_req.op.mem.remote_vaddr = 0; @@ -1008,7 +730,6 @@ static int mca_spml_ikrit_mxm_fence(int dst) fence_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; fence_req->mxm_req.base.data.buffer.ptr = 0; fence_req->mxm_req.base.data.buffer.length = 0; -#endif fence_req->mxm_req.base.state = MXM_REQ_NEW; fence_req->mxm_req.base.completed_cb = fence_completion_cb; fence_req->mxm_req.base.context = fence_req; @@ -1024,7 +745,8 @@ static inline void put_completion_cb(void *ctx) mxm_peer_t *peer; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1); - peer = mca_spml_ikrit.mxm_peers[put_req->pe]; + /* TODO: keep pointer to peer in the request */ + peer = &mca_spml_ikrit.mxm_peers[put_req->pe]; /* this was last put in progress. Remove peer from the list so that we do not need explicit fence */ #if SPML_IKRIT_PUT_DEBUG == 1 @@ -1041,26 +763,14 @@ static inline void put_completion_cb(void *ctx) if (0 < peer->n_active_puts) { peer->n_active_puts--; -#if MXM_API < MXM_VERSION(2,0) - if (0 == peer->n_active_puts && - (put_req->mxm_req.base.flags & MXM_REQ_FLAG_SEND_SYNC)) { - opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->super); - peer->need_fence = 0; - } -#else if (0 == peer->n_active_puts && (put_req->mxm_req.opcode == MXM_REQ_OP_PUT_SYNC)) { - opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->super); + opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->link); peer->need_fence = 0; } -#endif } - put_req->req_put.req_base.req_spml_complete = true; - put_req->req_put.req_base.req_oshmem.req_status.SHMEM_ERROR = - OSHMEM_SUCCESS; - oshmem_request_complete(&put_req->req_put.req_base.req_oshmem, 1); - oshmem_request_free((oshmem_request_t**) &put_req); + free_put_req(put_req); } /** @@ -1076,60 +786,30 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, void *rva; mca_spml_ikrit_put_request_t *put_req; int ptl_id; - sshmem_mkey_t *r_mkey; static int count; int need_progress = 0; + mxm_mem_key_t *mkey; - if (0 >= size) { + if (OPAL_UNLIKELY(0 >= size)) { return OSHMEM_SUCCESS; } ptl_id = get_ptl_id(dst); - /* Get rkey of remote PE (dst proc) which must be on memheap */ - r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - dst, dst_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; + mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva); + + if (OPAL_UNLIKELY(NULL == mkey)) { + memcpy((void *) (unsigned long) rva, src_addr, size); + /* call progress as often as we would have with regular put */ + if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) + mxm_progress(mca_spml_ikrit.mxm_context); + return OSHMEM_SUCCESS; } -#if SPML_IKRIT_PUT_DEBUG == 1 - - SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); -#endif - if (ptl_id == MXM_PTL_SHM) { - - if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { - memcpy((void *) (unsigned long) rva, src_addr, size); - /* call progress as often as we would have with regular put */ - if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) - mxm_progress(mca_spml_ikrit.mxm_context); - return OSHMEM_SUCCESS; - } - /* segment not mapped - fallback to rmda */ - ptl_id = MXM_PTL_RDMA; - r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - dst, dst_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } - } - -#if SPML_IKRIT_PUT_DEBUG == 1 - SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); -#endif + SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva); put_req = alloc_put_req(); - if (NULL == put_req) { - SPML_ERROR("out of put requests - aborting"); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + if (handle) *handle = put_req; @@ -1137,20 +817,10 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; /* request immediate responce if we are getting low on send buffers. We only get responce from remote on ack timeout. * Also request explicit ack once in a while */ -#if MXM_API < MXM_VERSION(2,0) - put_req->mxm_req.opcode = MXM_REQ_OP_PUT; - if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || - (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { - put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC; - need_progress = 1; - } else { - put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_LAZY|MXM_REQ_FLAG_SEND_SYNC; - } -#else put_req->mxm_req.flags = 0; if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max || - (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { + (mca_spml_ikrit.mxm_peers[dst].n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { need_progress = 1; put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; } else { @@ -1163,9 +833,8 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; } } -#endif - put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; put_req->mxm_req.base.data.buffer.ptr = src_addr; put_req->mxm_req.base.data.buffer.length = size; @@ -1175,21 +844,16 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.base.state = MXM_REQ_NEW; put_req->pe = dst; -#if MXM_API < MXM_VERSION(2,0) - put_req->mxm_req.base.data.buffer.memh = NULL; - put_req->mxm_req.op.mem.remote_memh = NULL; -#else - put_req->mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); -#endif + put_req->mxm_req.op.mem.remote_mkey = mkey; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1); - if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { + if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, - &mca_spml_ikrit.mxm_peers[dst]->super); - mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; + &mca_spml_ikrit.mxm_peers[dst].link); + mca_spml_ikrit.mxm_peers[dst].need_fence = 1; } - mca_spml_ikrit.mxm_peers[dst]->n_active_puts++; + mca_spml_ikrit.mxm_peers[dst].n_active_puts++; SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req); @@ -1214,60 +878,30 @@ int mca_spml_ikrit_put_simple(void* dst_addr, mxm_send_req_t mxm_req; mxm_wait_t wait; int ptl_id; - sshmem_mkey_t *r_mkey; + mxm_mem_key_t *mkey; static int count; ptl_id = get_ptl_id(dst); - /* Get rkey of remote PE (dst proc) which must be on memheap */ - r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - dst, dst_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; + mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva); + + SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva); + + if (NULL == mkey) { + memcpy((void *) (unsigned long) rva, src_addr, size); + /* call progress as often as we would have with regular put */ + if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) + mxm_progress(mca_spml_ikrit.mxm_context); + return OSHMEM_SUCCESS; } -#if SPML_IKRIT_PUT_DEBUG == 1 - SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); -#endif - if (ptl_id == MXM_PTL_SHM) { - - if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { - memcpy((void *) (unsigned long) rva, src_addr, size); - /* call progress as often as we would have with regular put */ - if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) - mxm_progress(mca_spml_ikrit.mxm_context); - return OSHMEM_SUCCESS; - } - /* segment not mapped - fallback to rmda */ - ptl_id = MXM_PTL_RDMA; - r_mkey = mca_memheap_base_get_cached_mkey(dst, - //(unsigned long) dst_addr, - dst_addr, - ptl_id, - &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - dst, dst_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } - } - -#if SPML_IKRIT_PUT_DEBUG == 1 - SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); -#endif + SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", + dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva); /* fill out request */ mxm_req.base.mq = mca_spml_ikrit.mxm_mq; -#if MXM_API < MXM_VERSION(2,0) - mxm_req.base.flags = MXM_REQ_FLAG_BLOCKING; -#else mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING; -#endif - mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; mxm_req.base.data.buffer.ptr = src_addr; mxm_req.base.data.buffer.length = size; @@ -1278,17 +912,12 @@ int mca_spml_ikrit_put_simple(void* dst_addr, mxm_req.base.state = MXM_REQ_NEW; mxm_req.base.error = MXM_OK; -#if MXM_API < MXM_VERSION(2, 0) - mxm_req.base.data.buffer.memh = NULL; - mxm_req.op.mem.remote_memh = NULL; -#else - mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); -#endif + mxm_req.op.mem.remote_mkey = mkey; - if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { + if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, - &mca_spml_ikrit.mxm_peers[dst]->super); - mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; + &mca_spml_ikrit.mxm_peers[dst].link); + mca_spml_ikrit.mxm_peers[dst].need_fence = 1; } SPML_IKRIT_MXM_POST_SEND(mxm_req); @@ -1361,14 +990,14 @@ int mca_spml_ikrit_fence(void) /* puts(unless are send sync) are completed by remote side lazily. That is either when remote decides to * ack window which can take hundreds of ms. So speed things up by doing fence */ while (NULL != (item = opal_list_remove_first(&mca_spml_ikrit.active_peers))) { - peer = (mxm_peer_t *) item; + peer = spml_ikrit_container_of(item, mxm_peer_t, link); peer->n_active_puts = 0; peer->need_fence = 0; - mca_spml_ikrit_mxm_fence(peer->pe); + mca_spml_ikrit_mxm_fence(peer - mca_spml_ikrit.mxm_peers); } - while (0 < mca_spml_ikrit.n_mxm_fences) { - oshmem_request_wait_any_completion(); + while (0 < mca_spml_ikrit.n_mxm_fences || 0 < mca_spml_ikrit.n_active_gets) { + opal_progress(); } SPML_VERBOSE(20, "fence completed"); @@ -1392,9 +1021,6 @@ int mca_spml_ikrit_recv(void* buf, size_t size, int src) req.base.state = MXM_REQ_NEW; req.base.mq = mca_spml_ikrit.mxm_mq; req.base.conn = NULL; -#if MXM_API < MXM_VERSION(2,0) - req.base.flags = MXM_REQ_FLAG_BLOCKING; -#endif req.base.completed_cb = NULL; req.base.data_type = MXM_REQ_DATA_BUFFER; @@ -1435,12 +1061,8 @@ int mca_spml_ikrit_send(void* buf, req.base.state = MXM_REQ_NEW; req.base.mq = mca_spml_ikrit.mxm_mq; - req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; -#if MXM_API < MXM_VERSION(2,0) - req.base.flags = MXM_REQ_FLAG_BLOCKING; -#else - req.flags = MXM_REQ_SEND_FLAG_BLOCKING; -#endif + req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; + req.flags = MXM_REQ_SEND_FLAG_BLOCKING; req.base.completed_cb = NULL; req.base.data_type = MXM_REQ_DATA_BUFFER; diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h index 45117b500d..c6810dbce5 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.h +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -33,6 +33,7 @@ #include "opal/class/opal_list.h" #include "orte/runtime/orte_globals.h" +#include "oshmem/mca/memheap/base/base.h" #include @@ -40,11 +41,6 @@ #define MXM_VERSION(major, minor) (((major)< -#include -#endif - #define MXM_SHMEM_MQ_ID 0x7119 /* start request explicit ack once our buffer pool is less than watermark */ @@ -52,22 +48,38 @@ /* request explicit ack (SYNC) per every X put requests per connection */ #define SPML_IKRIT_PACKETS_PER_SYNC 64 +#define spml_ikrit_container_of(ptr, type, member) ( \ + (type *)( ((char *)(ptr)) - offsetof(type,member) )) + +#define MXM_MAX_ADDR_LEN 512 + +#define MXM_PTL_RDMA 0 +#define MXM_PTL_SHM 1 +#define MXM_PTL_LAST 2 + BEGIN_C_DECLS /** - * UD MXM SPML module + * MXM SPML module */ +/* TODO: move va_xx to base struct */ +struct spml_ikrit_mkey { + mkey_segment_t super; + mxm_mem_key_t key; +}; +typedef struct spml_ikrit_mkey spml_ikrit_mkey_t; + struct mxm_peer { - opal_list_item_t super; mxm_conn_h mxm_conn; mxm_conn_h mxm_hw_rdma_conn; - int pe; + uint8_t ptl_id; + uint8_t need_fence; int32_t n_active_puts; - int need_fence; + opal_list_item_t link; + spml_ikrit_mkey_t mkeys[MCA_MEMHEAP_SEG_COUNT]; }; typedef struct mxm_peer mxm_peer_t; -OBJ_CLASS_DECLARATION(mxm_peer_t); struct mca_spml_ikrit_t { mca_spml_base_module_t super; @@ -79,7 +91,7 @@ struct mca_spml_ikrit_t { mxm_ep_h mxm_ep; mxm_ep_h mxm_hw_rdma_ep; mxm_mq_h mxm_mq; - mxm_peer_t **mxm_peers; + mxm_peer_t *mxm_peers; int32_t n_active_puts; int32_t n_active_gets; @@ -103,22 +115,13 @@ struct mca_spml_ikrit_t { int hw_rdma_channel; /* true if we provide separate channel that has true one sided capability */ int np; -#if MXM_API >= MXM_VERSION(2,0) int unsync_conn_max; -#endif size_t put_zcopy_threshold; /* enable zcopy in put if message size is greater than the threshold */ }; typedef struct mca_spml_ikrit_t mca_spml_ikrit_t; -#define MXM_MAX_ADDR_LEN 512 - -#if MXM_API >= MXM_VERSION(2,0) -#define MXM_PTL_SHM 0 -#define MXM_PTL_RDMA 1 -#define MXM_PTL_LAST 2 -#endif typedef struct spml_ikrit_mxm_ep_conn_info_t { union { @@ -139,11 +142,6 @@ extern int mca_spml_ikrit_get_nb(void* src_addr, void* dst_addr, int src, void **handle); -/* extension. used 4 fence implementation b4 fence was added to mxm */ -extern int mca_spml_ikrit_get_async(void *src_addr, - size_t size, - void *dst_addr, - int src); extern int mca_spml_ikrit_put(void* dst_addr, size_t size, @@ -167,7 +165,7 @@ extern sshmem_mkey_t *mca_spml_ikrit_register(void* addr, int *count); extern int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys); extern int mca_spml_ikrit_oob_get_mkeys(int pe, - uint32_t seg, + uint32_t segno, sshmem_mkey_t *mkeys); extern int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs); @@ -175,6 +173,30 @@ extern int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs); extern int mca_spml_ikrit_fence(void); extern int spml_ikrit_progress(void); +mxm_mem_key_t *mca_spml_ikrit_get_mkey_slow(int pe, void *va, int ptl_id, void **rva); + +/* the functionreturns NULL if data can be directly copied via shared memory + * else it returns mxm mem key + * + * the function will abort() if va is not symmetric var address. + */ +static inline mxm_mem_key_t *mca_spml_ikrit_get_mkey(int pe, void *va, int ptl_id, void **rva) +{ + spml_ikrit_mkey_t *mkey; + + if (OPAL_UNLIKELY(MXM_PTL_RDMA != ptl_id)) { + return mca_spml_ikrit_get_mkey_slow(pe, va, ptl_id, rva); + } + + mkey = mca_spml_ikrit.mxm_peers[pe].mkeys; + mkey = (spml_ikrit_mkey_t *)map_segment_find_va(&mkey->super.super, sizeof(*mkey), va); + if (OPAL_UNLIKELY(NULL == mkey)) { + return mca_spml_ikrit_get_mkey_slow(pe, va, ptl_id, rva); + } + *rva = map_segment_va2rva(&mkey->super, va); + return &mkey->key; +} + END_C_DECLS #endif diff --git a/oshmem/mca/spml/ikrit/spml_ikrit_component.c b/oshmem/mca/spml/ikrit/spml_ikrit_component.c index e698cfa8d8..45cba8eb4b 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit_component.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit_component.c @@ -233,17 +233,11 @@ static int mca_spml_ikrit_component_register(void) &mca_spml_ikrit.mxm_tls); mca_spml_ikrit_param_register_int("np", -#if MXM_API <= MXM_VERSION(2,0) - 128, -#else - 0, -#endif - "[integer] Minimal allowed job's NP to activate ikrit", &mca_spml_ikrit.np); -#if MXM_API >= MXM_VERSION(2,0) + 0, + "[integer] Minimal allowed job's NP to activate ikrit", &mca_spml_ikrit.np); mca_spml_ikrit_param_register_int("unsync_conn_max", 8, "[integer] Max number of connections that do not require notification of PUT operation remote completion. Increasing this number improves efficiency of p2p communication but increases overhead of shmem_fence/shmem_quiet/shmem_barrier", &mca_spml_ikrit.unsync_conn_max); -#endif mca_spml_ikrit_param_register_size_t("put_zcopy_threshold", 16384ULL, "[size_t] Use zero copy put if message size is greater than the threshold", @@ -312,10 +306,6 @@ static int mca_spml_ikrit_component_open(void) return OSHMEM_ERROR; } -#if MXM_API < MXM_VERSION(2,0) - mca_spml_ikrit.ud_only = 1; - mca_spml_ikrit.mxm_ctx_opts->ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA)); -#endif SPML_VERBOSE(5, "UD only mode is %s", mca_spml_ikrit.ud_only ? "enabled" : "disabled"); @@ -354,15 +344,10 @@ static int mca_spml_ikrit_component_close(void) } if (mca_spml_ikrit.mxm_context) { mxm_cleanup(mca_spml_ikrit.mxm_context); -#if MXM_API < MXM_VERSION(2,0) - mxm_config_free(mca_spml_ikrit.mxm_ep_opts); - mxm_config_free(mca_spml_ikrit.mxm_ctx_opts); -#else mxm_config_free_ep_opts(mca_spml_ikrit.mxm_ep_opts); mxm_config_free_context_opts(mca_spml_ikrit.mxm_ctx_opts); if (mca_spml_ikrit.hw_rdma_channel) mxm_config_free_ep_opts(mca_spml_ikrit.mxm_ep_hw_rdma_opts); -#endif } mca_spml_ikrit.mxm_mq = NULL; mca_spml_ikrit.mxm_context = NULL; @@ -373,14 +358,6 @@ static int spml_ikrit_mxm_init(void) { mxm_error_t err; -#if MXM_API < MXM_VERSION(2,0) - /* Only relevant for SHM PTL - ignore */ - mca_spml_ikrit.mxm_ep_opts->job_id = 0; - mca_spml_ikrit.mxm_ep_opts->local_rank = 0; - mca_spml_ikrit.mxm_ep_opts->num_local_procs = 0; - mca_spml_ikrit.mxm_ep_opts->rdma.drain_cq = 1; -#endif - /* Open MXM endpoint */ err = mxm_ep_create(mca_spml_ikrit.mxm_context, mca_spml_ikrit.mxm_ep_opts, diff --git a/oshmem/mca/spml/spml.h b/oshmem/mca/spml/spml.h index ffcc61f411..f081b8b7b1 100644 --- a/oshmem/mca/spml/spml.h +++ b/oshmem/mca/spml/spml.h @@ -118,7 +118,7 @@ typedef int (*mca_spml_base_module_wait_fn_t)(void* addr, * * @param mkey remote mkey */ -typedef void (*mca_spml_base_module_mkey_unpack_fn_t)(sshmem_mkey_t *, int remote_pe); +typedef void (*mca_spml_base_module_mkey_unpack_fn_t)(sshmem_mkey_t *, uint32_t segno, int remote_pe, int tr_id); /** * free resources used by deserialized remote mkey @@ -149,9 +149,9 @@ typedef int (*mca_spml_base_module_deregister_fn_t)(sshmem_mkey_t *mkeys); /** * try to fill up mkeys that can be used to reach remote pe. - * @param pe remote pe + * @param pe remote pe * @param seg 0 - symmetric heap, 1 - static data, everything else are static data in .so - * @param mkeys mkeys array + * @param mkeys mkeys array * * @return OSHMEM_SUCCSESS if keys are found */ diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 008e0ed779..3f49a5ea9a 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -115,7 +115,6 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) int my_rank = oshmem_my_proc_id(); size_t num_reqs, max_reqs; void *dreq, **dreqs; - ompi_proc_t *proc; ucp_ep_h ep; size_t i, n; @@ -157,7 +156,7 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) mca_spml_ucx.ucp_peers[n].ucp_conn = NULL; - if (num_reqs >= mca_spml_ucx.num_disconnect) { + if ((int)num_reqs >= mca_spml_ucx.num_disconnect) { mca_spml_ucx_waitall(dreqs, &num_reqs); } } @@ -322,6 +321,21 @@ error: } + +spml_ucx_mkey_t * mca_spml_ucx_get_mkey_slow(int pe, void *va, void **rva) +{ + sshmem_mkey_t *r_mkey; + + r_mkey = mca_memheap_base_get_cached_mkey(pe, va, 0, rva); + if (OPAL_UNLIKELY(!r_mkey)) { + SPML_ERROR("pe=%d: %p is not address of symmetric variable", + pe, va); + oshmem_shmem_abort(-1); + return NULL; + } + return (spml_ucx_mkey_t *)(r_mkey->spml_context); +} + void mca_spml_ucx_rmkey_free(sshmem_mkey_t *mkey) { spml_ucx_mkey_t *ucx_mkey; @@ -331,20 +345,23 @@ void mca_spml_ucx_rmkey_free(sshmem_mkey_t *mkey) } ucx_mkey = (spml_ucx_mkey_t *)(mkey->spml_context); ucp_rkey_destroy(ucx_mkey->rkey); - free(ucx_mkey); } -void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, int pe) +static void mca_spml_ucx_cache_mkey(sshmem_mkey_t *mkey, uint32_t segno, int dst_pe) +{ + ucp_peer_t *peer; + + peer = &mca_spml_ucx.ucp_peers[dst_pe]; + mkey_segment_init(&peer->mkeys[segno].super, mkey, segno); +} + +void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, uint32_t segno, int pe, int tr_id) { spml_ucx_mkey_t *ucx_mkey; ucs_status_t err; - - ucx_mkey = (spml_ucx_mkey_t *)malloc(sizeof(*ucx_mkey)); - if (!ucx_mkey) { - SPML_ERROR("not enough memory to allocate mkey"); - goto error_fatal; - } + ucx_mkey = &mca_spml_ucx.ucp_peers[pe].mkeys[segno].key; + err = ucp_ep_rkey_unpack(mca_spml_ucx.ucp_peers[pe].ucp_conn, mkey->u.data, &ucx_mkey->rkey); @@ -354,6 +371,7 @@ void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, int pe) } mkey->spml_context = ucx_mkey; + mca_spml_ucx_cache_mkey(mkey, segno, pe); return; error_fatal: @@ -370,23 +388,23 @@ sshmem_mkey_t *mca_spml_ucx_register(void* addr, ucs_status_t err; spml_ucx_mkey_t *ucx_mkey; size_t len; + int my_pe = oshmem_my_proc_id(); + int seg; *count = 0; mkeys = (sshmem_mkey_t *) calloc(1, sizeof(*mkeys)); if (!mkeys) { - return NULL ; + return NULL; } - ucx_mkey = (spml_ucx_mkey_t *)malloc(sizeof(*ucx_mkey)); - if (!ucx_mkey) { - goto error_out; - } + seg = memheap_find_segnum(addr); + ucx_mkey = &mca_spml_ucx.ucp_peers[my_pe].mkeys[seg].key; mkeys[0].spml_context = ucx_mkey; - err = ucp_mem_map(mca_spml_ucx.ucp_context, - &addr, size, 0, &ucx_mkey->mem_h); + + err = ucp_mem_map(mca_spml_ucx.ucp_context, &addr, size, 0, &ucx_mkey->mem_h); if (UCS_OK != err) { - goto error_out1; + goto error_out; } err = ucp_rkey_pack(mca_spml_ucx.ucp_context, ucx_mkey->mem_h, @@ -412,12 +430,11 @@ sshmem_mkey_t *mca_spml_ucx_register(void* addr, mkeys[0].len = len; mkeys[0].va_base = addr; *count = 1; + mca_spml_ucx_cache_mkey(&mkeys[0], seg, my_pe); return mkeys; error_unmap: ucp_mem_unmap(mca_spml_ucx.ucp_context, ucx_mkey->mem_h); -error_out1: - free(ucx_mkey); error_out: free(mkeys); @@ -442,7 +459,6 @@ int mca_spml_ucx_deregister(sshmem_mkey_t *mkeys) ucp_rkey_buffer_release(mkeys[0].u.data); } - free(ucx_mkey); return OSHMEM_SUCCESS; } diff --git a/oshmem/mca/spml/ucx/spml_ucx.h b/oshmem/mca/spml/ucx/spml_ucx.h index 5e828e0637..0e0e01b7f1 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.h +++ b/oshmem/mca/spml/ucx/spml_ucx.h @@ -40,10 +40,22 @@ BEGIN_C_DECLS /** * UCX SPML module */ -struct ucp_peer { - ucp_ep_h ucp_conn; -}; +struct spml_ucx_mkey { + ucp_rkey_h rkey; + ucp_mem_h mem_h; +}; +typedef struct spml_ucx_mkey spml_ucx_mkey_t; +struct spml_ucx_cached_mkey { + mkey_segment_t super; + spml_ucx_mkey_t key; +}; +typedef struct spml_ucx_cached_mkey spml_ucx_cached_mkey_t; + +struct ucp_peer { + ucp_ep_h ucp_conn; + spml_ucx_cached_mkey_t mkeys[MCA_MEMHEAP_SEG_COUNT]; +}; typedef struct ucp_peer ucp_peer_t; struct mca_spml_ucx { @@ -56,16 +68,8 @@ struct mca_spml_ucx { int priority; /* component priority */ bool enabled; }; - typedef struct mca_spml_ucx mca_spml_ucx_t; -struct spml_ucx_mkey { - ucp_rkey_h rkey; - ucp_mem_h mem_h; -}; - -typedef struct spml_ucx_mkey spml_ucx_mkey_t; - extern mca_spml_ucx_t mca_spml_ucx; @@ -103,7 +107,7 @@ extern sshmem_mkey_t *mca_spml_ucx_register(void* addr, int *count); extern int mca_spml_ucx_deregister(sshmem_mkey_t *mkeys); -extern void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, int pe); +extern void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, uint32_t segno, int pe, int tr_id); extern void mca_spml_ucx_rmkey_free(sshmem_mkey_t *mkey); extern int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs); @@ -113,30 +117,38 @@ extern int mca_spml_ucx_quiet(void); extern int spml_ucx_progress(void); +spml_ucx_mkey_t * mca_spml_ucx_get_mkey_slow(int pe, void *va, void **rva); static inline spml_ucx_mkey_t * mca_spml_ucx_get_mkey(int pe, void *va, void **rva) { - sshmem_mkey_t *r_mkey; + spml_ucx_cached_mkey_t *mkey; - r_mkey = mca_memheap_base_get_cached_mkey(pe, va, 0, rva); - if (OPAL_UNLIKELY(!r_mkey)) { - SPML_ERROR("pe=%d: %p is not address of symmetric variable", - pe, va); - oshmem_shmem_abort(-1); - return NULL; + mkey = mca_spml_ucx.ucp_peers[pe].mkeys; + mkey = (spml_ucx_cached_mkey_t *)map_segment_find_va(&mkey->super.super, sizeof(*mkey), va); + if (OPAL_UNLIKELY(NULL == mkey)) { + return mca_spml_ucx_get_mkey_slow(pe, va, rva); } - return (spml_ucx_mkey_t *)(r_mkey->spml_context); + *rva = map_segment_va2rva(&mkey->super, va); + return &mkey->key; } static inline int ucx_status_to_oshmem(ucs_status_t status) { +#if OSHMEM_PARAM_CHECK == 1 return OPAL_LIKELY(UCS_OK == status) ? OSHMEM_SUCCESS : OSHMEM_ERROR; +#else + return OSHMEM_SUCCESS; +#endif } static inline int ucx_status_to_oshmem_nb(ucs_status_t status) { +#if OSHMEM_PARAM_CHECK == 1 return OPAL_LIKELY(status >= 0) ? OSHMEM_SUCCESS : OSHMEM_ERROR; +#else + return OSHMEM_SUCCESS; +#endif } END_C_DECLS diff --git a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c index 8d5ef386db..07de1e6d58 100644 --- a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c +++ b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c @@ -118,8 +118,8 @@ shmem_ds_reset(map_segment_t *ds_buf) MAP_SEGMENT_RESET_FLAGS(ds_buf); ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; - ds_buf->seg_base_addr = 0; - ds_buf->end = 0; + ds_buf->super.va_base = 0; + ds_buf->super.va_end = 0; ds_buf->seg_size = 0; ds_buf->type = MAP_SEGMENT_UNKNOWN; unlink(ds_buf->seg_name); @@ -218,9 +218,9 @@ segment_create(map_segment_t *ds_buf, */ ds_buf->seg_id = oshmem_my_proc_id(); } - ds_buf->seg_base_addr = addr; - ds_buf->seg_size = size; - ds_buf->end = (void*)((uintptr_t)ds_buf->seg_base_addr + ds_buf->seg_size); + ds_buf->super.va_base = addr; + ds_buf->seg_size = size; + ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); OPAL_OUTPUT_VERBOSE( (70, oshmem_sshmem_base_framework.framework_output, @@ -229,7 +229,7 @@ segment_create(map_segment_t *ds_buf, mca_sshmem_mmap_component.super.base_version.mca_type_name, mca_sshmem_mmap_component.super.base_version.mca_component_name, (rc ? "failure" : "successful"), - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); return rc; @@ -319,7 +319,7 @@ segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n", mca_sshmem_mmap_component.super.base_version.mca_type_name, mca_sshmem_mmap_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, mkey->va_base, mkey->len, (unsigned long long)mkey->u.key) ); @@ -341,10 +341,10 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_mmap_component.super.base_version.mca_type_name, mca_sshmem_mmap_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); - munmap((void *)ds_buf->seg_base_addr, ds_buf->seg_size); + munmap((void *)ds_buf->super.va_base, ds_buf->seg_size); /* reset the contents of the map_segment_t associated with this * shared memory segment. @@ -366,7 +366,7 @@ segment_unlink(map_segment_t *ds_buf) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_mmap_component.super.base_version.mca_type_name, mca_sshmem_mmap_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); /* don't completely reset. in particular, only reset diff --git a/oshmem/mca/sshmem/sshmem_types.h b/oshmem/mca/sshmem/sshmem_types.h index a89f3396fe..3081892e5f 100644 --- a/oshmem/mca/sshmem/sshmem_types.h +++ b/oshmem/mca/sshmem/sshmem_types.h @@ -96,16 +96,25 @@ typedef struct sshmem_mkey { void *spml_context; /* spml module can attach internal structures here */ } sshmem_mkey_t; -typedef struct map_segment_t { - sshmem_mkey_t **mkeys_cache; /* includes remote segment bases in va_base */ - sshmem_mkey_t *mkeys; /* includes local segment bases in va_base */ - segment_flag_t flags; /* enable/disable flag */ - int seg_id; - void* seg_base_addr; /* base address of the segment */ - void* end; /* final address of the segment */ - char seg_name[OPAL_PATH_MAX]; - size_t seg_size; /* length of the segment */ - segment_type_t type; /* type of the segment */ +typedef struct map_base_segment { + void *va_base; /* base address of the segment */ + void *va_end; /* final address of the segment */ +} map_base_segment_t; + +typedef struct mkey_segment { + map_base_segment_t super; + void *rva_base; /* base va on remote pe */ +} mkey_segment_t; + +typedef struct map_segment { + map_base_segment_t super; + sshmem_mkey_t **mkeys_cache; /* includes remote segment bases in va_base */ + sshmem_mkey_t *mkeys; /* includes local segment bases in va_base */ + segment_flag_t flags; /* enable/disable flag */ + int seg_id; + char seg_name[OPAL_PATH_MAX]; + size_t seg_size; /* length of the segment */ + segment_type_t type; /* type of the segment */ } map_segment_t; END_C_DECLS diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c index 737051ea30..625ef44534 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c @@ -114,8 +114,8 @@ shmem_ds_reset(map_segment_t *ds_buf) MAP_SEGMENT_RESET_FLAGS(ds_buf); ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; - ds_buf->seg_base_addr = 0; - ds_buf->end = 0; + ds_buf->super.va_base = 0; + ds_buf->super.va_end = 0; ds_buf->seg_size = 0; ds_buf->type = MAP_SEGMENT_UNKNOWN; memset(ds_buf->seg_name, '\0', sizeof(ds_buf->seg_name)); @@ -225,9 +225,9 @@ segment_create(map_segment_t *ds_buf, ds_buf->type = MAP_SEGMENT_ALLOC_SHM; ds_buf->seg_id = shmid; - ds_buf->seg_base_addr = addr; + ds_buf->super.va_base = addr; ds_buf->seg_size = size; - ds_buf->end = (void*)((uintptr_t)ds_buf->seg_base_addr + ds_buf->seg_size); + ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); OPAL_OUTPUT_VERBOSE( (70, oshmem_sshmem_base_framework.framework_output, @@ -236,7 +236,7 @@ segment_create(map_segment_t *ds_buf, mca_sshmem_sysv_component.super.base_version.mca_type_name, mca_sshmem_sysv_component.super.base_version.mca_component_name, (rc ? "failure" : "successful"), - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); return rc; @@ -264,7 +264,7 @@ segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n", mca_sshmem_sysv_component.super.base_version.mca_type_name, mca_sshmem_sysv_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, mkey->va_base, mkey->len, (unsigned long long)mkey->u.key) ); @@ -286,7 +286,7 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_sysv_component.super.base_version.mca_type_name, mca_sshmem_sysv_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); if (ds_buf->seg_id != MAP_SEGMENT_SHM_INVALID) { diff --git a/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c b/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c index 8f9ed70fee..f182876413 100644 --- a/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c +++ b/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c @@ -110,8 +110,8 @@ shmem_ds_reset(map_segment_t *ds_buf) MAP_SEGMENT_RESET_FLAGS(ds_buf); ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; - ds_buf->seg_base_addr = 0; - ds_buf->end = 0; + ds_buf->super.va_base = 0; + ds_buf->super.va_end = 0; ds_buf->seg_size = 0; ds_buf->type = MAP_SEGMENT_UNKNOWN; memset(ds_buf->seg_name, '\0', sizeof(ds_buf->seg_name)); @@ -320,9 +320,9 @@ segment_create(map_segment_t *ds_buf, ds_buf->type = MAP_SEGMENT_ALLOC_IBV_NOSHMR; ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; } - ds_buf->seg_base_addr = ib_mr->addr; + ds_buf->super.va_base = ib_mr->addr; ds_buf->seg_size = size; - ds_buf->end = (void*)((uintptr_t)ds_buf->seg_base_addr + ds_buf->seg_size); + ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); } } @@ -333,7 +333,7 @@ segment_create(map_segment_t *ds_buf, mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, (rc ? "failure" : "successful"), - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); return rc; @@ -398,7 +398,7 @@ segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n", mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, mkey->va_base, mkey->len, (unsigned long long)mkey->u.key) ); @@ -422,7 +422,7 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); if (device) { @@ -501,7 +501,7 @@ segment_unlink(map_segment_t *ds_buf) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); /* don't completely reset. in particular, only reset diff --git a/oshmem/shmem/c/shmem_addr_accessible.c b/oshmem/shmem/c/shmem_addr_accessible.c index 7d1a2486f8..8d44ff4181 100644 --- a/oshmem/shmem/c/shmem_addr_accessible.c +++ b/oshmem/shmem/c/shmem_addr_accessible.c @@ -26,10 +26,16 @@ int shmem_addr_accessible(const void *addr, int pe) { void* rva; sshmem_mkey_t *mkey; + int i; RUNTIME_CHECK_INIT(); - mkey = mca_memheap_base_get_cached_mkey(pe, (void *)addr, oshmem_get_transport_id(pe), &rva); + for (i = 0; i < mca_memheap_base_num_transports(); i++) { + mkey = mca_memheap_base_get_cached_mkey(pe, (void *)addr, i, &rva); + if (mkey) { + return 1; + } + } - return mkey ? 1 : 0; + return 0; } diff --git a/oshmem/shmem/c/shmem_lock.c b/oshmem/shmem/c/shmem_lock.c index 3d167b61d5..e7b7d81548 100644 --- a/oshmem/shmem/c/shmem_lock.c +++ b/oshmem/shmem/c/shmem_lock.c @@ -270,7 +270,23 @@ static uint64_t shmem_lock_cswap(void *target, prev_value = prev_value_32; } + return prev_value; +} +/* function is used to busy wait for the value. + * Call opal_progress() so that ompi will no deadlock + * (for example may need to respond to rkey requests) + */ +static uint64_t shmem_lock_cswap_poll(void *target, + int target_size, + uint64_t cond, + uint64_t value, + int pe) +{ + uint64_t prev_value; + + prev_value = shmem_lock_cswap(target, target_size, cond, value, pe); + opal_progress(); return prev_value; } @@ -316,11 +332,11 @@ static int pack_first_word(void *lock, extract_second_word(&lock_value, lock_size, &two); pack_2_words(&new_long_value, lock_size, one, &two); while (lock_value - != (temp = shmem_lock_cswap(lock, - lock_size, - lock_value, - new_long_value, - my_pe))) { + != (temp = shmem_lock_cswap_poll(lock, + lock_size, + lock_value, + new_long_value, + my_pe))) { lock_value = temp; extract_second_word(&lock_value, lock_size, &two); pack_2_words(&new_long_value, lock_size, one, &two); @@ -367,11 +383,11 @@ static int pack_second_word(void *lock, extract_first_word(&lock_value, lock_size, &one); pack_2_words(&new_long_value, lock_size, &one, two); while (lock_value - != (temp = shmem_lock_cswap(lock, - lock_size, - lock_value, - new_long_value, - my_pe))) { + != (temp = shmem_lock_cswap_poll(lock, + lock_size, + lock_value, + new_long_value, + my_pe))) { lock_value = temp; extract_first_word(&lock_value, lock_size, &one); pack_2_words(&new_long_value, lock_size, &one, two); @@ -691,11 +707,11 @@ static int shmem_lock_wait_for_ticket(void *lock, new_server_lock = server_lock = temp; lock_pack_pe_last(&new_server_lock, lock_size, &my_pe, 0); } while (server_lock - != (temp = shmem_lock_cswap(lock, - lock_size, - server_lock, - new_server_lock, - server_pe))); + != (temp = shmem_lock_cswap_poll(lock, + lock_size, + server_lock, + new_server_lock, + server_pe))); lock_extract_pe_last(&server_lock, lock_size, pe_last); if (*pe_last == -1) { /* we are first in queue for the lock */ @@ -751,11 +767,11 @@ static int shmem_lock_subscribe_for_informing(void *lock, prev_remote_value += my_pe + 1; while (prev_remote_value - != (temp_value = shmem_lock_cswap(lock, - lock_size, - prev_remote_value, - new_remote_value, - pe_last))) { + != (temp_value = shmem_lock_cswap_poll(lock, + lock_size, + prev_remote_value, + new_remote_value, + pe_last))) { prev_remote_value = temp_value; lock_extract_counter(&prev_remote_value, lock_size, @@ -849,11 +865,11 @@ static int shmem_lock_inform_next(void *lock, int lock_size, int pe_next) | (((uint64_t) 1) << (lock_bitwise_size - 1)); while (remote_value - != (temp_value = shmem_lock_cswap(lock, - lock_size, - remote_value, - new_remote_value, - pe_next))) { + != (temp_value = shmem_lock_cswap_poll(lock, + lock_size, + remote_value, + new_remote_value, + pe_next))) { remote_value = temp_value; new_remote_value = remote_value | (((uint64_t) 1) << (lock_bitwise_size - 1)); @@ -938,7 +954,7 @@ static int shmem_lock_try_inform_server(void *lock, int lock_size) &incorrect_pe, &my_pe); return !(remote_value - == shmem_lock_cswap(lock, lock_size, remote_value, zero, server_pe)); + == shmem_lock_cswap_poll(lock, lock_size, remote_value, zero, server_pe)); } /***************************************************************************/ diff --git a/oshmem/util/oshmem_util.c b/oshmem/util/oshmem_util.c index f97309906c..ac64d42a0a 100644 --- a/oshmem/util/oshmem_util.c +++ b/oshmem/util/oshmem_util.c @@ -23,7 +23,7 @@ void oshmem_output_verbose(int level, int output_id, const char* prefix, char *buff, *str; int ret = 0; - if (level < opal_output_get_verbosity(output_id)) { + if (level <= opal_output_get_verbosity(output_id)) { UNREFERENCED_PARAMETER(ret); va_start(args, format);