1
1

Merge pull request #828 from hjelmn/openib_thread_fix

openib thread fixes
Этот коммит содержится в:
Nathan Hjelm 2015-09-01 09:12:50 -06:00
родитель d8cb3fe705 64e4419d76
Коммит f926796e57
3 изменённых файлов: 35 добавлений и 21 удалений

Просмотреть файл

@ -2534,12 +2534,6 @@ btl_openib_component_init(int *num_btl_modules,
malloc_hook_set = true; malloc_hook_set = true;
} }
#endif #endif
/* Currently refuse to run if MPI_THREAD_MULTIPLE is enabled */
if (enable_mpi_threads && !mca_btl_base_thread_multiple_override) {
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl:openib: MPI_THREAD_MULTIPLE not suppported; skipping this component");
goto no_btls;
}
/* Per https://svn.open-mpi.org/trac/ompi/ticket/1305, check to /* Per https://svn.open-mpi.org/trac/ompi/ticket/1305, check to
see if $sysfsdir/class/infiniband exists. If it does not, see if $sysfsdir/class/infiniband exists. If it does not,

Просмотреть файл

@ -1,5 +1,8 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -81,17 +84,33 @@ typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remo
mca_btl_openib_component.eager_rdma_num) \ mca_btl_openib_component.eager_rdma_num) \
(I) = 0; \ (I) = 0; \
} while (0) } while (0)
#define MCA_BTL_OPENIB_RDMA_MOVE_INDEX(HEAD, OLD_HEAD) \
do { \
int32_t new_head; \ #if OPAL_ENABLE_DEBUG
do { \
OLD_HEAD = HEAD; \ /**
new_head = OLD_HEAD + 1; \ * @brief read and increment the remote head index and generate a sequence
if(new_head == mca_btl_openib_component.eager_rdma_num) \ * number
new_head = 0; \ */
} while(!OPAL_ATOMIC_CMPSET_32(&HEAD, OLD_HEAD, new_head)); \
#define MCA_BTL_OPENIB_RDMA_MOVE_INDEX(HEAD, OLD_HEAD, SEQ) \
do { \
(SEQ) = OPAL_THREAD_ADD32(&(HEAD), 1) - 1; \
(OLD_HEAD) = (SEQ) % mca_btl_openib_component.eager_rdma_num; \
} while(0) } while(0)
#else
/**
 * @brief read and increment the remote head index
 *
 * Non-debug variant of the macro (the #else branch of OPAL_ENABLE_DEBUG):
 * unlike the debug variant it takes no SEQ argument and produces no sequence
 * number.
 *
 * @param HEAD      lvalue holding the running head counter; its address is
 *                  passed to OPAL_THREAD_ADD32 with an increment of 1. The
 *                  macro never resets or wraps HEAD itself.
 * @param OLD_HEAD  lvalue that receives (result of the add - 1) reduced
 *                  modulo mca_btl_openib_component.eager_rdma_num.
 *
 * The "- 1" is meant to recover the value HEAD had before the increment --
 * this assumes OPAL_THREAD_ADD32 returns the post-increment value; TODO
 * confirm against its definition.
 *
 * NOTE(review): HEAD only ever grows here. If it is a signed 32-bit counter,
 * the result of the modulo once the counter passes INT32_MAX should be
 * confirmed (a negative dividend yields a negative remainder in C).
 */
#define MCA_BTL_OPENIB_RDMA_MOVE_INDEX(HEAD, OLD_HEAD) \
do { \
(OLD_HEAD) = (OPAL_THREAD_ADD32(&(HEAD), 1) - 1) % mca_btl_openib_component.eager_rdma_num; \
} while(0)
#endif
END_C_DECLS END_C_DECLS
#endif #endif

Просмотреть файл

@ -569,17 +569,18 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length); MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length);
MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr); MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
#if OPAL_ENABLE_DEBUG #if OPAL_ENABLE_DEBUG
do { /* NTH: generate the sequence from the remote head index to ensure that the
ftr->seq = ep->eager_rdma_remote.seq; * wrong sequence isn't set. The way this code used to look the sequence number
} while (!OPAL_ATOMIC_CMPSET_32((int32_t*) &ep->eager_rdma_remote.seq, * and head were updated independently and it led to false positives for incorrect
(int32_t) ftr->seq, * sequence numbers. */
(int32_t) (ftr->seq+1))); MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head, ftr->seq);
#else
MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head);
#endif #endif
if(ep->nbo) if(ep->nbo)
BTL_OPENIB_FOOTER_HTON(*ftr); BTL_OPENIB_FOOTER_HTON(*ftr);
sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey; sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey;
MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head);
#if BTL_OPENIB_FAILOVER_ENABLED #if BTL_OPENIB_FAILOVER_ENABLED
/* frag->ftr is unused on the sending fragment, so use it /* frag->ftr is unused on the sending fragment, so use it
* to indicate it is an eager fragment. A non-zero value * to indicate it is an eager fragment. A non-zero value