1
1

Fix a bug in the uGNI btl that could cause certain descriptor callbacks to be called twice.

There was a race condition in the eager get protocol where the RDMA complete message could be received before the local completion of the SMSG message that started the eager get protocol.

cmr:v1.7

This commit was SVN r27740.
Этот коммит содержится в:
Nathan Hjelm 2013-01-03 23:11:13 +00:00
родитель a159bfaf25
Коммит 84e34ee0d7
3 изменённых файлов: 21 добавлений и 12 удалений

Просмотреть файл

@@ -1,6 +1,6 @@
  /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
  /*
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
  * $COPYRIGHT$
@@ -56,10 +56,11 @@ typedef union mca_btl_ugni_frag_hdr_t {
  } mca_btl_ugni_frag_hdr_t;
  enum {
- MCA_BTL_UGNI_FRAG_BUFFERED = 1, /* frag data is buffered */
- MCA_BTL_UGNI_FRAG_COMPLETE = 2, /* smsg complete for frag */
- MCA_BTL_UGNI_FRAG_EAGER = 4, /* eager get frag */
- MCA_BTL_UGNI_FRAG_IGNORE = 8 /* ignore local smsg completion */
+ MCA_BTL_UGNI_FRAG_BUFFERED = 1, /* frag data is buffered */
+ MCA_BTL_UGNI_FRAG_COMPLETE = 2, /* smsg complete for frag */
+ MCA_BTL_UGNI_FRAG_EAGER = 4, /* eager get frag */
+ MCA_BTL_UGNI_FRAG_IGNORE = 8, /* ignore local smsg completion */
+ MCA_BTL_UGNI_FRAG_SMSG_COMPLETE = 16 /* SMSG has completed for this message */
  };
  struct mca_btl_ugni_base_frag_t;
@@ -68,15 +69,15 @@ typedef void (*frag_cb_t) (struct mca_btl_ugni_base_frag_t *, int);
  typedef struct mca_btl_ugni_base_frag_t {
  mca_btl_base_descriptor_t base;
- size_t hdr_size;
+ uint32_t msg_id;
+ uint16_t hdr_size;
+ uint16_t flags;
  mca_btl_ugni_frag_hdr_t hdr;
  mca_btl_ugni_segment_t segments[2];
  ompi_common_ugni_post_desc_t post_desc;
  mca_btl_base_endpoint_t *endpoint;
  mca_btl_ugni_reg_t *registration;
  ompi_free_list_t *my_list;
- uint32_t msg_id;
- uint32_t flags;
  frag_cb_t cbfunc;
  } mca_btl_ugni_base_frag_t;
@@ -105,7 +106,6 @@ static inline int mca_btl_ugni_frag_alloc (mca_btl_base_endpoint_t *ep,
  if (OPAL_LIKELY(NULL != item)) {
  (*frag)->my_list = list;
  (*frag)->endpoint = ep;
- (*frag)->flags = 0;
  }
  return rc;
@@ -119,6 +119,8 @@ static inline int mca_btl_ugni_frag_return (mca_btl_ugni_base_frag_t *frag)
  frag->registration = NULL;
  }
+ frag->flags = 0;
  OMPI_FREE_LIST_RETURN(frag->my_list, (ompi_free_list_item_t *) frag);
  return OMPI_SUCCESS;

Просмотреть файл

@@ -1,6 +1,6 @@
  /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
  /*
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
  * $COPYRIGHT$
@@ -120,7 +120,12 @@ int mca_btl_ugni_smsg_process (mca_btl_base_endpoint_t *ep)
  case MCA_BTL_UGNI_TAG_RDMA_COMPLETE:
  frag.hdr.rdma = ((mca_btl_ugni_rdma_frag_hdr_t *) data_ptr)[0];
- mca_btl_ugni_frag_complete (frag.hdr.rdma.ctx, OMPI_SUCCESS);
+ if (((mca_btl_ugni_base_frag_t *)frag.hdr.rdma.ctx)->flags & MCA_BTL_UGNI_FRAG_SMSG_COMPLETE) {
+ mca_btl_ugni_frag_complete (frag.hdr.rdma.ctx, OMPI_SUCCESS);
+ } else {
+ /* let the local smsg completion finish this frag */
+ ((mca_btl_ugni_base_frag_t *)frag.hdr.rdma.ctx)->flags &= ~MCA_BTL_UGNI_FRAG_IGNORE;
+ }
  break;
  case MCA_BTL_UGNI_TAG_DISCONNECT:
  /* remote endpoint has disconnected */

Просмотреть файл

@@ -1,6 +1,6 @@
  /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
  /*
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
  * $COPYRIGHT$
@@ -64,6 +64,8 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_
  return OMPI_ERROR;
  }
+ frag->flags |= MCA_BTL_UGNI_FRAG_SMSG_COMPLETE;
  if (!(frag->flags & MCA_BTL_UGNI_FRAG_IGNORE)) {
  mca_btl_ugni_frag_complete (frag, OMPI_SUCCESS);
  }