Wire up all the RDMA-capable BTLs. Still no RDMA communication, but the
datastructures are finally all there This commit was SVN r15271.
Этот коммит содержится в:
родитель
1701013dd0
Коммит
f5c721d11c
@ -34,7 +34,7 @@ int
|
||||
ompi_osc_rdma_module_free(ompi_win_t *win)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
int tmp;
|
||||
int tmp, i;
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
|
||||
opal_output_verbose(1, ompi_osc_base_output,
|
||||
@ -92,9 +92,57 @@ ompi_osc_rdma_module_free(ompi_win_t *win)
|
||||
if (NULL != module->m_num_pending_sendreqs) {
|
||||
free(module->m_num_pending_sendreqs);
|
||||
}
|
||||
if (NULL != module->m_peer_info) {
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
ompi_osc_rdma_peer_info_free(&module->m_peer_info[i]);
|
||||
}
|
||||
free(module->m_peer_info);
|
||||
}
|
||||
if (NULL != module->m_comm) ompi_comm_free(&module->m_comm);
|
||||
|
||||
if (NULL != module) free(module);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (NULL != peer_info->peer_btls) {
|
||||
free(peer_info->peer_btls);
|
||||
}
|
||||
|
||||
if (NULL != peer_info->local_descriptors) {
|
||||
for (i = 0 ; i < peer_info->local_num_btls ; ++i) {
|
||||
if (NULL != peer_info->local_descriptors[i]) {
|
||||
mca_bml_base_btl_t *bml_btl =
|
||||
peer_info->local_btls[i];
|
||||
bml_btl->btl_free(bml_btl->btl,
|
||||
peer_info->local_descriptors[i]);
|
||||
}
|
||||
}
|
||||
free(peer_info->local_descriptors);
|
||||
}
|
||||
|
||||
if (NULL != peer_info->local_registrations) {
|
||||
for (i = 0 ; i < peer_info->local_num_btls ; ++i) {
|
||||
if (NULL != peer_info->local_registrations[i]) {
|
||||
mca_mpool_base_module_t *module =
|
||||
peer_info->local_registrations[i]->mpool;
|
||||
module->mpool_deregister(module,
|
||||
peer_info->local_registrations[i]);
|
||||
}
|
||||
}
|
||||
free(peer_info->local_registrations);
|
||||
}
|
||||
|
||||
if (NULL != peer_info->local_btls) {
|
||||
free(peer_info->local_btls);
|
||||
}
|
||||
|
||||
memset(peer_info, 0, sizeof(ompi_osc_rdma_peer_info_t));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/osc/osc.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/bml/bml.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -67,14 +68,51 @@ struct ompi_osc_rdma_component_t {
|
||||
#endif
|
||||
|
||||
bool c_btl_registered;
|
||||
|
||||
uint32_t c_sequence_number;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
|
||||
|
||||
|
||||
struct ompi_osc_rdma_btl_t {
|
||||
uint64_t peer_seg_key;
|
||||
mca_bml_base_btl_t *bml_btl;
|
||||
int rdma_order;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_btl_t ompi_osc_rdma_btl_t;
|
||||
|
||||
|
||||
struct ompi_osc_rdma_peer_info_t {
|
||||
uint64_t peer_base;
|
||||
uint64_t peer_len;
|
||||
|
||||
int peer_num_btls;
|
||||
volatile int peer_index_btls;
|
||||
ompi_osc_rdma_btl_t *peer_btls;
|
||||
|
||||
int local_num_btls;
|
||||
mca_bml_base_btl_t **local_btls;
|
||||
mca_mpool_base_registration_t **local_registrations;
|
||||
mca_btl_base_descriptor_t **local_descriptors;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_peer_info_t ompi_osc_rdma_peer_info_t;
|
||||
|
||||
|
||||
struct ompi_osc_rdma_setup_info_t {
|
||||
volatile int32_t num_btls_callin;
|
||||
int32_t num_btls_expected;
|
||||
volatile int32_t num_btls_outgoing;
|
||||
opal_list_t *outstanding_btl_requests;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_setup_info_t ompi_osc_rdma_setup_info_t;
|
||||
|
||||
|
||||
struct ompi_osc_rdma_module_t {
|
||||
/** Extend the basic osc module interface */
|
||||
ompi_osc_base_module_t super;
|
||||
|
||||
uint32_t m_sequence_number;
|
||||
|
||||
/** lock access to data structures in the current module */
|
||||
opal_mutex_t m_lock;
|
||||
|
||||
@ -136,6 +174,15 @@ struct ompi_osc_rdma_module_t {
|
||||
bool m_eager_send_active;
|
||||
bool m_eager_send_ok;
|
||||
|
||||
/* RDMA data */
|
||||
bool m_use_rdma;
|
||||
ompi_osc_rdma_setup_info_t *m_setup_info;
|
||||
ompi_osc_rdma_peer_info_t *m_peer_info;
|
||||
|
||||
int32_t m_num_pending_rdma;
|
||||
|
||||
volatile int32_t m_num_complete_rdma;
|
||||
|
||||
/* ********************* FENCE data ************************ */
|
||||
/* an array of <sizeof(m_comm)> ints, each containing the value
|
||||
1. */
|
||||
@ -157,6 +204,7 @@ struct ompi_osc_rdma_module_t {
|
||||
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
|
||||
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
|
||||
|
||||
|
||||
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module)
|
||||
|
||||
/*
|
||||
@ -178,6 +226,8 @@ int ompi_osc_rdma_component_select(struct ompi_win_t *win,
|
||||
|
||||
int ompi_osc_rdma_component_progress(void);
|
||||
|
||||
int ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info);
|
||||
|
||||
/*
|
||||
* Module interface function types
|
||||
*/
|
||||
@ -248,6 +298,7 @@ int ompi_osc_rdma_passive_unlock(ompi_osc_rdma_module_t *module,
|
||||
|
||||
int ompi_osc_rdma_passive_unlock_complete(ompi_osc_rdma_module_t *module);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OMPI_OSC_RDMA_H */
|
||||
|
@ -47,6 +47,7 @@ static void component_fragment_cb(struct mca_btl_base_module_t *btl,
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS
|
||||
static void* component_thread_fn(opal_object_t *obj);
|
||||
#endif
|
||||
static int setup_rdma(ompi_osc_rdma_module_t *module);
|
||||
|
||||
ompi_osc_rdma_component_t mca_osc_rdma_component = {
|
||||
{ /* ompi_osc_base_component_t */
|
||||
@ -143,6 +144,12 @@ component_open(void)
|
||||
"Info key of same name overrides this value.",
|
||||
false, false, 1, NULL);
|
||||
|
||||
mca_base_param_reg_int(&mca_osc_rdma_component.super.osc_version,
|
||||
"use_rdma",
|
||||
"Use real RDMA operations to transfer data. "
|
||||
"Info key of same name overrides this value.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_base_param_reg_int(&mca_osc_rdma_component.super.osc_version,
|
||||
"no_locks",
|
||||
"Enable optimizations available only if MPI_LOCK is "
|
||||
@ -204,6 +211,8 @@ ompi_osc_rdma_component_init(bool enable_progress_threads,
|
||||
|
||||
mca_osc_rdma_component.c_btl_registered = false;
|
||||
|
||||
mca_osc_rdma_component.c_sequence_number = 0;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -292,6 +301,10 @@ ompi_osc_rdma_component_select(ompi_win_t *win,
|
||||
|
||||
module->m_win = win;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock);
|
||||
module->m_sequence_number = (mca_osc_rdma_component.c_sequence_number++);
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock);
|
||||
|
||||
ret = ompi_comm_dup(comm, &module->m_comm, 0);
|
||||
if (ret != OMPI_SUCCESS) goto cleanup;
|
||||
|
||||
@ -329,6 +342,11 @@ ompi_osc_rdma_component_select(ompi_win_t *win,
|
||||
they start their epochs, so this isn't a problem. */
|
||||
module->m_eager_send_active = module->m_eager_send_ok;
|
||||
|
||||
/* allocate space for rdma information */
|
||||
module->m_use_rdma = check_config_value_bool("use_rdma", info);
|
||||
module->m_setup_info = NULL;
|
||||
module->m_peer_info = NULL;
|
||||
|
||||
/* fence data */
|
||||
module->m_fence_coll_counts = (int*)
|
||||
malloc(sizeof(int) * ompi_comm_size(module->m_comm));
|
||||
@ -388,9 +406,6 @@ ompi_osc_rdma_component_select(ompi_win_t *win,
|
||||
win->w_flags |= OMPI_WIN_NO_LOCKS;
|
||||
}
|
||||
|
||||
/* sync memory - make sure all initialization completed */
|
||||
opal_atomic_mb();
|
||||
|
||||
/* register to receive fragment callbacks, if not already done */
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock);
|
||||
if (!mca_osc_rdma_component.c_btl_registered) {
|
||||
@ -402,9 +417,18 @@ ompi_osc_rdma_component_select(ompi_win_t *win,
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
/* need to make create a collective, or lock requests can come in
|
||||
before the window is fully created... */
|
||||
module->m_comm->c_coll.coll_barrier(module->m_comm);
|
||||
/* sync memory - make sure all initialization completed */
|
||||
opal_atomic_mb();
|
||||
|
||||
if (module->m_use_rdma) {
|
||||
/* fill in rdma information - involves barrier semantics */
|
||||
ret = setup_rdma(module);
|
||||
} else {
|
||||
/* barrier to prevent arrival of lock requests before we're
|
||||
fully created */
|
||||
ret = module->m_comm->c_coll.coll_barrier(module->m_comm);
|
||||
}
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output,
|
||||
"done creating window %d", module->m_comm->c_contextid));
|
||||
@ -436,8 +460,13 @@ ompi_osc_rdma_component_select(ompi_win_t *win,
|
||||
if (NULL != module->m_num_pending_sendreqs) {
|
||||
free(module->m_num_pending_sendreqs);
|
||||
}
|
||||
if (NULL != module->m_peer_info) {
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
ompi_osc_rdma_peer_info_free(&module->m_peer_info[i]);
|
||||
}
|
||||
free(module->m_peer_info);
|
||||
}
|
||||
if (NULL != module->m_comm) ompi_comm_free(&module->m_comm);
|
||||
|
||||
if (NULL != module) free(module);
|
||||
|
||||
return ret;
|
||||
@ -766,6 +795,55 @@ component_fragment_cb(struct mca_btl_base_module_t *btl,
|
||||
}
|
||||
break;
|
||||
|
||||
case OMPI_OSC_RDMA_HDR_RDMA_INFO:
|
||||
{
|
||||
ompi_osc_rdma_rdma_info_header_t *header =
|
||||
(ompi_osc_rdma_rdma_info_header_t*)
|
||||
descriptor->des_dst[0].seg_addr.pval;
|
||||
ompi_proc_t *proc = NULL;
|
||||
mca_bml_base_endpoint_t *endpoint = NULL;
|
||||
mca_bml_base_btl_t *bml_btl;
|
||||
ompi_osc_rdma_btl_t *rdma_btl;
|
||||
int origin, index;
|
||||
|
||||
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) {
|
||||
OMPI_OSC_RDMA_RDMA_INFO_HDR_NTOH(*header);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* get our module pointer */
|
||||
module = ompi_osc_rdma_windx_to_module(header->hdr_windx);
|
||||
if (NULL == module) return;
|
||||
|
||||
origin = header->hdr_origin;
|
||||
|
||||
/* find the bml_btl */
|
||||
proc = ompi_comm_peer_lookup(module->m_comm, origin);
|
||||
endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
|
||||
bml_btl = mca_bml_base_btl_array_find(&endpoint->btl_rdma, btl);
|
||||
if (NULL == bml_btl) {
|
||||
opal_output(ompi_osc_base_output,
|
||||
"received rdma info for unknown btl from rank %d",
|
||||
origin);
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->m_lock);
|
||||
index = module->m_peer_info[origin].peer_num_btls++;
|
||||
rdma_btl = &(module->m_peer_info[origin].peer_btls[index]);
|
||||
|
||||
rdma_btl->peer_seg_key = header->hdr_segkey;
|
||||
rdma_btl->bml_btl = bml_btl;
|
||||
rdma_btl->rdma_order = MCA_BTL_NO_ORDER;
|
||||
|
||||
module->m_setup_info->num_btls_callin++;
|
||||
OPAL_THREAD_UNLOCK(&module->m_lock);
|
||||
|
||||
opal_condition_broadcast(&module->m_cond);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
/* BWB - FIX ME - this sucks */
|
||||
opal_output(ompi_osc_base_output,
|
||||
@ -844,3 +922,374 @@ component_thread_fn(opal_object_t *obj)
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*********** RDMA setup stuff ***********/
|
||||
|
||||
|
||||
struct peer_rdma_send_info_t{
|
||||
opal_list_item_t super;
|
||||
ompi_osc_rdma_module_t *module;
|
||||
ompi_proc_t *proc;
|
||||
mca_bml_base_btl_t *bml_btl;
|
||||
uint64_t seg_key;
|
||||
};
|
||||
typedef struct peer_rdma_send_info_t peer_rdma_send_info_t;
|
||||
OBJ_CLASS_INSTANCE(peer_rdma_send_info_t, opal_list_item_t, NULL, NULL);
|
||||
|
||||
|
||||
static void
|
||||
rdma_send_info_send_complete(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t* descriptor,
|
||||
int status)
|
||||
{
|
||||
peer_rdma_send_info_t *peer_send_info =
|
||||
(peer_rdma_send_info_t*) descriptor->des_cbdata;
|
||||
|
||||
if (OMPI_SUCCESS == status) {
|
||||
btl->btl_free(btl, descriptor);
|
||||
|
||||
OPAL_THREAD_LOCK(&peer_send_info->module->m_lock);
|
||||
peer_send_info->module->m_setup_info->num_btls_outgoing--;
|
||||
OPAL_THREAD_UNLOCK(&peer_send_info->module->m_lock);
|
||||
|
||||
opal_condition_broadcast(&(peer_send_info->module->m_cond));
|
||||
|
||||
OBJ_RELEASE(peer_send_info);
|
||||
} else {
|
||||
/* BWB - fix me */
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
rdma_send_info_send(ompi_osc_rdma_module_t *module,
|
||||
peer_rdma_send_info_t *peer_send_info)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
mca_bml_base_btl_t *bml_btl = NULL;
|
||||
mca_btl_base_descriptor_t *descriptor = NULL;
|
||||
ompi_osc_rdma_rdma_info_header_t *header = NULL;
|
||||
|
||||
bml_btl = peer_send_info->bml_btl;
|
||||
descriptor = bml_btl->btl_alloc(bml_btl->btl,
|
||||
MCA_BTL_NO_ORDER,
|
||||
sizeof(ompi_osc_rdma_rdma_info_header_t));
|
||||
if (NULL == descriptor) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* verify at least enough space for header */
|
||||
if (descriptor->des_src[0].seg_len < sizeof(ompi_osc_rdma_rdma_info_header_t)) {
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* setup descriptor */
|
||||
descriptor->des_cbfunc = rdma_send_info_send_complete;
|
||||
descriptor->des_cbdata = peer_send_info;
|
||||
descriptor->des_flags = MCA_BTL_DES_FLAGS_PRIORITY;
|
||||
descriptor->des_src[0].seg_len = sizeof(ompi_osc_rdma_rdma_info_header_t);
|
||||
|
||||
/* pack header */
|
||||
header = (ompi_osc_rdma_rdma_info_header_t*) descriptor->des_src[0].seg_addr.pval;
|
||||
header->hdr_base.hdr_type = OMPI_OSC_RDMA_HDR_RDMA_INFO;
|
||||
header->hdr_segkey = peer_send_info->seg_key;
|
||||
header->hdr_origin = ompi_comm_rank(module->m_comm);
|
||||
header->hdr_windx = module->m_comm->c_contextid;
|
||||
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO;
|
||||
#elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if (peer_send_info->proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
|
||||
header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO;
|
||||
OMPI_OSC_RDMA_RDMA_INFO_HDR_HTON(*header);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* send fragment */
|
||||
ret = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_OSC_RDMA);
|
||||
goto done;
|
||||
|
||||
cleanup:
|
||||
if (descriptor != NULL) {
|
||||
mca_bml_base_free(bml_btl, descriptor);
|
||||
}
|
||||
|
||||
done:
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
is_valid_rdma(mca_bml_base_btl_t *bml_btl)
|
||||
{
|
||||
if (((bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) != 0) &&
|
||||
((bml_btl->btl_flags & MCA_BTL_FLAGS_GET) != 0)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
setup_rdma(ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
|
||||
uint64_t local;
|
||||
uint64_t *remote = NULL;
|
||||
MPI_Datatype ui64_type;
|
||||
int ret = OMPI_SUCCESS;
|
||||
int i;
|
||||
|
||||
#if SIZEOF_LONG == 8
|
||||
ui64_type = MPI_LONG;
|
||||
#else
|
||||
ui64_type = MPI_LONG_LONG;
|
||||
#endif
|
||||
|
||||
/* create a setup info structure */
|
||||
module->m_setup_info = malloc(sizeof(ompi_osc_rdma_setup_info_t));
|
||||
if (NULL == module->m_setup_info) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
module->m_setup_info->num_btls_callin = 0;
|
||||
module->m_setup_info->num_btls_expected = -1;
|
||||
module->m_setup_info->num_btls_outgoing = 0;
|
||||
module->m_setup_info->outstanding_btl_requests =
|
||||
malloc(sizeof(opal_list_t) * ompi_comm_size(module->m_comm));
|
||||
if (NULL == module->m_setup_info->outstanding_btl_requests) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
OBJ_CONSTRUCT(&(module->m_setup_info->outstanding_btl_requests[i]),
|
||||
opal_list_t);
|
||||
}
|
||||
|
||||
/* create peer info array */
|
||||
module->m_peer_info = (ompi_osc_rdma_peer_info_t*)
|
||||
malloc(sizeof(ompi_osc_rdma_peer_info_t) *
|
||||
ompi_comm_size(module->m_comm));
|
||||
if (NULL == module->m_peer_info) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
memset(module->m_peer_info, 0,
|
||||
sizeof(ompi_osc_rdma_peer_info_t) * ompi_comm_size(module->m_comm));
|
||||
|
||||
/* get number of btls to each peer, descriptors for the window for
|
||||
each peer */
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
ompi_proc_t *proc = ompi_comm_peer_lookup(module->m_comm, i);
|
||||
ompi_osc_rdma_peer_info_t *peer_info = &module->m_peer_info[i];
|
||||
mca_bml_base_endpoint_t *endpoint =
|
||||
(mca_bml_base_endpoint_t*) proc->proc_bml;
|
||||
int num_avail =
|
||||
mca_bml_base_btl_array_get_size(&endpoint->btl_rdma);
|
||||
size_t j, size;
|
||||
ompi_convertor_t convertor;
|
||||
|
||||
/* skip peer if heterogeneous */
|
||||
if (ompi_proc_local()->proc_arch != proc->proc_arch) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* get a rough estimation of how many BTLs we'll be able to
|
||||
use, and exit if the answer is none */
|
||||
for (j = 0 ;
|
||||
j < mca_bml_base_btl_array_get_size(&endpoint->btl_rdma) ;
|
||||
++j) {
|
||||
mca_bml_base_btl_t *bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&endpoint->btl_rdma, j);
|
||||
if (!is_valid_rdma(bml_btl)) num_avail--;
|
||||
}
|
||||
if (0 == num_avail) continue;
|
||||
|
||||
/* Allocate space for all the useable BTLs. They might not
|
||||
all end up useable, if we can't pin memory for the btl or
|
||||
the like. But the number of elements to start with should
|
||||
be small and the number that fail the pin test should be
|
||||
approximately 0, so this isn't too big of a waste */
|
||||
peer_info->peer_btls = (ompi_osc_rdma_btl_t*)
|
||||
malloc(sizeof(ompi_osc_rdma_btl_t) * num_avail);
|
||||
peer_info->local_btls = (mca_bml_base_btl_t**)
|
||||
malloc(sizeof(mca_bml_base_btl_t*) * num_avail);
|
||||
peer_info->local_registrations = (mca_mpool_base_registration_t**)
|
||||
malloc(sizeof(mca_mpool_base_registration_t*) * num_avail);
|
||||
peer_info->local_descriptors = (mca_btl_base_descriptor_t**)
|
||||
malloc(sizeof(mca_btl_base_descriptor_t*) * num_avail);
|
||||
if (NULL == peer_info->peer_btls ||
|
||||
NULL == peer_info->local_btls ||
|
||||
NULL == peer_info->local_registrations ||
|
||||
NULL == peer_info->local_descriptors) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
memset(peer_info->peer_btls, 0,
|
||||
sizeof(ompi_osc_rdma_btl_t) * num_avail);
|
||||
memset(peer_info->local_registrations, 0,
|
||||
sizeof(mca_mpool_base_registration_t*) * num_avail);
|
||||
memset(peer_info->local_descriptors, 0,
|
||||
sizeof(mca_btl_base_descriptor_t*) * num_avail);
|
||||
|
||||
OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
|
||||
|
||||
/* Find all useable btls, try to do the descriptor thing for
|
||||
them, and store all that information */
|
||||
for (j = 0 ;
|
||||
j < mca_bml_base_btl_array_get_size(&endpoint->btl_rdma) ;
|
||||
++j) {
|
||||
mca_bml_base_btl_t *bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&endpoint->btl_rdma, j);
|
||||
mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool;
|
||||
int index = peer_info->local_num_btls;
|
||||
|
||||
if (!is_valid_rdma(bml_btl)) continue;
|
||||
|
||||
if (NULL != btl_mpool) {
|
||||
ret = btl_mpool->mpool_register(btl_mpool, module->m_win->w_baseptr,
|
||||
module->m_win->w_size, 0,
|
||||
&(peer_info->local_registrations[index]));
|
||||
if (OMPI_SUCCESS != ret) continue;
|
||||
} else {
|
||||
peer_info->local_registrations[index] = NULL;
|
||||
}
|
||||
|
||||
size = module->m_win->w_size;
|
||||
|
||||
ompi_convertor_copy_and_prepare_for_send(proc->proc_convertor,
|
||||
MPI_BYTE,
|
||||
module->m_win->w_size,
|
||||
module->m_win->w_baseptr,
|
||||
0,
|
||||
&convertor);
|
||||
|
||||
peer_info->local_descriptors[index] =
|
||||
bml_btl->btl_prepare_dst(bml_btl->btl,
|
||||
bml_btl->btl_endpoint,
|
||||
peer_info->local_registrations[index],
|
||||
&convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
0,
|
||||
&size);
|
||||
if (NULL == peer_info->local_descriptors[index]) {
|
||||
if (NULL != peer_info->local_registrations[index]) {
|
||||
btl_mpool->mpool_deregister(btl_mpool,
|
||||
peer_info->local_registrations[index]);
|
||||
}
|
||||
ompi_convertor_cleanup(&convertor);
|
||||
continue;
|
||||
}
|
||||
|
||||
peer_info->local_btls[index] = bml_btl;
|
||||
|
||||
ompi_convertor_cleanup(&convertor);
|
||||
|
||||
peer_info->local_num_btls++;
|
||||
module->m_setup_info->num_btls_outgoing++;
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&convertor);
|
||||
}
|
||||
|
||||
/* fill in information about remote peers */
|
||||
remote = malloc(sizeof(uint64_t) * ompi_comm_size(module->m_comm));
|
||||
if (NULL == remote) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
local = ompi_ptr_ptol(module->m_win->w_baseptr);
|
||||
ret = module->m_comm->c_coll.coll_allgather(&local, 1, ui64_type,
|
||||
remote, 1, ui64_type,
|
||||
module->m_comm);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
module->m_peer_info[i].peer_base = remote[i];
|
||||
}
|
||||
|
||||
local = module->m_win->w_size;
|
||||
ret = module->m_comm->c_coll.coll_allgather(&local, 1, ui64_type,
|
||||
remote, 1, ui64_type,
|
||||
module->m_comm);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
module->m_peer_info[i].peer_base = remote[i];
|
||||
}
|
||||
|
||||
/* get number of btls we're expecting from everyone */
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
remote[i] = module->m_peer_info[i].local_num_btls;
|
||||
}
|
||||
ret = module->m_comm->c_coll.coll_reduce_scatter(remote,
|
||||
&local,
|
||||
module->m_fence_coll_counts,
|
||||
ui64_type,
|
||||
MPI_SUM,
|
||||
module->m_comm);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
module->m_setup_info->num_btls_expected = local;
|
||||
/* end fill in information about remote peers */
|
||||
|
||||
/* send our contact info to everyone... */
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
ompi_osc_rdma_peer_info_t *peer_info = &module->m_peer_info[i];
|
||||
int j;
|
||||
|
||||
for (j = 0 ; j < peer_info->local_num_btls ; ++j) {
|
||||
peer_rdma_send_info_t *peer_send_info =
|
||||
OBJ_NEW(peer_rdma_send_info_t);
|
||||
peer_send_info->module = module;
|
||||
peer_send_info->proc = ompi_comm_peer_lookup(module->m_comm, i);
|
||||
peer_send_info->bml_btl = peer_info->local_btls[j];
|
||||
peer_send_info->seg_key =
|
||||
peer_info->local_descriptors[j]->des_dst[0].seg_key.key64;
|
||||
|
||||
ret = rdma_send_info_send(module, peer_send_info);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_list_append(&(module->m_setup_info->outstanding_btl_requests[i]),
|
||||
&peer_send_info->super);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->m_lock);
|
||||
while ((module->m_setup_info->num_btls_outgoing != 0) ||
|
||||
(module->m_setup_info->num_btls_expected !=
|
||||
module->m_setup_info->num_btls_callin)) {
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
peer_rdma_send_info_t *peer_send_info =
|
||||
(peer_rdma_send_info_t*) opal_list_remove_first(&module->m_setup_info->outstanding_btl_requests[i]);
|
||||
if (NULL != peer_send_info) {
|
||||
ret = rdma_send_info_send(module, peer_send_info);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_list_append(&(module->m_setup_info->outstanding_btl_requests[i]),
|
||||
&peer_send_info->super);
|
||||
}
|
||||
}
|
||||
}
|
||||
opal_condition_wait(&module->m_cond, &module->m_lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&module->m_lock);
|
||||
|
||||
ret = OMPI_SUCCESS;
|
||||
|
||||
cleanup:
|
||||
if (NULL != module->m_setup_info) {
|
||||
if (NULL != module->m_setup_info->outstanding_btl_requests) {
|
||||
for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
|
||||
OBJ_DESTRUCT(&(module->m_setup_info->outstanding_btl_requests[i]));
|
||||
}
|
||||
free(module->m_setup_info->outstanding_btl_requests);
|
||||
}
|
||||
free(module->m_setup_info);
|
||||
}
|
||||
if (NULL != remote) free(remote);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -25,17 +25,18 @@
|
||||
|
||||
#include "opal/types.h"
|
||||
|
||||
#define OMPI_OSC_RDMA_HDR_PUT 0x0001
|
||||
#define OMPI_OSC_RDMA_HDR_ACC 0x0002
|
||||
#define OMPI_OSC_RDMA_HDR_GET 0x0003
|
||||
#define OMPI_OSC_RDMA_HDR_REPLY 0x0004
|
||||
#define OMPI_OSC_RDMA_HDR_POST 0x0005
|
||||
#define OMPI_OSC_RDMA_HDR_COMPLETE 0x0006
|
||||
#define OMPI_OSC_RDMA_HDR_LOCK_REQ 0x0007
|
||||
#define OMPI_OSC_RDMA_HDR_UNLOCK_REQ 0x0008
|
||||
#define OMPI_OSC_RDMA_HDR_UNLOCK_REPLY 0x0009
|
||||
#define OMPI_OSC_RDMA_HDR_PUT 0x01
|
||||
#define OMPI_OSC_RDMA_HDR_ACC 0x02
|
||||
#define OMPI_OSC_RDMA_HDR_GET 0x03
|
||||
#define OMPI_OSC_RDMA_HDR_REPLY 0x04
|
||||
#define OMPI_OSC_RDMA_HDR_POST 0x05
|
||||
#define OMPI_OSC_RDMA_HDR_COMPLETE 0x06
|
||||
#define OMPI_OSC_RDMA_HDR_LOCK_REQ 0x07
|
||||
#define OMPI_OSC_RDMA_HDR_UNLOCK_REQ 0x08
|
||||
#define OMPI_OSC_RDMA_HDR_UNLOCK_REPLY 0x09
|
||||
#define OMPI_OSC_RDMA_HDR_RDMA_INFO 0x0A
|
||||
|
||||
#define OMPI_OSC_RDMA_HDR_FLAG_NBO 0x0001
|
||||
#define OMPI_OSC_RDMA_HDR_FLAG_NBO 0x01
|
||||
|
||||
struct ompi_osc_rdma_base_header_t {
|
||||
uint8_t hdr_type;
|
||||
@ -122,18 +123,44 @@ typedef struct ompi_osc_rdma_control_header_t ompi_osc_rdma_control_header_t;
|
||||
|
||||
#define OMPI_OSC_RDMA_CONTROL_HDR_HTON(hdr) \
|
||||
do { \
|
||||
OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base) \
|
||||
(hdr).hdr_windx = htons((hdr).hdr_windx); \
|
||||
OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base); \
|
||||
(hdr).hdr_windx = htons((hdr).hdr_windx); \
|
||||
(hdr).hdr_value[0] = htonl((hdr).hdr_value[0]); \
|
||||
(hdr).hdr_value[1] = htonl((hdr).hdr_value[1]); \
|
||||
} while (0)
|
||||
|
||||
#define OMPI_OSC_RDMA_CONTROL_HDR_NTOH(hdr) \
|
||||
do { \
|
||||
OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base) \
|
||||
(hdr).hdr_windx = ntohs((hdr).hdr_windx); \
|
||||
OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base); \
|
||||
(hdr).hdr_windx = ntohs((hdr).hdr_windx); \
|
||||
(hdr).hdr_value[0] = ntohl((hdr).hdr_value[0]); \
|
||||
(hdr).hdr_value[1] = ntohl((hdr).hdr_value[1]); \
|
||||
} while (0)
|
||||
|
||||
|
||||
struct ompi_osc_rdma_rdma_info_header_t {
|
||||
ompi_osc_rdma_base_header_t hdr_base;
|
||||
int16_t hdr_windx;
|
||||
int32_t hdr_origin;
|
||||
uint64_t hdr_segkey;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_rdma_info_header_t ompi_osc_rdma_rdma_info_header_t;
|
||||
|
||||
#define OMPI_OSC_RDMA_RDMA_INFO_HDR_HTON(hdr) \
|
||||
do { \
|
||||
OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base); \
|
||||
(hdr).hdr_windx = htons((hdr).hdr_windx); \
|
||||
(hdr).hdr_origin = htonl((hdr).hdr_origin); \
|
||||
(hdr).hdr_segkey = hton64((hdr).hdr_segkey); \
|
||||
} while (0)
|
||||
|
||||
#define OMPI_OSC_RDMA_RDMA_INFO_HDR_NTOH(hdr) \
|
||||
do { \
|
||||
OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base); \
|
||||
(hdr).hdr_windx = ntohs((hdr).hdr_windx); \
|
||||
(hdr).hdr_origin = ntohl((hdr).hdr_origin); \
|
||||
(hdr).hdr_segkey = ntoh64((hdr).hdr_segkey); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#endif /* OMPI_MCA_OSC_RDMA_HDR_H */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user