/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_rdma.h"
#include "btl_ugni_smsg.h"

#include "opal/memoryhooks/memory.h"
#include "opal/runtime/opal_params.h"

static int btl_ugni_component_register(void);
static int btl_ugni_component_open(void);
static int btl_ugni_component_close(void);
static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool);
static int mca_btl_ugni_component_progress(void);

mca_btl_ugni_component_t mca_btl_ugni_component = {
    .super = {
        /* First, the mca_base_component_t struct containing meta information
           about the component itself */
        .btl_version = {
            MCA_BTL_DEFAULT_VERSION("ugni"),
            .mca_open_component = btl_ugni_component_open,
            .mca_close_component = btl_ugni_component_close,
            .mca_register_component_params = btl_ugni_component_register,
        },
        .btl_data = {
            .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
        },
        .btl_init = mca_btl_ugni_component_init,
        .btl_progress = mca_btl_ugni_component_progress,
    }
};

mca_base_var_enum_value_t mpool_values[] = {
    {MCA_BTL_UGNI_MPOOL_UDREG, "udreg"},
    {MCA_BTL_UGNI_MPOOL_GRDMA, "grdma"},
    {-1, NULL} /* sentinel */
};

static int
btl_ugni_component_register(void)
{
    mca_base_var_enum_t *new_enum;
    int rc;

    (void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
                                                 "uGNI byte transport layer");

    mca_btl_ugni_component.ugni_free_list_num = 8;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "free_list_num", NULL, MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_free_list_num);
    mca_btl_ugni_component.ugni_free_list_max = 4096;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "free_list_max", NULL, MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_free_list_max);
    mca_btl_ugni_component.ugni_free_list_inc = 64;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_free_list_inc);

    mca_btl_ugni_component.ugni_eager_num = 16;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "eager_num", NULL, MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_eager_num);
    mca_btl_ugni_component.ugni_eager_max = 128;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "eager_max", NULL, MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_eager_max);
    mca_btl_ugni_component.ugni_eager_inc = 16;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "eager_inc", NULL, MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_eager_inc);

    mca_btl_ugni_component.remote_cq_size = 40000;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "remote_cq_size", "Remote SMSG completion queue "
                                           "size (default 40000)", MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.remote_cq_size);
    mca_btl_ugni_component.local_cq_size = 8192;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "local_cq_size", "Local completion queue size "
                                           "(default 8192)", MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.local_cq_size);

    mca_btl_ugni_component.ugni_smsg_limit = 0;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "smsg_limit", "Maximum size message that "
                                           "will be sent using the SMSG/MSGQ protocol "
                                           "(0 - autoselect (default), 16k max)",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_smsg_limit);

    mca_btl_ugni_component.smsg_max_credits = 32;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "smsg_max_credits", "Maximum number of "
                                           "outstanding SMSG/MSGQ messages (default 32)",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.smsg_max_credits);

    mca_btl_ugni_component.ugni_fma_limit = 1024;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "fma_limit", "Maximum size message that "
                                           "will be sent using the FMA (Fast Memory "
                                           "Access) protocol (default 1024, 64k max)",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_fma_limit);

    mca_btl_ugni_component.ugni_get_limit = 1 * 1024 * 1024;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "get_limit", "Maximum size message that "
                                           "will be sent using a get protocol "
                                           "(default 1M)", MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_get_limit);

    mca_btl_ugni_component.rdma_max_retries = 16;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.rdma_max_retries);

    mca_btl_ugni_component.smsg_max_retries = 16;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "smsg_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.smsg_max_retries);

    mca_btl_ugni_component.max_mem_reg = 0;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "max_mem_reg", "Maximum number of "
                                           "memory registrations a process can "
                                           "hold (0 - autoselect, -1 - unlimited)"
                                           " (default 0)", MCA_BASE_VAR_TYPE_INT,
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                           OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.max_mem_reg);

    mca_btl_ugni_component.mbox_increment = 0;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "mbox_inc", "Number of SMSG mailboxes to "
                                           "allocate in each block (0 - autoselect (default))",
                                           MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
                                           MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mbox_increment);

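    /* Note: 2 << 20 is 2097152 bytes, i.e. the 2 MiB default quoted in the
     * help string below. */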
    mca_btl_ugni_component.smsg_page_size = 2 << 20;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "smsg_page_size", "Page size to use for SMSG "
                                           "mailbox allocation (default 2M)",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.smsg_page_size);

    /* btl/ugni can only support a fixed set of mpools (these mpools have
     * compatible resource structures) */
    rc = mca_base_var_enum_create ("btl_ugni_mpool", mpool_values, &new_enum);
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

    mca_btl_ugni_component.mpool_type = MCA_BTL_UGNI_MPOOL_UDREG;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "mpool", "mpool to use", MCA_BASE_VAR_TYPE_INT, new_enum,
                                           0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
                                           MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mpool_type);
    OBJ_RELEASE(new_enum);
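
    /* Default protocol cutoffs for this module. Read together with the MCA
     * variables above, the intent appears to be: sends up to the eager/SMSG
     * limit travel over SMSG/MSGQ, while larger transfers use the RDMA
     * (FMA/get/put) pipeline. Descriptive note only; the authoritative
     * selection logic lives in the send/put/get paths. */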
    mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;

    /* smsg threshold */
    mca_btl_ugni_module.super.btl_eager_limit = 8 * 1024;
    mca_btl_ugni_module.super.btl_rndv_eager_limit = 8 * 1024;
    mca_btl_ugni_module.super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
    mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
    mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;

    /* threshold for put */
    mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;

    mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
        MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;

    mca_btl_ugni_module.super.btl_seg_size = sizeof (mca_btl_ugni_segment_t);

    mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbits/sec */
    mca_btl_ugni_module.super.btl_latency = 2;       /* microseconds */

    /* Call the BTL base to register its MCA params */
    mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version,
                                &mca_btl_ugni_module.super);

    return OPAL_SUCCESS;
}

static int
btl_ugni_component_open(void)
{
    mca_btl_ugni_component.ugni_num_btls = 0;
    mca_btl_ugni_component.modules = NULL;

    return OPAL_SUCCESS;
}

/*
 * component cleanup - shut down uGNI and release module resources
 */
static int
btl_ugni_component_close(void)
{
    opal_common_ugni_fini ();

    if (mca_btl_ugni_component.modules) {
        free (mca_btl_ugni_component.modules);
        mca_btl_ugni_component.modules = NULL;
    }

    return OPAL_SUCCESS;
}
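
/* NB: leave_pinned keeps memory registrations cached between operations.
 * That is only safe if stale registrations can be detected: the udreg mpool
 * tracks registrations itself, while other mpools rely on both the free()
 * and munmap() memory hooks to invalidate cached registrations when memory
 * is returned to the OS. */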
static void mca_btl_ugni_autoset_leave_pinned (void) {
    if (MCA_BTL_UGNI_MPOOL_UDREG != mca_btl_ugni_component.mpool_type) {
        int value = opal_mem_hooks_support_level();
        if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
            ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & value)) {
            /* Set leave pinned to 1 if leave pinned pipeline is not set */
            if (-1 == opal_leave_pinned) {
                opal_leave_pinned = !opal_leave_pinned_pipeline;
            }
        } else {
            opal_leave_pinned = 0;
            opal_leave_pinned_pipeline = 0;
        }
    } else if (-1 == opal_leave_pinned) {
        /* if udreg is in use we can set leave pinned without checking for the
         * memory hooks. */
        opal_leave_pinned = !opal_leave_pinned_pipeline;
    }
}

static mca_btl_base_module_t **
mca_btl_ugni_component_init (int *num_btl_modules,
                             bool enable_progress_threads,
                             bool enable_mpi_threads)
{
    struct mca_btl_base_module_t **base_modules;
    mca_btl_ugni_module_t *ugni_modules;
    unsigned int i;
    int rc;

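    /* clamp user-supplied limits to the protocol maxima advertised in the
     * MCA variable help strings (16k for SMSG, 64k for FMA) */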
    if (16384 < mca_btl_ugni_component.ugni_smsg_limit) {
        mca_btl_ugni_component.ugni_smsg_limit = 16384;
    }

    if (65536 < mca_btl_ugni_component.ugni_fma_limit) {
        mca_btl_ugni_component.ugni_fma_limit = 65536;
    }

    if (enable_mpi_threads) {
        mca_btl_ugni_component.progress_thread_allowed = 1;
    }

    /* Initialize the uGNI library and create the communication domain */
    rc = opal_common_ugni_init();
    if (OPAL_SUCCESS != rc) {
        return NULL;
    }

    /* Create and initialize one module per uGNI device */
    mca_btl_ugni_component.ugni_num_btls = opal_common_ugni_module.device_count;

    BTL_VERBOSE(("btl/ugni initializing"));

    ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *)
        calloc (mca_btl_ugni_component.ugni_num_btls,
                sizeof (mca_btl_ugni_module_t));

    if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) {
        BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
        return NULL;
    }

    base_modules = (struct mca_btl_base_module_t **)
        calloc (mca_btl_ugni_component.ugni_num_btls,
                sizeof (struct mca_btl_base_module_t *));
    if (OPAL_UNLIKELY(NULL == base_modules)) {
        BTL_ERROR(("Malloc failed: %s:%d", __FILE__, __LINE__));
        return NULL;
    }

    mca_btl_ugni_autoset_leave_pinned ();

    mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit;

    for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
        mca_btl_ugni_module_t *ugni_module = ugni_modules + i;

        rc = mca_btl_ugni_module_init (ugni_module,
                                       opal_common_ugni_module.devices + i);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
            BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__,
                       __LINE__));
            return NULL;
        }

        base_modules[i] = (mca_btl_base_module_t *) ugni_module;
    }

    *num_btl_modules = mca_btl_ugni_component.ugni_num_btls;

    BTL_VERBOSE(("btl/ugni done initializing %d module(s)", *num_btl_modules));

    return base_modules;
}
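
/* NB: a datagram id appears to encode the request type in the bits covered
 * by MCA_BTL_UGNI_DATAGRAM_MASK (directed vs. wildcard connection) and, for
 * directed datagrams, the endpoint's index in the remaining bits. */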
static inline int
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
    uint32_t remote_addr, remote_id;
    uint64_t datagram_id, data;
    mca_btl_base_endpoint_t *ep;
    gni_post_state_t post_state;
    gni_ep_handle_t handle;
    gni_return_t grc;
    int count = 0, rc;

    /* check for datagram completion */
    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); /* TODO: may not need lock for this function */
    grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
        return 0;
    }

    data = datagram_id & ~(MCA_BTL_UGNI_DATAGRAM_MASK);

    BTL_VERBOSE(("datagram_id: %" PRIx64 ", mask: %" PRIx64, datagram_id, (uint64_t) (datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK)));

    if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_DIRECTED_ID) {
        ep = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&ugni_module->endpoints, data);
        handle = ep->smsg_ep_handle;
    } else {
        handle = ugni_module->wildcard_ep;
    }

    /* wait for the incoming datagram to complete (in case it hasn't yet) */
    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); /* TODO: may not need lock for this function */
    grc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state,
                                  &remote_addr, &remote_id);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (GNI_RC_SUCCESS != grc) {
        BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc));
        return opal_common_rc_ugni_to_opal (grc);
    }

    /* if this is a wildcard endpoint look up the remote peer by the proc id we received */
    if (handle == ugni_module->wildcard_ep) {
        BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64, ugni_module->wc_remote_attr.proc_id));
        rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint,
                                               ugni_module->wc_remote_attr.proc_id,
                                               (void *) &ep);
        /* check if the endpoint is known */
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == ep)) {
            BTL_ERROR(("received connection attempt from an unknown peer. rc: %d, ep: %p, id: 0x%" PRIx64,
                       rc, (void *) ep, ugni_module->wc_remote_attr.proc_id));
            return OPAL_ERR_NOT_FOUND;
        }
    } else {
        BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
    }

    /* should not have gotten a NULL endpoint */
    assert (NULL != ep);

    BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, "
                 "data = 0x%" PRIx64 ", ep = %p, remote id: %d", datagram_id, post_state,
                 data, (void *) ep, remote_id));

    /* NTH: TODO -- error handling */
    (void) mca_btl_ugni_ep_connect_progress (ep);

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        /* process messages waiting in the endpoint's smsg mailbox */
        count = mca_btl_ugni_smsg_process (ep);
    }

    /* repost the wildcard datagram */
    if (handle == ugni_module->wildcard_ep) {
        mca_btl_ugni_wildcard_ep_post (ugni_module);
    }

    return count;
}
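
/* NB: which_cq selects the completion queue to drain: 0 for the default
 * local RDMA CQ, 1 for the IRQ CQ that is only polled from the extra
 * progress path in mca_btl_ugni_component_progress() below. */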
static inline int
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
    opal_common_ugni_post_desc_t *desc;
    mca_btl_ugni_base_frag_t *frag;
    gni_cq_entry_t event_data = 0;
    uint32_t recoverable = 1;
    gni_return_t rc;
    gni_cq_handle_t the_cq;

    the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq;

    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
    rc = GNI_CqGetEvent (the_cq, &event_data);
    if (GNI_RC_NOT_DONE == rc) {
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("unhandled post error! ugni rc = %d %s", rc, gni_err_str[rc]));
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return opal_common_rc_ugni_to_opal (rc);
    }

    rc = GNI_GetCompleted (the_cq, event_data, (gni_post_descriptor_t **) &desc);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) {
        BTL_ERROR(("Error in GNI_GetCompleted %s", gni_err_str[rc]));
        return opal_common_rc_ugni_to_opal (rc);
    }

    frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) {
        char buffer[1024];

        (void) GNI_CqErrorRecoverable (event_data, &recoverable);
        GNI_CqErrorStr(event_data, buffer, sizeof(buffer));

        if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
                          !recoverable)) {
            /* give up */
            BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer));
            mca_btl_ugni_frag_complete (frag, OPAL_ERROR);

            return OPAL_ERROR;
        }

        /* repost transaction */
        mca_btl_ugni_repost (frag);

        return 0;
    }

    BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));

    mca_btl_ugni_frag_complete (frag, opal_common_rc_ugni_to_opal (rc));

    return 1;
}
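
/* NB: failed_frags holds transactions that could not be (re)posted earlier;
 * each call retries at most the number of fragments that were queued on
 * entry, so one pass cannot spin on a fragment that keeps failing. */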
static inline int
mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
{
    int count = opal_list_get_size (&ugni_module->failed_frags);
    int i;

    for (i = 0 ; i < count ; ++i) {
        OPAL_THREAD_LOCK(&ugni_module->failed_frags_lock);
        mca_btl_ugni_base_frag_t *frag =
            (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
        OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
        if (NULL == frag) {
            break;
        }

        mca_btl_ugni_repost (frag);
    }

    return count;
}
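
/* NB: drains the endpoint wait list under its lock, bounded by the number of
 * endpoints queued on entry. An endpoint whose sends still cannot progress
 * re-appends itself, so the bound keeps a single pass from looping forever. */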
static inline int
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
{
    int rc = OPAL_SUCCESS;
    mca_btl_base_endpoint_t *endpoint = NULL;
    int count;

    OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
    count = opal_list_get_size(&ugni_module->ep_wait_list);
    OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);

    do {
        OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
        endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
        OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
        if (endpoint != NULL) {
            endpoint->wait_listed = false;

            rc = mca_btl_ugni_progress_send_wait_list (endpoint);

            if (OPAL_SUCCESS != rc && false == endpoint->wait_listed) {
                endpoint->wait_listed = true;
                OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
                opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
                OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
            }
        }

        --count;
        if (count == 0) break;

    } while (endpoint != NULL);

    return rc;
}
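
/* One progress pass per uGNI device: retry failed RDMA transactions, drain
 * the endpoint wait list, then poll connection datagrams, local and remote
 * SMSG completions, and the local RDMA CQ(s). */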
static int mca_btl_ugni_component_progress (void)
{
    mca_btl_ugni_module_t *ugni_module;
    unsigned int i;
    int count = 0;

    for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
        ugni_module = mca_btl_ugni_component.modules + i;

        mca_btl_ugni_retry_failed (ugni_module);
        mca_btl_ugni_progress_wait_list (ugni_module);

        count += mca_btl_ugni_progress_datagram (ugni_module);
        count += mca_btl_ugni_progress_local_smsg (ugni_module);
        count += mca_btl_ugni_progress_remote_smsg (ugni_module);
        count += mca_btl_ugni_progress_rdma (ugni_module, 0);
        if (howards_progress_var) {
            count += mca_btl_ugni_progress_rdma (ugni_module, 1);
        }
    }

    return count;
}