2005-07-01 01:28:35 +04:00
|
|
|
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
|
|
|
|
|
|
|
|
#include "ompi_config.h"
|
2005-08-13 01:42:07 +04:00
|
|
|
#include "ompi/include/constants.h"
|
2005-07-04 03:09:55 +04:00
|
|
|
#include "opal/event/event.h"
|
2005-07-04 05:36:20 +04:00
|
|
|
#include "opal/util/if.h"
|
2005-07-04 04:13:44 +04:00
|
|
|
#include "opal/util/argv.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2005-07-01 01:28:35 +04:00
|
|
|
#include "mca/pml/pml.h"
|
|
|
|
#include "mca/btl/btl.h"
|
|
|
|
|
|
|
|
#include "mca/base/mca_base_param.h"
|
|
|
|
#include "mca/errmgr/errmgr.h"
|
|
|
|
#include "mca/mpool/base/base.h"
|
|
|
|
#include "btl_openib.h"
|
|
|
|
#include "btl_openib_frag.h"
|
|
|
|
#include "btl_openib_endpoint.h"
|
2005-07-12 17:38:54 +04:00
|
|
|
#include "mca/btl/base/base.h"
|
2005-07-20 01:04:22 +04:00
|
|
|
#include "mca/btl/base/btl_base_error.h"
|
2005-07-12 17:38:54 +04:00
|
|
|
|
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
#include "datatype/convertor.h"
|
|
|
|
#include "mca/mpool/mvapi/mpool_mvapi.h"
|
2005-07-12 17:38:54 +04:00
|
|
|
#include <sysfs/libsysfs.h>
|
|
|
|
#include <infiniband/verbs.h>
|
2005-07-15 19:13:19 +04:00
|
|
|
#include <errno.h>
|
|
|
|
#include <string.h> /* for strerror()*/
|
2005-07-12 17:38:54 +04:00
|
|
|
|
2005-10-01 02:58:09 +04:00
|
|
|
#include "mca/pml/base/pml_base_module_exchange.h"
|
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_component_t mca_btl_openib_component = {
|
|
|
|
{
|
|
|
|
/* First, the mca_base_component_t struct containing meta information
|
|
|
|
about the component itself */
|
|
|
|
|
|
|
|
{
|
|
|
|
/* Indicate that we are a pml v1.0.0 component (which also implies a
|
|
|
|
specific MCA version) */
|
|
|
|
|
|
|
|
MCA_BTL_BASE_VERSION_1_0_0,
|
|
|
|
|
2005-07-12 23:02:39 +04:00
|
|
|
"openib", /* MCA component name */
|
Major simplifications to component versioning:
- After long discussions and ruminations on how we run components in
LAM/MPI, made the decision that, by default, all components included
in Open MPI will use the version number of their parent project
(i.e., OMPI or ORTE). They are certaint free to use a different
number, but this simplification makes the common cases easy:
- components are only released when the parent project is released
- it is easy (trivial?) to distinguish which version component goes
with with version of the parent project
- removed all autogen/configure code for templating the version .h
file in components
- made all ORTE components use ORTE_*_VERSION for version numbers
- made all OMPI components use OMPI_*_VERSION for version numbers
- removed all VERSION files from components
- configure now displays OPAL, ORTE, and OMPI version numbers
- ditto for ompi_info
- right now, faking it -- OPAL and ORTE and OMPI will always have the
same version number (i.e., they all come from the same top-level
VERSION file). But this paves the way for the Great Configure
Reorganization, where, among other things, each project will have
its own version number.
So all in all, we went from a boatload of version numbers to
[effectively] three. That's pretty good. :-)
This commit was SVN r6344.
2005-07-05 00:12:36 +04:00
|
|
|
OMPI_MAJOR_VERSION, /* MCA component major version */
|
|
|
|
OMPI_MINOR_VERSION, /* MCA component minor version */
|
|
|
|
OMPI_RELEASE_VERSION, /* MCA component release version */
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_component_open, /* component open */
|
|
|
|
mca_btl_openib_component_close /* component close */
|
|
|
|
},
|
|
|
|
|
|
|
|
/* Next the MCA v1.0.0 component meta data */
|
|
|
|
|
|
|
|
{
|
|
|
|
/* Whether the component is checkpointable or not */
|
|
|
|
|
|
|
|
false
|
|
|
|
},
|
|
|
|
|
|
|
|
mca_btl_openib_component_init,
|
|
|
|
mca_btl_openib_component_progress,
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* utility routines for parameter registration
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Register a string MCA parameter under ("btl", "openib", param_name) with
 * the given default and immediately look its value up.
 *
 * param_name     parameter name passed to the registration call
 * default_value  default passed to the registration call
 *
 * Returns whatever mca_base_param_lookup_string() stored, or NULL if the
 * lookup stored nothing (the result starts out as NULL so callers never
 * see an indeterminate pointer).
 */
static inline char* mca_btl_openib_param_register_string(
    const char* param_name,
    const char* default_value)
{
    char *param_value = NULL;
    int id = mca_base_param_register_string("btl","openib",param_name,NULL,default_value);
    mca_base_param_lookup_string(id, &param_value);
    return param_value;
}
|
|
|
|
|
|
|
|
/*
 * Register an integer MCA parameter under ("btl", "openib", param_name)
 * with the given default and immediately look its value up.
 *
 * param_name     parameter name passed to the registration call
 * default_value  default passed to the registration call
 *
 * Returns whatever mca_base_param_lookup_int() stored; the result starts
 * out as default_value, so that is what is returned if the lookup stores
 * nothing.
 */
static inline int mca_btl_openib_param_register_int(
    const char* param_name,
    int default_value)
{
    int id = mca_base_param_register_int("btl","openib",param_name,NULL,default_value);
    int param_value = default_value;
    mca_base_param_lookup_int(id, &param_value);
    return param_value;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called by MCA framework to open the component, registers
|
|
|
|
* component parameters.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_btl_openib_component_open(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
int param, value;
|
|
|
|
|
|
|
|
/* initialize state */
|
|
|
|
mca_btl_openib_component.ib_num_btls=0;
|
2005-07-13 04:17:08 +04:00
|
|
|
mca_btl_openib_component.openib_btls=NULL;
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
/* initialize objects */
|
2005-07-03 20:22:16 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
/* register IB component parameters */
|
|
|
|
mca_btl_openib_component.ib_free_list_num =
|
|
|
|
mca_btl_openib_param_register_int ("free_list_num", 8);
|
|
|
|
mca_btl_openib_component.ib_free_list_max =
|
|
|
|
mca_btl_openib_param_register_int ("free_list_max", 1024);
|
|
|
|
mca_btl_openib_component.ib_free_list_inc =
|
|
|
|
mca_btl_openib_param_register_int ("free_list_inc", 32);
|
|
|
|
mca_btl_openib_component.ib_mem_registry_hints_log_size =
|
|
|
|
mca_btl_openib_param_register_int ("hints_log_size", 8);
|
|
|
|
mca_btl_openib_component.ib_mpool_name =
|
2005-07-14 01:13:30 +04:00
|
|
|
mca_btl_openib_param_register_string("mpool", "openib");
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_component.ib_rr_buf_max =
|
|
|
|
mca_btl_openib_param_register_int("rr_buf_max", 16);
|
|
|
|
mca_btl_openib_component.ib_rr_buf_min =
|
|
|
|
mca_btl_openib_param_register_int("rr_buf_min", 8);
|
|
|
|
mca_btl_openib_component.reg_mru_len =
|
|
|
|
mca_btl_openib_param_register_int("reg_mru_len", 16);
|
|
|
|
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_cq_size =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_cq_size",
|
2005-07-15 19:13:19 +04:00
|
|
|
500);
|
|
|
|
mca_btl_openib_component.ib_wq_size =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_wq_size",
|
2005-07-15 19:13:19 +04:00
|
|
|
500);
|
|
|
|
mca_btl_openib_component.ib_sg_list_size =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_sg_list_size",
|
|
|
|
1);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_pkey_ix =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_pkey_ix",
|
|
|
|
0);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_psn =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_psn",
|
|
|
|
0);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_qp_ous_rd_atom =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_qp_ous_rd_atom",
|
|
|
|
1);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_mtu =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_mtu",
|
2005-07-13 04:17:08 +04:00
|
|
|
IBV_MTU_1024);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_min_rnr_timer =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_min_rnr_timer",
|
|
|
|
5);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_timeout =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_timeout",
|
|
|
|
10);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_retry_count =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_retry_count",
|
|
|
|
7);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_rnr_retry =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_rnr_retry",
|
|
|
|
7);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_max_rdma_dst_ops =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_max_rdma_dst_ops",
|
|
|
|
16);
|
|
|
|
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_service_level =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_service_level",
|
|
|
|
0);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_static_rate =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_static_rate",
|
|
|
|
0);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_component.ib_src_path_bits =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_param_register_int("ib_src_path_bits",
|
2005-07-15 19:13:19 +04:00
|
|
|
0);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mca_btl_openib_module.super.btl_exclusivity =
|
2005-08-12 20:56:46 +04:00
|
|
|
mca_btl_openib_param_register_int ("exclusivity", MCA_BTL_EXCLUSIVITY_DEFAULT);
|
2005-07-15 19:13:19 +04:00
|
|
|
mca_btl_openib_module.super.btl_eager_limit =
|
|
|
|
mca_btl_openib_param_register_int ("eager_limit", (64*1024))
|
|
|
|
- sizeof(mca_btl_openib_header_t);
|
|
|
|
|
|
|
|
mca_btl_openib_module.super.btl_min_send_size =
|
|
|
|
mca_btl_openib_param_register_int ("min_send_size", (64*1024))
|
|
|
|
- sizeof(mca_btl_openib_header_t);
|
|
|
|
|
|
|
|
mca_btl_openib_module.super.btl_max_send_size =
|
|
|
|
mca_btl_openib_param_register_int ("max_send_size", (128*1024))
|
|
|
|
- sizeof(mca_btl_openib_header_t);
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_module.super.btl_min_rdma_size =
|
|
|
|
mca_btl_openib_param_register_int("min_rdma_size",
|
|
|
|
1024*1024);
|
|
|
|
mca_btl_openib_module.super.btl_max_rdma_size =
|
|
|
|
mca_btl_openib_param_register_int("max_rdma_size",
|
|
|
|
1024*1024);
|
|
|
|
mca_btl_openib_module.super.btl_flags =
|
|
|
|
mca_btl_openib_param_register_int("flags",
|
2005-08-17 22:23:38 +04:00
|
|
|
MCA_BTL_FLAGS_PUT);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
|
|
|
|
param = mca_base_param_find("mpi", NULL, "leave_pinned");
|
|
|
|
mca_base_param_lookup_int(param, &value);
|
|
|
|
mca_btl_openib_component.leave_pinned = value;
|
|
|
|
|
|
|
|
mca_btl_openib_component.max_send_size = mca_btl_openib_module.super.btl_max_send_size;
|
|
|
|
mca_btl_openib_component.eager_limit = mca_btl_openib_module.super.btl_eager_limit;
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* component cleanup - sanity checking of queue lengths
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_btl_openib_component_close(void)
|
|
|
|
{
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2005-10-01 02:58:09 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Register MVAPI port information. The MCA framework
|
|
|
|
* will make this available to all peers.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int
|
|
|
|
mca_btl_openib_modex_send(void)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
size_t i;
|
|
|
|
size_t size;
|
|
|
|
mca_btl_openib_port_info_t *ports = NULL;
|
|
|
|
|
|
|
|
size = mca_btl_openib_component.ib_num_btls * sizeof (mca_btl_openib_port_info_t);
|
|
|
|
if (size != 0) {
|
|
|
|
ports = (mca_btl_openib_port_info_t *)malloc (size);
|
|
|
|
if (NULL == ports) {
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
|
|
|
mca_btl_openib_module_t *btl = &mca_btl_openib_component.openib_btls[i];
|
|
|
|
ports[i] = btl->port_info;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rc = mca_pml_base_modex_send (&mca_btl_openib_component.super.btl_version, ports, size);
|
|
|
|
if (NULL != ports) {
|
|
|
|
free (ports);
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
/*
|
|
|
|
* IB component initialization:
|
|
|
|
* (1) read interface list from kernel and compare against component parameters
|
|
|
|
* then create a BTL instance for selected interfaces
|
|
|
|
* (2) setup IB listen socket for incoming connection attempts
|
|
|
|
* (3) register BTL parameters with the MCA
|
|
|
|
*/
|
|
|
|
|
|
|
|
mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
|
|
|
|
bool enable_progress_threads,
|
|
|
|
bool enable_mpi_threads)
|
|
|
|
{
|
|
|
|
struct ibv_device **ib_devs;
|
|
|
|
uint32_t num_devs;
|
|
|
|
mca_btl_base_module_t** btls;
|
|
|
|
uint32_t i,j, length;
|
2005-07-12 17:38:54 +04:00
|
|
|
struct mca_mpool_base_resources_t mpool_resources;
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_t btl_list;
|
2005-07-13 04:17:08 +04:00
|
|
|
mca_btl_openib_module_t * openib_btl;
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_base_selected_module_t* ib_selected;
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t* item;
|
2005-07-20 01:04:22 +04:00
|
|
|
struct dlist *dev_list;
|
|
|
|
|
|
|
|
struct ibv_device* ib_dev;
|
|
|
|
|
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
/* initialization */
|
|
|
|
*num_btl_modules = 0;
|
|
|
|
num_devs = 0;
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
/* Determine the number of hca's available on the host */
|
|
|
|
dev_list = ibv_get_devices();
|
2005-09-30 18:58:59 +04:00
|
|
|
if (NULL == dev_list) {
|
|
|
|
mca_btl_base_error_no_nics("OpenIB", "HCA");
|
2005-10-01 02:58:09 +04:00
|
|
|
mca_btl_openib_component.ib_num_btls = 0;
|
|
|
|
mca_btl_openib_modex_send();
|
2005-09-30 18:58:59 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2005-07-01 01:28:35 +04:00
|
|
|
dlist_start(dev_list);
|
|
|
|
|
|
|
|
dlist_for_each_data(dev_list, ib_dev, struct ibv_device)
|
|
|
|
num_devs++;
|
|
|
|
|
|
|
|
if(0 == num_devs) {
|
2005-09-30 18:58:59 +04:00
|
|
|
mca_btl_base_error_no_nics("OpenIB", "HCA");
|
2005-10-01 02:58:09 +04:00
|
|
|
mca_btl_openib_modex_send();
|
2005-07-01 01:28:35 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Allocate space for the ib devices */
|
2005-07-13 04:17:08 +04:00
|
|
|
ib_devs = (struct ibv_device**) malloc(num_devs * sizeof(struct ibv_dev*));
|
2005-07-01 01:28:35 +04:00
|
|
|
if(NULL == ib_devs) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
dlist_start(dev_list);
|
|
|
|
|
2005-07-13 04:17:08 +04:00
|
|
|
i = 0;
|
2005-07-01 01:28:35 +04:00
|
|
|
dlist_for_each_data(dev_list, ib_dev, struct ibv_device)
|
|
|
|
ib_devs[i++] = ib_dev;
|
|
|
|
|
|
|
|
|
|
|
|
/** We must loop through all the hca id's, get there handles and
|
|
|
|
for each hca we query the number of ports on the hca and set up
|
|
|
|
a distinct btl module for each hca port */
|
|
|
|
|
2005-07-03 20:22:16 +04:00
|
|
|
OBJ_CONSTRUCT(&btl_list, opal_list_t);
|
2005-07-04 02:45:48 +04:00
|
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
|
|
|
|
for(i = 0; i < num_devs; i++){
|
2005-07-13 04:17:08 +04:00
|
|
|
struct ibv_device_attr ib_dev_attr;
|
2005-07-20 01:04:22 +04:00
|
|
|
struct ibv_context* ib_dev_context;
|
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
ib_dev = ib_devs[i];
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
ib_dev_context = ibv_open_device(ib_dev);
|
|
|
|
if(!ib_dev_context) {
|
2005-08-02 17:20:50 +04:00
|
|
|
BTL_ERROR((" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)));
|
2005-07-12 17:38:54 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2005-07-20 01:04:22 +04:00
|
|
|
|
2005-07-13 04:17:08 +04:00
|
|
|
if(ibv_query_device(ib_dev_context, &ib_dev_attr)){
|
2005-08-02 17:20:50 +04:00
|
|
|
BTL_ERROR(("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)));
|
2005-07-01 01:28:35 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2005-07-12 17:38:54 +04:00
|
|
|
|
|
|
|
|
|
|
|
/* Note ports are 1 based hence j = 1 */
|
|
|
|
|
|
|
|
for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++){
|
|
|
|
struct ibv_port_attr* ib_port_attr;
|
2005-07-13 04:17:08 +04:00
|
|
|
ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr));
|
|
|
|
if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){
|
2005-08-02 17:20:50 +04:00
|
|
|
BTL_ERROR(("error getting port attributes for device %s port number %d errno says %s",
|
|
|
|
ibv_get_device_name(ib_dev), j, strerror(errno)));
|
2005-07-12 17:38:54 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if( IBV_PORT_ACTIVE == ib_port_attr->state ){
|
|
|
|
|
2005-07-13 04:17:08 +04:00
|
|
|
openib_btl = (mca_btl_openib_module_t*) malloc(sizeof(mca_btl_openib_module_t));
|
|
|
|
memcpy(openib_btl, &mca_btl_openib_module, sizeof(mca_btl_openib_module));
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
2005-07-12 17:38:54 +04:00
|
|
|
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
|
|
|
openib_btl->ib_dev = ib_dev;
|
|
|
|
openib_btl->ib_dev_context = ib_dev_context;
|
|
|
|
openib_btl->port_num = (uint8_t) j;
|
|
|
|
openib_btl->ib_port_attr = ib_port_attr;
|
2005-10-01 02:58:09 +04:00
|
|
|
openib_btl->port_info.subnet = ib_port_attr->sm_lid; /* store the sm_lid for multi-nic support */
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_append(&btl_list, (opal_list_item_t*) ib_selected);
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_component.ib_num_btls ++;
|
|
|
|
|
|
|
|
}
|
2005-07-15 19:13:19 +04:00
|
|
|
else{
|
|
|
|
free(ib_port_attr);
|
|
|
|
}
|
2005-07-01 01:28:35 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Allocate space for btl modules */
|
2005-07-12 17:38:54 +04:00
|
|
|
mca_btl_openib_component.openib_btls = (mca_btl_openib_module_t*) malloc(sizeof(mca_btl_openib_module_t) *
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_component.ib_num_btls);
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
if(NULL == mca_btl_openib_component.openib_btls) {
|
2005-07-01 01:28:35 +04:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
btls = (struct mca_btl_base_module_t**)
|
|
|
|
malloc(mca_btl_openib_component.ib_num_btls * sizeof(struct mca_btl_openib_module_t*));
|
|
|
|
if(NULL == btls) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++){
|
2005-07-03 20:22:16 +04:00
|
|
|
item = opal_list_remove_first(&btl_list);
|
2005-07-01 01:28:35 +04:00
|
|
|
ib_selected = (mca_btl_base_selected_module_t*)item;
|
2005-07-12 17:38:54 +04:00
|
|
|
openib_btl = (mca_btl_openib_module_t*) ib_selected->btl_module;
|
|
|
|
memcpy(&(mca_btl_openib_component.openib_btls[i]), openib_btl , sizeof(mca_btl_openib_module_t));
|
2005-07-01 01:28:35 +04:00
|
|
|
free(ib_selected);
|
2005-07-12 17:38:54 +04:00
|
|
|
free(openib_btl);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
openib_btl = &mca_btl_openib_component.openib_btls[i];
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
/* Initialize module state */
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
|
|
|
|
OBJ_CONSTRUCT(&openib_btl->send_free_eager, ompi_free_list_t);
|
|
|
|
OBJ_CONSTRUCT(&openib_btl->send_free_max, ompi_free_list_t);
|
|
|
|
OBJ_CONSTRUCT(&openib_btl->send_free_frag, ompi_free_list_t);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
OBJ_CONSTRUCT(&openib_btl->recv_free_eager, ompi_free_list_t);
|
|
|
|
OBJ_CONSTRUCT(&openib_btl->recv_free_max, ompi_free_list_t);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
OBJ_CONSTRUCT(&openib_btl->repost, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&openib_btl->reg_mru_list, opal_list_t);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
if(mca_btl_openib_module_init(openib_btl) != OMPI_SUCCESS) {
|
|
|
|
free(ib_devs);
|
2005-07-01 01:28:35 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
mpool_resources.ib_pd = openib_btl->ib_pd;
|
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
/* initialize the memory pool using the hca */
|
2005-09-13 02:28:23 +04:00
|
|
|
openib_btl->super.btl_mpool =
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_mpool_base_module_create(mca_btl_openib_component.ib_mpool_name,
|
2005-07-12 17:38:54 +04:00
|
|
|
&openib_btl->super,
|
|
|
|
&mpool_resources);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-09-13 02:28:23 +04:00
|
|
|
if(NULL == openib_btl->super.btl_mpool) {
|
2005-08-02 17:20:50 +04:00
|
|
|
BTL_ERROR(("error creating vapi memory pool! aborting openib btl initialization"));
|
2005-07-01 01:28:35 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2005-07-12 17:38:54 +04:00
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
/* Initialize pool of send fragments */
|
|
|
|
length = sizeof(mca_btl_openib_frag_t) +
|
|
|
|
sizeof(mca_btl_openib_header_t) +
|
2005-07-12 17:38:54 +04:00
|
|
|
openib_btl->super.btl_eager_limit+
|
2005-07-01 01:28:35 +04:00
|
|
|
2*MCA_BTL_IB_FRAG_ALIGN;
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
ompi_free_list_init(&openib_btl->send_free_eager,
|
2005-07-01 01:28:35 +04:00
|
|
|
length,
|
|
|
|
OBJ_CLASS(mca_btl_openib_send_frag_eager_t),
|
|
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
|
|
mca_btl_openib_component.ib_free_list_inc,
|
2005-09-13 02:28:23 +04:00
|
|
|
openib_btl->super.btl_mpool);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
ompi_free_list_init(&openib_btl->recv_free_eager,
|
2005-07-01 01:28:35 +04:00
|
|
|
length,
|
|
|
|
OBJ_CLASS(mca_btl_openib_recv_frag_eager_t),
|
|
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
|
|
mca_btl_openib_component.ib_free_list_inc,
|
2005-09-13 02:28:23 +04:00
|
|
|
openib_btl->super.btl_mpool);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
length = sizeof(mca_btl_openib_frag_t) +
|
|
|
|
sizeof(mca_btl_openib_header_t) +
|
2005-07-12 17:38:54 +04:00
|
|
|
openib_btl->super.btl_max_send_size +
|
2005-07-01 01:28:35 +04:00
|
|
|
2*MCA_BTL_IB_FRAG_ALIGN;
|
|
|
|
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
ompi_free_list_init(&openib_btl->send_free_max,
|
2005-07-01 01:28:35 +04:00
|
|
|
length,
|
|
|
|
OBJ_CLASS(mca_btl_openib_send_frag_max_t),
|
|
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
|
|
mca_btl_openib_component.ib_free_list_inc,
|
2005-09-13 02:28:23 +04:00
|
|
|
openib_btl->super.btl_mpool);
|
2005-07-12 17:38:54 +04:00
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
/* Initialize pool of receive fragments */
|
2005-07-12 17:38:54 +04:00
|
|
|
ompi_free_list_init (&openib_btl->recv_free_max,
|
2005-07-01 01:28:35 +04:00
|
|
|
length,
|
|
|
|
OBJ_CLASS (mca_btl_openib_recv_frag_max_t),
|
|
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
|
|
mca_btl_openib_component.ib_free_list_max,
|
2005-09-13 02:28:23 +04:00
|
|
|
mca_btl_openib_component.ib_free_list_inc, openib_btl->super.btl_mpool);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
length = sizeof(mca_btl_openib_frag_t) +
|
|
|
|
sizeof(mca_btl_openib_header_t)+
|
|
|
|
2*MCA_BTL_IB_FRAG_ALIGN;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
ompi_free_list_init(&openib_btl->send_free_frag,
|
2005-07-01 01:28:35 +04:00
|
|
|
length,
|
|
|
|
OBJ_CLASS(mca_btl_openib_send_frag_frag_t),
|
|
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
|
|
mca_btl_openib_component.ib_free_list_inc,
|
2005-09-13 02:28:23 +04:00
|
|
|
openib_btl->super.btl_mpool);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
|
|
|
|
/* Initialize the rr_desc_post array for posting of rr*/
|
2005-07-13 04:17:08 +04:00
|
|
|
openib_btl->rr_desc_post = (struct ibv_recv_wr *)
|
|
|
|
malloc((mca_btl_openib_component.ib_rr_buf_max * sizeof(struct ibv_recv_wr)));
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-07-12 17:38:54 +04:00
|
|
|
btls[i] = &openib_btl->super;
|
2005-07-01 01:28:35 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Post OOB receive to support dynamic connection setup */
|
|
|
|
mca_btl_openib_post_recv();
|
2005-10-01 02:58:09 +04:00
|
|
|
mca_btl_openib_modex_send();
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
*num_btl_modules = mca_btl_openib_component.ib_num_btls;
|
2005-07-13 04:17:08 +04:00
|
|
|
free(ib_devs);
|
2005-07-01 01:28:35 +04:00
|
|
|
return btls;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* IB component progress.
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
int mca_btl_openib_component_progress()
|
|
|
|
{
|
2005-08-08 23:10:36 +04:00
|
|
|
uint32_t i;
|
|
|
|
int count = 0,ne;
|
2005-07-01 01:28:35 +04:00
|
|
|
mca_btl_openib_frag_t* frag;
|
2005-07-20 01:04:22 +04:00
|
|
|
mca_btl_openib_endpoint_t* endpoint;
|
2005-07-01 01:28:35 +04:00
|
|
|
/* Poll for completions */
|
|
|
|
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
2005-07-13 04:17:08 +04:00
|
|
|
|
|
|
|
struct ibv_wc wc;
|
2005-07-12 17:38:54 +04:00
|
|
|
mca_btl_openib_module_t* openib_btl = &mca_btl_openib_component.openib_btls[i];
|
2005-07-20 19:17:18 +04:00
|
|
|
memset(&wc, 0, sizeof(struct ibv_wc));
|
|
|
|
|
2005-07-20 01:04:22 +04:00
|
|
|
/* we have two completion queues, one for "high" priority and one for "low".
|
|
|
|
* we will check the high priority and process them until there are none left.
|
|
|
|
* note that low priority messages are only processed one per progress call.
|
|
|
|
*/
|
2005-08-18 21:08:27 +04:00
|
|
|
ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc );
|
|
|
|
if(ne < 0 ){
|
|
|
|
BTL_ERROR(("error polling CQ with %d errno says %s\n", ne, strerror(errno)));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
else if(wc.status != IBV_WC_SUCCESS) {
|
|
|
|
BTL_ERROR(("error polling CQ with status %d for wr_id %llu\n",
|
|
|
|
wc.status, wc.wr_id));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
else if(1 == ne) {
|
|
|
|
BTL_VERBOSE(("completion queue event says opcode is %d\n", wc.opcode));
|
|
|
|
|
|
|
|
/* Handle work completions */
|
|
|
|
switch(wc.opcode) {
|
|
|
|
case IBV_WC_RECV_RDMA_WITH_IMM:
|
|
|
|
BTL_ERROR(("Got an RDMA with Immediate data Not supported!"));
|
2005-07-01 01:28:35 +04:00
|
|
|
return OMPI_ERROR;
|
|
|
|
|
2005-08-18 21:08:27 +04:00
|
|
|
case IBV_WC_RECV:
|
|
|
|
/* Process a RECV */
|
2005-07-20 01:04:22 +04:00
|
|
|
|
2005-08-18 21:08:27 +04:00
|
|
|
BTL_VERBOSE(("Got an recv on the completion queue"));
|
2005-10-01 02:58:09 +04:00
|
|
|
frag = (mca_btl_openib_frag_t*) (void*) wc.wr_id;
|
2005-08-18 21:08:27 +04:00
|
|
|
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
|
|
|
|
frag->rc=OMPI_SUCCESS;
|
|
|
|
frag->segment.seg_len =
|
|
|
|
wc.byte_len-
|
|
|
|
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
|
|
|
|
2005-07-20 01:04:22 +04:00
|
|
|
|
2005-08-18 21:08:27 +04:00
|
|
|
/* advance the segment address past the header and subtract from the length..*/
|
|
|
|
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
|
|
|
|
frag->hdr->tag,
|
|
|
|
&frag->base,
|
|
|
|
openib_btl->ib_reg[frag->hdr->tag].cbdata);
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-08-18 21:08:27 +04:00
|
|
|
OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
|
|
|
|
MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
|
|
|
|
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
|
|
|
|
count++;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case IBV_WC_RDMA_READ:
|
|
|
|
case IBV_WC_RDMA_WRITE:
|
|
|
|
case IBV_WC_SEND :
|
2005-07-20 01:04:22 +04:00
|
|
|
|
2005-08-18 21:08:27 +04:00
|
|
|
/* Process a completed send or rdma write*/
|
|
|
|
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
|
|
|
frag->rc = OMPI_SUCCESS;
|
|
|
|
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
|
|
|
|
count++;
|
|
|
|
break;
|
2005-07-20 01:04:22 +04:00
|
|
|
|
2005-08-18 21:08:27 +04:00
|
|
|
|
|
|
|
default:
|
|
|
|
BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode));
|
|
|
|
break;
|
2005-07-01 01:28:35 +04:00
|
|
|
}
|
|
|
|
}
|
2005-07-13 04:17:08 +04:00
|
|
|
|
|
|
|
ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc );
|
|
|
|
if(ne < 0){
|
2005-08-02 17:20:50 +04:00
|
|
|
BTL_ERROR(("error polling CQ with %d errno says %s", ne, strerror(errno)));
|
2005-07-13 04:17:08 +04:00
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
else if(wc.status != IBV_WC_SUCCESS) {
|
2005-08-02 17:20:50 +04:00
|
|
|
BTL_ERROR(("error polling CQ with status %d for wr_id %llu",
|
|
|
|
wc.status, wc.wr_id));
|
2005-07-13 04:17:08 +04:00
|
|
|
return OMPI_ERROR;
|
2005-07-01 01:28:35 +04:00
|
|
|
}
|
2005-07-13 04:17:08 +04:00
|
|
|
else if(1 == ne) {
|
2005-07-01 01:28:35 +04:00
|
|
|
/* Handle n/w completions */
|
2005-07-13 04:17:08 +04:00
|
|
|
switch(wc.opcode) {
|
|
|
|
case IBV_WC_RECV_RDMA_WITH_IMM:
|
2005-08-02 17:20:50 +04:00
|
|
|
BTL_ERROR(("Got an RDMA with Immediate data Not supported!"));
|
2005-07-13 04:17:08 +04:00
|
|
|
return OMPI_ERROR;
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-07-20 01:04:22 +04:00
|
|
|
case IBV_WC_RECV:
|
|
|
|
/* process a recv completion (this should only occur for a send not an rdma) */
|
2005-08-09 21:49:39 +04:00
|
|
|
BTL_VERBOSE(( "Got a recv completion"));
|
2005-07-20 01:04:22 +04:00
|
|
|
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
|
|
|
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
|
|
|
|
frag->rc=OMPI_SUCCESS;
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-07-20 01:04:22 +04:00
|
|
|
/* advance the segment address past the header and subtract from the length..*/
|
|
|
|
frag->segment.seg_len =
|
|
|
|
wc.byte_len-
|
|
|
|
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
|
|
|
|
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-07-20 01:04:22 +04:00
|
|
|
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
|
|
|
|
frag->hdr->tag,
|
|
|
|
&frag->base,
|
|
|
|
openib_btl->ib_reg[frag->hdr->tag].cbdata);
|
|
|
|
|
2005-07-20 19:17:18 +04:00
|
|
|
OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1);
|
|
|
|
MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
|
2005-07-20 01:04:22 +04:00
|
|
|
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_max), (opal_list_item_t*) frag);
|
|
|
|
count++;
|
|
|
|
break;
|
2005-07-01 01:28:35 +04:00
|
|
|
|
2005-08-18 21:08:27 +04:00
|
|
|
case IBV_WC_RDMA_READ:
|
2005-07-20 01:04:22 +04:00
|
|
|
case IBV_WC_RDMA_WRITE:
|
|
|
|
case IBV_WC_SEND :
|
2005-08-18 21:08:27 +04:00
|
|
|
|
2005-07-20 01:04:22 +04:00
|
|
|
/* Process a completed send */
|
|
|
|
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
|
|
|
frag->rc = OMPI_SUCCESS;
|
|
|
|
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
|
|
|
|
count++;
|
2005-07-01 01:28:35 +04:00
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
2005-08-02 17:20:50 +04:00
|
|
|
BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode));
|
2005-07-01 01:28:35 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
return count;
|
|
|
|
}
|