1
1
openmpi/ompi/mca/mtl/ofi/mtl_ofi_component.c
Howard Pritchard eb977de5e9 mtl: add query method to mtl components
Switch to using the query/priority method for selecting
MTLs.  This switch was motivated by the fact that now
on some platforms, its possible for multiple MTLs to
be initializable, but only one MTL should be selected.

In addition, there is a complication with the PSM and
IFO (with PSM provider) MTLs owing to the fact that
they cannot both intialize the underlying PSM context,
i.e. only one call to psm_init is allowed per process.

The mxm component has not been compiled as the author
doesn't currently have access to a system with a recent
enough mxm installed to allow for a compile.

The portals4, ofi, and psm components have been checked
for compilation.  The ofi and psm components have been
checked for runtime correctness on a intel/qlogic system
with up to date PSM installed.
2015-01-29 09:02:52 -07:00

508 строки
18 KiB
C

/*
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
*
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/mca/event/event.h"
#include "opal/util/output.h"
#include "opal/mca/pmix/pmix.h"
#include "mtl_ofi.h"
#include "mtl_ofi_types.h"
#include "mtl_ofi_request.h"
#include "mtl_ofi_message.h"
static int ompi_mtl_ofi_component_open(void);
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
static int ompi_mtl_ofi_component_close(void);
static int ompi_mtl_ofi_component_register(void);
static mca_mtl_base_module_t*
ompi_mtl_ofi_component_init(bool enable_progress_threads,
bool enable_mpi_threads);
static int param_priority;
mca_mtl_ofi_component_t mca_mtl_ofi_component = {
{
/* First, the mca_base_component_t struct containing meta
* information about the component itself */
{
MCA_MTL_BASE_VERSION_2_0_0,
"ofi", /* MCA component name */
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
ompi_mtl_ofi_component_open, /* component open */
ompi_mtl_ofi_component_close, /* component close */
ompi_mtl_ofi_component_query,
ompi_mtl_ofi_component_register
},
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
ompi_mtl_ofi_component_init, /* component init */
}
};
static int
ompi_mtl_ofi_component_register(void)
{
ompi_mtl_ofi.provider_name = NULL;
(void) mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
"provider",
"Name of OFI provider to use",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_ofi.provider_name);
param_priority = 10; /* for now give a lower priority than the psm mtl */
mca_base_component_var_register (&mca_mtl_ofi_component.super.mtl_version,
"priority", "Priority of the OFI MTL component",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&param_priority);
return OMPI_SUCCESS;
}
static int
ompi_mtl_ofi_component_open(void)
{
ompi_mtl_ofi.base.mtl_request_size =
sizeof(ompi_mtl_ofi_request_t) - sizeof(struct mca_mtl_request_t);
OBJ_CONSTRUCT(&ompi_mtl_ofi.free_messages, opal_free_list_t);
opal_free_list_init(&ompi_mtl_ofi.free_messages,
sizeof(ompi_mtl_ofi_message_t),
OBJ_CLASS(ompi_mtl_ofi_message_t),
1, -1, 1);
ompi_mtl_ofi.domain = NULL;
ompi_mtl_ofi.av = NULL;
ompi_mtl_ofi.cq = NULL;
ompi_mtl_ofi.mr = NULL;
ompi_mtl_ofi.ep = NULL;
return OMPI_SUCCESS;
}
static int
ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority)
{
*priority = param_priority;
*module = &ompi_mtl_ofi.base;
return OMPI_SUCCESS;
}
static int
ompi_mtl_ofi_component_close(void)
{
OBJ_DESTRUCT(&ompi_mtl_ofi.free_messages);
return OMPI_SUCCESS;
}
static mca_mtl_base_module_t*
ompi_mtl_ofi_component_init(bool enable_progress_threads,
bool enable_mpi_threads)
{
int ret, fi_version;
struct fi_info hints = {0};
struct fi_info *providers = NULL, *prov = NULL;
struct fi_domain_attr domain_attr = {0};
struct fi_fabric_attr fabric_attr = {0};
struct fi_cq_attr cq_attr = {0};
struct fi_av_attr av_attr = {0};
fi_addr_t ep_name = 0;
char *null_addr = NULL;
size_t namelen;
/**
* Hints to filter providers
* See man fi_getinfo for a list of all filters
* mode: Select capabilities MTL is prepared to support.
* In this case, MTL will pass in context into communication calls
* ep_type: reliable datagram operation
* caps: Capabilities required from the provider. The bits specified
* with buffered receive and cancel implement MPI semantics.
* Tagged is used to support tag matching.
* We expect to register all memory up front for use with this
* endpoint, so the MTL requires dynamic memory regions
*/
hints.mode = FI_CONTEXT;
hints.ep_type = FI_EP_RDM; /* Reliable datagram */
hints.caps = FI_TAGGED; /* Tag matching interface */
hints.caps |= FI_BUFFERED_RECV; /* Buffered receives */
hints.caps |= FI_CANCEL; /* Support cancel */
hints.caps |= FI_DYNAMIC_MR; /* Global dynamic mem region */
/**
* Refine filter for additional capabilities
* threading: Disable locking
* control_progress: enable async progress
*/
domain_attr.threading = FI_THREAD_ENDPOINT;
domain_attr.control_progress = FI_PROGRESS_AUTO;
fabric_attr.prov_name = ompi_mtl_ofi.provider_name;
hints.domain_attr = &domain_attr;
hints.fabric_attr = &fabric_attr;
/**
* FI_VERSION provides binary backward and forward compatibility support
* Specify the version of OFI is coded to, the provider will select struct
* layouts that are compatible with this version.
*/
fi_version = FI_VERSION(1, 0);
/**
* fi_getinfo: returns information about fabric services for reaching a
* remote node or service. this does not necessarily allocate resources.
* Pass NULL for name/service because we want a list of providers supported.
*/
ret = fi_getinfo(fi_version, /* OFI version requested */
NULL, /* Optional name or fabric to resolve */
NULL, /* Optional service name or port to request */
0ULL, /* Optional flag */
&hints, /* In: Hints to filter providers */
&providers); /* Out: List of matching providers */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getinfo failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Here we elect to use the first provider from the list.
* Further filtering could be done at this point (e.g. name).
*/
prov = providers;
/**
* Open fabric
* The getinfo struct returns a fabric attribute struct that can be used to
* instantiate the virtual or physical network. This opens a "fabric
* provider". See man fi_fabric for details.
*/
ret = fi_fabric(prov->fabric_attr, /* In: Fabric attributes */
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */
NULL); /* Optional context for fabric events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_fabric failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Create the access domain, which is the physical or virtual network or
* hardware port/collection of ports. Returns a domain object that can be
* used to create endpoints. See man fi_domain for details.
*/
ret = fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */
prov, /* In: Provider */
&ompi_mtl_ofi.domain, /* Out: Domain oject */
NULL); /* Optional context for domain events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_domain failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Create a transport level communication endpoint. To use the endpoint,
* it must be bound to completion counters or event queues and enabled,
* and the resources consumed by it, such as address vectors, counters,
* completion queues, etc.
* see man fi_endpoint for more details.
*/
ret = fi_endpoint(ompi_mtl_ofi.domain, /* In: Domain object */
prov, /* In: Provider */
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
NULL); /* Optional context */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_endpoint failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Create the objects that will be bound to the endpoint.
* The objects include:
* - completion queue for events
* - address vector of other endpoint addresses
* - dynamic memory-spanning memory region
*/
cq_attr.format = FI_CQ_FORMAT_TAGGED;
ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.cq, NULL);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_cq_open failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* The remote fi_addr will be stored in the ofi_endpoint struct.
* So, we use the AV in "map" mode.
*/
av_attr.type = FI_AV_MAP;
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_av_open failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* All OFI communication routines require at least one MR.
* This MTL only needs a single MR.
*/
ret = fi_mr_reg(ompi_mtl_ofi.domain, /* In: Domain object */
0, /* In: Lower memory address */
UINTPTR_MAX, /* In: Upper memory address */
FI_SEND | FI_RECV, /* In: Expose MR for read/write */
0ULL, /* In: base MR offset */
0ULL, /* In: requested key */
0ULL, /* In: No flags */
&ompi_mtl_ofi.mr, /* Out: memregion object */
NULL); /* Context: memregion events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_mr_reg failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Bind the CQ and AV to the endpoint object.
*/
ret = fi_ep_bind(ompi_mtl_ofi.ep,
(fid_t)ompi_mtl_ofi.cq,
FI_SEND | FI_RECV);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_bind CQ-EP failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
ret = fi_ep_bind(ompi_mtl_ofi.ep,
(fid_t)ompi_mtl_ofi.av,
0);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_bind AV-EP failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Enable the endpoint for communication
* This commits the bind operations.
*/
ret = fi_enable(ompi_mtl_ofi.ep);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_enable failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Free providers info since it's not needed anymore.
*/
fi_freeinfo(providers);
providers = NULL;
/**
* Get our address and publish it with modex.
*/
namelen = sizeof(ep_name);
ret = fi_getname((fid_t)ompi_mtl_ofi.ep, &ep_name, &namelen);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getname failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
OPAL_MODEX_SEND(ret, PMIX_SYNC_REQD, PMIX_GLOBAL,
&mca_mtl_ofi_component.super.mtl_version,
&ep_name, namelen);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: opal_modex_send failed: %d\n",
__FILE__, __LINE__, ret);
goto error;
}
ompi_mtl_ofi.epnamelen = namelen;
/**
* Insert the ANY_SRC address.
*/
null_addr = malloc(namelen);
if (!null_addr) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: malloc failed\n", __FILE__, __LINE__);
goto error;
}
memset(null_addr, 0, namelen);
ret = fi_av_insert(ompi_mtl_ofi.av,
null_addr,
1,
&ompi_mtl_ofi.any_addr,
0ULL,
NULL);
if (1 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_av_insert failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
free(null_addr);
null_addr = NULL;
/**
* Activate progress callback.
*/
ret = opal_progress_register(ompi_mtl_ofi_progress);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: opal_progress_register failed: %d\n",
__FILE__, __LINE__, ret);
goto error;
}
return &ompi_mtl_ofi.base;
error:
if (null_addr) {
free(null_addr);
}
if (providers) {
(void) fi_freeinfo(providers);
}
if (ompi_mtl_ofi.av) {
(void) fi_close((fid_t)ompi_mtl_ofi.av);
}
if (ompi_mtl_ofi.cq) {
(void) fi_close((fid_t)ompi_mtl_ofi.cq);
}
if (ompi_mtl_ofi.mr) {
(void) fi_close((fid_t)ompi_mtl_ofi.mr);
}
if (ompi_mtl_ofi.ep) {
(void) fi_close((fid_t)ompi_mtl_ofi.ep);
}
if (ompi_mtl_ofi.domain) {
(void) fi_close((fid_t)ompi_mtl_ofi.domain);
}
if (ompi_mtl_ofi.fabric) {
(void) fi_close((fid_t)ompi_mtl_ofi.fabric);
}
return NULL;
}
int
ompi_mtl_ofi_get_error(int error_num)
{
int ret;
switch (error_num) {
case 0:
ret = OMPI_SUCCESS;
break;
default:
ret = OMPI_ERROR;
}
return ret;
}
int
ompi_mtl_ofi_progress(void)
{
int ret, count = 0;
struct fi_cq_tagged_entry wc;
struct fi_cq_err_entry error;
ompi_mtl_ofi_request_t *ofi_req = NULL;
/**
* Read the work completions from the CQ.
* From the completion's op_context, we get the associated OFI request.
* Call the request's callback.
*/
while (true) {
memset(&wc, 0, sizeof(wc));
ret = fi_cq_read(ompi_mtl_ofi.cq, (void *)&wc, 1);
if (ret > 0) {
count++;
if (NULL != wc.op_context) {
ofi_req = TO_OFI_REQ(wc.op_context);
assert(ofi_req);
ret = ofi_req->event_callback(&wc, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request event callback: %d",
ret);
abort();
}
}
} else if (ret == -FI_EAVAIL) {
/**
* An error occured and is being reported via the CQ.
* Read the error and forward it to the upper layer.
*/
memset(&error, 0, sizeof(error));
ret = fi_cq_readerr(ompi_mtl_ofi.cq,
&error,
0);
if (ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned from fi_cq_readerr: %d", ret);
}
assert(error.op_context);
ofi_req = TO_OFI_REQ(error.op_context);
assert(ofi_req);
ret = ofi_req->error_callback(&error, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request error callback: %d",
ret);
abort();
}
} else {
/**
* The CQ is empty. Return.
*/
break;
}
}
return count;
}