1
1
openmpi/ompi/mca/mtl/ofi/mtl_ofi_component.c
2015-01-05 17:01:37 -08:00

487 строки
17 KiB
C

/*
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
*
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/mca/event/event.h"
#include "opal/util/output.h"
#include "opal/mca/pmix/pmix.h"
#include "mtl_ofi.h"
#include "mtl_ofi_types.h"
#include "mtl_ofi_request.h"
#include "mtl_ofi_message.h"
static int ompi_mtl_ofi_component_open(void);
static int ompi_mtl_ofi_component_close(void);
static int ompi_mtl_ofi_component_register(void);
static mca_mtl_base_module_t*
ompi_mtl_ofi_component_init(bool enable_progress_threads,
bool enable_mpi_threads);
mca_mtl_ofi_component_t mca_mtl_ofi_component = {
{
/* First, the mca_base_component_t struct containing meta
* information about the component itself */
{
MCA_MTL_BASE_VERSION_2_0_0,
"ofi", /* MCA component name */
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
ompi_mtl_ofi_component_open, /* component open */
ompi_mtl_ofi_component_close, /* component close */
NULL,
ompi_mtl_ofi_component_register
},
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
ompi_mtl_ofi_component_init, /* component init */
}
};
static int
ompi_mtl_ofi_component_register(void)
{
return OMPI_SUCCESS;
}
static int
ompi_mtl_ofi_component_open(void)
{
ompi_mtl_ofi.base.mtl_request_size =
sizeof(ompi_mtl_ofi_request_t) - sizeof(struct mca_mtl_request_t);
OBJ_CONSTRUCT(&ompi_mtl_ofi.free_messages, opal_free_list_t);
opal_free_list_init(&ompi_mtl_ofi.free_messages,
sizeof(ompi_mtl_ofi_message_t),
OBJ_CLASS(ompi_mtl_ofi_message_t),
1, -1, 1);
ompi_mtl_ofi.domain = NULL;
ompi_mtl_ofi.av = NULL;
ompi_mtl_ofi.cq = NULL;
ompi_mtl_ofi.mr = NULL;
ompi_mtl_ofi.ep = NULL;
return OMPI_SUCCESS;
}
static int
ompi_mtl_ofi_component_close(void)
{
OBJ_DESTRUCT(&ompi_mtl_ofi.free_messages);
return OMPI_SUCCESS;
}
static mca_mtl_base_module_t*
ompi_mtl_ofi_component_init(bool enable_progress_threads,
bool enable_mpi_threads)
{
int ret, fi_version;
struct fi_info hints = {0};
struct fi_info *providers = NULL, *prov = NULL;
struct fi_domain_attr domain_attr = {0};
struct fi_tx_attr tx_attr = {0};
struct fi_cq_attr cq_attr = {0};
struct fi_av_attr av_attr = {0};
fi_addr_t ep_name = 0;
char *null_addr = NULL;
size_t namelen;
/**
* Hints to filter providers
* See man fi_getinfo for a list of all filters
* mode: Select capabilities MTL is prepared to support.
* In this case, MTL will pass in context into communication calls
* ep_type: reliable datagram operation
* caps: Capabilities required from the provider. The bits specified
* with buffered receive, cancel, and remote complete
* implements MPI semantics. Tagged is used to support tag
* matching.
* We expect to register all memory up front for use with this
* endpoint, so the MTL requires dynamic memory regions
*/
hints.mode = FI_CONTEXT;
hints.ep_type = FI_EP_RDM; /* Reliable datagram */
hints.caps = FI_TAGGED; /* Tag matching interface */
hints.caps |= FI_BUFFERED_RECV; /* Buffered receives */
hints.caps |= FI_REMOTE_COMPLETE; /* Remote completion */
hints.caps |= FI_CANCEL; /* Support cancel */
hints.caps |= FI_DYNAMIC_MR; /* Global dynamic mem region */
/**
* Refine filter for additional capabilities
* threading: Disable locking
* control_progress: enable async progress
* op_flags: Specifies default operation to set on all communication.
* In this case, we want remote completion to be set by default.
*/
domain_attr.threading = FI_THREAD_PROGRESS;
domain_attr.control_progress = FI_PROGRESS_AUTO;
tx_attr.op_flags = FI_REMOTE_COMPLETE;
hints.domain_attr = &domain_attr;
hints.tx_attr = &tx_attr;
/**
* FI_VERSION provides binary backward and forward compatibility support
* Specify the version of OFI is coded to, the provider will select struct
* layouts that are compatible with this version.
*/
fi_version = FI_VERSION(1, 0);
/**
* fi_getinfo: returns information about fabric services for reaching a
* remote node or service. this does not necessarily allocate resources.
* Pass NULL for name/service because we want a list of providers supported.
*/
ret = fi_getinfo(fi_version, /* OFI version requested */
NULL, /* Optional name or fabric to resolve */
NULL, /* Optional service name or port to request */
0ULL, /* Optional flag */
&hints, /* In: Hints to filter providers */
&providers); /* Out: List of matching providers */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getinfo failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Here we elect to use the first provider from the list.
* Further filtering could be done at this point (e.g. name).
*/
prov = providers;
/**
* Open fabric
* The getinfo struct returns a fabric attribute struct that can be used to
* instantiate the virtual or physical network. This opens a "fabric
* provider". See man fi_fabric for details.
*/
ret = fi_fabric(prov->fabric_attr, /* In: Fabric attributes */
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */
NULL); /* Optional context for fabric events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_fabric failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Create the access domain, which is the physical or virtual network or
* hardware port/collection of ports. Returns a domain object that can be
* used to create endpoints. See man fi_domain for details.
*/
ret = fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */
prov, /* In: Provider */
&ompi_mtl_ofi.domain, /* Out: Domain oject */
NULL); /* Optional context for domain events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_domain failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Create a transport level communication endpoint. To use the endpoint,
* it must be bound to completion counters or event queues and enabled,
* and the resources consumed by it, such as address vectors, counters,
* completion queues, etc.
* see man fi_endpoint for more details.
*/
ret = fi_endpoint(ompi_mtl_ofi.domain, /* In: Domain object */
prov, /* In: Provider */
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
NULL); /* Optional context */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_endpoint failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Create the objects that will be bound to the endpoint.
* The objects include:
* - completion queue for events
* - address vector of other endpoint addresses
* - dynamic memory-spanning memory region
*/
cq_attr.format = FI_CQ_FORMAT_TAGGED;
ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.cq, NULL);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_cq_open failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* The remote fi_addr will be stored in the ofi_endpoint struct.
* So, we use the AV in "map" mode.
*/
av_attr.type = FI_AV_MAP;
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_av_open failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* All OFI communication routines require at least one MR.
* This MTL only needs a single MR.
*/
ret = fi_mr_reg(ompi_mtl_ofi.domain, /* In: Domain object */
0, /* In: Lower memory address */
UINTPTR_MAX, /* In: Upper memory address */
FI_SEND | FI_RECV, /* In: Expose MR for read/write */
0ULL, /* In: base MR offset */
0ULL, /* In: requested key */
0ULL, /* In: No flags */
&ompi_mtl_ofi.mr, /* Out: memregion object */
NULL); /* Context: memregion events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_mr_reg failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Bind the CQ and AV to the endpoint object.
*/
ret = fi_bind((fid_t)ompi_mtl_ofi.ep,
(fid_t)ompi_mtl_ofi.cq,
FI_SEND | FI_RECV);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_bind CQ-EP failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
ret = fi_bind((fid_t)ompi_mtl_ofi.ep,
(fid_t)ompi_mtl_ofi.av,
0);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_bind AV-EP failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Enable the endpoint for communication
* This commits the bind operations.
*/
ret = fi_enable(ompi_mtl_ofi.ep);
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_enable failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
/**
* Free providers info since it's not needed anymore.
*/
fi_freeinfo(providers);
providers = NULL;
/**
* Get our address and publish it with modex.
*/
namelen = sizeof(ep_name);
ret = fi_getname((fid_t)ompi_mtl_ofi.ep, &ep_name, &namelen);
if (ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getname failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
OPAL_MODEX_SEND(ret, PMIX_SYNC_REQD, PMIX_GLOBAL,
&mca_mtl_ofi_component.super.mtl_version,
&ep_name, namelen);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: opal_modex_send failed: %d\n",
__FILE__, __LINE__, ret);
goto error;
}
ompi_mtl_ofi.epnamelen = namelen;
/**
* Insert the ANY_SRC address.
*/
null_addr = malloc(namelen);
if (!null_addr) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: malloc failed\n", __FILE__, __LINE__);
goto error;
}
memset(null_addr, 0, namelen);
ret = fi_av_insert(ompi_mtl_ofi.av,
null_addr,
1,
&ompi_mtl_ofi.any_addr,
0ULL,
NULL);
if (1 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_av_insert failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
goto error;
}
free(null_addr);
null_addr = NULL;
/**
* Activate progress callback.
*/
ret = opal_progress_register(ompi_mtl_ofi_progress);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: opal_progress_register failed: %d\n",
__FILE__, __LINE__, ret);
goto error;
}
return &ompi_mtl_ofi.base;
error:
if (null_addr) {
free(null_addr);
}
if (providers) {
(void) fi_freeinfo(providers);
}
if (ompi_mtl_ofi.av) {
(void) fi_close((fid_t)ompi_mtl_ofi.av);
}
if (ompi_mtl_ofi.cq) {
(void) fi_close((fid_t)ompi_mtl_ofi.cq);
}
if (ompi_mtl_ofi.mr) {
(void) fi_close((fid_t)ompi_mtl_ofi.mr);
}
if (ompi_mtl_ofi.ep) {
(void) fi_close((fid_t)ompi_mtl_ofi.ep);
}
if (ompi_mtl_ofi.domain) {
(void) fi_close((fid_t)ompi_mtl_ofi.domain);
}
if (ompi_mtl_ofi.fabric) {
(void) fi_close((fid_t)ompi_mtl_ofi.fabric);
}
return NULL;
}
int
ompi_mtl_ofi_get_error(int error_num)
{
int ret;
switch (error_num) {
case 0:
ret = OMPI_SUCCESS;
break;
default:
ret = OMPI_ERROR;
}
return ret;
}
int
ompi_mtl_ofi_progress(void)
{
int ret, count = 0;
struct fi_cq_tagged_entry wc;
struct fi_cq_err_entry error;
ompi_mtl_ofi_request_t *ofi_req = NULL;
/**
* Read the work completions from the CQ.
* From the completion's op_context, we get the associated OFI request.
* Call the request's callback.
*/
while (true) {
memset(&wc, 0, sizeof(wc));
ret = fi_cq_read(ompi_mtl_ofi.cq, (void *)&wc, 1);
if (ret > 0) {
count++;
if (NULL != wc.op_context) {
ofi_req = TO_OFI_REQ(wc.op_context);
assert(ofi_req);
ret = ofi_req->event_callback(&wc, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request event callback: %d",
ret);
abort();
}
}
} else if (ret == -FI_EAVAIL) {
/**
* An error occured and is being reported via the CQ.
* Read the error and forward it to the upper layer.
*/
memset(&error, 0, sizeof(error));
ret = fi_cq_readerr(ompi_mtl_ofi.cq,
&error,
0);
if (ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned from fi_cq_readerr: %d", ret);
}
assert(error.op_context);
ofi_req = TO_OFI_REQ(error.op_context);
assert(ofi_req);
ret = ofi_req->error_callback(&error, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request error callback: %d",
ret);
abort();
}
} else {
/**
* The CQ is empty. Return.
*/
break;
}
}
return count;
}