1
1
openmpi/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c
2014-10-13 15:24:12 +03:00

522 строки
17 KiB
C

/*
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "oshmem_config.h"
#include <errno.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif /* HAVE_SYS_MMAN_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif /* HAVE_NETDB_H */
#ifdef HAVE_TIME_H
#include <time.h>
#endif /* HAVE_NETDB_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#include "opal/constants.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/show_help.h"
#include "oshmem/mca/sshmem/sshmem.h"
#include "oshmem/mca/sshmem/base/base.h"
#include "sshmem_verbs.h"
static openib_device_t memheap_device;
/* ////////////////////////////////////////////////////////////////////////// */
/*local functions */
/* local functions */
static int
module_init(void);
static int
segment_create(map_segment_t *ds_buf,
const char *file_name,
size_t size);
static int
ds_copy(const map_segment_t *from,
map_segment_t *to);
static void *
segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey);
static int
segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey);
static int
segment_unlink(map_segment_t *ds_buf);
static int
module_finalize(void);
/*
* mmap shmem module
*/
mca_sshmem_verbs_module_t mca_sshmem_verbs_module = {
/* super */
{
module_init,
segment_create,
ds_copy,
segment_attach,
segment_detach,
segment_unlink,
module_finalize
}
};
/* ////////////////////////////////////////////////////////////////////////// */
/* private utility functions */
/* ////////////////////////////////////////////////////////////////////////// */
/* ////////////////////////////////////////////////////////////////////////// */
/**
* completely resets the contents of *ds_buf
*/
static inline void
shmem_ds_reset(map_segment_t *ds_buf)
{
OPAL_OUTPUT_VERBOSE(
(70, oshmem_sshmem_base_framework.framework_output,
"%s: %s: shmem_ds_resetting "
"(id: %d, size: %lu, name: %s)\n",
mca_sshmem_verbs_component.super.base_version.mca_type_name,
mca_sshmem_verbs_component.super.base_version.mca_component_name,
ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
);
MAP_SEGMENT_RESET_FLAGS(ds_buf);
ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
ds_buf->seg_base_addr = 0;
ds_buf->end = 0;
ds_buf->seg_size = 0;
ds_buf->type = MAP_SEGMENT_UNKNOWN;
memset(ds_buf->seg_name, '\0', sizeof(ds_buf->seg_name));
}
/* ////////////////////////////////////////////////////////////////////////// */
static int
module_init(void)
{
/* nothing to do */
return OSHMEM_SUCCESS;
}
/* ////////////////////////////////////////////////////////////////////////// */
static int
module_finalize(void)
{
/* nothing to do */
return OSHMEM_SUCCESS;
}
/* ////////////////////////////////////////////////////////////////////////// */
static int
ds_copy(const map_segment_t *from,
map_segment_t *to)
{
memcpy(to, from, sizeof(map_segment_t));
OPAL_OUTPUT_VERBOSE(
(70, oshmem_sshmem_base_framework.framework_output,
"%s: %s: ds_copy complete "
"from: (id: %d, size: %lu, "
"name: %s flags: 0x%02x) "
"to: (id: %d, size: %lu, "
"name: %s flags: 0x%02x)\n",
mca_sshmem_verbs_component.super.base_version.mca_type_name,
mca_sshmem_verbs_component.super.base_version.mca_component_name,
from->seg_id, (unsigned long)from->seg_size, from->seg_name,
from->flags, to->seg_id, (unsigned long)to->seg_size, to->seg_name,
to->flags)
);
return OSHMEM_SUCCESS;
}
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_create(map_segment_t *ds_buf,
const char *file_name,
size_t size)
{
int rc = OSHMEM_SUCCESS;
void *addr = NULL;
openib_device_t *device = &memheap_device;
int num_devs = 0;
int i = 0;
assert(ds_buf);
/* init the contents of map_segment_t */
shmem_ds_reset(ds_buf);
memset(device, 0, sizeof(*device));
#ifdef HAVE_IBV_GET_DEVICE_LIST
device->ib_devs = ibv_get_device_list(&num_devs);
#else
#error unsupported ibv_get_device_list in infiniband/verbs.h
#endif
if (num_devs == 0 || !device->ib_devs) {
return OSHMEM_ERR_NOT_SUPPORTED;
}
/* Open device */
if (NULL != mca_sshmem_verbs_component.hca_name) {
for (i = 0; i < num_devs; i++) {
if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) {
device->ib_dev = device->ib_devs[i];
break;
}
}
} else {
device->ib_dev = device->ib_devs[0];
}
if (NULL == device->ib_dev) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error getting device says %d: %s",
errno, strerror(errno))
);
return OSHMEM_ERR_NOT_FOUND;
}
if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error obtaining device context for %s errno says %d: %s",
ibv_get_device_name(device->ib_dev), errno, strerror(errno))
);
return OSHMEM_ERR_RESOURCE_BUSY;
}
/* Obtain device attributes */
if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error obtaining device attributes for %s errno says %d: %s",
ibv_get_device_name(device->ib_dev), errno, strerror(errno))
);
return OSHMEM_ERR_RESOURCE_BUSY;
}
/* Allocate the protection domain for the device */
device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
if (NULL == device->ib_pd) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error allocating protection domain for %s errno says %d: %s",
ibv_get_device_name(device->ib_dev), errno, strerror(errno))
);
return OSHMEM_ERR_RESOURCE_BUSY;
}
/* Allocate memory */
if (!rc) {
void *addr = NULL;
struct ibv_mr *ib_mr = NULL;
uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_READ;
uint64_t exp_access_flag = 0;
OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t);
opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *));
#if (MPAGE_ENABLE > 0)
exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR |
IBV_EXP_ACCESS_SHARED_MR_USER_READ |
IBV_EXP_ACCESS_SHARED_MR_USER_WRITE;
#endif /* MPAGE_ENABLE */
struct ibv_exp_reg_mr_in in = {device->ib_pd, addr, size, access_flag|exp_access_flag, 0};
#if MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS
if (0 == mca_sshmem_verbs_component.has_shared_mr) {
in.addr = (void *)mca_sshmem_base_start_address;
in.comp_mask = IBV_EXP_REG_MR_CREATE_FLAGS;
in.create_flags = IBV_EXP_REG_MR_CREATE_CONTIG;
in.exp_access = access_flag;
}
#endif
ib_mr = ibv_exp_reg_mr(&in);
if (NULL == ib_mr) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error to ibv_exp_reg_mr() %llu bytes errno says %d: %s",
(unsigned long long)size, errno, strerror(errno))
);
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
} else {
device->ib_mr_shared = ib_mr;
opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
}
#if (MPAGE_ENABLE > 0)
if (!rc && mca_sshmem_verbs_component.has_shared_mr) {
access_flag = IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_READ|
IBV_EXP_ACCESS_NO_RDMA;
addr = (void *)mca_sshmem_base_start_address;
struct ibv_exp_reg_shared_mr_in in;
mca_sshmem_verbs_fill_shared_mr(&in, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag);
ib_mr = ibv_exp_reg_shared_mr(&in);
if (NULL == ib_mr) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error to ibv_reg_shared_mr() %llu bytes errno says %d: %s has_shared_mr: %d",
(unsigned long long)size, errno, strerror(errno),
mca_sshmem_verbs_component.has_shared_mr
)
);
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
} else {
opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
}
}
#endif /* MPAGE_ENABLE */
if (!rc) {
OPAL_OUTPUT_VERBOSE(
(70, oshmem_sshmem_base_framework.framework_output,
"ibv device %s shared_mr: %d",
ibv_get_device_name(device->ib_dev),
mca_sshmem_verbs_component.has_shared_mr)
);
if (mca_sshmem_verbs_component.has_shared_mr) {
assert(size == device->ib_mr_shared->length);
ds_buf->type = MAP_SEGMENT_ALLOC_IBV;
ds_buf->seg_id = device->ib_mr_shared->handle;
} else {
ds_buf->type = MAP_SEGMENT_ALLOC_IBV_NOSHMR;
ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
}
ds_buf->seg_base_addr = ib_mr->addr;
ds_buf->seg_size = size;
ds_buf->end = (void*)((uintptr_t)ds_buf->seg_base_addr + ds_buf->seg_size);
}
}
OPAL_OUTPUT_VERBOSE(
(70, oshmem_sshmem_base_framework.framework_output,
"%s: %s: create %s "
"(id: %d, addr: %p size: %lu, name: %s)\n",
mca_sshmem_verbs_component.super.base_version.mca_type_name,
mca_sshmem_verbs_component.super.base_version.mca_component_name,
(rc ? "failure" : "successful"),
ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
);
return rc;
}
/* ////////////////////////////////////////////////////////////////////////// */
/**
* segment_attach can only be called after a successful call to segment_create
*/
static void *
segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
{
openib_device_t *device = &memheap_device;
static int mr_count = 0;
void *addr = NULL;
assert(ds_buf);
assert(mkey->va_base == 0);
if (MAP_SEGMENT_SHM_INVALID == (int)(mkey->u.key)) {
return (mkey->va_base);
}
/* workaround mtt problem - request aligned addresses */
++mr_count;
addr = (void *)((uintptr_t)mca_sshmem_base_start_address +
mca_sshmem_verbs_component.mr_interleave_factor * 1024ULL * 1024ULL * 1024ULL * mr_count);
{
struct ibv_mr *ib_mr = NULL;
uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_READ |
IBV_EXP_ACCESS_NO_RDMA;
struct ibv_exp_reg_shared_mr_in in;
mca_sshmem_verbs_fill_shared_mr(&in, device->ib_pd, mkey->u.key, addr, access_flag);
ib_mr = ibv_exp_reg_shared_mr(&in);
if (NULL == ib_mr) {
mkey->va_base = (void *)-1;
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error to ibv_reg_shared_mr() %llu bytes errno says %d: %s",
(unsigned long long)ds_buf->seg_size, errno, strerror(errno))
);
} else {
if (ib_mr->addr != addr) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"Failed to map shared region to address %p got addr %p. Try to increase 'memheap_mr_interleave_factor' from %d",
addr, ib_mr->addr, mca_sshmem_verbs_component.mr_interleave_factor)
);
}
opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
mkey->va_base = ib_mr->addr;
}
}
OPAL_OUTPUT_VERBOSE(
(70, oshmem_sshmem_base_framework.framework_output,
"%s: %s: attach successful "
"(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n",
mca_sshmem_verbs_component.super.base_version.mca_type_name,
mca_sshmem_verbs_component.super.base_version.mca_component_name,
ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name,
mkey->va_base, mkey->len, (unsigned long long)mkey->u.key)
);
/* update returned base pointer with an offset that hides our stuff */
return (mkey->va_base);
}
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
{
int rc = OSHMEM_SUCCESS;
openib_device_t *device = &memheap_device;
int i;
assert(ds_buf);
OPAL_OUTPUT_VERBOSE(
(70, oshmem_sshmem_base_framework.framework_output,
"%s: %s: detaching "
"(id: %d, addr: %p size: %lu, name: %s)\n",
mca_sshmem_verbs_component.super.base_version.mca_type_name,
mca_sshmem_verbs_component.super.base_version.mca_component_name,
ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
);
if (device) {
if (0 < (i = opal_value_array_get_size(&device->ib_mr_array))) {
struct ibv_mr** array;
struct ibv_mr* ib_mr = NULL;
array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *);
for (i--;i >= 0; i--) {
ib_mr = array[i];
if(ibv_dereg_mr(ib_mr)) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error ibv_dereg_mr(): %d: %s",
errno, strerror(errno))
);
rc = OSHMEM_ERROR;
}
opal_value_array_remove_item(&device->ib_mr_array, i);
}
if (!rc && device->ib_mr_shared) {
device->ib_mr_shared = NULL;
}
OBJ_DESTRUCT(&device->ib_mr_array);
}
if (!rc && device->ib_pd) {
if (ibv_dealloc_pd(device->ib_pd)) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error ibv_dealloc_pd(): %d: %s",
errno, strerror(errno))
);
rc = OSHMEM_ERROR;
} else {
device->ib_pd = NULL;
}
}
if(!rc && device->ib_dev_context) {
if(ibv_close_device(device->ib_dev_context)) {
OPAL_OUTPUT_VERBOSE(
(5, oshmem_sshmem_base_framework.framework_output,
"error ibv_close_device(): %d: %s",
errno, strerror(errno))
);
rc = OSHMEM_ERROR;
} else {
device->ib_dev_context = NULL;
}
}
if(!rc && device->ib_devs) {
ibv_free_device_list(device->ib_devs);
device->ib_devs = NULL;
}
}
/* reset the contents of the map_segment_t associated with this
* shared memory segment.
*/
shmem_ds_reset(ds_buf);
return rc;
}
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_unlink(map_segment_t *ds_buf)
{
/* not much unlink work needed for sysv */
OPAL_OUTPUT_VERBOSE(
(70, oshmem_sshmem_base_framework.framework_output,
"%s: %s: unlinking "
"(id: %d, addr: %p size: %lu, name: %s)\n",
mca_sshmem_verbs_component.super.base_version.mca_type_name,
mca_sshmem_verbs_component.super.base_version.mca_component_name,
ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
);
/* don't completely reset. in particular, only reset
* the id and flip the invalid bit. size and name values will remain valid
* across unlinks. other information stored in flags will remain untouched.
*/
ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
/* note: this is only changing the valid bit to 0. */
MAP_SEGMENT_INVALIDATE(ds_buf);
return OSHMEM_SUCCESS;
}