/*
 * Copyright (c) 2014      Mellanox Technologies, Inc.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "oshmem_config.h"

#include <errno.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif /* HAVE_SYS_MMAN_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif /* HAVE_NETDB_H */
#ifdef HAVE_TIME_H
#include <time.h>
#endif /* HAVE_TIME_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */

#include "opal/constants.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/show_help.h"

#include "oshmem/mca/sshmem/sshmem.h"
#include "oshmem/mca/sshmem/base/base.h"

#include "sshmem_verbs.h"

/* Single verbs device state shared by create/attach/detach in this file. */
static openib_device_t memheap_device;

/* ////////////////////////////////////////////////////////////////////////// */
/* local functions */
static int
module_init(void);

static int
segment_create(map_segment_t *ds_buf,
               const char *file_name,
               size_t size);

static int
ds_copy(const map_segment_t *from,
        map_segment_t *to);

static void *
segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey);

static int
segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey);

static int
segment_unlink(map_segment_t *ds_buf);

static int
module_finalize(void);

/*
 * verbs shmem module
 */
mca_sshmem_verbs_module_t mca_sshmem_verbs_module = {
    /* super */
    {
        module_init,
        segment_create,
        ds_copy,
        segment_attach,
        segment_detach,
        segment_unlink,
        module_finalize
    }
};

/* ////////////////////////////////////////////////////////////////////////// */
/* private utility functions */
/* ////////////////////////////////////////////////////////////////////////// */

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * Completely resets the contents of *ds_buf: flags, id, base/end addresses,
 * size, type and name are all returned to their "unset" values.
 *
 * @param ds_buf  segment descriptor to reset (must not be NULL).
 */
static inline void
shmem_ds_reset(map_segment_t *ds_buf)
{
    /* trace the state being discarded before it is overwritten */
    OPAL_OUTPUT_VERBOSE(
        (70, oshmem_sshmem_base_framework.framework_output,
         "%s: %s: shmem_ds_resetting "
         "(id: %d, size: %lu, name: %s)\n",
         mca_sshmem_verbs_component.super.base_version.mca_type_name,
         mca_sshmem_verbs_component.super.base_version.mca_component_name,
         ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
    );

    MAP_SEGMENT_RESET_FLAGS(ds_buf);
    ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
    ds_buf->seg_base_addr = 0;
    ds_buf->end = 0;
    ds_buf->seg_size = 0;
    ds_buf->type = MAP_SEGMENT_UNKNOWN;
    memset(ds_buf->seg_name, '\0', sizeof(ds_buf->seg_name));
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * Module initialisation hook.
 *
 * @return OSHMEM_SUCCESS always.
 */
static int
module_init(void)
{
    /* nothing to do */
    return OSHMEM_SUCCESS;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * Module finalisation hook.
 *
 * @return OSHMEM_SUCCESS always.
 */
static int
module_finalize(void)
{
    /* nothing to do */
    return OSHMEM_SUCCESS;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * Copies one segment descriptor over another (plain byte copy).
 *
 * @param from  source descriptor.
 * @param to    destination descriptor, fully overwritten.
 * @return OSHMEM_SUCCESS always.
 */
static int
ds_copy(const map_segment_t *from,
        map_segment_t *to)
{
    memcpy(to, from, sizeof(map_segment_t));

    OPAL_OUTPUT_VERBOSE(
        (70, oshmem_sshmem_base_framework.framework_output,
         "%s: %s: ds_copy complete "
         "from: (id: %d, size: %lu, "
         "name: %s flags: 0x%02x) "
         "to: (id: %d, size: %lu, "
         "name: %s flags: 0x%02x)\n",
         mca_sshmem_verbs_component.super.base_version.mca_type_name,
         mca_sshmem_verbs_component.super.base_version.mca_component_name,
         from->seg_id, (unsigned long)from->seg_size, from->seg_name,
         from->flags, to->seg_id, (unsigned long)to->seg_size, to->seg_name,
         to->flags)
    );

    return OSHMEM_SUCCESS;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * Opens the verbs device, allocates a protection domain and registers a
 * memory region of @p size bytes, then describes it in @p ds_buf.
 *
 * The device is the one named by mca_sshmem_verbs_component.hca_name, or the
 * first entry of the device list when no name is configured.
 *
 * On any failure every resource acquired so far (memory regions, value
 * array, protection domain, device context, device list) is released and
 * memheap_device is zeroed again, so nothing is leaked and a later
 * segment_detach() finds nothing to tear down.
 *
 * @param ds_buf     segment descriptor to fill (must not be NULL); it is
 *                   reset first and stays reset on failure.
 * @param file_name  unused by this module.
 * @param size       number of bytes to register.
 *
 * @return OSHMEM_SUCCESS, OSHMEM_ERR_NOT_SUPPORTED (no device list or no
 *         devices), OSHMEM_ERR_NOT_FOUND (no matching device),
 *         OSHMEM_ERR_RESOURCE_BUSY (open/query/PD allocation failed) or
 *         OSHMEM_ERR_OUT_OF_RESOURCE (memory registration failed).
 */
static int
segment_create(map_segment_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OSHMEM_SUCCESS;
    openib_device_t *device = &memheap_device;
    void *addr = NULL;
    struct ibv_mr *ib_mr = NULL;
    uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE |
                           IBV_ACCESS_REMOTE_WRITE |
                           IBV_ACCESS_REMOTE_READ;
    uint64_t exp_access_flag = 0;
    int num_devs = 0;
    int i = 0;

    (void)file_name;

    assert(ds_buf);

    /* init the contents of map_segment_t */
    shmem_ds_reset(ds_buf);

    memset(device, 0, sizeof(*device));

#ifdef HAVE_IBV_GET_DEVICE_LIST
    device->ib_devs = ibv_get_device_list(&num_devs);
#else
#error unsupported ibv_get_device_list in infiniband/verbs.h
#endif

    if (NULL == device->ib_devs) {
        return OSHMEM_ERR_NOT_SUPPORTED;
    }
    if (0 == num_devs) {
        /* a list was returned but it is empty: it still has to be freed */
        rc = OSHMEM_ERR_NOT_SUPPORTED;
        goto err_free_list;
    }

    /* Open device */
    if (NULL != mca_sshmem_verbs_component.hca_name) {
        for (i = 0; i < num_devs; i++) {
            if (0 == strcmp(mca_sshmem_verbs_component.hca_name,
                            ibv_get_device_name(device->ib_devs[i]))) {
                device->ib_dev = device->ib_devs[i];
                break;
            }
        }
    } else {
        device->ib_dev = device->ib_devs[0];
    }

    if (NULL == device->ib_dev) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
             "error getting device says %d: %s",
             errno, strerror(errno))
        );
        rc = OSHMEM_ERR_NOT_FOUND;
        goto err_free_list;
    }

    if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
             "error obtaining device context for %s errno says %d: %s",
             ibv_get_device_name(device->ib_dev), errno, strerror(errno))
        );
        rc = OSHMEM_ERR_RESOURCE_BUSY;
        goto err_free_list;
    }

    /* Obtain device attributes */
    if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
             "error obtaining device attributes for %s errno says %d: %s",
             ibv_get_device_name(device->ib_dev), errno, strerror(errno))
        );
        rc = OSHMEM_ERR_RESOURCE_BUSY;
        goto err_close_device;
    }

    /* Allocate the protection domain for the device */
    device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
    if (NULL == device->ib_pd) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
             "error allocating protection domain for %s errno says %d: %s",
             ibv_get_device_name(device->ib_dev), errno, strerror(errno))
        );
        rc = OSHMEM_ERR_RESOURCE_BUSY;
        goto err_close_device;
    }

    /* Allocate memory */
    OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t);
    opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *));

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
    exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR |
                      IBV_EXP_ACCESS_SHARED_MR_USER_READ |
                      IBV_EXP_ACCESS_SHARED_MR_USER_WRITE;
#endif /* MPAGE_ENABLE */

    {
        /* addr is still NULL here, exactly as in the original request */
        struct ibv_exp_reg_mr_in in = {device->ib_pd, addr, size, access_flag|exp_access_flag, 0};
        ib_mr = ibv_exp_reg_mr(&in);
    }
    if (NULL == ib_mr) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
             "error to ibv_exp_reg_mr() %llu bytes errno says %d: %s",
             (unsigned long long)size, errno, strerror(errno))
        );
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto err_release_mrs;
    }
    device->ib_mr_shared = ib_mr;
    opal_value_array_append_item(&device->ib_mr_array, &ib_mr);

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
    if (mca_sshmem_verbs_component.has_shared_mr) {
        struct ibv_exp_reg_shared_mr_in shared_in;

        access_flag = IBV_ACCESS_LOCAL_WRITE |
                      IBV_ACCESS_REMOTE_WRITE |
                      IBV_ACCESS_REMOTE_READ|
                      IBV_EXP_ACCESS_NO_RDMA;

        addr = (void *)mca_sshmem_base_start_address;
        mca_sshmem_verbs_fill_shared_mr(&shared_in, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag);
        ib_mr = ibv_exp_reg_shared_mr(&shared_in);
        if (NULL == ib_mr) {
            OPAL_OUTPUT_VERBOSE(
                (5, oshmem_sshmem_base_framework.framework_output,
                 "error to ibv_reg_shared_mr() %llu bytes errno says %d: %s has_shared_mr: %d",
                 (unsigned long long)size, errno, strerror(errno),
                 mca_sshmem_verbs_component.has_shared_mr
                 )
            );
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto err_release_mrs;
        }
        opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
    }
#endif /* MPAGE_ENABLE */

    OPAL_OUTPUT_VERBOSE(
        (70, oshmem_sshmem_base_framework.framework_output,
         "ibv device %s shared_mr: %d",
         ibv_get_device_name(device->ib_dev),
         mca_sshmem_verbs_component.has_shared_mr)
    );

    if (mca_sshmem_verbs_component.has_shared_mr) {
        assert(size == device->ib_mr_shared->length);
        ds_buf->type = MAP_SEGMENT_ALLOC_IBV;
        ds_buf->seg_id = device->ib_mr_shared->handle;
    } else {
        ds_buf->type = MAP_SEGMENT_ALLOC_IBV_NOSHMR;
        ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
    }
    /* ib_mr is the most recently registered region */
    ds_buf->seg_base_addr = ib_mr->addr;
    ds_buf->seg_size = size;
    ds_buf->end = (void*)((uintptr_t)ds_buf->seg_base_addr + ds_buf->seg_size);
    goto out;

    /* Failure unwinding: each label releases one more acquired resource. */
err_release_mrs:
    while (opal_value_array_get_size(&device->ib_mr_array) > 0) {
        struct ibv_mr **mrs =
            OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *);

        /* best effort: the original error code is what gets reported */
        (void)ibv_dereg_mr(mrs[0]);
        opal_value_array_remove_item(&device->ib_mr_array, 0);
    }
    OBJ_DESTRUCT(&device->ib_mr_array);
    (void)ibv_dealloc_pd(device->ib_pd);
err_close_device:
    (void)ibv_close_device(device->ib_dev_context);
err_free_list:
    ibv_free_device_list(device->ib_devs);
    /* leave no dangling pointers behind for a later detach */
    memset(device, 0, sizeof(*device));

out:
    OPAL_OUTPUT_VERBOSE(
        (70, oshmem_sshmem_base_framework.framework_output,
         "%s: %s: create %s "
         "(id: %d, addr: %p size: %lu, name: %s)\n",
         mca_sshmem_verbs_component.super.base_version.mca_type_name,
         mca_sshmem_verbs_component.super.base_version.mca_component_name,
         (rc ? "failure" : "successful"),
         ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
    );

    return rc;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * segment_attach can only be called after a successful call to segment_create
 *
 * Registers the shared memory region identified by mkey->u.key at an address
 * derived from mca_sshmem_base_start_address, the configured
 * mr_interleave_factor (in GiB) and a per-process attach counter.
 *
 * @param ds_buf  segment descriptor (must not be NULL); only read for tracing.
 * @param mkey    key describing the region; va_base must be 0 on entry and is
 *                set to the mapped address, or to (void *)-1 on failure.
 *
 * @return mkey->va_base: unchanged when the key is MAP_SEGMENT_SHM_INVALID,
 *         the mapped address on success, (void *)-1 when registration fails.
 */
static void *
segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
{
    openib_device_t *device = &memheap_device;
    static int mr_count = 0;
    void *addr = NULL;
    struct ibv_mr *ib_mr = NULL;
    uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE |
                           IBV_ACCESS_REMOTE_WRITE |
                           IBV_ACCESS_REMOTE_READ |
                           IBV_EXP_ACCESS_NO_RDMA;
    struct ibv_exp_reg_shared_mr_in in;

    assert(ds_buf);
    assert(mkey->va_base == 0);

    if (MAP_SEGMENT_SHM_INVALID == (int)(mkey->u.key)) {
        return (mkey->va_base);
    }

    /* workaround mtt problem - request aligned addresses */
    ++mr_count;
    addr = (void *)((uintptr_t)mca_sshmem_base_start_address +
                    mca_sshmem_verbs_component.mr_interleave_factor * 1024ULL * 1024ULL * 1024ULL * mr_count);

    mca_sshmem_verbs_fill_shared_mr(&in, device->ib_pd, mkey->u.key, addr, access_flag);
    ib_mr = ibv_exp_reg_shared_mr(&in);
    if (NULL == ib_mr) {
        mkey->va_base = (void *)-1;
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
             "error to ibv_reg_shared_mr() %llu bytes errno says %d: %s",
             (unsigned long long)ds_buf->seg_size, errno, strerror(errno))
        );
        /* do not fall through to the "attach successful" trace */
        return (mkey->va_base);
    }

    if (ib_mr->addr != addr) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
             "Failed to map shared "
             "region to address %p got addr %p. Try to increase 'memheap_mr_interleave_factor' from %d",
             addr, ib_mr->addr, mca_sshmem_verbs_component.mr_interleave_factor)
        );
    }
    /* remember the region so segment_detach() can deregister it */
    opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
    mkey->va_base = ib_mr->addr;

    OPAL_OUTPUT_VERBOSE(
        (70, oshmem_sshmem_base_framework.framework_output,
         "%s: %s: attach successful "
         "(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n",
         mca_sshmem_verbs_component.super.base_version.mca_type_name,
         mca_sshmem_verbs_component.super.base_version.mca_component_name,
         ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name,
         mkey->va_base, mkey->len, (unsigned long long)mkey->u.key)
    );

    /* update returned base pointer with an offset that hides our stuff */
    return (mkey->va_base);
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * Tears down everything held in memheap_device: deregisters every recorded
 * memory region, then releases the protection domain, the device context and
 * the device list. Each later stage is skipped once an earlier one failed.
 * The descriptor is reset regardless of the outcome.
 *
 * @param ds_buf  segment descriptor (must not be NULL); reset on return.
 * @param mkey    not used by this function.
 *
 * @return OSHMEM_SUCCESS, or OSHMEM_ERROR if a verbs release call failed.
 */
static int
segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
{
    int rc = OSHMEM_SUCCESS;
    openib_device_t *device = &memheap_device;

    (void)mkey;

    assert(ds_buf);

    OPAL_OUTPUT_VERBOSE(
        (70, oshmem_sshmem_base_framework.framework_output,
         "%s: %s: detaching "
         "(id: %d, addr: %p size: %lu, name: %s)\n",
         mca_sshmem_verbs_component.super.base_version.mca_type_name,
         mca_sshmem_verbs_component.super.base_version.mca_component_name,
         ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
    );

    if (opal_value_array_get_size(&device->ib_mr_array)) {
        struct ibv_mr** array;
        struct ibv_mr* ib_mr = NULL;

        array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *);
        /* always take the head; removal shifts the next entry into slot 0 */
        while (opal_value_array_get_size(&device->ib_mr_array) > 0) {
            ib_mr = array[0];
            if(ibv_dereg_mr(ib_mr)) {
                OPAL_OUTPUT_VERBOSE(
                    (5, oshmem_sshmem_base_framework.framework_output,
                     "error ibv_dereg_mr(): %d: %s",
                     errno, strerror(errno))
                );
                rc = OSHMEM_ERROR;
            }
            opal_value_array_remove_item(&device->ib_mr_array, 0);
        }

        if (!rc && device->ib_mr_shared) {
            device->ib_mr_shared = NULL;
        }
        OBJ_DESTRUCT(&device->ib_mr_array);
    }

    if (!rc && device->ib_pd) {
        if (ibv_dealloc_pd(device->ib_pd)) {
            OPAL_OUTPUT_VERBOSE(
                (5, oshmem_sshmem_base_framework.framework_output,
                 "error ibv_dealloc_pd(): %d: %s",
                 errno, strerror(errno))
            );
            rc = OSHMEM_ERROR;
        } else {
            device->ib_pd = NULL;
        }
    }

    if(!rc && device->ib_dev_context) {
        if(ibv_close_device(device->ib_dev_context)) {
            OPAL_OUTPUT_VERBOSE(
                (5, oshmem_sshmem_base_framework.framework_output,
                 "error ibv_close_device(): %d: %s",
                 errno, strerror(errno))
            );
            rc = OSHMEM_ERROR;
        } else {
            device->ib_dev_context = NULL;
        }
    }

    if(!rc && device->ib_devs) {
        ibv_free_device_list(device->ib_devs);
        device->ib_devs = NULL;
    }

    /* reset the contents of the map_segment_t associated with this
     * shared memory segment.
     */
    shmem_ds_reset(ds_buf);

    return rc;
}

/* ////////////////////////////////////////////////////////////////////////// */
/**
 * Marks the segment as unlinked without discarding its size or name.
 *
 * @param ds_buf  segment descriptor to invalidate.
 * @return OSHMEM_SUCCESS always.
 */
static int
segment_unlink(map_segment_t *ds_buf)
{
    /* no verbs resources are released here; only the descriptor changes */

    OPAL_OUTPUT_VERBOSE(
        (70, oshmem_sshmem_base_framework.framework_output,
         "%s: %s: unlinking "
         "(id: %d, addr: %p size: %lu, name: %s)\n",
         mca_sshmem_verbs_component.super.base_version.mca_type_name,
         mca_sshmem_verbs_component.super.base_version.mca_component_name,
         ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
    );

    /* don't completely reset.  in particular, only reset
     * the id and flip the invalid bit.  size and name values will remain valid
     * across unlinks. other information stored in flags will remain untouched.
     */
    ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
    /* note: this is only changing the valid bit to 0. */
    MAP_SEGMENT_INVALIDATE(ds_buf);

    return OSHMEM_SUCCESS;
}