/*
 * Copyright (c) 2007 Los Alamos National Security, LLC.
 * All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
|
|
#include "ompi/constants.h"
|
|
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <unistd.h>
|
|
#include <errno.h>
|
|
#include <string.h>
|
|
#include <sched.h>
|
|
#include <ctype.h>
|
|
|
|
#include "opal/event/event.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/if.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "opal/mca/paffinity/paffinity.h"
|
|
#include "opal/mca/paffinity/base/base.h"
|
|
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "ompi/constants.h"
|
|
#include "ompi/datatype/convertor.h"
|
|
#include "ompi/mca/btl/btl.h"
|
|
#include "ompi/mca/btl/base/base.h"
|
|
#include "ompi/mca/btl/base/btl_base_error.h"
|
|
#include "ompi/mca/mpool/base/base.h"
|
|
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
|
#include "ompi/mca/pml/pml.h"
|
|
#include "ompi/runtime/ompi_module_exchange.h"
|
|
|
|
#include "btl_pcie.h"
|
|
#include "btl_pcie_frag.h"
|
|
#include "btl_pcie_endpoint.h"
|
|
#include "btl_pcie_ddriver.h"
|
|
|
|
|
|
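/* Memory registration hooks handed to the rdma mpool; the
   definitions appear at the bottom of this file. */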
static int pcie_reg_mr(void *reg_data, void *base, size_t size,
                       mca_mpool_base_registration_t *reg);
static int pcie_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg);

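/*
 * The single instance of the PCIe BTL component.  The function
 * pointers below are how the MCA framework opens, closes,
 * initializes, and progresses the component.
 */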
mca_btl_pcie_component_t mca_btl_pcie_component = {
    {
        /* First, the mca_base_component_t struct containing meta information
           about the component itself */
        {
            /* Indicate that we are a btl v2.0.0 component (which also
               implies a specific MCA version) */
            MCA_BTL_BASE_VERSION_2_0_0,

            "pcie", /* MCA component name */
            OMPI_MAJOR_VERSION, /* MCA component major version */
            OMPI_MINOR_VERSION, /* MCA component minor version */
            OMPI_RELEASE_VERSION, /* MCA component release version */
            mca_btl_pcie_component_open, /* component open */
            mca_btl_pcie_component_close /* component close */
        },

        /* Next the MCA v2.0.0 component meta data */
        {
            false
        },

        mca_btl_pcie_component_init,
        mca_btl_pcie_component_progress
    }
};

/*
 * Utility routines for parameter registration
 */
static char*
mca_btl_pcie_param_register_string(const char* param_name,
                                   const char* param_desc,
                                   const char* default_value)
{
    char *value;

    mca_base_param_reg_string(&mca_btl_pcie_component.super.btl_version,
                              param_name, param_desc, false, false,
                              default_value, &value);
    return value;
}

static int
mca_btl_pcie_param_register_int(const char* param_name,
                                const char* param_desc,
                                int default_value)
{
    int value;

    mca_base_param_reg_int(&mca_btl_pcie_component.super.btl_version,
                           param_name, param_desc, false, false,
                           default_value, &value);
    return value;
}

/*
 * Register the PCIe devices found in the local config file.  The MCA
 * framework will make this information available to all peers.
 */
static int
btl_pcie_modex_send(void)
{
    int rc;
    size_t size;
    unsigned int i;
    mca_btl_pcie_modex_info_t *info;

    size = mca_btl_pcie_component.pcie_num_btls *
        sizeof(mca_btl_pcie_modex_info_t);
    info = malloc(size);
    if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;

    for (i = 0 ; i < mca_btl_pcie_component.pcie_num_btls ; ++i) {
        strncpy(info[i].hostname,
                orte_process_info.nodename,
                ORTE_MAX_HOSTNAME_SIZE - 1);
        info[i].hostname[ORTE_MAX_HOSTNAME_SIZE - 1] = '\0';
        strncpy(info[i].devicename,
                mca_btl_pcie_component.pcie_btls[i].lcl_dev_name,
                OMPI_PATH_MAX - 1);
        info[i].devicename[OMPI_PATH_MAX - 1] = '\0';
        MCA_BTL_PCIE_MODEX_INFO_HTON(info[i]);
    }

    /* the modex copies the payload, so the local buffer can be freed
       as soon as the send returns */
#if (OMPI_MAJOR_VERSION <= 1) && (OMPI_MINOR_VERSION <= 2)
    rc = mca_pml_base_modex_send(&mca_btl_pcie_component.super.btl_version,
                                 info, size);
#else
    rc = ompi_modex_send(&mca_btl_pcie_component.super.btl_version,
                         info, size);
#endif
    free(info);
    return rc;
}

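/* Note: the matching modex receive happens when the peer proc and
   endpoint structures are built (presumably in the proc setup code of
   this component), which is where the hostname/devicename pairs
   published above are consumed. */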
/*
 * Called by the MCA framework to open the component; registers
 * component parameters.
 */
int
mca_btl_pcie_component_open(void)
{
    /* initialize state */
    mca_btl_pcie_component.pcie_num_btls = 0;
    mca_btl_pcie_component.pcie_btls = NULL;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_pcie_component.pcie_procs, opal_list_t);

    /* component parameters */
    mca_btl_pcie_component.pcie_free_list_num =
        mca_btl_pcie_param_register_int("free_list_num",
                                        "Initial size of free lists (must be >= 1)",
                                        16);
    /* BWB - FIX ME - The need to limit the free list max size is an
       artifact of the lack of flow control in the BTL.  Since we're
       already using bounce fragments, it should be possible to make
       this unlimited, and then properly handle the case where an SMA
       region isn't available when send is called on a given frag.
       Something similar to what Open IB does when we don't have send
       credits would work well here.  See the comment in
       btl_pcie_send() for more information. */
    mca_btl_pcie_component.pcie_free_list_max =
        mca_btl_pcie_param_register_int("free_list_max",
                                        "Max size of free lists. "
                                        "free_list_max * (first_frag_size + max_send_size) "
                                        "must be less than (SMA memory size - (recv_queue_len * 4) - 8)",
                                        32);
    mca_btl_pcie_component.pcie_free_list_inc =
        mca_btl_pcie_param_register_int("free_list_inc",
                                        "Increment size of free lists (must be >= 1)",
                                        8);

    mca_btl_pcie_component.pcie_send_mpool_name =
        mca_btl_pcie_param_register_string("send_mpool",
                                           "Name of the memory pool to be used for send messages "
                                           "(it is unlikely that you will ever want to change this)",
                                           "pcie");

    mca_btl_pcie_component.pcie_dma_mpool_name =
        mca_btl_pcie_param_register_string("dma_mpool",
                                           "Name of the memory pool to be used for RDMA messages "
                                           "(it is unlikely that you will ever want to change this)",
                                           "rdma");

    mca_btl_pcie_component.pcie_recv_queue_len =
        mca_btl_pcie_param_register_int("recv_queue_len",
                                        "Length of the receive FIFO.  Must be 4 * free_list_max",
                                        256);

    mca_btl_pcie_module.super.btl_exclusivity =
        mca_btl_pcie_param_register_int("exclusivity",
                                        "Priority of the PCIe BTL (must be > 0)",
                                        MCA_BTL_EXCLUSIVITY_DEFAULT + 1);

    mca_btl_pcie_module.super.btl_eager_limit =
        mca_btl_pcie_param_register_int("first_frag_size",
                                        "Size (in bytes) of the first fragment sent of any "
                                        "message.  It is the maximum size of \"short\" messages "
                                        "and the maximum size of the \"phase 1\" fragment sent "
                                        "for all large messages (must be >= 1)",
                                        1*1024) - sizeof(mca_btl_pcie_header_t);
    mca_btl_pcie_module.super.btl_rndv_eager_limit =
        mca_btl_pcie_param_register_int("rndv_eager_limit",
                                        "Minimum message size (in bytes) that will be striped "
                                        "across multiple network devices when using "
                                        "send/receive semantics.  Messages shorter than this "
                                        "size will be sent across a single network (must be "
                                        ">= 1)",
                                        2*1024) - sizeof(mca_btl_pcie_header_t);
    mca_btl_pcie_module.super.btl_max_send_size =
        mca_btl_pcie_param_register_int("max_send_size",
                                        "Maximum size (in bytes) of a single \"phase 2\" fragment "
                                        "of a long message when using the pipeline protocol "
                                        "(must be >= 1)",
                                        4*1024) - sizeof(mca_btl_pcie_header_t);
    mca_btl_pcie_module.super.btl_rdma_pipeline_send_length =
        mca_btl_pcie_param_register_int("rdma_pipeline_send_length",
                                        "Length of the \"phase 2\" portion of a large message (in "
                                        "bytes) when using the pipeline protocol.  This part of "
                                        "the message will be split into fragments of size "
                                        "max_send_size and sent using send/receive semantics "
                                        "(must be >= 0; only relevant when the PUT flag is "
                                        "set)",
                                        12*1024);
    mca_btl_pcie_module.super.btl_rdma_pipeline_frag_size =
        mca_btl_pcie_param_register_int("rdma_pipeline_frag_size",
                                        "Maximum size (in bytes) of a single \"phase 3\" fragment "
                                        "from a long message when using the pipeline protocol.  "
                                        "These fragments will be sent using RDMA semantics "
                                        "(must be >= 1; only relevant when the PUT flag is "
                                        "set)",
                                        2*1024*1024);
    mca_btl_pcie_module.super.btl_min_rdma_pipeline_size =
        mca_btl_pcie_param_register_int("min_rdma_pipeline_size",
                                        "Messages smaller than this size (in bytes) will not "
                                        "use the RDMA pipeline protocol.  Instead, they will be "
                                        "split into fragments of max_send_size and sent using "
                                        "send/receive semantics (must be >= 0; automatically "
                                        "adjusted up to at least "
                                        "(eager_limit + btl_rdma_pipeline_send_length); only "
                                        "relevant when the PUT flag is set)",
                                        16*1024);

    mca_btl_pcie_module.super.btl_flags =
        mca_btl_pcie_param_register_int("flags",
                                        "BTL control flags.  Defaults to (SEND|PUT|HETEROGENEOUS_RDMA)",
#ifdef MCA_BTL_FLAGS_HETEROGENEOUS_RDMA
                                        MCA_BTL_FLAGS_HETEROGENEOUS_RDMA |
#endif
                                        MCA_BTL_FLAGS_SEND |
                                        MCA_BTL_FLAGS_PUT);

    return OMPI_SUCCESS;
}

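/*
 * The parameters registered above surface as MCA parameters named
 * "btl_pcie_<name>".  For example (hypothetical values):
 *
 *   mpirun --mca btl pcie,self --mca btl_pcie_free_list_max 64 \
 *          --mca btl_pcie_first_frag_size 2048 ./a.out
 */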
int
mca_btl_pcie_component_close(void)
{
    return OMPI_SUCCESS;
}

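/*
 * Component initialization: find the CPUs this process is bound to,
 * create one BTL module per bound CPU (each backed by the PCIe device
 * that the local config file maps to that CPU), set up the rdma mpool
 * for each module, and publish the device names via the modex.
 */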
mca_btl_base_module_t**
mca_btl_pcie_component_init(int *num_btl_modules,
                            bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    cpu_set_t cpu_set;
    unsigned int i;
    int num_cpus, *cpus;
    struct stat stat_buf;
    struct mca_mpool_base_resources_t mpool_resources;
    mca_btl_base_module_t **btl_array;

    *num_btl_modules = 0;

    /* find all cpus we're bound to */
    cpus = malloc(CPU_SETSIZE * sizeof(int));
    if (NULL == cpus) return NULL;
    memset(cpus, 0, CPU_SETSIZE * sizeof(int));
    num_cpus = 0;
    CPU_ZERO(&cpu_set);

    sched_getaffinity(0, sizeof(cpu_set), &cpu_set);
    for (i = 0 ; i < CPU_SETSIZE ; ++i) {
        if (CPU_ISSET(i, &cpu_set)) cpus[num_cpus++] = i;
    }
#if defined(__PPC__)
    if (num_cpus > 1) {
        orte_show_help("help-mpi-btl-pcie.txt", "initialization:more-than-one-cpu",
                       true, num_cpus);
        free(cpus);
        return NULL;
    }
#endif /* defined(__PPC__) */
    if (0 == num_cpus) {
        orte_show_help("help-mpi-btl-pcie.txt", "initialization:no-cpus",
                       true);
        free(cpus);
        return NULL;
    }

    /* create the module storage space */
    mca_btl_pcie_component.pcie_num_btls = num_cpus;
    mca_btl_pcie_component.pcie_btls = malloc(mca_btl_pcie_component.pcie_num_btls *
                                              sizeof(struct mca_btl_pcie_module_t));
    btl_array = malloc(mca_btl_pcie_component.pcie_num_btls *
                       sizeof(mca_btl_base_module_t*));
    if (NULL == mca_btl_pcie_component.pcie_btls || NULL == btl_array) {
        free(cpus);
        return NULL;
    }

    /* initialize the modules */
    for (i = 0 ; i < mca_btl_pcie_component.pcie_num_btls ; ++i) {
        mca_btl_pcie_module_t *btl = &(mca_btl_pcie_component.pcie_btls[i]);

        btl_array[i] = (mca_btl_base_module_t*) btl;

        memcpy(btl, &mca_btl_pcie_module, sizeof(mca_btl_pcie_module_t));

        /* check if we have a device listed in our local config file */
        btl->lcl_dev_name =
            ompi_btl_pcie_cfg_get_local_device(orte_process_info.nodename, cpus[i]);
        BTL_VERBOSE(("Local device for %s:%d = %s", orte_process_info.nodename, cpus[i],
                     btl->lcl_dev_name));

        /* make sure said device is sane */
        if (stat(btl->lcl_dev_name, &stat_buf)) {
            BTL_ERROR(("Error %s while checking device %s", strerror(errno),
                       btl->lcl_dev_name));
            free(cpus);
            return NULL;
        }

        OBJ_CONSTRUCT(&btl->pcie_sma_buf_eager, ompi_free_list_t);
        OBJ_CONSTRUCT(&btl->pcie_sma_buf_max, ompi_free_list_t);

        OBJ_CONSTRUCT(&btl->pcie_frag_eager, ompi_free_list_t);
        OBJ_CONSTRUCT(&btl->pcie_frag_max, ompi_free_list_t);

        OBJ_CONSTRUCT(&btl->pcie_frag_dma, ompi_free_list_t);

        OBJ_CONSTRUCT(&btl->pcie_lock, opal_mutex_t);

        /* time to set up the DMA mpool */
        mpool_resources.reg_data = (void*) btl;
        mpool_resources.sizeof_reg = sizeof(mca_btl_pcie_reg_t);
        mpool_resources.register_mem = pcie_reg_mr;
        mpool_resources.deregister_mem = pcie_dereg_mr;
        btl->rdma_mpool =
            mca_mpool_base_module_create("rdma",
                                         &btl->super,
                                         &mpool_resources);
        btl->super.btl_mpool = btl->rdma_mpool;

        btl->active = false;
    }
    free(cpus);

    /* push our address info to everyone */
    btl_pcie_modex_send();

    *num_btl_modules = mca_btl_pcie_component.pcie_num_btls;
    return btl_array;
}

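/*
 * Progress function: poll the receive FIFO of each active module.
 * An ACK entry completes a locally sent fragment (returns the SMA
 * bounce buffer, fires the completion callback, and returns the send
 * credit); a data entry hands the fragment to the callback registered
 * for its tag and then recycles the FIFO slot back to the sender.
 */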
int
mca_btl_pcie_component_progress(void)
{
    unsigned int i;
    btl_pcie_fifo_entry_t msg_idx;
    int count = 0;

    for (i = 0 ; i < mca_btl_pcie_component.pcie_num_btls ; ++i) {
        mca_btl_pcie_module_t *pcie_btl =
            &(mca_btl_pcie_component.pcie_btls[i]);
        mca_btl_base_endpoint_t *endpoint = pcie_btl->endpoint;

        if (!pcie_btl->active) continue;

        msg_idx = ompi_btl_pcie_fifo_get_msg(&endpoint->recv_fifo);

        /* a potential optimization is to drain the FIFO every time we
           enter progress */
        if (msg_idx) {
            int rc;
            int ack = ((msg_idx & BTL_PCIE_FIFO_TYPE_MASK) == BTL_PCIE_FIFO_TYPE_ACK) ? 1 : 0;
            msg_idx &= BTL_PCIE_FIFO_DATA_MASK;

            if (ack) {
                /* we have a send frag ack */
                mca_btl_pcie_frag_t *frag = (mca_btl_pcie_frag_t*) msg_idx;
                mca_btl_pcie_sma_buf_t *buf = frag->sma_buf;

                BTL_VERBOSE(("received ack for frag %lx (0x%lx)", msg_idx, frag));

                /* done with the buffer, so it can be returned now */
                MCA_BTL_PCIE_SMA_BUF_RETURN(pcie_btl, buf, rc);

                frag->base.des_cbfunc(&pcie_btl->super, endpoint,
                                      &(frag->base),
                                      OMPI_SUCCESS);

                /* return the send credit */
                ompi_btl_pcie_fifo_complete_msg(&endpoint->send_fifo, 1);
                count++;
            } else {
                /* we have a send frag (incoming data) */
                mca_btl_pcie_frag_t *recv_frag = &pcie_btl->pcie_recv_frag;
                mca_btl_pcie_header_t *hdr = (mca_btl_pcie_header_t*) (endpoint->lcl_frag_base + msg_idx);
                recv_frag->hdr = hdr;
                OMPI_BTL_PCIE_HEADER_NTOH((*recv_frag->hdr));
                recv_frag->segment.seg_addr.pval = ((unsigned char*) recv_frag->hdr) + sizeof(mca_btl_pcie_header_t);
                recv_frag->segment.seg_len = recv_frag->hdr->length;
                BTL_VERBOSE(("received tag %d, base 0x%lx", recv_frag->hdr->tag, &recv_frag->base));
                pcie_btl->pcie_reg[recv_frag->hdr->tag].cbfunc(&pcie_btl->super,
                                                               recv_frag->hdr->tag, &recv_frag->base,
                                                               pcie_btl->pcie_reg[recv_frag->hdr->tag].cbdata);

                rc = ompi_btl_pcie_fifo_set_msg(&endpoint->send_fifo, hdr->send_frag.lval);
                /* BWB - FIX ME - this is only safe if the number of
                   queue entries is twice the free list size */
                ompi_btl_pcie_fifo_complete_msg(&endpoint->send_fifo, 1);
                count++;
            }
        }
    }

    return count;
}

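/* mpool hook: register a memory region with the PCIe adapter,
   allowing both local and remote access */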
static int
pcie_reg_mr(void *reg_data, void *base, size_t size,
            mca_mpool_base_registration_t *reg)
{
    mca_btl_pcie_module_t *pcie_btl = (mca_btl_pcie_module_t*) reg_data;
    mca_btl_pcie_endpoint_t *endpoint = pcie_btl->endpoint;
    mca_btl_pcie_reg_t *pcie_reg = (mca_btl_pcie_reg_t*) reg;

    if (dd_register_memory_region(&endpoint->pcie_adapter,
                                  &pcie_reg->handle,
                                  base,
                                  size,
                                  DD_ALLOW_LOCAL_READ |
                                  DD_ALLOW_LOCAL_WRITE |
                                  DD_ALLOW_REMOTE_ACCESS |
                                  DD_ALLOW_REMOTE_READ |
                                  DD_ALLOW_REMOTE_WRITE)) {
        BTL_ERROR(("error registering memory!"));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}

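/* mpool hook: release a registration created by pcie_reg_mr().  A
   registration without a valid (non-negative) handle is an error. */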
static int
pcie_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
{
    mca_btl_pcie_module_t *pcie_btl = (mca_btl_pcie_module_t*) reg_data;
    mca_btl_pcie_endpoint_t *endpoint = pcie_btl->endpoint;
    mca_btl_pcie_reg_t *pcie_reg = (mca_btl_pcie_reg_t*) reg;

    if (pcie_reg->handle >= 0) {
        if (dd_deregister_memory_region(&endpoint->pcie_adapter,
                                        &pcie_reg->handle)) {
            BTL_ERROR(("error deregistering memory!"));
            return OMPI_ERROR;
        }
    } else {
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}