1
1
openmpi/ompi/mca/btl/pcie/btl_pcie_proc.c
Ralph Castain 4d3aa5a8a4 Once again, into the breach!
Yes, friends, our favorite PCIE BTL has resurfaced as mgmt vacillates over its existence. This is an updated version that actually mostly works, in its final stages of debugging.

Some generalization still remains to be done...

This commit was SVN r21358.
2009-06-02 22:26:36 +00:00

195 строки
5.9 KiB
C

/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_hash_table.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "btl_pcie.h"
#include "btl_pcie_proc.h"
/* Constructor / destructor for mca_btl_pcie_proc_t, defined further down. */
static void mca_btl_pcie_proc_construct(mca_btl_pcie_proc_t* proc);
static void mca_btl_pcie_proc_destruct(mca_btl_pcie_proc_t* proc);
/*
 * Class instance for mca_btl_pcie_proc_t: opal_list_item_t is named as the
 * parent class, and the two functions declared above are supplied as the
 * constructor and destructor.
 */
OBJ_CLASS_INSTANCE(mca_btl_pcie_proc_t,
opal_list_item_t, mca_btl_pcie_proc_construct,
mca_btl_pcie_proc_destruct);
/*
 * Constructor for a PCIE proc instance.
 *
 * Sets up the per-proc lock, clears the ompi_proc_t back-pointer and the
 * address/endpoint counters, and then appends the new instance to the
 * component-wide list of procs while holding the component lock.
 */
void mca_btl_pcie_proc_construct(mca_btl_pcie_proc_t* proc)
{
    OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);

    /* nothing is associated with this proc yet */
    proc->proc_endpoint_count = 0;
    proc->proc_addr_count = 0;
    proc->proc_ompi = NULL;

    /* register on the component's list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_pcie_component.pcie_lock);
    opal_list_append(&mca_btl_pcie_component.pcie_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_pcie_component.pcie_lock);
}
/*
* Clean up a PCIE proc instance
*/
/*
 * Destructor for a PCIE proc instance.
 *
 * Unlinks the instance from the component-wide list of procs (under the
 * component lock) and then tears down the per-proc lock.
 */
void mca_btl_pcie_proc_destruct(mca_btl_pcie_proc_t* proc)
{
    opal_list_item_t *self_item = &proc->super;

    /* take ourselves off the list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_pcie_component.pcie_lock);
    opal_list_remove_item(&mca_btl_pcie_component.pcie_procs, self_item);
    OPAL_THREAD_UNLOCK(&mca_btl_pcie_component.pcie_lock);

    OBJ_DESTRUCT(&proc->proc_lock);
}
/*
* Look for an existing PCIE process instance based on the associated
* ompi_proc_t instance.
*/
/*
 * Search the component-wide list of PCIE procs for the one whose proc_ompi
 * pointer equals ompi_proc.
 *
 * The component lock is held for the duration of the walk.
 *
 * Returns the matching mca_btl_pcie_proc_t, or NULL when no entry on the
 * list refers to ompi_proc.
 */
static mca_btl_pcie_proc_t* mca_btl_pcie_proc_lookup_ompi(ompi_proc_t* ompi_proc)
{
    mca_btl_pcie_proc_t *match = NULL;
    opal_list_item_t *cursor;

    OPAL_THREAD_LOCK(&mca_btl_pcie_component.pcie_lock);

    cursor = opal_list_get_first(&mca_btl_pcie_component.pcie_procs);
    while (cursor != opal_list_get_end(&mca_btl_pcie_component.pcie_procs)) {
        mca_btl_pcie_proc_t *candidate = (mca_btl_pcie_proc_t*) cursor;

        if (candidate->proc_ompi == ompi_proc) {
            match = candidate;
            break;
        }
        cursor = opal_list_get_next(cursor);
    }

    OPAL_THREAD_UNLOCK(&mca_btl_pcie_component.pcie_lock);

    return match;
}
/*
* Create a PCIE process structure. There is a one-to-one correspondence
* between an ompi_proc_t and a mca_btl_pcie_proc_t instance. We cache
* additional data (specifically the list of mca_btl_pcie_endpoint_t instances,
* and published addresses) associated w/ a given destination on this
* data structure.
*/
/*
 * Find or create the PCIE proc structure for a peer process.
 *
 * If a proc for ompi_proc already exists on the component list it is
 * returned as-is.  Otherwise the peer's modex data is fetched and scanned
 * for an entry whose matching local device equals pcie_btl->lcl_dev_name;
 * when one is found a new proc with a single initialized endpoint is built.
 *
 * @param ompi_proc  peer process to look up / create a proc for
 * @param pcie_btl   local PCIE BTL module the endpoint is bound to
 * @param ret_proc   (OUT) the proc on success, NULL otherwise
 *
 * @return OMPI_SUCCESS with *ret_proc set when a proc exists or was created;
 *         OMPI_SUCCESS with *ret_proc == NULL when the peer published no
 *         usable modex data or no device pairs with this BTL's device;
 *         OMPI_ERROR when the modex receive or endpoint initialization
 *         fails; OMPI_ERR_OUT_OF_RESOURCE when an object cannot be
 *         allocated.
 */
int mca_btl_pcie_proc_create(ompi_proc_t* ompi_proc,
                             mca_btl_pcie_module_t* pcie_btl,
                             mca_btl_pcie_proc_t** ret_proc)
{
    mca_btl_pcie_proc_t* pcie_proc = NULL;
    char *rem_dev_name = NULL, *lcl_dev_name = NULL;
    char *rem_hostname = NULL;
    int rc, num_peers, i;
    size_t size = 0;
    mca_btl_pcie_modex_info_t *modex_info = NULL;

    /* Check if already have proc structure for this ompi process */
    pcie_proc = mca_btl_pcie_proc_lookup_ompi(ompi_proc);
    if (NULL != pcie_proc) {
        /* Gotcha! */
        *ret_proc = pcie_proc;
        return OMPI_SUCCESS;
    }

    /* query for the peer's device name info */
    rc = ompi_modex_recv(&mca_btl_pcie_component.super.btl_version,
                         ompi_proc,
                         (void*)&modex_info,
                         &size);
    if (OMPI_SUCCESS != rc) {
        opal_output(mca_btl_base_output, "[%s:%d] ompi_modex_recv failed for peer %s",
                    __FILE__, __LINE__, ORTE_NAME_PRINT(&ompi_proc->proc_name));
        /* pcie_proc is NULL here (the lookup above failed), so there is
         * nothing to release */
        *ret_proc = NULL;
        return OMPI_ERROR;
    }

    /* no data, or data that is not a whole number of modex entries: this
     * peer cannot be reached through this BTL, which is not an error */
    if (0 == size || 0 != size % sizeof(mca_btl_pcie_modex_info_t)) {
        *ret_proc = NULL;
        return OMPI_SUCCESS;
    }

    /* NOTE(review): the device name kept by the endpoint below is taken
     * from the modex_info entry, so that buffer has to outlive the endpoint
     * on the success path.  Confirm against ompi_modex_recv's ownership
     * contract whether the early-return paths should free it. */
    num_peers = size / sizeof(mca_btl_pcie_modex_info_t);

    for (i = 0 ; i < num_peers ; ++i) {
        MCA_BTL_PCIE_MODEX_INFO_NTOH(modex_info[i]);
        rem_hostname = modex_info[i].hostname;
        rem_dev_name = modex_info[i].devicename;
        lcl_dev_name = ompi_btl_pcie_cfg_get_matching_device(rem_hostname,
                                                             rem_dev_name);
        if (NULL != lcl_dev_name &&
            0 == strcmp(lcl_dev_name, pcie_btl->lcl_dev_name)) {
            /* we have a match. continue onward */
            break;
        }
    }

    /* the loop only breaks on a match, so running off the end means no
     * published device pairs with this BTL's local device */
    if (i >= num_peers) {
        *ret_proc = NULL;
        return OMPI_SUCCESS;
    }

    BTL_VERBOSE(("Have matching devices: %s:%s <-> %s:%s",
                 orte_process_info.nodename,
                 pcie_btl->lcl_dev_name,
                 rem_hostname,
                 rem_dev_name));

    pcie_proc = OBJ_NEW(mca_btl_pcie_proc_t);
    if (NULL == pcie_proc) {
        *ret_proc = NULL;
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    pcie_proc->proc_ompi = ompi_proc;

    /* build a unique identifier (of arbitrary
     * size) to represent the proc */
    pcie_proc->proc_guid = ompi_proc->proc_name;

    /* Initialize number of peer */
    pcie_proc->proc_endpoint_count = 1;

    pcie_proc->endpoint_proc = OBJ_NEW(mca_btl_pcie_endpoint_t);
    if (NULL == pcie_proc->endpoint_proc) {
        /* The constructor already put the proc on the component list;
         * release it so later lookups do not return a proc without an
         * endpoint.  rem_dev_name was not allocated here and must not be
         * free()d. */
        OBJ_RELEASE(pcie_proc);
        *ret_proc = NULL;
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    pcie_proc->endpoint_proc->lcl_dev_name = lcl_dev_name;
    pcie_proc->endpoint_proc->rem_dev_name = rem_dev_name;
    pcie_proc->endpoint_proc->endpoint_proc = pcie_proc;
    pcie_proc->endpoint_proc->endpoint_btl = pcie_btl;

    if (OMPI_SUCCESS != mca_btl_pcie_endpoint_init(pcie_proc->endpoint_proc)) {
        BTL_ERROR(("Error initializing the PCIE endpoint \n"));
        /* Drop both objects so the half-built proc is taken off the
         * component list instead of being handed out by the next lookup.
         * NOTE(review): assumes the endpoint destructor tolerates an
         * endpoint whose init failed - confirm. */
        OBJ_RELEASE(pcie_proc->endpoint_proc);
        OBJ_RELEASE(pcie_proc);
        *ret_proc = NULL;
        return OMPI_ERROR;
    }

    *ret_proc = pcie_proc;
    return OMPI_SUCCESS;
}