f7664b3814
ofacm - generic connection manager for IB interconnects. ofautils - IB common utilities and compatibility code 2. Updating OpenIB configure code - ORNL & Mellanox Teams This commit was SVN r26707.
1673 строки
58 KiB
C
1673 строки
58 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2006 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2008-2012 Mellanox Technologies. All rights reserved.
|
|
*
|
|
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
|
|
#include "opal/dss/dss.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "opal/util/error.h"
|
|
#include "opal/util/output.h"
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/rml/rml_types.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "ompi/mca/dpm/dpm.h"
|
|
#include "connect.h"
|
|
#include "base.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "opal/class/opal_hash_table.h"
|
|
#include "opal/class/opal_object.h"
|
|
|
|
#include <inttypes.h>
|
|
|
|
#define MAX_LINE_LEN 80
|
|
#define NUM_OF_TOKENS 7
|
|
|
|
|
|
/*
 * Message kinds carried in the first byte of every OOB connection packet.
 * REQUEST is sent by oob_module_start_connect() and RESPONSE by
 * reply_start_connect(); the values are packed on the wire as a uint8_t.
 */
typedef enum {
    ENDPOINT_CONNECT_REQUEST  = 0,
    ENDPOINT_CONNECT_RESPONSE = 1,
    ENDPOINT_CONNECT_ACK      = 2
} connect_message_type_t;
|
|
|
|
/*
 * Singly linked record pairing a port LID with a switch LID.
 * NOTE(review): presumably each entry maps a port to the switch it is
 * attached to, for the service-level lookup tables -- the code that
 * fills these records is outside this view; confirm there.
 */
struct port_to_switch_lids {
    uint16_t port_lid;                  /* LID of the port */
    uint16_t switch_lid;                /* LID of the associated switch */
    struct port_to_switch_lids *next;   /* next record, or NULL */
};
typedef struct port_to_switch_lids port_to_switch_lids;
|
|
|
|
/*
 * Singly linked record pairing a switch LID with a service level.
 * NOTE(review): presumably the service level to use on a route towards
 * that switch -- the table-building code is outside this view; confirm.
 */
struct switch_to_switch_sl {
    uint16_t switch_lid;                /* LID of the switch */
    uint8_t service_level;              /* service level stored for it */
    struct switch_to_switch_sl *next;   /* next record, or NULL */
};
typedef struct switch_to_switch_sl switch_to_switch_sl;
|
|
|
|
/* True while the persistent wildcard RML receive for OMPI_RML_TAG_OFACM is
 * posted; set in oob_component_query(), cleared in oob_component_finalize(). */
static bool rml_recv_posted = false;

/* Selection priority of this CPC.  Starts at 50 and is registered as an MCA
 * parameter (then clamped to [-1, 100]) in oob_component_register(). */
static int oob_priority = 50;
|
|
|
|
static void oob_component_register(void);
|
|
static int oob_component_query(ompi_common_ofacm_base_dev_desc_t *dev,
|
|
ompi_common_ofacm_base_module_t **cpc);
|
|
static int oob_component_finalize(void);
|
|
|
|
static int oob_module_start_connect(ompi_common_ofacm_base_local_connection_context_t* context);
|
|
static int reply_start_connect(ompi_common_ofacm_base_local_connection_context_t* context,
|
|
ompi_common_ofacm_base_remote_connection_context_t *remote_info);
|
|
static int set_remote_info(ompi_common_ofacm_base_local_connection_context_t *context,
|
|
ompi_common_ofacm_base_remote_connection_context_t *remote_info);
|
|
static int qp_connect_all(ompi_common_ofacm_base_local_connection_context_t* context);
|
|
static int qp_create_all(ompi_common_ofacm_base_local_connection_context_t* context);
|
|
static int qp_create_one(ompi_common_ofacm_base_local_connection_context_t* context, int qp);
|
|
static int send_connect_data(ompi_common_ofacm_base_local_connection_context_t* context,
|
|
uint8_t message_type);
|
|
static ompi_common_ofacm_base_local_connection_context_t*
|
|
oob_endpoint_init(ompi_proc_t *proc,
|
|
ompi_common_ofacm_base_qp_config_t *qp_config,
|
|
struct ibv_pd *pd, uint64_t subnet_id, int cpc_type,
|
|
uint16_t lid, uint16_t rem_lid,
|
|
int32_t user_context_index, void *user_context,
|
|
ompi_common_ofacm_base_module_t *cpc,
|
|
ompi_common_ofacm_base_context_connect_cb_fn_t connect_cb,
|
|
ompi_common_ofacm_base_context_error_cb_fn_t error_cb,
|
|
ompi_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb);
|
|
static int oob_endpoint_finalize(ompi_common_ofacm_base_local_connection_context_t *context);
|
|
|
|
static void report_error(ompi_common_ofacm_base_local_connection_context_t* context);
|
|
|
|
static void rml_send_cb(int status, orte_process_name_t* endpoint,
|
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
|
void* cbdata);
|
|
static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
|
void* cbdata);
|
|
|
|
/* Build service level hashtables per port */
|
|
static int create_service_level_table_for_port(uint16_t lid,
|
|
opal_hash_table_t* port_to_switch_hash_table,
|
|
opal_hash_table_t* switch_to_switch_hash_table);
|
|
|
|
/* Pick the service level of path between to endpoints */
|
|
static int pick_service_level(uint16_t src_port_lid, uint16_t dst_port_lid,
|
|
uint8_t* service_level,
|
|
opal_hash_table_t* port_to_switch_hash_table,
|
|
opal_hash_table_t* switch_to_switch_hash_table);
|
|
|
|
/*
|
|
* The "component" struct -- the top-level function pointers for the
|
|
* oob connection scheme.
|
|
*/
|
|
ompi_common_ofacm_base_component_t ompi_common_ofacm_oob = {
|
|
"oob",
|
|
/* Register */
|
|
oob_component_register,
|
|
/* Init */
|
|
NULL,
|
|
/* Query */
|
|
oob_component_query,
|
|
/* Finalize */
|
|
oob_component_finalize,
|
|
};
|
|
|
|
/* Open - this functions sets up any oob specific commandline params */
|
|
static void oob_component_register(void)
|
|
{
|
|
mca_base_param_reg_int_name("common",
|
|
"ofacm_connect_oob_priority",
|
|
"The selection method priority for oob",
|
|
false, false, oob_priority, &oob_priority);
|
|
|
|
if (oob_priority > 100) {
|
|
oob_priority = 100;
|
|
} else if (oob_priority < -1) {
|
|
oob_priority = -1;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Init function. Post non-blocking RML receive to accept incoming
|
|
* connection requests.
|
|
*/
|
|
static int oob_component_query(ompi_common_ofacm_base_dev_desc_t *dev,
|
|
ompi_common_ofacm_base_module_t **cpc)
|
|
{
|
|
int rc;
|
|
|
|
/* If we have the transport_type member, check to ensure we're on
|
|
IB (this CPC will not work with iWarp). If we do not have the
|
|
transport_type member, then we must be < OFED v1.2, and
|
|
therefore we must be IB. */
|
|
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
|
|
if (IBV_TRANSPORT_IB != dev->ib_dev->transport_type) {
|
|
OFACM_VERBOSE(("OFACM: oob CPC only supported on InfiniBand; skipped on device %s",
|
|
ibv_get_device_name(dev->ib_dev)));
|
|
return OMPI_ERR_NOT_SUPPORTED;
|
|
}
|
|
#endif
|
|
|
|
if (dev->capabilities & OMPI_COMMON_OFACM_XRC_ONLY) {
|
|
OFACM_VERBOSE(("OFACM: oob CPC not supported with XRC receive queues, please try xoob CPC; skipped"));
|
|
return OMPI_ERR_NOT_SUPPORTED;
|
|
}
|
|
/* If this btl supports OOB, then post the RML message. But
|
|
ensure to only post it *once*, because another btl may have
|
|
come in before this and already posted it. */
|
|
if (!rml_recv_posted) {
|
|
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
|
OMPI_RML_TAG_OFACM,
|
|
ORTE_RML_PERSISTENT,
|
|
rml_recv_cb,
|
|
NULL);
|
|
if (ORTE_SUCCESS != rc) {
|
|
OFACM_VERBOSE(("OFACM: oob CPC system error %d (%s)",
|
|
rc, opal_strerror(rc)));
|
|
return rc;
|
|
}
|
|
rml_recv_posted = true;
|
|
}
|
|
|
|
*cpc = malloc(sizeof(ompi_common_ofacm_base_module_t));
|
|
if (NULL == *cpc) {
|
|
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, OMPI_RML_TAG_OFACM);
|
|
rml_recv_posted = false;
|
|
OFACM_VERBOSE(("openib BTL: oob CPC system error (malloc failed)"));
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
/* Init global list of all connection contexts */
|
|
OBJ_CONSTRUCT(&ompi_common_ofacm_oob.all_procs, opal_list_t);
|
|
(*cpc)->data.cbm_component = &ompi_common_ofacm_oob;
|
|
(*cpc)->data.cbm_priority = oob_priority;
|
|
(*cpc)->data.cbm_modex_message = NULL;
|
|
(*cpc)->data.cbm_modex_message_len = 0;
|
|
|
|
(*cpc)->cbm_endpoint_init = oob_endpoint_init;
|
|
(*cpc)->cbm_start_connect = oob_module_start_connect;
|
|
(*cpc)->cbm_endpoint_finalize = oob_endpoint_finalize;
|
|
(*cpc)->cbm_finalize = NULL;
|
|
(*cpc)->cbm_uses_cts = false;
|
|
|
|
OFACM_VERBOSE(("openib BTL: oob CPC available for use on %s",
|
|
ibv_get_device_name(dev->ib_dev)));
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/*
 * Look up the ofacm proc record that wraps the given ompi proc in the
 * component-wide all_procs list.
 *
 * The whole list is always traversed, so if several records referenced
 * the same ompi proc the last one would be returned.
 *
 * @return the matching record, or NULL when there is none.
 */
static ompi_common_ofacm_base_proc_t* find_proc(ompi_proc_t *proc)
{
    opal_list_t *procs = &ompi_common_ofacm_oob.all_procs;
    opal_list_item_t *cursor = opal_list_get_first(procs);
    ompi_common_ofacm_base_proc_t *match = NULL;

    while (cursor != opal_list_get_end(procs)) {
        ompi_common_ofacm_base_proc_t *candidate =
            (ompi_common_ofacm_base_proc_t *)cursor;

        if (candidate->proc_ompi == proc) {
            match = candidate;
        }
        cursor = opal_list_get_next(cursor);
    }

    return match;
}
|
|
|
|
/* OOB connection context init */
|
|
static ompi_common_ofacm_base_local_connection_context_t*
|
|
oob_endpoint_init(ompi_proc_t *proc,
|
|
ompi_common_ofacm_base_qp_config_t *qp_config,
|
|
struct ibv_pd *pd, uint64_t subnet_id, int cpc_type,
|
|
uint16_t lid, uint16_t rem_lid,
|
|
int32_t user_context_index, void *user_context,
|
|
ompi_common_ofacm_base_module_t *cpc,
|
|
ompi_common_ofacm_base_context_connect_cb_fn_t connect_cb,
|
|
ompi_common_ofacm_base_context_error_cb_fn_t error_cb,
|
|
ompi_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb)
|
|
{
|
|
int ret;
|
|
bool new_proc;
|
|
ompi_common_ofacm_base_local_connection_context_t *context;
|
|
ompi_common_ofacm_base_proc_t *context_proc;
|
|
|
|
context = (ompi_common_ofacm_base_local_connection_context_t*)
|
|
OBJ_NEW(ompi_common_ofacm_base_local_connection_context_t);
|
|
context_proc = find_proc(proc);
|
|
|
|
if (NULL == context_proc) {
|
|
new_proc = true;
|
|
/* constructing new proc */
|
|
context_proc = (ompi_common_ofacm_base_proc_t *)
|
|
OBJ_NEW(ompi_common_ofacm_base_proc_t );
|
|
} else {
|
|
new_proc = false;
|
|
OBJ_RETAIN(context_proc);
|
|
}
|
|
|
|
ompi_common_ofacm_base_proc_setup(context_proc, context, proc);
|
|
ret = ompi_common_ofacm_base_context_init(context, cpc, connect_cb, error_cb,
|
|
prepare_recv_cb, context_proc, qp_config,
|
|
pd, subnet_id, cpc_type, lid, rem_lid, user_context_index, user_context);
|
|
if (OMPI_SUCCESS != ret) {
|
|
OBJ_DESTRUCT(context_proc);
|
|
OBJ_DESTRUCT(context);
|
|
return NULL;
|
|
}
|
|
|
|
if (new_proc) {
|
|
opal_list_append(&ompi_common_ofacm_oob.all_procs, (opal_list_item_t *)context_proc);
|
|
}
|
|
|
|
return context;
|
|
}
|
|
|
|
/* OOB connection context finalization */
|
|
static int oob_endpoint_finalize
|
|
(ompi_common_ofacm_base_local_connection_context_t *context)
|
|
{
|
|
opal_list_item_t *proc_item, *cntx_item, *cntx_item_next;
|
|
bool found = false;
|
|
bool pfound = false;
|
|
int qp;
|
|
opal_list_t *proc_list = &ompi_common_ofacm_oob.all_procs;
|
|
|
|
/* Proc cleanup. We should find the context proc in all proc list and remove
|
|
* from the proc list our context. After it we try to release the proc context */
|
|
for (proc_item = opal_list_get_first(proc_list);
|
|
proc_item != opal_list_get_end(proc_list);
|
|
proc_item = opal_list_get_next(proc_item)) {
|
|
if (context->proc == ((ompi_common_ofacm_base_proc_t *)proc_item)){
|
|
ompi_common_ofacm_base_proc_t *proc =
|
|
(ompi_common_ofacm_base_proc_t *)proc_item;
|
|
opal_list_t *cntx_list = &proc->all_contexts;
|
|
pfound = true;
|
|
|
|
/* Remove the context from proc list */
|
|
cntx_item = opal_list_get_first(cntx_list);
|
|
while(cntx_item != opal_list_get_end(cntx_list)) {
|
|
/* take the next before removing from the list */
|
|
cntx_item_next = opal_list_get_next(cntx_item);
|
|
if (context == (ompi_common_ofacm_base_local_connection_context_t *)cntx_item) {
|
|
found = true;
|
|
opal_list_remove_item(cntx_list, cntx_item);
|
|
}
|
|
cntx_item = cntx_item_next;
|
|
}
|
|
|
|
/* Remove our proc from all list */
|
|
if (opal_list_is_empty(cntx_list)) {
|
|
opal_list_remove_item(proc_list, (opal_list_item_t *)proc);
|
|
}
|
|
OBJ_RELEASE(proc);
|
|
}
|
|
}
|
|
|
|
/* Release QPs */
|
|
for (qp = 0; qp < context->num_of_qps; qp++) {
|
|
if(NULL != context->qps[qp].lcl_qp) {
|
|
if(ibv_destroy_qp(context->qps[qp].lcl_qp)) {
|
|
OFACM_ERROR(("Failed to destroy QP:%d\n", qp));
|
|
}
|
|
}
|
|
}
|
|
|
|
assert(true == found);
|
|
assert(true == pfound);
|
|
|
|
/* We done with proc release and now we way destroy the context */
|
|
OBJ_RELEASE(context);
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Connect function. Start initiation of connections to a remote
|
|
* peer. We send our Queue Pair information over the RML/OOB
|
|
* communication mechanism. On completion of our send, a send
|
|
* completion handler is called.
|
|
*/
|
|
static int oob_module_start_connect(ompi_common_ofacm_base_local_connection_context_t *context)
|
|
{
|
|
int rc;
|
|
|
|
if (OMPI_SUCCESS != (rc = qp_create_all(context))) {
|
|
return rc;
|
|
}
|
|
|
|
/* Send connection info over to remote endpoint */
|
|
context->state = MCA_COMMON_OFACM_CONNECTING;
|
|
if (OMPI_SUCCESS !=
|
|
(rc = send_connect_data(context, ENDPOINT_CONNECT_REQUEST))) {
|
|
OFACM_ERROR(("error sending connect request, error code %d", rc));
|
|
return rc;
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Component finalize function. Cleanup RML non-blocking receive.
|
|
*/
|
|
static int oob_component_finalize(void)
|
|
{
|
|
if (rml_recv_posted) {
|
|
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, OMPI_RML_TAG_OFACM);
|
|
rml_recv_posted = false;
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/**************************************************************************/
|
|
|
|
/*
|
|
* Reply to a `start - connect' message
|
|
*/
|
|
static int reply_start_connect(ompi_common_ofacm_base_local_connection_context_t* context,
|
|
ompi_common_ofacm_base_remote_connection_context_t *remote_info)
|
|
{
|
|
int rc;
|
|
|
|
OFACM_VERBOSE(("Initialized QPs, LID = %d", context->lid));
|
|
|
|
/* Create local QP's and post receive resources */
|
|
if (OMPI_SUCCESS != (rc = qp_create_all(context))) {
|
|
return rc;
|
|
}
|
|
|
|
/* Set the remote side info */
|
|
set_remote_info(context, remote_info);
|
|
|
|
/* Connect to remote endpoint qp's */
|
|
if (OMPI_SUCCESS != (rc = qp_connect_all(context))) {
|
|
return rc;
|
|
}
|
|
|
|
/* Send connection info over to remote endpoint */
|
|
context->state = MCA_COMMON_OFACM_CONNECT_ACK;
|
|
if (OMPI_SUCCESS !=
|
|
(rc = send_connect_data(context, ENDPOINT_CONNECT_RESPONSE))) {
|
|
OFACM_ERROR(("error in endpoint send connect request error code is %d",
|
|
rc));
|
|
return rc;
|
|
}
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
|
|
static int set_remote_info(ompi_common_ofacm_base_local_connection_context_t *context,
|
|
ompi_common_ofacm_base_remote_connection_context_t *remote_info)
|
|
{
|
|
/* copy the remote_info stuff */
|
|
memcpy(&context->remote_info,
|
|
remote_info, sizeof(ompi_common_ofacm_base_remote_connection_context_t ));
|
|
|
|
OFACM_VERBOSE(("Setting QP info, LID = %d", context->remote_info.rem_lid));
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
/*
|
|
* Connect the local ends of all qp's to the remote side
|
|
*/
|
|
static int qp_connect_all(ompi_common_ofacm_base_local_connection_context_t* context)
|
|
{
|
|
int i;
|
|
uint8_t service_level = 0;
|
|
uint32_t rtr_mask = 0, rts_mask = 0;
|
|
int rc = OMPI_SUCCESS;
|
|
|
|
static bool is_hash_table_initialized = false;
|
|
static opal_hash_table_t switch_to_switch_hash_table;
|
|
static opal_hash_table_t port_to_switch_hash_table;
|
|
|
|
|
|
/* Create two hash tables for a given port in order to allow
|
|
* an efficient search of service level on any route exiting
|
|
* from it */
|
|
if((NULL != ompi_common_ofacm_three_dim_torus) &&
|
|
(false == is_hash_table_initialized)){
|
|
|
|
rc = create_service_level_table_for_port(context->lid, &port_to_switch_hash_table,
|
|
&switch_to_switch_hash_table);
|
|
if(OMPI_SUCCESS != rc){
|
|
/* Failed to create service table for port */
|
|
return OMPI_ERROR;
|
|
}
|
|
is_hash_table_initialized = true;
|
|
}
|
|
|
|
|
|
/* Pick the Service Level of each route from the table */
|
|
if(is_hash_table_initialized){
|
|
rc = pick_service_level(context->lid, context->remote_info.rem_lid, &service_level,
|
|
&port_to_switch_hash_table, &switch_to_switch_hash_table);
|
|
if(OMPI_SUCCESS != rc){
|
|
/* Failed to retrieve service level on the route */
|
|
return OMPI_ERROR;
|
|
}
|
|
/*printf("Debug: qp_connect_all: lid %hu rem lid %hu num_qps %d SL %c\n", context->lid,
|
|
context->remote_info.rem_lid, context->num_of_qps, service_level);*/
|
|
}
|
|
|
|
|
|
for (i = 0; i < context->num_of_qps; i++) {
|
|
struct ibv_qp_attr attr;
|
|
struct ibv_qp* qp = context->qps[i].lcl_qp;
|
|
enum ibv_mtu mtu = (context->attr[i].path_mtu < context->remote_info.rem_mtu) ?
|
|
context->attr[i].path_mtu : context->remote_info.rem_mtu;
|
|
|
|
memset(&attr, 0, sizeof(attr));
|
|
memcpy(&attr, context->attr, sizeof(struct ibv_qp_attr));
|
|
attr.qp_state = IBV_QPS_RTR;
|
|
attr.path_mtu = mtu;
|
|
attr.dest_qp_num = context->remote_info.rem_qps[i].rem_qp_num;
|
|
attr.rq_psn = context->remote_info.rem_qps[i].rem_psn;
|
|
attr.ah_attr.dlid = context->remote_info.rem_lid;
|
|
|
|
if(is_hash_table_initialized){
|
|
attr.ah_attr.sl = service_level;
|
|
}
|
|
/* JMS to be filled in later dynamically */
|
|
attr.ah_attr.static_rate = 0;
|
|
rtr_mask = IBV_QP_STATE |
|
|
IBV_QP_AV |
|
|
IBV_QP_PATH_MTU |
|
|
IBV_QP_DEST_QPN |
|
|
IBV_QP_RQ_PSN |
|
|
IBV_QP_MAX_DEST_RD_ATOMIC |
|
|
IBV_QP_MIN_RNR_TIMER;
|
|
|
|
/* applying user specified rtr mask */
|
|
if (NULL != context->custom_rtr_attr_mask) {
|
|
rtr_mask |= context->custom_rtr_attr_mask[i];
|
|
}
|
|
|
|
OFACM_VERBOSE(("Set MTU to IBV value %d (%s bytes)", mtu,
|
|
(mtu == IBV_MTU_256) ? "256" :
|
|
(mtu == IBV_MTU_512) ? "512" :
|
|
(mtu == IBV_MTU_1024) ? "1024" :
|
|
(mtu == IBV_MTU_2048) ? "2048" :
|
|
(mtu == IBV_MTU_4096) ? "4096" :
|
|
"unknown (!)"));
|
|
|
|
if (ibv_modify_qp(qp, &attr, rtr_mask)) {
|
|
OFACM_ERROR(("Error modifing QP to RTR errno says %s",
|
|
strerror(errno)));
|
|
return OMPI_ERROR;
|
|
}
|
|
attr.qp_state = IBV_QPS_RTS;
|
|
/* On PP QPs we have SW flow control, no need for rnr retries. Setting
|
|
* it to zero helps to catch bugs */
|
|
/*
|
|
attr.rnr_retry = BTL_OPENIB_QP_TYPE_PP(i) ? 0 :
|
|
mca_btl_openib_component.ib_rnr_retry;
|
|
*/
|
|
attr.sq_psn = context->qps[i].lcl_psn;
|
|
rts_mask = IBV_QP_STATE |
|
|
IBV_QP_TIMEOUT |
|
|
IBV_QP_RETRY_CNT |
|
|
IBV_QP_RNR_RETRY |
|
|
IBV_QP_SQ_PSN |
|
|
IBV_QP_MAX_QP_RD_ATOMIC;
|
|
|
|
/* applying user specified rts mask */
|
|
if (NULL != context->custom_rts_attr_mask) {
|
|
rts_mask |= context->custom_rts_attr_mask[i];
|
|
}
|
|
|
|
if (ibv_modify_qp(qp, &attr, rts_mask)) {
|
|
OFACM_ERROR(("error modifying QP to RTS errno says %s",
|
|
strerror(errno)));
|
|
return OMPI_ERROR;
|
|
}
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* Create the local side of all the qp's. The remote sides will be
|
|
* connected later.
|
|
*/
|
|
static int qp_create_all(ompi_common_ofacm_base_local_connection_context_t* context)
|
|
{
|
|
int qp, rc;
|
|
|
|
for (qp = 0; qp < context->num_of_qps; ++qp) {
|
|
rc = qp_create_one(context, qp);
|
|
if (OMPI_SUCCESS != rc) {
|
|
return rc;
|
|
}
|
|
}
|
|
/* Now that all the qp's are created locally, post some receive
|
|
buffers, setup credits, etc. */
|
|
return context->prepare_recv_cb(context->user_context);
|
|
}
|
|
|
|
/*
|
|
* Create the local side of one qp. The remote side will be connected
|
|
* later.
|
|
*/
|
|
static int qp_create_one(ompi_common_ofacm_base_local_connection_context_t *context, int qp)
|
|
{
|
|
struct ibv_qp *my_qp;
|
|
struct ibv_qp_init_attr init_attr;
|
|
struct ibv_qp_attr attr;
|
|
size_t req_inline = context->init_attr[qp].cap.max_inline_data;
|
|
uint32_t init_mask = 0;
|
|
|
|
/* Taking default init attributes from user */
|
|
memcpy(&init_attr, &context->init_attr[qp], sizeof(init_attr));
|
|
my_qp = ibv_create_qp(context->ib_pd, &init_attr);
|
|
|
|
if (NULL == my_qp) {
|
|
OFACM_ERROR(("error creating qp errno says %s", strerror(errno)));
|
|
return OMPI_ERROR;
|
|
}
|
|
context->qps[qp].lcl_qp = my_qp;
|
|
|
|
if (init_attr.cap.max_inline_data < req_inline) {
|
|
context->qps[qp].ib_inline_max = init_attr.cap.max_inline_data;
|
|
orte_show_help("help-mpi-common-ofacm-cpc-base.txt",
|
|
"inline truncated", true, orte_process_info.nodename,
|
|
req_inline, init_attr.cap.max_inline_data);
|
|
} else {
|
|
context->qps[qp].ib_inline_max = req_inline;
|
|
}
|
|
|
|
/* Taking default attributes from user */
|
|
memcpy(&attr, &context->attr[qp], sizeof(attr));
|
|
attr.qp_state = IBV_QPS_INIT;
|
|
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
|
init_mask = IBV_QP_STATE |
|
|
IBV_QP_PKEY_INDEX |
|
|
IBV_QP_PORT |
|
|
IBV_QP_ACCESS_FLAGS;
|
|
/* apply user specified init mask */
|
|
if (NULL != context->custom_init_attr_mask) {
|
|
init_mask |= context->custom_init_attr_mask[qp];
|
|
}
|
|
|
|
if (ibv_modify_qp(context->qps[qp].lcl_qp,
|
|
&attr, init_mask)) {
|
|
OFACM_ERROR(("Error modifying qp to INIT errno says %s", strerror(errno)));
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
/* Setup meta data on the endpoint */
|
|
context->qps[qp].lcl_psn = lrand48() & 0xffffff;
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* RML send connect information to remote endpoint
|
|
*/
|
|
static int send_connect_data(ompi_common_ofacm_base_local_connection_context_t* context,
|
|
uint8_t message_type)
|
|
{
|
|
opal_buffer_t* buffer = OBJ_NEW(opal_buffer_t);
|
|
int rc;
|
|
|
|
if (NULL == buffer) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
/* pack the info in the send buffer */
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT8));
|
|
OFACM_VERBOSE(("type %d\n", message_type));
|
|
rc = opal_dss.pack(buffer, &message_type, 1, OPAL_UINT8);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT64));
|
|
rc = opal_dss.pack(buffer, &context->subnet_id, 1, OPAL_UINT64);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
|
|
if (message_type != ENDPOINT_CONNECT_REQUEST) {
|
|
/* send the QP connect request info we respond to */
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32));
|
|
rc = opal_dss.pack(buffer,
|
|
&context->remote_info.rem_qps[0].rem_qp_num, 1,
|
|
OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT16));
|
|
rc = opal_dss.pack(buffer, &context->remote_info.rem_lid, 1, OPAL_UINT16);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
}
|
|
|
|
if (message_type != ENDPOINT_CONNECT_ACK) {
|
|
int qp;
|
|
/* send CM type/family */
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_INT));
|
|
rc = opal_dss.pack(buffer, &context->cpc_type, 1, OPAL_INT);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
/* Pasha: Send number of qp here. We don't must to send number of QPs here, BUT
|
|
* recv side callback code is pretty complicated and I don't want to touch
|
|
* it now. So best work around on this stage is send another 1byte with number of
|
|
* qps.
|
|
*/
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT8));
|
|
rc = opal_dss.pack(buffer, &context->num_of_qps, 1, OPAL_UINT8);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
/* stuff all the QP info into the buffer */
|
|
for (qp = 0; qp < context->num_of_qps; qp++) {
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32));
|
|
rc = opal_dss.pack(buffer, &context->qps[qp].lcl_qp->qp_num,
|
|
1, OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32));
|
|
rc = opal_dss.pack(buffer, &context->qps[qp].lcl_psn, 1,
|
|
OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
}
|
|
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT16));
|
|
rc = opal_dss.pack(buffer, &context->lid, 1, OPAL_UINT16);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32));
|
|
rc = opal_dss.pack(buffer, &context->attr[0].path_mtu, 1,
|
|
OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32));
|
|
rc = opal_dss.pack(buffer, &context->index, 1, OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
}
|
|
|
|
/* send to remote endpoint */
|
|
rc = orte_rml.send_buffer_nb(&context->proc->proc_ompi->proc_name,
|
|
buffer, OMPI_RML_TAG_OFACM, 0,
|
|
rml_send_cb, NULL);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
return rc;
|
|
}
|
|
OFACM_VERBOSE(("Sent QP Info, LID = %d, SUBNET = %lx\n",
|
|
context->lid,
|
|
context->subnet_id));
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/*
 * Report a connection-manager error.
 *
 * With a usable context, the user's error callback is invoked with the
 * user context.  Without one (NULL context or no callback registered)
 * no specific error can be delivered, so a generic fatal help message is
 * shown and the process exits with status 1.
 */
static void report_error(ompi_common_ofacm_base_local_connection_context_t* context)
{
    if (NULL != context && NULL != context->error_cb) {
        /* hand the error to the user */
        context->error_cb(context->user_context);
        return;
    }

    /* The context is undefined and we can not print a specific error */
    orte_show_help("help-mpi-common-ofacm-oob.txt",
                   "ofacm oob fatal error", true,
                   orte_process_info.nodename,
                   __FILE__, __LINE__);
    exit(1);
}
|
|
|
|
/*
|
|
* Callback when we have finished RML sending the connect data to a
|
|
* remote peer
|
|
*/
|
|
static void rml_send_cb(int status, orte_process_name_t* endpoint,
|
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
|
void* cbdata)
|
|
{
|
|
OBJ_RELEASE(buffer);
|
|
}
|
|
|
|
|
|
/*
|
|
* Non blocking RML recv callback. Read incoming QP and other info,
|
|
* and if this endpoint is trying to connect, reply with our QP info,
|
|
* otherwise try to modify QP's and establish reliable connection
|
|
*/
|
|
static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
|
void* cbdata)
|
|
{
|
|
int context_state;
|
|
int rc;
|
|
uint32_t lcl_qp = 0;
|
|
uint16_t lcl_lid = 0;
|
|
int32_t cnt = 1;
|
|
ompi_common_ofacm_base_remote_connection_context_t remote_info;
|
|
ompi_common_ofacm_base_local_connection_context_t *l_context;
|
|
ompi_common_ofacm_base_proc_t *proc;
|
|
uint8_t message_type, num_qps;
|
|
int cpc_type;
|
|
opal_list_t *procs_list = &ompi_common_ofacm_oob.all_procs;
|
|
opal_list_t *context_list;
|
|
bool master;
|
|
|
|
/* start by unpacking data first so we know who is knocking at
|
|
our door */
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8));
|
|
rc = opal_dss.unpack(buffer, &message_type, &cnt, OPAL_UINT8);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT64));
|
|
rc = opal_dss.unpack(buffer, &remote_info.rem_subnet_id, &cnt, OPAL_UINT64);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
|
|
if (ENDPOINT_CONNECT_REQUEST != message_type) {
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
|
|
rc = opal_dss.unpack(buffer, &lcl_qp, &cnt, OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16));
|
|
rc = opal_dss.unpack(buffer, &lcl_lid, &cnt, OPAL_UINT16);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (ENDPOINT_CONNECT_ACK != message_type) {
|
|
int qp;
|
|
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_INT));
|
|
rc = opal_dss.unpack(buffer, &cpc_type, &cnt, OPAL_INT);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
/* Pasha: Reading number of qps, in original code we tool it from
|
|
* btl component. In future we may change order of operations here. We may start
|
|
* lookup for connection descriptor after receiving subnet_id and lid. But in order
|
|
* to do it here I need totally to rewrite the recv callback...next time ;)
|
|
*/
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8));
|
|
rc = opal_dss.unpack(buffer, &num_qps, &cnt, OPAL_UINT8);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
/* get ready for the data */
|
|
ompi_common_ofacm_base_remote_context_init(&remote_info,
|
|
num_qps, 0);
|
|
|
|
/* unpack all the qp info */
|
|
for (qp = 0; qp < num_qps; ++qp) {
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
|
|
rc = opal_dss.unpack(buffer, &remote_info.rem_qps[qp].rem_qp_num, &cnt,
|
|
OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
|
|
rc = opal_dss.unpack(buffer, &remote_info.rem_qps[qp].rem_psn, &cnt,
|
|
OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
}
|
|
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16));
|
|
rc = opal_dss.unpack(buffer, &remote_info.rem_lid, &cnt, OPAL_UINT16);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
|
|
rc = opal_dss.unpack(buffer, &remote_info.rem_mtu, &cnt, OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
|
|
OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
|
|
rc = opal_dss.unpack(buffer, &remote_info.rem_index, &cnt, OPAL_UINT32);
|
|
if (ORTE_SUCCESS != rc) {
|
|
ORTE_ERROR_LOG(rc);
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
}
|
|
|
|
OFACM_VERBOSE(("Received QP Info, LID = %d, SUBNET = %lx, CPC_TYPE = %d",
|
|
remote_info.rem_lid,
|
|
remote_info.rem_subnet_id,
|
|
cpc_type));
|
|
|
|
master = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME,
|
|
process_name) >= 0 ? true : false;
|
|
for (proc = (ompi_common_ofacm_base_proc_t *)opal_list_get_first(procs_list);
|
|
proc != (ompi_common_ofacm_base_proc_t *)opal_list_get_end(procs_list);
|
|
proc = (ompi_common_ofacm_base_proc_t *)opal_list_get_next(proc)){
|
|
bool found = false;
|
|
if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
|
|
&proc->proc_ompi->proc_name,
|
|
process_name) != OPAL_EQUAL) {
|
|
continue;
|
|
}
|
|
context_list = &proc->all_contexts;
|
|
if (ENDPOINT_CONNECT_REQUEST != message_type) {
|
|
/* This is a reply message. Try to get the endpoint
|
|
instance the reply belongs to */
|
|
for (l_context = (ompi_common_ofacm_base_local_connection_context_t *)opal_list_get_first(context_list);
|
|
l_context != (ompi_common_ofacm_base_local_connection_context_t *)opal_list_get_end(context_list);
|
|
l_context = (ompi_common_ofacm_base_local_connection_context_t *)opal_list_get_next(l_context)) {
|
|
if (l_context->qps[0].lcl_qp != NULL &&
|
|
lcl_lid == l_context->lid &&
|
|
lcl_qp == l_context->qps[0].lcl_qp->qp_num &&
|
|
remote_info.rem_subnet_id == l_context->subnet_id) {
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
/* This is new connection request. If this is master try
|
|
to find endpoint in a connecting state. If this is
|
|
slave try to find endpoint in closed state and
|
|
initiate connection back */
|
|
ompi_common_ofacm_base_local_connection_context_t *context_found = NULL;
|
|
for (l_context = (ompi_common_ofacm_base_local_connection_context_t *)opal_list_get_first(context_list);
|
|
l_context != (ompi_common_ofacm_base_local_connection_context_t *)opal_list_get_end(context_list);
|
|
l_context = (ompi_common_ofacm_base_local_connection_context_t *)opal_list_get_next(l_context)) {
|
|
if (l_context->subnet_id != remote_info.rem_subnet_id ||
|
|
l_context->cpc_type != cpc_type ||
|
|
(l_context->state != MCA_COMMON_OFACM_CONNECTING
|
|
&& l_context->state != MCA_COMMON_OFACM_CLOSED))
|
|
continue;
|
|
found = true;
|
|
context_found = l_context;
|
|
if ((master &&
|
|
MCA_COMMON_OFACM_CONNECTING == l_context->state) ||
|
|
(!master &&
|
|
MCA_COMMON_OFACM_CLOSED == l_context->state))
|
|
break; /* Found one. No point to continue */
|
|
}
|
|
l_context = context_found;
|
|
|
|
/* if this is slave and there is no endpoints in closed
|
|
state then all connection are already in progress so
|
|
just ignore this connection request */
|
|
if (found && !master &&
|
|
MCA_COMMON_OFACM_CLOSED != l_context->state) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (!found) {
|
|
OFACM_ERROR(("can't find suitable endpoint for this peer\n"));
|
|
report_error(NULL);
|
|
return;
|
|
}
|
|
|
|
OPAL_THREAD_LOCK(&l_context->context_lock);
|
|
context_state = l_context->state;
|
|
|
|
/* Update status */
|
|
switch (context_state) {
|
|
case MCA_COMMON_OFACM_CLOSED:
|
|
/* We had this connection closed before. The endpoint is
|
|
trying to connect. Move the status of this connection
|
|
to CONNECTING, and then reply with our QP
|
|
information */
|
|
if (master) {
|
|
rc = reply_start_connect(l_context, &remote_info);
|
|
} else {
|
|
rc = oob_module_start_connect(l_context);
|
|
}
|
|
|
|
if (OMPI_SUCCESS != rc) {
|
|
OFACM_ERROR(("error in endpoint reply start connect"));
|
|
report_error(l_context);
|
|
break;
|
|
}
|
|
|
|
/* As long as we expect a message from the peer (in order
|
|
to setup the connection) let the event engine pool the
|
|
RML events. Note: we increment it once peer active
|
|
connection. */
|
|
opal_progress_event_users_increment();
|
|
break;
|
|
|
|
case MCA_COMMON_OFACM_CONNECTING:
|
|
/* preparing remote info for this context */
|
|
ompi_common_ofacm_base_remote_context_init(&l_context->remote_info,
|
|
l_context->num_of_qps, 0);
|
|
/* need to check status here */
|
|
set_remote_info(l_context, &remote_info);
|
|
if (OMPI_SUCCESS != (rc = qp_connect_all(l_context))) {
|
|
OFACM_ERROR(("endpoint connect error: %d", rc));
|
|
report_error(l_context);
|
|
break;
|
|
}
|
|
|
|
if (master) {
|
|
l_context->state = MCA_COMMON_OFACM_WAITING_ACK;
|
|
|
|
/* Send him an ACK */
|
|
send_connect_data(l_context, ENDPOINT_CONNECT_RESPONSE);
|
|
} else {
|
|
send_connect_data(l_context, ENDPOINT_CONNECT_ACK);
|
|
/* Tell main BTL that we're done */
|
|
l_context->state = MCA_COMMON_OFACM_CONNECTED;
|
|
l_context->connect_cb(l_context->user_context);
|
|
}
|
|
break;
|
|
|
|
case MCA_COMMON_OFACM_WAITING_ACK:
|
|
/* Tell main BTL that we're done */
|
|
l_context->state = MCA_COMMON_OFACM_CONNECTED;
|
|
l_context->connect_cb(l_context->user_context);
|
|
break;
|
|
|
|
case MCA_COMMON_OFACM_CONNECT_ACK:
|
|
send_connect_data(l_context, ENDPOINT_CONNECT_ACK);
|
|
/* Tell main BTL that we're done */
|
|
l_context->state = MCA_COMMON_OFACM_CONNECTED;
|
|
l_context->connect_cb(l_context->user_context);
|
|
break;
|
|
|
|
case MCA_COMMON_OFACM_CONNECTED:
|
|
break;
|
|
|
|
default :
|
|
OFACM_ERROR(("Invalid endpoint state %d", context_state));
|
|
report_error(l_context);
|
|
}
|
|
OPAL_THREAD_UNLOCK(&l_context->context_lock);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Get the service level on the route between
|
|
* source port LID and destination port LID.
|
|
* @Param src_port_lid - LID of the source port.
|
|
* @Param dst_port_lid - LID of destination port.
|
|
* @Param service_level - Returned value.
|
|
* The service level on the route between source port
|
|
* to destination port.
|
|
* @return - Error Code. Non Zero value on error.
|
|
*/
|
|
static int pick_service_level(uint16_t src_port_lid, uint16_t dst_port_lid, uint8_t* service_level,
|
|
opal_hash_table_t* port_to_switch_hash_table, opal_hash_table_t* switch_to_switch_hash_table)
|
|
{
|
|
uint8_t* sl;
|
|
uint16_t* dst_switch_lid;
|
|
void* p_src_switch_lid = NULL;
|
|
void* p_dst_switch_lid = NULL;
|
|
void* p_service_level = NULL;
|
|
int rc = OMPI_SUCCESS;
|
|
|
|
/* Get the switch LID connected tothe source HCA LID */
|
|
rc = opal_hash_table_get_value_ptr(port_to_switch_hash_table, &src_port_lid, sizeof(uint16_t), &p_src_switch_lid);
|
|
if(OMPI_SUCCESS != rc){
|
|
/* Could not find source port LID */
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
|
|
|
|
/* Get the switch LID connected to the destination HCA LID */
|
|
rc = opal_hash_table_get_value_ptr(port_to_switch_hash_table, &dst_port_lid, sizeof(uint16_t), &p_dst_switch_lid);
|
|
if(OMPI_SUCCESS != rc){
|
|
/* Could not find destination port LID */
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
dst_switch_lid = (uint16_t*)p_dst_switch_lid;
|
|
|
|
|
|
/* Get the service level of the route beween the source HCA LID and destination HCA LID */
|
|
rc = opal_hash_table_get_value_ptr(switch_to_switch_hash_table, dst_switch_lid, sizeof(uint16_t), &p_service_level);
|
|
if(OMPI_SUCCESS != rc){
|
|
/* Could not find destination switch LID in hashtable*/
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
sl = (uint8_t*)p_service_level;
|
|
*service_level = *sl;
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
* Get the size of the port to switch hashtable from a file.
|
|
|
|
* @Params fp - Descriptor of the input file.
|
|
* @Param hash_table_size - Pointer to the size of
|
|
* the port to switch hashtable.
|
|
* @param head - pointer to a linked list containing
|
|
* the pairs to be stored in the hashtable.
|
|
* @return - Error code. Non zero value for failure.
|
|
*/
|
|
static int get_port_to_switch_hashtable_data_from_file(FILE* fp, int* hash_table_size, port_to_switch_lids** head)
|
|
{
|
|
int i;
|
|
char c;
|
|
int num_items;
|
|
int rc = OMPI_SUCCESS;
|
|
int ret = OMPI_SUCCESS;
|
|
|
|
uint64_t guid;
|
|
uint16_t port_lid;
|
|
uint16_t switch_lid;
|
|
uint16_t mtu, rate, lmc; /* TODO: Check binary representation */
|
|
int port_number;
|
|
|
|
port_to_switch_lids* item = NULL;
|
|
port_to_switch_lids* p_head = *head;
|
|
port_to_switch_lids* p_next_item = NULL;
|
|
|
|
char str[MAX_LINE_LEN] = "\0";
|
|
char input_str[NUM_OF_TOKENS][MAX_LINE_LEN] = {"\0"};
|
|
char expected_str[NUM_OF_TOKENS][MAX_LINE_LEN] = {"\0"};
|
|
|
|
|
|
c = fgetc(fp);
|
|
fseek(fp, -1, SEEK_CUR);
|
|
|
|
/* Init expected input strings */
|
|
strcpy(expected_str[0], "Channel");
|
|
strcpy(expected_str[1], "Adapter");
|
|
strcpy(expected_str[2], "base");
|
|
strcpy(expected_str[3], "LID");
|
|
strcpy(expected_str[4], "LMC");
|
|
strcpy(expected_str[5], "port");
|
|
|
|
/* Create list */
|
|
p_head = (port_to_switch_lids*)calloc(1, sizeof(port_to_switch_lids));
|
|
if(NULL == p_head){
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
return rc;
|
|
}
|
|
*head = p_head;
|
|
/* Pre-process the port-to-switch table */
|
|
while(EOF != c)
|
|
{
|
|
ret = fscanf(fp, "%s %s %" PRIx64 " %c", input_str[0], input_str[1], &guid, &c);
|
|
ret += fscanf(fp, "%s %s %hx %c", input_str[2], input_str[3], &port_lid, &c);
|
|
ret += fscanf(fp, "%s %hu %c", input_str[4], &lmc, &c);
|
|
ret += fscanf(fp, "%s %s %d", input_str[6], input_str[5], &port_number);
|
|
|
|
|
|
if(14 != ret){
|
|
rc = OMPI_ERR_FILE_READ_FAILURE;
|
|
return rc;
|
|
}
|
|
|
|
for(i = 0; i < 6; i++)
|
|
{
|
|
/*if(strncmp(str, table_header, hash_table_header_size)){*/
|
|
if(strcmp(input_str[i], expected_str[i])){
|
|
/* Incorrect table header */
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
}
|
|
|
|
c = fgetc(fp);
|
|
fgets(str, MAX_LINE_LEN, fp);
|
|
if(strncmp(str, "# LID : MTU : RATE", strlen(str) - 1)){
|
|
/* Incorrect table header */
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
|
|
c = fgetc(fp);
|
|
fseek(fp, -1, SEEK_CUR);
|
|
|
|
|
|
/* Read next line */
|
|
fgets(str, MAX_LINE_LEN, fp);
|
|
|
|
/* Update the port to switch hashtable size if read valid data */
|
|
num_items = sscanf(str, "%hx %c %hu %c %hu", &switch_lid, &c, &mtu, &c, &rate);
|
|
if(5 == num_items){
|
|
(*hash_table_size)++;
|
|
}
|
|
else{
|
|
/* Wrong file format */
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
/* Store port LID and switch LID */
|
|
item = calloc(1, sizeof(port_to_switch_lids));
|
|
if(NULL == item){
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
return rc;
|
|
}
|
|
item->port_lid = port_lid;
|
|
item->switch_lid = switch_lid;
|
|
|
|
/* Insert the item to the head of the list */
|
|
p_next_item = p_head->next;
|
|
p_head->next = item;
|
|
item->next = p_next_item;
|
|
|
|
|
|
/* Get Next char */
|
|
c = fgetc(fp);
|
|
fseek(fp, -1, SEEK_CUR);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
* Get from the input file the size of the
|
|
* switch-to-switch hashtable dedicated for
|
|
* the input switch LID.
|
|
|
|
* @Params fp - Descriptor of the input file.
|
|
* @Param switch_lid - the source switch local ID (LID).
|
|
* @Param hash_table_size - Pointer to the hashtable size.
|
|
* Value returned by this routine.
|
|
* @Param head - pointer to a linked list containing the pairs
|
|
* to be stored in the hashtable.
|
|
* @return - Error code. Non zero value for failure.
|
|
*/
|
|
static int get_switch_to_switch_hashtable_size_from_file(FILE* fp, uint16_t switch_lid, int* hash_table_size, switch_to_switch_sl** head)
|
|
{
|
|
int i;
|
|
char c;
|
|
int num_items;
|
|
|
|
int port;
|
|
uint64_t guid;
|
|
uint16_t source_lid;
|
|
uint16_t dest_lid;
|
|
|
|
int rc = OMPI_SUCCESS;
|
|
int ret = OMPI_SUCCESS;uint8_t service_level;
|
|
|
|
switch_to_switch_sl* item = NULL;
|
|
switch_to_switch_sl* p_head = NULL;
|
|
switch_to_switch_sl* p_next_item = NULL;
|
|
|
|
int table_offset = 0;
|
|
int offset_in_table = 0;
|
|
|
|
char str[MAX_LINE_LEN] = "\0";
|
|
char input_str[NUM_OF_TOKENS][MAX_LINE_LEN] = {"\0"};
|
|
char expected_str[NUM_OF_TOKENS][MAX_LINE_LEN] = {"\0"};
|
|
|
|
|
|
/* Init expected strings */
|
|
strcpy(expected_str[0], "Switch");
|
|
strcpy(expected_str[1], "base");
|
|
strcpy(expected_str[2], "LID");
|
|
strcpy(expected_str[3], "port");
|
|
|
|
|
|
/* Allocate empty list */
|
|
p_head = (switch_to_switch_sl*)calloc(1, sizeof(switch_to_switch_sl));
|
|
if(NULL == p_head){
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
return rc;
|
|
}
|
|
*head = p_head;
|
|
|
|
c = fgetc(fp);
|
|
fseek(fp, -1, SEEK_CUR);
|
|
|
|
/* Read info */
|
|
while(EOF != c){
|
|
|
|
/* Go over the switch-to-switch routing tables until the requested
|
|
* table dedicated for the input switch_lid is found */
|
|
ret = fscanf(fp, "%s %" PRIx64 " %c", input_str[0], &guid, &c);
|
|
ret += fscanf(fp, "%s %s %hx %c", input_str[1], input_str[2], &source_lid, &c);
|
|
ret += fscanf(fp, "%s %s %d", input_str[4], input_str[3], &port);
|
|
c = fgetc(fp);
|
|
|
|
if(10 != ret)
|
|
{
|
|
rc = OMPI_ERR_FILE_READ_FAILURE;
|
|
return rc;
|
|
}
|
|
|
|
for(i = 0; i < 4; i++){
|
|
/* Validate the table header correctness */
|
|
if(strncmp(input_str[i], expected_str[i], strlen(input_str[i]))){
|
|
/* Incorrect table header */
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
}
|
|
|
|
/* Get next line acording to the currect structure of the file */
|
|
fgets(str, MAX_LINE_LEN, fp);
|
|
if(strncmp(str, "# LID : SL : MTU : RATE", strlen(str) - 1)){
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
|
|
/* Test if this is the requested table,
|
|
* dedicated for the input source switch lid */
|
|
if(source_lid != switch_lid){
|
|
/* Skip to next table */
|
|
|
|
while(EOF != c)
|
|
{
|
|
offset_in_table = ftell(fp);
|
|
fgets(str, MAX_LINE_LEN, fp);
|
|
if(!strncmp(str, "Switch", strlen("Switch"))){
|
|
/* Found new table found - start over */
|
|
fseek(fp, offset_in_table, SEEK_SET);
|
|
break;
|
|
}
|
|
/* Receive next charecter */
|
|
c = fgetc(fp);
|
|
fseek(fp, -1, SEEK_CUR);
|
|
}
|
|
if(EOF == c){
|
|
/* End-Of-File was met without
|
|
* finding the required routing table*/
|
|
rc = OMPI_ERROR;
|
|
}
|
|
}
|
|
else{
|
|
/* The right table was found */
|
|
while(EOF != c){
|
|
|
|
fgets(str, MAX_LINE_LEN, fp);
|
|
|
|
/* Test if a new table was found */
|
|
if(!strncmp(str, "Switch", strlen("Switch"))){
|
|
/* Quit the search - table was fully read */
|
|
return rc;
|
|
}
|
|
/* Still in the required switch route table */
|
|
else{
|
|
/* Check correcness of the data and update table size */
|
|
num_items = sscanf(str, "%hx %c %c", &dest_lid, &c, &service_level);
|
|
if(3 != num_items){
|
|
/* Failed to read input data / wrong input formate */
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
(*hash_table_size)++;
|
|
|
|
/* Add the data to the list*/
|
|
item = (switch_to_switch_sl*)calloc(1, sizeof(switch_to_switch_sl));
|
|
if(NULL == item){
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
return rc;
|
|
}
|
|
item->switch_lid = dest_lid;
|
|
item->service_level = service_level;
|
|
|
|
p_next_item = p_head->next;
|
|
p_head->next = item;
|
|
item->next = p_next_item;
|
|
}
|
|
/* Get next charecter */
|
|
c = fgetc(fp);
|
|
fseek(fp, -1, SEEK_CUR);
|
|
}
|
|
/* Set file descriptor to the beginning
|
|
* of the required table table */
|
|
fseek(fp, table_offset, SEEK_SET);
|
|
}
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
* Set port to switch hashtable according to data read from an input file.
|
|
* The hashtable Key is the port local ID (uint16_t).
|
|
* The hashtable Value is the local ID (uint16_t) of the switch connected to the port in the fabric.
|
|
*
|
|
* @Param hashtable - the hashtable to set.
|
|
* @Param hashtable_size - the number of hashtable elements.
|
|
* @Param head - Pointer to a linked list containing
|
|
* the pairs two be stored in the hashtable.
|
|
* @return - Error code. Non Zero value on error.
|
|
*/
|
|
static int set_port_to_switch_hash_table(opal_hash_table_t* hashtable, size_t hashtable_size, port_to_switch_lids** p_head)
|
|
{
|
|
int ret;
|
|
uint16_t key;
|
|
uint16_t* value = NULL;
|
|
unsigned int i;
|
|
int rc = OMPI_SUCCESS;
|
|
|
|
port_to_switch_lids* head = NULL;
|
|
port_to_switch_lids* p_item = NULL;
|
|
port_to_switch_lids* p_item_next = NULL;
|
|
|
|
|
|
if((NULL == p_head) || (NULL == *p_head)){
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
head = *p_head;
|
|
|
|
for(i = 0; i < hashtable_size; i++){
|
|
|
|
/* Read pairs of port-lid and witch-lid from
|
|
* file and store them in the input hashtable */
|
|
value = (uint16_t*)calloc(1, sizeof(uint16_t));
|
|
if(NULL == value){
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
return rc;
|
|
}
|
|
|
|
/* Get next pair to store */
|
|
p_item = head->next;
|
|
if(NULL == p_item){
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
key = p_item->port_lid;
|
|
*value = p_item->switch_lid;
|
|
/* Remove item from list */
|
|
p_item_next = p_item->next;
|
|
head->next = p_item_next;
|
|
free(p_item);
|
|
|
|
/* Set the port to switch LIDS hashtable */
|
|
ret = opal_hash_table_set_value_ptr(hashtable, &key, sizeof(uint16_t), (void*)value);
|
|
if(OPAL_SUCCESS != ret){
|
|
OFACM_ERROR(("Failed to set port2switch hashtable\n"));
|
|
rc = OMPI_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
|
|
free(*p_head);
|
|
*p_head = NULL;
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
* Set switch to switch hashtable according to data read from an input file.
|
|
* The hashtable Key is a switch local ID (uint16_t).
|
|
* The hashtable Value is the service level (uint8_t) of the route in the
|
|
* fabric between local switch LID (represented by key) and remote switch LID.
|
|
*
|
|
* @Param hashtable - The hashtable to set.
|
|
* @Param hashtable_size - The number of hashtable elements.
|
|
* @Param head - Pointer to a list of all the data
|
|
* pair to be inserted into the hashtable.
|
|
* @return - Error code. Non Zero value on error.
|
|
*/
|
|
static int set_switch_to_switch_hash_table(opal_hash_table_t* hashtable, size_t hashtable_size, switch_to_switch_sl** p_head)
|
|
{
|
|
uint16_t key; /* switch lid */
|
|
uint8_t* value = NULL;
|
|
unsigned int i;
|
|
int rc = OMPI_SUCCESS;
|
|
int ret = OMPI_SUCCESS;
|
|
|
|
switch_to_switch_sl* head = NULL;
|
|
switch_to_switch_sl* item = NULL;
|
|
switch_to_switch_sl* p_next_item = NULL;
|
|
|
|
|
|
if((NULL == p_head) || (NULL == *p_head)){
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
head = *p_head;
|
|
|
|
/* Read pairs of remote switch (LID) and
|
|
* route service level (SL) from file
|
|
* and store the in the input hashtable */
|
|
for(i = 0; i < hashtable_size; i++)
|
|
{
|
|
|
|
value = (uint8_t*)calloc(1, sizeof(uint8_t));
|
|
if(NULL == value){
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
return rc;
|
|
}
|
|
|
|
/* Get data from list */
|
|
item = head->next;
|
|
if(NULL == item){
|
|
rc = OMPI_ERROR;
|
|
return rc;
|
|
}
|
|
key = item->switch_lid;
|
|
*value = item->service_level;
|
|
|
|
/* Remove data item from list */
|
|
p_next_item = item->next;
|
|
head->next = p_next_item;
|
|
free(item);
|
|
|
|
ret = opal_hash_table_set_value_ptr(hashtable, &key, sizeof(uint16_t), value);
|
|
if(OPAL_SUCCESS != ret){
|
|
OFACM_ERROR(("Failed to set sw2sw hashtable\n"));
|
|
rc = OMPI_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
|
|
free(*p_head);
|
|
*p_head = NULL;
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
* An efficient method that allows to find the service level of any
|
|
* any route from an input port to any other port in the fabric.
|
|
*
|
|
* Create two hashtables according to data read from an input file.
|
|
* The first table maps any port LID in the fabric to the LID of
|
|
* the switch it is connected to.
|
|
* The second table is dedicated to the switch LID to which the
|
|
* local port is connected.
|
|
*
|
|
* The table maps a remote switch LID to the service level
|
|
* of the route between the table's LID and this remote LID.
|
|
*
|
|
* @Param lid - the local ID of the port.
|
|
* @return - Error Code. Non Zero value in case of error.
|
|
*/
|
|
static int create_service_level_table_for_port(uint16_t lid, opal_hash_table_t* port_to_switch_hash_table,
|
|
opal_hash_table_t* switch_to_switch_hash_table)
|
|
{
|
|
FILE* fp = NULL;
|
|
uint16_t* switch_lid;
|
|
void* p_switch_lid = NULL;
|
|
|
|
int rc = OMPI_SUCCESS;
|
|
int ret = OMPI_SUCCESS;
|
|
|
|
int file_name_len;
|
|
char* switch_to_sl = NULL;
|
|
|
|
int port_to_switch_hash_table_size = 0;
|
|
int switch_to_switch_hash_table_size = 0;
|
|
|
|
port_to_switch_lids* port_switch_lids = NULL;
|
|
switch_to_switch_sl* switch_sl = NULL;
|
|
|
|
|
|
|
|
/* Open input configuration file */
|
|
fp = fopen(ompi_common_ofacm_three_dim_torus, "rt");
|
|
if(NULL == fp){
|
|
/* File Opening failed */
|
|
fprintf(stderr, "Failed to open the input file for the fabric's service level\n");
|
|
rc = OMPI_ERR_FILE_OPEN_FAILURE;
|
|
goto ERROR;
|
|
}
|
|
|
|
/* Get port-to-switch hashtable size */
|
|
rc = get_port_to_switch_hashtable_data_from_file(fp, &port_to_switch_hash_table_size, &port_switch_lids);
|
|
if(OMPI_SUCCESS != rc){
|
|
goto ERROR;
|
|
}
|
|
fclose(fp);
|
|
fp = NULL;
|
|
|
|
/* Build and initialize the port-to-swich hashtable */
|
|
OBJ_CONSTRUCT(port_to_switch_hash_table, opal_hash_table_t);
|
|
opal_hash_table_init(port_to_switch_hash_table, port_to_switch_hash_table_size);
|
|
|
|
/* Set the port-to-switch hashtable */
|
|
rc = set_port_to_switch_hash_table(port_to_switch_hash_table, port_to_switch_hash_table_size, &port_switch_lids);
|
|
if(OMPI_SUCCESS != rc){
|
|
goto ERROR;
|
|
}
|
|
|
|
/* Get the LID of the switch connected to the port's LID */
|
|
ret = opal_hash_table_get_value_ptr(port_to_switch_hash_table, &lid, sizeof(uint16_t), &p_switch_lid);
|
|
if(OPAL_SUCCESS != ret){
|
|
rc = OMPI_ERROR;
|
|
goto ERROR;
|
|
}
|
|
|
|
|
|
/* Open the file containing the mapping from switch-to-switch route to service level */
|
|
file_name_len = strlen(ompi_common_ofacm_three_dim_torus);
|
|
switch_to_sl = (char*)calloc(file_name_len + 7, sizeof(char));
|
|
if(NULL == switch_to_sl){
|
|
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
|
goto ERROR;
|
|
}
|
|
/* Build the switch-to-switch file name based on the port-to-switch file name */
|
|
strncpy(switch_to_sl, ompi_common_ofacm_three_dim_torus,
|
|
strlen(ompi_common_ofacm_three_dim_torus) - strlen("peer-paths.dump") - 1);
|
|
strcat(switch_to_sl, "-sw2sw-path-records.dump");
|
|
|
|
/* Open path-to-SL file */
|
|
fp = fopen(switch_to_sl, "rt");
|
|
if(NULL == fp){
|
|
/* File Opening failed */
|
|
fprintf(stderr, "Failed to open the input file for the fabric's service level\n");
|
|
rc = OMPI_ERR_FILE_OPEN_FAILURE;
|
|
goto ERROR;
|
|
}
|
|
free(switch_to_sl);
|
|
|
|
switch_lid = (uint16_t*)p_switch_lid;
|
|
rc = get_switch_to_switch_hashtable_size_from_file(fp, *(uint16_t*)switch_lid,
|
|
&switch_to_switch_hash_table_size, &switch_sl);
|
|
if(OMPI_SUCCESS != rc){
|
|
|
|
goto ERROR;
|
|
}
|
|
fclose(fp);
|
|
fp = NULL;
|
|
|
|
/* Build and initialize the switch-to-switch hashtable */
|
|
OBJ_CONSTRUCT(switch_to_switch_hash_table, opal_hash_table_t);
|
|
opal_hash_table_init(switch_to_switch_hash_table, switch_to_switch_hash_table_size);
|
|
|
|
/* Set the switch-to-switch hashtable */
|
|
rc = set_switch_to_switch_hash_table(switch_to_switch_hash_table,
|
|
switch_to_switch_hash_table_size, &switch_sl);
|
|
if(OMPI_SUCCESS != rc){
|
|
goto ERROR;
|
|
}
|
|
|
|
|
|
/* Use: opal_hash_table_get_value_uint64 */
|
|
return OMPI_SUCCESS;
|
|
ERROR:
|
|
/* Close open files */
|
|
if(NULL != fp){
|
|
fclose(fp);
|
|
}
|
|
/* Release allocated resources */
|
|
if(NULL != port_switch_lids){
|
|
port_to_switch_lids* p_list = port_switch_lids;
|
|
port_to_switch_lids* p_item = NULL;
|
|
while(p_list->next != NULL){
|
|
p_item = p_list->next;
|
|
if(NULL != p_item){
|
|
p_list->next = p_item->next;
|
|
free(p_item);
|
|
}
|
|
}
|
|
free(p_list);
|
|
}
|
|
if(NULL != switch_sl){
|
|
switch_to_switch_sl* p_list = switch_sl;
|
|
switch_to_switch_sl* p_item = NULL;
|
|
while(p_list->next != NULL){
|
|
p_item = p_list->next;
|
|
if(NULL != p_item){
|
|
p_list->next = p_item->next;
|
|
free(p_item);
|
|
}
|
|
}
|
|
free(p_list);
|
|
}
|
|
return rc;
|
|
}
|
|
|