diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 4de824f39c..b513421477 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -343,7 +343,7 @@ int mca_btl_openib_add_procs( rem_subnet_id_port_cnt = 0; BTL_VERBOSE(("got %d port_infos ", ib_proc->proc_port_count)); for (j = 0; j < (int) ib_proc->proc_port_count; j++){ - BTL_VERBOSE(("got a subnet %016x", + BTL_VERBOSE(("got a subnet %016" PRIx64, ib_proc->proc_ports[j].pm_port_info.subnet_id)); if (ib_proc->proc_ports[j].pm_port_info.subnet_id == openib_btl->port_info.subnet_id) { diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 80036f7d4e..e50afdb0d9 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -500,17 +500,29 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca, #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) if (IBV_TRANSPORT_IWARP == hca->ib_dev->transport_type) { subnet_id = mca_btl_openib_get_iwarp_subnet_id(hca->ib_dev); + BTL_VERBOSE(("my iWARP subnet_id is %016" PRIx64, subnet_id)); } else { - ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid); + memset(&gid, 0, sizeof(gid)); + if (0 != ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid)) { + BTL_ERROR(("ibv_query_gid failed (%s:%d)\n", + ibv_get_device_name(hca->ib_dev), port_num)); + return OMPI_ERR_NOT_FOUND; + } subnet_id = ntoh64(gid.global.subnet_prefix); + BTL_VERBOSE(("my IB subnet_id for HCA %s port %d is %016" PRIx64, + ibv_get_device_name(hca->ib_dev), port_num, subnet_id)); } #else - ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid); + if (0 != ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid)) { + BTL_ERROR(("ibv_query_gid failed (%s:%d)\n", + ibv_get_device_name(hca->ib_dev), port_num)); + return OMPI_ERR_NOT_FOUND; + } subnet_id = ntoh64(gid.global.subnet_prefix); + BTL_VERBOSE(("my IB-only subnet_id for HCA %s port %d is %016" PRIx64, + ibv_get_device_name(hca->ib_dev), port_num, subnet_id)); #endif - BTL_VERBOSE(("my subnet_id is %016x", subnet_id)); - if(mca_btl_openib_component.ib_num_btls > 0 && IB_DEFAULT_GID_PREFIX == subnet_id && mca_btl_openib_component.warn_default_gid_prefix) { diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c index 5c7db96422..2735af34b4 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c @@ -908,7 +908,7 @@ static int fill_path_record(ibcm_module_t *m, /* Global attributes */ path_rec->dgid.global.subnet_prefix = path_rec->sgid.global.subnet_prefix = - m->btl->port_info.subnet_id; + hton64(m->btl->port_info.subnet_id); path_rec->dgid.global.interface_id = hton64(remote_msg->mm_port_guid); path_rec->sgid.global.interface_id = hton64(local_msg->mm_port_guid); path_rec->dlid = htons(endpoint->rem_info.rem_lid); @@ -1065,7 +1065,7 @@ static ibcm_request_t *alloc_request(ibcm_module_t *m, modex_msg_t *msg, { struct ib_cm_req_param *cm_req; ibcm_request_t *req = OBJ_NEW(ibcm_request_t); - BTL_VERBOSE(("allocated cached req id: %p", (void*)req)); + BTL_VERBOSE(("allocated cached req id: 0x%" PRIx64, (void*)req)); if (NULL == req) { return NULL; @@ -1079,6 +1079,7 @@ static ibcm_request_t *alloc_request(ibcm_module_t *m, modex_msg_t *msg, OBJ_RELEASE(req); return NULL; } + BTL_VERBOSE(("created CM ID 0x%" PRIx64, &(req->super.cm_id))); /* This data is constant for all the QP's */ req->path_rec = *path_rec; @@ -1104,6 +1105,29 @@ static ibcm_request_t *alloc_request(ibcm_module_t *m, modex_msg_t *msg, return req; } + + +static void print_req(struct ib_cm_req_param *cm_req) +{ + BTL_VERBOSE(("cm_req->primary_path: 0x%" PRIx64, cm_req->primary_path)); + BTL_VERBOSE(("cm_req->alternate_path: 0x%" PRIx64, cm_req->alternate_path)); + BTL_VERBOSE(("cm_req->service_id: 0x016%" PRIx64, cm_req->service_id)); + BTL_VERBOSE(("cm_req->qp_num: %d", cm_req->qp_num)); + BTL_VERBOSE(("cm_req->qp_type: %d", cm_req->qp_type)); + BTL_VERBOSE(("cm_req->starting_psn: %d", cm_req->starting_psn)); + BTL_VERBOSE(("cm_req->private_data: %" PRIx64, cm_req->private_data)); + BTL_VERBOSE(("cm_req->private_data_len: %d", cm_req->private_data_len)); + BTL_VERBOSE(("cm_req->peer_to_peer: %d", cm_req->peer_to_peer)); + BTL_VERBOSE(("cm_req->responder_resources: %d", cm_req->responder_resources)); + BTL_VERBOSE(("cm_req->initiator_depth: %d", cm_req->initiator_depth)); + BTL_VERBOSE(("cm_req->remote_cm_response_timeout: %d", cm_req->remote_cm_response_timeout)); + BTL_VERBOSE(("cm_req->flow_control: %d", cm_req->flow_control)); + BTL_VERBOSE(("cm_req->local_cm_response_timeout: %d", cm_req->local_cm_response_timeout)); + BTL_VERBOSE(("cm_req->retry_count: %d", cm_req->retry_count)); + BTL_VERBOSE(("cm_req->rnr_retry_count: %d", cm_req->rnr_retry_count)); + BTL_VERBOSE(("cm_req->max_cm_retries: %d", cm_req->max_cm_retries)); + BTL_VERBOSE(("cm_req->srq: %d", cm_req->srq)); +} static int ibcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc, mca_btl_base_endpoint_t *endpoint) @@ -1228,7 +1252,14 @@ static int ibcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc, BTL_VERBOSE(("sending connect request %d of %d (id %p)", i, mca_btl_openib_component.num_qps, (void*)req->super.cm_id)); - if (0 != ib_cm_send_req(req->super.cm_id, cm_req)) { + if (mca_btl_base_verbose > 0) { + print_req(cm_req); + } + if (0 != (rc = ib_cm_send_req(req->super.cm_id, cm_req))) { + BTL_VERBOSE(("Got nonzero return from ib_cm_send_req: %d, errno %d", rc, errno)); + if (-1 == rc) { + perror("Errno is "); + } rc = OMPI_ERR_UNREACH; goto err; } @@ -1263,19 +1294,26 @@ static int ibcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc, req->private_data.ireqd_request = req; req->private_data.ireqd_qp_index = 0; + if (mca_btl_base_verbose > 0) { + print_req(cm_req); + } /* Send the request */ if (0 != (rc = ib_cm_send_req(req->super.cm_id, cm_req))) { - return OMPI_ERR_UNREACH; + BTL_VERBOSE(("Got nonzero return from ib_cm_send_req: %d", rc)); + rc = OMPI_ERR_UNREACH; + goto err; } /* Save the request on the global "pending requests" list */ opal_list_append(&ibcm_pending_requests, &(req->super.super)); } + BTL_VERBOSE(("connect request send successfully")); return OMPI_SUCCESS; err: + BTL_VERBOSE(("error!")); if (NULL != ie && NULL != ie->ie_cm_id_cache) { free(ie->ie_cm_id_cache); ie->ie_cm_id_cache = NULL; @@ -1853,6 +1891,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh, } cbdata->cscd_cpc = (ompi_btl_openib_connect_base_module_t *) imodule; cbdata->cscd_endpoint = endpoint; + BTL_VERBOSE(("starting connect in other direction")); ompi_btl_openib_fd_schedule(callback_start_connect, cbdata); return OMPI_SUCCESS;