/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory.  All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

/*
 * NOTE(review): one system #include in this copy of the file had lost its
 * header name. The standard headers below cover the libc names this file
 * uses (errno, calloc/free, memset/strerror) - confirm against upstream
 * whether an additional header is expected here.
 */
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/common/ofacm/connect.h"

#include "opal/threads/mutex.h"
#include "opal/class/opal_object.h"

#include "bcol_iboffload.h"
#include "bcol_iboffload_frag.h"
#include "bcol_iboffload_device.h"
#include "bcol_iboffload_endpoint.h"

/*
 * Class constructor for mca_bcol_iboffload_endpoint_t.
 *
 * Puts every field that the destructor reads into a defined state, allocates
 * the zero-filled per-QP array (one entry per component QP) and constructs
 * the endpoint lock and the pending fragments list.
 *
 * The calloc() result is NOT checked here; callers must test ep->qps after
 * OBJ_NEW (mca_bcol_iboffloads_create_endpoints() does).
 */
static void mca_bcol_iboffload_endpoint_construct(mca_bcol_iboffload_endpoint_t *ep)
{
    ep->iboffload_module = NULL;
    ep->ibnet_proc = NULL;

    /* The destructor reads both pointers, so they must never be left
     * indeterminate on an endpoint that was constructed but not initialized */
    ep->device = NULL;
    ep->endpoint_cpc = NULL;

    ep->qps = (mca_bcol_iboffload_endpoint_qp_t *)
        calloc(mca_bcol_iboffload_component.num_qps,
               sizeof(mca_bcol_iboffload_endpoint_qp_t));

    ep->index = 0;

    OBJ_CONSTRUCT(&ep->endpoint_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ep->pending_frags, opal_list_t);

    memset(ep->recv_cq, 0, IBOFFLOAD_CQ_LAST * sizeof(ep->recv_cq[0]));
    memset(&ep->qp_config, 0, sizeof(ompi_common_ofacm_base_qp_config_t));

    ep->cpc_context = NULL;

    memset(&ep->remote_zero_rdma_addr, 0, sizeof(mca_bcol_iboffload_rdma_info_t));
    memset(&ep->remote_rdma_block, 0, sizeof(mca_bcol_iboffload_rem_rdma_block_t));

    ep->need_toset_remote_rdma_info = false;
}

/*
 * Class destructor for mca_bcol_iboffload_endpoint_t.
 *
 * Returns every preposted fragment to the device free list of its QP,
 * destructs the per-QP lists, frees the QP array, destructs the lock and the
 * pending list, calls the CPC finalize hook (when a CPC was attached and it
 * provides one) and finally destroys the receive CQs that were created.
 */
static void mca_bcol_iboffload_endpoint_destruct(mca_bcol_iboffload_endpoint_t *ep)
{
    int qp_index, num_qps, i, rc;
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    num_qps = cm->num_qps;

    IBOFFLOAD_VERBOSE(10, ("Destruct: ep - %p, ep->index - %d", ep, ep->index));

    if (NULL != ep->qps) {
        for (qp_index = 0; qp_index < num_qps; ++qp_index) {
            /* Drain the list: hand each preposted fragment back to the
             * free list it belongs to */
            do {
                item = (ompi_free_list_item_t *)
                    opal_list_remove_first(&ep->qps[qp_index].preposted_frags);
                if(OPAL_LIKELY(NULL != item)) {
                    OMPI_FREE_LIST_RETURN_MT(&ep->device->frags_free[qp_index], item);
                }
            } while (NULL != item);

            OBJ_DESTRUCT(&ep->qps[qp_index].preposted_frags);
        }

        free(ep->qps);
        ep->qps = NULL;
    }

    OBJ_DESTRUCT(&ep->endpoint_lock);
    OBJ_DESTRUCT(&ep->pending_frags);

    /* If a CPC was attached and it has an endpoint_finalize function, call it.
     * endpoint_cpc stays NULL when mca_bcol_iboffload_endpoint_init() never
     * reached the point where the CPC is cached. */
    if (NULL != ep->endpoint_cpc &&
            NULL != ep->endpoint_cpc->cbm_endpoint_finalize) {
        ep->endpoint_cpc->cbm_endpoint_finalize(ep->cpc_context);
    }

    for (i = 0; i < IBOFFLOAD_CQ_LAST; i++) {
        if (NULL != ep->recv_cq[i]) {
            /* ibv_destroy_cq() reports the failure reason through its
             * return value */
            rc = ibv_destroy_cq(ep->recv_cq[i]);
            if (0 != rc) {
                IBOFFLOAD_ERROR(("Endpoint %p "
                                 ", failed to destroy CQ, errno says %s",
                                 (void *) ep, strerror(rc)));
            }
        }
    }
}

OBJ_CLASS_INSTANCE(mca_bcol_iboffload_endpoint_t,
                   opal_list_item_t,
                   mca_bcol_iboffload_endpoint_construct,
                   mca_bcol_iboffload_endpoint_destruct);

/* Pasha: Add some error message here */

/*
 * Error callback handed to the CPC in mca_bcol_iboffload_endpoint_init().
 * 'context' is the endpoint pointer; the error is only logged.
 */
static void mca_bcol_iboffload_endpoint_invoke_error(void *context)
{
    mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context;
    IBOFFLOAD_ERROR(("Getting error on endpoint - %p!", endpoint));
}

/* Pasha: Need to add more logic here */

/*
 * Called when the CPC has established a connection on an endpoint.
 * 'context' is the endpoint pointer. Starts the remote address exchange
 * (a failure is logged but not propagated) and then decrements the progress
 * event users counter.
 */
static void mca_bcol_iboffload_endpoint_cpc_complete(void *context)
{
    mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context;

    IBOFFLOAD_VERBOSE(10, ("Endpoint - %p for comm rank %d: CPC complete.\n",
                           endpoint,
                           endpoint->iboffload_module->ibnet->super.group_list[endpoint->index]));

    if (OMPI_SUCCESS != mca_bcol_iboffload_exchange_rem_addr(endpoint)) {
        IBOFFLOAD_ERROR(("endpoint - %p, "
                         "remote addr exchange error.\n", endpoint));
    }

    /* The connection is correctly setup. Now we can decrease the
       event trigger. */
    opal_progress_event_users_decrement();
}

/* Vasily: Need to add more logic here */

/*
 * Prepost receive fragments on every QP of the endpoint.
 *
 * 'context' is the endpoint pointer. For each component QP the configured
 * rd_num fragments are preposted.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR as soon as preposting fails on one QP
 * (the remaining QPs are not attempted).
 */
int mca_bcol_iboffload_endpoint_post_recvs(void *context)
{
    int qp_index, rc, num_qps;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_endpoint_t *endpoint = (mca_bcol_iboffload_endpoint_t *) context;

    IBOFFLOAD_VERBOSE(10, ("endpoint - %p, post of %d recvs !",
                           endpoint, cm->qp_infos[0].rd_num));

    /* TODO Pasha - fix later */
    num_qps = cm->num_qps;
    for (qp_index = 0; qp_index < num_qps; ++qp_index) {
        rc = mca_bcol_iboffload_prepost_recv(endpoint, qp_index,
                                             cm->qp_infos[qp_index].rd_num);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            /* Pasha: Need to add more failure logic */
            IBOFFLOAD_ERROR(("Failed to prepost recv fragments "
                             "on qp index %d, return code - %d",
                             qp_index, rc));
            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS;
}

/*
 * The function goes over each ibnet proc of the connection group and creates
 * an endpoint for each one, storing them in module->endpoints.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR on bad parameters, allocation failure,
 * a missing proc pointer or an endpoint init failure.
 *
 * NOTE: on failure the endpoint being built, the endpoints created so far and
 * the module->endpoints array are not released here.
 */
int mca_bcol_iboffloads_create_endpoints(mca_sbgp_ibnet_connection_group_info_t *cgroup,
                                         mca_bcol_iboffload_module_t *module)
{
    uint32_t i;
    mca_bcol_iboffload_endpoint_t *ep;

    if (NULL == cgroup || NULL == module) {
        IBOFFLOAD_ERROR(("Bad parameters for create endpoints function."));
        return OMPI_ERROR;
    }

    module->num_endpoints = cgroup->num_procs;
    module->endpoints = (mca_bcol_iboffload_endpoint_t **)
        calloc(module->num_endpoints,
               sizeof(mca_bcol_iboffload_endpoint_t *));
    if (NULL == module->endpoints) {
        IBOFFLOAD_ERROR(("Error memory allocation for endpoints array"
                         ", errno says %s", strerror(errno)));
        return OMPI_ERROR;
    }

    IBOFFLOAD_VERBOSE(10, ("iboffload - %p, num of endpoints - %d.\n",
                           module, module->num_endpoints));

    /* Ishai: No need to open so many endpoints. We are not talking with all procs */
    for (i = 0; i < cgroup->num_procs; i++) {
        ep = OBJ_NEW(mca_bcol_iboffload_endpoint_t);
        /* check qp memory allocation */
        if (NULL == ep->qps) {
            IBOFFLOAD_ERROR(("Failed to allocate memory for qps"));
            return OMPI_ERROR;
        }

        /* init new endpoint */
        ep->index = i;
        ep->iboffload_module = module;

        /* saving the device for the destruction - the iboffload module
           may not exist then */
        ep->device = ep->iboffload_module->device;

        ep->ibnet_proc = (mca_sbgp_ibnet_proc_t *)
            opal_pointer_array_get_item(cgroup->ibnet_procs, i);
        if (NULL == ep->ibnet_proc) {
            IBOFFLOAD_ERROR(("Failed to get proc pointer, for index %u",
                             (unsigned int) i));
            return OMPI_ERROR;
        }

        if (OMPI_SUCCESS != mca_bcol_iboffload_endpoint_init(ep)) {
            IBOFFLOAD_ERROR(("Failed to init endpoint - %p", ep));
            return OMPI_ERROR;
        }

        IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, ep index - %d, iboffload - %p, "
                               "cpc contex - %p.\n",
                               ep, ep->index, ep->iboffload_module, ep->cpc_context));

        /* Add the new endpoint to array of endpoints */
        module->endpoints[i] = ep;
    }

    /* Pasha: Need to add better clean-up here */
    return OMPI_SUCCESS;
}

/*
 * Fill ep->qp_config for the component's QPs.
 *
 * Allocates zero-filled per-QP arrays (init attributes, attributes and the
 * init/RTR/RTS attribute masks) and then lets each QP's config_qp callback,
 * when one is registered, fill in its entry.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when an allocation
 * fails; in that case every array allocated so far is freed and the
 * corresponding qp_config pointers are reset to NULL.
 */
static int config_qps(mca_bcol_iboffload_endpoint_t *ep)
{
    int qp_index;

    ompi_common_ofacm_base_qp_config_t *qp_config = &ep->qp_config;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    qp_config->num_srqs = 0;
    qp_config->srq_num = NULL;

    qp_config->num_qps = cm->num_qps;

    /* Known state for all arrays, so that the error path can free them
     * unconditionally */
    qp_config->init_attr = NULL;
    qp_config->attr = NULL;
    qp_config->init_attr_mask = NULL;
    qp_config->rtr_attr_mask = NULL;
    qp_config->rts_attr_mask = NULL;

    qp_config->init_attr = (struct ibv_qp_init_attr *)
        calloc(qp_config->num_qps, sizeof(struct ibv_qp_init_attr));
    if (NULL == qp_config->init_attr) {
        IBOFFLOAD_ERROR(("Failed allocate memory for qp init attributes"));
        goto config_qps_error;
    }

    qp_config->attr = (struct ibv_qp_attr *)
        calloc(qp_config->num_qps, sizeof(struct ibv_qp_attr));
    if (OPAL_UNLIKELY(NULL == qp_config->attr)) {
        IBOFFLOAD_ERROR(("Failed allocate memory for qp attributes"));
        goto config_qps_error;
    }

    /* we must to specify that the qps are special */
    qp_config->init_attr_mask = (uint32_t *)
        calloc(qp_config->num_qps, sizeof(uint32_t));
    if (OPAL_UNLIKELY(NULL == qp_config->init_attr_mask)) {
        IBOFFLOAD_ERROR(("Failed allocate memory for qp mask."));
        goto config_qps_error;
    }

    /* qp_config->rtr_attr_mask = qp_config->rts_attr_mask = NULL; */

    qp_config->rtr_attr_mask = (uint32_t *)
        calloc(qp_config->num_qps, sizeof(uint32_t));
    if (OPAL_UNLIKELY(NULL == qp_config->rtr_attr_mask)) {
        IBOFFLOAD_ERROR(("Failled allocate memory for qp rtr attributes mask."));
        goto config_qps_error;
    }

    qp_config->rts_attr_mask = (uint32_t *)
        calloc(qp_config->num_qps, sizeof(uint32_t));
    if (OPAL_UNLIKELY(NULL == qp_config->rts_attr_mask)) {
        IBOFFLOAD_ERROR(("Failled allocate memory for qp rts attributes mask."));
        goto config_qps_error;
    }

    for (qp_index = 0; qp_index < qp_config->num_qps; ++qp_index) {
        mca_bcol_iboffload_config_qps_fn_t config_qp =
            cm->qp_infos[qp_index].config_qp;

        if (NULL != config_qp) {
            config_qp(qp_index, ep, qp_config);
        }
    }

    return OMPI_SUCCESS;

config_qps_error:
    /* free(NULL) is a no-op, so arrays that were never allocated are fine */
    free(qp_config->rts_attr_mask);
    free(qp_config->rtr_attr_mask);
    free(qp_config->init_attr_mask);
    free(qp_config->attr);
    free(qp_config->init_attr);

    qp_config->rts_attr_mask = NULL;
    qp_config->rtr_attr_mask = NULL;
    qp_config->init_attr_mask = NULL;
    qp_config->attr = NULL;
    qp_config->init_attr = NULL;

    return OMPI_ERR_OUT_OF_RESOURCE;
}

/*
 * The function is called for endpoints
 * with MCA_COMMON_OFACM_USER_CUSTOM state only,
 * we need a OPAL_THREAD_LOCK before call to this function.
 *
 * Steps: adjust/create the receive CQ for every CQ index, configure the QPs,
 * look up and cache the local CPC of the endpoint's remote port and, when
 * the CPC provides cbm_endpoint_init, create the CPC context and cache the
 * remote info and per-QP pointers from it.
 *
 * ep->iboffload_module and ep->ibnet_proc must already be set by the caller.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR on CQ, QP-configuration or CPC context
 * failure. The endpoint is not released on failure.
 */
int mca_bcol_iboffload_endpoint_init(mca_bcol_iboffload_endpoint_t *ep)
{
    int qp_index, cq_index, num_qps;
    ompi_common_ofacm_base_module_t *cpc;

    mca_bcol_iboffload_device_t *device = ep->iboffload_module->device;

    mca_sbgp_ibnet_connection_group_info_t *cgroup =
        &ep->iboffload_module->ibnet->cgroups[ep->iboffload_module->cgroup_index];

    for (cq_index = 0; cq_index < IBOFFLOAD_CQ_LAST; cq_index++) {
        if (OMPI_SUCCESS !=
                mca_bcol_iboffload_adjust_cq(device, &ep->recv_cq[cq_index])) {
            IBOFFLOAD_ERROR(("Error creating CQ for %s errno says %s",
                             ibv_get_device_name(device->dev.ib_dev), strerror(errno)));
            /* OBJ_RELEASE(ep); */ /* Vasily: What must we do in this case ??? */
            return OMPI_ERROR;
        }
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != config_qps(ep))) {
        IBOFFLOAD_ERROR(("Error configure QPs for endpoint %p errno says %s",
                         (void *) ep, strerror(errno)));
        return OMPI_ERROR;
    }

    /* Adding here one more redirection in critical path. Need to think
     * what is the best way to prevent it */

    IBOFFLOAD_VERBOSE(10, ("Endpoint - %p, rem port - %d", ep,
                           ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].id));

    cpc = ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].local_cpc;
    ep->endpoint_cpc = cpc; /* caching pointer to cpc */

    if (NULL != cpc->cbm_endpoint_init) {
        ep->cpc_context = cpc->cbm_endpoint_init(
                ep->ibnet_proc->ompi_proc,
                &ep->qp_config,
                device->ib_pd,
                ep->iboffload_module->subnet_id,
                ep->iboffload_module->ibnet->group_id,
                ep->iboffload_module->lid,
                /* Remote lid of target module */
                ep->ibnet_proc->remote_ports_info[BCOL_IBOFFLOAD_ENDPOINT_PORT_IDX(cgroup, ep)].lid,
                ep->index,   /* user context index */
                (void *) ep, /* user context */
                cpc,
                mca_bcol_iboffload_endpoint_cpc_complete,
                mca_bcol_iboffload_endpoint_invoke_error,
                mca_bcol_iboffload_endpoint_post_recvs);

        if (OPAL_UNLIKELY(NULL == ep->cpc_context)) {
            IBOFFLOAD_ERROR(("Endpoint - %p, failed to init context", ep));
            /* OBJ_RELEASE(ep); */ /* Vasily: What must we do in this case ??? */
            return OMPI_ERROR;
        }

        /* Updating remote port info */
        num_qps = mca_bcol_iboffload_component.num_qps;

        ep->remote_info = &ep->cpc_context->remote_info;
        for (qp_index = 0; qp_index < num_qps; ++qp_index) {
            ep->qps[qp_index].qp = &ep->cpc_context->qps[qp_index];
        }
    }

    return OMPI_SUCCESS;
}