/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "ompi_config.h" #include #include #include "ompi/types.h" #include "opal/align.h" #include "orte/mca/rml/rml.h" #include "orte/mca/errmgr/errmgr.h" #include "opal/dss/dss.h" #include "opal/class/opal_pointer_array.h" #include "ompi/class/ompi_free_list.h" #include "ompi/mca/mpool/grdma/mpool_grdma.h" #include "ompi/mca/dpm/dpm.h" #include "ompi/mca/btl/base/btl_base_error.h" #include "btl_udapl.h" #include "btl_udapl_endpoint.h" #include "btl_udapl_frag.h" #include "btl_udapl_mca.h" #include "btl_udapl_proc.h" static void mca_btl_udapl_endpoint_send_cb( int status, orte_process_name_t* endpoint, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); static int mca_btl_udapl_start_connect(mca_btl_base_endpoint_t* endpoint); static int mca_btl_udapl_endpoint_post_recv( mca_btl_udapl_endpoint_t* endpoint, size_t size); void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint); void mca_btl_udapl_endpoint_recv( int status, orte_process_name_t* endpoint, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); static int mca_btl_udapl_endpoint_finish_eager(mca_btl_udapl_endpoint_t*); static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t*); static mca_btl_base_endpoint_t* 
mca_btl_udapl_find_endpoint_connection_match(
    struct mca_btl_udapl_module_t* btl,
    DAT_EP_HANDLE ep);
static int mca_btl_udapl_endpoint_pd_finish_eager(
    mca_btl_udapl_endpoint_t* endpoint);
static int mca_btl_udapl_endpoint_pd_finish_max(
    mca_btl_udapl_endpoint_t* endpoint);
static int mca_btl_udapl_endpoint_pd_connections_completed(
    mca_btl_udapl_endpoint_t* endpoint);
static void mca_btl_udapl_endpoint_connect_eager_rdma(
    mca_btl_udapl_endpoint_t* endpoint);
static int mca_btl_udapl_endpoint_write_eager(
    mca_btl_base_endpoint_t* endpoint,
    mca_btl_udapl_frag_t* frag);
static void mca_btl_udapl_endpoint_control_send_cb(mca_btl_base_module_t* btl,
    mca_btl_base_endpoint_t* endpoint,
    mca_btl_base_descriptor_t* descriptor,
    int status);
static int mca_btl_udapl_endpoint_send_eager_rdma(
    mca_btl_base_endpoint_t* endpoint);
extern void mca_btl_udapl_frag_progress_pending(
    mca_btl_udapl_module_t* udapl_btl,
    mca_btl_base_endpoint_t* endpoint,
    const int connection);

/*
 * Write a fragment into the peer's eager RDMA region with a single
 * RDMA write.
 *
 * The slot index is read from endpoint_eager_rdma_remote.head, which is
 * then advanced.  The payload, the footer, alignment padding and the RDMA
 * footer are written as one contiguous region positioned so that it ends
 * frag->size bytes into the selected slot.
 *
 * @param endpoint (IN) BTL addressing information
 * @param frag (IN) Fragment to be transferred
 *
 * @return OMPI_SUCCESS or OMPI_ERROR
 */
int mca_btl_udapl_endpoint_write_eager(mca_btl_base_endpoint_t* endpoint,
    mca_btl_udapl_frag_t* frag)
{
    DAT_DTO_COOKIE cookie;
    char* remote_buf;
    DAT_RMR_TRIPLET remote_buffer;
    int rc = OMPI_SUCCESS;
    int pad = 0;
    /* slot to write into; captured before the head index is advanced */
    uint8_t head = endpoint->endpoint_eager_rdma_remote.head;
    /* size of one remote slot: eager frag size rounded up to the
     * DAT optimal alignment */
    size_t size_plus_align = OPAL_ALIGN(
        mca_btl_udapl_component.udapl_eager_frag_size,
        DAT_OPTIMAL_ALIGNMENT,
        size_t);

    /* now that we have the head update it */
    MCA_BTL_UDAPL_RDMA_NEXT_INDEX(endpoint->endpoint_eager_rdma_remote.head);

    MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad,
        (frag->segment.base.seg_len + sizeof(mca_btl_udapl_footer_t)));

    /* set the rdma footer information; the RDMA footer lives directly
     * after payload + footer + pad in the local fragment buffer */
    frag->rdma_ftr = (mca_btl_udapl_rdma_footer_t *)
        ((char *)frag->segment.base.seg_addr.pval +
        frag->segment.base.seg_len +
        sizeof(mca_btl_udapl_footer_t) +
        pad);
    frag->rdma_ftr->active = 1;
    frag->rdma_ftr->size = frag->segment.base.seg_len; /* this is the size
                                                        * the PML wants; the
                                                        * alignment has to be
                                                        * recalculated at the
                                                        * other end */

    /* prep the fragment to be written out */
    frag->type = MCA_BTL_UDAPL_RDMA_WRITE;
    frag->triplet.segment_length = frag->segment.base.seg_len +
        sizeof(mca_btl_udapl_footer_t) +
        pad +
        sizeof(mca_btl_udapl_rdma_footer_t);

    /* set remote_buf to start of the remote write location;
     * compute by first finding the end of the entire fragment
     * and then working way back
     */
    remote_buf = (char *)(endpoint->endpoint_eager_rdma_remote.base.pval) +
        (head * size_plus_align) +
        frag->size -
        frag->triplet.segment_length;

    /* execute transfer with one contiguous write */

    /* establish remote memory region */
    remote_buffer.rmr_context =
        (DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey;
    remote_buffer.target_address = (DAT_VADDR)(uintptr_t)remote_buf;
    remote_buffer.segment_length = frag->triplet.segment_length;

    /* write the data out */
    cookie.as_ptr = frag;
    rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
        1,
        &(frag->triplet),
        cookie,
        &remote_buffer,
        DAT_COMPLETION_DEFAULT_FLAG);
    if(DAT_SUCCESS != rc) {
        char* major;
        char* minor;

        dat_strerror(rc, (const char**)&major,
            (const char**)&minor);
        BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
            major, minor));
        return OMPI_ERROR;
    }

    /* rc equals DAT_SUCCESS here */
    return rc;
}

/*
 * Send a fragment to an endpoint, or queue it when it cannot be sent yet.
 *
 * What happens depends on endpoint->endpoint_state (examined while
 * holding endpoint_lock):
 *   - CONNECTED: an eager-size fragment first takes a local work queue
 *     token, then tries to take an eager RDMA token (RDMA write via
 *     mca_btl_udapl_endpoint_write_eager) and otherwise a send/recv token
 *     (dat_ep_post_send).  A max-size fragment takes a local work queue
 *     token and a send/recv token.  Whenever a token is unavailable the
 *     tokens already taken are given back, the fragment is appended to
 *     the matching pending list and opal_progress() is called after the
 *     lock is released.
 *   - CLOSED: a connection is started, then the fragment is queued.
 *   - CONN_EAGER / CONN_MAX: the fragment is queued.
 *   - FAILED: nothing is sent.
 *
 * @param endpoint (IN) BTL addressing information
 * @param frag (IN) Fragment to be transferred
 *
 * @return OMPI_SUCCESS, OMPI_ERROR when a DAT post fails,
 *         OMPI_ERR_UNREACH for a failed endpoint, or the status
 *         returned by mca_btl_udapl_start_connect() /
 *         mca_btl_udapl_endpoint_write_eager()
 */
int mca_btl_udapl_endpoint_send(mca_btl_base_endpoint_t* endpoint,
    mca_btl_udapl_frag_t* frag)
{
    int rc = OMPI_SUCCESS;
    DAT_RETURN dat_rc;
    DAT_DTO_COOKIE cookie;
    bool call_progress = false;

    /* Fix up the segment length before we do anything with the frag */
    frag->triplet.segment_length =
        frag->segment.base.seg_len + sizeof(mca_btl_udapl_footer_t);

    OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
    switch(endpoint->endpoint_state) {
    case MCA_BTL_UDAPL_CONNECTED:
        /* just send it already.. */
        if(frag->size ==
            mca_btl_udapl_component.udapl_eager_frag_size) {

            if (OPAL_THREAD_ADD32(&endpoint->endpoint_lwqe_tokens[BTL_UDAPL_EAGER_CONNECTION], -1) < 0) {
                /* no local work queue tokens available */
                OPAL_THREAD_ADD32(&endpoint->endpoint_lwqe_tokens[BTL_UDAPL_EAGER_CONNECTION], 1);
                opal_list_append(&endpoint->endpoint_eager_frags,
                    (opal_list_item_t*)frag);
                call_progress = true;

            } else {
                /* work queue tokens available, try to write */
                if(OPAL_THREAD_ADD32(&endpoint->endpoint_eager_rdma_remote.tokens, -1) < 0) {
                    /* no rdma segment available so either send or queue */
                    OPAL_THREAD_ADD32(&endpoint->endpoint_eager_rdma_remote.tokens, 1);

                    if(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION], -1) < 0) {
                        /* no sr tokens available, put on queue */
                        OPAL_THREAD_ADD32(&endpoint->endpoint_lwqe_tokens[BTL_UDAPL_EAGER_CONNECTION], 1);
                        OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION], 1);
                        opal_list_append(&endpoint->endpoint_eager_frags,
                            (opal_list_item_t*)frag);
                        call_progress = true;
                    } else {
                        /* sr tokens available, send eager size frag */
                        cookie.as_ptr = frag;
                        dat_rc = dat_ep_post_send(endpoint->endpoint_eager, 1,
                            &frag->triplet, cookie,
                            DAT_COMPLETION_DEFAULT_FLAG);

                        if(DAT_SUCCESS != dat_rc) {
                            char* major;
                            char* minor;

                            dat_strerror(dat_rc, (const char**)&major,
                                (const char**)&minor);
                            BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send",
                                major, minor));
                            endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED;
                            rc = OMPI_ERROR;
                        }
                    }
                } else {
                    rc = mca_btl_udapl_endpoint_write_eager(endpoint, frag);
                }
            }
        } else {
            assert(frag->size ==
                mca_btl_udapl_component.udapl_max_frag_size);

            if (OPAL_THREAD_ADD32(&endpoint->endpoint_lwqe_tokens[BTL_UDAPL_MAX_CONNECTION], -1) < 0) {

                /* no local work queue tokens available, put on queue */
                OPAL_THREAD_ADD32(&endpoint->endpoint_lwqe_tokens[BTL_UDAPL_MAX_CONNECTION], 1);
                opal_list_append(&endpoint->endpoint_max_frags,
                    (opal_list_item_t*)frag);
                call_progress = true;

            } else {
                /* work queue tokens available, try to send */
                if(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], -1) < 0) {
                    /* no sr tokens available, put on queue */
                    OPAL_THREAD_ADD32(&endpoint->endpoint_lwqe_tokens[BTL_UDAPL_MAX_CONNECTION], 1);
                    OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], 1);
                    opal_list_append(&endpoint->endpoint_max_frags,
                        (opal_list_item_t*)frag);
                    call_progress = true;
                } else {
                    /* sr tokens available, send max size frag */
                    cookie.as_ptr = frag;
                    dat_rc = dat_ep_post_send(endpoint->endpoint_max, 1,
                        &frag->triplet, cookie,
                        DAT_COMPLETION_DEFAULT_FLAG);

                    /* NOTE(review): unlike the eager send path above, a
                     * failure here does not mark the endpoint FAILED */
                    if(DAT_SUCCESS != dat_rc) {
                        char* major;
                        char* minor;

                        dat_strerror(dat_rc, (const char**)&major,
                            (const char**)&minor);
                        BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send",
                            major, minor));
                        rc = OMPI_ERROR;
                    }
                }
            }
        }

        break;
    case MCA_BTL_UDAPL_CLOSED:
        /* Initiate a new connection, add this send to a queue */
        rc = mca_btl_udapl_start_connect(endpoint);
        if(OMPI_SUCCESS != rc) {
            endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED;
            break;
        }

        /* Fall through on purpose to queue the send */
    case MCA_BTL_UDAPL_CONN_EAGER:
    case MCA_BTL_UDAPL_CONN_MAX:
        /* Add this send to a queue */
        if(frag->size ==
            mca_btl_udapl_component.udapl_eager_frag_size) {
            opal_list_append(&endpoint->endpoint_eager_frags,
                (opal_list_item_t*)frag);
        } else {
            assert(frag->size ==
                mca_btl_udapl_component.udapl_max_frag_size);
            opal_list_append(&endpoint->endpoint_max_frags,
                (opal_list_item_t*)frag);
        }

        break;
    case MCA_BTL_UDAPL_FAILED:
        rc = OMPI_ERR_UNREACH;
        break;
    }
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

    /* progress is driven only after the endpoint lock has been dropped */
    if(call_progress) opal_progress();

    return rc;
}

/*
 * Completion callback handed to orte_rml.send_buffer_nb() by
 * mca_btl_udapl_start_connect(); it only releases the sent buffer.
 */
static void mca_btl_udapl_endpoint_send_cb(int status,
    orte_process_name_t* endpoint,
    opal_buffer_t* buffer,
    orte_rml_tag_t tag,
    void* cbdata)
{
    OBJ_RELEASE(buffer);
}

/*
 * Set uDAPL endpoint parameters as required in ep_param.
Accomplished * by retrieving the default set of parameters from temporary (dummy) * endpoint and then setting any other parameters as required by * this BTL. * * @param btl (IN) BTL module * @param ep_param (IN/OUT)Pointer to a valid endpoint parameter location * * @return OMPI_SUCCESS or error status on failure */ int mca_btl_udapl_endpoint_get_params(mca_btl_udapl_module_t* btl, DAT_EP_PARAM* ep_param) { int rc = OMPI_SUCCESS; int request_dtos; int max_control_messages; DAT_EP_HANDLE dummy_ep; DAT_EP_ATTR* ep_attr = &((*ep_param).ep_attr); /* open dummy endpoint, used to find default endpoint parameters */ rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz, btl->udapl_evd_dto, btl->udapl_evd_dto, btl->udapl_evd_conn, NULL, &dummy_ep); if (DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_create", major, minor)); /* this could be recoverable, by just using defaults */ ep_attr = NULL; return OMPI_ERROR; } rc = dat_ep_query(dummy_ep, DAT_EP_FIELD_ALL, ep_param); if (DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_query", major, minor)); /* this could be recoverable, by just using defaults */ ep_attr = NULL; return OMPI_ERROR; } /* Set max_recv_dtos : * The max_recv_dtos should be equal to the number of * outstanding posted receives, which for this BTL will * be mca_btl_udapl_component.udapl_num_recvs. 
*/ if (btl->udapl_max_recv_dtos < mca_btl_udapl_component.udapl_num_recvs) { if (MCA_BTL_UDAPL_MAX_RECV_DTOS_DEFAULT != btl->udapl_max_recv_dtos) { /* user modified, this will fail and is not acceptable */ BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP, ("help-mpi-btl-udapl.txt", "max_recv_dtos too low", true, btl->udapl_max_recv_dtos, mca_btl_udapl_component.udapl_num_recvs)); btl->udapl_max_recv_dtos = mca_btl_udapl_component.udapl_num_recvs; } if (MCA_BTL_UDAPL_NUM_RECVS_DEFAULT != mca_btl_udapl_component.udapl_num_recvs) { /* user modified udapl_num_recvs so adjust max_recv_dtos */ btl->udapl_max_recv_dtos = mca_btl_udapl_component.udapl_num_recvs; } } (*ep_attr).max_recv_dtos = btl->udapl_max_recv_dtos; /* Set max_request_dtos : * The max_request_dtos should equal the max number of * outstanding sends plus RDMA operations. * * Note: Using the same value for both EAGER and MAX * connections even though the MAX connection does not * have the extra RDMA operations that the EAGER * connection does. 
*/ max_control_messages = (mca_btl_udapl_component.udapl_num_recvs / mca_btl_udapl_component.udapl_sr_win) + 1 + (mca_btl_udapl_component.udapl_eager_rdma_num / mca_btl_udapl_component.udapl_eager_rdma_win) + 1; request_dtos = mca_btl_udapl_component.udapl_num_sends + (2*mca_btl_udapl_component.udapl_eager_rdma_num) + max_control_messages; if (btl->udapl_max_request_dtos < request_dtos) { if (MCA_BTL_UDAPL_MAX_REQUEST_DTOS_DEFAULT != mca_btl_udapl_module.udapl_max_request_dtos) { /* user has modified */ BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP, ("help-mpi-btl-udapl.txt", "max_request_dtos too low", true, btl->udapl_max_request_dtos, request_dtos)); } else { btl->udapl_max_request_dtos = mca_btl_udapl_module.udapl_max_request_dtos = request_dtos; } } if (btl->udapl_max_request_dtos > btl->udapl_ia_attr.max_dto_per_ep) { /* do not go beyond what is allowed by the system */ BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP, ("help-mpi-btl-udapl.txt", "max_request_dtos system max", true, btl->udapl_max_request_dtos, btl->udapl_ia_attr.max_dto_per_ep)); btl->udapl_max_request_dtos = btl->udapl_ia_attr.max_dto_per_ep; } (*ep_attr).max_request_dtos = btl->udapl_max_request_dtos; /* close the dummy endpoint */ rc = dat_ep_free(dummy_ep); if (DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("WARNING: %s %s %s\n", "dat_ep_free", major, minor)); /* this could be recoverable, by just using defaults */ } return rc; } /* * Create a uDAPL endpoint * * @param btl (IN) BTL module * @param ep_endpoint (IN) uDAPL endpoint information * * @return OMPI_SUCCESS or error status on failure */ int mca_btl_udapl_endpoint_create(mca_btl_udapl_module_t* btl, DAT_EP_HANDLE* udapl_endpoint) { int rc = OMPI_SUCCESS; /* Create a new uDAPL endpoint and start the connection process */ rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz, btl->udapl_evd_dto, btl->udapl_evd_dto, btl->udapl_evd_conn, &(btl->udapl_ep_param.ep_attr), 
udapl_endpoint); if(DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_create", major, minor)); dat_ep_free(udapl_endpoint); udapl_endpoint = DAT_HANDLE_NULL; } return rc; } static int mca_btl_udapl_start_connect(mca_btl_base_endpoint_t* endpoint) { mca_btl_udapl_addr_t* addr = &endpoint->endpoint_btl->udapl_addr; opal_buffer_t* buf = OBJ_NEW(opal_buffer_t); int rc; if(NULL == buf) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return OMPI_ERR_OUT_OF_RESOURCE; } OPAL_THREAD_ADD32(&(endpoint->endpoint_btl->udapl_connect_inprogress), 1); /* Pack our address information */ rc = opal_dss.pack(buf, &addr->port, 1, OPAL_UINT64); if(OPAL_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } rc = opal_dss.pack(buf, &addr->addr, sizeof(DAT_SOCK_ADDR), OPAL_UINT8); if(OPAL_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } /* Send the buffer */ rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_ompi->proc_name, buf, OMPI_RML_TAG_UDAPL, 0, mca_btl_udapl_endpoint_send_cb, NULL); if(0 > rc) { ORTE_ERROR_LOG(rc); return rc; } endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_EAGER; return OMPI_SUCCESS; } void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { mca_btl_udapl_addr_t addr; mca_btl_udapl_proc_t* proc; mca_btl_base_endpoint_t* ep; int32_t cnt = 1; size_t i; int rc; /* Unpack data */ rc = opal_dss.unpack(buffer, &addr.port, &cnt, OPAL_UINT64); if(OPAL_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return; } cnt = sizeof(mca_btl_udapl_addr_t); rc = opal_dss.unpack(buffer, &addr.addr, &cnt, OPAL_UINT8); if(OPAL_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return; } /* Match the endpoint and handle it */ OPAL_THREAD_LOCK(&mca_btl_udapl_component.udapl_lock); for(proc = (mca_btl_udapl_proc_t*) opal_list_get_first(&mca_btl_udapl_component.udapl_procs); proc != (mca_btl_udapl_proc_t*) 
opal_list_get_end(&mca_btl_udapl_component.udapl_procs); proc = (mca_btl_udapl_proc_t*)opal_list_get_next(proc)) { if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc->proc_ompi->proc_name, endpoint)) { for(i = 0; i < proc->proc_endpoint_count; i++) { ep = proc->proc_endpoints[i]; /* Does this endpoint match? Only compare the address * portion of mca_btl_udapl_addr_t. */ if(!memcmp(&addr, &ep->endpoint_addr, (sizeof(DAT_CONN_QUAL) + sizeof(DAT_SOCK_ADDR)))) { OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock); mca_btl_udapl_endpoint_connect(ep); return; } } } } OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock); } /* * Set up OOB recv callback. */ void mca_btl_udapl_endpoint_post_oob_recv(void) { orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, OMPI_RML_TAG_UDAPL, ORTE_RML_PERSISTENT, mca_btl_udapl_endpoint_recv, NULL); } void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint) { mca_btl_udapl_module_t* btl = endpoint->endpoint_btl; int rc; char *priv_data_ptr = NULL; DAT_COUNT priv_data_size = 0; OPAL_THREAD_LOCK(&endpoint->endpoint_lock); OPAL_THREAD_ADD32(&(btl->udapl_connect_inprogress), 1); /* Nasty test to prevent deadlock and unwanted connection attempts */ /* This right here is the whole point of using the ORTE/RML handshake */ if((MCA_BTL_UDAPL_CONN_EAGER == endpoint->endpoint_state && 0 > orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &endpoint->endpoint_proc->proc_ompi->proc_name, &ompi_proc_local()->proc_name)) || (MCA_BTL_UDAPL_CLOSED != endpoint->endpoint_state && MCA_BTL_UDAPL_CONN_EAGER != endpoint->endpoint_state)) { OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return; } /* Create a new uDAPL endpoint and start the connection process */ rc = mca_btl_udapl_endpoint_create(btl, &endpoint->endpoint_eager); if(DAT_SUCCESS != rc) { BTL_ERROR(("mca_btl_udapl_endpoint_create")); goto failure_create; } /* create private data as required */ if (mca_btl_udapl_component.udapl_conn_priv_data) { int32_t priv_data_conn_type 
= BTL_UDAPL_EAGER_CONNECTION; priv_data_size = sizeof(mca_btl_udapl_addr_t) + sizeof(int32_t); priv_data_ptr = (char *)malloc(priv_data_size); if (NULL == priv_data_ptr) { BTL_ERROR(("ERROR: %s %s\n", "mca_btl_udapl_endpoint_connect", "out of resources")); goto failure_create; } /* private data consists of local btl address, listen port (psp), * and endpoint state to indicate EAGER or MAX endpoint */ memcpy(priv_data_ptr, &btl->udapl_addr, sizeof(mca_btl_udapl_addr_t)); memcpy((priv_data_ptr + sizeof(mca_btl_udapl_addr_t)), &priv_data_conn_type, sizeof(int32_t)); } rc = dat_ep_connect(endpoint->endpoint_eager, &endpoint->endpoint_addr.addr, endpoint->endpoint_addr.port, mca_btl_udapl_component.udapl_timeout, priv_data_size, priv_data_ptr, 0, DAT_CONNECT_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_connect", major, minor)); goto failure; } endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_EAGER; OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); if (mca_btl_udapl_component.udapl_conn_priv_data) { free(priv_data_ptr); } return; failure: dat_ep_free(endpoint->endpoint_eager); failure_create: endpoint->endpoint_eager = DAT_HANDLE_NULL; endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return; } /* * Finish establishing a connection * Note that this routine expects that the mca_btl_udapl_component.udapl.lock * has been acquired by the callee. 
*/ int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl, mca_btl_udapl_addr_t* addr, int32_t* connection_seq, DAT_EP_HANDLE endpoint) { mca_btl_base_endpoint_t* ep; int rc = OMPI_SUCCESS; /* find the endpoint which matches the address in data received */ ep = mca_btl_udapl_find_endpoint_address_match(btl, *addr); if (ep == NULL) { /* If this point is reached, no matching endpoint was found */ BTL_UDAPL_VERBOSE_OUTPUT(VERBOSE_CRITICAL, ("ERROR: could not match endpoint\n")); return OMPI_ERROR; } if(MCA_BTL_UDAPL_CONN_EAGER == ep->endpoint_state) { ep->endpoint_connection_seq = (NULL != connection_seq) ? *connection_seq:0; ep->endpoint_eager = endpoint; rc = mca_btl_udapl_endpoint_finish_eager(ep); } else if(MCA_BTL_UDAPL_CONN_MAX == ep->endpoint_state) { /* Check to see order of messages received are in * the same order the actual connections are made. * If they are not we need to swap the eager and * max connections. This inversion is possible due * to a race condition that one process may actually * receive the sendrecv messages from the max connection * before the eager connection. 
*/ if (NULL == connection_seq || ep->endpoint_connection_seq < *connection_seq) { /* normal order connection matching */ ep->endpoint_max = endpoint; } else { /* inverted order connection matching */ ep->endpoint_max = ep->endpoint_eager; ep->endpoint_eager = endpoint; } rc = mca_btl_udapl_endpoint_finish_max(ep); } else { BTL_UDAPL_VERBOSE_OUTPUT(VERBOSE_DIAGNOSE, ("ERROR: invalid EP state %d\n", ep->endpoint_state)); return OMPI_ERROR; } return rc; } /* * Finish setting up an eager connection, start a max connection */ static int mca_btl_udapl_endpoint_finish_eager( mca_btl_udapl_endpoint_t* endpoint) { mca_btl_udapl_module_t* btl = endpoint->endpoint_btl; int rc; endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_MAX; OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); /* establish eager rdma connection */ if ((1 == mca_btl_udapl_component.udapl_use_eager_rdma) && (btl->udapl_eager_rdma_endpoint_count < mca_btl_udapl_component.udapl_max_eager_rdma_peers)) { mca_btl_udapl_endpoint_connect_eager_rdma(endpoint); } /* Only one side does dat_ep_connect() */ if(0 < orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &endpoint->endpoint_proc->proc_ompi->proc_name, &ompi_proc_local()->proc_name)) { rc = mca_btl_udapl_endpoint_create(btl, &endpoint->endpoint_max); if(DAT_SUCCESS != rc) { endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return OMPI_ERROR; } rc = dat_ep_connect(endpoint->endpoint_max, &endpoint->endpoint_addr.addr, endpoint->endpoint_addr.port, mca_btl_udapl_component.udapl_timeout, 0, NULL, 0, DAT_CONNECT_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_connect", major, minor)); dat_ep_free(endpoint->endpoint_max); return OMPI_ERROR; } } return OMPI_SUCCESS; } static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t* endpoint) { int ret = OMPI_SUCCESS; mca_btl_udapl_module_t* udapl_btl = 
endpoint->endpoint_btl; endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTED; OPAL_THREAD_ADD32(&(endpoint->endpoint_btl->udapl_connect_inprogress), -1); /* post eager recv buffers */ ret = mca_btl_udapl_endpoint_post_recv(endpoint, mca_btl_udapl_component.udapl_eager_frag_size); if (OMPI_SUCCESS != ret) { return ret; } /* post max recv buffers */ ret = mca_btl_udapl_endpoint_post_recv(endpoint, mca_btl_udapl_component.udapl_max_frag_size); if (OMPI_SUCCESS != ret) { return ret; } /* progress eager frag queue as allowed */ mca_btl_udapl_frag_progress_pending(udapl_btl, endpoint, BTL_UDAPL_EAGER_CONNECTION); /* progress max frag queue as allowed */ mca_btl_udapl_frag_progress_pending(udapl_btl, endpoint, BTL_UDAPL_MAX_CONNECTION); return ret; } /* * Utility routine. Search list of endpoints to find one that matches * the given address. * * @param btl (IN) BTL module * @param addr (IN) Address used to find endpoint to be returned * * @return Pointer to the base endpoint matching addr or NULL */ mca_btl_base_endpoint_t* mca_btl_udapl_find_endpoint_address_match(struct mca_btl_udapl_module_t* btl, mca_btl_udapl_addr_t addr) { size_t i; mca_btl_udapl_proc_t *proc; mca_btl_base_endpoint_t *proc_ep; mca_btl_base_endpoint_t *endpoint = NULL; for(proc = (mca_btl_udapl_proc_t*) opal_list_get_first(&mca_btl_udapl_component.udapl_procs); proc != (mca_btl_udapl_proc_t*) opal_list_get_end(&mca_btl_udapl_component.udapl_procs); proc = (mca_btl_udapl_proc_t*)opal_list_get_next(proc)) { for(i = 0; i < proc->proc_endpoint_count; i++) { proc_ep = proc->proc_endpoints[i]; if(proc_ep->endpoint_btl == btl && !memcmp(&addr, &proc_ep->endpoint_addr, (sizeof(DAT_CONN_QUAL) + sizeof(DAT_SOCK_ADDR)))) { /* match found */ endpoint = proc_ep; return endpoint; } } } return endpoint; } /* * Utility routine. Search list of endpoints to find one that matches * the given DAT endpoint handle, this could either be the eager or * max ep. 
* * @param btl (IN) BTL module * @param ep (IN) EP handle used to find endpoint to be returned * * @return Pointer to the base endpoint matching addr or NULL */ static mca_btl_base_endpoint_t* mca_btl_udapl_find_endpoint_connection_match(struct mca_btl_udapl_module_t* btl, DAT_EP_HANDLE ep) { size_t i; mca_btl_udapl_proc_t *proc; mca_btl_base_endpoint_t *proc_ep; mca_btl_base_endpoint_t *endpoint = NULL; for(proc = (mca_btl_udapl_proc_t*) opal_list_get_first(&mca_btl_udapl_component.udapl_procs); proc != (mca_btl_udapl_proc_t*) opal_list_get_end(&mca_btl_udapl_component.udapl_procs); proc = (mca_btl_udapl_proc_t*)opal_list_get_next(proc)) { for(i = 0; i < proc->proc_endpoint_count; i++) { proc_ep = proc->proc_endpoints[i]; if(proc_ep->endpoint_btl == btl) { if (ep == proc_ep->endpoint_eager || ep == proc_ep->endpoint_max) { /* match found */ endpoint = proc_ep; return endpoint; } else { continue; } } } } return endpoint; } /* * Private Data connection establishment process. Operations to be * performed once the eager connection of the given endpoint has * completed. * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * * @return OMPI_SUCCESS or error status on failure */ static int mca_btl_udapl_endpoint_pd_finish_eager( mca_btl_udapl_endpoint_t* endpoint) { mca_btl_udapl_module_t* btl = endpoint->endpoint_btl; int rc = OMPI_SUCCESS; char *priv_data_ptr = NULL; DAT_COUNT priv_data_size = 0; OPAL_THREAD_LOCK(&endpoint->endpoint_lock); endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_MAX; OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); /* initiate the eager rdma connection */ if ((1 == mca_btl_udapl_component.udapl_use_eager_rdma) && (btl->udapl_eager_rdma_endpoint_count < mca_btl_udapl_component.udapl_max_eager_rdma_peers)) { mca_btl_udapl_endpoint_connect_eager_rdma(endpoint); } /* Only one side does dat_ep_connect() and if by chance the * connection is already established we don't need to bother * with this. 
*/ if((BTL_UDAPL_NUM_CONNECTION != endpoint->endpoint_connections_completed) && (0 < orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &endpoint->endpoint_proc->proc_ompi->proc_name, &ompi_proc_local()->proc_name))) { rc = mca_btl_udapl_endpoint_create(btl, &endpoint->endpoint_max); if(DAT_SUCCESS != rc) { endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; return OMPI_ERROR; } if (mca_btl_udapl_component.udapl_conn_priv_data) { int32_t priv_data_conn_type = BTL_UDAPL_MAX_CONNECTION; priv_data_size = (sizeof(mca_btl_udapl_addr_t) + sizeof(int32_t)); priv_data_ptr = (char *)malloc(priv_data_size); if (NULL == priv_data_ptr) { BTL_ERROR(("ERROR: %s %s\n", "mca_btl_udapl_endpoint_pd_finish_eager", "out of resources")); return OMPI_ERR_OUT_OF_RESOURCE; } /* private data consists of local btl address, listen port (psp), * and endpoint state to indicate EAGER or MAX endpoint */ memcpy(priv_data_ptr, &btl->udapl_addr, sizeof(mca_btl_udapl_addr_t)); memcpy((priv_data_ptr + sizeof(mca_btl_udapl_addr_t)), &priv_data_conn_type, sizeof(int32_t)); } rc = dat_ep_connect(endpoint->endpoint_max, &endpoint->endpoint_addr.addr, endpoint->endpoint_addr.port, mca_btl_udapl_component.udapl_timeout, priv_data_size, priv_data_ptr, 0, DAT_CONNECT_DEFAULT_FLAG); if (mca_btl_udapl_component.udapl_conn_priv_data) { free(priv_data_ptr); } if(DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_connect", major, minor)); dat_ep_free(endpoint->endpoint_max); return OMPI_ERROR; } } /* post eager recv buffers */ rc = mca_btl_udapl_endpoint_post_recv(endpoint, mca_btl_udapl_component.udapl_eager_frag_size); if (OMPI_SUCCESS != rc) { return rc; } /* Not progressing here because the entire endpoint needs to be * marked MCA_BTL_UDAPL_CONNECTED, otherwise * mca_btl_udapl_endpoint_send() will just put queued sends back on * the queue. */ return OMPI_SUCCESS; } /* * Private Data connection establishment process. 
Operations to be * performed once the max connection of the given endpoint has * completed. * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * * @return OMPI_SUCCESS or error status on failure */ static int mca_btl_udapl_endpoint_pd_finish_max(mca_btl_udapl_endpoint_t* endpoint) { int rc = OMPI_SUCCESS; /* post max recv buffers */ rc = mca_btl_udapl_endpoint_post_recv(endpoint, mca_btl_udapl_component.udapl_max_frag_size); /* Not progressing here because the entire endpoint needs to be * marked MCA_BTL_UDAPL_CONNECTED otherwise * mca_btl_udapl_endpoint_send() will just put queued sends back on * the queue. */ return rc; } /* * Private Data connection establishment process. Operations to be * performed once both the eager and max max connections of the given * endpoint has completed. * * @param endpoint (IN) BTL addressing information * * @return OMPI_SUCCESS or error status on failure */ static int mca_btl_udapl_endpoint_pd_connections_completed(mca_btl_udapl_endpoint_t* endpoint) { int rc = OMPI_SUCCESS; mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; OPAL_THREAD_LOCK(&endpoint->endpoint_lock); endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTED; OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); OPAL_THREAD_ADD32(&(endpoint->endpoint_btl->udapl_connect_inprogress), -1); /* progress eager frag queue */ mca_btl_udapl_frag_progress_pending(udapl_btl, endpoint, BTL_UDAPL_EAGER_CONNECTION); /* progress max frag queue */ mca_btl_udapl_frag_progress_pending(udapl_btl, endpoint, BTL_UDAPL_MAX_CONNECTION); return rc; } /* * Private Data connection establishment process. Called once the * DAT_CONNECTION_EVENT_ESTABLISHED is dequeued from the connecton * event dispatcher (evd). This event is the local completion event * for both the dat_ep_connect and dat_cr_accpept calls. 
* * @param btl (IN) BTL module * @param ep (IN) EP handle used to find endpoint to be returned * * @return Pointer to the base endpoint matching addr */ int mca_btl_udapl_endpoint_pd_established_conn(struct mca_btl_udapl_module_t* btl, DAT_EP_HANDLE established_ep) { int rc = OMPI_SUCCESS; mca_btl_base_endpoint_t* proc_ep = NULL; /* search for ep and decide what to do next */ proc_ep = mca_btl_udapl_find_endpoint_connection_match(btl, established_ep); if (proc_ep == NULL) { /* If this point is reached, no matching endpoint was found */ BTL_UDAPL_VERBOSE_OUTPUT(VERBOSE_CRITICAL, ("ERROR: could not match endpoint\n")); return OMPI_ERROR; } proc_ep->endpoint_connections_completed++; if (established_ep == proc_ep->endpoint_eager) { rc = mca_btl_udapl_endpoint_pd_finish_eager(proc_ep); } else if (established_ep == proc_ep->endpoint_max) { rc = mca_btl_udapl_endpoint_pd_finish_max(proc_ep); } if (rc == OMPI_SUCCESS && BTL_UDAPL_NUM_CONNECTION == proc_ep->endpoint_connections_completed) { rc = mca_btl_udapl_endpoint_pd_connections_completed(proc_ep); } return rc; } /* * Post receive buffers for a newly established endpoint connection. 
*/
static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint,
    size_t size)
{
    mca_btl_udapl_frag_t* frag = NULL;
    DAT_DTO_COOKIE dto_cookie;
    DAT_EP_HANDLE dat_ep;
    int rc;
    int n_posted = 0;

    /* udapl_num_recvs receive fragments are posted, all of the requested
     * size, on the connection that carries that size */
    while (n_posted < mca_btl_udapl_component.udapl_num_recvs) {
        if (mca_btl_udapl_component.udapl_eager_frag_size == size) {
            MCA_BTL_UDAPL_FRAG_ALLOC_EAGER_RECV(endpoint->endpoint_btl, frag, rc);
            dat_ep = endpoint->endpoint_eager;
        } else {
            assert(size == mca_btl_udapl_component.udapl_max_frag_size);
            MCA_BTL_UDAPL_FRAG_ALLOC_MAX_RECV(endpoint->endpoint_btl, frag, rc);
            dat_ep = endpoint->endpoint_max;
        }

        if (NULL == frag) {
            BTL_ERROR(("ERROR: %s posting recv, out of resources\n",
                "MCA_BTL_UDAPL_ALLOC"));
            return rc;
        }

        assert(size == frag->size);

        /* describe the fragment as a receive descriptor */
        frag->type = MCA_BTL_UDAPL_RECV;
        frag->btl = endpoint->endpoint_btl;
        frag->endpoint = endpoint;
        frag->base.des_src = NULL;
        frag->base.des_src_cnt = 0;
        frag->base.des_dst = &frag->segment.base;
        frag->base.des_dst_cnt = 1;
        frag->base.des_flags = 0;

        /* Set up the LMR triplet from the frag segment */
        /* Note that this triplet defines a sub-region of a registered LMR */
        frag->triplet.segment_length = frag->size;
        frag->triplet.virtual_address =
            (DAT_VADDR)(uintptr_t)frag->segment.base.seg_addr.pval;

        /* the fragment itself is the completion cookie */
        dto_cookie.as_ptr = frag;

        rc = dat_ep_post_recv(dat_ep, 1,
            &frag->triplet, dto_cookie, DAT_COMPLETION_DEFAULT_FLAG);
        if (DAT_SUCCESS != rc) {
            char* major;
            char* minor;

            dat_strerror(rc, (const char**)&major,
                (const char**)&minor);
            BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_recv",
                major, minor));
            return OMPI_ERROR;
        }

        n_posted++;
    }

    return OMPI_SUCCESS;
}

/*
 * Initialize state of the endpoint instance.
* */ static void mca_btl_udapl_endpoint_construct(mca_btl_base_endpoint_t* endpoint) { endpoint->endpoint_btl = 0; endpoint->endpoint_proc = 0; endpoint->endpoint_connection_seq = 0; endpoint->endpoint_connections_completed = 0;; endpoint->endpoint_eager_sends = mca_btl_udapl_component.udapl_num_sends; endpoint->endpoint_max_sends = mca_btl_udapl_component.udapl_num_sends; endpoint->endpoint_state = MCA_BTL_UDAPL_CLOSED; endpoint->endpoint_eager = DAT_HANDLE_NULL; endpoint->endpoint_max = DAT_HANDLE_NULL; endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION] = endpoint->endpoint_eager_sends; endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION] = endpoint->endpoint_max_sends; endpoint->endpoint_sr_credits[BTL_UDAPL_EAGER_CONNECTION] = 0; endpoint->endpoint_sr_credits[BTL_UDAPL_MAX_CONNECTION] = 0; endpoint->endpoint_lwqe_tokens[BTL_UDAPL_EAGER_CONNECTION] = mca_btl_udapl_component.udapl_num_sends + (2*mca_btl_udapl_component.udapl_eager_rdma_num); endpoint->endpoint_lwqe_tokens[BTL_UDAPL_MAX_CONNECTION] = mca_btl_udapl_component.udapl_num_sends + (2*mca_btl_udapl_component.udapl_eager_rdma_num); OBJ_CONSTRUCT(&endpoint->endpoint_eager_frags, opal_list_t); OBJ_CONSTRUCT(&endpoint->endpoint_max_frags, opal_list_t); OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t); /* initialize eager RDMA */ memset(&endpoint->endpoint_eager_rdma_local, 0, sizeof(mca_btl_udapl_eager_rdma_local_t)); memset (&endpoint->endpoint_eager_rdma_remote, 0, sizeof(mca_btl_udapl_eager_rdma_remote_t)); OBJ_CONSTRUCT(&endpoint->endpoint_eager_rdma_local.lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->endpoint_eager_rdma_remote.lock, opal_mutex_t); } /* * Destroy a endpoint * */ static void mca_btl_udapl_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) { mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; mca_mpool_base_registration_t *reg = (mca_mpool_base_registration_t*)endpoint->endpoint_eager_rdma_local.reg; OBJ_DESTRUCT(&endpoint->endpoint_eager_frags); 
OBJ_DESTRUCT(&endpoint->endpoint_max_frags); OBJ_DESTRUCT(&endpoint->endpoint_lock); /* release eager rdma resources */ if (NULL != reg) { udapl_btl->super.btl_mpool->mpool_free(udapl_btl->super.btl_mpool, NULL, reg); } if (NULL != endpoint->endpoint_eager_rdma_local.base.pval) { free(endpoint->endpoint_eager_rdma_local.base.pval); } } /* * Release the fragment used to send the eager rdma control message. * Callback to be executed upon receiving local completion event * from sending a control message operation. Should essentially do * the same thing as mca_btl_udapl_free(). * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * @param descriptor (IN) Description of the data to be transferred * @param status (IN/OUT) */ static void mca_btl_udapl_endpoint_control_send_cb( mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_descriptor_t* descriptor, int status) { int connection = BTL_UDAPL_EAGER_CONNECTION; mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)descriptor; if(frag->size != mca_btl_udapl_component.udapl_eager_frag_size) { connection = BTL_UDAPL_MAX_CONNECTION; } /* control messages are not part of the regular accounting * so here we subtract because the addition was made during * the send completion during progress */ OPAL_THREAD_ADD32(&(endpoint->endpoint_lwqe_tokens[connection]), -1); MCA_BTL_UDAPL_FRAG_RETURN_CONTROL(((mca_btl_udapl_module_t*)btl), ((mca_btl_udapl_frag_t*)descriptor)); } /* * Allocate and initialize descriptor to be used in sending uDAPL BTL * control messages. Should essentially accomplish same as would be * from calling mca_btl_udapl_alloc(). 
*
 * @param btl  (IN) BTL module
 * @param size (IN) Size of segment required to be transferred
 *
 * @return descriptor describing the control message to be transferred
 */
static mca_btl_base_descriptor_t*
mca_btl_udapl_endpoint_initialize_control_message(
    struct mca_btl_base_module_t* btl,
    size_t size)
{
    mca_btl_udapl_module_t* udapl_btl = (mca_btl_udapl_module_t*) btl;
    mca_btl_base_descriptor_t* des;
    mca_btl_udapl_frag_t* frag;
    int pad = 0;
    int rc;

    /* compute pad as needed */
    MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad,
        (size + sizeof(mca_btl_udapl_footer_t)));

    /* control messages size should never be greater than eager
     * message size */
    assert((size+pad) <= btl->btl_eager_limit);

    /* NOTE(review): frag is used below without a NULL check and callers
     * dereference the returned descriptor unconditionally - confirm the
     * control fragment allocation cannot come back empty */
    MCA_BTL_UDAPL_FRAG_ALLOC_CONTROL(udapl_btl, frag, rc);

    frag->btl = udapl_btl;

    /* Set up the LMR triplet from the frag segment */
    frag->segment.base.seg_len = (uint32_t)size;
    frag->triplet.virtual_address =
        (DAT_VADDR)(uintptr_t)frag->segment.base.seg_addr.pval;

    /* assume send/recv as default when computing segment_length */
    frag->triplet.segment_length =
        frag->segment.base.seg_len + sizeof(mca_btl_udapl_footer_t);

    assert(frag->triplet.lmr_context ==
        ((mca_btl_udapl_reg_t*)frag->registration)->lmr_triplet.lmr_context);

    /* send descriptor: a single source segment and no destination */
    des = &frag->base;
    des->des_flags = 0;
    des->des_dst = NULL;
    des->des_dst_cnt = 0;
    des->des_src = &frag->segment.base;
    des->des_src_cnt = 1;
    des->des_cbdata = NULL;
    des->des_cbfunc = mca_btl_udapl_endpoint_control_send_cb;

    return des;
}

/*
 * Transfer the given endpoints rdma segment information. Expects that
 * the endpoints rdma segment has previoulsy been created and
 * registered as required.
* * @param endpoint (IN) BTL addressing information * * @return OMPI_SUCCESS or error status on failure */ static int mca_btl_udapl_endpoint_send_eager_rdma( mca_btl_base_endpoint_t* endpoint) { mca_btl_udapl_eager_rdma_connect_t* rdma_connect; mca_btl_base_descriptor_t* des; mca_btl_udapl_segment_t* segment; mca_btl_udapl_frag_t* data_frag; mca_btl_udapl_frag_t* local_frag = (mca_btl_udapl_frag_t*)endpoint->endpoint_eager_rdma_local.base.pval; mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; size_t cntrl_msg_size = sizeof(mca_btl_udapl_eager_rdma_connect_t); int rc = OMPI_SUCCESS; des = mca_btl_udapl_endpoint_initialize_control_message( &udapl_btl->super, cntrl_msg_size); des->des_flags = 0; des->des_cbfunc = mca_btl_udapl_endpoint_control_send_cb; des->des_cbdata = NULL; /* fill in data */ segment = des->des_src; rdma_connect = (mca_btl_udapl_eager_rdma_connect_t*)segment->base.seg_addr.pval; rdma_connect->control.type = MCA_BTL_UDAPL_CONTROL_RDMA_CONNECT; rdma_connect->rkey = endpoint->endpoint_eager_rdma_local.reg->rmr_context; rdma_connect->rdma_start.pval = (unsigned char*)local_frag->base.super.ptr; /* prep fragment and put on queue */ data_frag = (mca_btl_udapl_frag_t*)des; data_frag->endpoint = endpoint; data_frag->ftr = (mca_btl_udapl_footer_t *) ((char *)data_frag->segment.base.seg_addr.pval + data_frag->segment.base.seg_len); data_frag->ftr->tag = MCA_BTL_TAG_UDAPL; data_frag->type = MCA_BTL_UDAPL_SEND; OPAL_THREAD_LOCK(&endpoint->endpoint_lock); opal_list_append(&endpoint->endpoint_eager_frags, (opal_list_item_t*)data_frag); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return rc; } /* * Endpoint handed in is the local process peer. This routine * creates and initializes a local memory region which will be used for * reading from locally. This memory region will be made available to peer * for writing into by sending a description of the area to the given * endpoint. 
*
 * Note: The local memory region is actually two areas, one is a
 * contiguous memory region containing only the fragment structures. A
 * pointer to the first fragment structure is held here:
 * endpoint->endpoint_eager_rdma_local.base.pval. Each of these
 * fragment structures will contain a pointer,
 * frag->segment.base.seg_addr.pval set during a call to OBJ_CONSTRUCT(),
 * to its associated data region. The data region for all fragments
 * will be contiguous and created by accessing the mpool.
 *
 * Nothing is returned. On every failure path the fragment array and the
 * mpool memory obtained here are released again and the endpoint keeps
 * no pointer to either of them.
 *
 * @param endpoint (IN) BTL addressing information
 */
void mca_btl_udapl_endpoint_connect_eager_rdma(
    mca_btl_udapl_endpoint_t* endpoint)
{
    char* buf;
    char* alloc_ptr;
    size_t size_plus_align;
    int i;
    uint32_t flags = MCA_MPOOL_FLAGS_CACHE_BYPASS;
    mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl;

    OPAL_THREAD_LOCK(&endpoint->endpoint_eager_rdma_local.lock);

    /* a recorded fragment array means the region already exists */
    if (endpoint->endpoint_eager_rdma_local.base.pval) {
        goto unlock_rdma_local;
    }

    if (mca_btl_udapl_component.udapl_eager_rdma_num <= 0) {
        /* NOTE: Need to find a more generic way to check ranges
         * for all mca parameters.
         */
        BTL_UDAPL_VERBOSE_HELP(VERBOSE_SHOW_HELP,
            ("help-mpi-btl-udapl.txt",
             "invalid num rdma segments",
             true,
             mca_btl_udapl_component.udapl_eager_rdma_num));
        goto unlock_rdma_local;
    }

    /* create space for fragment structures */
    alloc_ptr = malloc(mca_btl_udapl_component.udapl_eager_rdma_num *
        sizeof(mca_btl_udapl_frag_eager_rdma_t));
    if (NULL == alloc_ptr) {
        goto unlock_rdma_local;
    }

    /* get size of one fragment's data region */
    size_plus_align = OPAL_ALIGN(
        mca_btl_udapl_component.udapl_eager_frag_size,
        DAT_OPTIMAL_ALIGNMENT, size_t);

    /* set flags value accordingly if ro aware */
    if (mca_btl_udapl_component.ro_aware_system) {
        flags |= MCA_MPOOL_FLAGS_SO_MEM;
    }

    /* create and register memory for all rdma segments */
    buf = udapl_btl->super.btl_mpool->mpool_alloc(udapl_btl->super.btl_mpool,
        (size_plus_align * mca_btl_udapl_component.udapl_eager_rdma_num),
        0, flags,
        (mca_mpool_base_registration_t**)&endpoint->endpoint_eager_rdma_local.reg);
    if (NULL == buf) {
        /* the fragment array has not been recorded on the endpoint yet,
         * so it would be lost if it were not released here */
        free(alloc_ptr);
        goto unlock_rdma_local;
    }

    /* initialize the rdma segments */
    for (i = 0; i < mca_btl_udapl_component.udapl_eager_rdma_num; i++) {
        mca_btl_udapl_frag_eager_rdma_t* local_rdma_frag;
        ompi_free_list_item_t *item = (ompi_free_list_item_t *)(alloc_ptr +
            i*sizeof(mca_btl_udapl_frag_eager_rdma_t));

        /* registration and data pointer are filled in before
         * OBJ_CONSTRUCT() runs on the item */
        item->registration = (void*)endpoint->endpoint_eager_rdma_local.reg;
        item->ptr = buf + i * size_plus_align;
        OBJ_CONSTRUCT(item, mca_btl_udapl_frag_eager_rdma_t);

        local_rdma_frag = ((mca_btl_udapl_frag_eager_rdma_t*)item);

        local_rdma_frag->base.des_dst = &local_rdma_frag->segment.base;
        local_rdma_frag->base.des_dst_cnt = 1;
        local_rdma_frag->base.des_src = NULL;
        local_rdma_frag->base.des_src_cnt = 0;
        local_rdma_frag->btl = endpoint->endpoint_btl;
        local_rdma_frag->endpoint = endpoint;
        local_rdma_frag->type = MCA_BTL_UDAPL_FRAG_EAGER_RDMA;
        local_rdma_frag->triplet.segment_length = local_rdma_frag->size;
    }

    OPAL_THREAD_LOCK(&udapl_btl->udapl_eager_rdma_lock);
    endpoint->endpoint_eager_rdma_index =
        opal_pointer_array_add(udapl_btl->udapl_eager_rdma_endpoints, endpoint);
    if (0 > endpoint->endpoint_eager_rdma_index) {
        goto cleanup;
    }

    /* record first fragment location */
    endpoint->endpoint_eager_rdma_local.base.pval = alloc_ptr;
    udapl_btl->udapl_eager_rdma_endpoint_count++;

    /* send the relevant data describing the registered space to the endpoint */
    if (OMPI_SUCCESS == mca_btl_udapl_endpoint_send_eager_rdma(endpoint)) {
        OPAL_THREAD_UNLOCK(&udapl_btl->udapl_eager_rdma_lock);
        OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock);
        return;
    }

    /* the send failed: undo the bookkeeping done just above */
    udapl_btl->udapl_eager_rdma_endpoint_count--;
    endpoint->endpoint_eager_rdma_local.base.pval = NULL;
    opal_pointer_array_set_item(udapl_btl->udapl_eager_rdma_endpoints,
        endpoint->endpoint_eager_rdma_index, NULL);

cleanup:
    /* this would fail if we hit the max and can not add anymore to the array
     * and this could happen because we do not lock before checking if max has
     * been reached
     */
    free(alloc_ptr);
    endpoint->endpoint_eager_rdma_local.base.pval = NULL;
    OPAL_THREAD_UNLOCK(&udapl_btl->udapl_eager_rdma_lock);

    udapl_btl->super.btl_mpool->mpool_free(udapl_btl->super.btl_mpool,
        buf,
        (mca_mpool_base_registration_t*)endpoint->endpoint_eager_rdma_local.reg);

    /* forget the registration that was just released; the endpoint
     * destructor frees whatever registration is still recorded here */
    endpoint->endpoint_eager_rdma_local.reg = NULL;

unlock_rdma_local:
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock);
}

/*
 * Send control message with the number of credits available on the
 * endpoint. Update the credit value accordingly.
* * @param endpoint (IN) BTL addressing information * * @return OMPI_SUCCESS or error status on failure */ int mca_btl_udapl_endpoint_send_eager_rdma_credits( mca_btl_base_endpoint_t* endpoint) { mca_btl_udapl_eager_rdma_credit_t *rdma_credit; mca_btl_base_descriptor_t* des; mca_btl_udapl_segment_t* segment; DAT_DTO_COOKIE cookie; mca_btl_udapl_frag_t* frag; mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; size_t cntrl_msg_size = sizeof(mca_btl_udapl_eager_rdma_credit_t); int rc = OMPI_SUCCESS; des = mca_btl_udapl_endpoint_initialize_control_message( &udapl_btl->super, cntrl_msg_size); /* fill in data */ segment = des->des_src; rdma_credit = (mca_btl_udapl_eager_rdma_credit_t*)segment->base.seg_addr.pval; rdma_credit->control.type = MCA_BTL_UDAPL_CONTROL_RDMA_CREDIT; rdma_credit->credits = endpoint->endpoint_eager_rdma_local.credits; /* reset local credits value */ OPAL_THREAD_LOCK(&endpoint->endpoint_eager_rdma_local.lock); endpoint->endpoint_eager_rdma_local.credits -= rdma_credit->credits; /* prep and send fragment : control messages do not count * against the token/credit number so do not subtract from tokens * with this send */ frag = (mca_btl_udapl_frag_t*)des; frag->endpoint = endpoint; frag->ftr = (mca_btl_udapl_footer_t *) ((char *)frag->segment.base.seg_addr.pval + frag->segment.base.seg_len); frag->ftr->tag = MCA_BTL_TAG_UDAPL; frag->type = MCA_BTL_UDAPL_SEND; cookie.as_ptr = frag; rc = dat_ep_post_send(endpoint->endpoint_eager, 1, &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); if(DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", major, minor)); endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; rc = OMPI_ERROR; } return rc; } /* * Send control message with the number of credits available on the * endpoint. Update the credit value accordingly. 
* * @param endpoint (IN) BTL addressing information * * @param connection (IN) 0 for eager and 1 for max connection * * @return OMPI_SUCCESS or error status on failure */ int mca_btl_udapl_endpoint_send_sr_credits( mca_btl_base_endpoint_t* endpoint, const int connection) { mca_btl_udapl_sr_credit_t *sr_credit; mca_btl_base_descriptor_t* des; mca_btl_udapl_segment_t* segment; DAT_DTO_COOKIE cookie; mca_btl_udapl_frag_t* frag; mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; size_t cntrl_msg_size = sizeof(mca_btl_udapl_sr_credit_t); int rc = OMPI_SUCCESS; des = mca_btl_udapl_endpoint_initialize_control_message( &udapl_btl->super, cntrl_msg_size); /* fill in data */ segment = des->des_src; sr_credit = (mca_btl_udapl_sr_credit_t*)segment->base.seg_addr.pval; sr_credit->control.type = MCA_BTL_UDAPL_CONTROL_SR_CREDIT; OPAL_THREAD_LOCK(&endpoint->endpoint_lock); sr_credit->credits = endpoint->endpoint_sr_credits[connection]; sr_credit->connection = connection; /* reset local credits value */ endpoint->endpoint_sr_credits[connection] = 0; /* prep and send fragment : control messages do not count * against the token/credit count so do not subtract from tokens * with this send */ frag = (mca_btl_udapl_frag_t*)des; frag->endpoint = endpoint; frag->ftr = (mca_btl_udapl_footer_t *) ((char *)frag->segment.base.seg_addr.pval + frag->segment.base.seg_len); frag->ftr->tag = MCA_BTL_TAG_UDAPL; frag->type = MCA_BTL_UDAPL_SEND; cookie.as_ptr = frag; if (BTL_UDAPL_EAGER_CONNECTION == connection) { rc = dat_ep_post_send(endpoint->endpoint_eager, 1, &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG); } else { assert(BTL_UDAPL_MAX_CONNECTION == connection); rc = dat_ep_post_send(endpoint->endpoint_max, 1, &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG); } OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); if(DAT_SUCCESS != rc) { char* major; char* minor; dat_strerror(rc, (const char**)&major, (const char**)&minor); BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", major, 
minor)); endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; rc = OMPI_ERROR; } return rc; } OBJ_CLASS_INSTANCE( mca_btl_udapl_endpoint_t, opal_list_item_t, mca_btl_udapl_endpoint_construct, mca_btl_udapl_endpoint_destruct);