/* * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Los Alamos National Security, LLC. All rights * reserved. * * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include "opal/dss/dss_types.h" #include "opal/util/net.h" #include "opal/util/output.h" #include "opal/mca/event/event.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/rml/rml_types.h" #include #include #include #include #include #include #include "rml_ofi.h" static void ofi_req_cons(orte_rml_ofi_request_t *ptr) { OBJ_CONSTRUCT(&ptr->pkt_list, opal_list_t); } static void ofi_req_des(orte_rml_ofi_request_t *ptr) { OPAL_LIST_DESTRUCT(&ptr->pkt_list); } OBJ_CLASS_INSTANCE(orte_rml_ofi_request_t, opal_object_t, ofi_req_cons, ofi_req_des); static void ofi_send_req_cons(ofi_send_request_t *ptr) { OBJ_CONSTRUCT(&ptr->send, orte_rml_send_t); } OBJ_CLASS_INSTANCE(ofi_send_request_t, opal_object_t, ofi_send_req_cons, NULL); OBJ_CLASS_INSTANCE(orte_rml_ofi_send_pkt_t, opal_list_item_t, NULL, NULL); OBJ_CLASS_INSTANCE(orte_rml_ofi_recv_pkt_t, opal_list_item_t, NULL, NULL); static void ofi_recv_msg_queue_cons(ofi_recv_msg_queue_t *ptr) { ptr->msgid = 0; ptr->tot_pkts = 1; ptr->pkt_recd = 0; OBJ_CONSTRUCT(&ptr->pkt_list, opal_list_t); } static void ofi_recv_msg_queue_des(ofi_recv_msg_queue_t *ptr) { OPAL_LIST_DESTRUCT(&ptr->pkt_list); } OBJ_CLASS_INSTANCE(ofi_recv_msg_queue_t, opal_list_item_t, ofi_recv_msg_queue_cons, ofi_recv_msg_queue_des); static void send_self_exe(int fd, short args, void* data) { orte_self_send_xfer_t *xfer = (orte_self_send_xfer_t*)data; opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml_send_to_self ofi callback executing for tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->tag); /* execute the send callback function - note that * send-to-self always returns a SUCCESS status */ if (NULL != xfer->iov) { if (NULL != xfer->cbfunc.iov) { /* non-blocking iovec send */ xfer->cbfunc.iov(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->iov, xfer->count, xfer->tag, xfer->cbdata); } } else if (NULL != xfer->buffer) { if (NULL != xfer->cbfunc.buffer) { /* non-blocking buffer send */ xfer->cbfunc.buffer(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->buffer, xfer->tag, xfer->cbdata); } } else { /* should never happen */ abort(); } /* cleanup the memory */ OBJ_RELEASE(xfer); } /** Send callback */ /* [Desc] This is called from the progress fn when a send completion ** is received in the cq ** wc [in] : the completion queue data entry ** ofi_send_req [in]: ofi send request with the send msg and callback */ int orte_rml_ofi_send_callback(struct fi_cq_data_entry *wc, orte_rml_ofi_request_t* ofi_req) { orte_rml_ofi_send_pkt_t *ofi_send_pkt, *next; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s orte_rml_ofi_send_callback called, completion count = %d, msgid = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_req->completion_count, ofi_req->hdr.msgid); assert(ofi_req->completion_count > 0); ofi_req->completion_count--; if ( 0 == ofi_req->completion_count ) { // call the callback fn of the sender ofi_req->send->status = ORTE_SUCCESS; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s calling ORTE_RML_SEND_COMPLETE macro for msgid = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_req->hdr.msgid); ORTE_RML_SEND_COMPLETE(ofi_req->send); OPAL_LIST_FOREACH_SAFE(ofi_send_pkt, next, &ofi_req->pkt_list, orte_rml_ofi_send_pkt_t) { free( ofi_send_pkt->data); ofi_send_pkt->pkt_size=0; opal_list_remove_item(&ofi_req->pkt_list, &ofi_send_pkt->super); opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Removed pkt from list ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); OBJ_RELEASE(ofi_send_pkt); opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Released packet ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); } free(ofi_req->data_blob); OBJ_RELEASE(ofi_req); } // [TODO] need to check for error before returning success return ORTE_SUCCESS; } /** Error callback */ /* [Desc] This is called from the progress fn when a send completion ** is received in the cq ** wc [in] : the completion queue data entry ** ofi_send_req [in]: ofi send request with the send msg and callback */ int orte_rml_ofi_error_callback(struct fi_cq_err_entry *error, orte_rml_ofi_request_t* ofi_req) { opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s orte_rml_ofi_error_callback called ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); switch(error->err) { default: /* call the send-callback fn with error and return, also return failure status */ ofi_req->send->status = ORTE_ERR_CONDUIT_SEND_FAIL; ORTE_RML_SEND_COMPLETE(ofi_req->send); } return ORTE_SUCCESS; } /** Recv handler */ /* [Desc] This is called from the progress fn when a recv completion ** is received in the cq ** wc [in] : the completion queue data entry */ int orte_rml_ofi_recv_handler(struct fi_cq_data_entry *wc, uint8_t ofi_prov_id) { orte_rml_ofi_msg_header_t msg_hdr; uint32_t msglen, datalen = 0; char *data, *totdata, *nextpkt; ofi_recv_msg_queue_t *recv_msg_queue, *new_msg; orte_rml_ofi_recv_pkt_t *ofi_recv_pkt, *new_pkt, *next; bool msg_in_queue = false; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s orte_rml_ofi_recv_handler called ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); /*copy the header and data from buffer and pass it on ** since this is the ofi_prov recv buffer don't want it to be released as ** considering re-using it, so for now copying to newly allocated *data ** the *data will be released by orte_rml_base functions */ memcpy(&msg_hdr,wc->buf,sizeof(orte_rml_ofi_msg_header_t)); msglen = wc->len - sizeof(orte_rml_ofi_msg_header_t); opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Received packet -> msg id = %d wc->len = %lu, msglen = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid, wc->len, msglen ); data = (char *)malloc(msglen); memcpy(data,((char *)wc->buf+sizeof(orte_rml_ofi_msg_header_t)),msglen); opal_output_verbose(15, orte_rml_base_framework.framework_output, "%s header info of received packet -> cur_pkt_num = %d, tot_pkts = %d ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.cur_pkt_num, msg_hdr.tot_pkts ); /* To accomodate message bigger than recv buffer size, check if current message is in multiple blocks and append them before sending it to RML */ if ( msg_hdr.tot_pkts == 1) { /* Since OFI is point-to-point, no need to check if the intended destination is me send to RML */ opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Posting Recv for msgid %d, from peer - %s , Tag = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid, ORTE_NAME_PRINT(&msg_hdr.origin),msg_hdr.tag ); ORTE_RML_POST_MESSAGE(&msg_hdr.origin, msg_hdr.tag, msg_hdr.seq_num,data,msglen); } else { msg_in_queue = false; new_pkt = OBJ_NEW(orte_rml_ofi_recv_pkt_t); new_pkt->cur_pkt_num = msg_hdr.cur_pkt_num; new_pkt->pkt_size = msglen; new_pkt->data = data; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Just beofe checking if this message-pkt is already in queue. msgid-%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); /* check if the queue has the [msgid,sender] entry */ OPAL_LIST_FOREACH(recv_msg_queue, &orte_rml_ofi.recv_msg_queue_list, ofi_recv_msg_queue_t) { opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Checking msgid-%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid ); if( (recv_msg_queue->msgid == msg_hdr.msgid) && (recv_msg_queue->sender.jobid == msg_hdr.origin.jobid) && (recv_msg_queue->sender.vpid == msg_hdr.origin.vpid) ) { opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Found Msg entry in queue for msgid %d, sender jobid=%d, sender vpid=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->sender.jobid, recv_msg_queue->sender.vpid); msg_in_queue = true; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s msgid %d, tot_pkts=%d, opal_list_get_size()=%lu,total pkt_recd=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->tot_pkts, opal_list_get_size(&recv_msg_queue->pkt_list), recv_msg_queue->pkt_recd ); if( recv_msg_queue->tot_pkts == (recv_msg_queue->pkt_recd +1) ) { /* all packets received for this message - post message to rml and remove this from queue */ opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s All packets recd for msgid %d, tot_pkts=%d, opal_list_get_size()=%lu,total pkt_recd=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->tot_pkts, opal_list_get_size(&recv_msg_queue->pkt_list), recv_msg_queue->pkt_recd ); totdata = NULL; datalen = 0; OPAL_LIST_FOREACH(ofi_recv_pkt, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Adding data for packet %d, pktlength = %lu, cumulative datalen so far = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num, ofi_recv_pkt->pkt_size, datalen ); if (0 == datalen) { if (NULL != totdata) { free(totdata); } totdata = (char *)malloc(ofi_recv_pkt->pkt_size); if( totdata == NULL) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s Error: malloc failed for msgid %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),recv_msg_queue->msgid ); return 1; //[TODO: error-handling needs to be implemented } memcpy(totdata,ofi_recv_pkt->data,ofi_recv_pkt->pkt_size); } else { totdata = realloc(totdata,datalen+ofi_recv_pkt->pkt_size); if (NULL != totdata ) { memcpy((totdata+datalen),ofi_recv_pkt->data,ofi_recv_pkt->pkt_size); } else { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s Error: realloc failed for msgid %d, from sender jobid=%d, sender vpid=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->sender.jobid, recv_msg_queue->sender.vpid); return 1; //[TODO: error-handling needs to be implemented } } datalen += ofi_recv_pkt->pkt_size; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s packet %d done, datalen = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num,datalen); } opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Adding leftover data recd, datalen = %d, new_pkt->pkt_size = %lu", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); //add the last packet totdata =realloc(totdata,datalen+new_pkt->pkt_size); if( NULL != totdata ) { opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Realloc completed for leftover data recd, datalen = %d, new->pkt->pkt_size = %lu", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); nextpkt = totdata+datalen; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s totdata = %p,nextpkt = %p ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *)totdata, (void *)nextpkt); memcpy(nextpkt,new_pkt->data,new_pkt->pkt_size); opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s memcpy completed for leftover data recd, datalen = %d, new->pkt->pkt_size = %lu", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); datalen += new_pkt->pkt_size; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Posting Recv for msgid %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); ORTE_RML_POST_MESSAGE(&msg_hdr.origin, msg_hdr.tag, msg_hdr.seq_num,totdata,datalen);\ // free the pkts opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s msgid %d - posting recv completed, freeing packets", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); OPAL_LIST_FOREACH_SAFE(ofi_recv_pkt, next, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { free( ofi_recv_pkt->data); opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s freed data for packet %d",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num ); ofi_recv_pkt->pkt_size=0; opal_list_remove_item(&recv_msg_queue->pkt_list, &ofi_recv_pkt->super); opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Removed pkt from list ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); OBJ_RELEASE(ofi_recv_pkt); opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Released packet ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); } opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s freeing packets completed",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); //free the msg from the queue-list opal_list_remove_item(&orte_rml_ofi.recv_msg_queue_list,&recv_msg_queue->super); opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Successfully removed msg from queue", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); OBJ_RELEASE(recv_msg_queue); } else { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s Error: realloc failed for msgid %d, from sender jobid=%d, sender vpid=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->sender.jobid, recv_msg_queue->sender.vpid); return 1; //[TODO: error-handling needs to be implemented } } else { /* add this packet to the msg in the queue ordered by cur_pkt_num */ opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Adding packet to list, msgid %d, pkt - %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, msg_hdr.cur_pkt_num ); bool pkt_added = false; OPAL_LIST_FOREACH(ofi_recv_pkt, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { if( msg_hdr.cur_pkt_num < ofi_recv_pkt->cur_pkt_num ) { opal_list_insert_pos(&recv_msg_queue->pkt_list, (opal_list_item_t*)ofi_recv_pkt, &new_pkt->super); recv_msg_queue->pkt_recd++; pkt_added = true; break; } } if (!pkt_added) { opal_list_append(&recv_msg_queue->pkt_list,&new_pkt->super); recv_msg_queue->pkt_recd++; } } } break; //we found the msg or added it so exit out of the msg_queue loop } if( !msg_in_queue ) { /*add to the queue as this is the first packet for [msgid,sender] */ new_msg = OBJ_NEW(ofi_recv_msg_queue_t); new_msg->msgid = msg_hdr.msgid; new_msg->sender = msg_hdr.origin; new_msg->tot_pkts = msg_hdr.tot_pkts; new_msg->pkt_recd = 1; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Adding first Msg entry in queue for msgid %d, sender jobid=%d, sender vpid=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), new_msg->msgid, new_msg->sender.jobid, new_msg->sender.vpid); opal_list_append(&new_msg->pkt_list, &new_pkt->super); opal_list_append(&orte_rml_ofi.recv_msg_queue_list, &new_msg->super); } } return ORTE_SUCCESS; } /* populate_peer_ofi_addr * [Desc] This fn does a PMIx Modex recv on "rml.ofi" key * to get the ofi address blob of all providers on the peer. * Then it populates the array parameter peer_ofi_addr[] * with providername, ofi_ep_name and ofi_ep_namelen * [in] peer -> peer address * [out] peer_ofi_addr[] -> array to hold the provider details on the peer * [Return value] -> total providers on success. OPAL_ERROR if fails to load array. */ static int populate_peer_ofi_addr(orte_process_name_t *peer, orte_rml_ofi_peer_t *peer_ofi_addr ) { uint8_t *data; int32_t sz, cnt; opal_buffer_t modex, *entry; char *prov_name; uint8_t prov_num; size_t entrysize; uint8_t *bytes; uint8_t tot_prov=0,cur_prov; int ret = OPAL_ERROR; OPAL_MODEX_RECV_STRING(ret, "rml.ofi", peer, (void**)&data, &sz); if (OPAL_SUCCESS != ret) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi::populate_peer_ofi_addr() Modex_Recv Failed for peer %s. ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer)); return OPAL_ERROR; } opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi::populate_peer_ofi_addr() Modex_Recv Succeeded. ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* load the data into a buffer for unpacking */ OBJ_CONSTRUCT(&modex, opal_buffer_t); opal_dss.load(&modex, data, sz); cnt = 1; /* cycle thru the returned providers and see which one we want to use */ for(cur_prov=0;OPAL_SUCCESS == (ret = opal_dss.unpack(&modex, &entry, &cnt, OPAL_BUFFER));cur_prov++) { /* unpack the provider name */ cnt = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(entry, &prov_name, &cnt, OPAL_STRING))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(entry); break; } /* unpack the provider's index on the remote peer - note that there * is no guarantee that the same provider has the same local index! */ cnt = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(entry, &prov_num, &cnt, OPAL_UINT8))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(entry); break; } /* unpack the size of their connection blob */ cnt = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(entry, &entrysize, &cnt, OPAL_SIZE))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(entry); break; } /* create the necessary space */ bytes = (uint8_t*)malloc(entrysize); /* unpack the connection blob */ cnt = entrysize; if (OPAL_SUCCESS != (ret = opal_dss.unpack(entry, bytes, &cnt, OPAL_BYTE))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(entry); break; } /* done with the buffer */ OBJ_RELEASE(entry); peer_ofi_addr[cur_prov].ofi_prov_name = prov_name; peer_ofi_addr[cur_prov].ofi_ep = bytes; peer_ofi_addr[cur_prov].ofi_ep_len = entrysize; opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi:populate_peer_ofi_addr() Unpacked peer provider %s ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),peer_ofi_addr[cur_prov].ofi_prov_name); } OBJ_DESTRUCT(&modex); // releases the data returned by the modex_recv tot_prov=cur_prov; return tot_prov; } /* check_provider_in_peer(prov_name, peer_ofi_addr) * [Desc] This fn checks for a match of prov_name in the peer_ofi_addr array * and returns the index of the match or OPAL_ERROR if not found. * The peer_ofi_addr array has all the ofi providers in peer. * [in] prov_name -> The provider name we want to use to send this message to peer. * [in] tot_prov -> total provider entries in array * [in] peer_ofi_addr[] -> array of provider details on the peer * [in] local_ofi_prov_idx -> the index of local provider we are comparing with * (index into orte_rml_ofi.ofi_prov[] array. * [Return value] -> index that matches provider on success. OPAL_ERROR if no match found. */ static int check_provider_in_peer( char *prov_name, int tot_prov, orte_rml_ofi_peer_t *peer_ofi_addr, int local_ofi_prov_idx ) { int idx; int ret = OPAL_ERROR; for( idx=0; idx < tot_prov; idx++) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi:check_provider_in_peer() checking peer provider %s to match %s ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),peer_ofi_addr[idx].ofi_prov_name,prov_name); if ( 0 == strcmp(prov_name, peer_ofi_addr[idx].ofi_prov_name) ) { /* we found a matching provider on peer */ opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi:check_provider_in_peer() matched provider %s ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),peer_ofi_addr[idx].ofi_prov_name); if ( 0 == strcmp(prov_name, "sockets") ) { /* check if the address is reachable */ struct sockaddr_in *ep_sockaddr, *ep_sockaddr2; opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi:check_provider_in_peer() checking if sockets provider is reachable ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ep_sockaddr = (struct sockaddr_in*)peer_ofi_addr[idx].ofi_ep; ep_sockaddr2 = (struct sockaddr_in*)orte_rml_ofi.ofi_prov[local_ofi_prov_idx].ep_name; if (opal_net_samenetwork((struct sockaddr*)ep_sockaddr, (struct sockaddr*)ep_sockaddr2, 24)) { /* we found same ofi provider reachable via ethernet on peer so return this idx*/ ret = idx; opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi:check_provider_in_peer() sockets provider is reachable ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); break; } } else { ret = idx; break; } } } return ret; } static void send_msg(int fd, short args, void *cbdata) { ofi_send_request_t *req = (ofi_send_request_t*)cbdata; orte_process_name_t *peer = &(req->send.dst); orte_rml_tag_t tag = req->send.tag; char *dest_ep_name; size_t dest_ep_namelen = 0; int ret = OPAL_ERROR, rc; uint32_t total_packets; fi_addr_t dest_fi_addr; orte_rml_send_t *snd; orte_rml_ofi_request_t* ofi_send_req = OBJ_NEW( orte_rml_ofi_request_t ); uint8_t ofi_prov_id = req->ofi_prov_id; orte_rml_ofi_send_pkt_t* ofi_msg_pkt; size_t datalen_per_pkt, hdrsize, data_in_pkt; // the length of data in per packet excluding the header size orte_rml_ofi_peer_t* pr; uint64_t ui64; struct sockaddr_in* ep_sockaddr; snd = OBJ_NEW(orte_rml_send_t); snd->dst = *peer; snd->origin = *ORTE_PROC_MY_NAME; snd->tag = tag; if (NULL != req->send.iov) { snd->iov = req->send.iov; snd->count = req->send.count; snd->cbfunc.iov = req->send.cbfunc.iov; } else { snd->buffer = req->send.buffer; snd->cbfunc.buffer = req->send.cbfunc.buffer; } snd->cbdata = req->send.cbdata; opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s send_msg_transport to peer %s at tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer), tag); /* get the peer address from our internal hash table */ memcpy(&ui64, (char*)peer, sizeof(uint64_t)); opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s getting contact info for DAEMON peer %s from internal hash table", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer)); if (OPAL_SUCCESS != (ret = opal_hash_table_get_value_uint64(&orte_rml_ofi.peers, ui64, (void**)&pr) || NULL == pr)) { orte_rml_ofi_peer_t peer_ofi_addr[MAX_OFI_PROVIDERS]; int tot_peer_prov=0, peer_prov_id=ofi_prov_id; bool peer_match_found=false; opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi:Send peer OFI contact info not found in internal hash - checking modex", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* Do Modex_recv and populate the peer's providers and ofi ep address in peer_ofi_addr[] array */ if( OPAL_ERROR == ( tot_peer_prov = populate_peer_ofi_addr( peer, peer_ofi_addr ))) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi::send_msg() Error when Populating peer ofi_addr array ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; ORTE_RML_SEND_COMPLETE(snd); //OBJ_RELEASE( ofi_send_req); return ; } /* decide the provider we want to use from the list of providers in peer as per below order. * 1. if the user specified the transport for this conduit (even giving us a prioritized list of candidates), * then the one we selected is the _only_ one we will use. If the remote peer has a matching endpoint, * then we use it - otherwise, we error out * 2. if the user did not specify a transport, then we look for matches against _all_ of * our available transports, starting with fabric and then going to Ethernet, taking the first one that matches. * 3. if we cannot find any match, then we error out */ if ( true == user_override() ) { /*case 1. User has specified the provider, find a match in peer for the current selected provider or error out*/ opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi::send_msg() Case1. looking for a match for current provider", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); if( OPAL_ERROR == ( peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->fabric_attr->prov_name, tot_peer_prov, peer_ofi_addr, ofi_prov_id ) )) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi::send_msg() Peer is Unreachable - no common ofi provider ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; ORTE_RML_SEND_COMPLETE(snd); //OBJ_RELEASE( ofi_send_req); return ; } peer_match_found = true; } else { /* case 2. look for any matching fabric (other than ethernet) provider */ opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi::send_msg() Case 2 - looking for any match for fabric provider", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); for(int cur_prov_id=0; cur_prov_id < orte_rml_ofi.ofi_prov_open_num && !peer_match_found ; cur_prov_id++) { if( 0 != strcmp( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, "sockets" ) ) { peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, tot_peer_prov, peer_ofi_addr, cur_prov_id ); if (OPAL_ERROR != peer_prov_id) { peer_match_found = true; ofi_prov_id = cur_prov_id; } } } /* if we haven't found a common provider for local node and peer to send message yet, check for ethernet */ if(!peer_match_found) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi::send_msg() Case 2 - common fabric to peer not found,looking for ethernet provider", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } for(int cur_prov_id=0; cur_prov_id < orte_rml_ofi.ofi_prov_open_num && !peer_match_found ; cur_prov_id++) { if( 0 == strcmp( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, "sockets" ) ) { peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, tot_peer_prov, peer_ofi_addr, cur_prov_id ); if (OPAL_ERROR != peer_prov_id) { peer_match_found = true; ofi_prov_id = cur_prov_id; } } } /* if we haven't found a common provider yet, then error out - case 3 */ if ( !peer_match_found ) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi::send_msg() Peer is Unreachable - no common ofi provider ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; ORTE_RML_SEND_COMPLETE(snd); //OBJ_RELEASE( ofi_send_req); return ; } } /* creating a copy of the chosen provider to put it in hashtable * as the ofi_peer_addr array is local */ pr = OBJ_NEW(orte_rml_ofi_peer_t); pr->ofi_ep_len = peer_ofi_addr[peer_prov_id].ofi_ep_len; pr->ofi_ep = malloc(pr->ofi_ep_len); memcpy(pr->ofi_ep,peer_ofi_addr[peer_prov_id].ofi_ep,pr->ofi_ep_len); pr->ofi_prov_name = strdup(peer_ofi_addr[peer_prov_id].ofi_prov_name); pr->src_prov_id = ofi_prov_id; if(OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_rml_ofi.peers, ui64, (void*)pr))) { opal_output_verbose(15, orte_rml_base_framework.framework_output, "%s: ofi address insertion into hash table failed for peer %s ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer)); ORTE_ERROR_LOG(rc); } dest_ep_name = pr->ofi_ep; dest_ep_namelen = pr->ofi_ep_len; opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi: Peer ofi provider details added to hash table. Sending to provider %s on peer %s ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),pr->ofi_prov_name,ORTE_NAME_PRINT(peer)); } else { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s rml:ofi: OFI peer contact info got from hash table", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); dest_ep_name = pr->ofi_ep; dest_ep_namelen = pr->ofi_ep_len; ofi_prov_id = pr->src_prov_id; } //[Debug] printing additional info of IP switch ( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->addr_format) { case FI_SOCKADDR_IN : /* Address is of type sockaddr_in (IPv4) */ /*[debug] - print the sockaddr - port and s_addr */ ep_sockaddr = (struct sockaddr_in*)dest_ep_name; opal_output_verbose(1,orte_rml_base_framework.framework_output, "%s peer %s epnamelen is %lu, port = %d (or) 0x%x, InternetAddr = 0x%s ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ORTE_NAME_PRINT(peer), (unsigned long)orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen,ntohs(ep_sockaddr->sin_port), ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr)); /*[end debug]*/ break; } //[Debug] end debug opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s peer ep name obtained for %s. length=%lu", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer), dest_ep_namelen); ret = fi_av_insert(orte_rml_ofi.ofi_prov[ofi_prov_id].av, dest_ep_name,1,&dest_fi_addr,0,NULL); if( ret != 1) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s fi_av_insert failed in send_msg() returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ret ); /* call the send-callback fn with error and return, also return failure status */ snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; ORTE_RML_SEND_COMPLETE(snd); return; } ofi_send_req->send = snd; ofi_send_req->completion_count = 1; /* [DESC] we want to send the pid,seqnum,tag in addition to the data * copy all of this to header of message from the ofi_send_t* send */ ofi_send_req->hdr.dst = ofi_send_req->send->dst; ofi_send_req->hdr.origin = ofi_send_req->send->origin; ofi_send_req->hdr.seq_num = ofi_send_req->send->seq_num; ofi_send_req->hdr.tag = ofi_send_req->send->tag; /* * also insert ofi plugin specific header details - * the unique msgid, for now initalise total_packets to 1 */ ofi_send_req->hdr.msgid = orte_rml_ofi.cur_msgid; orte_rml_ofi.cur_msgid += 1; total_packets = 1; /* copy the buffer/iov/data to the ofi_send_req->datablob and update ofi_send_req->length*/ ofi_send_req->length = 0; if( NULL != ofi_send_req->send->buffer) { ofi_send_req->length = ofi_send_req->send->buffer->bytes_used; ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); memcpy(ofi_send_req->data_blob , ofi_send_req->send->buffer->base_ptr, ofi_send_req->send->buffer->bytes_used); } else if ( NULL != ofi_send_req->send->iov) { for (int i=0; i < ofi_send_req->send->count; i++) { ofi_send_req->length += ofi_send_req->send->iov[i].iov_len; } ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); int iovlen=0; for (int i=0; i < ofi_send_req->send->count; i++) { memcpy(((char *)ofi_send_req->data_blob + iovlen ), ofi_send_req->send->iov[i].iov_base, ofi_send_req->send->iov[i].iov_len); iovlen += ofi_send_req->send->iov[i].iov_len; } } else { //just send the data ofi_send_req->length = ofi_send_req->send->count; ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); memcpy(ofi_send_req->data_blob , ofi_send_req->send->data, ofi_send_req->send->count); } opal_output_verbose(15, orte_rml_base_framework.framework_output, "%s Completed copying all data into ofi_send_req->data_blob, total data - %lu bytes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_send_req->length ); /* Each packet will have header information, so the data length in each packet is datalen_per_packet. * check if the ofi_send_req->send->buffer->bytes_used is greater than the data per packet datalen_per_packet(recv buffer) * if so fragment and add info to header and send it in a loop back-to-back */ hdrsize = sizeof(orte_rml_ofi_msg_header_t); datalen_per_pkt = MIN_MULTI_BUF_SIZE - hdrsize; if (ofi_send_req->length > datalen_per_pkt ) { total_packets = ( ofi_send_req->length / datalen_per_pkt ) + 1 ; } ofi_send_req->hdr.tot_pkts = total_packets; opal_output_verbose(15, orte_rml_base_framework.framework_output, "%s datalen_per_pkt = %lu, ofi_send_req->length= %lu, total packets = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen_per_pkt, ofi_send_req->length, total_packets ); /* in a loop send create and send the packets */ for(size_t pkt_num=1,sent_data=0; sent_data < ofi_send_req->length; pkt_num++) { ofi_send_req->hdr.cur_pkt_num = pkt_num; /* create the packet */ ofi_msg_pkt = OBJ_NEW(orte_rml_ofi_send_pkt_t); data_in_pkt = ((ofi_send_req->length - sent_data) >= datalen_per_pkt) ? datalen_per_pkt : (ofi_send_req->length - sent_data); ofi_msg_pkt->pkt_size = hdrsize + data_in_pkt; opal_output_verbose(15, orte_rml_base_framework.framework_output, "%s Packet %lu -> data_in_pkt= %lu, header_size= %lu, pkt_size=%lu", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), pkt_num,data_in_pkt,hdrsize,ofi_msg_pkt->pkt_size ); /* copy the header and data for this pkt */ ofi_msg_pkt->data = malloc( ofi_msg_pkt->pkt_size); memcpy(ofi_msg_pkt->data, &ofi_send_req->hdr, hdrsize ); memcpy( ( (char *)ofi_msg_pkt->data + hdrsize ), ((char*)ofi_send_req->data_blob + sent_data), data_in_pkt); opal_output_verbose(15, orte_rml_base_framework.framework_output, "%s Copying header, data into packets completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); /* add it to list */ opal_list_append(&(ofi_send_req->pkt_list), &ofi_msg_pkt->super); opal_output_verbose(15, orte_rml_base_framework.framework_output, "%s adding packet %lu to list done successful", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),pkt_num ); sent_data += data_in_pkt; } if( ofi_send_req->hdr.tot_pkts != ofi_send_req->hdr.cur_pkt_num ) { opal_output_verbose(1, orte_rml_base_framework.framework_output, "%s Error: Total packets calculated [%d] does not match total created-%d pkts to peer %s with tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_send_req->hdr.tot_pkts, ofi_send_req->hdr.cur_pkt_num, ORTE_NAME_PRINT(peer), tag); } /* do the fi_send() for all the pkts */ ofi_send_req->completion_count= ofi_send_req->hdr.tot_pkts; OPAL_LIST_FOREACH(ofi_msg_pkt, &ofi_send_req->pkt_list, orte_rml_ofi_send_pkt_t) { /* debug purpose - copying the header from packet to verify if it is correct */ struct orte_rml_ofi_msg_header_t *cur_hdr; cur_hdr = (struct orte_rml_ofi_msg_header_t* ) ofi_msg_pkt->data; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s Sending Pkt[%d] of total %d pkts for msgid:%d to peer %s with tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_hdr->cur_pkt_num, ofi_send_req->completion_count, cur_hdr->msgid, ORTE_NAME_PRINT(peer), tag); /* end debug*/ RML_OFI_RETRY_UNTIL_DONE(fi_send(orte_rml_ofi.ofi_prov[ofi_prov_id].ep, ofi_msg_pkt->data, ofi_msg_pkt->pkt_size, fi_mr_desc(orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv), dest_fi_addr, (void *)&ofi_send_req->ctx)); } opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s End of send_msg_transport. fi_send completed to peer %s with tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer), tag); OBJ_RELEASE(req); } int orte_rml_ofi_send_nb(struct orte_rml_base_module_t* mod, orte_process_name_t* peer, struct iovec* iov, int count, orte_rml_tag_t tag, orte_rml_callback_fn_t cbfunc, void* cbdata) { orte_rml_recv_t *rcv; int bytes; orte_self_send_xfer_t *xfer; int i; char* ptr; ofi_send_request_t *req; orte_rml_ofi_module_t *ofi_mod = (orte_rml_ofi_module_t*)mod; int ofi_prov_id = ofi_mod->cur_transport_id; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s rml_ofi_send_transport to peer %s at tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer), tag); if( (0 > ofi_prov_id) || ( ofi_prov_id >= orte_rml_ofi.ofi_prov_open_num ) ) { /* Invalid ofi_prov ID provided */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } if (ORTE_RML_TAG_INVALID == tag) { /* cannot send to an invalid tag */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } if (NULL == peer || OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer)) { /* cannot send to an invalid peer */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } /* if this is a message to myself, then just post the message * for receipt - no need to dive into the ofi send_msg() */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME)) { /* local delivery */ OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, "%s rml_send_iovec_to_self at tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tag)); /* send to self is a tad tricky - we really don't want * to track the send callback function throughout the recv * process and execute it upon receipt as this would provide * very different timing from a non-self message. Specifically, * if we just retain a pointer to the incoming data * and then execute the send callback prior to the receive, * then the caller will think we are done with the data and * can release it. So we have to copy the data in order to * execute the send callback prior to receiving the message. * * In truth, this really is a better mimic of the non-self * message behavior. If we actually pushed the message out * on the wire and had it loop back, then we would receive * a new block of data anyway. */ /* setup the send callback */ xfer = OBJ_NEW(orte_self_send_xfer_t); xfer->iov = iov; xfer->count = count; xfer->cbfunc.iov = cbfunc; xfer->tag = tag; xfer->cbdata = cbdata; /* setup the event for the send callback */ opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer); opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI); opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1); /* copy the message for the recv */ rcv = OBJ_NEW(orte_rml_recv_t); rcv->sender = *peer; rcv->tag = tag; /* get the total number of bytes in the iovec array */ bytes = 0; for (i = 0 ; i < count ; ++i) { bytes += iov[i].iov_len; } /* get the required memory allocation */ if (0 < bytes) { rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(bytes); rcv->iov.iov_len = bytes; /* transfer the bytes */ ptr = (char*)rcv->iov.iov_base; for (i = 0 ; i < count ; ++i) { memcpy(ptr, iov[i].iov_base, iov[i].iov_len); ptr += iov[i].iov_len; } } /* post the message for receipt - since the send callback was posted * first and has the same priority, it will execute first */ ORTE_RML_ACTIVATE_MESSAGE(rcv); return ORTE_SUCCESS; } /* get ourselves into an event to protect against * race conditions and threads */ req = OBJ_NEW(ofi_send_request_t); req->ofi_prov_id = ofi_prov_id; req->send.dst = *peer; req->send.iov = iov; req->send.count = count; req->send.tag = tag; req->send.cbfunc.iov = cbfunc; req->send.cbdata = cbdata; /* setup the event for the send callback */ opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); opal_event_set_priority(&req->ev, ORTE_MSG_PRI); opal_event_active(&req->ev, OPAL_EV_WRITE, 1); return ORTE_SUCCESS; } int orte_rml_ofi_send_buffer_nb(struct orte_rml_base_module_t *mod, orte_process_name_t* peer, struct opal_buffer_t* buffer, orte_rml_tag_t tag, orte_rml_buffer_callback_fn_t cbfunc, void* cbdata) { orte_rml_recv_t *rcv; orte_self_send_xfer_t *xfer; ofi_send_request_t *req; orte_rml_ofi_module_t *ofi_mod = (orte_rml_ofi_module_t*)mod; int ofi_prov_id = ofi_mod->cur_transport_id; opal_output_verbose(10, orte_rml_base_framework.framework_output, "%s rml_ofi_send_buffer_transport to peer %s at tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer), tag); if( (0 > ofi_prov_id) || ( ofi_prov_id >= orte_rml_ofi.ofi_prov_open_num ) ) { /* Invalid ofi_prov ID provided */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } if (ORTE_RML_TAG_INVALID == tag) { /* cannot send to an invalid tag */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } if (NULL == peer || OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer)) { /* cannot send to an invalid peer */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } /* if this is a message to myself, then just post the message * for receipt - no need to dive into the oob */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME)) { /* local delivery */ OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, "%s rml_send_iovec_to_self at tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tag)); /* send to self is a tad tricky - we really don't want * to track the send callback function throughout the recv * process and execute it upon receipt as this would provide * very different timing from a non-self message. Specifically, * if we just retain a pointer to the incoming data * and then execute the send callback prior to the receive, * then the caller will think we are done with the data and * can release it. So we have to copy the data in order to * execute the send callback prior to receiving the message. * * In truth, this really is a better mimic of the non-self * message behavior. If we actually pushed the message out * on the wire and had it loop back, then we would receive * a new block of data anyway. */ /* setup the send callback */ xfer = OBJ_NEW(orte_self_send_xfer_t); xfer->buffer = buffer; xfer->cbfunc.buffer = cbfunc; xfer->tag = tag; xfer->cbdata = cbdata; /* setup the event for the send callback */ opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer); opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI); opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1); /* copy the message for the recv */ rcv = OBJ_NEW(orte_rml_recv_t); rcv->sender = *peer; rcv->tag = tag; rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(buffer->bytes_used); memcpy(rcv->iov.iov_base, buffer->base_ptr, buffer->bytes_used); rcv->iov.iov_len = buffer->bytes_used; /* post the message for receipt - since the send callback was posted * first and has the same priority, it will execute first */ ORTE_RML_ACTIVATE_MESSAGE(rcv); return ORTE_SUCCESS; } /* get ourselves into an event to protect against * race conditions and threads */ req = OBJ_NEW(ofi_send_request_t); req->ofi_prov_id = ofi_prov_id; req->send.dst = *peer; req->send.buffer = buffer; req->send.tag = tag; req->send.cbfunc.buffer = cbfunc; req->send.cbdata = cbdata; /* setup the event for the send callback */ opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); opal_event_set_priority(&req->ev, ORTE_MSG_PRI); opal_event_active(&req->ev, OPAL_EV_WRITE, 1); return ORTE_SUCCESS; }