/* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "ompi_config.h" #include #include #include #include #include "ompi/include/constants.h" #include "opal/util/output.h" #include "mca/pml/pml.h" #include "mca/btl/btl.h" #include "ompi/datatype/convertor.h" #include "btl_portals.h" #include "btl_portals_compat.h" #include "btl_portals_endpoint.h" #include "btl_portals_recv.h" #include "btl_portals_frag.h" mca_btl_portals_module_t mca_btl_portals_module = { { &mca_btl_portals_component.super, /* NOTE: All these default values are set in component_open() */ 0, /* max size of first frag */ 0, /* min send size */ 0, /* max send size */ 0, /* min rdma size */ 0, /* max rdma size */ 0, /* exclusivity - higher than sm, lower than self */ 0, /* latency */ 0, /* bandwidth */ 0, /* btl flags */ mca_btl_portals_add_procs, mca_btl_portals_del_procs, mca_btl_portals_register, mca_btl_portals_finalize, mca_btl_portals_alloc, mca_btl_portals_free, mca_btl_portals_prepare_src, mca_btl_portals_prepare_dst, mca_btl_portals_send, mca_btl_portals_put, mca_btl_portals_get }, }; int mca_btl_portals_add_procs(struct mca_btl_base_module_t* btl_base, size_t nprocs, struct ompi_proc_t **procs, struct mca_btl_base_endpoint_t** peers, ompi_bitmap_t* reachable) { int ret; struct ompi_proc_t *curr_proc = NULL; ptl_process_id_t *portals_procs = NULL; size_t i; unsigned long distance; bool need_activate = false; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); opal_output_verbose(50, mca_btl_portals_component.portals_output, "Adding %d procs (%d)", nprocs, mca_btl_portals_module.portals_num_procs); /* make sure our environment is fully initialized. At end of this call, we have a working network handle on our module and portals_procs will have the portals process identifier for each proc (ordered, in theory) */ ret = mca_btl_portals_add_procs_compat(&mca_btl_portals_module, nprocs, procs, &portals_procs); if (OMPI_SUCCESS != ret) return ret; if (0 == mca_btl_portals_module.portals_num_procs) { need_activate = true; } /* loop through all procs, setting our reachable flag */ for (i= 0; i < nprocs ; ++i) { curr_proc = procs[i]; peers[i] = malloc(sizeof(mca_btl_base_endpoint_t)); if (NULL == peers[i]) return OMPI_ERROR; *((mca_btl_base_endpoint_t*) peers[i]) = portals_procs[i]; /* make sure we can reach the process - this is supposed to be a cheap-ish operation */ ret = PtlNIDist(mca_btl_portals_module.portals_ni_h, portals_procs[i], &distance); if (ret != PTL_OK) { opal_output_verbose(10, mca_btl_portals_component.portals_output, "Could not find distance to process %d", i); continue; } OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_num_procs, 1); /* and here we can reach */ ompi_bitmap_set_bit(reachable, i); } if (NULL != portals_procs) free(portals_procs); if (need_activate && mca_btl_portals_module.portals_num_procs > 0) { /* create eqs */ int i; opal_output_verbose(50, mca_btl_portals_component.portals_output, "Enabling progress"); for (i = 0 ; i < OMPI_BTL_PORTALS_EQ_SIZE ; ++i) { int ptl_ret = PtlEQAlloc(mca_btl_portals_module.portals_ni_h, mca_btl_portals_module.portals_eq_sizes[i], PTL_EQ_HANDLER_NONE, &(mca_btl_portals_module.portals_eq_handles[i])); if (PTL_OK != ptl_ret) { opal_output(mca_btl_portals_component.portals_output, "Error creating EQ %d: %d", i, ptl_ret); /* BWB - better error code? */ return OMPI_ERROR; } } ret = mca_btl_portals_recv_enable(&mca_btl_portals_module); /* fill in send memory descriptor */ mca_btl_portals_module.md_send.start = NULL; mca_btl_portals_module.md_send.length = 0; mca_btl_portals_module.md_send.threshold = 2; /* send and ack */ mca_btl_portals_module.md_send.max_size = 0; mca_btl_portals_module.md_send.options = PTL_MD_EVENT_START_DISABLE; mca_btl_portals_module.md_send.user_ptr = NULL; mca_btl_portals_module.md_send.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND]; } else { ret = OMPI_SUCCESS; } opal_output_verbose(50, mca_btl_portals_component.portals_output, "count: %d", mca_btl_portals_module.portals_num_procs); return ret; } int mca_btl_portals_del_procs(struct mca_btl_base_module_t *btl_base, size_t nprocs, struct ompi_proc_t **procs, struct mca_btl_base_endpoint_t **peers) { size_t i = 0; int ret = OMPI_SUCCESS; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); opal_output_verbose(50, mca_btl_portals_component.portals_output, "Removing %d procs (%d)", nprocs, mca_btl_portals_module.portals_num_procs); for (i = 0 ; i < nprocs ; ++i) { free(peers[i]); OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_num_procs, -1); } if (0 == mca_btl_portals_module.portals_num_procs) { int i; opal_output_verbose(50, mca_btl_portals_component.portals_output, "Disabling progress"); ret = mca_btl_portals_recv_disable(&mca_btl_portals_module); /* destroy eqs */ for (i = 0 ; i < OMPI_BTL_PORTALS_EQ_SIZE ; ++i) { int ptl_ret = PtlEQFree(mca_btl_portals_module.portals_eq_handles[i]); if (PTL_OK != ptl_ret) { opal_output(mca_btl_portals_component.portals_output, "Error freeing EQ %d: %d", i, ptl_ret); } } } else { ret = OMPI_SUCCESS; } return ret; } int mca_btl_portals_register(struct mca_btl_base_module_t* btl_base, mca_btl_base_tag_t tag, mca_btl_base_module_recv_cb_fn_t cbfunc, void* cbdata) { assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); mca_btl_portals_module.portals_reg[tag].cbfunc = cbfunc; mca_btl_portals_module.portals_reg[tag].cbdata = cbdata; return OMPI_SUCCESS; } mca_btl_base_descriptor_t* mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base, size_t size) { mca_btl_portals_frag_t* frag; int rc; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); if (size <= mca_btl_portals_module.super.btl_eager_limit) { OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, rc); frag->segments[0].seg_len = size <= mca_btl_portals_module.super.btl_eager_limit ? size : mca_btl_portals_module.super.btl_eager_limit ; } else { OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(&mca_btl_portals_module, frag, rc); frag->segments[0].seg_len = size <= mca_btl_portals_module.super.btl_max_send_size ? size : mca_btl_portals_module.super.btl_max_send_size ; } frag->base.des_src_cnt = 1; frag->base.des_flags = 0; return &frag->base; } int mca_btl_portals_free(struct mca_btl_base_module_t* btl_base, mca_btl_base_descriptor_t* des) { mca_btl_portals_frag_t* frag = (mca_btl_portals_frag_t*) des; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); if (frag->size == 0) { OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); } else if (frag->size == mca_btl_portals_module.super.btl_eager_limit){ OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module.super, frag); } else if (frag->size == mca_btl_portals_module.super.btl_max_send_size) { OMPI_BTL_PORTALS_FRAG_RETURN_MAX(&mca_btl_portals_module.super, frag); } else { return OMPI_ERR_BAD_PARAM; } return OMPI_SUCCESS; } mca_btl_base_descriptor_t* mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* peer, mca_mpool_base_registration_t* registration, struct ompi_convertor_t* convertor, size_t reserve, size_t* size) { mca_btl_portals_frag_t* frag; size_t max_data = *size; struct iovec iov; uint32_t iov_count = 1; int32_t free_after; int ret; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); if (0 != ompi_convertor_need_buffers(convertor)) { /* if we need to use buffers to pack the data, grab either an eager or (if we need more space) max buffer, pack the data into the first segment, and return */ if (max_data+reserve <= mca_btl_portals_module.super.btl_eager_limit) { /* * if we can't send out of the buffer directly and the * requested size is less than the eager limit, pack into a * fragment from the eager pool */ OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, ret); if (NULL == frag) { return NULL; } iov.iov_len = max_data; iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve; ret = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); *size = max_data; if (ret < 0) { OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module, frag); return NULL; } frag->segments[0].seg_len = max_data + reserve; frag->base.des_src_cnt = 1; } else { /* * otherwise pack as much data as we can into a fragment * that is the max send size. */ OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(&mca_btl_portals_module, frag, ret); if (NULL == frag) { return NULL; } if (max_data + reserve > mca_btl_portals_module.super.btl_max_send_size){ max_data = mca_btl_portals_module.super.btl_max_send_size - reserve; } iov.iov_len = max_data; iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve; ret = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); *size = max_data; if ( ret < 0 ) { OMPI_BTL_PORTALS_FRAG_RETURN_MAX(&mca_btl_portals_module, frag); return NULL; } frag->segments[0].seg_len = max_data + reserve; frag->base.des_src_cnt = 1; } } else { /* no need to pack - we can send directly out of the user's buffer. If we have reserve space, use an eager fragment and give the caller the eager space as reserve. If we have no reserve space needs, use a user frag */ if (0 == reserve) { /* user frags are always setup to use only one fragment */ OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret); if(NULL == frag){ return NULL; } iov.iov_len = max_data; iov.iov_base = NULL; ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); frag->segments[0].seg_len = max_data; frag->segments[0].seg_addr.pval = iov.iov_base; frag->base.des_src_cnt = 1; } else { OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, ret); if (NULL == frag) { return NULL; } iov.iov_len = max_data; iov.iov_base = NULL; ret = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); *size = max_data; if (ret < 0) { OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module, frag); return NULL; } frag->segments[0].seg_len = reserve; frag->segments[1].seg_addr.pval = iov.iov_base; frag->segments[1].seg_len = max_data; frag->base.des_src_cnt = 2; frag->iov[0].iov_base = frag->segments[0].seg_addr.pval; frag->iov[0].iov_len = frag->segments[0].seg_len; frag->iov[1].iov_base = frag->segments[1].seg_addr.pval; frag->iov[1].iov_len = frag->segments[1].seg_len; } } frag->base.des_src = frag->segments; frag->base.des_dst = NULL; frag->base.des_dst_cnt = 0; frag->base.des_flags = 0; return &frag->base; } mca_btl_base_descriptor_t* mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* peer, mca_mpool_base_registration_t* registration, struct ompi_convertor_t* convertor, size_t reserve, size_t* size) { mca_btl_portals_frag_t* frag; ptl_md_t md; ptl_handle_me_t me_h; ptl_handle_md_t md_h; int ret; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret); if(NULL == frag) { return NULL; } frag->segments[0].seg_len = *size; frag->segments[0].seg_addr.pval = convertor->pBaseBuf + convertor->bConverted; frag->segments[0].seg_key.key64 = OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1); frag->base.des_src = NULL; frag->base.des_src_cnt = 0; frag->base.des_dst = frag->segments; frag->base.des_dst_cnt = 1; frag->base.des_flags = 0; frag->type = mca_btl_portals_frag_type_rdma; OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "rdma dest posted for frag 0x%x, callback 0x%x, bits %lld", frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64)); /* create a match entry */ ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h, OMPI_BTL_PORTALS_RDMA_TABLE_ID, *((mca_btl_base_endpoint_t*) peer), frag->segments[0].seg_key.key64, /* match */ 0, /* ignore */ PTL_UNLINK, PTL_INS_AFTER, &me_h); if (PTL_OK != ret) { opal_output(mca_btl_portals_component.portals_output, "Error creating rdma dest ME: %d", ret); OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); return NULL; } /* setup the memory descriptor. RDMA should never need to be retransmitted, so we set the threshold for the event it will receive (PUT/GET START and END). No need to track the unlinks later :) */ md.start = frag->segments[0].seg_addr.pval; md.length = frag->segments[0].seg_len; md.threshold = 1; /* unlink after put */ md.max_size = 0; md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; md.user_ptr = frag; /* keep a pointer to ourselves */ md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ]; ret = PtlMDAttach(me_h, md, PTL_UNLINK, &md_h); if (PTL_OK != ret) { opal_output(mca_btl_portals_component.portals_output, "Error creating rdma dest MD: %d", ret); PtlMEUnlink(me_h); OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); return NULL; } return &frag->base; } int mca_btl_portals_finalize(struct mca_btl_base_module_t *btl_base) { int ret; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); /* finalize all communication */ while (mca_btl_portals_module.portals_outstanding_sends > 0) { mca_btl_portals_component_progress(); } if (0 != opal_list_get_size(&(mca_btl_portals_module.portals_queued_sends))) { opal_output(mca_btl_portals_component.portals_output, "Warning: there were %d queued sends not sent", opal_list_get_size(&(mca_btl_portals_module.portals_queued_sends))); } if (mca_btl_portals_module.portals_num_procs != 0) { int i; ret = mca_btl_portals_recv_disable(&mca_btl_portals_module); /* destroy eqs */ for (i = 0 ; i < OMPI_BTL_PORTALS_EQ_SIZE ; ++i) { int ptl_ret = PtlEQFree(mca_btl_portals_module.portals_eq_handles[i]); if (PTL_OK != ptl_ret) { opal_output(mca_btl_portals_component.portals_output, "Error freeing EQ %d: %d", i, ptl_ret); } } } OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_blocks); OBJ_DESTRUCT(&mca_btl_portals_module.portals_queued_sends); if (PTL_INVALID_HANDLE != mca_btl_portals_module.portals_ni_h) { ret = PtlNIFini(mca_btl_portals_module.portals_ni_h); if (PTL_OK != ret) { opal_output_verbose(20, mca_btl_portals_component.portals_output, "PtlNIFini returned %d", ret); return OMPI_ERROR; } } opal_output_verbose(20, mca_btl_portals_component.portals_output, "successfully finalized module"); return OMPI_SUCCESS; }