diff --git a/ompi/mca/pml/bfo/Makefile.am b/ompi/mca/pml/bfo/Makefile.am
new file mode 100644
index 0000000000..a5ce5464c1
--- /dev/null
+++ b/ompi/mca/pml/bfo/Makefile.am
@@ -0,0 +1,68 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
+# Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+dist_pkgdata_DATA = \
+	help-mpi-pml-bfo.txt
+
+EXTRA_DIST = post_configure.sh .windows
+
+bfo_sources = \
+	pml_bfo.c \
+	pml_bfo.h \
+	pml_bfo_comm.c \
+	pml_bfo_comm.h \
+	pml_bfo_component.c \
+	pml_bfo_component.h \
+	pml_bfo_failover.c \
+	pml_bfo_failover.h \
+	pml_bfo_hdr.h \
+	pml_bfo_iprobe.c \
+	pml_bfo_irecv.c \
+	pml_bfo_isend.c \
+	pml_bfo_progress.c \
+	pml_bfo_rdma.c \
+	pml_bfo_rdma.h \
+	pml_bfo_rdmafrag.c \
+	pml_bfo_rdmafrag.h \
+	pml_bfo_recvfrag.c \
+	pml_bfo_recvfrag.h \
+	pml_bfo_recvreq.c \
+	pml_bfo_recvreq.h \
+	pml_bfo_sendreq.c \
+	pml_bfo_sendreq.h \
+	pml_bfo_start.c
+
+if OMPI_BUILD_pml_bfo_DSO
+component_noinst =
+component_install = mca_pml_bfo.la
+else
+component_noinst = libmca_pml_bfo.la
+component_install =
+endif
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_pml_bfo_la_SOURCES = $(bfo_sources)
+mca_pml_bfo_la_LDFLAGS = -module -avoid-version
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_pml_bfo_la_SOURCES = $(bfo_sources)
+libmca_pml_bfo_la_LDFLAGS = -module -avoid-version
diff --git a/ompi/mca/pml/bfo/check-diffs-ob1.sh b/ompi/mca/pml/bfo/check-diffs-ob1.sh
new file mode 100755
index 0000000000..a0ac4ca01a
--- /dev/null
+++ b/ompi/mca/pml/bfo/check-diffs-ob1.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+#
+# Copyright (c) 2010 Oracle and/or its affiliates.  All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# This script runs a diff between the ob1 and bfo files.  This
+# allows us to quickly see the differences between the two and
+# how well the bfo files are tracking ob1 as it changes.  You
+# can also modify this and run it from the csum directory.
+
+CP=/bin/cp
+MKDIR=/bin/mkdir
+RM=/bin/rm
+TOUCH=/bin/touch
+pml=bfo
+PML=BFO
+ob1=ob1
+OB1=OB1
+DIFF=$ob1-$pml.diff
+DIFFDIR=diff-dir
+
+$MKDIR $DIFFDIR
+
+# Only diff a subset of files that are known to be different.
+FILES="Makefile.am \
+       pml_NAME.c \
+       pml_NAME.h \
+       pml_NAME_component.c \
+       pml_NAME_component.h \
+       pml_NAME_hdr.h \
+       pml_NAME_rdmafrag.h \
+       pml_NAME_recvfrag.c \
+       pml_NAME_recvreq.c \
+       pml_NAME_recvreq.h \
+       pml_NAME_sendreq.c \
+       pml_NAME_sendreq.h"
+
+# Copy over the files from the bfo directory.
+for name in $FILES
+do
+    $CP `echo $name | sed s/NAME/$pml/` $DIFFDIR
+done
+
+cd $DIFFDIR
+# Convert the pml/PML strings back into ob1/OB1 strings
+# to avoid spurious differences between the files.
+../../../../../contrib/search_replace.pl $pml $ob1
+../../../../../contrib/search_replace.pl $PML $OB1
+
+# Copy over the files from the ob1 directory.
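+# (Editorial note, not in the original script: the sed templating above
+# turns the NAME placeholder into a component name, e.g.
+# `echo pml_NAME.c | sed s/NAME/$pml/` yields pml_bfo.c, and the loop
+# below uses s/NAME/$ob1/ to fetch the matching pml_ob1.c files.)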
+for name in $FILES +do + $CP ../../ob1/`echo $name | sed s/NAME/$ob1/` . +done + +$RM -f $DIFF +$TOUCH $DIFF + +# Now run the diff. +for name in $FILES +do + diff -c `echo $name | sed s/NAME/$ob1/` `echo $name | sed s/NAME/$pml/` >> $DIFF +done + +# Cleanup +mv $DIFF .. +cd .. +$RM -rf $DIFFDIR diff --git a/ompi/mca/pml/bfo/configure.params b/ompi/mca/pml/bfo/configure.params new file mode 100644 index 0000000000..d14bd950fe --- /dev/null +++ b/ompi/mca/pml/bfo/configure.params @@ -0,0 +1,25 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/ompi/mca/pml/bfo/help-mpi-pml-bfo.txt b/ompi/mca/pml/bfo/help-mpi-pml-bfo.txt new file mode 100644 index 0000000000..b3c44ec80e --- /dev/null +++ b/ompi/mca/pml/bfo/help-mpi-pml-bfo.txt @@ -0,0 +1,20 @@ +# -*- text -*- +# +# Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +[eager_limit_too_small] +The "eager limit" MCA parameter in the %s BTL was set to a value which +is too low for Open MPI to function properly. Please re-run your job +with a higher eager limit value for this BTL; the exact MCA parameter +name and its corresponding minimum value is shown below. + + Local host: %s + BTL name: %s + BTL eager limit value: %d (set via btl_%s_eager_limit) + BTL eager limit minimum: %d + MCA parameter name: btl_%s_eager_limit diff --git a/ompi/mca/pml/bfo/pml_bfo.c b/ompi/mca/pml/bfo/pml_bfo.c new file mode 100644 index 0000000000..f57dab4b48 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo.c @@ -0,0 +1,898 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2006-2008 University of Houston. All rights reserved. + * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "opal/class/opal_bitmap.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/btl/base/base.h"
+#include "pml_bfo.h"
+#include "pml_bfo_component.h"
+#include "pml_bfo_comm.h"
+#include "pml_bfo_hdr.h"
+#include "pml_bfo_recvfrag.h"
+#include "pml_bfo_sendreq.h"
+#include "pml_bfo_recvreq.h"
+#include "pml_bfo_rdmafrag.h"
+#include "ompi/mca/bml/base/base.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/util/show_help.h"
+/* BFO FAILOVER CODE - begin */
+#include "pml_bfo_failover.h"
+/* BFO FAILOVER CODE - end */
+
+#include "ompi/runtime/ompi_cr.h"
+
+mca_pml_bfo_t mca_pml_bfo = {
+    {
+        mca_pml_bfo_add_procs,
+        mca_pml_bfo_del_procs,
+        mca_pml_bfo_enable,
+        mca_pml_bfo_progress,
+        mca_pml_bfo_add_comm,
+        mca_pml_bfo_del_comm,
+        mca_pml_bfo_irecv_init,
+        mca_pml_bfo_irecv,
+        mca_pml_bfo_recv,
+        mca_pml_bfo_isend_init,
+        mca_pml_bfo_isend,
+        mca_pml_bfo_send,
+        mca_pml_bfo_iprobe,
+        mca_pml_bfo_probe,
+        mca_pml_bfo_start,
+        mca_pml_bfo_dump,
+        mca_pml_bfo_ft_event,
+        65535,
+        INT_MAX
+    }
+};
+
+
+void mca_pml_bfo_error_handler( struct mca_btl_base_module_t* btl,
+                                int32_t flags, ompi_proc_t* errproc,
+                                char* btlinfo );
+
+int mca_pml_bfo_enable(bool enable)
+{
+    if( false == enable ) {
+        return OMPI_SUCCESS;
+    }
+
+    OBJ_CONSTRUCT(&mca_pml_bfo.lock, opal_mutex_t);
+
+    /* fragments */
+    OBJ_CONSTRUCT(&mca_pml_bfo.rdma_frags, ompi_free_list_t);
+    ompi_free_list_init_new( &mca_pml_bfo.rdma_frags,
+                             sizeof(mca_pml_bfo_rdma_frag_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_rdma_frag_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    OBJ_CONSTRUCT(&mca_pml_bfo.recv_frags, ompi_free_list_t);
+
+    ompi_free_list_init_new( &mca_pml_bfo.recv_frags,
+                             sizeof(mca_pml_bfo_recv_frag_t) + mca_pml_bfo.unexpected_limit,
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_recv_frag_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    OBJ_CONSTRUCT(&mca_pml_bfo.pending_pckts, ompi_free_list_t);
+    ompi_free_list_init_new( &mca_pml_bfo.pending_pckts,
+                             sizeof(mca_pml_bfo_pckt_pending_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_pckt_pending_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+
+    OBJ_CONSTRUCT(&mca_pml_bfo.buffers, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_pml_bfo.send_ranges, ompi_free_list_t);
+    ompi_free_list_init_new( &mca_pml_bfo.send_ranges,
+                             sizeof(mca_pml_bfo_send_range_t) +
+                             (mca_pml_bfo.max_send_per_range - 1) * sizeof(mca_pml_bfo_com_btl_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_send_range_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    /* pending operations */
+    OBJ_CONSTRUCT(&mca_pml_bfo.send_pending, opal_list_t);
+    OBJ_CONSTRUCT(&mca_pml_bfo.recv_pending, opal_list_t);
+    OBJ_CONSTRUCT(&mca_pml_bfo.pckt_pending, opal_list_t);
+    OBJ_CONSTRUCT(&mca_pml_bfo.rdma_pending, opal_list_t);
+    /* missing communicator pending list */
+    OBJ_CONSTRUCT(&mca_pml_bfo.non_existing_communicator_pending, opal_list_t);
+
+    /**
+     * If we get here, this is the PML that was selected for the run.
+     * We should get ownership of the send and receive request lists, and
+     * initialize them with the size of our own requests.
+     */
+    ompi_free_list_init_new( &mca_pml_base_send_requests,
+                             sizeof(mca_pml_bfo_send_request_t) +
+                             (mca_pml_bfo.max_rdma_per_request - 1) *
+                             sizeof(mca_pml_bfo_com_btl_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_send_request_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    ompi_free_list_init_new( &mca_pml_base_recv_requests,
+                             sizeof(mca_pml_bfo_recv_request_t) +
+                             (mca_pml_bfo.max_rdma_per_request - 1) *
+                             sizeof(mca_pml_bfo_com_btl_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_recv_request_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    mca_pml_bfo.enabled = true;
+    return OMPI_SUCCESS;
+}
+
+int mca_pml_bfo_add_comm(ompi_communicator_t* comm)
+{
+    /* allocate pml specific comm data */
+    mca_pml_bfo_comm_t* pml_comm = OBJ_NEW(mca_pml_bfo_comm_t);
+    opal_list_item_t *item, *next_item;
+    mca_pml_bfo_recv_frag_t* frag;
+    mca_pml_bfo_comm_proc_t* pml_proc;
+    mca_pml_bfo_match_hdr_t* hdr;
+    int i;
+
+    if (NULL == pml_comm) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* should never happen, but it has happened in the past, so check */
+    if (comm->c_contextid > mca_pml_bfo.super.pml_max_contextid) {
+        OBJ_RELEASE(pml_comm);
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    mca_pml_bfo_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count);
+    comm->c_pml_comm = pml_comm;
+
+    for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
+        pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i);
+        OBJ_RETAIN(pml_comm->procs[i].ompi_proc);
+    }
+    /* Grab all related messages from the non_existing_communicator pending queue */
+    for( item = opal_list_get_first(&mca_pml_bfo.non_existing_communicator_pending);
+         item != opal_list_get_end(&mca_pml_bfo.non_existing_communicator_pending);
+         item = next_item ) {
+        frag = (mca_pml_bfo_recv_frag_t*)item;
+        next_item = opal_list_get_next(item);
+        hdr = &frag->hdr.hdr_match;
+
+        /* Is this fragment for the current communicator ? */
+        if( frag->hdr.hdr_match.hdr_ctx != comm->c_contextid )
+            continue;
+
+        /* As we now know we work on a fragment for this communicator
+         * we should remove it from the
+         * non_existing_communicator_pending list. */
+        opal_list_remove_item( &mca_pml_bfo.non_existing_communicator_pending,
+                               item );
+
+      add_fragment_to_unexpected:
+
+        /* We generate the MSG_ARRIVED event as soon as the PML is aware
+         * of a matching fragment arrival, independent of whether it is
+         * received in the correct order or not. This allows the tools to
+         * figure out if messages are received out of order (as can happen
+         * with multiple network interfaces).
+         */
+        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm,
+                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+        /* There is no matching to be done, and no lock to be held on the communicator as
+         * we know at this point that the communicator has not yet been returned to the user.
+         * The only required protection is around the non_existing_communicator_pending queue.
+         * We just have to push the fragment into the unexpected list of the corresponding
+         * proc, or into the out-of-order (cant_match) list.
+         */
+        pml_proc = &(pml_comm->procs[hdr->hdr_src]);
+
+        if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
+            /* We're now expecting the next sequence number.
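+             * (Illustrative note added in review: with expected_sequence == 5
+             * and fragments 6 and 7 parked on frags_cant_match, accepting 5
+             * here bumps the expected value to 6; the loop below then drains
+             * 6 and, on the next pass through the goto, 7 as well.)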
+             */
+            pml_proc->expected_sequence++;
+            opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
+            PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
+                                   hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+            /* And now the ugly part. As some fragments can be inserted in the cant_match list,
+             * every time we successfully add a fragment to the unexpected list we have to make
+             * sure the next one is not in the cant_match. Otherwise, we will end up in a deadlock
+             * situation as the cant_match is only checked when a new fragment is received from
+             * the network.
+             */
+            for(frag = (mca_pml_bfo_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match);
+                frag != (mca_pml_bfo_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match);
+                frag = (mca_pml_bfo_recv_frag_t *)opal_list_get_next(frag)) {
+                hdr = &frag->hdr.hdr_match;
+                /* If the message has the next expected seq from that proc...  */
+                if(hdr->hdr_seq != pml_proc->expected_sequence)
+                    continue;
+
+                opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag);
+                goto add_fragment_to_unexpected;
+            }
+        } else {
+            opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
+        }
+    }
+    return OMPI_SUCCESS;
+}
+
+int mca_pml_bfo_del_comm(ompi_communicator_t* comm)
+{
+    mca_pml_bfo_comm_t* pml_comm = comm->c_pml_comm;
+    int i;
+
+    for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
+        OBJ_RELEASE(pml_comm->procs[i].ompi_proc);
+    }
+    OBJ_RELEASE(comm->c_pml_comm);
+    comm->c_pml_comm = NULL;
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ * For each proc, set up a data structure that indicates the BTLs
+ * that can be used to reach the destination.
+ */
+
+int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs)
+{
+    opal_bitmap_t reachable;
+    int rc;
+    size_t i;
+    opal_list_item_t *item;
+
+    if(nprocs == 0)
+        return OMPI_SUCCESS;
+
+    /* we don't have any endpoint data we need to cache on the
+       ompi_proc_t, so set proc_pml to NULL */
+    for (i = 0 ; i < nprocs ; ++i) {
+        procs[i]->proc_pml = NULL;
+    }
+
+    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
+    rc = opal_bitmap_init(&reachable, (int)nprocs);
+    if(OMPI_SUCCESS != rc)
+        return rc;
+
+    /*
+     * JJH: Disable this in FT enabled builds since
+     * we use a wrapper PML. It will cause this check to
+     * return failure as all processes will return the wrapper PML
+     * component in use instead of the wrapped PML component underneath.
+     */
+#if OPAL_ENABLE_FT_CR == 0
+    /* make sure remote procs are using the same PML as us */
+    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("bfo",
+                                                              procs,
+                                                              nprocs))) {
+        return rc;
+    }
+#endif
+
+    rc = mca_bml.bml_add_procs( nprocs,
+                                procs,
+                                &reachable );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    /* Check that values supplied by all initialized BTLs will work
+       for us.  Note that this is the list of all initialized BTLs,
+       not the ones used for the just-added procs.  This is a little
+       overkill and inaccurate, as we may end up not using the BTL in
+       question and all add_procs calls after the first one are
+       duplicating an already completed check.  But the final
+       initialization of the PML occurs before the final
+       initialization of the BTLs, and iterating through the in-use
+       BTLs requires iterating over the procs, as the BML does not
+       expose all currently in-use BTLs.
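+       (Illustrative note added in review: this is why the eager-limit
+       check below compares against sizeof(mca_pml_bfo_hdr_t) -- a BTL
+       whose eager limit cannot hold even the largest bfo header could
+       not carry a zero-byte MPI message, so it is rejected outright.)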
*/ + + for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ; + item != opal_list_get_end(&mca_btl_base_modules_initialized) ; + item = opal_list_get_next(item)) { + mca_btl_base_selected_module_t *sm = + (mca_btl_base_selected_module_t*) item; + if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_bfo_hdr_t)) { + orte_show_help("help-mpi-pml-bfo.txt", "eager_limit_too_small", + true, + sm->btl_component->btl_version.mca_component_name, + orte_process_info.nodename, + sm->btl_component->btl_version.mca_component_name, + sm->btl_module->btl_eager_limit, + sm->btl_component->btl_version.mca_component_name, + sizeof(mca_pml_bfo_hdr_t), + sm->btl_component->btl_version.mca_component_name); + rc = OMPI_ERR_BAD_PARAM; + goto cleanup_and_return; + } + } + + + /* TODO: Move these callback registration to another place */ + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_MATCH, + mca_pml_bfo_recv_frag_callback_match, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDV, + mca_pml_bfo_recv_frag_callback_rndv, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RGET, + mca_pml_bfo_recv_frag_callback_rget, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_ACK, + mca_pml_bfo_recv_frag_callback_ack, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FRAG, + mca_pml_bfo_recv_frag_callback_frag, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_PUT, + mca_pml_bfo_recv_frag_callback_put, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FIN, + mca_pml_bfo_recv_frag_callback_fin, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + +/* BFO FAILOVER CODE - begin */ + /* The following four functions are utilized when failover + * support for openib is enabled. 
+ */
+    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY,
+                               mca_pml_bfo_recv_frag_callback_rndvrestartnotify,
+                               NULL );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK,
+                               mca_pml_bfo_recv_frag_callback_rndvrestartack,
+                               NULL );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK,
+                               mca_pml_bfo_recv_frag_callback_rndvrestartnack,
+                               NULL );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
+                               mca_pml_bfo_recv_frag_callback_recverrnotify,
+                               NULL );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+/* BFO FAILOVER CODE - end */
+
+    /* register error handlers */
+    rc = mca_bml.bml_register_error(mca_pml_bfo_error_handler);
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+  cleanup_and_return:
+    OBJ_DESTRUCT(&reachable);
+
+    return rc;
+}
+
+/*
+ * iterate through each proc and notify any PTLs associated
+ * with the proc that it is/has gone away
+ */
+
+int mca_pml_bfo_del_procs(ompi_proc_t** procs, size_t nprocs)
+{
+    return mca_bml.bml_del_procs(nprocs, procs);
+}
+
+/*
+ * diagnostics
+ */
+
+int mca_pml_bfo_dump(struct ompi_communicator_t* comm, int verbose)
+{
+    struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
+    int i;
+
+    /* iterate through all procs on communicator */
+    for( i = 0; i < (int)pml_comm->num_procs; i++ ) {
+        mca_pml_bfo_comm_proc_t* proc = &pml_comm->procs[i];
+        mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->ompi_proc->proc_bml;
+        size_t n;
+
+        opal_output(0, "[Rank %d]\n", i);
+        /* dump all receive queues */
+
+        /* dump all btls */
+        for(n=0; n<ep->btl_eager.arr_size; n++) {
+            mca_bml_base_btl_t* bml_btl = &ep->btl_eager.bml_btls[n];
+            bml_btl->btl->btl_dump(bml_btl->btl, bml_btl->btl_endpoint, verbose);
+        }
+    }
+    return OMPI_SUCCESS;
+}
+
+static void mca_pml_bfo_fin_completion( mca_btl_base_module_t* btl,
+                                        struct mca_btl_base_endpoint_t* ep,
+                                        struct mca_btl_base_descriptor_t* des,
+                                        int status )
+{
+/* BFO FAILOVER CODE - begin */
+    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
+        mca_pml_bfo_repost_fin(des);
+        return;
+    }
+/* BFO FAILOVER CODE - end */
+    MCA_PML_BFO_PROGRESS_PENDING(btl);
+}
+
+/**
+ * Send a FIN to the peer. If we fail to send this FIN (no more available
+ * fragments or the send failed), this function automatically adds the FIN
+ * to the list of pending FINs, which guarantees that the FIN will be sent
+ * later.
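+ * (Illustrative note added in review: on OMPI_ERR_OUT_OF_RESOURCE the FIN
+ * is parked on mca_pml_bfo.pckt_pending via MCA_PML_BFO_ADD_FIN_TO_PENDING
+ * and is resent from mca_pml_bfo_process_pending_packets() once the BTL
+ * frees a resource.)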
+ */ +int mca_pml_bfo_send_fin( ompi_proc_t* proc, + mca_bml_base_btl_t* bml_btl, + ompi_ptr_t hdr_des, + uint8_t order, + uint32_t status, +/* BFO FAILOVER CODE - begin */ + uint16_t seq, + uint8_t restartseq, + uint16_t ctx, uint32_t src) +/* BFO FAILOVER CODE - end */ +{ + mca_btl_base_descriptor_t* fin; + mca_pml_bfo_fin_hdr_t* hdr; + int rc; + + mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_bfo_fin_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + + if(NULL == fin) { + MCA_PML_BFO_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); + return OMPI_ERR_OUT_OF_RESOURCE; + } + fin->des_cbfunc = mca_pml_bfo_fin_completion; + fin->des_cbdata = proc; + + /* fill in header */ + hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_src->seg_addr.pval; + hdr->hdr_match.hdr_common.hdr_flags = 0; + hdr->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; + hdr->hdr_des = hdr_des; + hdr->hdr_fail = status; + hdr->hdr_match.hdr_seq = seq; + hdr->hdr_restartseq = restartseq; + hdr->hdr_match.hdr_ctx = ctx; + hdr->hdr_match.hdr_src = src; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc); + + /* queue request */ + rc = mca_bml_base_send( bml_btl, + fin, + MCA_PML_BFO_HDR_TYPE_FIN ); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + MCA_PML_BFO_PROGRESS_PENDING(bml_btl->btl); + } + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, fin); + MCA_PML_BFO_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); + return OMPI_ERR_OUT_OF_RESOURCE; +} + +void mca_pml_bfo_process_pending_packets(struct mca_btl_base_module_t* btl) +{ + mca_pml_bfo_pckt_pending_t *pckt; + int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_bfo.pckt_pending); + + for(i = 0; i < s; i++) { + mca_bml_base_btl_t *send_dst = NULL; + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + pckt = (mca_pml_bfo_pckt_pending_t*) + opal_list_remove_first(&mca_pml_bfo.pckt_pending); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + if(NULL == pckt) + break; + if(pckt->bml_btl != NULL && + pckt->bml_btl->btl == btl) { + send_dst = pckt->bml_btl; + } else { + send_dst = mca_bml_base_btl_array_find( + &pckt->proc->proc_bml->btl_eager, btl); + } + if(NULL == send_dst) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.pckt_pending, + (opal_list_item_t*)pckt); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + continue; + } + + switch(pckt->hdr.hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_ACK: + rc = mca_pml_bfo_recv_request_ack_send_btl(pckt->proc, + send_dst, + pckt->hdr.hdr_ack.hdr_src_req.lval, + pckt->hdr.hdr_ack.hdr_dst_req.pval, + pckt->hdr.hdr_ack.hdr_send_offset, + pckt->hdr.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA); + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.pckt_pending, + (opal_list_item_t*)pckt); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return; + } + break; + case MCA_PML_BFO_HDR_TYPE_FIN: + rc = mca_pml_bfo_send_fin(pckt->proc, send_dst, + pckt->hdr.hdr_fin.hdr_des, + pckt->order, + pckt->hdr.hdr_fin.hdr_fail, + pckt->hdr.hdr_match.hdr_seq, + pckt->hdr.hdr_fin.hdr_restartseq, + pckt->hdr.hdr_match.hdr_ctx, + pckt->hdr.hdr_match.hdr_src); + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + return; + } + break; + default: + opal_output(0, "[%s:%d] wrong header type\n", + __FILE__, __LINE__); + break; + } + /* We're done with this packet, return it back to the free list */ + MCA_PML_BFO_PCKT_PENDING_RETURN(pckt); + } +} + 
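+/*
+ * (Editorial sketch added in review, not part of the original change.)
+ * The pending lists above all follow the same drain pattern: snapshot the
+ * list size, pop one item under the lock, retry it, and re-queue it when
+ * resources are still exhausted.  A minimal form of that loop, assuming a
+ * generic pending list and a retry(item) action, looks like:
+ *
+ *     opal_list_item_t *item;
+ *     int32_t i, s = (int32_t)opal_list_get_size(&pending);
+ *     for (i = 0; i < s; i++) {
+ *         OPAL_THREAD_LOCK(&lock);
+ *         item = opal_list_remove_first(&pending);
+ *         OPAL_THREAD_UNLOCK(&lock);
+ *         if (NULL == item)
+ *             break;
+ *         if (OMPI_ERR_OUT_OF_RESOURCE == retry(item))
+ *             break;   /* the item re-queues itself; try again later */
+ *     }
+ *
+ * Bounding the loop by the initial size s guarantees forward progress even
+ * when every retry fails and puts its item back on the list.
+ */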
+void mca_pml_bfo_process_pending_rdma(void) +{ + mca_pml_bfo_rdma_frag_t* frag; + int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_bfo.rdma_pending); + + for(i = 0; i < s; i++) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + frag = (mca_pml_bfo_rdma_frag_t*) + opal_list_remove_first(&mca_pml_bfo.rdma_pending); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + if(NULL == frag) + break; + if(frag->rdma_state == MCA_PML_BFO_RDMA_PUT) { + frag->retries++; + rc = mca_pml_bfo_send_request_put_frag(frag); + } else { + rc = mca_pml_bfo_recv_request_get_frag(frag); + } + if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + break; + } +} + + +void mca_pml_bfo_error_handler( + struct mca_btl_base_module_t* btl, int32_t flags, + ompi_proc_t* errproc, char* btlname ) { +/* BFO FAILOVER CODE - begin */ + /* If we get a non-fatal error, try to failover */ + if (flags & MCA_BTL_ERROR_FLAGS_NONFATAL) { + mca_pml_bfo_failover_error_handler(btl, flags, errproc, btlname); +/* BFO FAILOVER CODE - end */ + } else { + orte_errmgr.abort(-1, NULL); + } +} + +#if OPAL_ENABLE_FT_CR == 0 +int mca_pml_bfo_ft_event( int state ) { + return OMPI_SUCCESS; +} +#else +int mca_pml_bfo_ft_event( int state ) +{ + static bool first_continue_pass = false; + ompi_proc_t** procs = NULL; + size_t num_procs; + int ret, p; + + if(OPAL_CRS_CHECKPOINT == state) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1); + orte_grpcomm.barrier(); + } + + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0); + } + else if(OPAL_CRS_CONTINUE == state) { + first_continue_pass = !first_continue_pass; + + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2); + } + + if( ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Get a list of processes + */ + procs = ompi_proc_all(&num_procs); + if(NULL == procs) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Refresh the proc structure, and publish our proc info in the modex. + * NOTE: Do *not* call ompi_proc_finalize as there are many places in + * the code that point to indv. procs in this strucutre. For our + * needs here we only need to fix up the modex, bml and pml + * references. + */ + if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) { + opal_output(0, + "pml:bfo: ft_event(Restart): proc_refresh Failed %d", + ret); + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free (procs); + return ret; + } + } + } + else if(OPAL_CRS_RESTART_PRE == state ) { + /* Nothing here */ + } + else if(OPAL_CRS_RESTART == state ) { + /* + * Get a list of processes + */ + procs = ompi_proc_all(&num_procs); + if(NULL == procs) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Clean out the modex information since it is invalid now. + * orte_grpcomm.purge_proc_attrs(); + * This happens at the ORTE level, so doing it again here will cause + * some issues with socket caching. + */ + + + /* + * Refresh the proc structure, and publish our proc info in the modex. + * NOTE: Do *not* call ompi_proc_finalize as there are many places in + * the code that point to indv. procs in this strucutre. For our + * needs here we only need to fix up the modex, bml and pml + * references. 
+ */ + if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) { + opal_output(0, + "pml:bfo: ft_event(Restart): proc_refresh Failed %d", + ret); + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free (procs); + return ret; + } + } + else if(OPAL_CRS_TERM == state ) { + ; + } + else { + ; + } + + /* Call the BML + * BML is expected to call ft_event in + * - BTL(s) + * - MPool(s) + */ + if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) { + opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n", + ret); + } + + if(OPAL_CRS_CHECKPOINT == state) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1); + + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0); + /* JJH Cannot barrier here due to progress engine -- orte_grpcomm.barrier();*/ + } + } + else if(OPAL_CRS_CONTINUE == state) { + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3); + } + + if( ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Exchange the modex information once again. + * BTLs will have republished their modex information. + */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + opal_output(0, + "pml:bfo: ft_event(Restart): Failed orte_grpcomm.modex() = %d", + ret); + return ret; + } + + /* + * Startup the PML stack now that the modex is running again + * Add the new procs (BTLs redo modex recv's) + */ + if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret); + return ret; + } + + /* Is this barrier necessary ? JJH */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); + return ret; + } + + if( NULL != procs ) { + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free(procs); + procs = NULL; + } + } + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1); + } + } + else if(OPAL_CRS_RESTART_PRE == state ) { + /* Nothing here */ + } + else if(OPAL_CRS_RESTART == state ) { + /* + * Exchange the modex information once again. + * BTLs will have republished their modex information. + */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + opal_output(0, + "pml:bfo: ft_event(Restart): Failed orte_grpcomm.modex() = %d", + ret); + return ret; + } + + /* + * Startup the PML stack now that the modex is running again + * Add the new procs (BTLs redo modex recv's) + */ + if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret); + return ret; + } + + /* Is this barrier necessary ? 
JJH */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); + return ret; + } + + if( NULL != procs ) { + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free(procs); + procs = NULL; + } + } + else if(OPAL_CRS_TERM == state ) { + ; + } + else { + ; + } + + return OMPI_SUCCESS; +} +#endif /* OPAL_ENABLE_FT_CR */ + +int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2) +{ + const mca_pml_bfo_com_btl_t *b1 = (const mca_pml_bfo_com_btl_t *) v1; + const mca_pml_bfo_com_btl_t *b2 = (const mca_pml_bfo_com_btl_t *) v2; + + if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight) + return 1; + if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight) + return -1; + + return 0; +} + diff --git a/ompi/mca/pml/bfo/pml_bfo.h b/ompi/mca/pml/bfo/pml_bfo.h new file mode 100644 index 0000000000..7129382787 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo.h @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_BFO_H +#define MCA_PML_BFO_H + +#include "ompi_config.h" +#include "ompi/class/ompi_free_list.h" +#include "ompi/request/request.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/base/pml_base_request.h" +#include "ompi/mca/pml/base/pml_base_bsend.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "ompi/datatype/ompi_datatype.h" +#include "pml_bfo_hdr.h" +#include "ompi/mca/bml/base/base.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/allocator/base/base.h" + +BEGIN_C_DECLS + +/** + * BFO PML module + */ + +struct mca_pml_bfo_t { + mca_pml_base_module_t super; + + int priority; + int free_list_num; /* initial size of free list */ + int free_list_max; /* maximum size of free list */ + int free_list_inc; /* number of elements to grow free list */ + size_t send_pipeline_depth; + size_t recv_pipeline_depth; + size_t rdma_put_retries_limit; + int max_rdma_per_request; + int max_send_per_range; + bool leave_pinned; + int leave_pinned_pipeline; + + /* lock queue access */ + opal_mutex_t lock; + + /* free lists */ + ompi_free_list_t rdma_frags; + ompi_free_list_t recv_frags; + ompi_free_list_t pending_pckts; + ompi_free_list_t buffers; + ompi_free_list_t send_ranges; + + /* list of pending operations */ + opal_list_t pckt_pending; + opal_list_t send_pending; + opal_list_t recv_pending; + opal_list_t rdma_pending; + /* List of pending fragments without a matching communicator */ + opal_list_t non_existing_communicator_pending; + bool enabled; +/* BFO FAILOVER CODE - begin */ + bool fast_failover; +/* BFO FAILOVER CODE - end */ + char* allocator_name; + mca_allocator_base_module_t* allocator; + uint32_t unexpected_limit; +}; +typedef struct mca_pml_bfo_t mca_pml_bfo_t; + +extern mca_pml_bfo_t mca_pml_bfo; +extern int mca_pml_bfo_output; + +/* + * PML interface functions. 
+ */ + +extern int mca_pml_bfo_add_comm( + struct ompi_communicator_t* comm +); + +extern int mca_pml_bfo_del_comm( + struct ompi_communicator_t* comm +); + +extern int mca_pml_bfo_add_procs( + struct ompi_proc_t **procs, + size_t nprocs +); + +extern int mca_pml_bfo_del_procs( + struct ompi_proc_t **procs, + size_t nprocs +); + +extern int mca_pml_bfo_enable( bool enable ); + +extern int mca_pml_bfo_progress(void); + +extern int mca_pml_bfo_iprobe( int dst, + int tag, + struct ompi_communicator_t* comm, + int *matched, + ompi_status_public_t* status ); + +extern int mca_pml_bfo_probe( int dst, + int tag, + struct ompi_communicator_t* comm, + ompi_status_public_t* status ); + +extern int mca_pml_bfo_isend_init( void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_bfo_isend( void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_bfo_send( void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm ); + +extern int mca_pml_bfo_irecv_init( void *buf, + size_t count, + ompi_datatype_t *datatype, + int src, + int tag, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_bfo_irecv( void *buf, + size_t count, + ompi_datatype_t *datatype, + int src, + int tag, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_bfo_recv( void *buf, + size_t count, + ompi_datatype_t *datatype, + int src, + int tag, + struct ompi_communicator_t* comm, + ompi_status_public_t* status ); + +extern int mca_pml_bfo_dump( struct ompi_communicator_t* comm, + int verbose ); + +extern int mca_pml_bfo_start( size_t count, + ompi_request_t** requests ); + +extern int mca_pml_bfo_ft_event( int state ); + +END_C_DECLS + +struct mca_pml_bfo_pckt_pending_t { + ompi_free_list_item_t super; + ompi_proc_t* proc; + mca_pml_bfo_hdr_t hdr; + struct mca_bml_base_btl_t *bml_btl; + uint8_t order; +}; +typedef struct mca_pml_bfo_pckt_pending_t mca_pml_bfo_pckt_pending_t; +OBJ_CLASS_DECLARATION(mca_pml_bfo_pckt_pending_t); + +#define MCA_PML_BFO_PCKT_PENDING_ALLOC(pckt,rc) \ +do { \ + ompi_free_list_item_t* item; \ + OMPI_FREE_LIST_WAIT(&mca_pml_bfo.pending_pckts, item, rc); \ + pckt = (mca_pml_bfo_pckt_pending_t*)item; \ +} while (0) + +#define MCA_PML_BFO_PCKT_PENDING_RETURN(pckt) \ +do { \ + /* return packet */ \ + OMPI_FREE_LIST_RETURN(&mca_pml_bfo.pending_pckts, \ + (ompi_free_list_item_t*)pckt); \ +} while(0) + +#define MCA_PML_BFO_ADD_FIN_TO_PENDING(P, D, B, O, S) \ + do { \ + mca_pml_bfo_pckt_pending_t *_pckt; \ + int _rc; \ + \ + MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt,_rc); \ + _pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; \ + _pckt->hdr.hdr_fin.hdr_des = (D); \ + _pckt->hdr.hdr_fin.hdr_fail = (S); \ + _pckt->proc = (P); \ + _pckt->bml_btl = (B); \ + _pckt->order = (O); \ + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); \ + opal_list_append(&mca_pml_bfo.pckt_pending, \ + (opal_list_item_t*)_pckt); \ + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); \ + } while(0) + + +int mca_pml_bfo_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, + ompi_ptr_t hdr_des, uint8_t order, uint32_t status, + uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src); + +/* This function 
tries to resend FIN/ACK packets from the pckt_pending queue.
+ * Packets are added to the queue when sending a FIN or ACK fails due to
+ * resource unavailability. The bml_btl passed to the function does not
+ * represent the packet's destination; it represents the BTL on which the
+ * resource was freed, so only this BTL should be considered for resending
+ * packets */
+void mca_pml_bfo_process_pending_packets(struct mca_btl_base_module_t* btl);
+
+/* This function retries failed PUT/GET operations on a frag. When an RDMA
+ * operation cannot be accomplished for some reason, the frag is put on the
+ * rdma_pending list. Later the operation is retried. The destination of the
+ * RDMA operation is stored inside the frag structure */
+void mca_pml_bfo_process_pending_rdma(void);
+
+#define MCA_PML_BFO_PROGRESS_PENDING(btl)                            \
+    do {                                                             \
+        if(opal_list_get_size(&mca_pml_bfo.pckt_pending))            \
+            mca_pml_bfo_process_pending_packets(btl);                \
+        if(opal_list_get_size(&mca_pml_bfo.recv_pending))            \
+            mca_pml_bfo_recv_request_process_pending();              \
+        if(opal_list_get_size(&mca_pml_bfo.send_pending))            \
+            mca_pml_bfo_send_request_process_pending(btl);           \
+        if(opal_list_get_size(&mca_pml_bfo.rdma_pending))            \
+            mca_pml_bfo_process_pending_rdma();                      \
+    } while (0)
+
+/*
+ * Compute the total number of bytes on supplied descriptor
+ */
+#define MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH(segments, count, hdrlen, length) \
+do {                                                                        \
+    size_t i;                                                               \
+                                                                            \
+    for( i = 0; i < count; i++ ) {                                          \
+        length += segments[i].seg_len;                                      \
+    }                                                                       \
+    length -= hdrlen;                                                       \
+} while(0)
+
+/* represents the BTL chosen for sending a request */
+struct mca_pml_bfo_com_btl_t {
+    mca_bml_base_btl_t *bml_btl;
+    struct mca_mpool_base_registration_t* btl_reg;
+    size_t length;
+};
+typedef struct mca_pml_bfo_com_btl_t mca_pml_bfo_com_btl_t;
+
+int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2);
+
+/* Calculate what percentage of a message to send through each BTL according to
+ * relative weight */
+static inline void
+mca_pml_bfo_calc_weighted_length( mca_pml_bfo_com_btl_t *btls, int num_btls, size_t size,
+                                  double weight_total )
+{
+    int i;
+    size_t length_left;
+
+    /* shortcut for the common case of only one BTL */
+    if( OPAL_LIKELY(1 == num_btls) ) {
+        btls[0].length = size;
+        return;
+    }
+
+    /* sort BTLs according to their weights so BTLs with a smaller weight will
+     * not hijack all of the traffic */
+    qsort( btls, num_btls, sizeof(mca_pml_bfo_com_btl_t),
+           mca_pml_bfo_com_btl_comp );
+
+    for(length_left = size, i = 0; i < num_btls; i++) {
+        mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
+        size_t length = 0;
+        if( OPAL_UNLIKELY(0 != length_left) ) {
+            length = (length_left > bml_btl->btl->btl_eager_limit)?
+                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
+                length_left;
+
+            if(length > length_left)
+                length = length_left;
+            length_left -= length;
+        }
+        btls[i].length = length;
+    }
+
+    /* account for rounding errors */
+    btls[0].length += length_left;
+}
+
+#endif
diff --git a/ompi/mca/pml/bfo/pml_bfo_comm.c b/ompi/mca/pml/bfo/pml_bfo_comm.c
new file mode 100644
index 0000000000..168eaf7912
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_comm.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include <string.h>
+
+#include "pml_bfo.h"
+#include "pml_bfo_comm.h"
+
+
+
+static void mca_pml_bfo_comm_proc_construct(mca_pml_bfo_comm_proc_t* proc)
+{
+    proc->expected_sequence = 1;
+    proc->ompi_proc = NULL;
+    proc->send_sequence = 0;
+    OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t);
+    OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t);
+    OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t);
+}
+
+
+static void mca_pml_bfo_comm_proc_destruct(mca_pml_bfo_comm_proc_t* proc)
+{
+    OBJ_DESTRUCT(&proc->frags_cant_match);
+    OBJ_DESTRUCT(&proc->specific_receives);
+    OBJ_DESTRUCT(&proc->unexpected_frags);
+}
+
+
+static OBJ_CLASS_INSTANCE(
+    mca_pml_bfo_comm_proc_t,
+    opal_object_t,
+    mca_pml_bfo_comm_proc_construct,
+    mca_pml_bfo_comm_proc_destruct);
+
+
+static void mca_pml_bfo_comm_construct(mca_pml_bfo_comm_t* comm)
+{
+    OBJ_CONSTRUCT(&comm->wild_receives, opal_list_t);
+    OBJ_CONSTRUCT(&comm->matching_lock, opal_mutex_t);
+    comm->recv_sequence = 0;
+    comm->procs = NULL;
+    comm->num_procs = 0;
+}
+
+
+static void mca_pml_bfo_comm_destruct(mca_pml_bfo_comm_t* comm)
+{
+    size_t i;
+    for(i=0; i<comm->num_procs; i++)
+        OBJ_DESTRUCT((&comm->procs[i]));
+    if(NULL != comm->procs)
+        free(comm->procs);
+    OBJ_DESTRUCT(&comm->wild_receives);
+    OBJ_DESTRUCT(&comm->matching_lock);
+}
+
+
+OBJ_CLASS_INSTANCE(
+    mca_pml_bfo_comm_t,
+    opal_object_t,
+    mca_pml_bfo_comm_construct,
+    mca_pml_bfo_comm_destruct);
+
+
+int mca_pml_bfo_comm_init_size(mca_pml_bfo_comm_t* comm, size_t size)
+{
+    size_t i;
+
+    /* send message sequence-number support - sender side */
+    comm->procs = (mca_pml_bfo_comm_proc_t*)malloc(sizeof(mca_pml_bfo_comm_proc_t)*size);
+    if(NULL == comm->procs) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    for(i=0; i<size; i++) {
+        OBJ_CONSTRUCT(comm->procs+i, mca_pml_bfo_comm_proc_t);
+    }
+    comm->num_procs = size;
+    return OMPI_SUCCESS;
+}
+
+
diff --git a/ompi/mca/pml/bfo/pml_bfo_comm.h b/ompi/mca/pml/bfo/pml_bfo_comm.h
new file mode 100644
index 0000000000..c9564480fb
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_comm.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_PML_BFO_COMM_H +#define MCA_PML_BFO_COMM_H + +#include "opal/threads/mutex.h" +#include "opal/class/opal_list.h" +#include "ompi/proc/proc.h" +BEGIN_C_DECLS + + +struct mca_pml_bfo_comm_proc_t { + opal_object_t super; + uint16_t expected_sequence; /**< send message sequence number - receiver side */ + struct ompi_proc_t* ompi_proc; +#if OPAL_HAVE_THREAD_SUPPORT + volatile int32_t send_sequence; /**< send side sequence number */ +#else + int32_t send_sequence; /**< send side sequence number */ +#endif + opal_list_t frags_cant_match; /**< out-of-order fragment queues */ + opal_list_t specific_receives; /**< queues of unmatched specific receives */ + opal_list_t unexpected_frags; /**< unexpected fragment queues */ +}; +typedef struct mca_pml_bfo_comm_proc_t mca_pml_bfo_comm_proc_t; + + +/** + * Cached on ompi_communicator_t to hold queues/state + * used by the PML<->PTL interface for matching logic. + */ +struct mca_pml_comm_t { + opal_object_t super; +#if OPAL_HAVE_THREAD_SUPPORT + volatile uint32_t recv_sequence; /**< recv request sequence number - receiver side */ +#else + uint32_t recv_sequence; /**< recv request sequence number - receiver side */ +#endif + opal_mutex_t matching_lock; /**< matching lock */ + opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */ + mca_pml_bfo_comm_proc_t* procs; + size_t num_procs; +}; +typedef struct mca_pml_comm_t mca_pml_bfo_comm_t; + +OBJ_CLASS_DECLARATION(mca_pml_bfo_comm_t); + + +/** + * Initialize an instance of mca_pml_bfo_comm_t based on the communicator size. + * + * @param comm Instance of mca_pml_bfo_comm_t + * @param size Size of communicator + * @return OMPI_SUCCESS or error status on failure. + */ + +extern int mca_pml_bfo_comm_init_size(mca_pml_bfo_comm_t* comm, size_t size); + +END_C_DECLS +#endif + diff --git a/ompi/mca/pml/bfo/pml_bfo_component.c b/ompi/mca/pml/bfo/pml_bfo_component.c new file mode 100644 index 0000000000..01f3391e79 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_component.c @@ -0,0 +1,252 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include "opal/event/event.h"
+#include "mpi.h"
+#include "ompi/runtime/params.h"
+#include "ompi/mca/pml/pml.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "ompi/mca/pml/base/pml_base_bsend.h"
+#include "pml_bfo.h"
+#include "pml_bfo_hdr.h"
+#include "pml_bfo_sendreq.h"
+#include "pml_bfo_recvreq.h"
+#include "pml_bfo_rdmafrag.h"
+#include "pml_bfo_recvfrag.h"
+#include "ompi/mca/bml/base/base.h"
+#include "pml_bfo_component.h"
+#include "ompi/mca/allocator/base/base.h"
+
+OBJ_CLASS_INSTANCE( mca_pml_bfo_pckt_pending_t,
+                    ompi_free_list_item_t,
+                    NULL,
+                    NULL );
+
+static int mca_pml_bfo_component_open(void);
+static int mca_pml_bfo_component_close(void);
+static mca_pml_base_module_t*
+mca_pml_bfo_component_init( int* priority, bool enable_progress_threads,
+                            bool enable_mpi_threads );
+static int mca_pml_bfo_component_fini(void);
+int mca_pml_bfo_output = 0;
+
+mca_pml_base_component_2_0_0_t mca_pml_bfo_component = {
+
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        MCA_PML_BASE_VERSION_2_0_0,
+
+        "bfo",                        /* MCA component name */
+        OMPI_MAJOR_VERSION,           /* MCA component major version */
+        OMPI_MINOR_VERSION,           /* MCA component minor version */
+        OMPI_RELEASE_VERSION,         /* MCA component release version */
+        mca_pml_bfo_component_open,   /* component open */
+        mca_pml_bfo_component_close   /* component close */
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    mca_pml_bfo_component_init,       /* component init */
+    mca_pml_bfo_component_fini        /* component finalize */
+
+};
+
+void *mca_pml_bfo_seg_alloc( struct mca_mpool_base_module_t* mpool,
+                             size_t* size,
+                             mca_mpool_base_registration_t** registration);
+
+void mca_pml_bfo_seg_free( struct mca_mpool_base_module_t* mpool,
+                           void* segment );
+
+static inline int mca_pml_bfo_param_register_int(
+    const char* param_name,
+    int default_value)
+{
+    int id = mca_base_param_register_int("pml","bfo",param_name,NULL,default_value);
+    int param_value = default_value;
+    mca_base_param_lookup_int(id, &param_value);
+    return param_value;
+}
+
+static int mca_pml_bfo_component_open(void)
+{
+    int value;
+    mca_allocator_base_component_t* allocator_component;
+
+    value = mca_pml_bfo_param_register_int("verbose", 0);
+    mca_pml_bfo_output = opal_output_open(NULL);
+    opal_output_set_verbosity(mca_pml_bfo_output, value);
+
+    mca_pml_bfo.free_list_num =
+        mca_pml_bfo_param_register_int("free_list_num", 4);
+    mca_pml_bfo.free_list_max =
+        mca_pml_bfo_param_register_int("free_list_max", -1);
+    mca_pml_bfo.free_list_inc =
+        mca_pml_bfo_param_register_int("free_list_inc", 64);
+    mca_pml_bfo.priority =
+        mca_pml_bfo_param_register_int("priority", 5);
+    mca_pml_bfo.send_pipeline_depth =
+        mca_pml_bfo_param_register_int("send_pipeline_depth", 3);
+    mca_pml_bfo.recv_pipeline_depth =
+        mca_pml_bfo_param_register_int("recv_pipeline_depth", 4);
+    mca_pml_bfo.rdma_put_retries_limit =
+        mca_pml_bfo_param_register_int("rdma_put_retries_limit", 5);
+    mca_pml_bfo.max_rdma_per_request =
+        mca_pml_bfo_param_register_int("max_rdma_per_request", 4);
+    mca_pml_bfo.max_send_per_range =
+        mca_pml_bfo_param_register_int("max_send_per_range", 4);
+
+    mca_pml_bfo.unexpected_limit =
+        mca_pml_bfo_param_register_int("unexpected_limit", 128);
+/* BFO FAILOVER CODE - begin */
+    mca_pml_bfo.fast_failover =
+        mca_pml_bfo_param_register_int("fast_failover", 0);
+/* BFO FAILOVER CODE -
end */ + + mca_base_param_reg_string(&mca_pml_bfo_component.pmlm_version, + "allocator", + "Name of allocator component for unexpected messages", + false, false, + "bucket", + &mca_pml_bfo.allocator_name); + + allocator_component = mca_allocator_component_lookup( mca_pml_bfo.allocator_name ); + if(NULL == allocator_component) { + opal_output(0, "mca_pml_bfo_component_open: can't find allocator: %s\n", mca_pml_bfo.allocator_name); + return OMPI_ERROR; + } + + mca_pml_bfo.allocator = allocator_component->allocator_init(true, + mca_pml_bfo_seg_alloc, + mca_pml_bfo_seg_free, NULL); + if(NULL == mca_pml_bfo.allocator) { + opal_output(0, "mca_pml_bfo_component_open: unable to initialize allocator\n"); + return OMPI_ERROR; + } + + mca_pml_bfo.enabled = false; + return mca_bml_base_open(); +} + + +static int mca_pml_bfo_component_close(void) +{ + int rc; + + if (OMPI_SUCCESS != (rc = mca_bml_base_close())) { + return rc; + } + if (NULL != mca_pml_bfo.allocator_name) { + free(mca_pml_bfo.allocator_name); + } + + return OMPI_SUCCESS; +} + + +static mca_pml_base_module_t* +mca_pml_bfo_component_init( int* priority, + bool enable_progress_threads, + bool enable_mpi_threads ) +{ + opal_output_verbose( 10, mca_pml_bfo_output, + "in bfo, my priority is %d\n", mca_pml_bfo.priority); + + if((*priority) > mca_pml_bfo.priority) { + *priority = mca_pml_bfo.priority; + return NULL; + } + *priority = mca_pml_bfo.priority; + + if(OMPI_SUCCESS != mca_bml_base_init( enable_progress_threads, + enable_mpi_threads)) { + return NULL; + } + + /* Set this here (vs in component_open()) because + ompi_mpi_leave_pinned* may have been set after MCA params were + read (e.g., by the openib btl) */ + mca_pml_bfo.leave_pinned = (1 == ompi_mpi_leave_pinned); + mca_pml_bfo.leave_pinned_pipeline = (int) ompi_mpi_leave_pinned_pipeline; + + return &mca_pml_bfo.super; +} + +int mca_pml_bfo_component_fini(void) +{ + int rc; + + /* Shutdown BML */ + if(OMPI_SUCCESS != (rc = mca_bml.bml_finalize())) + return rc; + + if(!mca_pml_bfo.enabled) + return OMPI_SUCCESS; /* never selected.. return success.. 
*/ + mca_pml_bfo.enabled = false; /* not anymore */ + + OBJ_DESTRUCT(&mca_pml_bfo.rdma_pending); + OBJ_DESTRUCT(&mca_pml_bfo.pckt_pending); + OBJ_DESTRUCT(&mca_pml_bfo.recv_pending); + OBJ_DESTRUCT(&mca_pml_bfo.send_pending); + OBJ_DESTRUCT(&mca_pml_bfo.non_existing_communicator_pending); + OBJ_DESTRUCT(&mca_pml_bfo.buffers); + OBJ_DESTRUCT(&mca_pml_bfo.pending_pckts); + OBJ_DESTRUCT(&mca_pml_bfo.recv_frags); + OBJ_DESTRUCT(&mca_pml_bfo.rdma_frags); + OBJ_DESTRUCT(&mca_pml_bfo.lock); + + if(OMPI_SUCCESS != (rc = mca_pml_bfo.allocator->alc_finalize(mca_pml_bfo.allocator))) { + return rc; + } + +#if 0 + if (mca_pml_base_send_requests.fl_num_allocated != + mca_pml_base_send_requests.super.opal_list_length) { + opal_output(0, "bfo send requests: %d allocated %d returned\n", + mca_pml_base_send_requests.fl_num_allocated, + mca_pml_base_send_requests.super.opal_list_length); + } + if (mca_pml_base_recv_requests.fl_num_allocated != + mca_pml_base_recv_requests.super.opal_list_length) { + opal_output(0, "bfo recv requests: %d allocated %d returned\n", + mca_pml_base_recv_requests.fl_num_allocated, + mca_pml_base_recv_requests.super.opal_list_length); + } +#endif + + return OMPI_SUCCESS; +} + +void *mca_pml_bfo_seg_alloc( struct mca_mpool_base_module_t* mpool, + size_t* size, + mca_mpool_base_registration_t** registration) { + return malloc(*size); +} + +void mca_pml_bfo_seg_free( struct mca_mpool_base_module_t* mpool, + void* segment ) { + free(segment); +} diff --git a/ompi/mca/pml/bfo/pml_bfo_component.h b/ompi/mca/pml/bfo/pml_bfo_component.h new file mode 100644 index 0000000000..2fd08d018e --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_component.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_BFO_COMPONENT_H +#define MCA_PML_BFO_COMPONENT_H + +BEGIN_C_DECLS + +/* + * PML module functions. + */ +OMPI_MODULE_DECLSPEC extern mca_pml_base_component_2_0_0_t mca_pml_bfo_component; + +END_C_DECLS + +#endif diff --git a/ompi/mca/pml/bfo/pml_bfo_failover.c b/ompi/mca/pml/bfo/pml_bfo_failover.c new file mode 100644 index 0000000000..d0fbc94362 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_failover.c @@ -0,0 +1,1883 @@ +/* + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * Functions that implement failover capabilities. To utilize the + * failover feature, one needs to configure the library with + * --enable-openib-failover. Then the system that is being used + * must have two or more openib BTLs in use. When an error occurs, + * the BTL will call into this PML to map out the offending BTL and + * continue using the one that is still working. + * Most of the differences between the ob1 PML and the bfo PML are + * contained in this file. 
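+ * (Illustrative usage note added in review: such a configuration would
+ * typically be exercised with something like
+ *     mpirun --mca pml bfo --mca btl openib,self,sm ./app
+ * on a machine with two or more active openib ports, together with the
+ * pml_bfo_fast_failover MCA parameter registered in pml_bfo_component.c.)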
+ */
+
+#include "ompi_config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "opal/class/opal_bitmap.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/btl/base/base.h"
+#include "pml_bfo.h"
+#include "pml_bfo_component.h"
+#include "pml_bfo_comm.h"
+#include "pml_bfo_hdr.h"
+#include "pml_bfo_recvfrag.h"
+#include "pml_bfo_sendreq.h"
+#include "pml_bfo_recvreq.h"
+#include "pml_bfo_rdmafrag.h"
+#include "pml_bfo_failover.h"
+#include "ompi/mca/bml/base/base.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/util/show_help.h"
+#include "orte/mca/notifier/notifier.h"
+
+#include "ompi/runtime/ompi_cr.h"
+
+static void mca_pml_bfo_error_pending_packets(mca_btl_base_module_t* btl,
+                                              mca_bml_base_endpoint_t* ep);
+
+/**
+ * When running with failover enabled, check the PML sequence numbers
+ * to see if we have received a duplicate message. This check is done
+ * for all MATCH fragments. It is also done for RNDV and RGET
+ * fragments that do not have the MCA_PML_BFO_HDR_FLAGS_RESTART flag
+ * set.
+ * We set the window size to half the total range of sequence numbers.
+ * We only enter this code when the seq_num is not the expected one.
+ * A few more notes on the algorithm used here. In normal operation,
+ * the expected value will either be equal to or less than the
+ * sequence number of the header. This is because we are using this
+ * sequence number to detect packets arriving prior to them being
+ * expected. If we determine that expected is less than the header,
+ * then make sure this is not a rollover case. We do that by adding
+ * maxnum to the expected value.
+ * @param proc Pointer to proc from where message came
+ * @param hdr Pointer to header of message
+ */
+bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t* proc,
+                                  mca_pml_bfo_match_hdr_t *hdr)
+{
+    const int window = 32768;
+    const int maxnum = 65536;
+    mca_pml_bfo_recv_frag_t *frag;
+
+#if 0
+    opal_output(0, "checking dup, exp=%d, act=%d, type=%d, cant_match=%d\n",
+                (uint16_t)proc->expected_sequence,
+                hdr->hdr_seq, hdr->hdr_common.hdr_type,
+                opal_list_get_size(&proc->frags_cant_match));
+#endif
+
+    /* There are a few cases near the end of the range where expected may
+     * equal 65535 and an out-of-order fragment shows up with a sequence
+     * number like 1. */
+    if (OPAL_UNLIKELY((uint16_t)proc->expected_sequence > hdr->hdr_seq)) {
+        if (((uint16_t)proc->expected_sequence - hdr->hdr_seq) < window) {
+            opal_output_verbose(20, mca_pml_bfo_output,
+                                "%s:%d: frag duplicated, exp=%d, act=%d, type=%d\n",
+                                __FILE__, __LINE__, (uint16_t)proc->expected_sequence,
+                                hdr->hdr_seq, hdr->hdr_common.hdr_type);
+            return true;
+        }
+    } else {
+        /* This is the normal flow through this code. We also need to
+         * use maxnum to ensure that we handle cases where the
+         * expected number has rolled over but then a duplicate message
+         * shows up that is greater than it. */
+        if ((((uint16_t)proc->expected_sequence + maxnum) - hdr->hdr_seq) < window) {
+            opal_output_verbose(20, mca_pml_bfo_output,
+                                "%s:%d: frag duplicated, exp=%d, act=%d, type=%d\n",
+                                __FILE__, __LINE__, (uint16_t)proc->expected_sequence,
+                                hdr->hdr_seq, hdr->hdr_common.hdr_type);
+            return true;
+        }
+    }
+
+    /* Need to explicitly check against any out-of-order fragments. Unfortunately, we
+     * always have to do this since we can get a duplicate out-of-order fragment.
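+     * (Worked example added in review, with illustrative numbers: after a
+     * rollover the expected value may be 3; a duplicate with hdr_seq=65500
+     * gives (3 + 65536) - 65500 = 39 < 32768 and is dropped above, while a
+     * merely early fragment with hdr_seq=10 gives (3 + 65536) - 10 = 65529,
+     * which is outside the window and so is kept for later matching.)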
+/**
+ * This function checks to see if we have received a duplicate FIN
+ * message.  This is done by first pulling the pointer of the request
+ * that the FIN message is pointing to from the message.  We then
+ * check the various fields in the request against the fields in the
+ * header and make sure they match.  If they do not, then the request
+ * must have been recycled already and this is a duplicate FIN message.
+ * We have to do this check on every FIN message that we receive.
+ */
+bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descriptor_t* rdma,
+                                  mca_btl_base_module_t* btl)
+{
+    mca_pml_base_request_t* basereq;
+    /* When running with failover enabled, need to ensure that this
+     * is not a duplicate FIN message. */
+    if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) {
+        /* The first check is to make sure the descriptor is pointing
+         * to a valid request.  The descriptor may be pointing to NULL
+         * if it was freed and not reused yet. */
+        if (NULL == rdma->des_cbdata) {
+            opal_output_verbose(20, mca_pml_bfo_output,
+                                "FIN: received: dropping because not pointing to valid descriptor "
+                                "PML=%d CTX=%d SRC=%d RQS=%d",
+                                hdr->hdr_match.hdr_seq, hdr->hdr_match.hdr_ctx,
+                                hdr->hdr_match.hdr_src, hdr->hdr_fin.hdr_restartseq);
+            return true;
+        }
+
+        basereq = (mca_pml_base_request_t*)rdma->des_cbdata;
+        /* Now we know the descriptor is pointing to a non-null request.
+         * Does it match what we expect?  To make sure the receive
+         * request matches the FIN message, check the context number,
+         * source of the message, and MPI sequence number.  Then make
+         * sure that it also matches the internal sequencing number of
+         * the requests.  We need to look at the type of request we are
+         * pointing at to figure out what fields to access.
*/ + if (basereq->req_type == MCA_PML_REQUEST_RECV) { + mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)basereq; + if ((hdr->hdr_match.hdr_ctx != recvreq->req_recv.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || + (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received on receiver: dropping because no match " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, + recvreq->req_recv.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_match.hdr_src, + recvreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)recvreq); + return true; + } + if (hdr->hdr_fin.hdr_restartseq != recvreq->req_restartseq) { + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received on receiver: dropping because old " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, + recvreq->req_recv.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_match.hdr_src, + recvreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)recvreq); + return true; + } + } else if (basereq->req_type == MCA_PML_REQUEST_SEND) { + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)basereq; + if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) || + (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { + uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received on sender: dropping because no match " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + seq, hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)sendreq); + return true; + } + if (hdr->hdr_fin.hdr_restartseq != sendreq->req_restartseq) { + uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received on sender: dropping because old " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + seq, hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)sendreq); + return true; + } + } else { + /* We can get here if the descriptor has been reused, but + * not as an RDMA descriptor. In that case, the callback + * function has been set to something else. Clearly the + * descriptor we are interested is gone, so just drop the + * FIN message. 
*/ + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received: dropping because descriptor has been reused " + "PML=%d CTX=%d SRC=%d RQS=%d rdma->des_flags=%d", + hdr->hdr_match.hdr_seq, hdr->hdr_match.hdr_ctx, + hdr->hdr_match.hdr_src, hdr->hdr_fin.hdr_restartseq, rdma->des_flags); + return true; + } + } + return false; +} + +/** + * Repost a FIN message if we get an error on the completion event. + */ +void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des) { + /* In the error case, we will repost the FIN message. I had + * considered restarting the request. The problem is that the + * request may be already complete when we detect that a FIN + * message got an error on its completion event. For example, with + * the PUT protocol, if the RDMA writes succeed and all the data + * has been sent, then the request is marked as complete and can be + * freed. Therefore, an error on the FIN message has no request to + * refer back to. So, we will just repost it. However, we are also + * faced with the case where the FIN message has an error but it + * actually makes it to the other side. In that case we are now + * sending a FIN message to a non-existent request on the receiver + * side. To handle that, we have added the match information to + * the FIN message. That way, we can check on the receiving side + * to ensure that it is pointing to a valid request. */ + mca_pml_bfo_fin_hdr_t* hdr; + mca_bml_base_endpoint_t* bml_endpoint; + ompi_proc_t *proc; + mca_bml_base_btl_t* bml_btl; + + proc = (ompi_proc_t*) des->des_cbdata; + bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + hdr = (mca_pml_bfo_fin_hdr_t*)des->des_src->seg_addr.pval; + + opal_output_verbose(20, mca_pml_bfo_output, + "REPOST: BFO_HDR_TYPE_FIN: seq=%d,myrank=%d,peer=%d,hdr->hdr_fail=%d,src=%d", + hdr->hdr_match.hdr_seq, ORTE_PROC_MY_NAME->vpid, proc->proc_name.vpid, + hdr->hdr_fail, hdr->hdr_match.hdr_src); + + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); + + /* Reconstruct the fin for sending on the other BTL */ + mca_pml_bfo_send_fin(proc, bml_btl, + hdr->hdr_des, MCA_BTL_NO_ORDER, + hdr->hdr_fail, hdr->hdr_match.hdr_seq, hdr->hdr_restartseq, + hdr->hdr_match.hdr_ctx, hdr->hdr_match.hdr_src); + return; +} + +/** + * This function is called when a RNDV or RGET is received with the + * FLAGS_RESTART flag set. This means this message already has a + * receive request already associated with it. + */ +mca_pml_bfo_recv_request_t* mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr) { + mca_pml_bfo_recv_request_t *match = NULL; + mca_pml_bfo_rendezvous_hdr_t * rhdr = (mca_pml_bfo_rendezvous_hdr_t *) hdr; + match = (mca_pml_bfo_recv_request_t *) rhdr->hdr_dst_req.pval; + + /* Check to see if we have received a duplicate RNDV (or RGET). This can + * occur because we got an error when we reposted the RNDV. Therefore, + * we make sure that the request has not completed from underneath us + * and been recycled. Secondly, make sure we are not getting it a + * second time for the same request. 
*/ + if ((rhdr->hdr_match.hdr_ctx != match->req_recv.req_base.req_comm->c_contextid) || + (rhdr->hdr_match.hdr_src != match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || + (rhdr->hdr_match.hdr_seq != (uint16_t)match->req_msgseq) || + (rhdr->hdr_restartseq == match->req_restartseq)) { + if (hdr->hdr_common.hdr_type == MCA_PML_BFO_HDR_TYPE_RNDV) { + opal_output_verbose(20, mca_pml_bfo_output, + "RNDV: received with RESTART flag: duplicate, dropping " + "PML:exp=%d,act=%d RQS=%d, src_req=%p, dst_req=%p, peer=%d", + match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, + match->remote_req_send.pval, (void *)match, + match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } else { + opal_output_verbose(20, mca_pml_bfo_output, + "RGET: received with RESTART flag: duplicate, dropping " + "PML:exp=%d,act=%d RQS=%d, src_req=%p, dst_req=%p, peer=%d", + match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, + match->remote_req_send.pval, (void *)match, + match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } + return NULL; + } + + mca_pml_bfo_recv_request_reset(match); + if (hdr->hdr_common.hdr_type == MCA_PML_BFO_HDR_TYPE_RNDV) { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV: received with RESTART flag: restarting recv, " + "PML:exp=%d,act=%d RQS(new)=%d, src_req=%p, dst_req=%p, peer=%d", + match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, + match->remote_req_send.pval, (void *)match, + match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "RGET: received with RESTART flag: restarting recv, " + "PML:exp=%d,act=%d RQS(new)=%d, src_req=%p, dst_req=%p, peer=%d", + match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, + match->remote_req_send.pval, (void *)match, + match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } + return match; +} + +/** + * Callback for when a RNDVRESTARTNOTIFY message is received. A + * RNDVRESTARTNOTIFY message is sent from the sender to the receiver + * telling the receiver that the message is going to be started over. + * The receiver first makes sure that the request being pointed to is + * still valid. If it is not, that means the receiver must have + * completed the request and therefore we need to send a NACK back to + * the sender. The receiver then makes sure this is not a duplicate + * message. If it is a duplicate, it will just drop it. Otherwise, + * it will then send a RNDVRESTARTACK message if there are no + * outstanding events on the receiver. Otherwise, it will just change + * the state of the request and wait for another event to send the + * RNDVRESTARTACK to the sender. + */ +void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_recv_request_t* recvreq; + ompi_proc_t* ompi_proc; + orte_process_name_t orte_proc; + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY); + recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_restart.hdr_dst_req.pval; + + /* Check to see if the receive request is still valid. If the + * request is recycled, that means the original request must have + * completed and we therefore need to send a NACK back to the sender. 
+ * Note that when the request is gone, we need to pull some information + * off the header so that we can figure out where to send the NACK + * message back to. */ + if ((hdr->hdr_match.hdr_ctx != recvreq->req_recv.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || + (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { + orte_proc.jobid = hdr->hdr_restart.hdr_jobid; + orte_proc.vpid = hdr->hdr_restart.hdr_vpid; + ompi_proc = ompi_proc_find(&orte_proc); + opal_output_verbose(20, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: received: does not match request, sending NACK back " + "PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d " + "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, " + "hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s", + (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, + recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_match.hdr_src, recvreq->req_restartseq, + hdr->hdr_restart.hdr_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid, + ompi_proc->proc_hostname); + mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false); + return; + } + + /* We know that we have the correct receive request. Make sure this is not + * a duplicate RNDVRESTARTNOTIFY on this request. */ + if (hdr->hdr_restart.hdr_restartseq == recvreq->req_restartseq) { + opal_output_verbose(20, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: received duplicate: dropping RNDVRESTARTNOTIFY " + "message PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d " + "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d", + (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, + recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_match.hdr_src, recvreq->req_restartseq, + hdr->hdr_restart.hdr_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + return; + } + + /* Increment restart number. */ + recvreq->req_restartseq++; + recvreq->req_errstate |= RECVREQ_RNDVRESTART_RECVED; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: received: outstanding receive events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + + if (0 == recvreq->req_events) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, + OMPI_SUCCESS, btl); + } + + return; +} + +/** + * Callback for when a RNDVRESTARTACK message is received. This + * message is sent from the receiver to the sender to acknowledge + * the receipt of the RNDVRESTARTNOTIFY message. At this point, + * the sender can reset the send request and restart the message. 
+ */
+void mca_pml_bfo_recv_frag_callback_rndvrestartack(mca_btl_base_module_t* btl,
+                                                   mca_btl_base_tag_t tag,
+                                                   mca_btl_base_descriptor_t* des,
+                                                   void* cbdata ) {
+    mca_btl_base_segment_t* segments = des->des_dst;
+    mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
+    mca_pml_bfo_send_request_t* sendreq;
+
+    bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK);
+    sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval;
+
+    /* Check to see if we have received a duplicate message.  The
+     * first three comparisons make sure that we are not looking at a
+     * recycled request.  The last check makes sure we are not getting
+     * a duplicate message for this specific request.  All of this is
+     * needed because the receiver might get an error and repost the
+     * RNDVRESTARTACK message, even though the original RNDVRESTARTACK
+     * was actually received. */
+    if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
+        (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) ||
+        (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence) ||
+        (hdr->hdr_restart.hdr_restartseq != sendreq->req_restartseq)) {
+        opal_output_verbose(20, mca_pml_bfo_output,
+                            "RNDVRESTARTACK: received: does not match request, dropping "
+                            "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d EXP:exp=%d,act=%d "
+                            "src_req=%p, dst_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
+                            sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
+                            sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
+                            sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+        return;
+    }
+
+    sendreq->req_restart++;
+    if (2 == sendreq->req_restart) {
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RNDVRESTARTACK: received: restarting send "
+                            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
+                            hdr->hdr_match.hdr_seq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+        mca_pml_bfo_send_request_restart(sendreq, false, 0);
+    } else {
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RNDVRESTARTACK: received: waiting for RNDVRESTARTNOTIFY completion "
+                            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
+                            hdr->hdr_match.hdr_seq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+    }
+    return;
+}
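+
+/*
+ * Illustrative sketch only: the restart handshake counts two events
+ * before the send request is reset -- the local completion of the
+ * RNDVRESTARTNOTIFY send and the arrival of the RNDVRESTARTACK --
+ * in whichever order they occur.  The helper name is hypothetical;
+ * the field follows the send request structure used in this file.
+ */
+static inline bool pml_bfo_restart_handshake_done_sketch(mca_pml_bfo_send_request_t* sendreq)
+{
+    /* req_restart is incremented once by
+     * mca_pml_bfo_rndvrestartnotify_completion() and once by
+     * mca_pml_bfo_recv_frag_callback_rndvrestartack(). */
+    return (2 == sendreq->req_restart);
+}
+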
+/**
+ * Callback for when a RECVERRNOTIFY message is received.  This message
+ * is sent from the receiver to the sender and tells the sender that
+ * the receiver has seen an error.  This will trigger the sender
+ * to start the request restart sequence.
+ */
+void mca_pml_bfo_recv_frag_callback_recverrnotify(mca_btl_base_module_t* btl,
+                                                  mca_btl_base_tag_t tag,
+                                                  mca_btl_base_descriptor_t* des,
+                                                  void* cbdata ) {
+    mca_btl_base_segment_t* segments = des->des_dst;
+    mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
+    mca_pml_bfo_send_request_t* sendreq;
+
+    bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY);
+    sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval;
+
+    /* First make sure that this message is pointing to a valid request.
+     * This can be determined if the communicator context, the source of
+     * the message, and the MPI sequence number all match. */
+    if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
+        (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) ||
+        (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) {
+        opal_output_verbose(20, mca_pml_bfo_output,
+                            "RECVERRNOTIFY: received: does not match request, dropping "
+                            "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d RQS:exp=%d,act=%d "
+                            "src_req=%p, dst_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
+                            sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
+                            sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
+                            sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+        return;
+    }
+
+    /* If a good ACK was never received, then the first ACK received
+     * might be a RECVERRNOTIFY message.  In that case, the sendreq does
+     * not have a valid req_recv pointer in it.  Therefore, check for
+     * that case and update the field in the sendreq if necessary. */
+    if (NULL == sendreq->req_recv.pval) {
+        sendreq->req_recv = hdr->hdr_restart.hdr_dst_req;
+    }
+
+    /* Now check to see if a restart needs to be issued.  The request
+     * sequence number in the header is compared against the current
+     * request sequence number in the send request.  If the header
+     * sequence number is greater than or equal to the send request
+     * number, then a rndvrestartnotify is issued.  There are some cases
+     * where a few extra rndvrestartnotifys are issued.  That is OK as
+     * it will all work itself out.  The idea is to prevent many
+     * unnecessary restarts while still allowing multiple restarts to
+     * happen.  It could be that sometime later another error occurs
+     * which initiates a restart.  That is OK as it will have the new
+     * sequence number and all is well. */
+    if (hdr->hdr_restart.hdr_restartseq >= sendreq->req_restartseq) {
+        assert(sendreq->req_send.req_base.req_ompi.req_state == OMPI_REQUEST_ACTIVE);
+        sendreq->req_error++;
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RECVERRNOTIFY: received: sendreq has error, outstanding events=%d, "
+                            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
+                            sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
+                            sendreq->req_restartseq, (void *)sendreq,
+                            sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+
+        if (0 == sendreq->req_events) {
+            mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
+                                                       MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
+                                                       OMPI_SUCCESS, btl);
+        }
+    } else {
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RECVERRNOTIFY: received: error has already been noted, ignoring "
+                            "PML:exp=%d,act=%d RQS:exp=%d,act=%d src_req=%p, dst_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
+                            sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+    }
+    return;
+}
+
+/**
+ * Callback for when a RNDVRESTARTNACK message is received.  This message
+ * is sent from the receiver to the sender and tells the sender that
+ * the receiver has already completed the message and there is nothing
+ * else to be done.  The sender should then just make the send request
+ * complete.
+ */ +void mca_pml_bfo_recv_frag_callback_rndvrestartnack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_send_request_t* sendreq; + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK); + sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval; + + /* Not convinced a RNDVRESTARTNACK that does not match a request can + * happen, but have the check in here anyways for now */ + if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) || + (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence) || + (hdr->hdr_restart.hdr_restartseq != sendreq->req_restartseq)) { + opal_output_verbose(20, mca_pml_bfo_output, + "RNDVRESTARTNACK: received: does not match request, dropping " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d EXP:exp=%d,act=%d " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + return; + } + + opal_output_verbose(20, mca_pml_bfo_output, + "RNDVRESTARTNACK: received: marking send request as complete " + "PML=%d CTX=%d SRC=%d EXP=%d " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_peer, sendreq->req_restartseq, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + mca_pml_bfo_send_request_rndvrestartnack(sendreq); + return; +} + + +/** + * This function gets called when failover is enabled and an error + * occurs during the rendezvous protocol. A message is sent to the + * receiving side notifying the request that the communication is + * going to be starting over. However, none of the information in the + * send request is reset yet, so that any in flight fragments can + * still find a home. Information in the send request gets reset when + * the completion event for this send occurs AND an ACK has been + * received back from the receiver. + */ +void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* sendreq, + bool repost, mca_btl_base_tag_t tag, + int status, mca_btl_base_module_t* btl) +{ + mca_btl_base_descriptor_t* des; + mca_pml_bfo_restart_hdr_t* restart; + int rc; + mca_bml_base_btl_t* bml_btl; + ompi_proc_t* proc = (ompi_proc_t*)sendreq->req_send.req_base.req_proc; + mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + + /* If this message is not a repost, then update the sequence number. */ + if (!repost) { + /* Bump up the rendezvous request sequence number. */ + sendreq->req_restartseq++; + } + + assert(0 == sendreq->req_events); + assert(0 != bml_endpoint->btl_eager.arr_size); + + /* In the case that this is started because the receiver has + * sent us a message, then attempt to use a different BTL than the + * error message was received on. This may potentially tickle the + * error sooner if this side has not seen it yet. 
+ */
+    bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
+    if (bml_btl->btl == btl) {
+        /* If there is more than one BTL left, then we will get a
+         * different one.  If there is only one, we will just get
+         * the same one back again.  That is OK. */
+        bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
+    }
+
+    /* allocate descriptor */
+    mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
+                       sizeof(mca_pml_bfo_restart_hdr_t),
+                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
+                       MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
+    if( OPAL_UNLIKELY(NULL == des) ) {
+        opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* fill out header */
+    restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval;
+    restart->hdr_match.hdr_common.hdr_flags = 0;
+    restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY;
+    restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+    restart->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+    restart->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    restart->hdr_restartseq = sendreq->req_restartseq;
+    restart->hdr_src_req.pval = sendreq;
+    restart->hdr_dst_req = sendreq->req_recv;
+    restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
+    restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid;
+    restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid;
+
+    bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);
+
+    /* initialize descriptor */
+    des->des_cbfunc = mca_pml_bfo_rndvrestartnotify_completion;
+
+    opal_output_verbose(30, mca_pml_bfo_output,
+                        "RNDVRESTARTNOTIFY: sent: PML=%d, RQS(new)=%d, CTX=%d, SRC=%d, "
+                        "src_req=%p, dst_req=%p, peer=%d",
+                        (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
+                        restart->hdr_match.hdr_ctx, restart->hdr_match.hdr_src,
+                        (void *)sendreq, sendreq->req_recv.pval,
+                        sendreq->req_send.req_base.req_peer);
+
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY);
+    if( OPAL_UNLIKELY( rc < 0 ) ) {
+        opal_output(0, "[%s:%d] Cannot send rndvrestartnotify message", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+}
+
+/**
+ * This function is called when a RNDVRESTARTNACK message is received
+ * by the sender.
+ */
+void mca_pml_bfo_send_request_rndvrestartnack(mca_pml_bfo_send_request_t* sendreq)
+{
+    /* A RNDVRESTARTNACK was sent by the receiver.  This means that the
+     * receiver is rejecting the RNDVRESTARTNOTIFY message because the
+     * receiver's request is complete.  Therefore, mark the sender
+     * complete also.  This data exchange is over. */
+    send_request_pml_complete(sendreq);
+}
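+
+/*
+ * Illustrative sketch only: the restart control messages in this file
+ * (RNDVRESTARTNOTIFY, RNDVRESTARTACK, RNDVRESTARTNACK, RECVERRNOTIFY)
+ * all reuse mca_pml_bfo_restart_hdr_t.  This hypothetical helper shows
+ * the common sender-side fill pattern; the real code above also sets
+ * hdr_dst_rank, hdr_jobid, and hdr_vpid, which are needed to route a
+ * NACK when the receiver no longer has a matching request.
+ */
+static void pml_bfo_fill_restart_hdr_sketch(mca_pml_bfo_restart_hdr_t* restart,
+                                            uint8_t type,
+                                            mca_pml_bfo_send_request_t* sendreq)
+{
+    restart->hdr_match.hdr_common.hdr_flags = 0;
+    restart->hdr_match.hdr_common.hdr_type = type;
+    restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+    restart->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+    restart->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    restart->hdr_restartseq = sendreq->req_restartseq;
+    restart->hdr_src_req.pval = sendreq;
+    restart->hdr_dst_req = sendreq->req_recv;
+}
+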
+/**
+ * This function restarts a RNDV send request.  When this is called,
+ * all the fields in the send request are reset and the send is
+ * started over.  The sendreq->req_restartseq will be non-zero, which
+ * triggers a special flag in the RNDV header indicating that the match
+ * has already happened on the receiving side.
+ */
+void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq,
+                                      bool repost, mca_btl_base_tag_t tag)
+{
+    size_t offset = 0;
+    opal_list_item_t *first_item;
+    opal_list_item_t *last_item;
+    mca_bml_base_endpoint_t* endpoint;
+    size_t i;
+
+    /* If the tag is something valid, it was a repost.  We could also
+     * check the repost field.  Maybe the repost flag can be dropped
+     * and the tag can double for it. */
+    switch (tag) {
+    case MCA_PML_BFO_HDR_TYPE_RNDV:
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RNDV: completion failed, reset and repost: PML=%d, RQS=%d, "
+                            "CTX=%d, SRC=%d, src_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
+                            sendreq->req_send.req_base.req_comm->c_contextid,
+                            sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq,
+                            sendreq->req_send.req_base.req_peer);
+        break;
+    case MCA_PML_BFO_HDR_TYPE_RGET:
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RGET: completion failed, reset and repost: PML=%d, RQS=%d, "
+                            "CTX=%d, SRC=%d, src_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
+                            sendreq->req_send.req_base.req_comm->c_contextid,
+                            sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq,
+                            sendreq->req_send.req_base.req_peer);
+        break;
+    default:
+        break;
+    }
+
+    /* Return mpool resources; they get reacquired when the request starts over. */
+    mca_pml_bfo_free_rdma_resources(sendreq);
+
+    /* Release any memory in use if this is a buffered send */
+    if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
+        sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
+        mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
+    }
+
+    /* Clear out any unsent send ranges.  This recreates what the
+     * get_send_range and get_next_send_range functions do.  The list
+     * is drained from the back with opal_list_get_last();
+     * opal_list_get_begin() returns the sentinel, so the loop ends
+     * once the list is empty. */
+    OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
+    first_item = opal_list_get_begin(&sendreq->req_send_ranges);
+    last_item = opal_list_get_last(&sendreq->req_send_ranges);
+    while (first_item != last_item) {
+        opal_list_remove_item(&sendreq->req_send_ranges, last_item);
+        OMPI_FREE_LIST_RETURN(&mca_pml_bfo.send_ranges, (ompi_free_list_item_t *)last_item);
+        last_item = opal_list_get_last(&sendreq->req_send_ranges);
+    }
+    OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
+
+    /* Reset the converter to the beginning. */
+    opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
+                                &offset);
+
+    /* Bump up the internal sequence number to handle possible duplicate
+     * RNDV messages.  In the case of reposting a RNDV message, do not
+     * increment the value.  That way, a duplicate message can be
+     * detected. */
+    if (!repost) {
+        sendreq->req_restartseq++;
+    }
+
+    /* This code is essentially the same as mca_pml_bfo_send_request_start(),
+     * but with a few modifications since we are restarting the request,
+     * not starting entirely from scratch. */
+    endpoint = (mca_bml_base_endpoint_t*)sendreq->req_send.req_base.req_proc->proc_bml;
+    sendreq->req_endpoint = endpoint;
+    sendreq->req_state = 0;
+    sendreq->req_lock = 0;
+    sendreq->req_pipeline_depth = 0;
+    sendreq->req_bytes_delivered = 0;
+    sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
+
+    /* Note that we do not reset the following two items.
+     * They stay with their original values.
+     * sendreq->req_send.req_base.req_sequence
+     * sendreq->req_restartseq
+     */
+    sendreq->req_restart = 0;      /* reset in case we restart again */
+    sendreq->req_error = 0;        /* clear error state */
+    sendreq->req_events = 0;       /* clear events, probably 0 anyways */
+    sendreq->req_acked = false;
+
+    MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
+
+    for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
+        mca_bml_base_btl_t* bml_btl;
+        int rc;
+
+        /* select a btl */
+        bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
+        rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
+        if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
+            return;
+    }
+    add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
+}
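+
+/*
+ * Illustrative sketch only: how the restart paths above and below keep
+ * the restart sequence number.  A genuine restart bumps req_restartseq
+ * so stale fragments can be recognized; a repost of the same restart
+ * leaves it unchanged so the receiver can detect the duplicate.  The
+ * helper name is hypothetical and is not used elsewhere.
+ */
+static inline void pml_bfo_note_restart_sketch(mca_pml_bfo_send_request_t* sendreq,
+                                               bool repost)
+{
+    if (!repost) {
+        sendreq->req_restartseq++;    /* new restart attempt */
+    }                                 /* repost: keep the number */
+}
+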
+/**
+ * This function will repost a match fragment.  It has to
+ * handle the case where there may not be a request associated with
+ * the fragment, in which case it just uses the information in the
+ * fragment to repost the send.
+ */
+void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
+{
+    mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+    struct mca_bml_base_endpoint_t* endpoint;
+    int rc;
+    size_t offset = 0;
+
+    /* At this point a determination has to be made whether the
+     * BFO_HDR_TYPE_MATCH fragment was sent via the sendi interface or
+     * via the regular send interface.  This is important because if it
+     * was sent via the sendi interface, then the request associated
+     * with it has already been completed and released.  This can be
+     * determined by looking at the des->des_flags field of the
+     * descriptor.  If the ALWAYS_CALLBACK flag is set, then it is known
+     * that there is a valid send request associated with the fragment
+     * and it can be used to extract information.  If ALWAYS_CALLBACK
+     * is not set, then the endpoint information is in the callback
+     * data field, and where to resend the fragment can be determined
+     * from the fragment.
*/ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + endpoint = sendreq->req_endpoint; + opal_output_verbose(30, mca_pml_bfo_output, + "MATCH: repost: src_req=%p", + (void *)sendreq); + } else { + endpoint = des->des_cbdata; + opal_output_verbose(30, mca_pml_bfo_output, + "MATCH: repost: des=%p (sendi fragment)", + (void *)des); + } + + assert(0 != endpoint->btl_eager.arr_size); + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + /* Reset the converter to the beginning */ + opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, + &offset); + rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); + if (OMPI_SUCCESS == rc) { + return; + } else if (OMPI_ERR_OUT_OF_RESOURCE == rc) { + opal_output_verbose(30, mca_pml_bfo_output, + "Warning: delaying reposting of BFO_HDR_TYPE_MATCH, btls=%d", + (int)sendreq->req_endpoint->btl_eager.arr_size); + add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true); + return; + } else { + opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH", + __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + } else { + /* No send request available so alloc and repost explicitly */ + mca_btl_base_descriptor_t* newdes = NULL; + mca_btl_base_segment_t* oldseg; + mca_btl_base_segment_t* newseg; + + oldseg = des->des_src; + /* The alloc routine must be called with the MCA_BTL_NO_ORDER + * flag so that the allocation routine works. The allocation + * will fill in the order flag in the descriptor. */ + mca_bml_base_alloc( bml_btl, &newdes, + MCA_BTL_NO_ORDER, + oldseg->seg_len, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if (OPAL_UNLIKELY(NULL == newdes)) { + opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH", + __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + newseg = newdes->des_src; + /* Copy over all the data that is actually sent over the wire */ + memcpy(newseg->seg_addr.pval, oldseg->seg_addr.pval, oldseg->seg_len); + newseg->seg_len = oldseg->seg_len; + + /* This call will either return OMPI_SUCCESS or OMPI_ERROR. The + * OMPI_SUCCESS only says that the send request can be freed. + * It may be that the message was queued up in the BTL. */ + rc = mca_bml_base_send(bml_btl, newdes, MCA_PML_BFO_HDR_TYPE_MATCH); + + /* Some BTLs will set the CALLBACK flag but we do not want that + * as there is no longer a request associated with this descriptor. + * Therefore, always make sure it is cleared. */ + newdes->des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + + if( OPAL_LIKELY( rc >= 0 )) { + /* Just let the normal flow of data free whatever needs + * to be freed */ + return; + } else { + opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH", + __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + } + /* No need to free any descriptors. The BTLs take care of it since + * we originally allocated with MCA_BTL_DES_FLAGS_BTL_OWNERSHIP. */ +} + +/** + * Completion callback for rndvrestartnotify completion event. If the + * RNDVRESTARTACK has already been received, then reset and restart. + * Otherwise, just update the state and let the RNDVRESTARTACK trigger + * the reset and restart. 
+ */ +void +mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status) +{ + mca_pml_bfo_restart_hdr_t* restart; + mca_pml_bfo_send_request_t* sendreq; + + restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; + sendreq = (mca_pml_bfo_send_request_t*) restart->hdr_src_req.pval; + + /* Need to resend this message in the case that it fails */ + if( OPAL_UNLIKELY((OMPI_SUCCESS != status))) { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: completion failed: repost " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + /* Repost the message and indicate it is a repost, not a new one. No need + * to check the req_events as this is the only possible outstanding send + * event when we have posted this message. We also know the sendreq is still + * available because nothing can proceed until this completion event happens + * successfully as we track the req_restart value. */ + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, true, + MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, + status, btl); + return; + } + + /* The req_restart value is incremented to indicate completion of + * the RNDVRESTARTNOTIFY message. Then (typically) the arrival of the + * ACK message will cause the request to reset and restart. Need to + * make sure that RNDVRESTARTNOTIFY callback has been called as well as + * the ACK back from the receiver prior to resetting and restarting + * the request. This is needed in case we get an error on the + * RNDVRESTARTNOTIFY message, but it actually makes it over. We want + * to make sure the send request has not restarted yet. So, keep a + * counter that counts to 2. */ + sendreq->req_restart++; + if (2 == sendreq->req_restart) { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: completion: restarting request " + "PML=%d, RQS=%d, CTX=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_recv.pval, (void *)sendreq, + sendreq->req_send.req_base.req_peer); + mca_pml_bfo_send_request_restart(sendreq, false, 0); + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: completion: waiting for ack " + "PML=%d, RQS=%d, CTX=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_recv.pval, (void *)sendreq, + sendreq->req_send.req_base.req_peer); + } +} + +/** + * This function is called when an error is detected on a completion + * event on the receiving side. This can come from a ACK, PUT, RDMA + * read (GET) or RECVERRNOTIFY completion event. When this happens, check + * the state of the request and decide if the sender needs be notified + * that a problem was seen. If no RECVERRNOTIFY message has been sent and + * no RNDVRESTARTNOTIFY has been received from the sender, then send a + * message telling the sender an error was seen. 
+ */ +void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_tag_t tag, int status) +{ + mca_btl_base_descriptor_t* des; + mca_pml_bfo_restart_hdr_t* restart; + ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; + mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + mca_bml_base_btl_t* bml_btl; + int rc; + + assert(0 != bml_endpoint->btl_eager.arr_size); + + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_restart_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + if( OPAL_UNLIKELY(NULL == des) ) { + opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + + /* fill out header */ + restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; + restart->hdr_match.hdr_common.hdr_flags = 0; + restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY; + restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; + restart->hdr_match.hdr_src = recvreq->req_recv.req_base.req_comm->c_my_rank; + restart->hdr_match.hdr_seq = (uint16_t)recvreq->req_msgseq; + restart->hdr_restartseq = recvreq->req_restartseq; + restart->hdr_src_req = recvreq->remote_req_send; + restart->hdr_dst_req.pval = recvreq; + + bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY, proc); + + /* initialize descriptor */ + des->des_cbfunc = mca_pml_bfo_recv_restart_completion; + + opal_output_verbose(30, mca_pml_bfo_output, + "RECVERRNOTIFY: sending to sender, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d, btl=%p", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, + (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + (void *)bml_btl->btl); + + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY); + if( OPAL_UNLIKELY( rc < 0 ) ) { + opal_output(0, "[%s:%d] Cannot send recverrnotify message", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + /* Prevent future error messages on this request */ + recvreq->req_errstate |= RECVREQ_RECVERRSENT; +} + +/** + * This function is called when it may be time to send a RNDVRESTARTACK + * message back to the sending side. This can happen because we + * received a RNDVRESTARTNOTIFY message from the sender. This can + * also happen if we have noticed that the request has received the + * RNDVRESTARTNOTIFY message, but has not yet sent out the RNDVRESTARTACK + * because there were still some pending receive events on the request. + * That means we can enter this routine from a completion event on a ACK, + * PUT, or RDMA read as well as from the receipt of a RNDVRESTARTNOTIFY + * message. If all is good, we sent the RNDVRESTARTACK message back to + * the sender. Then sometime later a message will arrive telling us + * to reset and restart the receive request. 
+ */ +void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_tag_t tag, int status, + mca_btl_base_module_t* btl) +{ + mca_btl_base_descriptor_t* des; + mca_pml_bfo_restart_hdr_t* restart; + ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; + mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + mca_bml_base_btl_t* bml_btl; + int rc; + + assert((recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) == RECVREQ_RNDVRESTART_RECVED); + assert((recvreq->req_errstate & RECVREQ_RNDVRESTART_ACKED) == 0); + assert(0 != bml_endpoint->btl_eager.arr_size); + + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); + + /* Attempt to use a different BTL than the error message was + * received on. This may potentially tickle the error sooner if + * this side has not seen it yet. */ + if (bml_btl->btl == btl) { + /* If there is more than one BTL left, then we will get a + * different one. If there is only one, we will just get + * the same one back again. That is OK. */ + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); + } + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_restart_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + if( OPAL_UNLIKELY(NULL == des) ) { + opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + + /* fill out header */ + restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; + restart->hdr_match.hdr_common.hdr_flags = 0; + restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK; + restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; + restart->hdr_match.hdr_src = recvreq->req_recv.req_base.req_comm->c_my_rank; + restart->hdr_match.hdr_seq = (uint16_t)recvreq->req_msgseq; + restart->hdr_restartseq = recvreq->req_restartseq; + restart->hdr_src_req = recvreq->remote_req_send; + restart->hdr_dst_req.pval = recvreq; + + bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK, proc); + + /* initialize descriptor */ + des->des_cbfunc = mca_pml_bfo_recv_restart_completion; + des->des_cbdata = (void *)proc; + + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTACK: due to PML tag=%d completion, sending to " + "sender, PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, " + "peer=%d, btl=%p", + tag, recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + (void *)bml_btl->btl); + + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK); + if( OPAL_UNLIKELY( rc < 0 ) ) { + opal_output(0, "[%s:%d] Cannot send rndvrestartack message", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + /* Move to the next state so we do not send anymore ACKs */ + recvreq->req_errstate &= ~RECVREQ_RNDVRESTART_RECVED; + recvreq->req_errstate |= RECVREQ_RNDVRESTART_ACKED; +} + +/** + * Called after the receipt of a RNDVRESTARTNOTIFY message to a request + * that no longer matches. This can happen if the sender detected an + * error, but the receiver actually received all the data. Therefore + * send a NACK back instead of the ACK so that the sender can complete + * its request. This happens very rarely. Note that we need to make + * use of the hdr_dst_rank that we received from the notify message. 
+ * This is so the sending side can make sure the message matches a valid
+ * request on the sending side.
+ */
+void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
+                                              ompi_proc_t* ompi_proc, bool repost)
+{
+    mca_btl_base_segment_t* segments;
+    mca_pml_bfo_restart_hdr_t* hdr;   /* hdr of NOTIFY message */
+    mca_pml_bfo_restart_hdr_t* nack;  /* hdr of NACK message */
+    mca_btl_base_descriptor_t* des;
+    mca_bml_base_endpoint_t* bml_endpoint;
+    mca_bml_base_btl_t* bml_btl;
+    int rc;
+
+    if (repost) {
+        /* In the case where we are reposting the NACK, the information
+         * is in the src area, since we are reposting a send.  In addition,
+         * we get the ompi_proc from the old descriptor. */
+        segments = olddes->des_src;
+        ompi_proc = olddes->des_cbdata;
+    } else {
+        segments = olddes->des_dst;
+    }
+    hdr = (mca_pml_bfo_restart_hdr_t*)segments->seg_addr.pval;
+
+    bml_endpoint = ompi_proc->proc_bml;
+    assert(0 != bml_endpoint->btl_eager.arr_size);
+    bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
+
+    /* allocate descriptor */
+    mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
+                       sizeof(mca_pml_bfo_restart_hdr_t),
+                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
+                       MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
+    if( OPAL_UNLIKELY(NULL == des) ) {
+        opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* fill out header */
+    nack = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval;
+    nack->hdr_match.hdr_common.hdr_flags = 0;
+    nack->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK;
+    nack->hdr_match.hdr_ctx = hdr->hdr_match.hdr_ctx;
+    nack->hdr_match.hdr_src = hdr->hdr_dst_rank;   /* Receiver rank */
+    nack->hdr_match.hdr_seq = hdr->hdr_match.hdr_seq;
+    nack->hdr_restartseq = hdr->hdr_restartseq;
+    nack->hdr_src_req = hdr->hdr_src_req;
+    nack->hdr_dst_req.pval = 0;
+
+    bfo_hdr_hton(nack, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK, ompi_proc);
+
+    /* Initialize descriptor.  Save away ompi_proc in case we need
+     * to repost this fragment. */
+    des->des_cbfunc = mca_pml_bfo_recv_restart_completion;
+    des->des_cbdata = ompi_proc;
+
+    opal_output_verbose(30, mca_pml_bfo_output,
+                        "RNDVRESTARTNACK: sending to sender, "
+                        "PML=%d, RQS=%d, CTX=%d, SRC=%d, peer=%d",
+                        nack->hdr_match.hdr_seq, nack->hdr_restartseq,
+                        nack->hdr_match.hdr_ctx, nack->hdr_match.hdr_src,
+                        ompi_proc->proc_name.vpid);
+
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK);
+    if( OPAL_UNLIKELY( rc < 0 ) ) {
+        opal_output(0, "[%s:%d] Cannot send rndvrestartnack message", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+
+
+/**
+ * Reset all the receive request fields to match what a request
+ * looks like when it is first started.  This gets called when
+ * the rendezvous/rget message is being restarted.
+ */
+void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t* match) {
+    int i;
+
+    assert(true != match->req_recv.req_base.req_pml_complete);
+
+    /* Free up any resources that were reserved for this receive.  This
+     * was copied from the receive completion code. */
+    for(i = 0; i < (int)match->req_rdma_cnt; i++) {
+        mca_mpool_base_registration_t* btl_reg = match->req_rdma[i].btl_reg;
+        if( NULL != btl_reg && btl_reg->mpool != NULL) {
+            btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
+        }
+    }
+    match->req_rdma_cnt = 0;
+
+    /* This code is mostly copied from mca_pml_bfo_recv_req_start.
+     * Note 1: Leave req_bytes_expected as the original value.
No + * need to adjust this as it is set when convertor is created. + * Note 2: Leave req_bytes_delivered as the original value. + * This is created when the convertor is created and represents + * the expected bytes from the user. */ + assert(0 == match->req_events); + match->req_errstate = 0; + match->req_lock = 0; + match->req_pipeline_depth = 0; + match->req_bytes_received = 0; + match->req_rdma_idx = 0; + match->req_rdma_offset = 0; + match->req_send_offset = 0; + match->req_pending = false; + match->req_ack_sent = false; + match->req_restartseq++; + + /* These really should not need to be set, but this matches some + * of the initialization within MCA_PML_BASE_RECV_START. */ + match->req_recv.req_base.req_pml_complete = false; + match->req_recv.req_base.req_ompi.req_complete = false; + match->req_recv.req_base.req_ompi.req_state = OMPI_REQUEST_ACTIVE; + + /* Reset the convertor */ + opal_convertor_set_position(&match->req_recv.req_base.req_convertor, + &match->req_rdma_offset); + return; +} + +/* + * Completion callback for RNDVRESTARTACK, RNDVRESTARTNACK and RECVERRNOTIFY. + */ +void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + mca_pml_bfo_common_hdr_t* common = des->des_src->seg_addr.pval; + mca_pml_bfo_restart_hdr_t* restart; /* RESTART header */ + mca_pml_bfo_recv_request_t* recvreq; + int peer; + + switch (common->hdr_type) { + case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK: + restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; + recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval; + peer = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTACK: completion failed: try again " + "PML:req=%d,hdr=%d RQS:req=%d,hdr=%d CTX:req=%d,hdr=%d " + "src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, restart->hdr_match.hdr_seq, + recvreq->req_restartseq, restart->hdr_restartseq, + recvreq->req_recv.req_base.req_comm->c_contextid, + restart->hdr_match.hdr_ctx, + recvreq->remote_req_send.pval, + (void *)recvreq, peer); + + /* Adjust the states back to avoid assert errors */ + recvreq->req_errstate &= ~RECVREQ_RNDVRESTART_ACKED; + recvreq->req_errstate |= RECVREQ_RNDVRESTART_RECVED; + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK, + status, btl); + break; + case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK: + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNACK: completion failed: try again " + "des=%p ", (void *)des); + /* Just blast it again. No request associated with it. */ + mca_pml_bfo_recv_request_rndvrestartnack(des, NULL, true); + break; + case MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY: + /* With just two BTLs, this should never happen as we are + * typically sending the RECVERRNOTIFY message on the + * working BTL. But, just in case, if we get an error, + * send it again. 
+             */
+            /* The descriptor still points back at the receive request
+             * that sent the RECVERRNOTIFY, so recover it the same way
+             * the RNDVRESTARTACK case does. */
+            restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval;
+            recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
+            opal_output_verbose(30, mca_pml_bfo_output,
+                                "RECVERRNOTIFY: completion failed: try again, "
+                                "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
+                                recvreq->req_msgseq, recvreq->req_restartseq,
+                                recvreq->remote_req_send.pval,
+                                (void *)recvreq,
+                                recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
+            mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
+                                                   status);
+            break;
+        default:
+            opal_output(0, "[%s:%d] Unknown callback error", __FILE__, __LINE__);
+            orte_errmgr.abort(-1, NULL);
+        }
+    }
+}
+
+/*
+ * Remove a btl from future communication on an endpoint.
+ */
+void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
+                             ompi_proc_t *errproc, char *btlname)
+{
+    mca_bml_base_endpoint_t* ep;
+    bool remove = false;
+    int i;
+
+    ep = (mca_bml_base_endpoint_t*)errproc->proc_bml;
+
+    /* The bml_del_proc_btl function does not indicate if it
+     * actually removed a btl, so let me check up front.  This is
+     * done so that we only print out messages when a btl is
+     * actually going to be removed.  These arrays are small, so it
+     * is OK to walk through all of them even though it may be
+     * redundant. */
+    for( i = 0; i < (int)ep->btl_eager.arr_size; i++ ) {
+        if( ep->btl_eager.bml_btls[i].btl == btl ) {
+            remove = true;
+        }
+    }
+    for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
+        if( ep->btl_send.bml_btls[i].btl == btl ) {
+            remove = true;
+        }
+    }
+    for( i = 0; i < (int)ep->btl_rdma.arr_size; i++ ) {
+        if( ep->btl_rdma.bml_btls[i].btl == btl ) {
+            remove = true;
+        }
+    }
+
+    if (true == remove) {
+        mca_bml.bml_del_proc_btl(errproc, btl);
+
+        orte_notifier.log(ORTE_NOTIFIER_ERROR, ORTE_ERR_COMM_FAILURE,
+                          "BTL %s error: rank=%d mapping out %s "
+                          "to rank=%d on node=%s",
+                          btl->btl_component->btl_version.mca_component_name,
+                          ORTE_PROC_MY_NAME->vpid,
+                          btlname, errproc->proc_name.vpid,
+                          errproc->proc_hostname);
+
+        opal_output_verbose(10, mca_pml_bfo_output,
+                            "BTL %s error: rank=%d mapping out %s "
+                            "to rank=%d on node=%s \n",
+                            btl->btl_component->btl_version.mca_component_name,
+                            ORTE_PROC_MY_NAME->vpid,
+                            btlname, errproc->proc_name.vpid,
+                            errproc->proc_hostname);
+
+        /* Need to search for any pending packets associated
+         * with this endpoint and remove them.  We may also
+         * have to restart requests depending on their state. */
+        mca_pml_bfo_error_pending_packets(btl, ep);
+
+        if ((ep->btl_eager.arr_size == 0) &&
+            (ep->btl_send.arr_size == 0) &&
+            (ep->btl_rdma.arr_size == 0)) {
+            opal_output(0, "%s:%d: No more interfaces, aborting",
+                        __FILE__, __LINE__);
+            orte_errmgr.abort(-1, NULL);
+        }
+    }
+}
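+
+/*
+ * A compact sketch of the membership test used in
+ * mca_pml_bfo_map_out_btl() above; purely illustrative and unused.
+ * A BTL is still in use by an endpoint if it appears in any of the
+ * eager, send, or rdma BML arrays.
+ */
+static bool pml_bfo_endpoint_uses_btl_sketch(mca_bml_base_endpoint_t* ep,
+                                             mca_btl_base_module_t* btl)
+{
+    size_t i;
+    for( i = 0; i < ep->btl_eager.arr_size; i++ ) {
+        if( ep->btl_eager.bml_btls[i].btl == btl ) return true;
+    }
+    for( i = 0; i < ep->btl_send.arr_size; i++ ) {
+        if( ep->btl_send.bml_btls[i].btl == btl ) return true;
+    }
+    for( i = 0; i < ep->btl_rdma.arr_size; i++ ) {
+        if( ep->btl_rdma.bml_btls[i].btl == btl ) return true;
+    }
+    return false;
+}
+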
+     */
+    if (NULL == errproc || mca_pml_bfo.fast_failover) {
+        for( p = 0; p < num_procs; p++ ) {
+            mca_pml_bfo_map_out_btl(btl, procs[p], btlname);
+        }
+    } else {
+        mca_pml_bfo_map_out_btl(btl, errproc, btlname);
+    }
+    free(procs);
+}
+
+/**
+ * This function is called when we are mapping out a BTL.  It walks
+ * through the four PML pending lists and dispatches the
+ * fragments/requests on them.  Each of the four lists is handled
+ * slightly differently.  In all cases, we first see if the message is
+ * associated with the endpoint that is being mapped out.  If not, we
+ * leave it alone and put it back on the list.  If it is associated
+ * with the endpoint, then each list handles it slightly differently.
+ * Also, in some cases, we actually adjust the BML pointers in the
+ * messages, as they may have changed when the BTL was mapped out.
+ * That is because this is called after we have mapped out the
+ * offending BTL and adjusted the array of available BMLs.
+ */
+static void mca_pml_bfo_error_pending_packets(mca_btl_base_module_t* btl,
+                                              mca_bml_base_endpoint_t* ep) {
+    int32_t i, s;
+
+    /* The pckt_pending list contains both ACK and FIN messages.
+     * ACKs can be sent over any BTL associated with the endpoint.
+     * Therefore, the bml_btl entry for ACKs is NULL and they do
+     * not need to be adjusted.  It is also worth noting that
+     * the ACK will be the only outstanding message associated
+     * with a request, so we can just let nature take its course.
+     *
+     * FIN messages do have a BML associated with them, but they
+     * can also be sent over any BTL.  Therefore, adjust the bml
+     * pointer in the pckt to ensure it points at a valid BML.
+     */
+
+    s = (int32_t)opal_list_get_size(&mca_pml_bfo.pckt_pending);
+    for(i = 0; i < s; i++) {
+        mca_pml_bfo_pckt_pending_t *pckt;
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "INFO: pckt_pending list has %d entries", s);
+#if 1
+        /* TODO: Error out until code is tested */
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "%s:%d: Support not implemented, aborting",
+                            __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+#endif
+        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+        pckt = (mca_pml_bfo_pckt_pending_t*)
+            opal_list_remove_first(&mca_pml_bfo.pckt_pending);
+        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+
+        /* My guess is that this can happen in the threaded
+         * case where the other thread removed some packets
+         * after we determined the size of the list. */
+        if(NULL == pckt)
+            break;
+
+        /* If there is no bml stored on the packet, then just
+         * put it back on the list as there is nothing to adjust.
+         * This appears to be true with ACK packets. */
+        if (NULL == pckt->bml_btl) {
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.pckt_pending,
+                             (opal_list_item_t*)pckt);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            continue;
+        }
+
+        /* Now see if this endpoint matches the one we are mapping
+         * out.  If so, adjust the bml entry to ensure it is not
+         * pointing at a stale bml.  We do not really care which
+         * BML it is pointing at as long as it is valid.  In either
+         * case, put the entry back on the list.
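+         * The eager array is used below since these control packets
+         * can travel over any remaining BTL;
+         * mca_bml_base_btl_array_get_next() just hands back the next
+         * valid entry.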
+         */
+        if (pckt->proc->proc_bml == ep) {
+            opal_output_verbose(15, mca_pml_bfo_output,
+                                "INFO: Found matching pckt on pckt_pending list, adjusting bml");
+            pckt->bml_btl = mca_bml_base_btl_array_get_next(&ep->btl_eager);
+        }
+        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+        opal_list_append(&mca_pml_bfo.pckt_pending,
+                         (opal_list_item_t*)pckt);
+        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+
+    }
+
+    /* This next list holds rdma fragments.  We need to walk through
+     * the list and see if any are associated with the endpoint
+     * we are mapping out.  If not, then just put them back on the
+     * list.  If they are, then we need to error them out.  One issue
+     * is that we need to deal with the case where there may be more
+     * than one pending rdma fragment for a request. */
+    s = (int32_t)opal_list_get_size(&mca_pml_bfo.rdma_pending);
+    for(i = 0; i < s; i++) {
+        mca_pml_bfo_rdma_frag_t* frag;
+        mca_pml_bfo_send_request_t* sendreq;
+        mca_pml_bfo_recv_request_t* recvreq;
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "INFO: rdma_pending list has %d entries", s);
+#if 1
+        /* TODO: Error out until code is tested */
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "%s:%d: Support not implemented, aborting",
+                            __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+#endif
+        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+        frag = (mca_pml_bfo_rdma_frag_t*)
+            opal_list_remove_first(&mca_pml_bfo.rdma_pending);
+        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+
+        /* My guess is that this can happen in the threaded
+         * case where the other thread removed some packets
+         * after we determined the size of the list. */
+        if(NULL == frag)
+            break;
+
+        /* Check to see if it matches our endpoint.  If it does,
+         * then check if it matches the BTL that is being mapped
+         * out.  If it does not, then just readjust the BML pointer.
+         * If it does, then we need to do something with it. */
+        if (frag->rdma_ep != ep) {
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.rdma_pending,
+                             (opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            continue;
+        }
+
+        /* If we are here, then we know we are working on the same
+         * endpoint.  Now check the BTL. */
+        if (frag->rdma_btl != btl) {
+            opal_output_verbose(15, mca_pml_bfo_output,
+                                "INFO: Found matching frag on rdma_pending list, adjusting bml");
+            /* The BTL this RDMA is associated with is not the
+             * one that is getting mapped out, so just adjust the
+             * BML pointer and put it back on the list. */
+            frag->rdma_bml = mca_bml_base_btl_array_find(&ep->btl_rdma, frag->rdma_btl);
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.rdma_pending,
+                             (opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            continue;
+        }
+
+        /* Now we call the restart routine.  This is just like if we got
+         * a completion event after calling an RDMA write.  This will
+         * take care of figuring out if we need to restart the request
+         * or wait for any outstanding events to complete.
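+         * PUT fragments are restarted from the send side via
+         * RNDVRESTARTNOTIFY, while RGET fragments are reported back
+         * by the receiver via RECVERRNOTIFY, mirroring the respective
+         * completion handlers.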
*/ + if(frag->rdma_state == MCA_PML_BFO_RDMA_PUT) { + opal_output_verbose(15, mca_pml_bfo_output, + "INFO: Found matching PUT frag on rdma_pending list, restarting"); + sendreq = frag->rdma_req; + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_PUT, 2, btl); + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + } else { + opal_output_verbose(15, mca_pml_bfo_output, + "INFO: Found matching RGET frag on rdma_pending list, sending reqerror"); + /* This is just like what we do on an rget completion event */ + recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; + mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, 2); + + /* See if the request has received a RNDVRESTARTNOTIFY */ + if( OPAL_UNLIKELY(recvreq->req_errstate)) { + if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, + MCA_PML_BFO_HDR_TYPE_RGET, + 2, btl); + } + } + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + } + } + + s = opal_list_get_size(&mca_pml_bfo.send_pending); + /* Look for pending events on our endpoint */ + for(i = 0; i < s; i++) { + mca_pml_bfo_send_request_t* sendreq; + ompi_proc_t* proc; + mca_bml_base_endpoint_t* bml_endpoint; + opal_output_verbose(0, mca_pml_bfo_output, + "INFO: send_pending list has %d entries", s); +#if 1 + /* TODO: Error out until code is tested */ + opal_output_verbose(0, mca_pml_bfo_output, + "%s:%d: Support not implemented, aborting", + __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); +#endif + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + sendreq = (mca_pml_bfo_send_request_t*) + opal_list_remove_first(&mca_pml_bfo.send_pending); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + + /* My guess is that this can happen in the threaded + * case where the other thread removed some packets + * after we determined the size of the list. */ + if(NULL == sendreq) + break; + + proc = (ompi_proc_t*)sendreq->req_send.req_base.req_proc; + bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + + /* Check to see if it matches our endpoint. If it does not, + * then just put it back on the list as there is nothing + * we need to do with it. */ + if (bml_endpoint != ep) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.send_pending, + (opal_list_item_t*)sendreq); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + continue; + } + + switch(sendreq->req_pending) { + case MCA_PML_BFO_SEND_PENDING_SCHEDULE: + /* If this send request is using the endpoint that received + * the error, then let us error it out. In the case + * where there is only one fragment left to be scheduled + * and it would have gone over the good BTL, this is + * not necessary. But, we will use simplicity here + * and assume that some of the fragments are still + * scheduled to go over the broken BTL. */ + sendreq->req_error++; + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_FRAG, 2, btl); + break; + case MCA_PML_BFO_SEND_PENDING_START: + /* If the request has not even started, then just put it back + * on the list. Nothing else to do with it. 
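+             * mca_pml_bfo_progress() will eventually pick it up again
+             * and try to start it over whatever BTLs remain.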
+             */
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.send_pending,
+                             (opal_list_item_t*)sendreq);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            break;
+        default:
+            opal_output(0, "[%s:%d] wrong send request type\n",
+                        __FILE__, __LINE__);
+            break;
+        }
+    }
+
+    s = (int)opal_list_get_size(&mca_pml_bfo.recv_pending);
+    for(i = 0; i < s; i++) {
+        mca_pml_bfo_recv_request_t* recvreq;
+        ompi_proc_t* proc;
+        mca_bml_base_endpoint_t* bml_endpoint;
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "INFO: recv_pending list has %d entries", s);
+#if 1
+        /* TODO: Error out until code is tested */
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "%s:%d: Support not implemented, aborting",
+                            __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+#endif
+        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+        recvreq = (mca_pml_bfo_recv_request_t*)
+            opal_list_remove_first(&mca_pml_bfo.recv_pending);
+        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+
+        /* My guess is that this can happen in the threaded
+         * case where the other thread removed some packets
+         * after we determined the size of the list. */
+        if(NULL == recvreq)
+            break;
+
+        proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
+        bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
+
+        if (bml_endpoint != ep) {
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.recv_pending,
+                             (opal_list_item_t*)recvreq);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            continue;
+        }
+
+        mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, 2);
+    }
+}
+
+/**
+ * Called each time we get a completion event on an ACK or PUT
+ * message; these are receive-side control messages.  This function
+ * is only called if the underlying BTL supports failover; otherwise
+ * there is no need for these checks.
+ */
+void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
+                                                  struct mca_btl_base_descriptor_t* des,
+                                                  int status)
+{
+    mca_pml_bfo_common_hdr_t * common = des->des_src->seg_addr.pval;
+    mca_pml_bfo_ack_hdr_t* ack;  /* ACK header */
+    mca_pml_bfo_rdma_hdr_t* hdr; /* PUT header */
+    struct mca_btl_base_descriptor_t* rdma_des;
+    mca_pml_bfo_recv_request_t* recvreq;
+
+    if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
+        switch (common->hdr_type) {
+        case MCA_PML_BFO_HDR_TYPE_ACK:
+            ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval;
+            recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval;
+
+            /* Record the error.  Send RECVERRNOTIFY if necessary.
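+             * If an error state was already recorded on the request,
+             * the error has already been handled, so that path only
+             * logs.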
*/ + if (recvreq->req_errstate) { + opal_output_verbose(30, mca_pml_bfo_output, + "ACK: completion failed, error already seen, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "ACK: completion failed, sending RECVERRNOTIFY to sender, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_ACK, status); + } + break; + + case MCA_PML_BFO_HDR_TYPE_PUT: + hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_src->seg_addr.pval; + rdma_des = hdr->hdr_des.pval; + recvreq = des->des_cbdata; + if ((NULL != rdma_des->des_cbdata) && (recvreq == rdma_des->des_cbdata)) { + /* We now record the error, send the RECVERRNOTIFY if + * necessary, and free the descriptor. Prior to this, + * we want to ensure that we have not reached the case + * where the PUT message actually made it over and we + * have already received a FIN back. We first check to + * see if the RDMA descriptor cbdata is pointing to + * NULL. If it is, this means that the PUT message must + * have made it over and a corresponding FIN already + * made it back and freed the RDMA descriptor. Second, + * if it is non-null, we make sure that it is pointing + * to the same request as the PUT descriptor is. If + * it is not, again we assume that the FIN came back + * and freed it. And we can count on the fact that the + * recvreq has not been freed or reused as it is held + * until this very completion event occurs. */ + if (recvreq->req_errstate) { + opal_output_verbose(30, mca_pml_bfo_output, + "PUT: completion failed, error already seen, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "PUT: completion failed, sending RECVERRNOTIFY to sender, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, status); + } +#if 0 + /* TODO: Add descriptor to receive request so it can + * be freed only when receive request is freed and + * only if needed. 
*/ + btl->btl_free(btl, rdma_des); +#endif + } + break; + default: + orte_errmgr.abort(-1, NULL); + } + } + + switch (common->hdr_type) { + case MCA_PML_BFO_HDR_TYPE_ACK: + ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval; + recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval; + recvreq->req_events--; + assert(recvreq->req_events >= 0); + if(OPAL_UNLIKELY (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED)) { + opal_output_verbose(30, mca_pml_bfo_output, + "ACK: completion: recvreq in error, outstanding events=%d " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + if (0 == recvreq->req_events) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_ACK, + status, btl); + } + return; + } + recv_request_pml_complete_check(recvreq); + break; + case MCA_PML_BFO_HDR_TYPE_PUT: + recvreq = des->des_cbdata; + recvreq->req_events--; + assert(recvreq->req_events >= 0); + if(OPAL_UNLIKELY(recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED)) { + opal_output_verbose(30, mca_pml_bfo_output, + "PUT: completion: recvreq in error, outstanding events=%d " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + if (0 == recvreq->req_events) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, + status, btl); + } + return; + } + recv_request_pml_complete_check(recvreq); + break; + } +} diff --git a/ompi/mca/pml/bfo/pml_bfo_failover.h b/ompi/mca/pml/bfo/pml_bfo_failover.h new file mode 100644 index 0000000000..2aea218ef8 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_failover.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * Functions that implement failover capabilities. 
+ */ + +#ifndef MCA_PML_BFO_FAILOVER_H +#define MCA_PML_BFO_FAILOVER_H + +#include "ompi/mca/btl/btl.h" +#include "pml_bfo_hdr.h" + +BEGIN_C_DECLS + +bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t* proc, + mca_pml_bfo_match_hdr_t *hdr); +bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descriptor_t* rdma, + mca_btl_base_module_t* btl); + +mca_pml_bfo_recv_request_t* mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr); + +void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq, + bool repost, mca_btl_base_tag_t tag); +void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* sendreq, + bool repost, mca_btl_base_tag_t tag, int status, + mca_btl_base_module_t* btl); +void mca_pml_bfo_send_request_rndvrestartnack(mca_pml_bfo_send_request_t* sendreq); + +void +mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status); +void +mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl, + struct mca_btl_base_descriptor_t* des, + int status); + +/* Reset a receive request to the beginning */ +void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t* recvreq); +/* Notify sender that receiver detected an error */ +void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_tag_t tag, int status); +/* Ack the RNDVRESTARTNOTIFY message */ +void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_tag_t tag, int status, + mca_btl_base_module_t* btl); +/* Nack the RNDVRESTARTNOTIFY message */ +void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes, + ompi_proc_t* ompi_proc, bool repost); + +void mca_pml_bfo_recv_restart_completion(mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status); +void mca_pml_bfo_failover_error_handler(struct mca_btl_base_module_t* btl, + int32_t flags, ompi_proc_t *errproc, char *btlname); +void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des); +void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des); + +void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl, + ompi_proc_t *errproc, char *btlname); + +extern void mca_pml_bfo_map_out( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + + + + +/** + * Four new callbacks for the four new message types. 
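+ * There is one callback per new header type: RNDVRESTARTNOTIFY,
+ * RNDVRESTARTACK, RNDVRESTARTNACK and RECVERRNOTIFY.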
+ */
+extern void mca_pml_bfo_recv_frag_callback_rndvrestartnotify( mca_btl_base_module_t *btl,
+                                                              mca_btl_base_tag_t tag,
+                                                              mca_btl_base_descriptor_t* descriptor,
+                                                              void* cbdata );
+
+extern void mca_pml_bfo_recv_frag_callback_rndvrestartack( mca_btl_base_module_t *btl,
+                                                           mca_btl_base_tag_t tag,
+                                                           mca_btl_base_descriptor_t* descriptor,
+                                                           void* cbdata );
+
+extern void mca_pml_bfo_recv_frag_callback_rndvrestartnack( mca_btl_base_module_t *btl,
+                                                            mca_btl_base_tag_t tag,
+                                                            mca_btl_base_descriptor_t* descriptor,
+                                                            void* cbdata );
+
+extern void mca_pml_bfo_recv_frag_callback_recverrnotify( mca_btl_base_module_t *btl,
+                                                          mca_btl_base_tag_t tag,
+                                                          mca_btl_base_descriptor_t* descriptor,
+                                                          void* cbdata );
+
+END_C_DECLS
+
+#endif
diff --git a/ompi/mca/pml/bfo/pml_bfo_hdr.h b/ompi/mca/pml/bfo/pml_bfo_hdr.h
new file mode 100644
index 0000000000..6e9e63aba0
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_hdr.h
@@ -0,0 +1,516 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      IBM Corporation. All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_PML_BFO_HEADER_H
+#define MCA_PML_BFO_HEADER_H
+
+#include "ompi_config.h"
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+
+#include "opal/types.h"
+#include "opal/util/arch.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/proc/proc.h"
+
+#define MCA_PML_BFO_HDR_TYPE_MATCH (MCA_BTL_TAG_PML + 1)
+#define MCA_PML_BFO_HDR_TYPE_RNDV  (MCA_BTL_TAG_PML + 2)
+#define MCA_PML_BFO_HDR_TYPE_RGET  (MCA_BTL_TAG_PML + 3)
+#define MCA_PML_BFO_HDR_TYPE_ACK   (MCA_BTL_TAG_PML + 4)
+#define MCA_PML_BFO_HDR_TYPE_NACK  (MCA_BTL_TAG_PML + 5)
+#define MCA_PML_BFO_HDR_TYPE_FRAG  (MCA_BTL_TAG_PML + 6)
+#define MCA_PML_BFO_HDR_TYPE_GET   (MCA_BTL_TAG_PML + 7)
+#define MCA_PML_BFO_HDR_TYPE_PUT   (MCA_BTL_TAG_PML + 8)
+#define MCA_PML_BFO_HDR_TYPE_FIN   (MCA_BTL_TAG_PML + 9)
+/* BFO FAILOVER CODE - begin */
+#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY (MCA_BTL_TAG_PML + 10)
+#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK    (MCA_BTL_TAG_PML + 11)
+#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK   (MCA_BTL_TAG_PML + 12)
+#define MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY     (MCA_BTL_TAG_PML + 13)
+/* BFO FAILOVER CODE - end */
+
+#define MCA_PML_BFO_HDR_FLAGS_ACK     1  /* is an ack required */
+#define MCA_PML_BFO_HDR_FLAGS_NBO     2  /* is the hdr in network byte order */
+#define MCA_PML_BFO_HDR_FLAGS_PIN     4  /* is user buffer pinned */
+#define MCA_PML_BFO_HDR_FLAGS_CONTIG  8  /* is user buffer contiguous */
+#define MCA_PML_BFO_HDR_FLAGS_NORDMA 16  /* rest will be sent by copy-in/out */
+/* BFO FAILOVER CODE - begin */
+#define MCA_PML_BFO_HDR_FLAGS_RESTART 32 /* restart RNDV because of error */
+/* BFO FAILOVER CODE - end */
+
+/**
+ * Common hdr attributes - must be first element in each hdr type
+ */
+struct mca_pml_bfo_common_hdr_t {
+    uint8_t hdr_type;  /**< type of envelope */
+    uint8_t hdr_flags; /**< flags indicating how fragment should be processed
*/ +}; +typedef struct mca_pml_bfo_common_hdr_t mca_pml_bfo_common_hdr_t; + +#define MCA_PML_BFO_COMMON_HDR_NTOH(h) +#define MCA_PML_BFO_COMMON_HDR_HTON(h) + +/** + * Header definition for the first fragment, contains the + * attributes required to match the corresponding posted receive. + */ +struct mca_pml_bfo_match_hdr_t { + mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */ + uint16_t hdr_ctx; /**< communicator index */ + int32_t hdr_src; /**< source rank */ + int32_t hdr_tag; /**< user tag */ + uint16_t hdr_seq; /**< message sequence number */ +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t hdr_padding[2]; /**< explicitly pad to 16 bytes. Compilers seem to already prefer to do this, but make it explicit just in case */ +#endif +}; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT +#define OMPI_PML_BFO_MATCH_HDR_LEN 16 +#else +#define OMPI_PML_BFO_MATCH_HDR_LEN 14 +#endif + +typedef struct mca_pml_bfo_match_hdr_t mca_pml_bfo_match_hdr_t; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG +#define MCA_PML_BFO_MATCH_HDR_FILL(h) \ +do { \ + (h).hdr_padding[0] = 0; \ + (h).hdr_padding[1] = 0; \ +} while(0) +#else +#define MCA_PML_BFO_MATCH_HDR_FILL(h) +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + +#define MCA_PML_BFO_MATCH_HDR_NTOH(h) \ +do { \ + MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \ + (h).hdr_ctx = ntohs((h).hdr_ctx); \ + (h).hdr_src = ntohl((h).hdr_src); \ + (h).hdr_tag = ntohl((h).hdr_tag); \ + (h).hdr_seq = ntohs((h).hdr_seq); \ +} while (0) + +#define MCA_PML_BFO_MATCH_HDR_HTON(h) \ +do { \ + MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \ + MCA_PML_BFO_MATCH_HDR_FILL(h); \ + (h).hdr_ctx = htons((h).hdr_ctx); \ + (h).hdr_src = htonl((h).hdr_src); \ + (h).hdr_tag = htonl((h).hdr_tag); \ + (h).hdr_seq = htons((h).hdr_seq); \ +} while (0) + +/** + * Header definition for the first fragment when an acknowledgment + * is required. This could be the first fragment of a large message + * or a short message that requires an ack (synchronous). 
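+ * The failover-only fields below allow a restarted rendezvous to be
+ * tied back to the original requests on both sides.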
+ */ +struct mca_pml_bfo_rendezvous_hdr_t { + mca_pml_bfo_match_hdr_t hdr_match; + uint64_t hdr_msg_length; /**< message length */ + ompi_ptr_t hdr_src_req; /**< pointer to source request - returned in ack */ +/* BFO FAILOVER CODE - begin */ + ompi_ptr_t hdr_dst_req; /**< pointer to dst req - failover use only */ + uint8_t hdr_restartseq; /**< restart sequence - failover use only */ +/* BFO FAILOVER CODE - end */ +}; +typedef struct mca_pml_bfo_rendezvous_hdr_t mca_pml_bfo_rendezvous_hdr_t; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG +#define MCA_PML_BFO_RNDV_HDR_FILL(h) \ + MCA_PML_BFO_MATCH_HDR_FILL((h).hdr_match) +#else +#define MCA_PML_BFO_RNDV_HDR_FILL(h) +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + +/* Note that hdr_src_req is not put in network byte order because it + is never processed by the receiver, other than being copied into + the ack header */ +#define MCA_PML_BFO_RNDV_HDR_NTOH(h) \ + do { \ + MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \ + (h).hdr_msg_length = ntoh64((h).hdr_msg_length); \ + } while (0) + +#define MCA_PML_BFO_RNDV_HDR_HTON(h) \ + do { \ + MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \ + MCA_PML_BFO_RNDV_HDR_FILL(h); \ + (h).hdr_msg_length = hton64((h).hdr_msg_length); \ + } while (0) + +/** + * Header definition for a combined rdma rendezvous/get + */ +struct mca_pml_bfo_rget_hdr_t { + mca_pml_bfo_rendezvous_hdr_t hdr_rndv; + uint32_t hdr_seg_cnt; /**< number of segments for rdma */ +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t hdr_padding[4]; +#endif + ompi_ptr_t hdr_des; /**< source descriptor */ + mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */ +}; +typedef struct mca_pml_bfo_rget_hdr_t mca_pml_bfo_rget_hdr_t; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG +#define MCA_PML_BFO_RGET_HDR_FILL(h) \ +do { \ + MCA_PML_BFO_RNDV_HDR_FILL((h).hdr_rndv); \ + (h).hdr_padding[0] = 0; \ + (h).hdr_padding[1] = 0; \ + (h).hdr_padding[2] = 0; \ + (h).hdr_padding[3] = 0; \ +} while(0) +#else +#define MCA_PML_BFO_RGET_HDR_FILL(h) +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + +#define MCA_PML_BFO_RGET_HDR_NTOH(h) \ + do { \ + MCA_PML_BFO_RNDV_HDR_NTOH((h).hdr_rndv); \ + (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \ + } while (0) + +#define MCA_PML_BFO_RGET_HDR_HTON(h) \ + do { \ + MCA_PML_BFO_RNDV_HDR_HTON((h).hdr_rndv); \ + MCA_PML_BFO_RGET_HDR_FILL(h); \ + (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \ + } while (0) + +/** + * Header for subsequent fragments. 
+ */
+struct mca_pml_bfo_frag_hdr_t {
+    mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[6];
+#endif
+    uint64_t hdr_frag_offset;            /**< offset into message */
+    ompi_ptr_t hdr_src_req;              /**< pointer to source request */
+    ompi_ptr_t hdr_dst_req;              /**< pointer to matched receive */
+};
+typedef struct mca_pml_bfo_frag_hdr_t mca_pml_bfo_frag_hdr_t;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
+#define MCA_PML_BFO_FRAG_HDR_FILL(h) \
+do {                                 \
+    (h).hdr_padding[0] = 0;          \
+    (h).hdr_padding[1] = 0;          \
+    (h).hdr_padding[2] = 0;          \
+    (h).hdr_padding[3] = 0;          \
+    (h).hdr_padding[4] = 0;          \
+    (h).hdr_padding[5] = 0;          \
+} while(0)
+#else
+#define MCA_PML_BFO_FRAG_HDR_FILL(h)
+#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+
+#define MCA_PML_BFO_FRAG_HDR_NTOH(h)                            \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common);            \
+        (h).hdr_frag_offset = ntoh64((h).hdr_frag_offset);      \
+    } while (0)
+
+#define MCA_PML_BFO_FRAG_HDR_HTON(h)                            \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common);            \
+        MCA_PML_BFO_FRAG_HDR_FILL(h);                           \
+        (h).hdr_frag_offset = hton64((h).hdr_frag_offset);      \
+    } while (0)
+
+/**
+ * Header used to acknowledge outstanding fragment(s).
+ */
+
+struct mca_pml_bfo_ack_hdr_t {
+    mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[6];
+#endif
+    ompi_ptr_t hdr_src_req;              /**< source request */
+    ompi_ptr_t hdr_dst_req;              /**< matched receive request */
+    uint64_t hdr_send_offset;            /**< starting point of copy in/out */
+};
+typedef struct mca_pml_bfo_ack_hdr_t mca_pml_bfo_ack_hdr_t;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
+#define MCA_PML_BFO_ACK_HDR_FILL(h) \
+do {                                \
+    (h).hdr_padding[0] = 0;         \
+    (h).hdr_padding[1] = 0;         \
+    (h).hdr_padding[2] = 0;         \
+    (h).hdr_padding[3] = 0;         \
+    (h).hdr_padding[4] = 0;         \
+    (h).hdr_padding[5] = 0;         \
+} while (0)
+#else
+#define MCA_PML_BFO_ACK_HDR_FILL(h)
+#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+
+/* Note that the request headers are not put in NBO because the
+   src_req is already in receiver's byte order and the dst_req is not
+   used by the receiver for anything other than backpointers in return
+   headers */
+#define MCA_PML_BFO_ACK_HDR_NTOH(h)                             \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common);            \
+        (h).hdr_send_offset = ntoh64((h).hdr_send_offset);      \
+    } while (0)
+
+#define MCA_PML_BFO_ACK_HDR_HTON(h)                             \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common);            \
+        MCA_PML_BFO_ACK_HDR_FILL(h);                            \
+        (h).hdr_send_offset = hton64((h).hdr_send_offset);      \
+    } while (0)
+
+/**
+ * Header used to initiate an RDMA operation.
+ */
+
+struct mca_pml_bfo_rdma_hdr_t {
+    mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[2];              /**< two bytes of padding to bring the hdr to 4-byte alignment;
+                                          *   hdr_req will then be 8-byte aligned after the 4-byte hdr_seg_cnt */
+#endif
+    uint32_t hdr_seg_cnt;                /**< number of segments for rdma */
+    ompi_ptr_t hdr_req;                  /**< destination request */
+/* BFO FAILOVER CODE - begin */
+    ompi_ptr_t hdr_dst_req;              /**< pointer to destination request */
+/* BFO FAILOVER CODE - end */
+    ompi_ptr_t hdr_des;                  /**< source descriptor */
+    uint64_t hdr_rdma_offset;            /**< current offset into user buffer */
+    mca_btl_base_segment_t hdr_segs[1];  /**< list of segments for rdma */
+};
+typedef struct mca_pml_bfo_rdma_hdr_t mca_pml_bfo_rdma_hdr_t;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
+#define MCA_PML_BFO_RDMA_HDR_FILL(h) \
+do {                                 \
+    (h).hdr_padding[0] = 0;          \
+    (h).hdr_padding[1] = 0;          \
+} while(0)
+#else
+#define MCA_PML_BFO_RDMA_HDR_FILL(h)
+#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+
+#define MCA_PML_BFO_RDMA_HDR_NTOH(h)                            \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common);            \
+        (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt);               \
+        (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset);      \
+    } while (0)
+
+#define MCA_PML_BFO_RDMA_HDR_HTON(h)                            \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common);            \
+        MCA_PML_BFO_RDMA_HDR_FILL(h);                           \
+        (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt);               \
+        (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset);      \
+    } while (0)
+
+/**
+ * Header used to complete an RDMA operation.
+ */
+
+struct mca_pml_bfo_fin_hdr_t {
+/* BFO FAILOVER CODE - begin */
+    mca_pml_bfo_match_hdr_t hdr_match; /**< match info - needed for failover */
+    uint8_t hdr_restartseq;            /**< restart sequence - failover use only */
+/* BFO FAILOVER CODE - end */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[7];
+#endif
+    uint32_t hdr_fail;                 /**< RDMA operation failed */
+    ompi_ptr_t hdr_des;                /**< completed descriptor */
+};
+typedef struct mca_pml_bfo_fin_hdr_t mca_pml_bfo_fin_hdr_t;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
+/* Zero all seven padding bytes, matching the other FILL macros. */
+#define MCA_PML_BFO_FIN_HDR_FILL(h) \
+do {                                \
+    (h).hdr_padding[0] = 0;         \
+    (h).hdr_padding[1] = 0;         \
+    (h).hdr_padding[2] = 0;         \
+    (h).hdr_padding[3] = 0;         \
+    (h).hdr_padding[4] = 0;         \
+    (h).hdr_padding[5] = 0;         \
+    (h).hdr_padding[6] = 0;         \
+} while (0)
+#else
+#define MCA_PML_BFO_FIN_HDR_FILL(h)
+#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+
+#define MCA_PML_BFO_FIN_HDR_NTOH(h)                             \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common);            \
+    } while (0)
+
+#define MCA_PML_BFO_FIN_HDR_HTON(h)                             \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common);            \
+        MCA_PML_BFO_FIN_HDR_FILL(h);                            \
+    } while (0)
+
+/* BFO FAILOVER CODE - begin */
+/**
+ * Header used to restart a rendezvous request.
+ */
+struct mca_pml_bfo_restart_hdr_t {
+    mca_pml_bfo_match_hdr_t hdr_match; /**< needed to avoid duplicate messages */
+    uint8_t hdr_restartseq;            /**< restart sequence */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[3];
+#endif
+    ompi_ptr_t hdr_src_req;            /**< source request */
+    ompi_ptr_t hdr_dst_req;            /**< matched receive request */
+    int32_t hdr_dst_rank;              /**< needed to send NACK */
+    uint32_t hdr_jobid;                /**< needed to send NACK */
+    uint32_t hdr_vpid;                 /**< needed to send NACK */
+};
+typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
+
+/* Only need to put parts of the restart header in NBO.  No need
+   to do hdr_src_req and hdr_dst_req, as they are only used
+   by the process that originated them.
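+   The rank, jobid and vpid fields are converted, however, since the
+   NACK path needs to read them on the remote side.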
*/ +#define MCA_PML_BFO_RESTART_HDR_NTOH(h) \ + do { \ + MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \ + (h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \ + (h).hdr_jobid = ntohl((h).hdr_jobid); \ + (h).hdr_vpid = ntohl((h).hdr_vpid); \ + } while (0) + +#define MCA_PML_BFO_RESTART_HDR_HTON(h) \ + do { \ + MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \ + (h).hdr_dst_rank = htonl((h).hdr_dst_rank); \ + (h).hdr_jobid = htonl((h).hdr_jobid); \ + (h).hdr_vpid = htonl((h).hdr_vpid); \ + } while (0) +/* BFO FAILOVER CODE - end */ + +/** + * Union of defined hdr types. + */ +union mca_pml_bfo_hdr_t { + mca_pml_bfo_common_hdr_t hdr_common; + mca_pml_bfo_match_hdr_t hdr_match; + mca_pml_bfo_rendezvous_hdr_t hdr_rndv; + mca_pml_bfo_rget_hdr_t hdr_rget; + mca_pml_bfo_frag_hdr_t hdr_frag; + mca_pml_bfo_ack_hdr_t hdr_ack; + mca_pml_bfo_rdma_hdr_t hdr_rdma; + mca_pml_bfo_fin_hdr_t hdr_fin; +/* BFO FAILOVER CODE - begin */ + mca_pml_bfo_restart_hdr_t hdr_restart; +/* BFO FAILOVER CODE - end */ +}; +typedef union mca_pml_bfo_hdr_t mca_pml_bfo_hdr_t; + +#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT +static inline __opal_attribute_always_inline__ void +bfo_hdr_ntoh(mca_pml_bfo_hdr_t *hdr, const uint8_t hdr_type) +{ + if(!(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NBO)) + return; + + switch(hdr_type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + MCA_PML_BFO_MATCH_HDR_NTOH(hdr->hdr_match); + break; + case MCA_PML_BFO_HDR_TYPE_RNDV: + MCA_PML_BFO_RNDV_HDR_NTOH(hdr->hdr_rndv); + break; + case MCA_PML_BFO_HDR_TYPE_RGET: + MCA_PML_BFO_RGET_HDR_NTOH(hdr->hdr_rget); + break; + case MCA_PML_BFO_HDR_TYPE_ACK: + MCA_PML_BFO_ACK_HDR_NTOH(hdr->hdr_ack); + break; + case MCA_PML_BFO_HDR_TYPE_FRAG: + MCA_PML_BFO_FRAG_HDR_NTOH(hdr->hdr_frag); + break; + case MCA_PML_BFO_HDR_TYPE_PUT: + MCA_PML_BFO_RDMA_HDR_NTOH(hdr->hdr_rdma); + break; + case MCA_PML_BFO_HDR_TYPE_FIN: + MCA_PML_BFO_FIN_HDR_NTOH(hdr->hdr_fin); + break; + default: + assert(0); + break; + } +} +#else +#define bfo_hdr_ntoh(h, t) do{}while(0) +#endif + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT +#define bfo_hdr_hton(h, t, p) \ + bfo_hdr_hton_intr((mca_pml_bfo_hdr_t*)h, t, p) +static inline __opal_attribute_always_inline__ void +bfo_hdr_hton_intr(mca_pml_bfo_hdr_t *hdr, const uint8_t hdr_type, + const ompi_proc_t *proc) +{ +#ifdef WORDS_BIGENDIAN + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_NBO; +#else + + if(!(proc->proc_arch & OPAL_ARCH_ISBIGENDIAN)) + return; + + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_NBO; + switch(hdr_type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + MCA_PML_BFO_MATCH_HDR_HTON(hdr->hdr_match); + break; + case MCA_PML_BFO_HDR_TYPE_RNDV: + MCA_PML_BFO_RNDV_HDR_HTON(hdr->hdr_rndv); + break; + case MCA_PML_BFO_HDR_TYPE_RGET: + MCA_PML_BFO_RGET_HDR_HTON(hdr->hdr_rget); + break; + case MCA_PML_BFO_HDR_TYPE_ACK: + MCA_PML_BFO_ACK_HDR_HTON(hdr->hdr_ack); + break; + case MCA_PML_BFO_HDR_TYPE_FRAG: + MCA_PML_BFO_FRAG_HDR_HTON(hdr->hdr_frag); + break; + case MCA_PML_BFO_HDR_TYPE_PUT: + MCA_PML_BFO_RDMA_HDR_HTON(hdr->hdr_rdma); + break; + case MCA_PML_BFO_HDR_TYPE_FIN: + MCA_PML_BFO_FIN_HDR_HTON(hdr->hdr_fin); + break; + default: + assert(0); + break; + } +#endif +} +#else +#define bfo_hdr_hton(h, t, p) do{}while(0) +#endif +#endif diff --git a/ompi/mca/pml/bfo/pml_bfo_iprobe.c b/ompi/mca/pml/bfo/pml_bfo_iprobe.c new file mode 100644 index 0000000000..70a931927e --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_iprobe.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University 
Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ompi/request/request.h" +#include "pml_bfo_recvreq.h" + + +int mca_pml_bfo_iprobe(int src, + int tag, + struct ompi_communicator_t *comm, + int *matched, ompi_status_public_t * status) +{ + int rc = OMPI_SUCCESS; + mca_pml_bfo_recv_request_t recvreq; + + OBJ_CONSTRUCT( &recvreq, mca_pml_bfo_recv_request_t ); + recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML; + recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_IPROBE; + + MCA_PML_BFO_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true); + MCA_PML_BFO_RECV_REQUEST_START(&recvreq); + + if( recvreq.req_recv.req_base.req_ompi.req_complete == true ) { + if( NULL != status ) { + *status = recvreq.req_recv.req_base.req_ompi.req_status; + } + *matched = 1; + } else { + *matched = 0; + opal_progress(); + } + MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); + return rc; +} + + +int mca_pml_bfo_probe(int src, + int tag, + struct ompi_communicator_t *comm, + ompi_status_public_t * status) +{ + mca_pml_bfo_recv_request_t recvreq; + + OBJ_CONSTRUCT( &recvreq, mca_pml_bfo_recv_request_t ); + recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML; + recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_PROBE; + + MCA_PML_BFO_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true); + MCA_PML_BFO_RECV_REQUEST_START(&recvreq); + + ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi); + + if (NULL != status) { + *status = recvreq.req_recv.req_base.req_ompi.req_status; + } + MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/bfo/pml_bfo_irecv.c b/ompi/mca/pml/bfo/pml_bfo_irecv.c new file mode 100644 index 0000000000..62bdf78794 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_irecv.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ompi/request/request.h" +#include "pml_bfo_recvreq.h" +#include "ompi/peruse/peruse-internal.h" + +int mca_pml_bfo_irecv_init(void *addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + int rc; + mca_pml_bfo_recv_request_t *recvreq; + MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, true); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + *request = (ompi_request_t *) recvreq; + return OMPI_SUCCESS; +} + +int mca_pml_bfo_irecv(void *addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + int rc; + + mca_pml_bfo_recv_request_t *recvreq; + MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + MCA_PML_BFO_RECV_REQUEST_START(recvreq); + *request = (ompi_request_t *) recvreq; + return OMPI_SUCCESS; +} + + +int mca_pml_bfo_recv(void *addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + ompi_status_public_t * status) +{ + int rc; + mca_pml_bfo_recv_request_t *recvreq; + MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + MCA_PML_BFO_RECV_REQUEST_START(recvreq); + ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi); + + if (NULL != status) { /* return status */ + *status = recvreq->req_recv.req_base.req_ompi.req_status; + } + rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; + ompi_request_free( (ompi_request_t**)&recvreq ); + return rc; +} diff --git a/ompi/mca/pml/bfo/pml_bfo_isend.c b/ompi/mca/pml/bfo/pml_bfo_isend.c new file mode 100644 index 0000000000..d2cf02c705 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_isend.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_bfo.h" +#include "pml_bfo_sendreq.h" +#include "pml_bfo_recvreq.h" +#include "ompi/peruse/peruse-internal.h" + +int mca_pml_bfo_isend_init(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm, + ompi_request_t ** request) +{ + int rc; + + mca_pml_bfo_send_request_t *sendreq = NULL; + MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_BFO_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, true); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + *request = (ompi_request_t *) sendreq; + return OMPI_SUCCESS; +} + + +int mca_pml_bfo_isend(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm, + ompi_request_t ** request) +{ + int rc; + mca_pml_bfo_send_request_t *sendreq = NULL; + + MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_BFO_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc); + *request = (ompi_request_t *) sendreq; + return rc; +} + + +int mca_pml_bfo_send(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm) +{ + int rc; + mca_pml_bfo_send_request_t *sendreq; + + MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_BFO_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc); + if (rc != OMPI_SUCCESS) { + MCA_PML_BFO_SEND_REQUEST_RETURN( sendreq ); + return rc; + } + + ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi); + + rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR; + ompi_request_free( (ompi_request_t**)&sendreq ); + return rc; +} diff --git a/ompi/mca/pml/bfo/pml_bfo_progress.c b/ompi/mca/pml/bfo/pml_bfo_progress.c new file mode 100644 index 0000000000..07c92125d0 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_progress.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "pml_bfo.h"
+#include "pml_bfo_sendreq.h"
+#include "ompi/mca/bml/base/base.h"
+
+int mca_pml_bfo_progress(void)
+{
+    int i, queue_length = opal_list_get_size(&mca_pml_bfo.send_pending);
+    int j, completed_requests = 0;
+    bool send_succeeded;
+
+    if( OPAL_LIKELY(0 == queue_length) )
+        return 0;
+
+    for( i = 0; i < queue_length; i++ ) {
+        mca_pml_bfo_send_pending_t pending_type = MCA_PML_BFO_SEND_PENDING_NONE;
+        mca_pml_bfo_send_request_t* sendreq;
+        mca_bml_base_endpoint_t* endpoint;
+
+        sendreq = get_request_from_send_pending(&pending_type);
+        if(OPAL_UNLIKELY(NULL == sendreq))
+            break;
+
+        switch(pending_type) {
+        case MCA_PML_BFO_SEND_PENDING_NONE:
+            assert(0);
+            return 0;
+        case MCA_PML_BFO_SEND_PENDING_SCHEDULE:
+            if( mca_pml_bfo_send_request_schedule_exclusive(sendreq) ==
+                OMPI_ERR_OUT_OF_RESOURCE ) {
+                return 0;
+            }
+            completed_requests++;
+            break;
+        case MCA_PML_BFO_SEND_PENDING_START:
+            endpoint = sendreq->req_endpoint;
+            send_succeeded = false;
+            for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) {
+                mca_bml_base_btl_t* bml_btl;
+                int rc;
+
+                /* select a btl */
+                bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
+                rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
+                if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
+                    send_succeeded = true;
+                    completed_requests++;
+                    break;
+                }
+            }
+            if( false == send_succeeded ) {
+                add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
+            }
+            break;
+        }
+    }
+    return completed_requests;
+}
+
diff --git a/ompi/mca/pml/bfo/pml_bfo_rdma.c b/ompi/mca/pml/bfo/pml_bfo_rdma.c
new file mode 100644
index 0000000000..129f68059d
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_rdma.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+
+#include "ompi_config.h"
+#include "ompi/constants.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/bml/bml.h"
+#include "ompi/mca/mpool/mpool.h"
+#include "pml_bfo.h"
+#include "pml_bfo_rdma.h"
+
+/* Use this registration, rather than NULL, when no registration is
+ * needed for a BTL.  This helps other code distinguish the case where
+ * memory is not registered from the case where registration is not
+ * needed. */
+static mca_mpool_base_registration_t pml_bfo_dummy_reg;
+
+/*
+ * Check to see if memory is registered or can be registered.  Build a
+ * set of registrations on the request.
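+ * Returns the number of rdma_btls entries that were filled in with a
+ * usable registration, or 0 to force the pipeline protocol.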
+ */
+
+size_t mca_pml_bfo_rdma_btls(
+    mca_bml_base_endpoint_t* bml_endpoint,
+    unsigned char* base,
+    size_t size,
+    mca_pml_bfo_com_btl_t* rdma_btls)
+{
+    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+    int num_btls_used = 0, n;
+
+    /* shortcut when there are no rdma capable btls */
+    if(num_btls == 0) {
+        return 0;
+    }
+
+    /* check to see if memory is registered */
+    for(n = 0; n < num_btls && num_btls_used < mca_pml_bfo.max_rdma_per_request;
+            n++) {
+        mca_bml_base_btl_t* bml_btl =
+            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
+                    (bml_endpoint->btl_rdma_index + n) % num_btls);
+        mca_mpool_base_registration_t* reg = &pml_bfo_dummy_reg;
+        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
+
+        if( NULL != btl_mpool ) {
+            if(!mca_pml_bfo.leave_pinned) {
+                /* look through existing registrations */
+                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
+            } else {
+                /* register the memory */
+                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
+            }
+
+            if(NULL == reg)
+                continue;
+        }
+
+        rdma_btls[num_btls_used].bml_btl = bml_btl;
+        rdma_btls[num_btls_used].btl_reg = reg;
+        weight_total += bml_btl->btl_weight;
+        num_btls_used++;
+    }
+
+    /* If we don't use leave_pinned, and the BTLs that already have this
+     * memory registered amount to less than half of the available
+     * bandwidth, fall back to the pipeline protocol. */
+    if(0 == num_btls_used || (!mca_pml_bfo.leave_pinned && weight_total < 0.5))
+        return 0;
+
+    mca_pml_bfo_calc_weighted_length(rdma_btls, num_btls_used, size,
+                                     weight_total);
+
+    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
+    return num_btls_used;
+}
+
+size_t mca_pml_bfo_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
+                                       size_t size,
+                                       mca_pml_bfo_com_btl_t* rdma_btls )
+{
+    int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+
+    for(i = 0; i < num_btls && i < mca_pml_bfo.max_rdma_per_request; i++) {
+        rdma_btls[i].bml_btl =
+            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
+        if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
+            rdma_btls[i].btl_reg = NULL;
+        else
+            rdma_btls[i].btl_reg = &pml_bfo_dummy_reg;
+
+        weight_total += rdma_btls[i].bml_btl->btl_weight;
+    }
+
+    mca_pml_bfo_calc_weighted_length(rdma_btls, i, size, weight_total);
+
+    return i;
+}
diff --git a/ompi/mca/pml/bfo/pml_bfo_rdma.h b/ompi/mca/pml/bfo/pml_bfo_rdma.h
new file mode 100644
index 0000000000..8572682d36
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_rdma.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_BFO_RDMA_H +#define MCA_PML_BFO_RDMA_H + +struct mca_bml_base_endpoint_t; + +/* + * Of the set of available btls that support RDMA, + * find those that already have registrations - or + * register if required (for leave_pinned option) + */ +size_t mca_pml_bfo_rdma_btls(struct mca_bml_base_endpoint_t* endpoint, + unsigned char* base, size_t size, struct mca_pml_bfo_com_btl_t* btls); + +/* Choose RDMA BTLs to use for sending of a request by pipeline protocol. + * Calculate number of bytes to send through each BTL according to available + * bandwidth */ +size_t mca_pml_bfo_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint, + size_t size, mca_pml_bfo_com_btl_t* rdma_btls); +#endif + diff --git a/ompi/mca/pml/bfo/pml_bfo_rdmafrag.c b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.c new file mode 100644 index 0000000000..b99e30a8de --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_bfo.h" +#include "pml_bfo_rdmafrag.h" + + +OBJ_CLASS_INSTANCE( + mca_pml_bfo_rdma_frag_t, + ompi_free_list_item_t, + NULL, + NULL); diff --git a/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h new file mode 100644 index 0000000000..51dc4727b2 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+
+#ifndef MCA_PML_BFO_RDMAFRAG_H
+#define MCA_PML_BFO_RDMAFRAG_H
+
+#include "ompi/mca/btl/btl.h"
+#include "pml_bfo_hdr.h"
+
+BEGIN_C_DECLS
+
+typedef enum {
+    MCA_PML_BFO_RDMA_PUT,
+    MCA_PML_BFO_RDMA_GET
+} mca_pml_bfo_rdma_state_t;
+
+struct mca_pml_bfo_rdma_frag_t {
+    ompi_free_list_item_t super;
+    mca_bml_base_btl_t* rdma_bml;
+    mca_btl_base_module_t* rdma_btl;
+    mca_pml_bfo_hdr_t rdma_hdr;
+    mca_pml_bfo_rdma_state_t rdma_state;
+    size_t rdma_length;
+    mca_btl_base_segment_t rdma_segs[MCA_BTL_DES_MAX_SEGMENTS];
+    void *rdma_req;
+    struct mca_bml_base_endpoint_t* rdma_ep;
+    opal_convertor_t convertor;
+    mca_mpool_base_registration_t* reg;
+    uint32_t retries;
+};
+typedef struct mca_pml_bfo_rdma_frag_t mca_pml_bfo_rdma_frag_t;
+
+OBJ_CLASS_DECLARATION(mca_pml_bfo_rdma_frag_t);
+
+
+#define MCA_PML_BFO_RDMA_FRAG_ALLOC(frag,rc)                    \
+do {                                                            \
+    ompi_free_list_item_t* item;                                \
+    OMPI_FREE_LIST_WAIT(&mca_pml_bfo.rdma_frags, item, rc);     \
+    frag = (mca_pml_bfo_rdma_frag_t*)item;                      \
+} while(0)
+
+#define MCA_PML_BFO_RDMA_FRAG_RETURN(frag)                      \
+do {                                                            \
+    /* return fragment */                                       \
+    OMPI_FREE_LIST_RETURN(&mca_pml_bfo.rdma_frags,              \
+                          (ompi_free_list_item_t*)frag);        \
+} while(0)
+
+
+END_C_DECLS
+#endif
+
diff --git a/ompi/mca/pml/bfo/pml_bfo_recvfrag.c b/ompi/mca/pml/bfo/pml_bfo_recvfrag.c
new file mode 100644
index 0000000000..b940815ae6
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_recvfrag.c
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
+ * Copyright (c) 2006-2008 University of Houston. All rights reserved.
+ * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ */
+
+#include "ompi_config.h"
+
+#include "opal/class/opal_list.h"
+#include "opal/threads/mutex.h"
+#include "opal/prefetch.h"
+#include "ompi/constants.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/mca/pml/pml.h"
+#include "pml_bfo.h"
+#include "pml_bfo_comm.h"
+#include "pml_bfo_recvfrag.h"
+#include "pml_bfo_recvreq.h"
+#include "pml_bfo_sendreq.h"
+#include "pml_bfo_hdr.h"
+/* BFO FAILOVER CODE - begin */
+#include "pml_bfo_failover.h"
+/* BFO FAILOVER CODE - end */
+#include "ompi/peruse/peruse-internal.h"
+#include "ompi/memchecker.h"
+
+
+OBJ_CLASS_INSTANCE( mca_pml_bfo_buffer_t,
+                    ompi_free_list_item_t,
+                    NULL,
+                    NULL );
+
+OBJ_CLASS_INSTANCE( mca_pml_bfo_recv_frag_t,
+                    opal_list_item_t,
+                    NULL,
+                    NULL );
+
+/**
+ * Static functions.
+ */
+
+/**
+ * Append an unexpected descriptor to a queue.  This function will
+ * allocate and initialize the fragment (if necessary) and then add it
+ * to the specified queue.  The allocated fragment is not returned to
+ * the caller.
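+ * A NULL frag means the descriptor was just received and a fragment
+ * still needs to be allocated for it; a non-NULL frag is simply
+ * re-queued as is.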
+ */
+static void
+append_frag_to_list(opal_list_t *queue, mca_btl_base_module_t *btl,
+                    mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
+                    size_t num_segments, mca_pml_bfo_recv_frag_t* frag)
+{
+    int rc;
+
+    if(NULL == frag) {
+        MCA_PML_BFO_RECV_FRAG_ALLOC(frag, rc);
+        MCA_PML_BFO_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
+    }
+    opal_list_append(queue, (opal_list_item_t*)frag);
+}
+
+/**
+ * Match incoming recv_frags against posted receives.
+ * Supports out-of-order delivery.
+ *
+ * @param frag_header (IN)  Header of received recv_frag.
+ * @param frag_desc (IN)    Received recv_frag descriptor.
+ * @param match_made (OUT)  Flag indicating whether a match was made.
+ * @param additional_matches (OUT) List of additional matches.
+ * @return OMPI_SUCCESS or error status on failure.
+ */
+static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
+                                        mca_pml_bfo_match_hdr_t *hdr,
+                                        mca_btl_base_segment_t* segments,
+                                        size_t num_segments,
+                                        int type);
+
+static mca_pml_bfo_recv_request_t*
+match_one(mca_btl_base_module_t *btl,
+          mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
+          size_t num_segments, ompi_communicator_t *comm_ptr,
+          mca_pml_bfo_comm_proc_t *proc,
+          mca_pml_bfo_recv_frag_t* frag);
+
+void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
+                                          mca_btl_base_tag_t tag,
+                                          mca_btl_base_descriptor_t* des,
+                                          void* cbdata )
+{
+    mca_btl_base_segment_t* segments = des->des_dst;
+    mca_pml_bfo_match_hdr_t* hdr = (mca_pml_bfo_match_hdr_t*)segments->seg_addr.pval;
+    ompi_communicator_t *comm_ptr;
+    mca_pml_bfo_recv_request_t *match = NULL;
+    mca_pml_bfo_comm_t *comm;
+    mca_pml_bfo_comm_proc_t *proc;
+    size_t num_segments = des->des_dst_cnt;
+    size_t bytes_received = 0;
+
+    if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_BFO_MATCH_HDR_LEN) ) {
+        return;
+    }
+    bfo_hdr_ntoh(((mca_pml_bfo_hdr_t*) hdr), MCA_PML_BFO_HDR_TYPE_MATCH);
+
+    /* communicator pointer */
+    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
+    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
+        /* This is a special case. A message for a communicator that does
+         * not yet exist can happen. Instead of doing the matching, we
+         * temporarily add it to a pending queue in the PML. Later on,
+         * when the communicator is completely instantiated, this pending
+         * queue will be searched and all matching fragments moved to the
+         * right communicator.
+         */
+        append_frag_to_list( &mca_pml_bfo.non_existing_communicator_pending,
+                             btl, hdr, segments, num_segments, NULL );
+        return;
+    }
+    comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
+
+    /* source sequence number */
+    proc = &comm->procs[hdr->hdr_src];
+
+    /* We generate the MSG_ARRIVED event as soon as the PML is aware
+     * of a matching fragment arrival, independent of whether it is
+     * received in the correct order or not. This allows the tools to
+     * figure out if messages are not received in the correct order
+     * (e.g. with multiple network interfaces).
+     */
+    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
+                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+    /* Get the next expected message sequence number. In a threaded
+     * run, lock to make sure that if another thread is processing a
+     * frag from the same message, a match is made only once. This
+     * also prevents other posted receives (for a pair of endpoints)
+     * from being processed, and potentially "losing" the fragment.
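+     *
+     * A condensed sketch of the fast path that this lock protects
+     * (field names as used in the code below; illustration only):
+     *
+     *   if ((uint16_t)hdr->hdr_seq == (uint16_t)proc->expected_sequence &&
+     *       0 == opal_list_get_size(&proc->frags_cant_match)) {
+     *       proc->expected_sequence++;   -- in order: match immediately
+     *   } else {
+     *       goto slow_path;              -- out of order, or a backlog exists
+     *   }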
+     */
+    OPAL_THREAD_LOCK(&comm->matching_lock);
+
+    /* get sequence number of next message that can be processed */
+    if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) ||
+                     (opal_list_get_size(&proc->frags_cant_match) > 0 ))) {
+        goto slow_path;
+    }
+
+    /* This is the sequence number we were expecting, so we can try
+     * matching it to already posted receives.
+     */
+
+    /* We're now expecting the next sequence number. */
+    proc->expected_sequence++;
+
+    /* We generate the SEARCH_POSTED_QUEUE only when the message is
+     * received in the correct sequence. Otherwise, we delay the event
+     * generation until we reach the correct sequence number.
+     */
+    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
+                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+    match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, NULL);
+
+    /* The match is over. We generate the SEARCH_POSTED_Q_END here,
+     * before going into mca_pml_bfo_check_cantmatch_for_match, so that
+     * the posted-queue search time can be measured separately for all
+     * messages.
+     */
+    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
+                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+    /* release matching lock before processing fragment */
+    OPAL_THREAD_UNLOCK(&comm->matching_lock);
+
+    if(OPAL_LIKELY(match)) {
+        bytes_received = segments->seg_len - OMPI_PML_BFO_MATCH_HDR_LEN;
+        match->req_recv.req_bytes_packed = bytes_received;
+
+        MCA_PML_BFO_RECV_REQUEST_MATCHED(match, hdr);
+        if(match->req_bytes_expected > 0) {
+            struct iovec iov[2];
+            uint32_t iov_count = 1;
+
+            /*
+             * Make the user buffer accessible (defined) before unpacking.
+             */
+            MEMCHECKER(
+                memchecker_call(&opal_memchecker_base_mem_defined,
+                                match->req_recv.req_base.req_addr,
+                                match->req_recv.req_base.req_count,
+                                match->req_recv.req_base.req_datatype);
+            );
+
+            iov[0].iov_len = bytes_received;
+            iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
+                                              OMPI_PML_BFO_MATCH_HDR_LEN);
+            while (iov_count < num_segments) {
+                bytes_received += segments[iov_count].seg_len;
+                iov[iov_count].iov_len = segments[iov_count].seg_len;
+                iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
+                iov_count++;
+            }
+            opal_convertor_unpack( &match->req_recv.req_base.req_convertor,
+                                   iov,
+                                   &iov_count,
+                                   &bytes_received );
+            match->req_bytes_received = bytes_received;
+            /*
+             * Unpacking finished; make the user buffer inaccessible again.
+             */
+            MEMCHECKER(
+                memchecker_call(&opal_memchecker_base_mem_noaccess,
+                                match->req_recv.req_base.req_addr,
+                                match->req_recv.req_base.req_count,
+                                match->req_recv.req_base.req_datatype);
+            );
+        }
+
+        /* No need to check whether we are complete; we know we are. */
+        /* No rmb is needed either, as that is only required for checking. */
+        recv_request_pml_complete(match);
+    }
+    return;
+
+ slow_path:
+    OPAL_THREAD_UNLOCK(&comm->matching_lock);
+/* BFO FAILOVER CODE - begin */
+    /* Check for duplicate messages. If the message is a duplicate, just
+     * return; that essentially drops the message.
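+     * (A duplicate can occur when, after a failover, the sender
+     * retransmits a message that was already delivered; its sequence
+     * number is behind proc->expected_sequence, so it must not be
+     * matched a second time.)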
*/ + if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) { + return; + } +/* BFO FAILOVER CODE - end */ + mca_pml_bfo_recv_frag_match(btl, hdr, segments, + num_segments, MCA_PML_BFO_HDR_TYPE_MATCH); +} + + +void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDV); + mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments, + des->des_dst_cnt, MCA_PML_BFO_HDR_TYPE_RNDV); + return; +} + +void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RGET); + mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments, + des->des_dst_cnt, MCA_PML_BFO_HDR_TYPE_RGET); + return; +} + + + +void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_send_request_t* sendreq; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_ACK); + sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval; + sendreq->req_recv = hdr->hdr_ack.hdr_dst_req; +/* BFO FAILOVER CODE - begin */ + /* Drop any fragments if request is in error state. Do not want + * to initiate any more activity. */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(20, mca_pml_bfo_output, + "ACK: received: dropping because request in error, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + return; + } +/* BFO FAILOVER CODE - end */ + + /* if the request should be delivered entirely by copy in/out + * then throttle sends */ + if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA) + sendreq->req_throttle_sends = true; + + mca_pml_bfo_send_request_copy_in_out(sendreq, + hdr->hdr_ack.hdr_send_offset, + sendreq->req_send.req_bytes_packed - + hdr->hdr_ack.hdr_send_offset); + + if (sendreq->req_state != 0) { + /* Typical receipt of an ACK message causes req_state to be + * decremented. However, a send request that started as an + * RGET request can become a RNDV. For example, when the + * receiver determines that its receive buffer is not + * contiguous and therefore cannot support the RGET + * protocol. A send request that started with the RGET + * protocol has req_state == 0 and as such should not be + * decremented. 
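+ *
+ * In short: req_state == 0 at this point identifies a request that began
+ * life as an RGET, so the ACK that merely signals the fallback to the
+ * RNDV protocol must not decrement it.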
+ */ + OPAL_THREAD_ADD32(&sendreq->req_state, -1); + } +/* BFO FAILOVER CODE - begin */ + sendreq->req_acked = true; +/* BFO FAILOVER CODE - end */ + + if(send_request_pml_complete_check(sendreq) == false) + mca_pml_bfo_send_request_schedule(sendreq); + + return; +} + +void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_recv_request_t* recvreq; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG); + recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; +/* BFO FAILOVER CODE - begin */ + /* Drop any fragments if request is in error state. Do not want + * to initiate any more activity. */ + if( OPAL_UNLIKELY(recvreq->req_errstate)) { + opal_output_verbose(20, mca_pml_bfo_output, + "FRAG: received: dropping because request in error, " + "PML=%d, src_req=%p, dst_req=%p, peer=%d, offset=%d", + (uint16_t)recvreq->req_msgseq, + recvreq->remote_req_send.pval, + (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + (int)hdr->hdr_frag.hdr_frag_offset); + return; + } +/* BFO FAILOVER CODE - end */ + mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt); + + return; +} + + +void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_send_request_t* sendreq; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT); + sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval; +/* BFO FAILOVER CODE - begin */ + /* Drop any fragments if request is in error state. Do not want + * to initiate any more activity. */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(20, mca_pml_bfo_output, + "PUT: received: dropping because request in error, " + "PML=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + return; + } +/* BFO FAILOVER CODE - end */ + mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma); + + return; +} + + +void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_btl_base_descriptor_t* rdma; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FIN); + rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval; +/* BFO FAILOVER CODE - begin */ + if (true == mca_pml_bfo_is_duplicate_fin(hdr, rdma, btl)) { + return; + } +/* BFO FAILOVER CODE - end */ + rdma->des_cbfunc(btl, NULL, rdma, + hdr->hdr_fin.hdr_fail ? 
OMPI_ERROR : OMPI_SUCCESS); + + return; +} + + + +#define PML_MAX_SEQ ~((mca_pml_sequence_t)0); + +static inline mca_pml_bfo_recv_request_t* get_posted_recv(opal_list_t *queue) +{ + if(opal_list_get_size(queue) == 0) + return NULL; + + return (mca_pml_bfo_recv_request_t*)opal_list_get_first(queue); +} + +static inline mca_pml_bfo_recv_request_t* get_next_posted_recv( + opal_list_t *queue, + mca_pml_bfo_recv_request_t* req) +{ + opal_list_item_t *i = opal_list_get_next((opal_list_item_t*)req); + + if(opal_list_get_end(queue) == i) + return NULL; + + return (mca_pml_bfo_recv_request_t*)i; +} + +static mca_pml_bfo_recv_request_t *match_incomming( + mca_pml_bfo_match_hdr_t *hdr, mca_pml_bfo_comm_t *comm, + mca_pml_bfo_comm_proc_t *proc) +{ + mca_pml_bfo_recv_request_t *specific_recv, *wild_recv; + mca_pml_sequence_t wild_recv_seq, specific_recv_seq; + int tag = hdr->hdr_tag; + + specific_recv = get_posted_recv(&proc->specific_receives); + wild_recv = get_posted_recv(&comm->wild_receives); + + wild_recv_seq = wild_recv ? + wild_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ; + specific_recv_seq = specific_recv ? + specific_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ; + + /* they are equal only if both are PML_MAX_SEQ */ + while(wild_recv_seq != specific_recv_seq) { + mca_pml_bfo_recv_request_t **match; + opal_list_t *queue; + int req_tag; + mca_pml_sequence_t *seq; + + if (OPAL_UNLIKELY(wild_recv_seq < specific_recv_seq)) { + match = &wild_recv; + queue = &comm->wild_receives; + seq = &wild_recv_seq; + } else { + match = &specific_recv; + queue = &proc->specific_receives; + seq = &specific_recv_seq; + } + + req_tag = (*match)->req_recv.req_base.req_tag; + if(req_tag == tag || (req_tag == OMPI_ANY_TAG && tag >= 0)) { + opal_list_remove_item(queue, (opal_list_item_t*)(*match)); + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q, + &((*match)->req_recv.req_base), PERUSE_RECV); + return *match; + } + + *match = get_next_posted_recv(queue, *match); + *seq = (*match) ? 
            (*match)->req_recv.req_base.req_sequence : PML_MAX_SEQ;
+    }
+
+    return NULL;
+}
+
+static mca_pml_bfo_recv_request_t*
+match_one(mca_btl_base_module_t *btl,
+          mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
+          size_t num_segments, ompi_communicator_t *comm_ptr,
+          mca_pml_bfo_comm_proc_t *proc,
+          mca_pml_bfo_recv_frag_t* frag)
+{
+    mca_pml_bfo_recv_request_t *match;
+    mca_pml_bfo_comm_t *comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
+
+    do {
+        match = match_incomming(hdr, comm, proc);
+
+        /* if match found, process data */
+        if(OPAL_LIKELY(NULL != match)) {
+            match->req_recv.req_base.req_proc = proc->ompi_proc;
+
+            if(OPAL_UNLIKELY(MCA_PML_REQUEST_PROBE == match->req_recv.req_base.req_type)) {
+                /* complete the probe */
+                mca_pml_bfo_recv_request_matched_probe(match, btl, segments,
+                                                       num_segments);
+                /* attempt to match actual request */
+                continue;
+            }
+
+            PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_MSG_MATCH_POSTED_REQ,
+                                    &(match->req_recv.req_base), PERUSE_RECV);
+            return match;
+        }
+
+        /* if no match found, place on unexpected queue */
+        append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments,
+                            num_segments, frag);
+        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr,
+                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+        return NULL;
+    } while(true);
+}
+
+static mca_pml_bfo_recv_frag_t* check_cantmatch_for_match(mca_pml_bfo_comm_proc_t *proc)
+{
+    mca_pml_bfo_recv_frag_t *frag;
+
+    /* search the list for a fragment from the send with sequence
+     * number next_msg_seq_expected
+     */
+    for(frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_first(&proc->frags_cant_match);
+        frag != (mca_pml_bfo_recv_frag_t*)opal_list_get_end(&proc->frags_cant_match);
+        frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_next(frag))
+    {
+        mca_pml_bfo_match_hdr_t* hdr = &frag->hdr.hdr_match;
+        /*
+         * If the message has the next expected seq from that proc...
+         */
+        if(hdr->hdr_seq != proc->expected_sequence)
+            continue;
+
+        opal_list_remove_item(&proc->frags_cant_match, (opal_list_item_t*)frag);
+        return frag;
+    }
+
+    return NULL;
+}
+
+/**
+ * RTS/CTS receive-side matching
+ *
+ * @param hdr list of parameters needed for matching
+ *            This list is also embedded in frag,
+ *            but this allows us to save a memory copy when
+ *            a match is made in this routine. (IN)
+ * @param frag pointer to receive fragment which we want
+ *             to match (IN/OUT). If a match is not made,
+ *             hdr is copied to frag.
+ * @param match_made parameter indicating if we matched frag/
+ *                   hdr (OUT)
+ * @param additional_matches if a match is made with frag, we
+ *                           may be able to match fragments that previously
+ *                           arrived out of order. If this is the
+ *                           case, the associated fragment descriptors are
+ *                           put on this list for further processing. (OUT)
+ *
+ * @return OMPI error code
+ *
+ * This routine is used to try to match a newly arrived message fragment
+ * to pre-posted receives. The following assumptions are made:
+ * - fragments may be received out of order
+ * - for long messages, i.e. more than one fragment, an RTS/CTS algorithm
+ *   is used.
+ * - 2nd and greater fragments include a receive descriptor pointer
+ * - fragments may be dropped
+ * - fragments may be corrupt
+ * - this routine may be called simultaneously by more than one thread
+ */
+static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
+                                        mca_pml_bfo_match_hdr_t *hdr,
+                                        mca_btl_base_segment_t* segments,
+                                        size_t num_segments,
+                                        int type)
+{
+    /* local variables */
+    uint16_t next_msg_seq_expected, frag_msg_seq;
+    ompi_communicator_t *comm_ptr;
+    mca_pml_bfo_recv_request_t *match = NULL;
+    mca_pml_bfo_comm_t *comm;
+    mca_pml_bfo_comm_proc_t *proc;
+    mca_pml_bfo_recv_frag_t* frag = NULL;
+
+    /* communicator pointer */
+    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
+    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
+        /* This is a special case. A message for a communicator that does
+         * not yet exist can happen. Instead of doing the matching, we
+         * temporarily add it to a pending queue in the PML. Later on,
+         * when the communicator is completely instantiated, this pending
+         * queue will be searched and all matching fragments moved to the
+         * right communicator.
+         */
+        append_frag_to_list( &mca_pml_bfo.non_existing_communicator_pending,
+                             btl, hdr, segments, num_segments, NULL );
+        return OMPI_SUCCESS;
+    }
+    comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
+
+    /* source sequence number */
+    frag_msg_seq = hdr->hdr_seq;
+    proc = &comm->procs[hdr->hdr_src];
+
+    /**
+     * We generate the MSG_ARRIVED event as soon as the PML is aware of a
+     * matching fragment arrival, independent of whether it is received in
+     * the correct order or not. This allows the tools to figure out if
+     * messages are not received in the correct order (e.g. with multiple
+     * network interfaces).
+     */
+    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
+                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+    /* Get the next expected message sequence number. In a threaded run,
+     * lock to make sure that if another thread is processing a frag from
+     * the same message, a match is made only once. This also prevents
+     * other posted receives (for a pair of endpoints) from being
+     * processed, and potentially "losing" the fragment.
+     */
+    OPAL_THREAD_LOCK(&comm->matching_lock);
+/* BFO FAILOVER CODE - begin */
+    /* In case of network failover, we may get a message telling us to
+     * restart. In that case, we already have a pointer to the receive
+     * request in the header itself. */
+    if(OPAL_UNLIKELY(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_RESTART)) {
+        match = mca_pml_bfo_get_request(hdr);
+        if (NULL == match) {
+            return OMPI_SUCCESS;
+        }
+/* BFO FAILOVER CODE - end */
+    } else {
+
+        /* get sequence number of next message that can be processed */
+        next_msg_seq_expected = (uint16_t)proc->expected_sequence;
+        if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected))
+            goto wrong_seq;
+
+        /*
+         * This is the sequence number we were expecting,
+         * so we can try matching it to already posted
+         * receives.
+         */
+
+out_of_order_match:
+        /* We're now expecting the next sequence number. */
+        proc->expected_sequence++;
+
+        /**
+         * We generate the SEARCH_POSTED_QUEUE only when the message is received
+         * in the correct sequence. Otherwise, we delay the event generation until
+         * we reach the correct sequence number.
+         */
+        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
+                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+        match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);
+
+        /**
+         * The match is over.
We generate the SEARCH_POSTED_Q_END here, before going + * into the mca_pml_bfo_check_cantmatch_for_match so we can make a difference + * for the searching time for all messages. + */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + /* release matching lock before processing fragment */ + OPAL_THREAD_UNLOCK(&comm->matching_lock); + } + + if(OPAL_LIKELY(match)) { + switch(type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + mca_pml_bfo_recv_request_progress_match(match, btl, segments, num_segments); + break; + case MCA_PML_BFO_HDR_TYPE_RNDV: + mca_pml_bfo_recv_request_progress_rndv(match, btl, segments, num_segments); + break; + case MCA_PML_BFO_HDR_TYPE_RGET: + mca_pml_bfo_recv_request_progress_rget(match, btl, segments, num_segments); + break; + } + + if(OPAL_UNLIKELY(frag)) + MCA_PML_BFO_RECV_FRAG_RETURN(frag); + } + + /* + * Now that new message has arrived, check to see if + * any fragments on the c_c_frags_cant_match list + * may now be used to form new matchs + */ + if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) { + OPAL_THREAD_LOCK(&comm->matching_lock); + if((frag = check_cantmatch_for_match(proc))) { + hdr = &frag->hdr.hdr_match; + segments = frag->segments; + num_segments = frag->num_segments; + btl = frag->btl; + type = hdr->hdr_common.hdr_type; + goto out_of_order_match; + } + OPAL_THREAD_UNLOCK(&comm->matching_lock); + } + + return OMPI_SUCCESS; +wrong_seq: + /* + * This message comes after the next expected, so it + * is ahead of sequence. Save it for later. + */ +/* BFO FAILOVER CODE - begin */ + /* Check for duplicate messages. If message is duplicate, then just + * return as that essentially drops the message. */ + if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) { + return OMPI_SUCCESS; + } +/* BFO FAILOVER CODE - end */ + + append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments, + num_segments, NULL); + OPAL_THREAD_UNLOCK(&comm->matching_lock); + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/pml/bfo/pml_bfo_recvfrag.h b/ompi/mca/pml/bfo/pml_bfo_recvfrag.h new file mode 100644 index 0000000000..fc94975d7b --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_recvfrag.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_BFO_RECVFRAG_H +#define MCA_PML_BFO_RECVFRAG_H + +#include "ompi/mca/btl/btl.h" +#include "pml_bfo_hdr.h" + +BEGIN_C_DECLS + +struct mca_pml_bfo_buffer_t { + size_t len; + void * addr; +}; +typedef struct mca_pml_bfo_buffer_t mca_pml_bfo_buffer_t; + + +struct mca_pml_bfo_recv_frag_t { + ompi_free_list_item_t super; + mca_pml_bfo_hdr_t hdr; + size_t num_segments; + mca_btl_base_module_t* btl; + mca_btl_base_segment_t segments[MCA_BTL_DES_MAX_SEGMENTS]; + mca_pml_bfo_buffer_t buffers[MCA_BTL_DES_MAX_SEGMENTS]; + unsigned char addr[1]; +}; +typedef struct mca_pml_bfo_recv_frag_t mca_pml_bfo_recv_frag_t; + +OBJ_CLASS_DECLARATION(mca_pml_bfo_recv_frag_t); + + +#define MCA_PML_BFO_RECV_FRAG_ALLOC(frag,rc) \ +do { \ + ompi_free_list_item_t* item; \ + OMPI_FREE_LIST_WAIT(&mca_pml_bfo.recv_frags, item, rc); \ + frag = (mca_pml_bfo_recv_frag_t*)item; \ +} while(0) + + +#define MCA_PML_BFO_RECV_FRAG_INIT(frag, hdr, segs, cnt, btl ) \ +do { \ + size_t i, _size; \ + mca_btl_base_segment_t* macro_segments = frag->segments; \ + mca_pml_bfo_buffer_t* buffers = frag->buffers; \ + unsigned char* _ptr = (unsigned char*)frag->addr; \ + /* init recv_frag */ \ + frag->btl = btl; \ + frag->hdr = *(mca_pml_bfo_hdr_t*)hdr; \ + frag->num_segments = 1; \ + _size = segs[0].seg_len; \ + for( i = 1; i < cnt; i++ ) { \ + _size += segs[i].seg_len; \ + } \ + /* copy over data */ \ + if(_size <= mca_pml_bfo.unexpected_limit ) { \ + macro_segments[0].seg_addr.pval = frag->addr; \ + } else { \ + buffers[0].len = _size; \ + buffers[0].addr = (char*) \ + mca_pml_bfo.allocator->alc_alloc( mca_pml_bfo.allocator, \ + buffers[0].len, \ + 0, NULL); \ + _ptr = (unsigned char*)(buffers[0].addr); \ + macro_segments[0].seg_addr.pval = buffers[0].addr; \ + } \ + macro_segments[0].seg_len = _size; \ + for( i = 0; i < cnt; i++ ) { \ + memcpy( _ptr, segs[i].seg_addr.pval, segs[i].seg_len); \ + _ptr += segs[i].seg_len; \ + } \ + } while(0) + + +#define MCA_PML_BFO_RECV_FRAG_RETURN(frag) \ +do { \ + if( frag->segments[0].seg_len > mca_pml_bfo.unexpected_limit ) { \ + /* return buffers */ \ + mca_pml_bfo.allocator->alc_free( mca_pml_bfo.allocator, \ + frag->buffers[0].addr ); \ + } \ + frag->num_segments = 0; \ + \ + /* return recv_frag */ \ + OMPI_FREE_LIST_RETURN(&mca_pml_bfo.recv_frags, \ + (ompi_free_list_item_t*)frag); \ + } while(0) + + +/** + * Callback from BTL on receipt of a recv_frag (match). + */ + +extern void mca_pml_bfo_recv_frag_callback_match( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + +/** + * Callback from BTL on receipt of a recv_frag (rndv). + */ + +extern void mca_pml_bfo_recv_frag_callback_rndv( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (rget). + */ + +extern void mca_pml_bfo_recv_frag_callback_rget( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + +/** + * Callback from BTL on receipt of a recv_frag (ack). + */ + +extern void mca_pml_bfo_recv_frag_callback_ack( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (frag). 
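+ *
+ * All of these callbacks are presumably attached to their header types
+ * when the PML is enabled, along the lines of (sketch only, assuming the
+ * usual bml_register() interface):
+ *
+ *   mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FRAG,
+ *                         mca_pml_bfo_recv_frag_callback_frag, NULL );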
+ */ + +extern void mca_pml_bfo_recv_frag_callback_frag( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (put). + */ + +extern void mca_pml_bfo_recv_frag_callback_put( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (fin). + */ + +extern void mca_pml_bfo_recv_frag_callback_fin( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + + +END_C_DECLS +#endif + diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.c b/ompi/mca/pml/bfo/pml_bfo_recvreq.c new file mode 100644 index 0000000000..bdc247f495 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.c @@ -0,0 +1,1163 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/bml/bml.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/mpool/mpool.h" +#include "pml_bfo_comm.h" +#include "pml_bfo_recvreq.h" +#include "pml_bfo_recvfrag.h" +#include "pml_bfo_sendreq.h" +#include "pml_bfo_rdmafrag.h" +#include "ompi/mca/bml/base/base.h" +#include "orte/mca/errmgr/errmgr.h" +#include "opal/util/arch.h" +#include "ompi/memchecker.h" +/* BFO FAILOVER CODE - begin */ +#include "pml_bfo_failover.h" +/* BFO FAILOVER CODE - end */ + +void mca_pml_bfo_recv_request_process_pending(void) +{ + mca_pml_bfo_recv_request_t* recvreq; + int i, s = (int)opal_list_get_size(&mca_pml_bfo.recv_pending); + + for(i = 0; i < s; i++) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + recvreq = (mca_pml_bfo_recv_request_t*) + opal_list_remove_first(&mca_pml_bfo.recv_pending); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + if( OPAL_UNLIKELY(NULL == recvreq) ) + break; + recvreq->req_pending = false; + if(OPAL_SOS_GET_ERROR_CODE(mca_pml_bfo_recv_request_schedule_exclusive(recvreq, NULL)) == + OMPI_ERR_OUT_OF_RESOURCE) + break; + } +} + +static int mca_pml_bfo_recv_request_free(struct ompi_request_t** request) +{ + mca_pml_bfo_recv_request_t* recvreq = *(mca_pml_bfo_recv_request_t**)request; + + assert( false == recvreq->req_recv.req_base.req_free_called ); + + OPAL_THREAD_LOCK(&ompi_request_lock); + recvreq->req_recv.req_base.req_free_called = true; + + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_NOTIFY, + &(recvreq->req_recv.req_base), PERUSE_RECV ); + + if( true == recvreq->req_recv.req_base.req_pml_complete ) { + /* make buffer defined when the request is compeleted, + and before releasing the objects. 
+ */
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            recvreq->req_recv.req_base.req_addr,
+                            recvreq->req_recv.req_base.req_count,
+                            recvreq->req_recv.req_base.req_datatype);
+        );
+
+        MCA_PML_BFO_RECV_REQUEST_RETURN( recvreq );
+    }
+
+    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+    *request = MPI_REQUEST_NULL;
+    return OMPI_SUCCESS;
+}
+
+static int mca_pml_bfo_recv_request_cancel(struct ompi_request_t* ompi_request, int complete)
+{
+    mca_pml_bfo_recv_request_t* request = (mca_pml_bfo_recv_request_t*)ompi_request;
+    mca_pml_bfo_comm_t* comm = request->req_recv.req_base.req_comm->c_pml_comm;
+
+    if( true == ompi_request->req_complete ) { /* way too late to cancel this one */
+        /*
+         * Receive request completed; make the user buffer accessible.
+         */
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            request->req_recv.req_base.req_addr,
+                            request->req_recv.req_base.req_count,
+                            request->req_recv.req_base.req_datatype);
+        );
+        return OMPI_SUCCESS;
+    }
+
+    /* The rest should be protected behind the match logic lock */
+    OPAL_THREAD_LOCK(&comm->matching_lock);
+    if( OMPI_ANY_TAG == ompi_request->req_status.MPI_TAG ) { /* the match has not already been done */
+        if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) {
+            opal_list_remove_item( &comm->wild_receives, (opal_list_item_t*)request );
+        } else {
+            mca_pml_bfo_comm_proc_t* proc = comm->procs + request->req_recv.req_base.req_peer;
+            opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request);
+        }
+        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
+                                 &(request->req_recv.req_base), PERUSE_RECV );
+        /**
+         * Now that the PML is done with this request, we have to force
+         * pml_complete to true. Otherwise, the request will never be freed.
+         */
+        request->req_recv.req_base.req_pml_complete = true;
+    }
+    OPAL_THREAD_UNLOCK(&comm->matching_lock);
+
+    OPAL_THREAD_LOCK(&ompi_request_lock);
+    ompi_request->req_status._cancelled = true;
+    /* This macro will set req_complete to true so the MPI Test/Wait*
+     * functions on this request will be able to complete. As the status
+     * is marked as cancelled, the cancel state will be detected.
+     */
+    MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE(request);
+    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+    /*
+     * Receive request cancelled; make the user buffer accessible.
+     */
+    MEMCHECKER(
+        memchecker_call(&opal_memchecker_base_mem_defined,
+                        request->req_recv.req_base.req_addr,
+                        request->req_recv.req_base.req_count,
+                        request->req_recv.req_base.req_datatype);
+    );
+    return OMPI_SUCCESS;
+}
+
+static void mca_pml_bfo_recv_request_construct(mca_pml_bfo_recv_request_t* request)
+{
+    request->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
+    request->req_recv.req_base.req_ompi.req_free = mca_pml_bfo_recv_request_free;
+    request->req_recv.req_base.req_ompi.req_cancel = mca_pml_bfo_recv_request_cancel;
+    request->req_rdma_cnt = 0;
+    OBJ_CONSTRUCT(&request->lock, opal_mutex_t);
+}
+
+OBJ_CLASS_INSTANCE(
+    mca_pml_bfo_recv_request_t,
+    mca_pml_base_recv_request_t,
+    mca_pml_bfo_recv_request_construct,
+    NULL);
+
+
+/*
+ * Release resources.
+ */ + +static void mca_pml_bfo_recv_ctl_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ +/* BFO FAILOVER CODE - begin */ + if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { + mca_pml_bfo_check_recv_ctl_completion_status(btl, des, status); + } +/* BFO FAILOVER CODE - end */ + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +/* + * Put operation has completed remotely - update request status + */ + +static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; + mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)des->des_cbdata; + size_t bytes_received = 0; + + if( OPAL_LIKELY(status == OMPI_SUCCESS) ) { + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_dst, des->des_dst_cnt, + 0, bytes_received ); + } + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); + + btl->btl_free(btl, des); +/* BFO FAILOVER CODE - begin */ + /* This can happen if a FIN message arrives after the request was + * marked in error. So, just drop the message. Note that the + * status field is not being checked. That is because the status + * field is the value returned in the FIN hdr.hdr_fail field and + * may be used for other things. Note that we allow the various + * fields to be updated in case this actually completes the + * request and the sending side thinks it is done. */ + if( OPAL_UNLIKELY(recvreq->req_errstate)) { + opal_output_verbose(20, mca_pml_bfo_output, + "FIN: received on broken request, skipping, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + /* Even though in error, it still might complete. */ + recv_request_pml_complete_check(recvreq); + return; + } +/* BFO FAILOVER CODE - end */ + + /* check completion status */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_bfo_recv_request_schedule(recvreq, bml_btl); + } + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +/* + * + */ + +int mca_pml_bfo_recv_request_ack_send_btl( + ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, + uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, + bool nordma) +{ + mca_btl_base_descriptor_t* des; + mca_pml_bfo_ack_hdr_t* ack; + int rc; + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_ack_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* fill out header */ + ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval; + ack->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK; + ack->hdr_common.hdr_flags = nordma ? 
        MCA_PML_BFO_HDR_FLAGS_NORDMA : 0;
+    ack->hdr_src_req.lval = hdr_src_req;
+    ack->hdr_dst_req.pval = hdr_dst_req;
+    ack->hdr_send_offset = hdr_send_offset;
+
+    bfo_hdr_hton(ack, MCA_PML_BFO_HDR_TYPE_ACK, proc);
+
+    /* initialize descriptor */
+    des->des_cbfunc = mca_pml_bfo_recv_ctl_completion;
+/* BFO FAILOVER CODE - begin */
+    des->des_cbdata = (void *)proc;
+/* BFO FAILOVER CODE - end */
+
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_ACK);
+    if( OPAL_LIKELY( rc >= 0 ) ) {
+/* BFO FAILOVER CODE - begin */
+        if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) &&
+            (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
+            ((mca_pml_bfo_recv_request_t *)hdr_dst_req)->req_events++;
+        }
+/* BFO FAILOVER CODE - end */
+        return OMPI_SUCCESS;
+    }
+    mca_bml_base_free(bml_btl, des);
+    return OMPI_ERR_OUT_OF_RESOURCE;
+}
+
+static int mca_pml_bfo_recv_request_ack(
+    mca_pml_bfo_recv_request_t* recvreq,
+    mca_pml_bfo_rendezvous_hdr_t* hdr,
+    size_t bytes_received)
+{
+    ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
+    mca_bml_base_endpoint_t* bml_endpoint = NULL;
+
+    bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
+
+    /* by default copy everything */
+    recvreq->req_send_offset = bytes_received;
+    if(hdr->hdr_msg_length > bytes_received) {
+        size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+        /*
+         * lookup request buffer to determine if memory is already
+         * registered.
+         */
+
+        if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == 0 &&
+           hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_CONTIG &&
+           rdma_num != 0) {
+            unsigned char *base;
+            opal_convertor_get_current_pointer( &recvreq->req_recv.req_base.req_convertor, (void**)&(base) );
+
+            if(hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_PIN)
+                recvreq->req_rdma_cnt = mca_pml_bfo_rdma_btls(bml_endpoint,
+                        base, recvreq->req_recv.req_bytes_packed,
+                        recvreq->req_rdma );
+            else
+                recvreq->req_rdma_cnt = 0;
+
+            /* memory is already registered on both sides */
+            if (recvreq->req_rdma_cnt != 0) {
+                recvreq->req_send_offset = hdr->hdr_msg_length;
+                /* are rdma devices available for the long rdma protocol? */
+            } else if(bml_endpoint->btl_send_limit < hdr->hdr_msg_length) {
+                /* use the convertor to figure out the rdma offset for this request */
+                recvreq->req_send_offset = hdr->hdr_msg_length -
+                    bml_endpoint->btl_pipeline_send_length;
+
+                if(recvreq->req_send_offset < bytes_received)
+                    recvreq->req_send_offset = bytes_received;
+
+                /* use the convertor to figure out the rdma offset for this
+                 * request */
+                opal_convertor_set_position(&recvreq->req_recv.req_base.req_convertor,
+                                            &recvreq->req_send_offset);
+
+                recvreq->req_rdma_cnt =
+                    mca_pml_bfo_rdma_pipeline_btls(bml_endpoint,
+                                                   recvreq->req_send_offset - bytes_received,
+                                                   recvreq->req_rdma);
+            }
+        }
+        /* nothing to send by copy in/out - no need to ack */
+        if(recvreq->req_send_offset == hdr->hdr_msg_length)
+            return OMPI_SUCCESS;
+    }
+    /* let the schedule function know there is no need to set the ACK flag */
+    recvreq->req_ack_sent = true;
+    return mca_pml_bfo_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
+                                             recvreq, recvreq->req_send_offset,
+                                             recvreq->req_send_offset == bytes_received);
+}
+
+
+/**
+ * Return the resources used by the RDMA operation.
+ */
+
+static void mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
+                                         struct mca_btl_base_endpoint_t* ep,
+                                         struct mca_btl_base_descriptor_t* des,
+                                         int status )
+{
+    mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata;
+
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; + mca_bml_base_btl_t* bml_btl; + mca_bml_base_endpoint_t* bml_endpoint; + +/* BFO FAILOVER CODE - begin */ + if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { + recvreq->req_events--; + assert(recvreq->req_events >= 0); + } +/* BFO FAILOVER CODE - end */ + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + /* Record the error and send RECVERRNOTIFY if necessary. */ + if (recvreq->req_errstate) { + opal_output_verbose(30, mca_pml_bfo_output, + "RDMA read: completion failed, error already seen, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + return; + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "RDMA read: completion failed, sending RECVERRNOTIFY to sender, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, status); + } + } +/* BFO FAILOVER CODE - end */ +/* BFO FAILOVER CODE - begin */ + /* See if the request has received a RNDVRESTARTNOTIFY */ + if( OPAL_UNLIKELY(recvreq->req_errstate)) { + if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) { + opal_output_verbose(30, mca_pml_bfo_output, + "RDMA read: completion: recvreq has error, outstanding events=%d " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", + recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + if (0 == recvreq->req_events) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, + status, btl); + } + } + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + return; + } +/* BFO FAILOVER CODE - end */ +/* BFO FAILOVER CODE - begin */ + /* Find back the bml_btl that this btl belongs to. If we cannot + * find it, then it may have been removed from underneath us, so + * find the next available one to send the FIN message on. 
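+ *
+ * Sketch of the lookup-with-fallback performed below:
+ *
+ *   bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
+ *   if (NULL == bml_btl)
+ *       bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);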
*/ + bml_endpoint = recvreq->req_recv.req_base.req_proc->proc_bml; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); + if( OPAL_UNLIKELY(NULL == bml_btl) ) { + opal_output_verbose(20, mca_pml_bfo_output, + "RDMA write completion: BML was removed from underneath us, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma); + } +/* BFO FAILOVER CODE - end */ + + mca_pml_bfo_send_fin(recvreq->req_recv.req_base.req_proc, + bml_btl, + frag->rdma_hdr.hdr_rget.hdr_des, + des->order, 0, (uint16_t)recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->req_recv.req_base.req_comm->c_contextid, + recvreq->req_recv.req_base.req_comm->c_my_rank); + + /* is receive request complete */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); + recv_request_pml_complete_check(recvreq); + + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + + +/* + * + */ +int mca_pml_bfo_recv_request_get_frag( mca_pml_bfo_rdma_frag_t* frag ) +{ + mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; + mca_bml_base_btl_t* bml_btl = frag->rdma_bml; + mca_btl_base_descriptor_t* descriptor; + size_t save_size = frag->rdma_length; + int rc; + + /* prepare descriptor */ + mca_bml_base_prepare_dst( bml_btl, + NULL, + &recvreq->req_recv.req_base.req_convertor, + MCA_BTL_NO_ORDER, + 0, + &frag->rdma_length, + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, + &descriptor ); + if( OPAL_UNLIKELY(NULL == descriptor) ) { + frag->rdma_length = save_size; + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + descriptor->des_src = frag->rdma_segs; + descriptor->des_src_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; + descriptor->des_cbfunc = mca_pml_bfo_rget_completion; + descriptor->des_cbdata = frag; + + PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE, + &(recvreq->req_recv.req_base), + frag->rdma_length, PERUSE_RECV); + + /* queue up get request */ + rc = mca_bml_base_get(bml_btl,descriptor); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + mca_bml_base_free(bml_btl, descriptor); + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.rdma_pending, + (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } else { + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + } +/* BFO FAILOVER CODE - begin */ + if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && + (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { + recvreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + + return OMPI_SUCCESS; +} + + + + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. 
+ */ + +void mca_pml_bfo_recv_request_progress_frag( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0; + size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_BFO_RECV_REQUEST_UNPACK */ + size_t data_offset = 0; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + bytes_received -= sizeof(mca_pml_bfo_frag_hdr_t); + data_offset = hdr->hdr_frag.hdr_frag_offset; + /* + * Make user buffer accessable(defined) before unpacking. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_BFO_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + sizeof(mca_pml_bfo_frag_hdr_t), + data_offset, + bytes_received, + bytes_delivered ); + /* + * Unpacking finished, make the user buffer unaccessable again. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + /* check completion status */ + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_bfo_recv_request_schedule(recvreq, NULL); + } +} + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. + */ + +void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0; + mca_pml_bfo_rget_hdr_t* hdr = (mca_pml_bfo_rget_hdr_t*)segments->seg_addr.pval; + mca_bml_base_endpoint_t* bml_endpoint = NULL; + mca_pml_bfo_rdma_frag_t* frag; + size_t i, size = 0; + int rc; + + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; + +/* BFO FAILOVER CODE - begin */ + recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req; +/* BFO FAILOVER CODE - end */ + MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match); + + /* if receive buffer is not contiguous we can't just RDMA read into it, so + * fall back to copy in/out protocol. It is a pity because buffer on the + * sender side is already registered. 
 We need to be smarter here, perhaps
+     * do a couple of RDMA reads. */
+    if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
+        mca_pml_bfo_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
+        return;
+    }
+
+    MCA_PML_BFO_RDMA_FRAG_ALLOC(frag,rc);
+    if( OPAL_UNLIKELY(NULL == frag) ) {
+        /* GLB - FIX */
+        ORTE_ERROR_LOG(rc);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* lookup bml datastructures */
+    bml_endpoint = (mca_bml_base_endpoint_t*)recvreq->req_recv.req_base.req_proc->proc_bml;
+
+    /* allocate/initialize a fragment */
+    for(i = 0; i < hdr->hdr_seg_cnt; i++) {
+        frag->rdma_segs[i] = hdr->hdr_segs[i];
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+        if ((recvreq->req_recv.req_base.req_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
+            (ompi_proc_local()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
+            size += opal_swap_bytes4(hdr->hdr_segs[i].seg_len);
+        } else
+#endif
+        {
+            size += hdr->hdr_segs[i].seg_len;
+        }
+    }
+/* BFO FAILOVER CODE - begin */
+    frag->rdma_btl = btl;
+/* BFO FAILOVER CODE - end */
+    frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
+    if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
+        opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+    frag->rdma_hdr.hdr_rget = *hdr;
+    frag->rdma_req = recvreq;
+    frag->rdma_ep = bml_endpoint;
+    frag->rdma_length = size;
+    frag->rdma_state = MCA_PML_BFO_RDMA_GET;
+    frag->reg = NULL;
+
+    mca_pml_bfo_recv_request_get_frag(frag);
+    return;
+}
+
+/*
+ * Update the recv request status to reflect the number of bytes
+ * received and actually delivered to the application.
+ */
+
+void mca_pml_bfo_recv_request_progress_rndv( mca_pml_bfo_recv_request_t* recvreq,
+                                             mca_btl_base_module_t* btl,
+                                             mca_btl_base_segment_t* segments,
+                                             size_t num_segments )
+{
+    size_t bytes_received = 0;
+    size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_BFO_RECV_REQUEST_UNPACK */
+    size_t data_offset = 0;
+    mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
+
+    MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments,
+                                        0, bytes_received );
+
+    bytes_received -= sizeof(mca_pml_bfo_rendezvous_hdr_t);
+    recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
+    recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
+    recvreq->req_rdma_offset = bytes_received;
+    MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match);
+    mca_pml_bfo_recv_request_ack(recvreq, &hdr->hdr_rndv, bytes_received);
+    /**
+     * The PUT protocol does not attach any data to the original request.
+     * Therefore, we might want to avoid unpacking if there is nothing to
+     * unpack.
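+     * In other words, bytes_received may legitimately be 0 at this point,
+     * in which case the unpack below is skipped entirely.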
+ */ + if( 0 < bytes_received ) { + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_BFO_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + data_offset, + bytes_received, + bytes_delivered ); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + } + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + /* check completion status */ + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_bfo_recv_request_schedule(recvreq, NULL); + } +} + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. + */ +void mca_pml_bfo_recv_request_progress_match( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0, data_offset = 0; + size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_BFO_RECV_REQUEST_UNPACK */ + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + bytes_received -= OMPI_PML_BFO_MATCH_HDR_LEN; + recvreq->req_recv.req_bytes_packed = bytes_received; + + MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match); + /* + * Make user buffer accessable(defined) before unpacking. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_BFO_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + OMPI_PML_BFO_MATCH_HDR_LEN, + data_offset, + bytes_received, + bytes_delivered); + /* + * Unpacking finished, make the user buffer unaccessable again. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + + /* + * No need for atomic here, as we know there is only one fragment + * for this request. 
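+ *
+ * (Hence the plain "req_bytes_received += bytes_received" below, where the
+ * multi-fragment paths above had to use OPAL_THREAD_ADD_SIZE_T().)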
+ */ + recvreq->req_bytes_received += bytes_received; + recv_request_pml_complete(recvreq); +} + + +/** + * Handle completion of a probe request + */ + +void mca_pml_bfo_recv_request_matched_probe( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_packed = 0; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + switch(hdr->hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + OMPI_PML_BFO_MATCH_HDR_LEN, + bytes_packed ); + break; + + case MCA_PML_BFO_HDR_TYPE_RNDV: + case MCA_PML_BFO_HDR_TYPE_RGET: + + bytes_packed = hdr->hdr_rndv.hdr_msg_length; + break; + } + + /* set completion status */ + recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_match.hdr_tag; + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_match.hdr_src; + recvreq->req_bytes_received = bytes_packed; + recvreq->req_bytes_expected = bytes_packed; + recv_request_pml_complete(recvreq); +} + + +/* + * Schedule RDMA protocol. + * +*/ + +int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t *start_btl ) +{ + mca_bml_base_btl_t* bml_btl; + int num_tries = recvreq->req_rdma_cnt, num_fail = 0; + size_t i, prev_bytes_remaining = 0; + size_t bytes_remaining = recvreq->req_send_offset - + recvreq->req_rdma_offset; + + /* if starting bml_btl is provided schedule next fragment on it first */ + if(start_btl != NULL) { + for(i = 0; i < recvreq->req_rdma_cnt; i++) { + if(recvreq->req_rdma[i].bml_btl->btl != start_btl) + continue; + /* something left to be send? */ + if( OPAL_LIKELY(recvreq->req_rdma[i].length) ) + recvreq->req_rdma_idx = i; + break; + } + } + + while(bytes_remaining > 0 && + recvreq->req_pipeline_depth < mca_pml_bfo.recv_pipeline_depth) { + size_t hdr_size; + size_t size; + mca_pml_bfo_rdma_hdr_t* hdr; + mca_btl_base_descriptor_t* dst; + mca_btl_base_descriptor_t* ctl; + mca_mpool_base_registration_t * reg = NULL; + mca_btl_base_module_t* btl; + int rc, rdma_idx; + + if(prev_bytes_remaining == bytes_remaining) { + if(++num_fail == num_tries) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + if(false == recvreq->req_pending) { + opal_list_append(&mca_pml_bfo.recv_pending, + (opal_list_item_t*)recvreq); + recvreq->req_pending = true; + } + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + } else { + num_fail = 0; + prev_bytes_remaining = bytes_remaining; + } + + do { + rdma_idx = recvreq->req_rdma_idx; + bml_btl = recvreq->req_rdma[rdma_idx].bml_btl; + reg = recvreq->req_rdma[rdma_idx].btl_reg; + size = recvreq->req_rdma[rdma_idx].length; + if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt) + recvreq->req_rdma_idx = 0; + } while(!size); + btl = bml_btl->btl; + + /* makes sure that we don't exceed BTL max rdma size + * if memory is not pinned already */ + if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) && + (size > btl->btl_rdma_pipeline_frag_size)) { + size = btl->btl_rdma_pipeline_frag_size; + } + + /* take lock to protect converter against concurrent access + * from unpack */ + OPAL_THREAD_LOCK(&recvreq->lock); + opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, + &recvreq->req_rdma_offset ); + + /* prepare a descriptor for RDMA */ + mca_bml_base_prepare_dst(bml_btl, reg, + &recvreq->req_recv.req_base.req_convertor, + MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &dst); + 
OPAL_THREAD_UNLOCK(&recvreq->lock); + + if(OPAL_UNLIKELY(dst == NULL)) { + continue; + } + + dst->des_cbfunc = mca_pml_bfo_put_completion; + dst->des_cbdata = recvreq; + + /* prepare a descriptor for rdma control message */ + hdr_size = sizeof(mca_pml_bfo_rdma_hdr_t); + if(dst->des_dst_cnt > 1) { + hdr_size += (sizeof(mca_btl_base_segment_t) * + (dst->des_dst_cnt-1)); + } + + mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, hdr_size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + + if( OPAL_UNLIKELY(NULL == ctl) ) { + mca_bml_base_free(bml_btl,dst); + continue; + } + ctl->des_cbfunc = mca_pml_bfo_recv_ctl_completion; +/* BFO FAILOVER CODE - begin */ + ctl->des_cbdata = recvreq; +/* BFO FAILOVER CODE - end */ + + /* fill in rdma header */ + hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_src->seg_addr.pval; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_PUT; + hdr->hdr_common.hdr_flags = + (!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0; + hdr->hdr_req = recvreq->remote_req_send; +/* BFO FAILOVER CODE - begin */ + hdr->hdr_dst_req.pval = recvreq; /* only needed in the first put message */ +/* BFO FAILOVER CODE - end */ + hdr->hdr_des.pval = dst; + hdr->hdr_rdma_offset = recvreq->req_rdma_offset; + hdr->hdr_seg_cnt = dst->des_dst_cnt; + + for( i = 0; i < dst->des_dst_cnt; i++ ) { + hdr->hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(dst->des_dst[i].seg_addr.pval); + hdr->hdr_segs[i].seg_len = dst->des_dst[i].seg_len; + hdr->hdr_segs[i].seg_key.key64 = dst->des_dst[i].seg_key.key64; + } + + if(!recvreq->req_ack_sent) + recvreq->req_ack_sent = true; + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc); + + PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, + &(recvreq->req_recv.req_base), size, + PERUSE_RECV); + + /* send rdma request to peer */ + rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_BFO_HDR_TYPE_PUT); + if( OPAL_LIKELY( rc >= 0 ) ) { +/* BFO FAILOVER CODE - begin */ + if ((btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && + (ctl->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { + recvreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + /* update request state */ + recvreq->req_rdma_offset += size; + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1); + recvreq->req_rdma[rdma_idx].length -= size; + bytes_remaining -= size; + } else { + mca_bml_base_free(bml_btl,ctl); + mca_bml_base_free(bml_btl,dst); + } + } + + return OMPI_SUCCESS; +} + +#define IS_PROB_REQ(R) \ + ((MCA_PML_REQUEST_IPROBE == (R)->req_recv.req_base.req_type) || \ + (MCA_PML_REQUEST_PROBE == (R)->req_recv.req_base.req_type)) + +static inline void append_recv_req_to_queue(opal_list_t *queue, + mca_pml_bfo_recv_request_t *req) +{ + if(OPAL_UNLIKELY(req->req_recv.req_base.req_type == MCA_PML_REQUEST_IPROBE)) + return; + + opal_list_append(queue, (opal_list_item_t*)req); + + /** + * We don't want to generate this kind of event for MPI_Probe. Hopefully, + * the compiler will optimize out the empty if loop in the case where PERUSE + * support is not required by the user. + */ + if(req->req_recv.req_base.req_type != MCA_PML_REQUEST_PROBE) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_INSERT_IN_POSTED_Q, + &(req->req_recv.req_base), PERUSE_RECV); + } +} + +/* + * this routine tries to match a posted receive. If a match is found, + * it places the request in the appropriate matched receive list. This + * function has to be called with the communicator matching lock held. 
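+ * For example, the caller-side pattern (as used by + * mca_pml_bfo_recv_req_start() below) is: + * + *   OPAL_THREAD_LOCK(&comm->matching_lock); + *   frag = recv_req_match_specific_proc(req, proc); + *   ... queue the request on proc->specific_receives if frag is NULL ... + *   OPAL_THREAD_UNLOCK(&comm->matching_lock);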
+*/ +static mca_pml_bfo_recv_frag_t* +recv_req_match_specific_proc( const mca_pml_bfo_recv_request_t *req, + mca_pml_bfo_comm_proc_t *proc ) +{ + opal_list_t* unexpected_frags = &proc->unexpected_frags; + opal_list_item_t *i; + mca_pml_bfo_recv_frag_t* frag; + int tag = req->req_recv.req_base.req_tag; + + if(opal_list_get_size(unexpected_frags) == 0) + return NULL; + + if( OMPI_ANY_TAG == tag ) { + for (i = opal_list_get_first(unexpected_frags); + i != opal_list_get_end(unexpected_frags); + i = opal_list_get_next(i)) { + frag = (mca_pml_bfo_recv_frag_t*)i; + + if( frag->hdr.hdr_match.hdr_tag >= 0 ) + return frag; + } + } else { + for (i = opal_list_get_first(unexpected_frags); + i != opal_list_get_end(unexpected_frags); + i = opal_list_get_next(i)) { + frag = (mca_pml_bfo_recv_frag_t*)i; + + if( frag->hdr.hdr_match.hdr_tag == tag ) + return frag; + } + } + return NULL; +} + +/* + * this routine is used to try to match a wild posted receive - where + * wild is determined by the value assigned to the source process +*/ +static mca_pml_bfo_recv_frag_t* +recv_req_match_wild( mca_pml_bfo_recv_request_t* req, + mca_pml_bfo_comm_proc_t **p) +{ + mca_pml_bfo_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; + mca_pml_bfo_comm_proc_t* proc = comm->procs; + size_t proc_count = comm->num_procs, i; + + /* + * Loop over all the outstanding messages to find one that matches. + * There is an outer loop over lists of messages from each + * process, then an inner loop over the messages from the + * process. + */ + for (i = 0; i < proc_count; i++) { + mca_pml_bfo_recv_frag_t* frag; + + /* loop over messages from the current proc */ + if((frag = recv_req_match_specific_proc(req, &proc[i]))) { + *p = &proc[i]; + req->req_recv.req_base.req_proc = proc[i].ompi_proc; + prepare_recv_req_converter(req); + return frag; /* match found */ + } + } + + *p = NULL; + return NULL; +} + + +void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req) +{ + mca_pml_bfo_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; + mca_pml_bfo_comm_proc_t* proc; + mca_pml_bfo_recv_frag_t* frag; + opal_list_t *queue; + mca_pml_bfo_hdr_t* hdr; + + /* init/re-init the request */ + req->req_lock = 0; + req->req_pipeline_depth = 0; + req->req_bytes_received = 0; + req->req_bytes_expected = 0; + /* What about req_rdma_cnt ? */ +/* BFO FAILOVER CODE - begin */ + req->req_rdma_cnt = 0; + req->req_events = 0; + req->req_restartseq = 0; + req->req_errstate = 0; +/* BFO FAILOVER CODE - end */ + req->req_rdma_idx = 0; + req->req_pending = false; + req->req_ack_sent = false; + + MCA_PML_BASE_RECV_START(&req->req_recv.req_base); + + OPAL_THREAD_LOCK(&comm->matching_lock); + /** + * The lapse of time between the ACTIVATE event and the SEARCH_UNEX one + * includes the cost of the request lock. + */ + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_BEGIN, + &(req->req_recv.req_base), PERUSE_RECV); + + /* assign sequence number */ + req->req_recv.req_base.req_sequence = comm->recv_sequence++; + + /* attempt to match posted recv */ + if(req->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { + frag = recv_req_match_wild(req, &proc); + queue = &comm->wild_receives; +#if !OPAL_ENABLE_HETEROGENEOUS_SUPPORT + /* As we are in a homogeneous environment we know that all remote + * architectures are exactly the same as the local one. Therefore, + * we can safely construct the convertor based on the proc + * information of rank 0.
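+ * (In a heterogeneous build this shortcut is skipped and the convertor + * is instead prepared in recv_req_matched() once the matching fragment + * identifies the actual sender.)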
+ */ + if( NULL == frag ) { + req->req_recv.req_base.req_proc = ompi_proc_local_proc; + prepare_recv_req_converter(req); + } +#endif /* !OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + } else { + proc = &comm->procs[req->req_recv.req_base.req_peer]; + req->req_recv.req_base.req_proc = proc->ompi_proc; + frag = recv_req_match_specific_proc(req, proc); + queue = &proc->specific_receives; + /* wildcard recv will be prepared on match */ + prepare_recv_req_converter(req); + } + + if(OPAL_UNLIKELY(NULL == frag)) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_END, + &(req->req_recv.req_base), PERUSE_RECV); + /* We didn't find any matches. Record this irecv so we can match + it when the message comes in. */ + append_recv_req_to_queue(queue, req); + req->req_match_received = false; + OPAL_THREAD_UNLOCK(&comm->matching_lock); + } else { + if(OPAL_LIKELY(!IS_PROB_REQ(req))) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_MATCH_UNEX, + &(req->req_recv.req_base), PERUSE_RECV); + + hdr = (mca_pml_bfo_hdr_t*)frag->segments->seg_addr.pval; + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_REMOVE_FROM_UNEX_Q, + req->req_recv.req_base.req_comm, + hdr->hdr_match.hdr_src, + hdr->hdr_match.hdr_tag, + PERUSE_RECV); + + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_END, + &(req->req_recv.req_base), PERUSE_RECV); + + opal_list_remove_item(&proc->unexpected_frags, + (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&comm->matching_lock); + + switch(hdr->hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + mca_pml_bfo_recv_request_progress_match(req, frag->btl, frag->segments, + frag->num_segments); + break; + case MCA_PML_BFO_HDR_TYPE_RNDV: + mca_pml_bfo_recv_request_progress_rndv(req, frag->btl, frag->segments, + frag->num_segments); + break; + case MCA_PML_BFO_HDR_TYPE_RGET: + mca_pml_bfo_recv_request_progress_rget(req, frag->btl, frag->segments, + frag->num_segments); + break; + default: + assert(0); + } + + MCA_PML_BFO_RECV_FRAG_RETURN(frag); + + } else { + OPAL_THREAD_UNLOCK(&comm->matching_lock); + mca_pml_bfo_recv_request_matched_probe(req, frag->btl, + frag->segments, frag->num_segments); + } + } +} diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.h b/ompi/mca/pml/bfo/pml_bfo_recvreq.h new file mode 100644 index 0000000000..4e942f3a0c --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.h @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef OMPI_PML_BFO_RECV_REQUEST_H +#define OMPI_PML_BFO_RECV_REQUEST_H + +#include "pml_bfo.h" +#include "pml_bfo_rdma.h" +#include "pml_bfo_rdmafrag.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/pml/bfo/pml_bfo_comm.h" +#include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/pml/base/pml_base_recvreq.h" + +/* BFO FAILOVER CODE - begin */ +#define RECVREQ_RECVERRSENT 0x01 +#define RECVREQ_RNDVRESTART_RECVED 0x02 +#define RECVREQ_RNDVRESTART_ACKED 0x04 +/* BFO FAILOVER CODE - end */ + +BEGIN_C_DECLS + +struct mca_pml_bfo_recv_request_t { + mca_pml_base_recv_request_t req_recv; + ompi_ptr_t remote_req_send; +/* BFO FAILOVER CODE - begin */ + int32_t req_msgseq; /* PML sequence number */ + int32_t req_events; /* number of outstanding events on request */ + int32_t req_restartseq; /* sequence number of restarted request */ + int32_t req_errstate; /* state of request if in error */ +/* BFO FAILOVER CODE - end */ + int32_t req_lock; + size_t req_pipeline_depth; + size_t req_bytes_received; /**< amount of data transferred into the user buffer */ + size_t req_bytes_expected; /**< local size of the data as suggested by the user */ + size_t req_rdma_offset; + size_t req_send_offset; + uint32_t req_rdma_cnt; + uint32_t req_rdma_idx; + bool req_pending; + bool req_ack_sent; /**< whether ack was sent to the sender */ + bool req_match_received; /**< Prevent the request from being completed prematurely */ + opal_mutex_t lock; + mca_pml_bfo_com_btl_t req_rdma[1]; +}; +typedef struct mca_pml_bfo_recv_request_t mca_pml_bfo_recv_request_t; + +OBJ_CLASS_DECLARATION(mca_pml_bfo_recv_request_t); + +static inline bool lock_recv_request(mca_pml_bfo_recv_request_t *recvreq) +{ + return OPAL_THREAD_ADD32(&recvreq->req_lock, 1) == 1; +} + +static inline bool unlock_recv_request(mca_pml_bfo_recv_request_t *recvreq) +{ + return OPAL_THREAD_ADD32(&recvreq->req_lock, -1) == 0; +} + +/** + * Allocate a recv request from the module's free list. + * + * @param rc (OUT) OMPI_SUCCESS or error status on failure. + * @return Receive request. + */ +#define MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc) \ +do { \ + ompi_free_list_item_t* item; \ + rc = OMPI_SUCCESS; \ + OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, item, rc); \ + recvreq = (mca_pml_bfo_recv_request_t*)item; \ +} while(0) + + +/** + * Initialize a receive request with call parameters. + * + * @param request (IN) Receive request. + * @param addr (IN) User buffer. + * @param count (IN) Number of elements of indicated datatype. + * @param datatype (IN) User defined datatype. + * @param src (IN) Source rank w/in the communicator. + * @param tag (IN) User defined tag. + * @param comm (IN) Communicator. + * @param persistent (IN) Is this a persistent request. + */ +#define MCA_PML_BFO_RECV_REQUEST_INIT( request, \ + addr, \ + count, \ + datatype, \ + src, \ + tag, \ + comm, \ + persistent) \ +do { \ + MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv, \ + addr, \ + count, \ + datatype, \ + src, \ + tag, \ + comm, \ + persistent); \ +} while(0) + +/** + * Mark the request as completed at MPI level for internal purposes. + * + * @param recvreq (IN) Receive request.
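+ * + * For orientation, a minimal sketch of how the macros in this header + * combine in a caller such as the irecv path (hypothetical snippet; + * error handling elided; MCA_PML_BFO_RECV_REQUEST_START is defined + * further below): + * + *   mca_pml_bfo_recv_request_t *recvreq; + *   int rc; + *   MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc); + *   if (OPAL_UNLIKELY(NULL == recvreq)) return rc; + *   MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, buf, count, datatype, + *                                 src, tag, comm, false); + *   MCA_PML_BFO_RECV_REQUEST_START(recvreq);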
+ */ +#define MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE( recvreq ) \ + do { \ + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \ + &(recvreq->req_recv.req_base), PERUSE_RECV ); \ + ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \ + } while (0) + +/* + * Free the PML receive request + */ +#define MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq) \ + { \ + MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \ + OMPI_FREE_LIST_RETURN( &mca_pml_base_recv_requests, \ + (ompi_free_list_item_t*)(recvreq)); \ + } + +/** + * Complete a receive request. The request structure cannot be accessed + * any more after calling this function. + * + * @param recvreq (IN) Receive request. + */ +static inline void +recv_request_pml_complete(mca_pml_bfo_recv_request_t *recvreq) +{ + size_t i; + + assert(false == recvreq->req_recv.req_base.req_pml_complete); + + if(recvreq->req_recv.req_bytes_packed > 0) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END, + &recvreq->req_recv.req_base, PERUSE_RECV ); + } + + for(i = 0; i < recvreq->req_rdma_cnt; i++) { + mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg; + if( NULL != btl_reg && btl_reg->mpool != NULL) { + btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); + } + } + recvreq->req_rdma_cnt = 0; +/* BFO FAILOVER CODE - begin */ + /* Initialize to a value that indicates it is invalid */ + recvreq->req_msgseq = 42; +/* BFO FAILOVER CODE - end */ + + OPAL_THREAD_LOCK(&ompi_request_lock); + if(true == recvreq->req_recv.req_base.req_free_called) { + MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq); + } else { + /* initialize request status */ + recvreq->req_recv.req_base.req_pml_complete = true; + recvreq->req_recv.req_base.req_ompi.req_status._count = + (int)recvreq->req_bytes_received; + if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_expected) { + recvreq->req_recv.req_base.req_ompi.req_status._count = + (int)recvreq->req_recv.req_bytes_packed; + recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR = + MPI_ERR_TRUNCATE; + } + MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE(recvreq); + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); +} + +static inline bool +recv_request_pml_complete_check(mca_pml_bfo_recv_request_t *recvreq) +{ +#if OPAL_HAVE_THREAD_SUPPORT + opal_atomic_rmb(); +#endif + if(recvreq->req_match_received && + recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed && + (0 == recvreq->req_events) && lock_recv_request(recvreq)) { + recv_request_pml_complete(recvreq); + return true; + } + + return false; +} + +extern void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req); +#define MCA_PML_BFO_RECV_REQUEST_START(r) mca_pml_bfo_recv_req_start(r) + +static inline void prepare_recv_req_converter(mca_pml_bfo_recv_request_t *req) +{ + if( req->req_recv.req_base.req_datatype->super.size | req->req_recv.req_base.req_count ) { + opal_convertor_copy_and_prepare_for_recv( + req->req_recv.req_base.req_proc->proc_convertor, + &(req->req_recv.req_base.req_datatype->super), + req->req_recv.req_base.req_count, + req->req_recv.req_base.req_addr, + 0, + &req->req_recv.req_base.req_convertor); + opal_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor, + &req->req_bytes_expected); + } +} + +#define MCA_PML_BFO_RECV_REQUEST_MATCHED(request, hdr) \ + recv_req_matched(request, hdr) + +static inline void recv_req_matched(mca_pml_bfo_recv_request_t *req, + mca_pml_bfo_match_hdr_t *hdr) +{ + req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src; + 
req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag; + req->req_match_received = true; +/* BFO FAILOVER CODE - begin */ + req->req_msgseq = hdr->hdr_seq; +/* BFO FAILOVER CODE - end */ +#if OPAL_HAVE_THREAD_SUPPORT + opal_atomic_wmb(); +#endif + if(req->req_recv.req_bytes_packed > 0) { +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if(MPI_ANY_SOURCE == req->req_recv.req_base.req_peer) { + /* non-wildcard requests were prepared at post time; prepare the + * wildcard convertor now that the source is known */ + prepare_recv_req_converter(req); + } +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_XFER_BEGIN, + &req->req_recv.req_base, PERUSE_RECV); + } +} + + +/** + * Unpack the received segments into the user buffer, starting at + * seg_offset within the segment list and data_offset within the message. + */ + +#define MCA_PML_BFO_RECV_REQUEST_UNPACK( request, \ + segments, \ + num_segments, \ + seg_offset, \ + data_offset, \ + bytes_received, \ + bytes_delivered) \ +do { \ + bytes_delivered = 0; \ + if(request->req_recv.req_bytes_packed > 0) { \ + struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS]; \ + uint32_t iov_count = 0; \ + size_t max_data = bytes_received; \ + size_t n, offset = seg_offset; \ + mca_btl_base_segment_t* segment = segments; \ + \ + OPAL_THREAD_LOCK(&request->lock); \ + for( n = 0; n < num_segments; n++, segment++ ) { \ + if(offset >= segment->seg_len) { \ + offset -= segment->seg_len; \ + } else { \ + iov[iov_count].iov_len = segment->seg_len - offset; \ + iov[iov_count].iov_base = (IOVBASE_TYPE*) \ + ((unsigned char*)segment->seg_addr.pval + offset); \ + iov_count++; \ + offset = 0; \ + } \ + } \ + PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE, \ + &(request->req_recv.req_base), max_data, \ + PERUSE_RECV); \ + opal_convertor_set_position( &(request->req_recv.req_base.req_convertor), \ + &data_offset ); \ + opal_convertor_unpack( &(request)->req_recv.req_base.req_convertor, \ + iov, \ + &iov_count, \ + &max_data ); \ + bytes_delivered = max_data; \ + OPAL_THREAD_UNLOCK(&request->lock); \ + } \ +} while (0) + + +/** + * Progress a MATCH fragment for this request. + */ + +void mca_pml_bfo_recv_request_progress_match( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Progress a FRAG fragment for this request. + */ + +void mca_pml_bfo_recv_request_progress_frag( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Progress a RNDV fragment for this request. + */ + +void mca_pml_bfo_recv_request_progress_rndv( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Progress a RGET fragment for this request. + */ + +void mca_pml_bfo_recv_request_progress_rget( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Handle completion of a matched probe. + */ + +void mca_pml_bfo_recv_request_matched_probe( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Run one pass of RDMA scheduling for this request. + */ + +int mca_pml_bfo_recv_request_schedule_once( + mca_pml_bfo_recv_request_t* req, mca_btl_base_module_t* start_btl); + +static inline int mca_pml_bfo_recv_request_schedule_exclusive( + mca_pml_bfo_recv_request_t* req, + mca_bml_base_btl_t* start_bml_btl) +{ + int rc; + + do { + rc = mca_pml_bfo_recv_request_schedule_once(req, start_bml_btl ? 
start_bml_btl->btl : NULL); + if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + break; + } while(!unlock_recv_request(req)); + + if(OMPI_SUCCESS == rc) + recv_request_pml_complete_check(req); + + return rc; +} + +static inline void mca_pml_bfo_recv_request_schedule( + mca_pml_bfo_recv_request_t* req, + mca_bml_base_btl_t* start_bml_btl) +{ + if(!lock_recv_request(req)) + return; + + (void)mca_pml_bfo_recv_request_schedule_exclusive(req, start_bml_btl); +} + +#define MCA_PML_BFO_ADD_ACK_TO_PENDING(P, S, D, O) \ + do { \ + mca_pml_bfo_pckt_pending_t *_pckt; \ + int _rc; \ + \ + MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt,_rc); \ + _pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK; \ + _pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \ + _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \ + _pckt->hdr.hdr_ack.hdr_send_offset = (O); \ + _pckt->proc = (P); \ + _pckt->bml_btl = NULL; \ + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); \ + opal_list_append(&mca_pml_bfo.pckt_pending, \ + (opal_list_item_t*)_pckt); \ + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); \ + } while(0) + +int mca_pml_bfo_recv_request_ack_send_btl(ompi_proc_t* proc, + mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, + uint64_t hdr_rdma_offset, bool nordma); + +static inline int mca_pml_bfo_recv_request_ack_send(ompi_proc_t* proc, + uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, + bool nordma) +{ + size_t i; + mca_bml_base_btl_t* bml_btl; + mca_bml_base_endpoint_t* endpoint = + (mca_bml_base_endpoint_t*)proc->proc_bml; + + for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + if(mca_pml_bfo_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, + hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS) + return OMPI_SUCCESS; + } + + MCA_PML_BFO_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req, + hdr_send_offset); + + return OMPI_ERR_OUT_OF_RESOURCE; +} + +int mca_pml_bfo_recv_request_get_frag(mca_pml_bfo_rdma_frag_t* frag); + +/* This function tries to continue a recvreq that is stuck due to resource + * unavailability. The recvreq is added to the recv_pending list if scheduling + * of the put operation cannot be accomplished for some reason. */ +void mca_pml_bfo_recv_request_process_pending(void); + +END_C_DECLS + +#endif + diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.c b/ompi/mca/pml/bfo/pml_bfo_sendreq.c new file mode 100644 index 0000000000..7c676966ba --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.c @@ -0,0 +1,1595 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "ompi_config.h" +#include "opal/prefetch.h" +#include "ompi/constants.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "orte/mca/errmgr/errmgr.h" +#include "ompi/mca/mpool/mpool.h" +#include "pml_bfo.h" +#include "pml_bfo_hdr.h" +#include "pml_bfo_sendreq.h" +#include "pml_bfo_rdmafrag.h" +#include "pml_bfo_recvreq.h" +/* BFO FAILOVER CODE - begin */ +#include "pml_bfo_failover.h" +/* BFO FAILOVER CODE - end */ +#include "ompi/mca/bml/base/base.h" +#include "ompi/memchecker.h" + +OBJ_CLASS_INSTANCE(mca_pml_bfo_send_range_t, ompi_free_list_item_t, + NULL, NULL); + +void mca_pml_bfo_send_request_process_pending(struct mca_btl_base_module_t *btl) +{ + int i, s = opal_list_get_size(&mca_pml_bfo.send_pending); + + /* advance pending requests */ + for(i = 0; i < s; i++) { + mca_pml_bfo_send_pending_t pending_type = MCA_PML_BFO_SEND_PENDING_NONE; + mca_pml_bfo_send_request_t* sendreq; + mca_bml_base_btl_t *send_dst; + + sendreq = get_request_from_send_pending(&pending_type); + if(OPAL_UNLIKELY(NULL == sendreq)) + break; + + switch(pending_type) { + case MCA_PML_BFO_SEND_PENDING_SCHEDULE: + if(OPAL_SOS_GET_ERROR_CODE(mca_pml_bfo_send_request_schedule_exclusive(sendreq)) == + OMPI_ERR_OUT_OF_RESOURCE) { + return; + } + break; + case MCA_PML_BFO_SEND_PENDING_START: + send_dst = mca_bml_base_btl_array_find( + &sendreq->req_endpoint->btl_eager, btl); + if( (NULL == send_dst) || + (OPAL_SOS_GET_ERROR_CODE(mca_pml_bfo_send_request_start_btl(sendreq, send_dst)) == + OMPI_ERR_OUT_OF_RESOURCE) ) { + /* prepend to the pending list to minimize reordering in case + * send_dst != 0 */ + add_request_to_send_pending(sendreq, + MCA_PML_BFO_SEND_PENDING_START, NULL == send_dst); + /* if no destination try next request otherwise give up, + * no more resources on this btl */ + if(send_dst != NULL) + return; + } + break; + default: + opal_output(0, "[%s:%d] wrong send request type\n", + __FILE__, __LINE__); + break; + } + } +} + +/* + * The free call marks the final stage in a request life-cycle. Starting from this + * point the request is completed at both PML and user level, and can be used + * for other p2p communications. Therefore, in the case of the BFO PML it should + * be added to the free request list. + */ +static int mca_pml_bfo_send_request_free(struct ompi_request_t** request) +{ + mca_pml_bfo_send_request_t* sendreq = *(mca_pml_bfo_send_request_t**)request; + + assert( false == sendreq->req_send.req_base.req_free_called ); + + OPAL_THREAD_LOCK(&ompi_request_lock); + sendreq->req_send.req_base.req_free_called = true; + + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_NOTIFY, + &(sendreq->req_send.req_base), PERUSE_SEND ); + + if( true == sendreq->req_send.req_base.req_pml_complete ) { + /* make buffer defined when the request is completed, + and before releasing the objects.
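+ (the buffer was set to noaccess while the send was in flight, so it must be made defined again before ownership of it returns to the user)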
*/ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + + MCA_PML_BFO_SEND_REQUEST_RETURN( sendreq ); + } + + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + *request = MPI_REQUEST_NULL; + return OMPI_SUCCESS; +} + +static int mca_pml_bfo_send_request_cancel(struct ompi_request_t* request, int complete) +{ + /* we don't cancel send requests for now */ + return OMPI_SUCCESS; +} + +static void mca_pml_bfo_send_request_construct(mca_pml_bfo_send_request_t* req) +{ + req->req_send.req_base.req_type = MCA_PML_REQUEST_SEND; + req->req_send.req_base.req_ompi.req_free = mca_pml_bfo_send_request_free; + req->req_send.req_base.req_ompi.req_cancel = mca_pml_bfo_send_request_cancel; + req->req_rdma_cnt = 0; + req->req_throttle_sends = false; + OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t); + OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t); +} + +static void mca_pml_bfo_send_request_destruct(mca_pml_bfo_send_request_t* req) +{ + OBJ_DESTRUCT(&req->req_send_ranges); + OBJ_DESTRUCT(&req->req_send_range_lock); +} + +OBJ_CLASS_INSTANCE( mca_pml_bfo_send_request_t, + mca_pml_base_send_request_t, + mca_pml_bfo_send_request_construct, + mca_pml_bfo_send_request_destruct ); + +/** + * Completion of a short message - nothing left to schedule. + */ + +static inline void +mca_pml_bfo_match_completion_free_request( struct mca_btl_base_module_t* btl, + mca_pml_bfo_send_request_t* sendreq ) +{ + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, + &(sendreq->req_send.req_base), PERUSE_SEND ); + } + + /* signal request completion */ + send_request_pml_complete(sendreq); + + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +static void +mca_pml_bfo_match_completion_free( struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + mca_pml_bfo_repost_match_fragment(des); + return; +/* BFO FAILOVER CODE - end */ + } + mca_pml_bfo_match_completion_free_request( btl, sendreq ); +} + +static inline void +mca_pml_bfo_rndv_completion_request( struct mca_btl_base_module_t* btl, + mca_pml_bfo_send_request_t* sendreq, + size_t req_bytes_delivered ) +{ + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, + &(sendreq->req_send.req_base), PERUSE_SEND ); + } + + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + + /* advance the request */ + OPAL_THREAD_ADD32(&sendreq->req_state, -1); + + send_request_pml_complete_check(sendreq); + + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +/* + * Completion of the first fragment of a long message that + * requires an acknowledgement + */ +static void +mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + size_t req_bytes_delivered = 0; + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + /* The completion event for the RNDV message has returned with
+ * an error. We know that the send request we are looking at is + * valid because it cannot be completed until the sendreq->req_state + * value reaches 0. And for the sendreq->req_state to reach 0, + * the completion event on the RNDV message must occur. So, we + * do not bother checking whether the send request is valid, + * because we know it is, but we put a few asserts in for good + * measure. We then check a few fields in the request to decide what + * to do. If the sendreq->req_error is set, that means that something + * has already happened to the request and we do not want to restart + * it. Presumably, we may have received a RECVERRNOTIFY + * message from the receiver. We also check the sendreq->req_acked + * field to see if it has been acked. If it has, then again we + * do not restart everything because obviously the RNDV message + * has made it to the other side. */ + assert(((mca_pml_bfo_hdr_t*)(des->des_src->seg_addr.pval))->hdr_match.hdr_ctx == + sendreq->req_send.req_base.req_comm->c_contextid); + assert(((mca_pml_bfo_hdr_t*)(des->des_src->seg_addr.pval))->hdr_match.hdr_src == + sendreq->req_send.req_base.req_comm->c_my_rank); + assert(((mca_pml_bfo_hdr_t*)(des->des_src->seg_addr.pval))->hdr_match.hdr_seq == + (uint16_t)sendreq->req_send.req_base.req_sequence); + + if ((!sendreq->req_error) && (!sendreq->req_acked)) { + sendreq->req_events--; + /* Assume RNDV did not make it, so restart from the beginning. */ + mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV); + return; + } +/* BFO FAILOVER CODE - end */ + } +/* BFO FAILOVER CODE - begin */ + sendreq->req_events--; + + /* Now check the error state. This request can be in error if the + * RNDV message made it over, but the receiver got an error trying + * to send the ACK back and therefore sent a RECVERRNOTIFY message. + * In that case, we want to start the restart dance as the receiver + * has matched this message already. Only restart if there are no + * outstanding events on the send request. */ + if (sendreq->req_error) { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV: completion: sendreq has error, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (unsigned long)sendreq, + (unsigned long)sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_RNDV, + status, btl); + } + return; + } +/* BFO FAILOVER CODE - end */ + + /* count bytes of user data actually delivered. As the rndv completion only + * happens in one thread, the increase of the req_bytes_delivered does not + * have to be atomic. + */ + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, + des->des_src_cnt, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + req_bytes_delivered ); + + mca_pml_bfo_rndv_completion_request( btl, sendreq, req_bytes_delivered ); +} + + +/** + * Completion of a get request. + */ + +static void +mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + size_t req_bytes_delivered = 0; +/* BFO FAILOVER CODE - begin */ + /* This can happen if a FIN message arrives after the request was + * marked in error. So, just drop the message.
Note that the + * status field is not checked here. That is because that is the + * value returned in the FIN hdr.hdr_fail field and may be used for + * other things. */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(30, mca_pml_bfo_output, + "FIN: received on broken request, skipping, " + "PML=%d, src_req=%lx, dst_req=%lx, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + (unsigned long)sendreq, (unsigned long)sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + btl->btl_free(btl, des); + return; + } +/* BFO FAILOVER CODE - end */ + + /* count bytes of user data actually delivered and check for request completion */ + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt, + 0, req_bytes_delivered ); + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + + send_request_pml_complete_check(sendreq); + /* free the descriptor */ + btl->btl_free(btl, des); + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + + +/** + * Completion of a control message - return resources. + */ + +static void +mca_pml_bfo_send_ctl_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ +/* BFO FAILOVER CODE - begin */ + if(OPAL_LIKELY(OMPI_SUCCESS == status)) { + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(btl); + } else { + mca_pml_bfo_hdr_t* hdr = des->des_src->seg_addr.pval; + /* If we get an error on the RGET message, then first make + * sure that header matches the send request that we are + * pointing to. This is necessary, because even though the + * sending side got an error, the RGET may have made it to the + * receiving side and the message transfer may have completed. + * This would then mean the send request has been completed and + * perhaps in use by another communication. So there is no need + * to restart this request. Therefore, ensure that we are + * looking at the same request that the header thinks we are + * looking at. If not, then there is nothing else to be done. */ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + + switch (hdr->hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_RGET: + if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) || + (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { + opal_output_verbose(20, mca_pml_bfo_output, + "RGET: completion event: dropping because no valid request " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + (uint16_t)sendreq->req_send.req_base.req_sequence, + hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_comm->c_my_rank, + hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)sendreq); + return; + } + mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET); + return; + default: + opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)", + __FILE__, __LINE__, hdr->hdr_common.hdr_type); + orte_errmgr.abort(-1, NULL); + } + } +/* BFO FAILOVER CODE - end */ +} + +/** + * Completion of additional fragments of a large message - may need + * to schedule additional fragments. 
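+ * + * In outline, the flow implemented below (a sketch, not extra logic) is: + * + *   sendreq->req_events--; + *   req_pipeline_depth--;  req_bytes_delivered += n; + *   if (error) restart via rndvrestartnotify once req_events == 0; + *   else if (!send_request_pml_complete_check(sendreq)) + *       mca_pml_bfo_send_request_schedule(sendreq);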
+ */ + +static void +mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + size_t req_bytes_delivered = 0; +/* BFO FAILOVER CODE - begin */ + sendreq->req_events--; +/* BFO FAILOVER CODE - end */ + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + sendreq->req_error++; +/* BFO FAILOVER CODE - end */ + } + + /* count bytes of user data actually delivered */ + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, + des->des_src_cnt, + sizeof(mca_pml_bfo_frag_hdr_t), + req_bytes_delivered ); + + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1); + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + +/* BFO FAILOVER CODE - begin */ + /* note we check error after bytes delivered computation in case frag made it */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(30, mca_pml_bfo_output, + "FRAG: completion: sendreq has error, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_FRAG, + status, btl); + } + return; + } +/* BFO FAILOVER CODE - end */ + if(send_request_pml_complete_check(sendreq) == false) { + mca_pml_bfo_send_request_schedule(sendreq); +/* BFO FAILOVER CODE - begin */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + /* This situation can happen if the scheduling function + * determined that a BTL was removed from underneath us + * and therefore marked the request in error. In that + * case, the scheduling of fragments can no longer proceed + * properly. Therefore, if no outstanding events, initiate + * the restart dance. */ + opal_output_verbose(30, mca_pml_bfo_output, + "FRAG: completion: BTL has been removed, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_FRAG, + status, btl); + } + } +/* BFO FAILOVER CODE - end */ + } + + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +/** + * Buffer the entire message and mark as complete. 
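+ * + * Protocol sketch (as implemented below): the eager part of the message + * is packed behind a rendezvous header into a BTL-supplied buffer, the + * remainder is copied into a bsend-style buffer, the convertor is + * re-initialized over the packed copy, and the request is completed at + * the MPI level before the network transfer finishes.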
+ */ + +int mca_pml_bfo_send_request_start_buffered( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size) +{ + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + struct iovec iov; + unsigned int iov_count; + size_t max_data, req_bytes_delivered; + int rc; + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rendezvous_hdr_t) + size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* pack the data into the BTL supplied buffer */ + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + + sizeof(mca_pml_bfo_rendezvous_hdr_t)); + iov.iov_len = size; + iov_count = 1; + max_data = size; + if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, + &iov, + &iov_count, + &max_data)) < 0) { + mca_bml_base_free(bml_btl, des); + return rc; + } + req_bytes_delivered = max_data; + + /* build rendezvous header */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDV; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; +/* BFO FAILOVER CODE - begin */ + if (0 < sendreq->req_restartseq) { + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; + hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; + hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV(buffered): restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq, + sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); + } +/* BFO FAILOVER CODE - end */ + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, + sendreq->req_send.req_base.req_proc); + + /* update lengths */ + segment->seg_len = sizeof(mca_pml_bfo_rendezvous_hdr_t) + max_data; + + des->des_cbfunc = mca_pml_bfo_rndv_completion; + des->des_cbdata = sendreq; + + /* buffer the remainder of the message */ + rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_bml_base_free(bml_btl, des); + return rc; + } + + iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)sendreq->req_send.req_addr) + max_data); + iov.iov_len = max_data = sendreq->req_send.req_bytes_packed - max_data; + + if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, + &iov, + &iov_count, + &max_data)) < 0) { + mca_bml_base_free(bml_btl, des); + return rc; + } + + /* re-init convertor for packed data */ + opal_convertor_prepare_for_send( &sendreq->req_send.req_base.req_convertor, + &(ompi_mpi_byte.dt.super), + sendreq->req_send.req_bytes_packed, + sendreq->req_send.req_addr ); + + /* wait for ack and completion */ + sendreq->req_state = 2; + + /* request is complete at mpi level */ + OPAL_THREAD_LOCK(&ompi_request_lock); + MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, 
true); + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + /* send */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDV); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_bfo_rndv_completion_request( bml_btl->btl, sendreq, req_bytes_delivered); + } +/* BFO FAILOVER CODE - begin */ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des ); + return rc; +} + + +/** + * We work on a buffered request with a size smaller than the eager size, + * or the BTL is not able to send the data IN_PLACE. Request a segment + * that is used for the initial hdr and any eager data. This is used only + * from the _START macro. + */ +int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size ) +{ + mca_btl_base_descriptor_t* des = NULL; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + struct iovec iov; + unsigned int iov_count; + size_t max_data = size; + int rc; + + if(NULL != bml_btl->btl->btl_sendi) { + mca_pml_bfo_match_hdr_t match; + match.hdr_common.hdr_flags = 0; + match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_MATCH; + match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + match.hdr_tag = sendreq->req_send.req_base.req_tag; + match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + + bfo_hdr_hton(&match, MCA_PML_BFO_HDR_TYPE_MATCH, + sendreq->req_send.req_base.req_proc); + + /* try to send immediately */ + rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor, + &match, OMPI_PML_BFO_MATCH_HDR_LEN, + size, MCA_BTL_NO_ORDER, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + MCA_PML_BFO_HDR_TYPE_MATCH, + &des); + if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) { +/* BFO FAILOVER CODE - begin */ + /* Needed for failover */ + if (NULL != des) { + des->des_cbfunc = mca_pml_bfo_match_completion_free; + des->des_cbdata = sendreq->req_endpoint; + } +/* BFO FAILOVER CODE - end */ + + /* signal request completion */ + send_request_pml_complete(sendreq); + + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(bml_btl->btl); + return OMPI_SUCCESS; + } + } else { + /* allocate descriptor */ + mca_bml_base_alloc( bml_btl, &des, + MCA_BTL_NO_ORDER, + OMPI_PML_BFO_MATCH_HDR_LEN + size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + } + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + segment = des->des_src; + + if(size > 0) { + /* pack the data into the supplied buffer */ + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + + OMPI_PML_BFO_MATCH_HDR_LEN); + iov.iov_len = size; + iov_count = 1; + /* + * Before copying the user buffer, make the target part + * accessible. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + (void)opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, + &iov, &iov_count, &max_data ); + /* + * Packing finished, make the user buffer inaccessible.
+ */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + } + + + /* build match header */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_MATCH; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_MATCH, + sendreq->req_send.req_base.req_proc); + + /* update lengths */ + segment->seg_len = OMPI_PML_BFO_MATCH_HDR_LEN + max_data; + + /* short message */ + des->des_cbdata = sendreq; + des->des_cbfunc = mca_pml_bfo_match_completion_free; + + /* send */ + rc = mca_bml_base_send_status(bml_btl, des, MCA_PML_BFO_HDR_TYPE_MATCH); + if( OPAL_LIKELY( rc >= OMPI_SUCCESS ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_bfo_match_completion_free_request( bml_btl->btl, sendreq ); + } + return OMPI_SUCCESS; + } + switch(OPAL_SOS_GET_ERROR_CODE(rc)) { + case OMPI_ERR_RESOURCE_BUSY: + /* No more resources. Allow the upper level to queue the send */ + rc = OMPI_ERR_OUT_OF_RESOURCE; + break; + default: + mca_bml_base_free(bml_btl, des); + break; + } + return rc; +} + +/** + * The BTL can send directly from the user buffer, so allow the BTL + * to prepare the segment list. Start sending a small message. + */ + +int mca_pml_bfo_send_request_start_prepare( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size ) +{ + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + int rc; + + /* prepare descriptor */ + mca_bml_base_prepare_src( bml_btl, + NULL, + &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, + OMPI_PML_BFO_MATCH_HDR_LEN, + &size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + &des ); + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* build match header */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_MATCH; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_MATCH, + sendreq->req_send.req_base.req_proc); + + /* short message */ + des->des_cbfunc = mca_pml_bfo_match_completion_free; + des->des_cbdata = sendreq; + + /* send */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_MATCH); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_bfo_match_completion_free_request( bml_btl->btl, sendreq ); + } + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des ); + return rc; +} + + +/** + * We have contiguous data that is registered - schedule across + * available nics.
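+ * + * A sketch of the protocol selection performed below: + * + *   if (exactly one RDMA btl && (btl_flags & MCA_BTL_FLAGS_GET)) + *       prepare the source segments and send an RGET header, and + *       the receiver pulls the data with btl_get(); + *   else + *       send a data-less rendezvous header and let the receiver + *       schedule rdma put(s) of the entire message;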
+ */ + +int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size ) +{ + /* + * When the req_rdma array is constructed, the first element of the array is + * always assigned a different btl in round-robin fashion (if there is more + * than one RDMA-capable BTL). This way a round-robin distribution of RDMA + * operations is achieved. + */ + + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + bool need_local_cb = false; + int rc; + + bml_btl = sendreq->req_rdma[0].bml_btl; + if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & MCA_BTL_FLAGS_GET)) { + mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg; + mca_btl_base_descriptor_t* src; + size_t i; + size_t old_position = sendreq->req_send.req_base.req_convertor.bConverted; + + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + /* prepare source descriptor/segment(s) */ + /* PML owns this descriptor and will free it in */ + /* get_completion */ + mca_bml_base_prepare_src( bml_btl, + reg, + &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, + 0, + &size, + 0, + &src ); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + if( OPAL_UNLIKELY(NULL == src) ) { + opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, + &old_position); + return OMPI_ERR_OUT_OF_RESOURCE; + } + src->des_cbfunc = mca_pml_bfo_rget_completion; + src->des_cbdata = sendreq; + + /* allocate space for get hdr + segment list */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rget_hdr_t) + + (sizeof(mca_btl_base_segment_t) * (src->des_src_cnt-1)), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( OPAL_UNLIKELY(NULL == des) ) { + opal_convertor_set_position( &sendreq->req_send.req_base.req_convertor, + &old_position ); + mca_bml_base_free(bml_btl, src); + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* build match header */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = MCA_PML_BFO_HDR_FLAGS_CONTIG|MCA_PML_BFO_HDR_FLAGS_PIN; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RGET; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; +/* BFO FAILOVER CODE - begin */ + if (0 < sendreq->req_restartseq) { + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; + hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; + hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; + opal_output_verbose(30, mca_pml_bfo_output, + "RGET: restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + } +/* BFO FAILOVER CODE - end */ + 
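/* record the prepared source descriptor and its segments in the RGET header so the receiver can drive btl_get() against them */ +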
hdr->hdr_rget.hdr_des.pval = src; + hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RGET, + sendreq->req_send.req_base.req_proc); + + for( i = 0; i < src->des_src_cnt; i++ ) { + hdr->hdr_rget.hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(src->des_src[i].seg_addr.pval); + hdr->hdr_rget.hdr_segs[i].seg_len = src->des_src[i].seg_len; + hdr->hdr_rget.hdr_segs[i].seg_key.key64 = src->des_src[i].seg_key.key64; + } + + des->des_cbfunc = mca_pml_bfo_send_ctl_completion; + + /** + * Well, it's a get, so we will not know when the peer gets the data anyway. + * If we generate the PERUSE event here, at least we will know when we + * sent the GET message ... + */ + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, + &(sendreq->req_send.req_base), PERUSE_SEND ); + } + + } else { + + /* allocate a rendezvous header - don't eager send any data; + * the receiver will schedule rdma put(s) of the entire message + */ + + mca_bml_base_alloc(bml_btl, &des, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( OPAL_UNLIKELY(NULL == des)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* build hdr */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = MCA_PML_BFO_HDR_FLAGS_CONTIG|MCA_PML_BFO_HDR_FLAGS_PIN; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDV; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; +/* BFO FAILOVER CODE - begin */ + if (0 < sendreq->req_restartseq) { + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; + hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; + hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV: restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + } +/* BFO FAILOVER CODE - end */ + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, + sendreq->req_send.req_base.req_proc); + + /* update lengths with number of bytes actually packed */ + segment->seg_len = sizeof(mca_pml_bfo_rendezvous_hdr_t); + + /* first fragment of a long message */ + des->des_cbfunc = mca_pml_bfo_rndv_completion; + need_local_cb = true; + + /* wait for ack and completion */ + sendreq->req_state = 2; + } + + des->des_cbdata = sendreq; + + /* send */ + rc = mca_bml_base_send(bml_btl, des, hdr->hdr_common.hdr_type); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) && (true == need_local_cb)) { + mca_pml_bfo_rndv_completion_request( bml_btl->btl, sendreq, 0 ); + } +/* BFO FAILOVER CODE - begin */ + if ((des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) && + (MCA_PML_BFO_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type)) { + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des); + return rc; +} + + +/** + * Rendezvous is required.
Not doing rdma, so eager send up to + * the btl's eager limit. + */ + +int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size, + int flags ) +{ + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + int rc; + + /* prepare descriptor */ + if(size == 0) { + mca_bml_base_alloc( bml_btl, + &des, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ); + } else { + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + mca_bml_base_prepare_src( bml_btl, + NULL, + &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + &size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + &des ); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + } + + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* build hdr */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = flags; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDV; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; +/* BFO FAILOVER CODE - begin */ + if (0 < sendreq->req_restartseq) { + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; + hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; + hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV: restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + } +/* BFO FAILOVER CODE - end */ + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, + sendreq->req_send.req_base.req_proc); + + /* first fragment of a long message */ + des->des_cbdata = sendreq; + des->des_cbfunc = mca_pml_bfo_rndv_completion; + + /* wait for ack and completion */ + sendreq->req_state = 2; + + /* send */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDV); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_bfo_rndv_completion_request( bml_btl->btl, sendreq, size ); + } +/* BFO FAILOVER CODE - begin */ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des ); + return rc; +} + +void mca_pml_bfo_send_request_copy_in_out( mca_pml_bfo_send_request_t *sendreq, + uint64_t send_offset, + uint64_t send_length ) +{ + mca_pml_bfo_send_range_t *sr; + ompi_free_list_item_t *i; + mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; + int num_btls = 
void mca_pml_bfo_send_request_copy_in_out( mca_pml_bfo_send_request_t *sendreq, + uint64_t send_offset, + uint64_t send_length ) +{ + mca_pml_bfo_send_range_t *sr; + ompi_free_list_item_t *i; + mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + int rc = OMPI_SUCCESS, n; + double weight_total = 0; + + if( OPAL_UNLIKELY(0 == send_length) ) + return; + + OMPI_FREE_LIST_WAIT(&mca_pml_bfo.send_ranges, i, rc); + + sr = (mca_pml_bfo_send_range_t*)i; + + sr->range_send_offset = send_offset; + sr->range_send_length = send_length; + sr->range_btl_idx = 0; + + for(n = 0; n < num_btls && n < mca_pml_bfo.max_send_per_range; n++) { + sr->range_btls[n].bml_btl = + mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send); + weight_total += sr->range_btls[n].bml_btl->btl_weight; + } + + sr->range_btl_cnt = n; + mca_pml_bfo_calc_weighted_length(sr->range_btls, n, send_length, + weight_total); + + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); + opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr); + OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); +} + +static inline mca_pml_bfo_send_range_t * +get_send_range_nolock(mca_pml_bfo_send_request_t* sendreq) +{ + opal_list_item_t *item; + + item = opal_list_get_first(&sendreq->req_send_ranges); + + if(opal_list_get_end(&sendreq->req_send_ranges) == item) + return NULL; + + return (mca_pml_bfo_send_range_t*)item; +} + +static inline mca_pml_bfo_send_range_t * +get_send_range(mca_pml_bfo_send_request_t* sendreq) +{ + mca_pml_bfo_send_range_t *range; + + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); + range = get_send_range_nolock(sendreq); + OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); + + return range; +} + +static inline mca_pml_bfo_send_range_t * +get_next_send_range(mca_pml_bfo_send_request_t* sendreq, + mca_pml_bfo_send_range_t *range) +{ + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); + opal_list_remove_item(&sendreq->req_send_ranges, (opal_list_item_t *)range); + OMPI_FREE_LIST_RETURN(&mca_pml_bfo.send_ranges, &range->base); + range = get_send_range_nolock(sendreq); + OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); + + return range; +} + +/** + * Schedule a pipeline of send descriptors for the given request, + * up to the rdma threshold. If this is a send-based protocol, + * the rdma threshold is the end of the message. Otherwise, schedule + * fragments up to the threshold to overlap the initial registration/setup + * costs of the rdma. Only one thread can be inside this function. + */ + +int +mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t* sendreq) +{ + size_t prev_bytes_remaining = 0; + mca_pml_bfo_send_range_t *range; + int num_fail = 0; + + /* check pipeline_depth here before attempting to get any locks */ + if(true == sendreq->req_throttle_sends && + sendreq->req_pipeline_depth >= mca_pml_bfo.send_pipeline_depth) + return OMPI_SUCCESS; + + range = get_send_range(sendreq); + + while(range && (false == sendreq->req_throttle_sends || + sendreq->req_pipeline_depth < mca_pml_bfo.send_pipeline_depth)) { + mca_pml_bfo_frag_hdr_t* hdr; + mca_btl_base_descriptor_t* des; + int rc, btl_idx; + size_t size, offset, data_remaining = 0; + mca_bml_base_btl_t* bml_btl; + + assert(range->range_send_length != 0); +/* BFO FAILOVER CODE - begin */ + /* Failover code. If this condition is true, the request thinks we + * have more BTLs than there really are. This can happen because + * a BTL was removed from the available list. In this case, we + * want to start over. + */
if ((int)mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->btl_send) + != range->range_btl_cnt) { + sendreq->req_error++; + return OMPI_ERROR; + } +/* BFO FAILOVER CODE - end */ + + if(prev_bytes_remaining == range->range_send_length) + num_fail++; + else + num_fail = 0; + + prev_bytes_remaining = range->range_send_length; + + if( OPAL_UNLIKELY(num_fail == range->range_btl_cnt) ) { + assert(sendreq->req_pending == MCA_PML_BFO_SEND_PENDING_NONE); + add_request_to_send_pending(sendreq, + MCA_PML_BFO_SEND_PENDING_SCHEDULE, true); + /* Note that the request remains locked. send_request_process_pending() + * will call schedule_exclusive() directly without taking + * the lock */ + return OMPI_ERR_OUT_OF_RESOURCE; + } + +cannot_pack: + do { + btl_idx = range->range_btl_idx; + if(++range->range_btl_idx == range->range_btl_cnt) + range->range_btl_idx = 0; + } while(!range->range_btls[btl_idx].length); + + bml_btl = range->range_btls[btl_idx].bml_btl; + /* If there is remaining data from another BTL that was too small + * for the converter to pack, then send it through this BTL */ + range->range_btls[btl_idx].length += data_remaining; + size = range->range_btls[btl_idx].length; + + /* make sure that we don't exceed the BTL max send size */ + if(bml_btl->btl->btl_max_send_size != 0) { + size_t max_send_size = bml_btl->btl->btl_max_send_size - + sizeof(mca_pml_bfo_frag_hdr_t); + + if (size > max_send_size) { + size = max_send_size; + } + } + + /* pack into a descriptor */ + offset = (size_t)range->range_send_offset; + opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, + &offset); + range->range_send_offset = (uint64_t)offset; + + data_remaining = size; + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + mca_bml_base_prepare_src(bml_btl, NULL, + &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_frag_hdr_t), + &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + + if( OPAL_UNLIKELY(des == NULL || size == 0) ) { + if(des) { + /* The converter can't pack this chunk.
Append it to a chunk + * from another BTL */ + mca_bml_base_free(bml_btl, des); + range->range_btls[btl_idx].length -= data_remaining; + goto cannot_pack; + } + continue; + } + + des->des_cbfunc = mca_pml_bfo_frag_completion; + des->des_cbdata = sendreq; + + /* setup header */ + hdr = (mca_pml_bfo_frag_hdr_t*)des->des_src->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FRAG; + hdr->hdr_frag_offset = range->range_send_offset; + hdr->hdr_src_req.pval = sendreq; + hdr->hdr_dst_req = sendreq->req_recv; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FRAG, + sendreq->req_send.req_base.req_proc); + +#if OMPI_WANT_PERUSE + PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE, + &(sendreq->req_send.req_base), size, PERUSE_SEND); +#endif /* OMPI_WANT_PERUSE */ + + /* initiate send - note that this may complete before the call returns */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_FRAG); + if( OPAL_LIKELY(rc >= 0) ) { + /* update state */ + range->range_btls[btl_idx].length -= size; + range->range_send_length -= size; + range->range_send_offset += size; + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1); + if(range->range_send_length == 0) { + range = get_next_send_range(sendreq, range); + prev_bytes_remaining = 0; + } +/* BFO FAILOVER CODE - begin */ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + } else { + mca_bml_base_free(bml_btl,des); + } + } + + return OMPI_SUCCESS; +} + + +/** + * An RDMA put operation has completed: + * (1) Update request status and if required set completed + * (2) Send FIN control message to the destination + */ + +static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata; + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)frag->rdma_req; + mca_bml_base_btl_t* bml_btl; +/* BFO FAILOVER CODE - begin */ + sendreq->req_events--; +/* BFO FAILOVER CODE - end */ + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + sendreq->req_error++; +/* BFO FAILOVER CODE - end */ + } + +/* BFO FAILOVER CODE - begin */ + if ( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(30, mca_pml_bfo_output, + "RDMA write: completion: sendreq has error, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_PUT, + status, btl); + } + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + return; + } +/* BFO FAILOVER CODE - end */ + +
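/* Success path from here on: locate a bml_btl for this btl and send the FIN so the receiver can release its registration and complete its side of the transfer. */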
/* BFO FAILOVER CODE - begin */ + /* Find the bml_btl that this btl belongs to. If we cannot + * find it, then it may have been removed from underneath us, so + * find the next available one to send the FIN message on. + */ + bml_btl = mca_bml_base_btl_array_find(&sendreq->req_endpoint->btl_rdma, btl); + if( OPAL_UNLIKELY(NULL == bml_btl) ) { + opal_output_verbose(20, mca_pml_bfo_output, + "RDMA write completion: BML was removed from underneath us, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->btl_rdma); + } +/* BFO FAILOVER CODE - end */ + + mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc, + bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_des, + des->order, 0, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank); + + /* check for request completion */ + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); + + send_request_pml_complete_check(sendreq); + + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag ) +{ + mca_mpool_base_registration_t* reg = NULL; + mca_bml_base_btl_t* bml_btl = frag->rdma_bml; + mca_btl_base_descriptor_t* des; + size_t save_size = frag->rdma_length; + int rc; + + /* setup descriptor */ + mca_bml_base_prepare_src( bml_btl, + reg, + &frag->convertor, + MCA_BTL_NO_ORDER, + 0, + &frag->rdma_length, + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + &des ); + + if( OPAL_UNLIKELY(NULL == des) ) { + if(frag->retries < mca_pml_bfo.rdma_put_retries_limit) { + size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset; + frag->rdma_length = save_size; + opal_convertor_set_position(&frag->convertor, &offset); + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + } else { + mca_pml_bfo_send_request_t *sendreq = + (mca_pml_bfo_send_request_t*)frag->rdma_req; + + /* tell receiver to unregister memory */ + mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc, + bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des, + MCA_BTL_NO_ORDER, 1, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank); + + /* send fragment by copy in/out */ + mca_pml_bfo_send_request_copy_in_out(sendreq, + frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length); + /* if a pointer to a receive request is not set, it means that the + * ACK was not yet received.
Don't schedule sends before ACK */ + if(NULL != sendreq->req_recv.pval) + mca_pml_bfo_send_request_schedule(sendreq); + } + return OMPI_ERR_OUT_OF_RESOURCE; + } + + des->des_dst = frag->rdma_segs; + des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; + des->des_cbfunc = mca_pml_bfo_put_completion; + des->des_cbdata = frag; + + PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, + &(((mca_pml_bfo_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND ); + + rc = mca_bml_base_put(bml_btl, des); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_bml_base_free(bml_btl, des); + frag->rdma_length = save_size; + if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } else { + /* TSW - FIX */ + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + } +/* BFO FAILOVER CODE - begin */ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + mca_pml_bfo_send_request_t *sendreq = + (mca_pml_bfo_send_request_t*)frag->rdma_req; + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + + return OMPI_SUCCESS; +} + +/** + * Receiver has scheduled an RDMA operation: + * (1) Allocate an RDMA fragment to maintain the state of the operation + * (2) Call BTL prepare_src to pin/prepare source buffers + * (3) Queue the RDMA put + */ + +void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq, + mca_btl_base_module_t* btl, + mca_pml_bfo_rdma_hdr_t* hdr ) +{ + mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint; + mca_pml_bfo_rdma_frag_t* frag; + int rc; + size_t i, size = 0; + + if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_TYPE_ACK) { +/* BFO FAILOVER CODE - begin */ + /* Handle the failover case where a RNDV request may + * have turned into a RGET and therefore the state + * is not being tracked. 
*/ + if (sendreq->req_state != 0) { + OPAL_THREAD_ADD32(&sendreq->req_state, -1); + } +/* BFO FAILOVER CODE - end */ + } +/* BFO FAILOVER CODE - begin */ + sendreq->req_recv = hdr->hdr_dst_req; /* only needed once, but it is OK */ + sendreq->req_acked = true; /* only needed once, but it is OK */ +/* BFO FAILOVER CODE - end */ + + MCA_PML_BFO_RDMA_FRAG_ALLOC(frag, rc); + + if( OPAL_UNLIKELY(NULL == frag) ) { + /* TSW - FIX */ + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + + /* setup fragment */ + for( i = 0; i < hdr->hdr_seg_cnt; i++ ) { + frag->rdma_segs[i].seg_addr.lval = hdr->hdr_segs[i].seg_addr.lval; + frag->rdma_segs[i].seg_len = hdr->hdr_segs[i].seg_len; + frag->rdma_segs[i].seg_key.key64 = hdr->hdr_segs[i].seg_key.key64; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if ((sendreq->req_send.req_base.req_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) != + (ompi_proc_local()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + size += opal_swap_bytes4(frag->rdma_segs[i].seg_len); + } else +#endif + { + size += frag->rdma_segs[i].seg_len; + } + } + + frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); +/* BFO FAILOVER CODE - begin */ + frag->rdma_btl = btl; + if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { + opal_output(0, "[%s:%d] invalid bml for rdma put", __FILE__, __LINE__); + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + sendreq->req_error++; + if (0 == sendreq->req_events) { + opal_output(0, "[%s:%d] Issuing rndvrestartnotify", __FILE__, __LINE__); + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_PUT, + OMPI_ERROR, btl); + } + return; + } +/* BFO FAILOVER CODE - end */ + frag->rdma_hdr.hdr_rdma = *hdr; + frag->rdma_req = sendreq; + frag->rdma_ep = bml_endpoint; + frag->rdma_length = size; + frag->rdma_state = MCA_PML_BFO_RDMA_PUT; + frag->reg = NULL; + frag->retries = 0; + + /* lookup the corresponding registration */ + for(i = 0; i < sendreq->req_rdma_cnt; i++) { + if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) { + frag->reg = sendreq->req_rdma[i].btl_reg; + break; + } + } + + /* RDMA writes may proceed in parallel to send and to each other, so + * create a clone of the convertor for each RDMA fragment + */ + size = hdr->hdr_rdma_offset; + opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor, + &frag->convertor, 0, &size); + + mca_pml_bfo_send_request_put_frag(frag); +} + diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.h b/ompi/mca/pml/bfo/pml_bfo_sendreq.h new file mode 100644 index 0000000000..510c54e2f3 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.h @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_PML_BFO_SEND_REQUEST_H +#define OMPI_PML_BFO_SEND_REQUEST_H + +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "ompi/mca/mpool/base/base.h" +#include "pml_bfo_comm.h" +#include "pml_bfo_hdr.h" +#include "pml_bfo_rdma.h" +#include "pml_bfo_rdmafrag.h" +#include "opal/datatype/opal_convertor.h" +#include "ompi/mca/bml/bml.h" + +BEGIN_C_DECLS + +typedef enum { + MCA_PML_BFO_SEND_PENDING_NONE, + MCA_PML_BFO_SEND_PENDING_SCHEDULE, + MCA_PML_BFO_SEND_PENDING_START +} mca_pml_bfo_send_pending_t; + +struct mca_pml_bfo_send_request_t { + mca_pml_base_send_request_t req_send; + mca_bml_base_endpoint_t* req_endpoint; + ompi_ptr_t req_recv; +/* BFO FAILOVER CODE - begin */ + int32_t req_events; /* number of outstanding events on request */ + int32_t req_restartseq; /* sequence number of restarted request */ + int32_t req_restart; /* state of restarted request */ + int32_t req_error; /* non-zero when error has occurred on request */ + bool req_acked; /* indicates request has been acked */ +/* BFO FAILOVER CODE - end */ + int32_t req_state; + int32_t req_lock; + bool req_throttle_sends; + size_t req_pipeline_depth; + size_t req_bytes_delivered; + uint32_t req_rdma_cnt; + mca_pml_bfo_send_pending_t req_pending; + opal_mutex_t req_send_range_lock; + opal_list_t req_send_ranges; + mca_pml_bfo_com_btl_t req_rdma[1]; +}; +typedef struct mca_pml_bfo_send_request_t mca_pml_bfo_send_request_t; + +OBJ_CLASS_DECLARATION(mca_pml_bfo_send_request_t); + +struct mca_pml_bfo_send_range_t { + ompi_free_list_item_t base; + uint64_t range_send_offset; + uint64_t range_send_length; + int range_btl_idx; + int range_btl_cnt; + mca_pml_bfo_com_btl_t range_btls[1]; +}; +typedef struct mca_pml_bfo_send_range_t mca_pml_bfo_send_range_t; +OBJ_CLASS_DECLARATION(mca_pml_bfo_send_range_t); + +static inline bool lock_send_request(mca_pml_bfo_send_request_t *sendreq) +{ + return OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1; +} + +static inline bool unlock_send_request(mca_pml_bfo_send_request_t *sendreq) +{ + return OPAL_THREAD_ADD32(&sendreq->req_lock, -1) == 0; +} + +static inline void +add_request_to_send_pending(mca_pml_bfo_send_request_t* sendreq, + const mca_pml_bfo_send_pending_t type, + const bool append) +{ + opal_list_item_t *item = (opal_list_item_t*)sendreq; + + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + sendreq->req_pending = type; + if(append) + opal_list_append(&mca_pml_bfo.send_pending, item); + else + opal_list_prepend(&mca_pml_bfo.send_pending, item); + + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); +} + +static inline mca_pml_bfo_send_request_t* +get_request_from_send_pending(mca_pml_bfo_send_pending_t *type) +{ + mca_pml_bfo_send_request_t *sendreq; + + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + sendreq = (mca_pml_bfo_send_request_t*) + opal_list_remove_first(&mca_pml_bfo.send_pending); + if(sendreq) { + *type = sendreq->req_pending; + sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE; + } + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + + return sendreq; +} + +#define MCA_PML_BFO_SEND_REQUEST_ALLOC( comm, \ + dst, \ + sendreq, \ + rc) \ + { \ + ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst ); \ + ompi_free_list_item_t* item; \ + \ + rc = OMPI_ERR_OUT_OF_RESOURCE; \ + if( OPAL_LIKELY(NULL != proc) ) { \ + rc = OMPI_SUCCESS; \ + OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \ + sendreq = (mca_pml_bfo_send_request_t*)item; \ + sendreq->req_send.req_base.req_proc = proc; \ + } \ 
+ } + + +#define MCA_PML_BFO_SEND_REQUEST_INIT( sendreq, \ + buf, \ + count, \ + datatype, \ + dst, \ + tag, \ + comm, \ + sendmode, \ + persistent) \ + { \ + MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \ + buf, \ + count, \ + datatype, \ + dst, \ + tag, \ + comm, \ + sendmode, \ + persistent, \ + 0); /* convertor_flags */ \ + (sendreq)->req_recv.pval = NULL; \ + } + + +static inline void mca_pml_bfo_free_rdma_resources(mca_pml_bfo_send_request_t* sendreq) +{ + size_t r; + + /* return mpool resources */ + for(r = 0; r < sendreq->req_rdma_cnt; r++) { + mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; + if( NULL != reg && reg->mpool != NULL ) { + reg->mpool->mpool_deregister(reg->mpool, reg); + } + } + sendreq->req_rdma_cnt = 0; +} + + +/** + * Start a send request. + */ + +#define MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc) \ + do { \ + rc = mca_pml_bfo_send_request_start(sendreq); \ + } while (0) + + +/* + * Mark a send request as completed at the MPI level. + */ + +#define MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal) \ +do { \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \ + (sendreq)->req_send.req_base.req_comm->c_my_rank; \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \ + (sendreq)->req_send.req_base.req_tag; \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \ + (sendreq)->req_send.req_base.req_ompi.req_status._count = \ + (int)(sendreq)->req_send.req_bytes_packed; \ + ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \ + \ + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \ + &(sendreq->req_send.req_base), PERUSE_SEND); \ +} while(0) + +/* + * Release resources associated with a request + */ + +#define MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq) \ + do { \ + /* Let the base handle the reference counts */ \ + MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ + OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests, \ + (ompi_free_list_item_t*)sendreq); \ + } while(0) + + +/* + * The PML has completed a send request. Note that this request + * may have been orphaned by the user or have already completed + * at the MPI level. + * This function will never be called directly from the upper level, as it + * should only be an internal call to the PML. + * + */ +static inline void +send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq) +{ + assert(false == sendreq->req_send.req_base.req_pml_complete); + + if(sendreq->req_send.req_bytes_packed > 0) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END, + &(sendreq->req_send.req_base), PERUSE_SEND); + } + + /* return mpool resources */ + mca_pml_bfo_free_rdma_resources(sendreq); + + if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED && + sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) { + mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq); + } + + OPAL_THREAD_LOCK(&ompi_request_lock); + if(false == sendreq->req_send.req_base.req_ompi.req_complete) { + /* Should only be called for long messages (maybe synchronous) */ + MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true); + } + sendreq->req_send.req_base.req_pml_complete = true; +/* BFO FAILOVER CODE - begin */ + assert(0 == sendreq->req_events); + sendreq->req_restartseq = 0; + /* Since sequence numbers increase monotonically and + * roll over, initialize it to a value far away from + * what it was. I cannot set it to something like -1 + * as that is not within the valid range. 
*/ + sendreq->req_send.req_base.req_sequence = + sendreq->req_send.req_base.req_sequence - 10; +/* BFO FAILOVER CODE - end */ + + if(sendreq->req_send.req_base.req_free_called) { + MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq); + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); +} + +/* returns true if the request was completed at the PML level */ +static inline bool +send_request_pml_complete_check(mca_pml_bfo_send_request_t *sendreq) +{ +#if OPAL_HAVE_THREAD_SUPPORT + opal_atomic_rmb(); +#endif + /* if no more events are expected for the request, the whole message is + * already sent, and send fragment scheduling isn't running in another + * thread, then complete the request at the PML level. From now on, if the + * user called free on this request, the request structure can be reused + * for another request, or if the request is persistent it can be restarted */ + if(sendreq->req_state == 0 && + sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed + && lock_send_request(sendreq)) { + send_request_pml_complete(sendreq); + return true; + } + + return false; +} + +/** + * Schedule additional fragments + */ +int +mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t*); + +static inline int +mca_pml_bfo_send_request_schedule_exclusive(mca_pml_bfo_send_request_t* sendreq) +{ + int rc; + do { + rc = mca_pml_bfo_send_request_schedule_once(sendreq); + if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + break; + } while(!unlock_send_request(sendreq)); + + if(OMPI_SUCCESS == rc) + send_request_pml_complete_check(sendreq); + + return rc; +} + +static inline void +mca_pml_bfo_send_request_schedule(mca_pml_bfo_send_request_t* sendreq) +{ + /* + * Only allow one thread in this routine for a given request. + * However, we cannot block callers on a mutex, so simply keep track + * of the number of times the routine has been called and run through + * the scheduling logic once for every call.
+ */ + + if(!lock_send_request(sendreq)) + return; + + mca_pml_bfo_send_request_schedule_exclusive(sendreq); +} + +/** + * Start the specified request + */ + +int mca_pml_bfo_send_request_start_buffered( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size); + +int mca_pml_bfo_send_request_start_copy( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size); + +int mca_pml_bfo_send_request_start_prepare( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size); + +int mca_pml_bfo_send_request_start_rdma( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size); + +int mca_pml_bfo_send_request_start_rndv( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size, + int flags); + +static inline int +mca_pml_bfo_send_request_start_btl( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl ) +{ + size_t size = sendreq->req_send.req_bytes_packed; + mca_btl_base_module_t* btl = bml_btl->btl; + size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_bfo_hdr_t); + int rc; + + assert(btl->btl_eager_limit >= sizeof(mca_pml_bfo_hdr_t)); + if( OPAL_LIKELY(size <= eager_limit) ) { + switch(sendreq->req_send.req_send_mode) { + case MCA_PML_BASE_SEND_SYNCHRONOUS: + rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0); + break; + case MCA_PML_BASE_SEND_BUFFERED: + rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size); + break; + case MCA_PML_BASE_SEND_COMPLETE: + rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size); + break; + default: + if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) { + rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size); + } else { + rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size); + } + break; + } + } else { + size = eager_limit; + if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit)) + size = btl->btl_rndv_eager_limit; + if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { + rc = mca_pml_bfo_send_request_start_buffered(sendreq, bml_btl, size); + } else if + (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) { + unsigned char *base; + opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base ); + + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { + rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_bfo_free_rdma_resources(sendreq); + } + } else { + rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, + MCA_PML_BFO_HDR_FLAGS_CONTIG); + } + } else { + rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0); + } + } + + return rc; +} + +static inline int +mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq ) +{ + mca_pml_bfo_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm; + mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) + sendreq->req_send.req_base.req_proc->proc_bml; + size_t i; + + if( OPAL_UNLIKELY(endpoint == NULL) ) { + return OMPI_ERR_UNREACH; + } + + sendreq->req_endpoint = endpoint; + sendreq->req_state = 0; + sendreq->req_lock = 0; + sendreq->req_pipeline_depth = 0; + sendreq->req_bytes_delivered = 0; + sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE; + 
sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32( + &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1); +/* BFO FAILOVER CODE - begin */ + sendreq->req_restartseq = 0; /* counts up restarts */ + sendreq->req_restart = 0; /* reset in case we restart again */ + sendreq->req_error = 0; /* clear error state */ + sendreq->req_events = 0; /* clear events, probably 0 anyways */ + sendreq->req_acked = false; +/* BFO FAILOVER CODE - end */ + + MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base ); + + for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { + mca_bml_base_btl_t* bml_btl; + int rc; + + /* select a btl */ + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) ) + return rc; + } + add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true); + + return OMPI_SUCCESS; +} + +/** + * Initiate a put scheduled by the receiver. + */ + +void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq, + mca_btl_base_module_t* btl, + mca_pml_bfo_rdma_hdr_t* hdr ); + +int mca_pml_bfo_send_request_put_frag(mca_pml_bfo_rdma_frag_t* frag); + +/* This function tries to continue a sendreq that was stuck because of resource + * unavailability. A sendreq may be added to the send_pending list if there is no + * resource to send the initial packet or no resource to schedule data + * for sending. The reason the sendreq was added to the list is stored inside + * the sendreq struct, and the appropriate operation is retried when a resource + * becomes available. The bml_btl passed to the function doesn't represent the + * sendreq destination; it represents the BTL on which a resource was freed, so + * only this BTL should be considered for sending packets */ +void mca_pml_bfo_send_request_process_pending(mca_btl_base_module_t *btl); + +void mca_pml_bfo_send_request_copy_in_out(mca_pml_bfo_send_request_t *sendreq, + uint64_t send_offset, uint64_t send_length); + +END_C_DECLS + +#endif /* OMPI_PML_BFO_SEND_REQUEST_H */ diff --git a/ompi/mca/pml/bfo/pml_bfo_start.c b/ompi/mca/pml/bfo/pml_bfo_start.c new file mode 100644 index 0000000000..46ccf429d0 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_start.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_bfo.h" +#include "pml_bfo_recvreq.h" +#include "pml_bfo_sendreq.h" +#include "ompi/memchecker.h" + + +int mca_pml_bfo_start(size_t count, ompi_request_t** requests) +{ + int rc; + size_t i; + bool reuse_old_request = true; + + for(i=0; i<count; i++) { + mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i]; + if(NULL == pml_request) { + continue; + } + if (OMPI_REQUEST_PML != requests[i]->req_type) { + continue; + } + + /* If the persistent request is currently active, obtain the + * request lock and verify the status is incomplete. If the PML + * layer has not completed the request, mark the request as + * "free called", so that it will be freed when the request + * completes, and create a new request. + */
if the + * pml layer has not completed the request - mark the request + * as free called - so that it will be freed when the request + * completes - and create a new request. + */ + + reuse_old_request = true; + switch(pml_request->req_ompi.req_state) { + case OMPI_REQUEST_INACTIVE: + if(pml_request->req_pml_complete == true) + break; + /* otherwise fall through */ + case OMPI_REQUEST_ACTIVE: { + + ompi_request_t *request; + OPAL_THREAD_LOCK(&ompi_request_lock); + if (pml_request->req_pml_complete == false) { + /* free request after it completes */ + pml_request->req_free_called = true; + } else { + /* can reuse the existing request */ + OPAL_THREAD_UNLOCK(&ompi_request_lock); + break; + } + + reuse_old_request = false; + /* allocate a new request */ + switch(pml_request->req_type) { + case MCA_PML_REQUEST_SEND: { + mca_pml_base_send_mode_t sendmode = + ((mca_pml_base_send_request_t*)pml_request)->req_send_mode; + rc = mca_pml_bfo_isend_init( + pml_request->req_addr, + pml_request->req_count, + pml_request->req_datatype, + pml_request->req_peer, + pml_request->req_tag, + sendmode, + pml_request->req_comm, + &request); + break; + } + case MCA_PML_REQUEST_RECV: + rc = mca_pml_bfo_irecv_init( + pml_request->req_addr, + pml_request->req_count, + pml_request->req_datatype, + pml_request->req_peer, + pml_request->req_tag, + pml_request->req_comm, + &request); + break; + default: + rc = OMPI_ERR_REQUEST; + break; + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); + if(OMPI_SUCCESS != rc) + return rc; + pml_request = (mca_pml_base_request_t*)request; + requests[i] = request; + break; + } + default: + return OMPI_ERR_REQUEST; + } + + /* start the request */ + switch(pml_request->req_type) { + case MCA_PML_REQUEST_SEND: + { + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)pml_request; + if( reuse_old_request && (sendreq->req_send.req_bytes_packed != 0) ) { + size_t offset = 0; + /** + * Reset the convertor in case we're dealing with the original + * request, which when completed do not reset the convertor. + */ + opal_convertor_set_position( &sendreq->req_send.req_base.req_convertor, + &offset ); + } + MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc); + if(rc != OMPI_SUCCESS) + return rc; + break; + } + case MCA_PML_REQUEST_RECV: + { + mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)pml_request; + MCA_PML_BFO_RECV_REQUEST_START(recvreq); + break; + } + default: + return OMPI_ERR_REQUEST; + } + } + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/pml/bfo/post_configure.sh b/ompi/mca/pml/bfo/post_configure.sh new file mode 100644 index 0000000000..77a7d52608 --- /dev/null +++ b/ompi/mca/pml/bfo/post_configure.sh @@ -0,0 +1 @@ +DIRECT_CALL_HEADER="ompi/mca/pml/bfo/pml_bfo.h"