diff --git a/ompi/mca/pml/dr/Makefile.am b/ompi/mca/pml/dr/Makefile.am deleted file mode 100644 index de774e3979..0000000000 --- a/ompi/mca/pml/dr/Makefile.am +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_pkgdata_DATA = \ - help-mpi-pml-dr.txt - -dr_sources = \ - pml_dr.c \ - pml_dr.h \ - pml_dr_comm.c \ - pml_dr_comm.h \ - pml_dr_component.c \ - pml_dr_component.h \ - pml_dr_endpoint.c \ - pml_dr_endpoint.h \ - pml_dr_hdr.h \ - pml_dr_iprobe.c \ - pml_dr_irecv.c \ - pml_dr_isend.c \ - pml_dr_recvfrag.c \ - pml_dr_recvfrag.h \ - pml_dr_recvreq.c \ - pml_dr_recvreq.h \ - pml_dr_sendreq.c \ - pml_dr_sendreq.h \ - pml_dr_start.c \ - pml_dr_vfrag.h \ - pml_dr_vfrag.c - -if MCA_BUILD_ompi_pml_dr_DSO -component_noinst = -component_install = mca_pml_dr.la -else -component_noinst = libmca_pml_dr.la -component_install = -endif - - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_pml_dr_la_SOURCES = $(dr_sources) -mca_pml_dr_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_pml_dr_la_SOURCES = $(dr_sources) -libmca_pml_dr_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/pml/dr/configure.m4 b/ompi/mca/pml/dr/configure.m4 deleted file mode 100644 index 4df02ae7bb..0000000000 --- a/ompi/mca/pml/dr/configure.m4 +++ /dev/null @@ -1,26 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2008 The Trustees of the University of Tennessee. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_pml_dr_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_ompi_pml_dr_CONFIG],[ - AC_CONFIG_FILES([ompi/mca/pml/dr/Makefile]) - - # Dont compile DR if threading is enabled but there is no - # support for 64 bits atomics. - AS_IF([test $OPAL_ASM_SUPPORT_64BIT -eq 1], - [$1], - [AS_IF([test $OMPI_ENABLE_PROGRESS_THREADS -eq 1 -o $OMPI_ENABLE_THREAD_MULTIPLE -eq 1], - [$2], - [$1]) - ]) -])dnl diff --git a/ompi/mca/pml/dr/help-mpi-pml-dr.txt b/ompi/mca/pml/dr/help-mpi-pml-dr.txt deleted file mode 100644 index ed378d5003..0000000000 --- a/ompi/mca/pml/dr/help-mpi-pml-dr.txt +++ /dev/null @@ -1,20 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -[eager_limit_too_small] -The "eager limit" MCA parameter in the %s BTL was set to a value which -is too low for Open MPI to function properly. Please re-run your job -with a higher eager limit value for this BTL; the exact MCA parameter -name and its corresponding minimum value is shown below. 
- - Local host: %s - BTL name: %s - BTL eager limit value: %d (set via btl_%s_eager_limit) - BTL eager limit minimum: %d - MCA parameter name: btl_%s_eager_limit diff --git a/ompi/mca/pml/dr/pml_dr.c b/ompi/mca/pml/dr/pml_dr.c deleted file mode 100644 index bead4b45c0..0000000000 --- a/ompi/mca/pml/dr/pml_dr.c +++ /dev/null @@ -1,333 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved - * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include -#include - -#include "opal/class/opal_bitmap.h" -#include "opal/util/show_help.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/btl/btl.h" -#include "ompi/mca/btl/base/base.h" -#include "pml_dr.h" -#include "pml_dr_component.h" -#include "pml_dr_comm.h" -#include "pml_dr_hdr.h" -#include "pml_dr_recvfrag.h" -#include "pml_dr_sendreq.h" -#include "pml_dr_recvreq.h" -#include "ompi/mca/bml/base/base.h" -#include "ompi/mca/pml/base/base.h" - -mca_pml_dr_t mca_pml_dr = { - { - mca_pml_dr_add_procs, - mca_pml_dr_del_procs, - mca_pml_dr_enable, - NULL, /*mca_pml_dr_progress,*/ - mca_pml_dr_add_comm, - mca_pml_dr_del_comm, - mca_pml_dr_irecv_init, - mca_pml_dr_irecv, - mca_pml_dr_recv, - mca_pml_dr_isend_init, - mca_pml_dr_isend, - mca_pml_dr_send, - mca_pml_dr_iprobe, - mca_pml_dr_probe, - mca_pml_dr_start, - mca_pml_dr_improbe, - mca_pml_dr_mprobe, - mca_pml_dr_imrecv, - mca_pml_dr_mrecv, - mca_pml_dr_dump, - NULL, - 65535, - INT_MAX - } -}; - -void mca_pml_dr_error_handler( struct mca_btl_base_module_t* btl, - int32_t flags, ompi_proc_t* errproc, - char* btlinfo ); - -int mca_pml_dr_enable(bool enable) -{ - if( false == enable ) return OMPI_SUCCESS; - - /* requests */ - ompi_free_list_init_new( &mca_pml_base_send_requests, - sizeof(mca_pml_dr_send_request_t), - opal_cache_line_size, - OBJ_CLASS(mca_pml_dr_send_request_t), - 0,opal_cache_line_size, - mca_pml_dr.free_list_num, - mca_pml_dr.free_list_max, - mca_pml_dr.free_list_inc, - NULL ); - - ompi_free_list_init_new( &mca_pml_base_recv_requests, - sizeof(mca_pml_dr_recv_request_t), - opal_cache_line_size, - OBJ_CLASS(mca_pml_dr_recv_request_t), - 0,opal_cache_line_size, - mca_pml_dr.free_list_num, - mca_pml_dr.free_list_max, - mca_pml_dr.free_list_inc, - NULL ); - - /* fragments */ - OBJ_CONSTRUCT(&mca_pml_dr.recv_frags, ompi_free_list_t); - ompi_free_list_init_new( &mca_pml_dr.recv_frags, - sizeof(mca_pml_dr_recv_frag_t), - opal_cache_line_size, - OBJ_CLASS(mca_pml_dr_recv_frag_t), - 0,opal_cache_line_size, - mca_pml_dr.free_list_num, - mca_pml_dr.free_list_max, - mca_pml_dr.free_list_inc, - NULL ); - - OBJ_CONSTRUCT(&mca_pml_dr.vfrags, ompi_free_list_t); - ompi_free_list_init_new( &mca_pml_dr.vfrags, - sizeof(mca_pml_dr_vfrag_t), - opal_cache_line_size, - OBJ_CLASS(mca_pml_dr_vfrag_t), - 0,opal_cache_line_size, - mca_pml_dr.free_list_num, - mca_pml_dr.free_list_max, - 
mca_pml_dr.free_list_inc, - NULL ); - - OBJ_CONSTRUCT(&mca_pml_dr.send_pending, opal_list_t); - OBJ_CONSTRUCT(&mca_pml_dr.send_active, opal_list_t); - OBJ_CONSTRUCT(&mca_pml_dr.acks_pending, opal_list_t); - OBJ_CONSTRUCT(&mca_pml_dr.buffers, ompi_free_list_t); - OBJ_CONSTRUCT(&mca_pml_dr.endpoints, opal_pointer_array_t); - OBJ_CONSTRUCT(&mca_pml_dr.lock, opal_mutex_t); - - mca_pml_dr.enabled = true; - return OMPI_SUCCESS; -} - -int mca_pml_dr_add_comm(ompi_communicator_t* comm) -{ - /* allocate pml specific comm data */ - mca_pml_dr_comm_t* pml_comm = OBJ_NEW(mca_pml_dr_comm_t); - int i; - - if (NULL == pml_comm) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - mca_pml_dr_comm_init(pml_comm, comm); - comm->c_pml_comm = pml_comm; - - for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) { - pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i); - } - return OMPI_SUCCESS; -} - -int mca_pml_dr_del_comm(ompi_communicator_t* comm) -{ - OBJ_RELEASE(comm->c_pml_comm); - comm->c_pml_comm = NULL; - return OMPI_SUCCESS; -} - -/* - * For each proc setup a datastructure that indicates the PTLs - * that can be used to reach the destination. - * - */ - -int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs) -{ - opal_bitmap_t reachable; - int rc; - size_t i; - opal_list_item_t *item; - - if(nprocs == 0) - return OMPI_SUCCESS; - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - for (i = 0 ; i < nprocs ; ++i) { - if (procs[i]->proc_arch != ompi_proc_local()->proc_arch) { - return OMPI_ERR_NOT_SUPPORTED; - } - } -#endif - - /* make sure remote procs are using the same PML as us */ - if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("dr", - procs, - nprocs))) { - return rc; - } - - OBJ_CONSTRUCT(&reachable, opal_bitmap_t); - rc = opal_bitmap_init(&reachable, (int)nprocs); - if(OMPI_SUCCESS != rc) - return rc; - - /* initialize bml endpoint data */ - rc = mca_bml.bml_add_procs( - nprocs, - procs, - &reachable - ); - if(OMPI_SUCCESS != rc) - return rc; - - /* Check that values supplied by all initialized btls will work - for us. Note that this is the list of all initialized BTLs, - not the ones used for the just added procs. This is a little - overkill and inaccurate, as we may end up not using the BTL in - question and all add_procs calls after the first one are - duplicating an already completed check. But the final - initialization of the PML occurs before the final - initialization of the BTLs, and iterating through the in-use - BTLs requires iterating over the procs, as the BML does not - expose all currently in use btls. 
*/ - - for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ; - item != opal_list_get_end(&mca_btl_base_modules_initialized) ; - item = opal_list_get_next(item)) { - mca_btl_base_selected_module_t *sm = - (mca_btl_base_selected_module_t*) item; - if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_dr_hdr_t)) { - opal_show_help("help-mpi-pml-dr.txt", "eager_limit_too_small", - true, - sm->btl_component->btl_version.mca_component_name, - ompi_process_info.nodename, - sm->btl_component->btl_version.mca_component_name, - sm->btl_module->btl_eager_limit, - sm->btl_component->btl_version.mca_component_name, - sizeof(mca_pml_dr_hdr_t), - sm->btl_component->btl_version.mca_component_name); - rc = OMPI_ERR_BAD_PARAM; - return rc; - } - } - - /* register recv handler */ - rc = mca_bml.bml_register( - MCA_BTL_TAG_PML, - mca_pml_dr_recv_frag_callback, - NULL); - - if(OMPI_SUCCESS != rc) - return rc; - - /* register error handlers */ - rc = mca_bml.bml_register_error(mca_pml_dr_error_handler); - - if(OMPI_SUCCESS != rc) - return rc; - - ompi_free_list_init_new( - &mca_pml_dr.buffers, - sizeof(mca_pml_dr_buffer_t) + mca_pml_dr.eager_limit, - opal_cache_line_size, - OBJ_CLASS(mca_pml_dr_buffer_t), - 0,opal_cache_line_size, - 0, - mca_pml_dr.free_list_max, - mca_pml_dr.free_list_inc, - NULL); - - /* initialize pml endpoint data */ - for (i = 0 ; i < nprocs ; ++i) { - int idx; - mca_pml_dr_endpoint_t *endpoint; - - endpoint = OBJ_NEW(mca_pml_dr_endpoint_t); - endpoint->proc_ompi = procs[i]; - procs[i]->proc_pml = (struct mca_pml_endpoint_t*) endpoint; - MCA_PML_DR_DEBUG(10, (0, "%s:%d: adding endpoint %p to proc_pml %p\n", - __FILE__, __LINE__, (void*)endpoint, (void*)procs[i])); - - /* this won't work for comm spawn and other dynamic - processes, but will work for initial job start */ - idx = opal_pointer_array_add(&mca_pml_dr.endpoints, (void*) endpoint); - if(OPAL_EQUAL == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, - OMPI_PROC_MY_NAME, - &(endpoint->proc_ompi->proc_name))) { - mca_pml_dr.my_rank = idx; - } - endpoint->local = endpoint->dst = idx; - MCA_PML_DR_DEBUG(10, (0, "%s:%d: setting endpoint->dst to %d\n", - __FILE__, __LINE__, idx)); - - endpoint->bml_endpoint = procs[i]->proc_bml; - } - - for(i = 0; i < nprocs; i++) { - mca_pml_dr_endpoint_t* ep = (mca_pml_dr_endpoint_t*) - opal_pointer_array_get_item(&mca_pml_dr.endpoints, i); - ep->src = mca_pml_dr.my_rank; - } - return rc; -} - -/* - * iterate through each proc and notify any PTLs associated - * with the proc that it is/has gone away - */ - -int mca_pml_dr_del_procs(ompi_proc_t** procs, size_t nprocs) -{ - size_t i; - - /* clean up pml endpoint data */ - for (i = 0 ; i < nprocs ; ++i) { - if (NULL != procs[i]->proc_pml) { - OBJ_RELEASE(procs[i]->proc_pml); - } - } - - return mca_bml.bml_del_procs(nprocs, procs); -} - -int mca_pml_dr_dump( - struct ompi_communicator_t* comm, - int verbose) -{ - return OMPI_SUCCESS; -} - - - -void mca_pml_dr_error_handler( - struct mca_btl_base_module_t* btl, int32_t flags, - ompi_proc_t* errproc, char* btlinfo) { - /* try failover ! 
*/ - opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__, - btl->btl_component->btl_version.mca_component_name); - mca_pml_dr_sendreq_cleanup_active(btl); - mca_bml.bml_del_btl(btl); - /* ompi_rte_abort(); */ -} - - diff --git a/ompi/mca/pml/dr/pml_dr.h b/ompi/mca/pml/dr/pml_dr.h deleted file mode 100644 index b04ac709e7..0000000000 --- a/ompi/mca/pml/dr/pml_dr.h +++ /dev/null @@ -1,246 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ - -#ifndef MCA_PML_DR_H -#define MCA_PML_DR_H - -#include "ompi_config.h" -#include "opal/util/crc.h" -#include "ompi/class/ompi_free_list.h" -#include "ompi/request/request.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/pml/base/pml_base_request.h" -#include "ompi/mca/pml/base/pml_base_bsend.h" -#include "ompi/mca/pml/base/pml_base_sendreq.h" -#include "opal/class/opal_pointer_array.h" -#include "ompi/datatype/ompi_datatype.h" - -BEGIN_C_DECLS - -/** - * DR PML module - */ - -struct mca_pml_dr_t { - mca_pml_base_module_t super; - - int priority; - int free_list_num; /* initial size of free list */ - int free_list_max; /* maximum size of free list */ - int free_list_inc; /* number of elements to grow free list */ - unsigned int eager_limit; /* maximum eager limit size - overrides btl setting */ - unsigned int send_pipeline_depth; - bool enabled; - - /* lock queue accesses */ - opal_mutex_t lock; - - time_t tout_ack; - time_t tout_watch_dog; - - /* pending lists */ - opal_list_t send_pending; - opal_list_t acks_pending; - - /* active lists */ - opal_list_t send_active; - - /* free lists */ - ompi_free_list_t recv_frags; - ompi_free_list_t vfrags; - ompi_free_list_t buffers; - - /* endpoint pointer array */ - opal_pointer_array_t endpoints; - - /* my 'global' rank */ - int32_t my_rank; - - struct timeval wdog_timer; - int wdog_timer_multiplier; - int wdog_retry_max; - - struct timeval ack_timer; - int ack_timer_multiplier; - int ack_retry_max; - - /* enable/disable csum */ - int enable_csum; -}; -typedef struct mca_pml_dr_t mca_pml_dr_t; - -extern mca_pml_dr_t mca_pml_dr; - -/* - * PML interface functions. 
- */ - -extern int mca_pml_dr_add_comm( - struct ompi_communicator_t* comm -); - -extern int mca_pml_dr_del_comm( - struct ompi_communicator_t* comm -); - -extern int mca_pml_dr_add_procs( - struct ompi_proc_t **procs, - size_t nprocs -); - -extern int mca_pml_dr_del_procs( - struct ompi_proc_t **procs, - size_t nprocs -); - -extern int mca_pml_dr_enable( - bool enable -); - -extern int mca_pml_dr_iprobe( - int dst, - int tag, - struct ompi_communicator_t* comm, - int *matched, - ompi_status_public_t* status -); - -extern int mca_pml_dr_probe( - int dst, - int tag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status -); - -extern int mca_pml_dr_improbe(int dst, - int tag, - struct ompi_communicator_t* comm, - int *matched, - struct ompi_message_t **message, - ompi_status_public_t* status); - -extern int mca_pml_dr_mprobe(int dst, - int tag, - struct ompi_communicator_t* comm, - struct ompi_message_t **message, - ompi_status_public_t* status); - -extern int mca_pml_dr_isend_init( - void *buf, - size_t count, - ompi_datatype_t *datatype, - int dst, - int tag, - mca_pml_base_send_mode_t mode, - struct ompi_communicator_t* comm, - struct ompi_request_t **request -); - -extern int mca_pml_dr_isend( - void *buf, - size_t count, - ompi_datatype_t *datatype, - int dst, - int tag, - mca_pml_base_send_mode_t mode, - struct ompi_communicator_t* comm, - struct ompi_request_t **request -); - -extern int mca_pml_dr_send( - void *buf, - size_t count, - ompi_datatype_t *datatype, - int dst, - int tag, - mca_pml_base_send_mode_t mode, - struct ompi_communicator_t* comm -); - -extern int mca_pml_dr_irecv_init( - void *buf, - size_t count, - ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t* comm, - struct ompi_request_t **request -); - -extern int mca_pml_dr_irecv( - void *buf, - size_t count, - ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t* comm, - struct ompi_request_t **request -); - -extern int mca_pml_dr_recv( - void *buf, - size_t count, - ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t* comm, - ompi_status_public_t* status -); - -extern int mca_pml_dr_imrecv(void *buf, - size_t count, - ompi_datatype_t *datatype, - struct ompi_message_t **message, - struct ompi_request_t **request); - -extern int mca_pml_dr_mrecv(void *buf, - size_t count, - ompi_datatype_t *datatype, - struct ompi_message_t **message, - ompi_status_public_t* status); - -extern int mca_pml_dr_dump( - struct ompi_communicator_t* comm, - int verbose -); - -extern int mca_pml_dr_progress(void); - -extern int mca_pml_dr_start( - size_t count, - ompi_request_t** requests -); - -extern int mca_pml_dr_ft_event(int state); - -END_C_DECLS - - -#endif - - -#define MCA_PML_DR_DEBUG_LEVEL -1 -#define MCA_PML_DR_DEBUG(level,msg) \ - if(level <= MCA_PML_DR_DEBUG_LEVEL){ \ - OPAL_OUTPUT(msg); \ - } - diff --git a/ompi/mca/pml/dr/pml_dr_comm.c b/ompi/mca/pml/dr/pml_dr_comm.c deleted file mode 100644 index 6861016c15..0000000000 --- a/ompi/mca/pml/dr/pml_dr_comm.c +++ /dev/null @@ -1,119 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include - -#include "pml_dr.h" -#include "pml_dr_comm.h" -#include "pml_dr_endpoint.h" - -static void mca_pml_dr_comm_proc_construct(mca_pml_dr_comm_proc_t* proc) -{ - proc->expected_sequence = 1; - proc->send_sequence = 0; - OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t); - OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t); - OBJ_CONSTRUCT(&proc->matched_receives, opal_list_t); - OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t); - -} - - -static void mca_pml_dr_comm_proc_destruct(mca_pml_dr_comm_proc_t* proc) -{ - OBJ_DESTRUCT(&proc->frags_cant_match); - OBJ_DESTRUCT(&proc->matched_receives); - OBJ_DESTRUCT(&proc->specific_receives); - OBJ_DESTRUCT(&proc->unexpected_frags); -} - - -static OBJ_CLASS_INSTANCE( - mca_pml_dr_comm_proc_t, - opal_object_t, - mca_pml_dr_comm_proc_construct, - mca_pml_dr_comm_proc_destruct); - -static void mca_pml_dr_comm_construct(mca_pml_dr_comm_t* comm) -{ - OBJ_CONSTRUCT(&comm->wild_receives, opal_list_t); - OBJ_CONSTRUCT(&comm->matching_lock, opal_mutex_t); - OBJ_CONSTRUCT(&comm->sparse_procs, opal_pointer_array_t); - comm->recv_sequence = 0; - comm->procs = NULL; - comm->num_procs = 0; -} - - -static void mca_pml_dr_comm_destruct(mca_pml_dr_comm_t* comm) -{ - size_t i; - for(i=0; inum_procs; i++) { - OBJ_DESTRUCT((&comm->procs[i])); - } - if(NULL != comm->procs) { - free(comm->procs); - } - - OBJ_DESTRUCT(&comm->wild_receives); - OBJ_DESTRUCT(&comm->matching_lock); - OBJ_DESTRUCT(&comm->sparse_procs); -} - - -OBJ_CLASS_INSTANCE( - mca_pml_dr_comm_t, - opal_object_t, - mca_pml_dr_comm_construct, - mca_pml_dr_comm_destruct); - - -int mca_pml_dr_comm_init(mca_pml_dr_comm_t* dr_comm, ompi_communicator_t* ompi_comm) -{ - size_t i; - size_t size = ompi_comm->c_remote_group->grp_proc_count; - - /* send message sequence-number support - sender side */ - dr_comm->procs = (mca_pml_dr_comm_proc_t*)malloc(sizeof(mca_pml_dr_comm_proc_t)*size); - if(NULL == dr_comm->procs) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - for(i=0; iprocs+i; - OBJ_CONSTRUCT(proc, mca_pml_dr_comm_proc_t); - proc->comm_rank = i; - ompi_proc = ompi_group_peer_lookup(ompi_comm->c_remote_group,i); - proc->ompi_proc = ompi_proc; - pml_ep = (mca_pml_dr_endpoint_t*) ompi_proc->proc_pml; - opal_pointer_array_set_item(&dr_comm->sparse_procs, - pml_ep->dst, /* from our view this is the - peers source 'global rank' */ - proc); - proc->pml_endpoint = pml_ep; - proc->bml_endpoint = ompi_proc->proc_bml; - } - dr_comm->num_procs = size; - return OMPI_SUCCESS; -} - - diff --git a/ompi/mca/pml/dr/pml_dr_comm.h b/ompi/mca/pml/dr/pml_dr_comm.h deleted file mode 100644 index cc6e1d62d4..0000000000 --- a/ompi/mca/pml/dr/pml_dr_comm.h +++ /dev/null @@ -1,86 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_PML_DR_COMM_H -#define MCA_PML_DR_COMM_H - -#include "opal/class/opal_list.h" -#include "ompi/communicator/communicator.h" -#include "ompi/proc/proc.h" -#include "pml_dr_endpoint.h" -BEGIN_C_DECLS - - -struct mca_pml_dr_comm_proc_t { - opal_object_t super; - uint16_t expected_sequence; /**< send message sequence number - receiver side */ -#if OPAL_ENABLE_MULTI_THREADS - volatile int32_t send_sequence; /**< send side sequence number */ -#else - int32_t send_sequence; /**< send side sequence number */ -#endif - opal_list_t frags_cant_match; /**< out-of-order fragment queues */ - opal_list_t specific_receives; /**< queues of unmatched specific receives */ - opal_list_t unexpected_frags; /**< unexpected fragment queues */ - opal_list_t matched_receives; /**< list of in-progress matched receives */ - ompi_proc_t* ompi_proc; /**< back pointer to ompi_proc_t */ - mca_pml_dr_endpoint_t* pml_endpoint; /**< back pointer to the PML endpoint */ - mca_bml_base_endpoint_t* bml_endpoint; /**< back pointer to the BML endpoint */ - int32_t comm_rank; /**< rank in the communicator */ -}; -typedef struct mca_pml_dr_comm_proc_t mca_pml_dr_comm_proc_t; - - /** - * Cached on ompi_communicator_t to hold queues/state - * used by the PML<->PTL interface for matching logic. - */ -struct mca_pml_comm_t { - opal_object_t super; -#if OPAL_ENABLE_MULTI_THREADS - volatile uint32_t recv_sequence; /**< recv request sequence number - receiver side */ -#else - uint32_t recv_sequence; /**< recv request sequence number - receiver side */ -#endif - opal_mutex_t matching_lock; /**< matching lock */ - opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */ - opal_pointer_array_t sparse_procs; /**< sparse array, allows lookup of comm_proc using a global rank */ - mca_pml_dr_comm_proc_t* procs; - size_t num_procs; -}; -typedef struct mca_pml_comm_t mca_pml_dr_comm_t; - -OBJ_CLASS_DECLARATION(mca_pml_dr_comm_t); - - -/** - * Initialize an instance of mca_pml_dr_comm_t based on the communicator size. - * - * @param dr_comm Instance of mca_pml_dr_comm_t - * @param pml_comm Communicator - * @return OMPI_SUCCESS or error status on failure. - */ - -extern int mca_pml_dr_comm_init(mca_pml_dr_comm_t* dr_comm, ompi_communicator_t* ompi_comm); - - -END_C_DECLS -#endif - diff --git a/ompi/mca/pml/dr/pml_dr_component.c b/ompi/mca/pml/dr/pml_dr_component.c deleted file mode 100644 index 283a5b0507..0000000000 --- a/ompi/mca/pml/dr/pml_dr_component.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "opal/mca/event/event.h" -#include "mpi.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/btl/base/base.h" -#include "ompi/mca/pml/base/pml_base_bsend.h" -#include "pml_dr.h" -#include "pml_dr_hdr.h" -#include "pml_dr_sendreq.h" -#include "pml_dr_recvreq.h" -#include "pml_dr_recvfrag.h" -#include "pml_dr_endpoint.h" -#include "ompi/mca/bml/base/base.h" -#include "pml_dr_component.h" - -static int mca_pml_dr_component_register(void); -static int mca_pml_dr_component_open(void); -static int mca_pml_dr_component_close(void); -static mca_pml_base_module_t* -mca_pml_dr_component_init( int* priority, - bool enable_progress_threads, - bool enable_mpi_threads ); -static int mca_pml_dr_component_fini(void); - -static unsigned int mca_pml_dr_wdog_timer_sec; -static unsigned int mca_pml_dr_wdog_timer_usec; - -static unsigned int mca_pml_dr_ack_timer_sec; -static unsigned int mca_pml_dr_ack_timer_usec; - -mca_pml_base_component_2_0_0_t mca_pml_dr_component = { - - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - - { - MCA_PML_BASE_VERSION_2_0_0, - - "dr", /* MCA component name */ - OMPI_MAJOR_VERSION, /* MCA component major version */ - OMPI_MINOR_VERSION, /* MCA component minor version */ - OMPI_RELEASE_VERSION, /* MCA component release version */ - mca_pml_dr_component_open, /* component open */ - mca_pml_dr_component_close, /* component close */ - NULL, - mca_pml_dr_component_register - }, - { - /* This component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE - }, - - mca_pml_dr_component_init, /* component init */ - mca_pml_dr_component_fini /* component finalize */ -}; - - -static inline int -mca_pml_dr_param_register_int( const char* param_name, - int default_value, int *storage) -{ - *storage = default_value; - (void) mca_base_component_var_register(&mca_pml_dr_component.pmlm_version, param_name, - NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - - return *storage; -} - -static inline unsigned int -mca_pml_dr_param_register_uint( const char* param_name, - unsigned int default_value, - unsigned int *storage) -{ - *storage = default_value; - (void) mca_base_component_var_register(&mca_pml_dr_component.pmlm_version, param_name, - NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - - return *storage; -} - -static int mca_pml_dr_component_register(void) -{ - (void) mca_pml_dr_param_register_int("free_list_num", 4, &mca_pml_dr.free_list_num); - (void) mca_pml_dr_param_register_int("free_list_max", -1, &mca_pml_dr.free_list_max); - (void) mca_pml_dr_param_register_int("free_list_inc", 64, &mca_pml_dr.free_list_inc); - (void) mca_pml_dr_param_register_int("priority", 10, &mca_pml_dr.priority); - (void) mca_pml_dr_param_register_uint("eager_limit", 128 * 1024, &mca_pml_dr.eager_limit); - (void) mca_pml_dr_param_register_uint("send_pipeline_depth", 3, &mca_pml_dr.send_pipeline_depth); - (void) mca_pml_dr_param_register_int("wdog_timer_sec", 5, (int *) &mca_pml_dr_wdog_timer_sec); - (void) mca_pml_dr_param_register_int("wdog_timer_usec", 0, (int *) &mca_pml_dr_wdog_timer_usec); - (void) mca_pml_dr_param_register_int("wdog_timer_multiplier", 1, &mca_pml_dr.wdog_timer_multiplier); - (void) mca_pml_dr_param_register_int("wdog_retry_max", 1, &mca_pml_dr.wdog_retry_max); - (void) 
mca_pml_dr_param_register_int("ack_timer_sec", 10, (int *) &mca_pml_dr_ack_timer_sec); - (void) mca_pml_dr_param_register_int("ack_timer_usec", 0, (int *) &mca_pml_dr_ack_timer_usec); - (void) mca_pml_dr_param_register_int("ack_timer_multiplier", 1, &mca_pml_dr.ack_timer_multiplier); - (void) mca_pml_dr_param_register_int("ack_retry_max", 3, &mca_pml_dr.ack_retry_max); - - /* default is to csum all data */ - (void) mca_pml_dr_param_register_int("enable_csum", 1, &mca_pml_dr.enable_csum); - - return OMPI_SUCCESS; -} - -int mca_pml_dr_component_open(void) -{ - mca_pml_dr.wdog_timer.tv_sec = mca_pml_dr_wdog_timer_sec; - mca_pml_dr.wdog_timer.tv_usec = mca_pml_dr_wdog_timer_usec; - - mca_pml_dr.ack_timer.tv_sec = mca_pml_dr_ack_timer_sec; - mca_pml_dr.ack_timer.tv_usec = mca_pml_dr_ack_timer_usec; - - mca_pml_dr.enabled = false; - return mca_base_framework_open(&ompi_bml_base_framework, 0); -} - -int mca_pml_dr_component_close(void) -{ - return mca_base_framework_close(&ompi_bml_base_framework); -} - -mca_pml_base_module_t* mca_pml_dr_component_init(int* priority, - bool enable_progress_threads, - bool enable_mpi_threads) -{ - if((*priority) > mca_pml_dr.priority) { - *priority = mca_pml_dr.priority; - return NULL; - } - *priority = mca_pml_dr.priority; - - if(OMPI_SUCCESS != mca_bml_base_init( enable_progress_threads, - enable_mpi_threads )) { - return NULL; - } - - return &mca_pml_dr.super; -} - -int mca_pml_dr_component_fini(void) -{ - int rc; - - /* Shutdown BML */ - if(OMPI_SUCCESS != (rc = mca_bml.bml_finalize())) - return rc; - - if(!mca_pml_dr.enabled) - return OMPI_SUCCESS; /* never selected.. return success.. */ - mca_pml_dr.enabled = false; /* not anymore */ - - OBJ_DESTRUCT(&mca_pml_dr.send_pending); - OBJ_DESTRUCT(&mca_pml_dr.send_active); - OBJ_DESTRUCT(&mca_pml_dr.acks_pending); - OBJ_DESTRUCT(&mca_pml_dr.recv_frags); - OBJ_DESTRUCT(&mca_pml_dr.buffers); - - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/pml/dr/pml_dr_component.h b/ompi/mca/pml/dr/pml_dr_component.h deleted file mode 100644 index 8b82504d40..0000000000 --- a/ompi/mca/pml/dr/pml_dr_component.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ - -#ifndef MCA_PML_DR_COMPONENT_H -#define MCA_PML_DR_COMPONENT_H - -BEGIN_C_DECLS - -/* - * PML module functions. - */ -OMPI_MODULE_DECLSPEC extern mca_pml_base_component_2_0_0_t mca_pml_dr_component; - -END_C_DECLS - -#endif diff --git a/ompi/mca/pml/dr/pml_dr_endpoint.c b/ompi/mca/pml/dr/pml_dr_endpoint.c deleted file mode 100644 index 651811cea2..0000000000 --- a/ompi/mca/pml/dr/pml_dr_endpoint.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_dr.h" -#include "pml_dr_endpoint.h" - - - -static void mca_pml_dr_endpoint_construct(mca_pml_dr_endpoint_t* ep) -{ - OBJ_CONSTRUCT(&ep->seq_sends, ompi_seq_tracker_t); - OBJ_CONSTRUCT(&ep->seq_recvs, ompi_seq_tracker_t); - OBJ_CONSTRUCT(&ep->seq_recvs_matched, ompi_seq_tracker_t); - ep->vfrag_seq = 0; - ep->bml_endpoint = NULL; -} - - -static void mca_pml_dr_endpoint_destruct(mca_pml_dr_endpoint_t* ep) -{ - OBJ_DESTRUCT(&ep->seq_sends); - OBJ_DESTRUCT(&ep->seq_recvs); - OBJ_DESTRUCT(&ep->seq_recvs_matched); -} - - -OBJ_CLASS_INSTANCE( - mca_pml_dr_endpoint_t, - opal_object_t, - mca_pml_dr_endpoint_construct, - mca_pml_dr_endpoint_destruct); diff --git a/ompi/mca/pml/dr/pml_dr_endpoint.h b/ompi/mca/pml/dr/pml_dr_endpoint.h deleted file mode 100644 index c47d482a4e..0000000000 --- a/ompi/mca/pml/dr/pml_dr_endpoint.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_PML_ENDPOINT_H -#define MCA_PML_ENDPOINT_H - -#include "ompi/mca/bml/bml.h" -#include "ompi/class/ompi_seq_tracker.h" - -BEGIN_C_DECLS -/** - * This is the pml level endpoint - * simply inherity the bml_base_endpoint and - * add whatever else is needed - */ -struct mca_pml_dr_endpoint_t { - opal_object_t super; - ompi_proc_t *proc_ompi; /* back pointer to proc structure */ - mca_bml_base_endpoint_t *bml_endpoint; /* pointer to related bml endpoint */ - int32_t local; /* local view of the rank */ - int32_t src; /* peers view of the src rank */ - int32_t dst; /* peers destination rank */ - ompi_seq_tracker_t seq_sends; /**< Tracks the send vfrags that have been acked */ - ompi_seq_tracker_t seq_recvs; /**< Tracks the receive vfrags that have been acked */ - ompi_seq_tracker_t seq_recvs_matched; /**< Tracks the received vfrags that have been matched */ - int32_t vfrag_seq; /**< current virtual fragment identifier sequence */ -}; -typedef struct mca_pml_dr_endpoint_t mca_pml_dr_endpoint_t; -OBJ_CLASS_DECLARATION(mca_pml_dr_endpoint_t); - - -END_C_DECLS -#endif - diff --git a/ompi/mca/pml/dr/pml_dr_hdr.h b/ompi/mca/pml/dr/pml_dr_hdr.h deleted file mode 100644 index 2723474dec..0000000000 --- a/ompi/mca/pml/dr/pml_dr_hdr.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_PML_DR_HEADER_H -#define MCA_PML_DR_HEADER_H - -#include "ompi_config.h" -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#ifdef HAVE_NETINET_IN_H -#include -#endif - -#include "opal/types.h" - -#define MCA_PML_DR_HDR_TYPE_MATCH 0x01 -#define MCA_PML_DR_HDR_TYPE_RNDV 0x02 -#define MCA_PML_DR_HDR_TYPE_FRAG 0x04 -#define MCA_PML_DR_HDR_TYPE_ACK 0x80 - -#define MCA_PML_DR_HDR_TYPE_MATCH_ACK (MCA_PML_DR_HDR_TYPE_ACK | MCA_PML_DR_HDR_TYPE_MATCH) -#define MCA_PML_DR_HDR_TYPE_RNDV_ACK (MCA_PML_DR_HDR_TYPE_ACK | MCA_PML_DR_HDR_TYPE_RNDV) -#define MCA_PML_DR_HDR_TYPE_FRAG_ACK (MCA_PML_DR_HDR_TYPE_ACK | MCA_PML_DR_HDR_TYPE_FRAG) - -#define MCA_PML_DR_HDR_FLAGS_NBO 1 /* is the hdr in network byte order */ - -/** - * Common hdr attributes - must be first element in each hdr type - */ - -struct mca_pml_dr_common_hdr_t { - uint8_t hdr_type; /**< type of envelope */ - uint8_t hdr_flags; /**< flags indicating how fragment should be processed */ - uint16_t hdr_csum; /**< checksum over header */ - int32_t hdr_dst; /**< destination rank */ - int32_t hdr_src; /**< rank of sender */ - uint16_t hdr_ctx; /**< communicator index */ - uint32_t hdr_vid; /**< vfrag id */ -}; -typedef struct mca_pml_dr_common_hdr_t mca_pml_dr_common_hdr_t; - -#define MCA_PML_DR_COMMON_HDR_NTOH(h) -#define MCA_PML_DR_COMMON_HDR_HTON(h) - -/** - * Header definition for the first fragment, contains the - * attributes required to match the corresponding posted receive. - */ -struct mca_pml_dr_match_hdr_t { - mca_pml_dr_common_hdr_t hdr_common; /**< common attributes */ - - uint16_t hdr_seq; /**< message sequence number */ - int32_t hdr_tag; /**< user tag */ - uint32_t hdr_csum; /**< checksum over data */ - ompi_ptr_t hdr_src_ptr; /**< pointer to source vfrag - returned in ack */ -}; -typedef struct mca_pml_dr_match_hdr_t mca_pml_dr_match_hdr_t; - -#define MCA_PML_DR_MATCH_HDR_NTOH(h) \ - do { \ - MCA_PML_DR_COMMON_HDR_NTOH((h).hdr_common); \ - (h).hdr_ctx = ntohs((h).hdr_ctx); \ - (h).hdr_src = ntohl((h).hdr_src); \ - (h).hdr_tag = ntohl((h).hdr_tag); \ - (h).hdr_seq = ntohs((h).hdr_seq); \ - } while (0) - -#define MCA_PML_DR_MATCH_HDR_HTON(h) \ - do { \ - MCA_PML_DR_COMMON_HDR_HTON((h).hdr_common); \ - (h).hdr_ctx = htons((h).hdr_ctx); \ - (h).hdr_src = htonl((h).hdr_src); \ - (h).hdr_tag = htonl((h).hdr_tag); \ - (h).hdr_seq = htons((h).hdr_seq); \ - } while (0) - -/** - * Header definition for the first fragment when an acknowledgment - * is required. This could be the first fragment of a large message - * or a short message that requires an ack (synchronous). - */ -struct mca_pml_dr_rendezvous_hdr_t { - mca_pml_dr_match_hdr_t hdr_match; - uint64_t hdr_msg_length; /**< message length */ -}; -typedef struct mca_pml_dr_rendezvous_hdr_t mca_pml_dr_rendezvous_hdr_t; - -#define MCA_PML_DR_RNDV_HDR_NTOH(h) \ - do { \ - MCA_PML_DR_MATCH_HDR_NTOH((h).hdr_match); \ - (h).hdr_msg_length = ntoh64((h).hdr_msg_length); \ - } while (0) - -#define MCA_PML_DR_RNDV_HDR_HTON(h) \ - do { \ - MCA_PML_DR_MATCH_HDR_HTON((h).hdr_match); \ - (h).hdr_msg_length = hton64((h).hdr_msg_length); \ - } while (0) - -/** - * Header for subsequent fragments. 
- */ -struct mca_pml_dr_frag_hdr_t { - mca_pml_dr_common_hdr_t hdr_common; /**< common attributes */ - uint16_t hdr_vlen; /**< length of entire vfrag */ - uint16_t hdr_frag_idx; /**< bit index of this frag w/in vfrag */ - uint32_t hdr_frag_csum; /**< checksum over data */ - uint64_t hdr_frag_offset; /**< absolute offset of this fragment */ - ompi_ptr_t hdr_src_ptr; /**< pointer to source vfrag */ - ompi_ptr_t hdr_dst_ptr; /**< pointer to receive req */ -}; -typedef struct mca_pml_dr_frag_hdr_t mca_pml_dr_frag_hdr_t; - -#define MCA_PML_DR_FRAG_HDR_NTOH(h) \ - do { \ - MCA_PML_DR_COMMON_HDR_NTOH((h).hdr_common); \ - (h).hdr_frag_offset = ntoh64((h).hdr_frag_offset); \ - } while (0) - -#define MCA_PML_DR_FRAG_HDR_HTON(h) \ - do { \ - MCA_PML_DR_COMMON_HDR_HTON((h).hdr_common); \ - (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \ - } while (0) - - -/** - * Header used to acknowledgment outstanding fragment(s). - */ - -struct mca_pml_dr_ack_hdr_t { - mca_pml_dr_common_hdr_t hdr_common; /**< common attributes */ - uint32_t hdr_vlen; /**< total bytes received in vfrag */ - uint64_t hdr_vmask; /**< acknowledged frags */ - ompi_ptr_t hdr_src_ptr; /**< source vfrag */ - ompi_ptr_t hdr_dst_ptr; /**< matched receive request */ -}; -typedef struct mca_pml_dr_ack_hdr_t mca_pml_dr_ack_hdr_t; - -#define MCA_PML_DR_ACK_HDR_NTOH(h) \ - do { \ - MCA_PML_DR_COMMON_HDR_NTOH(h.hdr_common); \ - (h).hdr_dst_size = ntoh64((h).hdr_dst_size); \ - } while (0) - -#define MCA_PML_DR_ACK_HDR_HTON(h) \ - do { \ - MCA_PML_DR_COMMON_HDR_HTON((h).hdr_common); \ - (h).hdr_dst_size = hton64((h).hdr_dst_size); \ - } while (0) - -/** - * Union of defined hdr types. - */ -union mca_pml_dr_hdr_t { - mca_pml_dr_common_hdr_t hdr_common; - mca_pml_dr_match_hdr_t hdr_match; - mca_pml_dr_rendezvous_hdr_t hdr_rndv; - mca_pml_dr_frag_hdr_t hdr_frag; - mca_pml_dr_ack_hdr_t hdr_ack; -}; -typedef union mca_pml_dr_hdr_t mca_pml_dr_hdr_t; - - -static inline size_t mca_pml_dr_hdr_size(uint8_t type) -{ - switch(type) { - case MCA_PML_DR_HDR_TYPE_MATCH: - return sizeof(mca_pml_dr_match_hdr_t); - case MCA_PML_DR_HDR_TYPE_RNDV: - return sizeof(mca_pml_dr_rendezvous_hdr_t); - case MCA_PML_DR_HDR_TYPE_FRAG: - return sizeof(mca_pml_dr_frag_hdr_t); - case MCA_PML_DR_HDR_TYPE_ACK: - case MCA_PML_DR_HDR_TYPE_MATCH_ACK: - case MCA_PML_DR_HDR_TYPE_RNDV_ACK: - case MCA_PML_DR_HDR_TYPE_FRAG_ACK: - return sizeof(mca_pml_dr_ack_hdr_t); - default: - assert(0); - } - return 0; -} - -#endif diff --git a/ompi/mca/pml/dr/pml_dr_iprobe.c b/ompi/mca/pml/dr/pml_dr_iprobe.c deleted file mode 100644 index bb78ab9ed8..0000000000 --- a/ompi/mca/pml/dr/pml_dr_iprobe.c +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "ompi/request/request.h" -#include "pml_dr_recvreq.h" - - -int mca_pml_dr_iprobe(int src, - int tag, - struct ompi_communicator_t *comm, - int *matched, ompi_status_public_t * status) -{ - int rc = OMPI_SUCCESS; - mca_pml_dr_recv_request_t recvreq; - - OBJ_CONSTRUCT( &recvreq, mca_pml_dr_recv_request_t ); - recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML; - recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_IPROBE; - - MCA_PML_DR_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true); - MCA_PML_DR_RECV_REQUEST_START(&recvreq); - - if( recvreq.req_recv.req_base.req_ompi.req_complete == true ) { - if( NULL != status ) { - *status = recvreq.req_recv.req_base.req_ompi.req_status; - } - *matched = 1; - } else { - *matched = 0; - opal_progress(); - } - MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); - return rc; -} - - -int mca_pml_dr_probe(int src, - int tag, - struct ompi_communicator_t *comm, - ompi_status_public_t * status) -{ - mca_pml_dr_recv_request_t recvreq; - - OBJ_CONSTRUCT( &recvreq, mca_pml_dr_recv_request_t ); - recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML; - recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_PROBE; - - MCA_PML_DR_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true); - MCA_PML_DR_RECV_REQUEST_START(&recvreq); - - ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi); - - if (NULL != status) { - *status = recvreq.req_recv.req_base.req_ompi.req_status; - } - MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); - return OMPI_SUCCESS; -} - - -int -mca_pml_dr_improbe(int dst, - int tag, - struct ompi_communicator_t* comm, - int *matched, - struct ompi_message_t **message, - ompi_status_public_t* status) -{ - return OMPI_ERR_NOT_SUPPORTED; -} - - -int -mca_pml_dr_mprobe(int dst, - int tag, - struct ompi_communicator_t* comm, - struct ompi_message_t **message, - ompi_status_public_t* status) -{ - return OMPI_ERR_NOT_SUPPORTED; -} diff --git a/ompi/mca/pml/dr/pml_dr_irecv.c b/ompi/mca/pml/dr/pml_dr_irecv.c deleted file mode 100644 index b7aa8ab33c..0000000000 --- a/ompi/mca/pml/dr/pml_dr_irecv.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "ompi/request/request.h" -#include "pml_dr_recvreq.h" - - -int mca_pml_dr_irecv_init(void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request) -{ - int rc; - mca_pml_dr_recv_request_t *recvreq; - MCA_PML_DR_RECV_REQUEST_ALLOC(recvreq, rc); - if (NULL == recvreq) - return rc; - - MCA_PML_DR_RECV_REQUEST_INIT(recvreq, - addr, - count, datatype, src, tag, comm, true); - - *request = (ompi_request_t *) recvreq; - return OMPI_SUCCESS; -} - -int mca_pml_dr_irecv(void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request) -{ - int rc; - - mca_pml_dr_recv_request_t *recvreq; - MCA_PML_DR_RECV_REQUEST_ALLOC(recvreq, rc); - if (NULL == recvreq) - return rc; - - MCA_PML_DR_RECV_REQUEST_INIT(recvreq, - addr, - count, datatype, src, tag, comm, false); - - MCA_PML_DR_RECV_REQUEST_START(recvreq); - *request = (ompi_request_t *) recvreq; - return OMPI_SUCCESS; -} - - -int mca_pml_dr_recv(void *addr, - size_t count, - ompi_datatype_t * datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - ompi_status_public_t * status) -{ - int rc; - mca_pml_dr_recv_request_t *recvreq; - MCA_PML_DR_RECV_REQUEST_ALLOC(recvreq, rc); - if (NULL == recvreq) - return rc; - - MCA_PML_DR_RECV_REQUEST_INIT(recvreq, - addr, - count, datatype, src, tag, comm, false); - - MCA_PML_DR_RECV_REQUEST_START(recvreq); - ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi); - - if (NULL != status) { /* return status */ - *status = recvreq->req_recv.req_base.req_ompi.req_status; - } - rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; - ompi_request_free( (ompi_request_t**)&recvreq ); - return rc; -} - - -int -mca_pml_dr_imrecv(void *buf, - size_t count, - ompi_datatype_t *datatype, - struct ompi_message_t **message, - struct ompi_request_t **request) -{ - return OMPI_ERR_NOT_SUPPORTED; -} - - -int -mca_pml_dr_mrecv(void *buf, - size_t count, - ompi_datatype_t *datatype, - struct ompi_message_t **message, - ompi_status_public_t* status) -{ - return OMPI_ERR_NOT_SUPPORTED; -} diff --git a/ompi/mca/pml/dr/pml_dr_isend.c b/ompi/mca/pml/dr/pml_dr_isend.c deleted file mode 100644 index b4d9192dd5..0000000000 --- a/ompi/mca/pml/dr/pml_dr_isend.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "pml_dr.h" -#include "pml_dr_sendreq.h" -#include "pml_dr_recvreq.h" - - -int mca_pml_dr_isend_init(void *buf, - size_t count, - ompi_datatype_t * datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - ompi_communicator_t * comm, - ompi_request_t ** request) -{ - int rc; - mca_pml_dr_send_request_t *sendreq = NULL; - MCA_PML_DR_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); - if (rc != OMPI_SUCCESS) - return rc; - - MCA_PML_DR_SEND_REQUEST_INIT(sendreq, - buf, - count, - datatype, - dst, tag, - comm, sendmode, true); - - *request = (ompi_request_t *) sendreq; - return OMPI_SUCCESS; -} - - -int mca_pml_dr_isend(void *buf, - size_t count, - ompi_datatype_t * datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - ompi_communicator_t * comm, - ompi_request_t ** request) -{ - int rc; - mca_pml_dr_send_request_t *sendreq = NULL; - MCA_PML_DR_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); - if (rc != OMPI_SUCCESS) - return rc; - - MCA_PML_DR_SEND_REQUEST_INIT(sendreq, - buf, - count, - datatype, - dst, tag, - comm, sendmode, false); - - MCA_PML_DR_SEND_REQUEST_START(sendreq, rc); - *request = (ompi_request_t *) sendreq; - return rc; -} - - -int mca_pml_dr_send(void *buf, - size_t count, - ompi_datatype_t * datatype, - int dst, - int tag, - mca_pml_base_send_mode_t sendmode, - ompi_communicator_t * comm) -{ - int rc; - mca_pml_dr_send_request_t *sendreq; - MCA_PML_DR_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); - if (rc != OMPI_SUCCESS) - return rc; - - MCA_PML_DR_SEND_REQUEST_INIT(sendreq, - buf, - count, - datatype, - dst, tag, - comm, sendmode, false); - - MCA_PML_DR_SEND_REQUEST_START(sendreq, rc); - if (rc != OMPI_SUCCESS) { - ompi_request_free((ompi_request_t **) & sendreq); - return rc; - } - - ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi); - - /* return request to pool */ - rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR; - ompi_request_free((ompi_request_t **) & sendreq); - return rc; -} diff --git a/ompi/mca/pml/dr/pml_dr_recvfrag.c b/ompi/mca/pml/dr/pml_dr_recvfrag.c deleted file mode 100644 index 9d4a23925c..0000000000 --- a/ompi/mca/pml/dr/pml_dr_recvfrag.c +++ /dev/null @@ -1,1005 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - */ - -#include "ompi_config.h" - -#include "opal/class/opal_list.h" -#include "opal/threads/mutex.h" -#include "opal/util/crc.h" -#include "ompi/constants.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/class/ompi_seq_tracker.h" -#include "pml_dr.h" -#include "pml_dr_comm.h" -#include "pml_dr_recvfrag.h" -#include "pml_dr_recvreq.h" -#include "pml_dr_sendreq.h" -#include "pml_dr_hdr.h" - - -#define MCA_PML_DR_HDR_VALIDATE_ACK(do_csum, hdr, type) \ -do { \ - mca_pml_dr_endpoint_t* ep; \ - if(do_csum) { \ - uint16_t csum = (uint16_t)opal_csum(hdr, sizeof(type)); \ - if(hdr->hdr_common.hdr_csum != csum) { \ - MCA_PML_DR_DEBUG(0, (0, "%s:%d: invalid header checksum: 0x%04x != 0x%04x\n", \ - __FILE__, __LINE__, hdr->hdr_common.hdr_csum, csum)); \ - return; \ - } \ - } \ - ep = (mca_pml_dr_endpoint_t*)opal_pointer_array_get_item(&mca_pml_dr.endpoints, hdr->hdr_common.hdr_src); \ - assert(ep != NULL); \ - if(ompi_seq_tracker_check_duplicate(&ep->seq_sends, hdr->hdr_common.hdr_vid)) { \ - MCA_PML_DR_DEBUG(0, (0, "%s:%d: dropping duplicate ack, vfrag ID %d", \ - __FILE__, __LINE__, hdr->hdr_common.hdr_vid)); \ - return; \ - } \ - MCA_PML_DR_DEBUG(1, (0, "%s:%d: couldn't find vfrag ID %d \n", \ - __FILE__, __LINE__, hdr->hdr_common.hdr_vid)); \ -} while (0) - - -OBJ_CLASS_INSTANCE( - mca_pml_dr_buffer_t, - ompi_free_list_item_t, - NULL, - NULL -); - -OBJ_CLASS_INSTANCE( - mca_pml_dr_recv_frag_t, - ompi_free_list_item_t, - NULL, - NULL -); - -/* - * Release resources. - */ - -static void mca_pml_dr_ctl_completion( - mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status) -{ - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - mca_bml_base_free(bml_btl, des); -} - - -/** - * Callback from BTL on receive. - */ - -void mca_pml_dr_recv_frag_callback( - mca_btl_base_module_t* btl, - mca_btl_base_tag_t tag, - mca_btl_base_descriptor_t* des, - void* cbdata) -{ - mca_btl_base_segment_t* segments = des->des_dst; - mca_pml_dr_hdr_t* hdr = (mca_pml_dr_hdr_t*)segments->seg_addr.pval; - mca_pml_dr_comm_t *comm; - mca_pml_dr_comm_proc_t *proc; - mca_pml_dr_endpoint_t *ep; - ompi_communicator_t* ompi_comm; - uint16_t csum; - bool do_csum = mca_pml_dr.enable_csum && - (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - if(segments->seg_len < sizeof(mca_pml_dr_common_hdr_t)) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: ???? 
segments->seg_len:%lu < sizeof(mca_pml_dr_common_hdr_t):%lu\n", - __FILE__, __LINE__, (unsigned long)segments->seg_len, (unsigned long)sizeof(mca_pml_dr_common_hdr_t))); - - return; - } - - MCA_PML_DR_DEBUG(0,(0, "%s:%d: got a hdr of type %d\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_type)); - - switch(hdr->hdr_common.hdr_type) { - case MCA_PML_DR_HDR_TYPE_MATCH: - { - if(do_csum) { - csum = (uint16_t)opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t)); - if(hdr->hdr_common.hdr_csum != csum) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: invalid header checksum: 0x%04x != 0x%04x\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_csum, csum)); - return; - } - } - if(hdr->hdr_common.hdr_dst != mca_pml_dr.my_rank ) { - MCA_PML_DR_DEBUG(0, (0, "%s:%d: misdelivered packet [rank %d -> rank %d]\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst)); - return; - } - ep = (mca_pml_dr_endpoint_t*)opal_pointer_array_get_item(&mca_pml_dr.endpoints, hdr->hdr_common.hdr_src); - assert(ep != NULL); - - if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs, hdr->hdr_common.hdr_vid)) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: got a duplicate vfrag vfrag id %d\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_vid)); - mca_pml_dr_recv_frag_ack(btl, - ep->bml_endpoint, - &hdr->hdr_common, - hdr->hdr_match.hdr_src_ptr.pval, - 1, 0); - return; - } - ompi_comm = ompi_comm_lookup(hdr->hdr_common.hdr_ctx); - if(NULL == ompi_comm ) { - OPAL_OUTPUT((0, "%s:%d: invalid communicator %d\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_ctx)); - return; - } - comm = (mca_pml_dr_comm_t*)ompi_comm->c_pml_comm; - assert(hdr->hdr_common.hdr_src < opal_pointer_array_get_size(&comm->sparse_procs)); - proc = (mca_pml_dr_comm_proc_t*)opal_pointer_array_get_item(&comm->sparse_procs, hdr->hdr_common.hdr_src); - assert(proc != NULL); - assert(ep == proc->pml_endpoint); - mca_pml_dr_recv_frag_match(comm,proc,btl,&hdr->hdr_match,segments,des->des_dst_cnt); - break; - - } - case MCA_PML_DR_HDR_TYPE_MATCH_ACK: - { - MCA_PML_DR_HDR_VALIDATE_ACK(do_csum, hdr, mca_pml_dr_ack_hdr_t); - mca_pml_dr_send_request_match_ack(btl, &hdr->hdr_ack); - break; - } - case MCA_PML_DR_HDR_TYPE_RNDV: - { - if(do_csum) { - csum = (uint16_t)opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t)); - - if(hdr->hdr_common.hdr_csum != csum) { - MCA_PML_DR_DEBUG(0, (0, "%s:%d: invalid header checksum: 0x%04x != 0x%04x\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_csum, csum)); - return; - } - } - if(hdr->hdr_common.hdr_dst != mca_pml_dr.my_rank ) { - MCA_PML_DR_DEBUG(0, (0, "%s:%d: misdelivered packet [rank %d -> rank %d]\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst)); - return; - } - ep = (mca_pml_dr_endpoint_t*)opal_pointer_array_get_item(&mca_pml_dr.endpoints, hdr->hdr_common.hdr_src); - assert(ep != NULL); - - /* seq_recvs protected by matching lock */ - if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs, hdr->hdr_common.hdr_vid)){ - mca_pml_dr_recv_request_t* recvreq; - ompi_comm = ompi_comm_lookup(hdr->hdr_common.hdr_ctx); - if(NULL == ompi_comm) { - if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs_matched, hdr->hdr_common.hdr_vid)) { - MCA_PML_DR_DEBUG(0, (0, "%s:%d: acking duplicate matched rendezvous from sequence tracker\n", - __FILE__, __LINE__)); - mca_pml_dr_recv_frag_ack(btl, - ep->bml_endpoint, - &hdr->hdr_common, - hdr->hdr_match.hdr_src_ptr.pval, - ~(uint64_t) 0, hdr->hdr_rndv.hdr_msg_length); - return; - } else { - OPAL_OUTPUT((0, "%s:%d: the world as we know it is bad\n", __FILE__, __LINE__)); - ompi_rte_abort(-1, NULL); - } - } - 
comm = (mca_pml_dr_comm_t*)ompi_comm->c_pml_comm; - assert(hdr->hdr_common.hdr_src < opal_pointer_array_get_size(&comm->sparse_procs)); - proc = (mca_pml_dr_comm_proc_t*)opal_pointer_array_get_item(&comm->sparse_procs, hdr->hdr_common.hdr_src); - assert(proc != NULL); - assert(ep == proc->pml_endpoint); - - /* ack only if the vfrag has been matched */ - recvreq = - mca_pml_dr_comm_proc_check_matched(proc, hdr->hdr_common.hdr_vid); - if(NULL != recvreq) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate matched rendezvous from pending matched vfrag id %d\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_vid)); - mca_pml_dr_recv_request_ack(btl, recvreq, &hdr->hdr_common, - hdr->hdr_match.hdr_src_ptr, recvreq->req_bytes_received, 1); - } else { - if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs_matched, hdr->hdr_common.hdr_vid)) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate matched rendezvous from sequence tracker\n", __FILE__, __LINE__)); - mca_pml_dr_recv_frag_ack(btl, - ep->bml_endpoint, - &hdr->hdr_common, - hdr->hdr_match.hdr_src_ptr.pval, - ~(uint64_t) 0, hdr->hdr_rndv.hdr_msg_length); - } else { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: droping duplicate unmatched rendezvous\n", __FILE__, __LINE__)); - } - } - } else { - ompi_comm = ompi_comm_lookup(hdr->hdr_common.hdr_ctx); - if(NULL == ompi_comm) { - OPAL_OUTPUT((0, "%s:%d: the world as we know it is bad\n", __FILE__, __LINE__)); - ompi_rte_abort(-1, NULL); - } - comm = (mca_pml_dr_comm_t*)ompi_comm->c_pml_comm; - assert(hdr->hdr_common.hdr_src < opal_pointer_array_get_size(&comm->sparse_procs)); - proc = (mca_pml_dr_comm_proc_t*)opal_pointer_array_get_item(&comm->sparse_procs, hdr->hdr_common.hdr_src); - assert(proc != NULL); - assert(ep == proc->pml_endpoint); - mca_pml_dr_recv_frag_match(comm,proc,btl,&hdr->hdr_match,segments,des->des_dst_cnt); - } - break; - } - case MCA_PML_DR_HDR_TYPE_RNDV_ACK: - { - MCA_PML_DR_HDR_VALIDATE_ACK(do_csum, hdr, mca_pml_dr_ack_hdr_t); - mca_pml_dr_send_request_rndv_ack(btl, &hdr->hdr_ack); - break; - } - case MCA_PML_DR_HDR_TYPE_FRAG: - { - mca_pml_dr_recv_request_t* recvreq; - - if(do_csum) { - csum = (uint16_t)opal_csum(hdr, sizeof(mca_pml_dr_frag_hdr_t)); - if(hdr->hdr_common.hdr_csum != csum) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: invalid header checksum: 0x%04x != 0x%04x\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_csum, csum)); - return; - } - } - if(hdr->hdr_common.hdr_dst != mca_pml_dr.my_rank ) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: misdelivered packet [rank %d -> rank %d]\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst)); - return; - } - ep = (mca_pml_dr_endpoint_t*)opal_pointer_array_get_item(&mca_pml_dr.endpoints, hdr->hdr_common.hdr_src); - assert(ep != NULL); - - /* seq_recvs protected by matching lock */ - if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs, hdr->hdr_common.hdr_vid)) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate fragment\n", __FILE__, __LINE__)); - mca_pml_dr_recv_frag_ack(btl, - ep->bml_endpoint, - &hdr->hdr_common, - hdr->hdr_frag.hdr_src_ptr.pval, - ~(uint64_t) 0, 0); - } else { - ompi_comm = ompi_comm_lookup(hdr->hdr_common.hdr_ctx); - if(NULL == ompi_comm) { - MCA_PML_DR_DEBUG(0,(0, "%s:%d: the world as we know it is bad\n", __FILE__, __LINE__)); - ompi_rte_abort(-1, NULL); - } - comm = (mca_pml_dr_comm_t*)ompi_comm->c_pml_comm; - assert(hdr->hdr_common.hdr_src < opal_pointer_array_get_size(&comm->sparse_procs)); - proc = (mca_pml_dr_comm_proc_t*)opal_pointer_array_get_item(&comm->sparse_procs, hdr->hdr_common.hdr_src); - assert(proc != 
NULL); - assert(ep == proc->pml_endpoint); - - recvreq = (mca_pml_dr_recv_request_t*)hdr->hdr_frag.hdr_dst_ptr.pval; - mca_pml_dr_recv_request_progress(recvreq,btl,segments,des->des_dst_cnt); - } - break; - - } - case MCA_PML_DR_HDR_TYPE_FRAG_ACK: - { - MCA_PML_DR_HDR_VALIDATE_ACK(do_csum, hdr, mca_pml_dr_ack_hdr_t); - mca_pml_dr_send_request_frag_ack(btl, &hdr->hdr_ack); - break; - } - default: - MCA_PML_DR_DEBUG(0,(0, "%s:%d: dropping unknown header type\n", __FILE__,__LINE__)); - break; - } -} - - -/** - * Try and match the incoming message fragment to the list of - * "wild" receives - * - * @param hdr Matching data from recived fragment (IN) - * - * @param pml_comm Pointer to the communicator structure used for - * matching purposes. (IN) - * - * @return Matched receive - * - * This routine assumes that the appropriate matching locks are - * set by the upper level routine. - */ - -#define MCA_PML_DR_CHECK_WILD_RECEIVES_FOR_MATCH(hdr,comm,proc,return_match) \ -do { \ - /* local parameters */ \ - opal_list_t* wild_receives = &comm->wild_receives; \ - mca_pml_dr_recv_request_t *wild_recv; \ - int frag_tag,recv_tag; \ - \ - /* initialization */ \ - frag_tag=hdr->hdr_tag; \ - \ - /* \ - * Loop over the wild irecvs - no need to lock, the upper level \ - * locking is protecting from having other threads trying to \ - * change this list. \ - */ \ - for(wild_recv = (mca_pml_dr_recv_request_t *) \ - opal_list_get_first(wild_receives); \ - wild_recv != (mca_pml_dr_recv_request_t *) \ - opal_list_get_end(wild_receives); \ - wild_recv = (mca_pml_dr_recv_request_t *) \ - ((opal_list_item_t *)wild_recv)->opal_list_next) { \ - \ - recv_tag = wild_recv->req_recv.req_base.req_tag; \ - if ( \ - /* exact tag match */ \ - (frag_tag == recv_tag) || \ - /* wild tag match - negative tags (except for \ - * OMPI_ANY_TAG) are reserved for internal use, and will \ - * not be matched with OMPI_ANY_TAG */ \ - ( (recv_tag == OMPI_ANY_TAG) && (0 <= frag_tag) ) ) \ - \ - { \ - /* \ - * Mark that this is the matching irecv, and go to process it. \ - */ \ - return_match = wild_recv; \ - \ - /* remove this irecv from the postd wild ireceive list */ \ - opal_list_remove_item(wild_receives, \ - (opal_list_item_t *)wild_recv); \ -\ - /* found match - no need to continue */ \ - break; \ - } \ - } \ -} while(0) - - -/** - * Try and match the incoming message fragment to the list of - * "specific" receives - * - * @param hdr Matching data from recived fragment (IN) - * - * @param comm Pointer to the communicator structure used for - * matching purposes. (IN) - * - * @return Matched receive - * - * This routine assumes that the appropriate matching locks are - * set by the upper level routine. - */ -#define MCA_PML_DR_CHECK_SPECIFIC_RECEIVES_FOR_MATCH(hdr,comm,proc,return_match) \ -do { \ - /* local variables */ \ - opal_list_t* specific_receives = &proc->specific_receives; \ - mca_pml_dr_recv_request_t *specific_recv; \ - int recv_tag,frag_tag; \ - \ - /* initialization */ \ - frag_tag=hdr->hdr_tag; \ - \ - /* \ - * Loop over the specific irecvs. 
\ - */ \ - for(specific_recv = (mca_pml_dr_recv_request_t *) \ - opal_list_get_first(specific_receives); \ - specific_recv != (mca_pml_dr_recv_request_t *) \ - opal_list_get_end(specific_receives); \ - specific_recv = (mca_pml_dr_recv_request_t *) \ - ((opal_list_item_t *)specific_recv)->opal_list_next) { \ - /* \ - * Check for a match \ - */ \ - recv_tag = specific_recv->req_recv.req_base.req_tag; \ - if ( (frag_tag == recv_tag) || \ - ( (recv_tag == OMPI_ANY_TAG) && (0 <= frag_tag) ) ) { \ - \ - /* \ - * Match made \ - */ \ - return_match = specific_recv; \ - \ - /* remove descriptor from posted specific ireceive list */ \ - opal_list_remove_item(specific_receives, \ - (opal_list_item_t *)specific_recv); \ - \ - break; \ - } \ - } \ -} while(0) - -/** - * Try and match the incoming message fragment to the list of - * "wild" receives and "specific" receives. Used when both types - * of receives have been posted, i.e. when we need to coordinate - * between multiple lists to make sure ordered delivery occurs. - * - * @param hdr Matching data from recived fragment (IN) - * - * @param comm Pointer to the communicator structure used for - * matching purposes. (IN) - * - * @return Matched receive - * - * This routine assumes that the appropriate matching locks are - * set by the upper level routine. - */ - -#define MCA_PML_DR_CHECK_SPECIFIC_AND_WILD_RECEIVES_FOR_MATCH( \ - hdr,comm,proc,return_match) \ -do { \ - /* local variables */ \ - mca_pml_dr_recv_request_t *specific_recv, *wild_recv; \ - mca_pml_sequence_t wild_recv_seq, specific_recv_seq; \ - int frag_tag, wild_recv_tag, specific_recv_tag; \ - \ - /* initialization */ \ - frag_tag=hdr->hdr_tag; \ - \ - /* \ - * We know that when this is called, both specific and wild irecvs \ - * have been posted. \ - */ \ - specific_recv = (mca_pml_dr_recv_request_t *) \ - opal_list_get_first(&(proc)->specific_receives); \ - wild_recv = (mca_pml_dr_recv_request_t *) \ - opal_list_get_first(&comm->wild_receives); \ - \ - specific_recv_seq = specific_recv->req_recv.req_base.req_sequence; \ - wild_recv_seq = wild_recv->req_recv.req_base.req_sequence; \ - \ - while (true) { \ - if (wild_recv_seq < specific_recv_seq) { \ - /* \ - * wild recv is earlier than the specific one. \ - */ \ - /* \ - * try and match \ - */ \ - wild_recv_tag = wild_recv->req_recv.req_base.req_tag; \ - if ( (frag_tag == wild_recv_tag) || \ - ( (wild_recv_tag == OMPI_ANY_TAG) && (0 <= frag_tag) ) ) { \ - /* \ - * Match made \ - */ \ - return_match=wild_recv; \ - \ - /* remove this recv from the wild receive queue */ \ - opal_list_remove_item(&comm->wild_receives, \ - (opal_list_item_t *)wild_recv); \ - break; \ - } \ - \ - /* \ - * No match, go to the next. \ - */ \ - wild_recv=(mca_pml_dr_recv_request_t *) \ - ((opal_list_item_t *)wild_recv)->opal_list_next; \ - \ - /* \ - * If that was the last wild one, just look at the \ - * rest of the specific ones. \ - */ \ - if (wild_recv == (mca_pml_dr_recv_request_t *) \ - opal_list_get_end(&comm->wild_receives) ) \ - { \ - MCA_PML_DR_CHECK_SPECIFIC_RECEIVES_FOR_MATCH(hdr, comm, proc, return_match); \ - break; \ - } \ - \ - /* \ - * Get the sequence number for this recv, and go \ - * back to the top of the loop. \ - */ \ - wild_recv_seq = wild_recv->req_recv.req_base.req_sequence; \ - \ - } else { \ - /* \ - * specific recv is earlier than the wild one. 
\ - */ \ - specific_recv_tag=specific_recv->req_recv.req_base.req_tag; \ - if ( (frag_tag == specific_recv_tag) || \ - ( (specific_recv_tag == OMPI_ANY_TAG) && (0<=frag_tag)) ) \ - { \ - /* \ - * Match made \ - */ \ - return_match = specific_recv; \ - /* remove descriptor from specific receive list */ \ - opal_list_remove_item(&(proc)->specific_receives, \ - (opal_list_item_t *)specific_recv); \ - break; \ - } \ - \ - /* \ - * No match, go on to the next specific irecv. \ - */ \ - specific_recv = (mca_pml_dr_recv_request_t *) \ - ((opal_list_item_t *)specific_recv)->opal_list_next; \ - \ - /* \ - * If that was the last specific irecv, process the \ - * rest of the wild ones. \ - */ \ - if (specific_recv == (mca_pml_dr_recv_request_t *) \ - opal_list_get_end(&(proc)->specific_receives)) \ - { \ - MCA_PML_DR_CHECK_WILD_RECEIVES_FOR_MATCH(hdr, comm, proc, return_match); \ - break; \ - } \ - /* \ - * Get the sequence number for this recv, and go \ - * back to the top of the loop. \ - */ \ - specific_recv_seq = specific_recv->req_recv.req_base.req_sequence; \ - } \ - } \ -} while(0) - - -/* - * Specialized matching routines for internal use only. - */ - -static bool mca_pml_dr_check_cantmatch_for_match( - opal_list_t *additional_matches, - mca_pml_dr_comm_t* comm, - mca_pml_dr_comm_proc_t *proc); - - -/** - * RCS/CTS receive side matching - * - * @param hdr list of parameters needed for matching - * This list is also embeded in frag, - * but this allows to save a memory copy when - * a match is made in this routine. (IN) - * @param frag pointer to receive fragment which we want - * to match (IN/OUT). If a match is not made, - * hdr is copied to frag. - * @param match_made parameter indicating if we matched frag/ - * hdr (OUT) - * @param additional_matches if a match is made with frag, we - * may be able to match fragments that previously - * have arrived out-of-order. If this is the - * case, the associated fragment descriptors are - * put on this list for further processing. (OUT) - * - * @return OMPI error code - * - * This routine is used to try and match a newly arrived message fragment - * to pre-posted receives. The following assumptions are made - * - fragments are received out of order - * - for long messages, e.g. more than one fragment, a RTS/CTS algorithm - * is used. - * - 2nd and greater fragments include a receive descriptor pointer - * - fragments may be dropped - * - fragments may be corrupt - * - this routine may be called simultaneously by more than one thread - */ -bool mca_pml_dr_recv_frag_match( - mca_pml_dr_comm_t* comm, - mca_pml_dr_comm_proc_t *proc, - mca_btl_base_module_t *btl, - mca_pml_dr_match_hdr_t *hdr, - mca_btl_base_segment_t* segments, - size_t num_segments) -{ - /* local variables */ - uint16_t next_msg_seq_expected, frag_msg_seq; - mca_pml_dr_recv_request_t *match = NULL; - bool additional_match=false; - opal_list_t additional_matches; - ompi_proc_t* ompi_proc = proc->ompi_proc; - int rc; - uint32_t csum; - mca_pml_dr_endpoint_t* ep = (mca_pml_dr_endpoint_t*) proc->pml_endpoint; - bool do_csum = mca_pml_dr.enable_csum && - (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - /* source sequence number */ - frag_msg_seq = hdr->hdr_seq; - - /* get next expected message sequence number - if threaded - * run, lock to make sure that if another thread is processing - * a frag from the same message a match is made only once. - * Also, this prevents other posted receives (for a pair of - * end points) from being processed, and potentially "loosing" - * the fragment. 
- */ - OPAL_THREAD_LOCK(&comm->matching_lock); - - /* get sequence number of next message that can be processed */ - next_msg_seq_expected = (uint16_t)proc->expected_sequence; - if (frag_msg_seq == next_msg_seq_expected) { - - /* - * This is the sequence number we were expecting, - * so we can try matching it to already posted - * receives. - */ - - /* We're now expecting the next sequence number. */ - (proc->expected_sequence)++; -rematch: - - /* - * figure out what sort of matching logic to use, if need to - * look only at "specific" receives, or "wild" receives, - * or if we need to traverse both sets at the same time. - */ - if (opal_list_get_size(&proc->specific_receives) == 0 ){ - /* - * There are only wild irecvs, so specialize the algorithm. - */ - MCA_PML_DR_CHECK_WILD_RECEIVES_FOR_MATCH(hdr, comm, proc, match); - - } else if (opal_list_get_size(&comm->wild_receives) == 0 ) { - /* - * There are only specific irecvs, so specialize the algorithm. - */ - MCA_PML_DR_CHECK_SPECIFIC_RECEIVES_FOR_MATCH(hdr, comm, proc, match); - } else { - /* - * There are some of each. - */ - MCA_PML_DR_CHECK_SPECIFIC_AND_WILD_RECEIVES_FOR_MATCH(hdr, comm, proc, match); - } - - /* if match found, process data */ - if (match) { - - /* - * update delivered sequence number information, if needed. - */ - MCA_PML_DR_RECV_REQUEST_MATCHED(match,comm,proc,hdr); - if( match->req_recv.req_base.req_type == MCA_PML_REQUEST_PROBE ) { - - /* complete the probe */ - mca_pml_dr_recv_request_matched_probe(match,btl,segments,num_segments); - - /* retry the matchh */ - match = NULL; - goto rematch; - } - } else { - - /* if no match found, verify csum, if pass place on unexpected queue */ - mca_pml_dr_recv_frag_t* frag; - MCA_PML_DR_RECV_FRAG_ALLOC(frag, rc); - if(OMPI_SUCCESS != rc) { - OPAL_THREAD_UNLOCK(&comm->matching_lock); - return false; - } - MCA_PML_DR_RECV_FRAG_INIT(frag,ompi_proc,hdr,segments,num_segments,btl,csum); - if(do_csum && csum != hdr->hdr_csum) { - mca_pml_dr_recv_frag_ack(btl, - (mca_bml_base_endpoint_t*)ompi_proc->proc_bml, - &hdr->hdr_common, hdr->hdr_src_ptr.pval, 0, 0); - MCA_PML_DR_DEBUG(0,(0, "%s:%d: received corrupted data 0x%08x != 0x%08x (segments %lu length %lu)\n", - __FILE__, __LINE__, csum, hdr->hdr_csum, (unsigned long)num_segments, - (unsigned long)(segments[0].seg_len - mca_pml_dr_hdr_size(hdr->hdr_common.hdr_type)))); - MCA_PML_DR_RECV_FRAG_RETURN(frag); - OPAL_THREAD_UNLOCK(&comm->matching_lock); - return false; - } - opal_list_append( &proc->unexpected_frags, (opal_list_item_t *)frag ); - } - - /* - * Now that new message has arrived, check to see if - * any fragments on the c_c_frags_cant_match list - * may now be used to form new matchs - */ - if (0 < opal_list_get_size(&proc->frags_cant_match)) { - additional_match = mca_pml_dr_check_cantmatch_for_match(&additional_matches,comm,proc); - } - - } else { - - /* - * This message comes after the next expected, so it - * is ahead of sequence. If passes csum save it for later. 
- */ - - mca_pml_dr_recv_frag_t* frag; - MCA_PML_DR_RECV_FRAG_ALLOC(frag, rc); - if(OMPI_SUCCESS != rc) { - OPAL_THREAD_UNLOCK(&comm->matching_lock); - return false; - } - MCA_PML_DR_RECV_FRAG_INIT(frag,ompi_proc,hdr,segments,num_segments,btl,csum); - if(do_csum && csum != hdr->hdr_csum) { - mca_pml_dr_recv_frag_ack(btl, - (mca_bml_base_endpoint_t*)ompi_proc->proc_bml, - &hdr->hdr_common, hdr->hdr_src_ptr.pval, 0, 0); - MCA_PML_DR_DEBUG(0,(0, "%s:%d: received corrupted data 0x%08x != 0x%08x\n", - __FILE__, __LINE__, csum, hdr->hdr_csum)); - MCA_PML_DR_RECV_FRAG_RETURN(frag); - OPAL_THREAD_UNLOCK(&comm->matching_lock); - return false; - } - opal_list_append(&proc->frags_cant_match, (opal_list_item_t *)frag); - } - - - ompi_seq_tracker_insert(&ep->seq_recvs, hdr->hdr_common.hdr_vid); - OPAL_THREAD_UNLOCK(&comm->matching_lock); - - /* release matching lock before processing fragment */ - if(match != NULL) { - mca_pml_dr_recv_request_progress(match,btl,segments,num_segments); - } - - /* if buffered a short message - go ahead and ack */ - else if (hdr->hdr_common.hdr_type == MCA_PML_DR_HDR_TYPE_MATCH) { - MCA_PML_DR_DEBUG(1,(0, "%s:%d: received short message, acking now vfrag id: %d\n", - __FILE__, __LINE__, hdr->hdr_common.hdr_vid)); - - mca_pml_dr_recv_frag_ack(btl, - (mca_bml_base_endpoint_t*)ompi_proc->proc_bml, - &hdr->hdr_common, hdr->hdr_src_ptr.pval, 1, 0); - } - - if(additional_match) { - opal_list_item_t* item; - while(NULL != (item = opal_list_remove_first(&additional_matches))) { - mca_pml_dr_recv_frag_t* frag = (mca_pml_dr_recv_frag_t*)item; - mca_pml_dr_recv_request_progress(frag->request,frag->btl,frag->segments,frag->num_segments); - MCA_PML_DR_RECV_FRAG_RETURN(frag); - } - } - return (match != NULL); -} - - - -void mca_pml_dr_recv_frag_ack( - mca_btl_base_module_t* btl, - mca_bml_base_endpoint_t* endpoint, - mca_pml_dr_common_hdr_t* hdr, - void *src_ptr, - uint64_t mask, - uint16_t len) -{ - mca_btl_base_descriptor_t* des; - mca_bml_base_btl_t* bml_btl; - mca_pml_dr_recv_frag_t* frag; - mca_pml_dr_ack_hdr_t* ack; - int rc; - bool do_csum; - - /* use the same BTL for ACK's makes failover SANE */ - bml_btl = mca_bml_base_btl_array_find(&endpoint->btl_eager, - btl); - - do_csum = mca_pml_dr.enable_csum && - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - /* allocate descriptor */ - mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_ack_hdr_t), MCA_BTL_DES_FLAGS_PRIORITY); - if(NULL == des) { - goto retry; - } - - /* fill out header */ - ack = (mca_pml_dr_ack_hdr_t*)des->des_src->seg_addr.pval; - ack->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_ACK | hdr->hdr_type; - ack->hdr_common.hdr_flags = 0; - ack->hdr_common.hdr_src = hdr->hdr_dst; - ack->hdr_common.hdr_dst = hdr->hdr_src; - ack->hdr_common.hdr_vid = hdr->hdr_vid; - ack->hdr_common.hdr_ctx = hdr->hdr_ctx; - ack->hdr_vlen = len; - ack->hdr_vmask = mask; - ack->hdr_src_ptr.pval = src_ptr; - assert(ack->hdr_src_ptr.pval); - ack->hdr_dst_ptr.pval = NULL; - ack->hdr_common.hdr_csum = (uint16_t)(do_csum ? 
- opal_csum(ack, sizeof(mca_pml_dr_ack_hdr_t)) : - OPAL_CSUM_ZERO); - - /* initialize descriptor */ - des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; - des->des_cbfunc = mca_pml_dr_ctl_completion; - - MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d", - __FILE__, __LINE__, hdr->hdr_vid)); - - rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); - - if(rc != OMPI_SUCCESS) { - mca_bml_base_free(bml_btl, des); - goto retry; - } - - return; - - /* queue request to retry later */ -retry: - MCA_PML_DR_RECV_FRAG_ALLOC(frag,rc); - /* frag->hdr.hdr_match = *hdr; */ - frag->num_segments = 0; - opal_list_append(&mca_pml_dr.acks_pending, (opal_list_item_t*)frag); -} - - - -/** - * Scan the list of frags that came in ahead of time to see if any - * can be processed at this time. If they can, try and match the - * frags. - * - * @param additional_matches List to hold new matches with fragments - * from the c_frags_cant_match list. (IN/OUT) - * - * @param pml_comm Pointer to the communicator structure used for - * matching purposes. (IN) - * - * This routine assumes that the appropriate matching locks are - * set by the upper level routine. - */ - -static bool mca_pml_dr_check_cantmatch_for_match( - opal_list_t *additional_matches, - mca_pml_dr_comm_t* comm, - mca_pml_dr_comm_proc_t *proc) -{ - /* local parameters */ - int match_found; - uint16_t next_msg_seq_expected, frag_seq; - mca_pml_dr_recv_frag_t *frag; - bool match_made = false; - - /* - * Loop over all the out of sequence messages. No ordering is assumed - * in the c_frags_cant_match list. - */ - - match_found = 1; - while ((0 < opal_list_get_size(&proc->frags_cant_match)) && match_found) { - - /* initialize match flag for this search */ - match_found = 0; - - /* get sequence number of next message that can be processed */ - next_msg_seq_expected = proc->expected_sequence; - - /* search the list for a fragment from the send with sequence - * number next_msg_seq_expected - */ - for(frag = (mca_pml_dr_recv_frag_t *) - opal_list_get_first(&proc->frags_cant_match); - frag != (mca_pml_dr_recv_frag_t *) - opal_list_get_end(&proc->frags_cant_match); - frag = (mca_pml_dr_recv_frag_t *) - opal_list_get_next(frag)) - { - /* - * If the message has the next expected seq from that proc... - */ - frag_seq=frag->hdr.hdr_match.hdr_seq; - if (frag_seq == next_msg_seq_expected) { - mca_pml_dr_recv_request_t *match = NULL; - mca_pml_dr_match_hdr_t* hdr = &frag->hdr.hdr_match; - - /* We're now expecting the next sequence number. */ - (proc->expected_sequence)++; - - /* signal that match was made */ - match_found = 1; - - /* - * remove frag from list - */ - opal_list_remove_item(&proc->frags_cant_match, - (opal_list_item_t *)frag); - -rematch: - /* - * figure out what sort of matching logic to use, if need to - * look only at "specific" receives, or "wild" receives, - * or if we need to traverse both sets at the same time. - */ - proc = (mca_pml_dr_comm_proc_t*)opal_pointer_array_get_item(&comm->sparse_procs, - hdr->hdr_common.hdr_src); - - if (opal_list_get_size(&proc->specific_receives) == 0 ) { - /* - * There are only wild irecvs, so specialize the algorithm. - */ - MCA_PML_DR_CHECK_WILD_RECEIVES_FOR_MATCH(hdr, comm, proc, match); - } else if (opal_list_get_size(&comm->wild_receives) == 0 ) { - /* - * There are only specific irecvs, so specialize the algorithm. - */ - MCA_PML_DR_CHECK_SPECIFIC_RECEIVES_FOR_MATCH(hdr, comm, proc, match); - } else { - /* - * There are some of each. 
- */ - MCA_PML_DR_CHECK_SPECIFIC_AND_WILD_RECEIVES_FOR_MATCH(hdr, comm, proc, match); - - } - - /* if match found, process data */ - if (match) { - - /* - * If this was a probe need to queue fragment on unexpected list - */ - if( match->req_recv.req_base.req_type == MCA_PML_REQUEST_PROBE ) { - - /* complete the probe */ - mca_pml_dr_recv_request_matched_probe(match,frag->btl,frag->segments,frag->num_segments); - - /* retry the match */ - match = NULL; - goto rematch; - - } else { - - /* associate the receive descriptor with the fragment - * descriptor */ - frag->request=match; - match->req_proc = proc; - match->req_endpoint = (mca_pml_dr_endpoint_t*)proc->ompi_proc->proc_pml; - MCA_PML_DR_DEBUG(10, (0, "%s:%d: adding endpoint %p match %p\n", - __FILE__, __LINE__, (void*)proc->ompi_proc->proc_pml, (void*)match->req_endpoint)); - - /* add this fragment descriptor to the list of - * descriptors to be processed later - */ - if(match_made == false) { - match_made = true; - OBJ_CONSTRUCT(additional_matches, opal_list_t); - } - MCA_PML_DR_RECV_REQUEST_MATCHED(match,comm,proc,&frag->hdr.hdr_match); - opal_list_append(additional_matches, (opal_list_item_t *)frag); - } - - } else { - - /* if no match found, place on unexpected queue */ - opal_list_append( &proc->unexpected_frags, (opal_list_item_t *)frag); - - } - - /* c_frags_cant_match is not an ordered list, so exit loop - * and re-start search for next sequence number */ - break; - - } /* end if (frag_seq == next_msg_seq_expected) */ - - } /* end for (frag) loop */ - - } /* end while loop */ - - return match_made; -} - - diff --git a/ompi/mca/pml/dr/pml_dr_recvfrag.h b/ompi/mca/pml/dr/pml_dr_recvfrag.h deleted file mode 100644 index b5a186f27f..0000000000 --- a/ompi/mca/pml/dr/pml_dr_recvfrag.h +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ - -#ifndef MCA_PML_DR_RECVFRAG_H -#define MCA_PML_DR_RECVFRAG_H - -#include "ompi/mca/btl/btl.h" -#include "ompi/mca/bml/bml.h" -#include "pml_dr_hdr.h" - -BEGIN_C_DECLS - -struct mca_pml_dr_buffer_t { - ompi_free_list_item_t super; - size_t len; - unsigned char addr[1]; -}; -typedef struct mca_pml_dr_buffer_t mca_pml_dr_buffer_t; - -OBJ_CLASS_DECLARATION(mca_pml_dr_buffer_t); - -struct mca_pml_dr_recv_frag_t { - ompi_free_list_item_t super; - mca_pml_dr_hdr_t hdr; - struct mca_pml_dr_recv_request_t* request; - size_t num_segments; - uint32_t csum; - mca_btl_base_module_t* btl; - ompi_proc_t* proc; - mca_btl_base_segment_t segments[MCA_BTL_DES_MAX_SEGMENTS]; - mca_pml_dr_buffer_t* buffers[MCA_BTL_DES_MAX_SEGMENTS]; -}; -typedef struct mca_pml_dr_recv_frag_t mca_pml_dr_recv_frag_t; - -OBJ_CLASS_DECLARATION(mca_pml_dr_recv_frag_t); - - -#define MCA_PML_DR_RECV_FRAG_ALLOC(frag,rc) \ -do { \ - ompi_free_list_item_t* item; \ - OMPI_FREE_LIST_WAIT(&mca_pml_dr.recv_frags, item, rc); \ - frag = (mca_pml_dr_recv_frag_t*)item; \ -} while(0) - - -#define MCA_PML_DR_RECV_FRAG_INIT(frag,oproc,hdr,segs,cnt,btl,csum) \ -do { \ - size_t i, length = 0; \ - uint32_t ui1 = 0; \ - size_t ui2 = 0; \ - mca_pml_dr_buffer_t** buffers = frag->buffers; \ - bool do_csum = mca_pml_dr.enable_csum && \ - (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); \ - /* init recv_frag */ \ - frag->btl = btl; \ - frag->hdr = *(mca_pml_dr_hdr_t*)hdr; \ - frag->num_segments = cnt; \ - frag->proc = oproc; \ - \ - csum = OPAL_CSUM_ZERO; \ - /* copy over data */ \ - for( i = 0; i < cnt; i++ ) { \ - ompi_free_list_item_t* item; \ - mca_pml_dr_buffer_t* buff; \ - OMPI_FREE_LIST_WAIT(&mca_pml_dr.buffers, item, rc); \ - buff = (mca_pml_dr_buffer_t*)item; \ - buffers[i] = buff; \ - frag->segments[i].seg_addr.pval = buff->addr; \ - frag->segments[i].seg_len = segs[i].seg_len; \ - if( do_csum ) { \ - size_t hdr_len = 0; \ - if( 0 == i ) { \ - hdr_len = mca_pml_dr_hdr_size(hdr->hdr_common.hdr_type);\ - memcpy( buff->addr, \ - segs[i].seg_addr.pval, \ - hdr_len ); \ - } \ - csum = OPAL_CSUM_BCOPY_PARTIAL( \ - ((unsigned char*)segs[i].seg_addr.pval)+hdr_len, \ - ((unsigned char*)buff->addr)+hdr_len, \ - segs[i].seg_len-hdr_len, segs[i].seg_len-hdr_len, \ - &ui1, &ui2); \ - length += segs[i].seg_len - hdr_len; \ - } else { \ - memcpy( buff->addr, \ - segs[i].seg_addr.pval, \ - segs[i].seg_len ); \ - } \ - } \ - frag->csum = csum; \ -} while(0) - - -#define MCA_PML_DR_RECV_FRAG_RETURN(frag) \ -do { \ - size_t i; \ - \ - /* return buffers */ \ - for(i=0; inum_segments; i++) { \ - OMPI_FREE_LIST_RETURN(&mca_pml_dr.buffers, \ - (ompi_free_list_item_t*)frag->buffers[i]); \ - } \ - frag->num_segments = 0; \ - \ - /* return recv_frag */ \ - OMPI_FREE_LIST_RETURN(&mca_pml_dr.recv_frags, \ - (ompi_free_list_item_t*)frag); \ -} while(0) - - -/** - * Generate an ack to the peer. - */ - -void mca_pml_dr_recv_frag_ack( - mca_btl_base_module_t* btl, - mca_bml_base_endpoint_t* endpoint, - mca_pml_dr_common_hdr_t* hdr, - void* src_ptr, - uint64_t mask, - uint16_t len); - -/** - * Callback from BTL on receipt of a recv_frag. - */ - -void mca_pml_dr_recv_frag_callback( mca_btl_base_module_t *btl, - mca_btl_base_tag_t tag, - mca_btl_base_descriptor_t* descriptor, - void* cbdata); - -/** - * Match incoming recv_frags against posted receives. - * Supports out of order delivery. - * - * @param frag_header (IN) Header of received recv_frag. 
- * @param frag_desc (IN) Received recv_frag descriptor. - * @param match_made (OUT) Flag indicating wether a match was made. - * @param additional_matches (OUT) List of additional matches - * @return OMPI_SUCCESS or error status on failure. - */ -bool mca_pml_dr_recv_frag_match( mca_pml_dr_comm_t* comm, - mca_pml_dr_comm_proc_t* proc, - mca_btl_base_module_t* btl, - mca_pml_dr_match_hdr_t *hdr, - mca_btl_base_segment_t* segments, - size_t num_segments); - -END_C_DECLS -#endif - diff --git a/ompi/mca/pml/dr/pml_dr_recvreq.c b/ompi/mca/pml/dr/pml_dr_recvreq.c deleted file mode 100644 index e0024639b7..0000000000 --- a/ompi/mca/pml/dr/pml_dr_recvreq.c +++ /dev/null @@ -1,552 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "opal/util/crc.h" - -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/bml/bml.h" -#include "ompi/mca/btl/btl.h" -#include "pml_dr_comm.h" -#include "pml_dr_recvreq.h" -#include "pml_dr_recvfrag.h" -#include "pml_dr_sendreq.h" -#include "ompi/mca/bml/base/base.h" - - -/* - * this macro is needed for MATCH/RNDV headers, - * as we need to put the match back on the list if the checksum - * fails for later matching - */ -#define MCA_PML_DR_RECV_REQUEST_MATCH_ACK(btl,do_csum,recvreq,hdr,csum,bytes_received) \ -if(do_csum && csum != hdr->hdr_match.hdr_csum) { \ - /* failed the csum, put the request back on the list for \ - * matching later on retransmission \ - */ \ - if(recvreq->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { \ - mca_pml_dr_recv_request_match_wild(recvreq); \ - } else { \ - mca_pml_dr_recv_request_match_specific(recvreq); \ - } \ - mca_pml_dr_recv_frag_ack(btl, \ - recvreq->req_endpoint->bml_endpoint, \ - &hdr->hdr_common, \ - hdr->hdr_match.hdr_src_ptr.pval, \ - 0, 0); \ - MCA_PML_DR_DEBUG(0,(0, "%s:%d: [rank %d -> rank %d] " \ - "data checksum failed 0x%08x != 0x%08x\n", \ - __FILE__, __LINE__, \ - hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst, \ - csum, hdr->hdr_match.hdr_csum)); \ - bytes_received = bytes_delivered = 0; \ -} else if (recvreq->req_acked == false) { \ - MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d", \ - __FILE__, __LINE__, recvreq->req_vfrag0.vf_id)); \ - mca_pml_dr_recv_request_ack(btl, recvreq, &hdr->hdr_common, \ - hdr->hdr_match.hdr_src_ptr, bytes_received, 1); \ -} - - -static mca_pml_dr_recv_frag_t* mca_pml_dr_recv_request_match_specific_proc( - mca_pml_dr_recv_request_t* request, mca_pml_dr_comm_proc_t* proc); - - -static int mca_pml_dr_recv_request_free(struct ompi_request_t** request) -{ - mca_pml_dr_recv_request_t* recvreq = *(mca_pml_dr_recv_request_t**)request; - assert( false == recvreq->req_recv.req_base.req_free_called ); - - OPAL_THREAD_LOCK(&ompi_request_lock); - recvreq->req_recv.req_base.req_free_called = true; - if( true == recvreq->req_recv.req_base.req_pml_complete ) { - MCA_PML_DR_RECV_REQUEST_RETURN( recvreq ); - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - *request = MPI_REQUEST_NULL; - return OMPI_SUCCESS; -} - -static 
int mca_pml_dr_recv_request_cancel(struct ompi_request_t* ompi_request, int complete) -{ - mca_pml_dr_recv_request_t* request = (mca_pml_dr_recv_request_t*)ompi_request; - mca_pml_dr_comm_t* comm = request->req_recv.req_base.req_comm->c_pml_comm; - - if( true == ompi_request->req_complete ) { /* way to late to cancel this one */ - return OMPI_SUCCESS; - } - - /* The rest should be protected behind the match logic lock */ - OPAL_THREAD_LOCK(&comm->matching_lock); - if( OMPI_ANY_TAG == ompi_request->req_status.MPI_TAG ) { /* the match has not been already done */ - if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) { - opal_list_remove_item( &comm->wild_receives, (opal_list_item_t*)request ); - } else { - mca_pml_dr_comm_proc_t* proc = comm->procs + request->req_recv.req_base.req_peer; - opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request); - } - } - OPAL_THREAD_UNLOCK(&comm->matching_lock); - - OPAL_THREAD_LOCK(&ompi_request_lock); - ompi_request->req_status._cancelled = true; - /* This macro will set the req_complete to true so the MPI Test/Wait* functions - * on this request will be able to complete. As the status is marked as - * cancelled the cancel state will be detected. - */ - ompi_request_complete(ompi_request, true); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - return OMPI_SUCCESS; -} - -static void mca_pml_dr_recv_request_construct(mca_pml_dr_recv_request_t* request) -{ - OBJ_CONSTRUCT(&request->req_vfrag0, mca_pml_dr_vfrag_t); - OBJ_CONSTRUCT(&request->req_vfrags, opal_list_t); - - request->req_vfrag0.vf_len = 1; - request->req_vfrag0.vf_mask = 1; - request->req_vfrag0.vf_recv.pval = request; - request->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV; - request->req_recv.req_base.req_ompi.req_free = mca_pml_dr_recv_request_free; - request->req_recv.req_base.req_ompi.req_cancel = mca_pml_dr_recv_request_cancel; -} - -static void mca_pml_dr_recv_request_destruct(mca_pml_dr_recv_request_t* request) -{ - OBJ_DESTRUCT(&request->req_vfrag0); - OBJ_DESTRUCT(&request->req_vfrags); -} - - -OBJ_CLASS_INSTANCE( - mca_pml_dr_recv_request_t, - mca_pml_base_recv_request_t, - mca_pml_dr_recv_request_construct, - mca_pml_dr_recv_request_destruct); - - -/* - * Release resources. - */ - -static void mca_pml_dr_ctl_completion( - mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status) -{ - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - mca_bml_base_free(bml_btl, des); -} - -/* - * Generate an ack to the peer. 
- */ - -void mca_pml_dr_recv_request_ack( - mca_btl_base_module_t* btl, - mca_pml_dr_recv_request_t* recvreq, - mca_pml_dr_common_hdr_t* hdr, - ompi_ptr_t src_ptr, - size_t vlen, - uint64_t mask) -{ - mca_btl_base_descriptor_t* des; - mca_bml_base_btl_t* bml_btl; - mca_pml_dr_ack_hdr_t* ack; - int rc; - bool do_csum; - - - /* use the same BTL for ACK's makes failover SANE */ - bml_btl = mca_bml_base_btl_array_find(&recvreq->req_endpoint->bml_endpoint->btl_eager, - btl); - /* allocate descriptor */ - do_csum = mca_pml_dr.enable_csum && - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_ack_hdr_t), MCA_BTL_DES_FLAGS_PRIORITY); - if(NULL == des) { - return; - } - - /* fill out header */ - ack = (mca_pml_dr_ack_hdr_t*)des->des_src->seg_addr.pval; - ack->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_ACK | hdr->hdr_type; - ack->hdr_common.hdr_flags = 0; - ack->hdr_common.hdr_dst = recvreq->req_endpoint->dst; - ack->hdr_common.hdr_src = recvreq->req_endpoint->src; - ack->hdr_common.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; - ack->hdr_common.hdr_vid = hdr->hdr_vid; - ack->hdr_vlen = vlen; - ack->hdr_vmask = mask; - ack->hdr_src_ptr = src_ptr; - ack->hdr_dst_ptr.pval = recvreq; - ack->hdr_common.hdr_csum = (uint16_t)(do_csum? - opal_csum(ack, sizeof(mca_pml_dr_ack_hdr_t)) : - OPAL_CSUM_ZERO); - - /* initialize descriptor */ - des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; - des->des_cbfunc = mca_pml_dr_ctl_completion; - - rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); - if(rc != OMPI_SUCCESS) { - mca_bml_base_free(bml_btl, des); - } else { - recvreq->req_acked = true; - } -} - - - /* - * Update the recv request status to reflect the number of bytes - * received and actually delivered to the application. 
- */ - -void mca_pml_dr_recv_request_progress( - mca_pml_dr_recv_request_t* recvreq, - mca_btl_base_module_t* btl, - mca_btl_base_segment_t* segments, - size_t num_segments) -{ - size_t bytes_received = 0; - size_t bytes_delivered = 0; - size_t data_offset = 0; - mca_pml_dr_hdr_t* hdr = (mca_pml_dr_hdr_t*)segments->seg_addr.pval; - size_t i; - uint32_t csum = OPAL_CSUM_ZERO; - uint64_t bit; - mca_pml_dr_vfrag_t* vfrag; - bool do_csum = mca_pml_dr.enable_csum && - (btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - - for(i=0; ihdr_common.hdr_type) { - case MCA_PML_DR_HDR_TYPE_MATCH: - - bytes_received -= sizeof(mca_pml_dr_match_hdr_t); - recvreq->req_vfrag0.vf_send = hdr->hdr_match.hdr_src_ptr; - MCA_PML_DR_RECV_REQUEST_BYTES_PACKED(recvreq, bytes_received); - - MCA_PML_DR_RECV_REQUEST_UNPACK( - recvreq, - segments, - num_segments, - sizeof(mca_pml_dr_match_hdr_t), - data_offset, - bytes_received, - bytes_delivered, - csum); - MCA_PML_DR_RECV_REQUEST_MATCH_ACK(btl,do_csum, recvreq,hdr,csum,bytes_received); - - break; - - case MCA_PML_DR_HDR_TYPE_RNDV: - - bytes_received -= sizeof(mca_pml_dr_rendezvous_hdr_t); - recvreq->req_vfrag0.vf_send = hdr->hdr_match.hdr_src_ptr; - MCA_PML_DR_RECV_REQUEST_BYTES_PACKED(recvreq, hdr->hdr_rndv.hdr_msg_length); - MCA_PML_DR_RECV_REQUEST_UNPACK( - recvreq, - segments, - num_segments, - sizeof(mca_pml_dr_rendezvous_hdr_t), - data_offset, - bytes_received, - bytes_delivered, - csum); - MCA_PML_DR_RECV_REQUEST_MATCH_ACK(btl,do_csum, recvreq,hdr,csum,bytes_received); - - break; - - case MCA_PML_DR_HDR_TYPE_FRAG: - bit = ((uint64_t)1 << hdr->hdr_frag.hdr_frag_idx); - MCA_PML_DR_RECV_REQUEST_VFRAG_LOOKUP(recvreq, &hdr->hdr_frag, vfrag); - if(vfrag->vf_ack & bit) { - if(vfrag->vf_ack == vfrag->vf_mask) { - MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d", - __FILE__, __LINE__, vfrag->vf_id)); - mca_pml_dr_recv_request_ack(btl, - recvreq, &hdr->hdr_common, - hdr->hdr_frag.hdr_src_ptr, - vfrag->vf_size, - vfrag->vf_mask); - } - return; - } - bytes_received -= sizeof(mca_pml_dr_frag_hdr_t); - data_offset = hdr->hdr_frag.hdr_frag_offset; - - MCA_PML_DR_RECV_REQUEST_UNPACK( - recvreq, - segments, - num_segments, - sizeof(mca_pml_dr_frag_hdr_t), - data_offset, - bytes_received, - bytes_delivered, - csum); - - /* update the mask to show that this vfrag was received, - * note that it might still fail the checksum though - */ - vfrag->vf_pending |= bit; - if(!do_csum || csum == hdr->hdr_frag.hdr_frag_csum) { - /* this part of the vfrag passed the checksum, - mark it so that we ack it after receiving the - entire vfrag */ - vfrag->vf_ack |= bit; - if((vfrag->vf_pending & vfrag->vf_mask) == vfrag->vf_mask) { - /* we have received all the pieces of the vfrag, ack - everything that passed the checksum */ - ompi_seq_tracker_insert(&recvreq->req_endpoint->seq_recvs, vfrag->vf_id); - MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d", - __FILE__, __LINE__, vfrag->vf_id)); - mca_pml_dr_recv_request_ack(btl, recvreq, &hdr->hdr_common, - hdr->hdr_frag.hdr_src_ptr, - vfrag->vf_size, vfrag->vf_mask); - } - } else { - MCA_PML_DR_DEBUG(0,(0, "%s:%d received corrupted fragment 0x%08x != 0x%08x\n", - __FILE__,__LINE__,csum,hdr->hdr_frag.hdr_frag_csum)); - bytes_received = bytes_delivered = 0; - } - break; - - default: - break; - } - - /* check completion status */ - OPAL_THREAD_LOCK(&ompi_request_lock); - recvreq->req_bytes_received += bytes_received; - recvreq->req_bytes_delivered += bytes_delivered; - if (recvreq->req_bytes_received == recvreq->req_recv.req_bytes_packed) { - 
MCA_PML_DR_RECV_REQUEST_PML_COMPLETE(recvreq); - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); -} - - -/** - * Handle completion of a probe request - */ - -void mca_pml_dr_recv_request_matched_probe( - mca_pml_dr_recv_request_t* recvreq, - mca_btl_base_module_t* btl, - mca_btl_base_segment_t* segments, - size_t num_segments) -{ - size_t bytes_packed = 0; - mca_pml_dr_hdr_t* hdr = (mca_pml_dr_hdr_t*)segments->seg_addr.pval; - size_t i; - - switch(hdr->hdr_common.hdr_type) { - case MCA_PML_DR_HDR_TYPE_MATCH: - - for(i=0; ihdr_rndv.hdr_msg_length; - break; - } - - /* check completion status */ - OPAL_THREAD_LOCK(&ompi_request_lock); - recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_match.hdr_tag; - recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = recvreq->req_proc->comm_rank; - recvreq->req_bytes_received = bytes_packed; - recvreq->req_bytes_delivered = bytes_packed; - - ompi_request_completed++; - if(ompi_request_waiting) { - opal_condition_broadcast(&ompi_request_cond); - } - - /* mark probe request completed */ - MCA_PML_DR_RECV_REQUEST_PML_COMPLETE(recvreq); - OPAL_THREAD_UNLOCK(&ompi_request_lock); -} - -/* - * This routine is used to match a posted receive when the source process - * is specified. -*/ - -void mca_pml_dr_recv_request_match_specific(mca_pml_dr_recv_request_t* request) -{ - mca_pml_dr_comm_t* comm = request->req_recv.req_base.req_comm->c_pml_comm; - mca_pml_dr_comm_proc_t* proc = comm->procs + request->req_recv.req_base.req_peer; - mca_pml_dr_recv_frag_t* frag; - - /* check for a specific match */ - OPAL_THREAD_LOCK(&comm->matching_lock); - - /* assign sequence number */ - request->req_recv.req_base.req_sequence = comm->recv_sequence++; - - if (opal_list_get_size(&proc->unexpected_frags) > 0 && - (frag = mca_pml_dr_recv_request_match_specific_proc(request, proc)) != NULL) { - /* has the request already been matched */ - if(frag->hdr.hdr_common.hdr_type == MCA_PML_DR_HDR_TYPE_MATCH) - request->req_acked = true; - MCA_PML_DR_RECV_REQUEST_MATCHED(request,comm,proc,&frag->hdr.hdr_match); - OPAL_THREAD_UNLOCK(&comm->matching_lock); - - if( !((MCA_PML_REQUEST_IPROBE == request->req_recv.req_base.req_type) || - (MCA_PML_REQUEST_PROBE == request->req_recv.req_base.req_type)) ) { - mca_pml_dr_recv_request_progress(request,frag->btl,frag->segments,frag->num_segments); - MCA_PML_DR_RECV_FRAG_RETURN(frag); - } else { - mca_pml_dr_recv_request_matched_probe(request,frag->btl,frag->segments,frag->num_segments); - } - return; /* match found */ - } - - /* We didn't find any matches. Record this irecv so we can match - * it when the message comes in. - */ - if(request->req_recv.req_base.req_type != MCA_PML_REQUEST_IPROBE) { - opal_list_append(&proc->specific_receives, (opal_list_item_t*)request); - } - OPAL_THREAD_UNLOCK(&comm->matching_lock); -} - - -/* - * this routine is used to try and match a wild posted receive - where - * wild is determined by the value assigned to the source process -*/ - -void mca_pml_dr_recv_request_match_wild(mca_pml_dr_recv_request_t* request) -{ - mca_pml_dr_comm_t* comm = request->req_recv.req_base.req_comm->c_pml_comm; - mca_pml_dr_comm_proc_t* proc = comm->procs; - size_t proc_count = comm->num_procs; - size_t i; - - /* - * Loop over all the outstanding messages to find one that matches. - * There is an outer loop over lists of messages from each - * process, then an inner loop over the messages from the - * process. 
- */ - OPAL_THREAD_LOCK(&comm->matching_lock); - - /* assign sequence number */ - request->req_recv.req_base.req_sequence = comm->recv_sequence++; - - for (i = 0; i < proc_count; i++) { - mca_pml_dr_recv_frag_t* frag; - - /* continue if no frags to match */ - if (opal_list_get_size(&proc->unexpected_frags) == 0) { - proc++; - continue; - } - - /* loop over messages from the current proc */ - if ((frag = mca_pml_dr_recv_request_match_specific_proc(request, proc)) != NULL) { - /* has the request already been matched */ - if(frag->hdr.hdr_common.hdr_type == MCA_PML_DR_HDR_TYPE_MATCH) - request->req_acked = true; - MCA_PML_DR_RECV_REQUEST_MATCHED(request,comm,proc,&frag->hdr.hdr_match); - OPAL_THREAD_UNLOCK(&comm->matching_lock); - if( !((MCA_PML_REQUEST_IPROBE == request->req_recv.req_base.req_type) || - (MCA_PML_REQUEST_PROBE == request->req_recv.req_base.req_type)) ) { - mca_pml_dr_recv_request_progress(request,frag->btl,frag->segments,frag->num_segments); - MCA_PML_DR_RECV_FRAG_RETURN(frag); - } else { - mca_pml_dr_recv_request_matched_probe(request,frag->btl,frag->segments,frag->num_segments); - } - return; /* match found */ - } - proc++; - } - - /* We didn't find any matches. Record this irecv so we can match to - * it when the message comes in. - */ - - if(request->req_recv.req_base.req_type != MCA_PML_REQUEST_IPROBE) - opal_list_append(&comm->wild_receives, (opal_list_item_t*)request); - OPAL_THREAD_UNLOCK(&comm->matching_lock); -} - - -/* - * this routine tries to match a posted receive. If a match is found, - * it places the request in the appropriate matched receive list. This - * function has to be called with the communicator matching lock held. -*/ - -static mca_pml_dr_recv_frag_t* mca_pml_dr_recv_request_match_specific_proc( - mca_pml_dr_recv_request_t* request, - mca_pml_dr_comm_proc_t* proc) -{ - opal_list_t* unexpected_frags = &proc->unexpected_frags; - mca_pml_dr_recv_frag_t* frag; - mca_pml_dr_match_hdr_t* hdr; - int tag = request->req_recv.req_base.req_tag; - - if( OMPI_ANY_TAG == tag ) { - for (frag = (mca_pml_dr_recv_frag_t*)opal_list_get_first(unexpected_frags); - frag != (mca_pml_dr_recv_frag_t*)opal_list_get_end(unexpected_frags); - frag = (mca_pml_dr_recv_frag_t*)opal_list_get_next(frag)) { - hdr = &(frag->hdr.hdr_match); - - /* check first frag - we assume that process matching has been done already */ - if( hdr->hdr_tag >= 0 ) { - goto find_fragment; - } - } - } else { - for (frag = (mca_pml_dr_recv_frag_t*)opal_list_get_first(unexpected_frags); - frag != (mca_pml_dr_recv_frag_t*)opal_list_get_end(unexpected_frags); - frag = (mca_pml_dr_recv_frag_t*)opal_list_get_next(frag)) { - hdr = &(frag->hdr.hdr_match); - - /* check first frag - we assume that process matching has been done already */ - if ( tag == hdr->hdr_tag ) { - /* we assume that the tag is correct from MPI point of view (ie. >= 0 ) */ - goto find_fragment; - } - } - } - return NULL; - find_fragment: - if( !((MCA_PML_REQUEST_IPROBE == request->req_recv.req_base.req_type) || - (MCA_PML_REQUEST_PROBE == request->req_recv.req_base.req_type)) ) { - opal_list_remove_item(unexpected_frags, (opal_list_item_t*)frag); - frag->request = request; - } - return frag; -} - - diff --git a/ompi/mca/pml/dr/pml_dr_recvreq.h b/ompi/mca/pml/dr/pml_dr_recvreq.h deleted file mode 100644 index 1a057b3969..0000000000 --- a/ompi/mca/pml/dr/pml_dr_recvreq.h +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. 
All rights reserved. - * Copyright (c) 2004-2010 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef OMPI_PML_DR_RECV_REQUEST_H -#define OMPI_PML_DR_RECV_REQUEST_H - -#include "ompi_config.h" -#include "ompi/mca/mpool/base/base.h" -#include "ompi/mca/pml/base/pml_base_recvreq.h" - -#include "pml_dr.h" -#include "pml_dr_hdr.h" -#include "pml_dr_vfrag.h" -#include "pml_dr_comm.h" - -BEGIN_C_DECLS - - -struct mca_pml_dr_recv_request_t { - mca_pml_base_recv_request_t req_recv; - size_t req_bytes_received; - size_t req_bytes_delivered; - bool req_acked; - - /* filled in after match */ - struct mca_pml_dr_comm_proc_t* req_proc; - struct mca_pml_dr_endpoint_t* req_endpoint; - opal_mutex_t* req_mutex; - - /* vfrag state */ - mca_pml_dr_vfrag_t *req_vfrag; - mca_pml_dr_vfrag_t req_vfrag0; - opal_list_t req_vfrags; -}; -typedef struct mca_pml_dr_recv_request_t mca_pml_dr_recv_request_t; - - -OBJ_CLASS_DECLARATION(mca_pml_dr_recv_request_t); - - -/** - * Allocate a recv request from the modules free list. - * - * @param rc (OUT) OMPI_SUCCESS or error status on failure. - * @return Receive request. - */ -#define MCA_PML_DR_RECV_REQUEST_ALLOC(recvreq, rc) \ -do { \ - ompi_free_list_item_t* item; \ - rc = OMPI_SUCCESS; \ - OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, item, rc); \ - recvreq = (mca_pml_dr_recv_request_t*)item; \ -} while(0) - - -/** - * Initialize a receive request with call parameters. - * - * @param request (IN) Receive request. - * @param addr (IN) User buffer. - * @param count (IN) Number of elements of indicated datatype. - * @param datatype (IN) User defined datatype. - * @param src (IN) Source rank w/in the communicator. - * @param tag (IN) User defined tag. - * @param comm (IN) Communicator. - * @param persistent (IN) Is this a ersistent request. - */ -#define MCA_PML_DR_RECV_REQUEST_INIT( \ - request, \ - addr, \ - count, \ - datatype, \ - src, \ - tag, \ - comm, \ - persistent) \ -do { \ - MCA_PML_BASE_RECV_REQUEST_INIT( \ - &(request)->req_recv, \ - addr, \ - count, \ - datatype, \ - src, \ - tag, \ - comm, \ - persistent); \ -} while(0) - -/** - * Mark a recv request complete. - * - * @param request (IN) Receive request. 
- */ - -#define MCA_PML_DR_RECV_REQUEST_PML_COMPLETE(recvreq) \ -do { \ - ompi_free_list_item_t* item; \ - assert( false == recvreq->req_recv.req_base.req_pml_complete ); \ - OPAL_THREAD_LOCK((recvreq)->req_mutex); \ - while(NULL != (item = (ompi_free_list_item_t*) \ - opal_list_remove_first(&(recvreq)->req_vfrags))) { \ - OMPI_FREE_LIST_RETURN(&mca_pml_dr.vfrags, item); \ - } \ - OPAL_THREAD_UNLOCK((recvreq)->req_mutex); \ - \ - opal_list_remove_item(&(recvreq)->req_proc->matched_receives, \ - (opal_list_item_t*)(recvreq)); \ - \ - /* initialize request status */ \ - recvreq->req_recv.req_base.req_pml_complete = true; \ - if (recvreq->req_bytes_received > recvreq->req_bytes_delivered) { \ - recvreq->req_recv.req_base.req_ompi.req_status._ucount = \ - recvreq->req_bytes_delivered; \ - recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR = \ - MPI_ERR_TRUNCATE; \ - } else { \ - recvreq->req_recv.req_base.req_ompi.req_status._ucount = \ - recvreq->req_bytes_received; \ - } \ - ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \ - \ - if( true == recvreq->req_recv.req_base.req_free_called ) { \ - MCA_PML_DR_RECV_REQUEST_RETURN( recvreq ); \ - } \ -} while(0) - - -/** - * Return a recv request to the modules free list. - * - * @param request (IN) Receive request. - */ -#define MCA_PML_DR_RECV_REQUEST_RETURN(recvreq) \ -do { \ - /* decrement reference counts */ \ - MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \ - OMPI_FREE_LIST_RETURN(&mca_pml_base_recv_requests, \ - (ompi_free_list_item_t*)(recvreq)); \ -} while(0) - -/** - * Attempt to match the request against the unexpected fragment list - * for all source ranks w/in the communicator. - * - * @param request (IN) Request to match. - */ -void mca_pml_dr_recv_request_match_wild(mca_pml_dr_recv_request_t* request); - -/** - * Attempt to match the request against the unexpected fragment list - * for a specific source rank. - * - * @param request (IN) Request to match. - */ -void mca_pml_dr_recv_request_match_specific(mca_pml_dr_recv_request_t* request); - - -/** - * Ack a matched request. - */ -void mca_pml_dr_recv_request_ack( - mca_btl_base_module_t* blt, - mca_pml_dr_recv_request_t* recvreq, - mca_pml_dr_common_hdr_t* hdr, - ompi_ptr_t src_ptr, - size_t vlen, - uint64_t vmask); - -/** - * Start an initialized request. - * - * @param request Receive request. - * @return OMPI_SUCESS or error status on failure. - */ -#define MCA_PML_DR_RECV_REQUEST_START(request) \ -do { \ - /* init/re-init the request */ \ - (request)->req_bytes_received = 0; \ - (request)->req_bytes_delivered = 0; \ - (request)->req_acked = false; \ - (request)->req_recv.req_base.req_pml_complete = false; \ - (request)->req_recv.req_base.req_ompi.req_complete = false; \ - (request)->req_recv.req_base.req_ompi.req_state = OMPI_REQUEST_ACTIVE; \ - (request)->req_vfrag = &(request)->req_vfrag0; \ - (request)->req_proc = NULL; \ - (request)->req_endpoint = NULL; \ - \ - /* always set the req_status.MPI_TAG to ANY_TAG before starting the \ - * request. This field is used if cancelled to find out if the request \ - * has been matched or not. 
\ - */ \ - (request)->req_recv.req_base.req_ompi.req_status.MPI_TAG = OMPI_ANY_TAG; \ - (request)->req_recv.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \ - (request)->req_recv.req_base.req_ompi.req_status._cancelled = 0; \ - \ - /* attempt to match unexpected recv */ \ - if((request)->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { \ - mca_pml_dr_recv_request_match_wild(request); \ - } else { \ - mca_pml_dr_recv_request_match_specific(request); \ - } \ -} while (0) - - -/** - * Initialize request when match is made - */ - -#define MCA_PML_DR_RECV_REQUEST_MATCHED(request,comm,proc,hdr) \ -do { \ - (request)->req_mutex = &comm->matching_lock; \ - (request)->req_proc = proc; \ - (request)->req_endpoint = (mca_pml_dr_endpoint_t*)proc->ompi_proc->proc_pml; \ - (request)->req_recv.req_base.req_ompi.req_status.MPI_TAG = (hdr)->hdr_tag; \ - (request)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = proc->comm_rank; \ - (request)->req_vfrag0.vf_id = (hdr)->hdr_common.hdr_vid; \ - opal_list_append(&proc->matched_receives, (opal_list_item_t*)request); \ - ompi_seq_tracker_insert(&request->req_endpoint->seq_recvs_matched, \ - (hdr)->hdr_common.hdr_vid); \ -} while(0) - - -/** - * Setup convertor if message length is non-zero - */ - -#define MCA_PML_DR_RECV_REQUEST_BYTES_PACKED(request, bytes_packed) \ -do { \ - bool do_csum = mca_pml_dr.enable_csum && \ - (request->req_endpoint->bml_endpoint->btl_flags_or & MCA_BTL_FLAGS_NEED_CSUM); \ - (request)->req_recv.req_bytes_packed = bytes_packed; \ - if((request)->req_recv.req_bytes_packed != 0) { \ - ompi_proc_t *proc = (request)->req_proc->ompi_proc; \ - opal_convertor_copy_and_prepare_for_recv( proc->proc_convertor, \ - &((request)->req_recv.req_base.req_datatype->super), \ - (request)->req_recv.req_base.req_count, \ - (request)->req_recv.req_base.req_addr, \ - (do_csum ? CONVERTOR_WITH_CHECKSUM: 0), \ - &(request)->req_recv.req_base.req_convertor ); \ - } \ -} while (0) - - -/** - * - */ - -#define MCA_PML_DR_RECV_REQUEST_UNPACK( \ - request, \ - segments, \ - num_segments, \ - seg_offset, \ - data_offset, \ - bytes_received, \ - bytes_delivered, \ - csum) \ -do { \ - if(request->req_recv.req_bytes_packed > 0) { \ - struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS]; \ - uint32_t iov_count = 0; \ - size_t max_data = bytes_received; \ - size_t n, offset = seg_offset; \ - bool do_csum = mca_pml_dr.enable_csum && \ - (request->req_endpoint->bml_endpoint->btl_flags_or & MCA_BTL_FLAGS_NEED_CSUM); \ - \ - for(n=0; n= segment->seg_len) { \ - offset -= segment->seg_len; \ - } else { \ - iov[iov_count].iov_len = segment->seg_len - offset; \ - iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + offset); \ - offset = 0; \ - iov_count++; \ - } \ - } \ - opal_convertor_set_position( &(request->req_recv.req_base.req_convertor), \ - &data_offset); \ - assert((request->req_recv.req_base.req_convertor.flags & CONVERTOR_COMPLETED) == 0); \ - opal_convertor_unpack( &(request)->req_recv.req_base.req_convertor, \ - iov, \ - &iov_count, \ - &max_data); \ - bytes_delivered = max_data; \ - if(bytes_received && !bytes_delivered) assert(0); \ - csum = (do_csum ? 
\ - request->req_recv.req_base.req_convertor.checksum : OPAL_CSUM_ZERO); \ - } else { \ - bytes_delivered = 0; \ - csum = OPAL_CSUM_ZERO; \ - } \ -} while (0) - - -/** - * - */ - -void mca_pml_dr_recv_request_progress( - mca_pml_dr_recv_request_t* req, - struct mca_btl_base_module_t* btl, - mca_btl_base_segment_t* segments, - size_t num_segments); - -/** - * - */ - -void mca_pml_dr_recv_request_matched_probe( - mca_pml_dr_recv_request_t* req, - struct mca_btl_base_module_t* btl, - mca_btl_base_segment_t* segments, - size_t num_segments); - -/** - * - */ - -void mca_pml_dr_recv_request_schedule( - mca_pml_dr_recv_request_t* req); - -/** - * Look for matched receive. - * Must be called w/ matching lock held. - */ - -static inline struct mca_pml_dr_recv_request_t* mca_pml_dr_comm_proc_check_matched( - mca_pml_dr_comm_proc_t* dr_proc, - uint32_t vfrag_id) -{ - opal_list_item_t* item; - for(item = opal_list_get_first(&dr_proc->matched_receives); - item != opal_list_get_end(&dr_proc->matched_receives); - item = opal_list_get_next(item)) { - struct mca_pml_dr_recv_request_t* recvreq = (struct mca_pml_dr_recv_request_t*)item; - if(recvreq->req_vfrag0.vf_id == vfrag_id) - return recvreq; - } - return NULL; -} - -/* - * - */ - -#define MCA_PML_DR_RECV_REQUEST_VFRAG_LOOKUP(recvreq,hdr,vfrag) \ -do { \ - if((recvreq)->req_vfrag->vf_id == (hdr)->hdr_common.hdr_vid) { \ - vfrag = (recvreq)->req_vfrag; \ - } else { \ - opal_list_item_t* item; \ - int rc; \ - \ - vfrag = NULL; \ - OPAL_THREAD_LOCK(recvreq->req_mutex); \ - for(item = opal_list_get_first(&(recvreq)->req_vfrags); \ - item != opal_list_get_end(&(recvreq)->req_vfrags); \ - item = opal_list_get_next(item)) { \ - mca_pml_dr_vfrag_t* vf = (mca_pml_dr_vfrag_t*)item; \ - if(vf->vf_id == (hdr)->hdr_common.hdr_vid) { \ - vfrag = vf; \ - break; \ - } \ - } \ - if(NULL == vfrag) { \ - MCA_PML_DR_VFRAG_ALLOC(vfrag,rc); \ - if(NULL != vfrag) { \ - MCA_PML_DR_VFRAG_INIT(vfrag); \ - (vfrag)->vf_id = (hdr)->hdr_common.hdr_vid; \ - (vfrag)->vf_len = (hdr)->hdr_vlen; \ - if((hdr)->hdr_vlen == 64) { \ - (vfrag)->vf_mask = ~(uint64_t)0; \ - } else { \ - (vfrag)->vf_mask = (((uint64_t)1 << (hdr)->hdr_vlen)-1); \ - } \ - opal_list_append(&(recvreq)->req_vfrags, (opal_list_item_t*)vfrag); \ - (recvreq)->req_vfrag = vfrag; \ - } \ - } \ - OPAL_THREAD_UNLOCK(recvreq->req_mutex); \ - } \ -} while(0) - -END_C_DECLS -#endif - diff --git a/ompi/mca/pml/dr/pml_dr_sendreq.c b/ompi/mca/pml/dr/pml_dr_sendreq.c deleted file mode 100644 index f653add210..0000000000 --- a/ompi/mca/pml/dr/pml_dr_sendreq.c +++ /dev/null @@ -1,1169 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Mellanox Technologies. - * All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ - -#include "ompi_config.h" -#include "opal/util/crc.h" -#include "ompi/constants.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/btl/btl.h" -#include "ompi/class/ompi_seq_tracker.h" -#include "pml_dr.h" -#include "pml_dr_hdr.h" -#include "pml_dr_sendreq.h" -#include "pml_dr_recvreq.h" -#include "ompi/mca/bml/base/base.h" - - -/* - * The free call mark the final stage in a request life-cycle. Starting from this - * point the request is completed at both PML and user level, and can be used - * for others p2p communications. Therefore, in the case of the DR PML it should - * be added to the free request list. - */ -static int mca_pml_dr_send_request_free(struct ompi_request_t** request) -{ - mca_pml_dr_send_request_t* sendreq = *(mca_pml_dr_send_request_t**)request; - - assert( false == sendreq->req_send.req_base.req_free_called ); - OPAL_THREAD_LOCK(&ompi_request_lock); - sendreq->req_send.req_base.req_free_called = true; - if( true == sendreq->req_send.req_base.req_pml_complete ) { - MCA_PML_DR_SEND_REQUEST_RETURN( sendreq ); - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - *request = MPI_REQUEST_NULL; - return OMPI_SUCCESS; -} - -static int mca_pml_dr_send_request_cancel(struct ompi_request_t* request, int complete) -{ - /* we dont cancel send requests by now */ - return OMPI_SUCCESS; -} - -static void mca_pml_dr_send_request_construct(mca_pml_dr_send_request_t* req) -{ - OBJ_CONSTRUCT(&req->req_vfrag0, mca_pml_dr_vfrag_t); - OBJ_CONSTRUCT(&req->req_retrans, opal_list_t); - - req->req_vfrag0.vf_len = 1; - req->req_vfrag0.vf_mask = 1; - req->req_vfrag0.vf_send.pval = req; - req->req_send.req_base.req_type = MCA_PML_REQUEST_SEND; - req->req_send.req_base.req_ompi.req_free = mca_pml_dr_send_request_free; - req->req_send.req_base.req_ompi.req_cancel = mca_pml_dr_send_request_cancel; -} - -static void mca_pml_dr_send_request_destruct(mca_pml_dr_send_request_t* req) -{ - OBJ_DESTRUCT(&req->req_vfrag0); - OBJ_DESTRUCT(&req->req_retrans); -} - - -OBJ_CLASS_INSTANCE( - mca_pml_dr_send_request_t, - mca_pml_base_send_request_t, - mca_pml_dr_send_request_construct, - mca_pml_dr_send_request_destruct); - -/** - * Handle error status on local completion - */ - -static void mca_pml_dr_error_completion( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor, - int status) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata; - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - - switch(status) { - case OMPI_ERR_UNREACH: - /** - * peer is no longer reachable through this btl - */ - mca_bml.bml_del_proc_btl(sendreq->req_proc->ompi_proc, btl); - break; - - case OMPI_ERR_FATAL: - /** - * btl is no longer available - */ - mca_bml.bml_del_btl(btl); - break; - default: - ompi_rte_abort(-1, NULL); - break; - } - - /* update pending counts */ - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1); - OPAL_THREAD_ADD64(&vfrag->vf_pending,-1); - - /* reset vfrag state - select new BTL */ - mca_pml_dr_vfrag_reset(vfrag); - - /* reschedule vfrag */ - mca_pml_dr_vfrag_reschedule(vfrag); -} - -/** - * Completion of a short message - nothing left to schedule. 
- */ - -static void mca_pml_dr_match_completion( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor, - int status) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata; - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - - /* kill pending wdog timer */ - MCA_PML_DR_VFRAG_WDOG_STOP(vfrag); - - /* free any descriptor used to retransmit */ - if(descriptor != sendreq->req_descriptor) { - mca_bml_base_free( (mca_bml_base_btl_t*)descriptor->des_context, descriptor); - } - - /* check completion status */ - if(OMPI_SUCCESS != status) { - mca_pml_dr_error_completion(btl,ep,descriptor,status); - return; - } - - /* wait for local completion */ - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1); - if(OPAL_THREAD_ADD64(&vfrag->vf_pending,-1) > 0) - return; - - /* wait for positive ack to complete request */ - OPAL_THREAD_LOCK(&ompi_request_lock); - if(vfrag->vf_ack == vfrag->vf_mask) { - - /* return descriptor */ - if(NULL != sendreq->req_descriptor) { - if(NULL != sendreq->req_descriptor->des_context) { - mca_bml_base_free((mca_bml_base_btl_t*)sendreq->req_descriptor->des_context, sendreq->req_descriptor); - } - sendreq->req_descriptor = NULL; - } - - /* update statistics and complete */ - sendreq->req_bytes_delivered = sendreq->req_send.req_bytes_packed; - MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n", - __FILE__, __LINE__, vfrag->vf_id)); - - ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id); - MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq); - - /* on negative ack need to retransmit */ - } else if(vfrag->vf_state & MCA_PML_DR_VFRAG_NACKED) { - MCA_PML_DR_VFRAG_WDOG_START(vfrag); - MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag); - - /* start ack timer */ - } else { - MCA_PML_DR_VFRAG_ACK_START(vfrag); - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - /* check for pending requests */ - MCA_PML_DR_SEND_REQUEST_PROCESS_PENDING(); -} - -/* - * Completion of the first fragment of a long message that - * requires an acknowledgement - */ - -static void mca_pml_dr_rndv_completion( - mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor, - int status) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata; - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - bool schedule = false; - - /* kill pending wdog timer */ - MCA_PML_DR_VFRAG_WDOG_STOP(vfrag); - - /* free any descriptor used to retransmit */ - if(descriptor != sendreq->req_descriptor) { - mca_bml_base_free( (mca_bml_base_btl_t*)descriptor->des_context, descriptor ); - } - - /* check completion status */ - if(OMPI_SUCCESS != status) { - mca_pml_dr_error_completion(btl,ep,descriptor,status); - return; - } - - /* local completion */ - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1); - if(OPAL_THREAD_ADD64(&vfrag->vf_pending,-1) > 0) - return; - - /* wait for positive ack to complete request */ - OPAL_THREAD_LOCK(&ompi_request_lock); - if(vfrag->vf_ack == vfrag->vf_mask) { - if(sendreq->req_descriptor) { - if(NULL != sendreq->req_descriptor->des_context) { - mca_bml_base_free((mca_bml_base_btl_t*)sendreq->req_descriptor->des_context, sendreq->req_descriptor); - } - sendreq->req_descriptor = NULL; - } - - /* update statistics and complete */ - MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n", - __FILE__, __LINE__, 
vfrag->vf_id)); - - ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id); - if(sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) { - MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq); - } else { - schedule = true; - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); - if(schedule) { - mca_pml_dr_send_request_schedule(sendreq); - } - - /* on negative ack need to retransmit */ - } else if(vfrag->vf_state & MCA_PML_DR_VFRAG_NACKED) { - MCA_PML_DR_VFRAG_ACK_RESET(vfrag); - MCA_PML_DR_SEND_REQUEST_RNDV_PROBE(sendreq, vfrag); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - /* setup ack timer */ - } else { - OPAL_THREAD_UNLOCK(&ompi_request_lock); - MCA_PML_DR_VFRAG_ACK_START(vfrag); - } - - /* check for pending requests */ - MCA_PML_DR_SEND_REQUEST_PROCESS_PENDING(); -} - - -/** - * Completion of additional fragments of a large message - may need - * to schedule additional fragments. - */ - -static void mca_pml_dr_frag_completion( - mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor, - int status) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata; - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - mca_bml_base_btl_t* bml_btl = vfrag->bml_btl; - bool schedule = false; - - /* check completion status */ - if(OMPI_SUCCESS != status) { - mca_pml_dr_error_completion(btl,ep,descriptor,status); - return; - } - - /* have all pending frags completed for this vfrag? */ - OPAL_THREAD_LOCK(&ompi_request_lock); - vfrag->vf_pending -= 1; - if(vfrag->vf_pending == 0) { - MCA_PML_DR_VFRAG_WDOG_STOP(vfrag); - - /* has the vfrag already been acked */ - if (vfrag->vf_ack == vfrag->vf_mask) { - - sendreq->req_bytes_delivered += vfrag->vf_size; - assert(sendreq->req_bytes_delivered <= sendreq->req_send.req_bytes_packed); - - /* is this vfrag in the process of being retransmitted */ - if(vfrag->vf_state & MCA_PML_DR_VFRAG_RETRANS) { - opal_list_remove_item(&sendreq->req_retrans, (opal_list_item_t*)vfrag); - vfrag->vf_state &= ~MCA_PML_DR_VFRAG_RETRANS; - } - - /* record vfrag id to drop duplicate acks */ - MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n", - __FILE__, __LINE__, vfrag->vf_id)); - - ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id); - - /* return this vfrag */ - MCA_PML_DR_VFRAG_RETURN(vfrag); - - /* waiting on ack */ - } else if (vfrag->vf_idx == vfrag->vf_len) { - MCA_PML_DR_VFRAG_ACK_START(vfrag); - } - - /* if not reset the watchdog timer */ - } else { - MCA_PML_DR_VFRAG_WDOG_RESET(vfrag); - } - - /* are we done with this request ? */ - if(OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1) == 0 && - sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) { - MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq); - } else if (sendreq->req_send_offset < sendreq->req_send.req_bytes_packed || - opal_list_get_size(&sendreq->req_retrans)) { - schedule = true; - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - /* return the descriptor */ - mca_bml_base_free(bml_btl, descriptor); - - /* schedule remainder of message? */ - if(schedule) { - mca_pml_dr_send_request_schedule(sendreq); - } - - /* check for pending requests */ - MCA_PML_DR_SEND_REQUEST_PROCESS_PENDING(); -} - - - -/** - * Buffer the entire message and mark as complete. 
- */ - -int mca_pml_dr_send_request_start_buffered( - mca_pml_dr_send_request_t* sendreq, - mca_bml_base_btl_t* bml_btl, - size_t size) -{ - mca_btl_base_descriptor_t* descriptor; - mca_btl_base_segment_t* segment; - mca_pml_dr_hdr_t* hdr; - struct iovec iov; - unsigned int iov_count; - size_t max_data; - int rc; - uint32_t csum; - bool do_csum = mca_pml_dr.enable_csum && - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - /* allocate descriptor */ - mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_rendezvous_hdr_t) + size, - MCA_BTL_DES_FLAGS_PRIORITY); - if(NULL == descriptor) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - sendreq->req_descriptor = descriptor; /* hang on to this for later */ - segment = descriptor->des_src; - - /* pack the data into the BTL supplied buffer */ - iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + - sizeof(mca_pml_dr_rendezvous_hdr_t)); - iov.iov_len = size; - iov_count = 1; - max_data = size; - if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, - &iov, - &iov_count, - &max_data)) < 0) { - mca_bml_base_free(bml_btl, descriptor); - return rc; - } - csum = sendreq->req_send.req_base.req_convertor.checksum; - - /* update lengths */ - segment->seg_len = sizeof(mca_pml_dr_rendezvous_hdr_t) + max_data; - sendreq->req_vfrag0.vf_size = max_data; - sendreq->req_vfrag0.bml_btl = bml_btl; - sendreq->req_vfrag0.vf_state |= MCA_PML_DR_VFRAG_RNDV; - sendreq->req_vfrag0.vf_pending = 1; - - descriptor->des_cbfunc = mca_pml_dr_rndv_completion; - descriptor->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; - descriptor->des_cbdata = &sendreq->req_vfrag0; - - /* buffer the remainder of the message */ - rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq); - if(OMPI_SUCCESS != rc) { - mca_bml_base_free(bml_btl, descriptor); - return rc; - } - - iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)sendreq->req_send.req_addr) + max_data); - iov.iov_len = sendreq->req_send.req_bytes_packed - max_data; - - max_data = iov.iov_len; - if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, - &iov, - &iov_count, - &max_data)) < 0) { - mca_bml_base_free(bml_btl, descriptor); - return rc; - } - - /* build rendezvous header */ - hdr = (mca_pml_dr_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_csum = 0; - hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_RNDV; - hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst; - hdr->hdr_common.hdr_src = sendreq->req_endpoint->src; - hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; - hdr->hdr_match.hdr_csum = csum; - hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0; - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_common.hdr_csum = (do_csum ? 
- opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t)) : - OPAL_CSUM_ZERO); - - /* re-init convertor for packed data (it keep the flags) */ - opal_convertor_prepare_for_send( &sendreq->req_send.req_base.req_convertor, - &(ompi_mpi_byte.dt.super), - sendreq->req_send.req_bytes_packed, - sendreq->req_send.req_addr ); - - /* request is complete at mpi level */ - OPAL_THREAD_LOCK(&ompi_request_lock); - MCA_PML_DR_SEND_REQUEST_MPI_COMPLETE(sendreq); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - /* send */ - MCA_PML_DR_VFRAG_WDOG_START(&sendreq->req_vfrag0); - rc = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_PML); - if(OMPI_SUCCESS != rc) { - mca_bml_base_free(bml_btl, descriptor ); - } - return rc; -} - - -/** - * BTL requires "specially" allocated memory. Request a segment that - * is used for initial hdr and any eager data. - */ - -int mca_pml_dr_send_request_start_copy( - mca_pml_dr_send_request_t* sendreq, - mca_bml_base_btl_t* bml_btl, - size_t size) -{ - mca_btl_base_descriptor_t* descriptor; - mca_btl_base_segment_t* segment; - mca_pml_dr_hdr_t* hdr; - struct iovec iov; - unsigned int iov_count; - size_t max_data; - int rc; - bool do_csum = mca_pml_dr.enable_csum && - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - /* allocate descriptor */ - mca_bml_base_alloc(bml_btl, &descriptor, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_match_hdr_t) + size, - MCA_BTL_DES_FLAGS_PRIORITY); - if(NULL == descriptor) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - sendreq->req_descriptor = descriptor; /* hang on to this for later */ - segment = descriptor->des_src; - - /* pack the data into the supplied buffer */ - iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + sizeof(mca_pml_dr_match_hdr_t)); - iov.iov_len = size; - iov_count = 1; - max_data = size; - if(size > 0) { - if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, - &iov, - &iov_count, - &max_data)) < 0) { - mca_bml_base_free(bml_btl, descriptor); - return rc; - } - } - - /* build match header */ - hdr = (mca_pml_dr_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_csum = 0; - hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_MATCH; - hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst; - hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_common.hdr_src = sendreq->req_endpoint->src; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; - hdr->hdr_match.hdr_csum = (size > 0 && do_csum ? - sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO); - hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0; - hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id; - hdr->hdr_common.hdr_csum = (do_csum ? 
- opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t)) : - OPAL_CSUM_ZERO); - - /* vfrag status */ - sendreq->req_vfrag0.vf_size = max_data; - sendreq->req_vfrag0.bml_btl = bml_btl; - sendreq->req_vfrag0.vf_pending = 1; - - /* short message */ - descriptor->des_cbfunc = mca_pml_dr_match_completion; - descriptor->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; - descriptor->des_cbdata = &sendreq->req_vfrag0; - segment->seg_len = sizeof(mca_pml_dr_match_hdr_t) + max_data; - - /* signal request completion */ - OPAL_THREAD_LOCK(&ompi_request_lock); - MCA_PML_DR_SEND_REQUEST_MPI_COMPLETE(sendreq); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - /* send */ - MCA_PML_DR_VFRAG_WDOG_START(&sendreq->req_vfrag0); - rc = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_PML); - if(OMPI_SUCCESS != rc) { - mca_bml_base_free(bml_btl, descriptor ); - } - return rc; -} - -/** - * BTL can send directly from user buffer so allow the BTL - * to prepare the segment list. - */ - -int mca_pml_dr_send_request_start_prepare( - mca_pml_dr_send_request_t* sendreq, - mca_bml_base_btl_t* bml_btl, - size_t size) -{ - mca_btl_base_descriptor_t* descriptor; - mca_btl_base_segment_t* segment; - mca_pml_dr_hdr_t* hdr; - int rc; - bool do_csum = mca_pml_dr.enable_csum && - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - /* prepare descriptor */ - mca_bml_base_prepare_src( bml_btl, - NULL, - &sendreq->req_send.req_base.req_convertor, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_match_hdr_t), - &size, - MCA_BTL_DES_FLAGS_PRIORITY, - &descriptor ); - if(NULL == descriptor) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - sendreq->req_descriptor = descriptor; /* hang on to this for later */ - segment = descriptor->des_src; - - /* build match header */ - hdr = (mca_pml_dr_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_csum = 0; - hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_MATCH; - hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst; - hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_common.hdr_src = sendreq->req_endpoint->src; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; - hdr->hdr_match.hdr_csum = (size > 0 && do_csum ? - sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO); - hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0; - hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id; - hdr->hdr_common.hdr_csum = (do_csum ? - opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t)) : - OPAL_CSUM_ZERO); - - /* short message */ - descriptor->des_cbfunc = mca_pml_dr_match_completion; - descriptor->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; - descriptor->des_cbdata = &sendreq->req_vfrag0; - - /* vfrag state */ - sendreq->req_vfrag0.vf_size = size; - sendreq->req_vfrag0.bml_btl = bml_btl; - sendreq->req_vfrag0.vf_pending = 1; - - /* send */ - MCA_PML_DR_VFRAG_WDOG_START(&sendreq->req_vfrag0); - rc = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_PML); - if(OMPI_SUCCESS != rc) { - mca_bml_base_free(bml_btl, descriptor ); - } - return rc; -} - - -/** - * Rendezvous is required. Eager send up to - * the btls eager limit. 
- */ - -int mca_pml_dr_send_request_start_rndv( - mca_pml_dr_send_request_t* sendreq, - mca_bml_base_btl_t* bml_btl, - size_t size, - int flags) -{ - mca_btl_base_descriptor_t* des; - mca_btl_base_segment_t* segment; - mca_pml_dr_hdr_t* hdr; - int rc; - bool do_csum = mca_pml_dr.enable_csum && - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - - /* prepare descriptor */ - if(size == 0) { - mca_bml_base_alloc( bml_btl, - &des, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_rendezvous_hdr_t), - MCA_BTL_DES_FLAGS_PRIORITY ); - } else { - mca_bml_base_prepare_src( bml_btl, - NULL, - &sendreq->req_send.req_base.req_convertor, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_rendezvous_hdr_t), - &size, - MCA_BTL_DES_FLAGS_PRIORITY, - &des ); - } - - if(NULL == des) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - sendreq->req_descriptor = des; /*hang on to this for later */ - segment = des->des_src; - - /* build hdr */ - hdr = (mca_pml_dr_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = flags; - hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_RNDV; - hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst; - hdr->hdr_common.hdr_src = sendreq->req_endpoint->src; - hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id; - hdr->hdr_common.hdr_csum = 0; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; - hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0; - hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO; - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_common.hdr_csum = (do_csum ? - opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t)) : - OPAL_CSUM_ZERO); - - - /* first fragment of a long message */ - des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; - des->des_cbdata = &sendreq->req_vfrag0; - des->des_cbfunc = mca_pml_dr_rndv_completion; - - /* vfrag state */ - sendreq->req_vfrag0.vf_size = size; - sendreq->req_vfrag0.bml_btl = bml_btl; - sendreq->req_vfrag0.vf_state |= MCA_PML_DR_VFRAG_RNDV; - sendreq->req_vfrag0.vf_pending = 1; - - /* send */ - MCA_PML_DR_VFRAG_WDOG_START(&sendreq->req_vfrag0); - rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); - if(OMPI_SUCCESS != rc) { - mca_bml_base_free(bml_btl, des ); - } - return rc; -} - -/** - * Schedule pipeline of send descriptors for the given request, - * using send protocol. - */ - -int mca_pml_dr_send_request_schedule(mca_pml_dr_send_request_t* sendreq) -{ - /* - * Only allow one thread in this routine for a given request. - * However, we cannot block callers on a mutex, so simply keep track - * of the number of times the routine has been called and run through - * the scheduling logic once for every call. - */ - - mca_pml_dr_endpoint_t* endpoint = sendreq->req_endpoint; - assert(sendreq->req_vfrag0.vf_recv.pval != NULL); - if(OPAL_THREAD_ADD32(&sendreq->req_lock,1) == 1) { - do { - size_t bytes_remaining; - /* - * VFrags w/ nacks or that timed out - */ - while(opal_list_get_size(&sendreq->req_retrans) && - sendreq->req_pipeline_depth < mca_pml_dr.send_pipeline_depth) { - mca_pml_dr_vfrag_t* vfrag; - - OPAL_THREAD_LOCK(&ompi_request_lock); - vfrag = (mca_pml_dr_vfrag_t*)opal_list_get_first(&sendreq->req_retrans); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - if(NULL == vfrag) { - break; - } - - /* - * Retransmit fragments that have not been acked. 
- */ - while(vfrag->vf_idx < vfrag->vf_len && - sendreq->req_pipeline_depth < mca_pml_dr.send_pipeline_depth) { - if(((uint64_t)1 << vfrag->vf_idx) & ~vfrag->vf_ack) { - mca_bml_base_btl_t* bml_btl = vfrag->bml_btl; - mca_pml_dr_frag_hdr_t* hdr; - mca_btl_base_descriptor_t* des; - size_t offset_in_vfrag = vfrag->vf_max_send_size * vfrag->vf_idx; - size_t offset_in_msg = vfrag->vf_offset + offset_in_vfrag; - size_t size; - int rc; - bool do_csum = mca_pml_dr.enable_csum && - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - if(vfrag->vf_idx == vfrag->vf_len - 1) { - size = vfrag->vf_size - offset_in_vfrag; - } else { - size = vfrag->vf_max_send_size; - } - - /* pack into a descriptor */ - opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, &offset_in_msg); - mca_bml_base_prepare_src( bml_btl, - NULL, - &sendreq->req_send.req_base.req_convertor, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_frag_hdr_t), - &size, - 0, - &des ); - if(des == NULL) { - OPAL_THREAD_LOCK(&ompi_request_lock); - opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*)sendreq); - opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - break; - } - des->des_cbfunc = mca_pml_dr_frag_completion; - des->des_cbdata = vfrag; - - /* setup header */ - hdr = (mca_pml_dr_frag_hdr_t*)des->des_src->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_csum = 0; - hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_FRAG; - hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst; - hdr->hdr_common.hdr_vid = vfrag->vf_id; - hdr->hdr_common.hdr_src = sendreq->req_endpoint->src; - hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_vlen = vfrag->vf_len; - hdr->hdr_frag_idx = vfrag->vf_idx; - hdr->hdr_frag_csum = sendreq->req_send.req_base.req_convertor.checksum; - hdr->hdr_frag_offset = offset_in_msg; - hdr->hdr_src_ptr.pval = vfrag; - hdr->hdr_dst_ptr = sendreq->req_vfrag0.vf_recv; - hdr->hdr_common.hdr_csum = (do_csum ? 
- opal_csum(hdr, sizeof(mca_pml_dr_frag_hdr_t)) : - OPAL_CSUM_ZERO); - - /* adjust number of outstanding operations */ - if(OPAL_THREAD_ADD64(&vfrag->vf_pending, 1) == 1) { - MCA_PML_DR_VFRAG_WDOG_START(vfrag); - } - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,1); - - /* adjust send offset - may not have finished scheduling entire vfrag */ - if(offset_in_msg + size > sendreq->req_send_offset) { - sendreq->req_send_offset = offset_in_msg + size; - } - - /* initiate send - note that this may complete before the call returns */ - rc = mca_bml_base_send( bml_btl, des, MCA_BTL_TAG_PML); - - if(rc == OMPI_SUCCESS) { - bytes_remaining -= size; - } else { - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1); - mca_bml_base_free(bml_btl,des); - OPAL_THREAD_LOCK(&ompi_request_lock); - opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*) sendreq); - opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - break; - } - } - vfrag->vf_idx++; - } - - /* remove from retrans list */ - if(vfrag->vf_idx == vfrag->vf_len) { - OPAL_THREAD_LOCK(&ompi_request_lock); - opal_list_remove_item(&sendreq->req_retrans, (opal_list_item_t*)vfrag); - vfrag->vf_state &= ~MCA_PML_DR_VFRAG_RETRANS; - OPAL_THREAD_UNLOCK(&ompi_request_lock); - } - } - - /* allocate remaining bytes to BTLs */ - bytes_remaining = sendreq->req_send.req_bytes_packed - sendreq->req_send_offset; - while(bytes_remaining > 0 && - sendreq->req_pipeline_depth < mca_pml_dr.send_pipeline_depth) { - - mca_pml_dr_frag_hdr_t* hdr; - mca_btl_base_descriptor_t* des; - mca_bml_base_btl_t* bml_btl = NULL; - mca_pml_dr_vfrag_t* vfrag = sendreq->req_vfrag; - size_t size = bytes_remaining; - - /* offset tells us how much of the vfrag has been scheduled */ - size_t bytes_sent = sendreq->req_send_offset - vfrag->vf_offset; - int rc; - bool do_csum; - - /* do we need to allocate a new vfrag - (we scheduled all the vfrag already) */ - if(vfrag->vf_size == bytes_sent) { - bml_btl = mca_bml_base_btl_array_get_next(&endpoint->bml_endpoint->btl_send); - MCA_PML_DR_VFRAG_ALLOC(vfrag,rc); - if(NULL == vfrag) { - OPAL_THREAD_LOCK(&mca_pml_dr.lock); - opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*)sendreq); - opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq); - OPAL_THREAD_UNLOCK(&mca_pml_dr.lock); - break; - } - MCA_PML_DR_SEND_REQUEST_VFRAG_INIT(sendreq,endpoint,bytes_remaining,vfrag); - vfrag->bml_btl = bml_btl; - bytes_sent = 0; - - } else { /* always schedule the vfrag accross the same btl */ - bml_btl = vfrag->bml_btl; - } - - do_csum = mca_pml_dr.enable_csum && - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); - - /* makes sure that we don't exceed vfrag size */ - if (size > vfrag->vf_max_send_size) { - size = vfrag->vf_max_send_size; - } - if (size > vfrag->vf_size - bytes_sent) { - size = vfrag->vf_size - bytes_sent; - } - - /* pack into a descriptor */ - opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, &sendreq->req_send_offset); - mca_bml_base_prepare_src( bml_btl, - NULL, - &sendreq->req_send.req_base.req_convertor, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_dr_frag_hdr_t), - &size, - 0, - &des ); - if(des == NULL) { - OPAL_THREAD_LOCK(&mca_pml_dr.lock); - opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*)sendreq); - opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq); - OPAL_THREAD_UNLOCK(&mca_pml_dr.lock); - break; - } - des->des_cbfunc = mca_pml_dr_frag_completion; - des->des_cbdata = 
vfrag; - - /* setup header */ - hdr = (mca_pml_dr_frag_hdr_t*)des->des_src->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_csum = 0; - hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_FRAG; - hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst; - hdr->hdr_common.hdr_vid = vfrag->vf_id; - hdr->hdr_common.hdr_src = sendreq->req_endpoint->src; - hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_vlen = vfrag->vf_len; - hdr->hdr_frag_idx = vfrag->vf_idx; - hdr->hdr_frag_csum = (do_csum ? - sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO); - hdr->hdr_frag_offset = sendreq->req_send_offset; - hdr->hdr_src_ptr.pval = vfrag; - hdr->hdr_dst_ptr = sendreq->req_vfrag0.vf_recv; - hdr->hdr_common.hdr_csum = (do_csum ? - opal_csum(hdr, sizeof(mca_pml_dr_frag_hdr_t)): OPAL_CSUM_ZERO); - - assert(hdr->hdr_frag_offset < sendreq->req_send.req_bytes_packed); - - /* update state */ - vfrag->vf_idx++; - if(OPAL_THREAD_ADD64(&vfrag->vf_pending,1) == 1) { - MCA_PML_DR_VFRAG_WDOG_START(vfrag); - } - sendreq->req_send_offset += size; - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,1); - - /* initiate send - note that this may complete before the call returns */ - rc = mca_bml_base_send( bml_btl, des, MCA_BTL_TAG_PML); - - if(rc == OMPI_SUCCESS) { - bytes_remaining -= size; - } else { - sendreq->req_send_offset -= size; - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1); - mca_bml_base_free(bml_btl,des); - OPAL_THREAD_LOCK(&ompi_request_lock); - opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*)sendreq); - opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - break; - } - } - } while (OPAL_THREAD_ADD32(&sendreq->req_lock,-1) > 0); - } - return OMPI_SUCCESS; -} - - -/** - * Acknowledgment of match vfrag. - */ - -void mca_pml_dr_send_request_match_ack( - mca_btl_base_module_t* btl, - mca_pml_dr_ack_hdr_t* ack) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)ack->hdr_src_ptr.pval; - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - - OPAL_THREAD_LOCK(&ompi_request_lock); - - vfrag->vf_ack = ack->hdr_vmask & vfrag->vf_mask; - if (vfrag->vf_pending == 0) { - MCA_PML_DR_VFRAG_ACK_STOP(vfrag); - /* need to retransmit? */ - if(vfrag->vf_ack != vfrag->vf_mask) { - MCA_PML_DR_VFRAG_WDOG_START(vfrag); - MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag); - } else { - /* if already have local completion free descriptor and complete message */ - /* return descriptor */ - if(NULL != sendreq->req_descriptor) { - if(NULL != sendreq->req_descriptor->des_context) { - mca_bml_base_free((mca_bml_base_btl_t*)sendreq->req_descriptor->des_context, sendreq->req_descriptor ); - } - sendreq->req_descriptor = NULL; - } - - /* update statistics */ - sendreq->req_bytes_delivered = vfrag->vf_size; - MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n", - __FILE__, __LINE__, vfrag->vf_id)); - ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id); - MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq); - } - - /* wait for local completion */ - } else { - /* need to retransmit? */ - if (vfrag->vf_ack != vfrag->vf_mask) { - vfrag->vf_state |= MCA_PML_DR_VFRAG_NACKED; - } else { - vfrag->vf_recv = ack->hdr_dst_ptr; - } - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); -} - -/** - * Acknowledgment of rendezvous vfrag. 
- */ - -void mca_pml_dr_send_request_rndv_ack( - mca_btl_base_module_t* btl, - mca_pml_dr_ack_hdr_t* ack) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)ack->hdr_src_ptr.pval; - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - - OPAL_THREAD_LOCK(&ompi_request_lock); - - /* set acked bits */ - vfrag->vf_ack = ack->hdr_vmask & vfrag->vf_mask; - - /* local completion? */ - if (vfrag->vf_pending == 0) { - bool schedule = false; - MCA_PML_DR_VFRAG_ACK_STOP(vfrag); - - /* need to retransmit? */ - if(vfrag->vf_ack != vfrag->vf_mask) { - /* got a NACK, resend eager data! */ - MCA_PML_DR_VFRAG_WDOG_START(vfrag); - MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - } else { - /* return descriptor of first fragment */ - if(NULL != sendreq->req_descriptor) { - if(NULL != sendreq->req_descriptor->des_context) { - mca_bml_base_free((mca_bml_base_btl_t*)sendreq->req_descriptor->des_context, sendreq->req_descriptor); - } - sendreq->req_descriptor = NULL; - } - - /* done? */ - sendreq->req_send_offset = ack->hdr_vlen; - sendreq->req_bytes_delivered = ack->hdr_vlen; - if(sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) { - MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq); - } else { - /* start scheduling with a new vfrag */ - vfrag->vf_recv = ack->hdr_dst_ptr; -/* vfrag->vf_state = 0; */ -/* sendreq->req_send_offset = ack->hdr_vlen; */ - vfrag->vf_state = 0; - vfrag->vf_size = ack->hdr_vlen; - schedule = true; - } - - /* stash the vfrag id for duplicate acks.. */ - MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n", - __FILE__, __LINE__, vfrag->vf_id)); - - ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - if(schedule) { - mca_pml_dr_send_request_schedule(sendreq); - } - } - - /* wait for local completion */ - } else { - /* need to retransmit? */ - if(vfrag->vf_ack != vfrag->vf_mask) { - vfrag->vf_state |= MCA_PML_DR_VFRAG_NACKED; - } else { - /* will need this to schedule rest of the message */ - vfrag->vf_recv = ack->hdr_dst_ptr; - vfrag->vf_state = 0; - vfrag->vf_size = ack->hdr_vlen; - sendreq->req_send_offset = ack->hdr_vlen; - sendreq->req_bytes_delivered = ack->hdr_vlen; - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); - } -} - -/** - * Acknowledgment of vfrag. - */ - -void mca_pml_dr_send_request_frag_ack( - mca_btl_base_module_t* btl, - mca_pml_dr_ack_hdr_t* ack) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)ack->hdr_src_ptr.pval; - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - bool schedule = false; - - MCA_PML_DR_VFRAG_ACK_STOP(vfrag); - OPAL_THREAD_LOCK(&ompi_request_lock); - - /* add in acknowledged fragments */ - vfrag->vf_ack |= (ack->hdr_vmask & vfrag->vf_mask); - - /* need to retransmit? */ - if(vfrag->vf_ack != vfrag->vf_mask) { - MCA_PML_DR_DEBUG(0,(0, "got a vfrag NACK, retransmitting %llx\n", (long long)~vfrag->vf_ack)); - MCA_PML_DR_SEND_REQUEST_VFRAG_RETRANS(sendreq,vfrag); - schedule = true; - - /* acked and local completion */ - } else if (vfrag->vf_pending == 0 && vfrag->vf_idx == vfrag->vf_len) { - - /* update statistics */ - sendreq->req_bytes_delivered += vfrag->vf_size; - assert(sendreq->req_bytes_delivered <= sendreq->req_send.req_bytes_packed); - - /* stash the vfid for duplicate acks.. 
*/ - MCA_PML_DR_DEBUG(1,(0, "%s:%d: inserting vid %d into sequence tracker\n", - __FILE__, __LINE__, vfrag->vf_id)); - - ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id); - /* return vfrag */ - MCA_PML_DR_VFRAG_RETURN(vfrag); - - /* are we done with this request ? */ - if(sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) { - MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq); - /* is there something left to schedule */ - } else if (sendreq->req_send_offset < sendreq->req_send.req_bytes_packed) { - schedule = true; - } - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); - if(schedule) { - mca_pml_dr_send_request_schedule(sendreq); - } -} - - -void mca_pml_dr_sendreq_cleanup_active(mca_btl_base_module_t* btl) { - opal_list_item_t* item; - - for (item = opal_list_get_first(&mca_pml_dr.send_active) ; - item != opal_list_get_end(&mca_pml_dr.send_active) ; - item = opal_list_get_next(item)) { - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*) item; - mca_btl_base_descriptor_t* des = sendreq->req_descriptor; - /* The des may be NULL if - * first fragment of rndv was delivered */ - if (NULL != des) { - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - if( bml_btl && bml_btl->btl == btl) { - des->des_context = NULL; - } - } - - } -} diff --git a/ompi/mca/pml/dr/pml_dr_sendreq.h b/ompi/mca/pml/dr/pml_dr_sendreq.h deleted file mode 100644 index 76057097e4..0000000000 --- a/ompi/mca/pml/dr/pml_dr_sendreq.h +++ /dev/null @@ -1,491 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2010 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef OMPI_PML_DR_SEND_REQUEST_H -#define OMPI_PML_DR_SEND_REQUEST_H - -#include "opal/util/crc.h" -#include "ompi_config.h" -#include "opal/datatype/opal_convertor.h" -#include "ompi/mca/btl/btl.h" -#include "ompi/mca/pml/base/pml_base_sendreq.h" -#include "ompi/mca/mpool/base/base.h" -#include "ompi/mca/bml/bml.h" -#include "ompi/mca/btl/btl.h" - -#include "pml_dr_comm.h" -#include "pml_dr_hdr.h" -#include "pml_dr_vfrag.h" -#include "pml_dr_endpoint.h" -#include "opal/mca/event/event.h" - -BEGIN_C_DECLS - -struct mca_pml_dr_send_request_t { - mca_pml_base_send_request_t req_send; - mca_pml_dr_comm_proc_t* req_proc; - mca_pml_dr_endpoint_t* req_endpoint; -#if OPAL_ENABLE_MULTI_THREADS - volatile int32_t req_state; - volatile int32_t req_lock; -#else - int32_t req_state; - int32_t req_lock; -#endif - size_t req_pipeline_depth; - size_t req_bytes_delivered; - size_t req_send_offset; - - mca_pml_dr_vfrag_t* req_vfrag; - mca_pml_dr_vfrag_t req_vfrag0; - opal_list_t req_retrans; - mca_btl_base_descriptor_t* req_descriptor; /* descriptor for first frag, retransmission */ - -}; -typedef struct mca_pml_dr_send_request_t mca_pml_dr_send_request_t; - -OBJ_CLASS_DECLARATION(mca_pml_dr_send_request_t); - - -#define MCA_PML_DR_SEND_REQUEST_ALLOC( \ - comm, \ - dst, \ - sendreq, \ - rc) \ -{ \ - ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst ); \ - ompi_free_list_item_t* item; \ - \ - if(NULL == proc) { \ - rc = OMPI_ERR_OUT_OF_RESOURCE; \ - } else { \ - rc = OMPI_SUCCESS; \ - OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \ - sendreq = (mca_pml_dr_send_request_t*)item; \ - sendreq->req_send.req_base.req_proc = proc; \ - opal_list_append(&mca_pml_dr.send_active, \ - (opal_list_item_t*) sendreq); \ - } \ -} - - -#define MCA_PML_DR_SEND_REQUEST_INIT( \ - sendreq, \ - addr, \ - count, \ - datatype, \ - peer, \ - tag, \ - comm, \ - sendmode, \ - persistent) \ -do { \ - mca_bml_base_endpoint_t* endpoint = \ - sendreq->req_send.req_base.req_proc->proc_bml; \ - bool do_csum = mca_pml_dr.enable_csum && \ - (endpoint->btl_flags_or & MCA_BTL_FLAGS_NEED_CSUM); \ - /* increment reference counts */ \ - OBJ_RETAIN(comm); \ - OBJ_RETAIN(datatype); \ - \ - OMPI_REQUEST_INIT(&(sendreq)->req_send.req_base.req_ompi, persistent); \ - (sendreq)->req_send.req_addr = addr; \ - (sendreq)->req_send.req_send_mode = sendmode; \ - (sendreq)->req_send.req_base.req_addr = addr; \ - (sendreq)->req_send.req_base.req_count = count; \ - (sendreq)->req_send.req_base.req_datatype = datatype; \ - (sendreq)->req_send.req_base.req_peer = (int32_t)peer; \ - (sendreq)->req_send.req_base.req_tag = (int32_t)tag; \ - (sendreq)->req_send.req_base.req_comm = comm; \ - (sendreq)->req_send.req_base.req_pml_complete = OPAL_INT_TO_BOOL(persistent); \ - (sendreq)->req_send.req_base.req_free_called = false; \ - (sendreq)->req_send.req_base.req_ompi.req_status._cancelled = 0; \ - \ - /* initialize datatype convertor for this request */ \ -/* if(count > 0) { */ \ - /* We will create a convertor specialized for the */ \ - /* remote architecture and prepared with the datatype. */ \ - opal_convertor_copy_and_prepare_for_send( \ - (sendreq)->req_send.req_base.req_proc->proc_convertor, \ - &((sendreq)->req_send.req_base.req_datatype->super), \ - (sendreq)->req_send.req_base.req_count, \ - (sendreq)->req_send.req_base.req_addr, \ - (do_csum ? 
CONVERTOR_WITH_CHECKSUM: 0), \ - &(sendreq)->req_send.req_base.req_convertor ); \ - opal_convertor_get_packed_size(&(sendreq)->req_send.req_base.req_convertor, \ - &((sendreq)->req_send.req_bytes_packed) ); \ - /* } else { */ \ - /* (sendreq)->req_send.req_bytes_packed = 0; */ \ - /* } */ \ -} while(0) - -#define MCA_PML_DR_SEND_REQUEST_START(sendreq, rc) \ -do { \ - mca_pml_dr_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm; \ - mca_pml_dr_endpoint_t* pml_endpoint = \ - (mca_pml_dr_endpoint_t*)sendreq->req_send.req_base.req_proc->proc_pml; \ - mca_bml_base_endpoint_t* bml_endpoint = \ - sendreq->req_send.req_base.req_proc->proc_bml; \ - mca_pml_dr_comm_proc_t* proc = \ - comm->procs + sendreq->req_send.req_base.req_peer; \ - mca_bml_base_btl_t* bml_btl; \ - size_t size = sendreq->req_send.req_bytes_packed; \ - size_t eager_limit; \ - if(pml_endpoint == NULL || bml_endpoint == NULL) { \ - rc = OMPI_ERR_UNREACH; \ - break; \ - } \ - \ - bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); \ - MCA_PML_DR_VFRAG_INIT(&sendreq->req_vfrag0); \ - sendreq->req_vfrag0.vf_id = OPAL_THREAD_ADD32(&pml_endpoint->vfrag_seq,1); \ - sendreq->req_vfrag0.bml_btl = bml_btl; \ - sendreq->req_vfrag = &sendreq->req_vfrag0; \ - sendreq->req_endpoint = pml_endpoint; \ - assert(pml_endpoint->bml_endpoint == bml_endpoint); \ - sendreq->req_proc = proc; \ - \ - sendreq->req_lock = 0; \ - sendreq->req_pipeline_depth = 1; \ - sendreq->req_bytes_delivered = 0; \ - sendreq->req_state = 0; \ - sendreq->req_send_offset = 0; \ - sendreq->req_send.req_base.req_pml_complete = false; \ - sendreq->req_send.req_base.req_ompi.req_complete = false; \ - sendreq->req_send.req_base.req_ompi.req_state = OMPI_REQUEST_ACTIVE; \ - sendreq->req_send.req_base.req_ompi.req_status._cancelled = 0; \ - sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(&proc->send_sequence,1); \ - \ - /* select a btl */ \ - assert(bml_btl->btl->btl_eager_limit >= sizeof(mca_pml_dr_hdr_t)); \ - eager_limit = bml_btl->btl->btl_eager_limit - sizeof(mca_pml_dr_hdr_t); \ - if(size <= eager_limit) { \ - switch(sendreq->req_send.req_send_mode) { \ - case MCA_PML_BASE_SEND_SYNCHRONOUS: \ - rc = mca_pml_dr_send_request_start_rndv(sendreq, bml_btl, size, 0); \ - break; \ - case MCA_PML_BASE_SEND_BUFFERED: \ - rc = mca_pml_dr_send_request_start_copy(sendreq, bml_btl, size); \ - break; \ - case MCA_PML_BASE_SEND_COMPLETE: \ - rc = mca_pml_dr_send_request_start_prepare(sendreq, bml_btl, size); \ - break; \ - default: \ - if (bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) { \ - rc = mca_pml_dr_send_request_start_prepare(sendreq, bml_btl, size); \ - } else { \ - rc = mca_pml_dr_send_request_start_copy(sendreq, bml_btl, size); \ - } \ - break; \ - } \ - } else { \ - size = eager_limit; \ - if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { \ - rc = mca_pml_dr_send_request_start_buffered(sendreq, bml_btl, size); \ - } else { \ - rc = mca_pml_dr_send_request_start_rndv(sendreq, bml_btl, size, 0); \ - } \ - } \ - } while (0) - - -/* - * Mark a send request as completed at the MPI level. 
- */ - -#define MCA_PML_DR_SEND_REQUEST_MPI_COMPLETE(sendreq) \ -do { \ - (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \ - (sendreq)->req_send.req_base.req_comm->c_my_rank; \ - (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \ - (sendreq)->req_send.req_base.req_tag; \ - (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \ - (sendreq)->req_send.req_base.req_ompi.req_status._ucount = \ - (sendreq)->req_send.req_bytes_packed; \ - ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), true ); \ -} while(0) - -/* - * The request fini is responsible for releasing all ressources at the PML - * level. It will never be called directly from the upper level, as it should - * only be an internal call to the PML. However, in the case when the user - * already lost the MPI reference to the request (MPI_Request_free was called) - * fini should completely free the MPI request. - */ - -#define MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq) \ -do { \ - assert( false == sendreq->req_send.req_base.req_pml_complete ); \ - opal_list_remove_item(&mca_pml_dr.send_active, \ - (opal_list_item_t*) sendreq); \ - if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED && \ - sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) { \ - mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq); \ - } \ - \ - if( false == sendreq->req_send.req_base.req_ompi.req_complete ) { \ - /* Should only be called for long messages (maybe synchronous) */ \ - MCA_PML_DR_SEND_REQUEST_MPI_COMPLETE(sendreq); \ - } \ - sendreq->req_send.req_base.req_pml_complete = true; \ - \ - if( sendreq->req_send.req_base.req_free_called ) { \ - MCA_PML_DR_SEND_REQUEST_RETURN( sendreq ); \ - } else { \ - if(sendreq->req_send.req_base.req_ompi.req_persistent && \ - (0 != sendreq->req_send.req_bytes_packed) ) { \ - /* rewind convertor */ \ - size_t offset = 0; \ - opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, \ - &offset); \ - } \ - } \ -} while (0) - -/* - * Release resources associated with a request - */ - -#define MCA_PML_DR_SEND_REQUEST_RETURN(sendreq) \ - do { \ - /* Let the base handle the reference counts */ \ - MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ - OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests, \ - (ompi_free_list_item_t*)sendreq ); \ -} while(0) - -/* - * Lookup/allocate a vfrag for the pending send - */ - -#define MCA_PML_DR_SEND_REQUEST_VFRAG_INIT(sendreq, pml_endpoint, size, vfrag) \ -do { \ - size_t max_send_size = pml_endpoint->bml_endpoint->btl_max_send_size - \ - sizeof(mca_pml_dr_frag_hdr_t); \ - size_t div = size / max_send_size; \ - \ - MCA_PML_DR_VFRAG_INIT(vfrag); \ - if(div == 0) { \ - vfrag->vf_len = 1; \ - vfrag->vf_size = size; \ - vfrag->vf_mask = 1; \ - } else if(div > 64) { \ - vfrag->vf_len = 64; \ - vfrag->vf_size = (max_send_size << 6); /* size * 64 */ \ - vfrag->vf_mask = ~(uint64_t)0; \ - } else if (div == 64) { \ - size_t mod = size % max_send_size; \ - vfrag->vf_len = 64; \ - vfrag->vf_size = (mod ? (size - mod) : size); \ - vfrag->vf_mask = ~(uint64_t)0; \ - } else { \ - size_t mod = size % max_send_size; \ - vfrag->vf_len = div + (mod ? 
1 : 0); \ - vfrag->vf_size = size; \ - if(vfrag->vf_len == 64) \ - vfrag->vf_mask = ~(uint64_t)0; \ - else \ - vfrag->vf_mask = (((uint64_t)1 << vfrag->vf_len) - (uint64_t)1); \ - } \ - vfrag->vf_id = OPAL_THREAD_ADD32(&pml_endpoint->vfrag_seq,1); \ - vfrag->vf_offset = sendreq->req_send_offset; \ - vfrag->vf_max_send_size = max_send_size; \ - vfrag->vf_send.pval = sendreq; \ - sendreq->req_vfrag = vfrag; \ -} while(0) - -/* - * Reschedule unacked fragments - */ - -#define MCA_PML_DR_SEND_REQUEST_VFRAG_RETRANS(sendreq, vfrag) \ -do { \ - if(((vfrag)->vf_state & MCA_PML_DR_VFRAG_RETRANS) == 0) { \ - opal_list_append(&(sendreq)->req_retrans, (opal_list_item_t*)(vfrag));\ - (vfrag)->vf_state |= MCA_PML_DR_VFRAG_RETRANS; \ - } \ - (vfrag)->vf_state &= ~MCA_PML_DR_VFRAG_NACKED; \ - (vfrag)->vf_idx = 0; \ -} while(0) - -/* - * Update bytes delivered on request based on supplied descriptor - */ - -#define MCA_PML_DR_SEND_REQUEST_SET_BYTES_DELIVERED(sendreq, vfrag, hdrlen) \ -do { \ - sendreq->req_bytes_delivered += vfrag->vf_size; \ -} while(0) -/* - * Attempt to process any pending requests - */ - -#define MCA_PML_DR_SEND_REQUEST_PROCESS_PENDING() \ -do { \ - /* advance pending requests */ \ - while(opal_list_get_size(&mca_pml_dr.send_pending)) { \ - mca_pml_dr_send_request_t* sendreq; \ - OPAL_THREAD_LOCK(&ompi_request_lock); \ - sendreq = (mca_pml_dr_send_request_t*) \ - opal_list_remove_first(&mca_pml_dr.send_pending); \ - OPAL_THREAD_UNLOCK(&ompi_request_lock); \ - if(NULL == sendreq) \ - break; \ - opal_list_append(&mca_pml_dr.send_active, \ - (opal_list_item_t*) sendreq); \ - mca_pml_dr_send_request_schedule(sendreq); \ - } \ -} while (0) - - -/* - * Requeue first fragment of message for retransmission - */ - -#define MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag) \ - do { \ - mca_btl_base_descriptor_t *des_old, *des_new; \ - OPAL_THREAD_ADD64(&vfrag->vf_pending,1); \ - MCA_PML_DR_DEBUG(0, \ - (0, "%s:%d:%s: retransmitting eager\n", \ - __FILE__, __LINE__, __func__)); \ - assert(sendreq->req_descriptor->des_src != NULL); \ - \ - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,1); \ - OPAL_THREAD_ADD64(&(vfrag)->vf_pending,1); \ - (vfrag)->vf_state &= ~MCA_PML_DR_VFRAG_NACKED; \ - \ - des_old = sendreq->req_descriptor; \ - mca_bml_base_alloc(vfrag->bml_btl, &des_new, \ - MCA_BTL_NO_ORDER, des_old->des_src->seg_len, \ - des_old->des_flags); \ - sendreq->req_descriptor = des_new; \ - memcpy(des_new->des_src->seg_addr.pval, \ - des_old->des_src->seg_addr.pval, \ - des_old->des_src->seg_len); \ - des_new->des_flags = des_old->des_flags; \ - des_new->des_cbdata = des_old->des_cbdata; \ - des_new->des_cbfunc = des_old->des_cbfunc; \ - mca_bml_base_send(vfrag->bml_btl, des_new, MCA_BTL_TAG_PML); \ - } while(0) - -/* - * Requeue first fragment of message for retransmission - */ - -#define MCA_PML_DR_SEND_REQUEST_RNDV_PROBE(sendreq, vfrag) \ -do { \ - mca_pml_dr_endpoint_t* endpoint = sendreq->req_endpoint; \ - mca_bml_base_btl_t* bml_btl = \ - mca_bml_base_btl_array_get_next(&endpoint->bml_endpoint->btl_eager); \ - mca_btl_base_descriptor_t *des_old, *des_new; \ - mca_pml_dr_hdr_t *hdr; \ - bool do_csum = mca_pml_dr.enable_csum && \ - (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM); \ - MCA_PML_DR_DEBUG(0,(0, "%s:%d:%s: (re)transmitting rndv probe\n", \ - __FILE__, __LINE__, __func__)); \ - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,1); \ - OPAL_THREAD_ADD64(&vfrag->vf_pending,1); \ - (vfrag)->vf_state &= ~MCA_PML_DR_VFRAG_NACKED; \ - \ - assert(sendreq->req_descriptor->des_src != 
NULL); \ - des_old = sendreq->req_descriptor; \ - mca_bml_base_alloc(bml_btl, &des_new, \ - MCA_BTL_NO_ORDER, \ - sizeof(mca_pml_dr_rendezvous_hdr_t), \ - des_old->des_flags); \ - /* build hdr */ \ - hdr = (mca_pml_dr_hdr_t*)des_new->des_src->seg_addr.pval; \ - hdr->hdr_common.hdr_flags = 0; \ - hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_RNDV; \ - hdr->hdr_common.hdr_dst = endpoint->dst; \ - hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; \ - hdr->hdr_common.hdr_src = endpoint->src; \ - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; \ - hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; \ - hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0; \ - hdr->hdr_match.hdr_csum = OPAL_CSUM_ZERO; \ - hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id; \ - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; \ - hdr->hdr_common.hdr_csum = (do_csum ? \ - opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t)): OPAL_CSUM_ZERO); \ - des_new->des_flags = des_old->des_flags; \ - des_new->des_cbdata = des_old->des_cbdata; \ - des_new->des_cbfunc = des_old->des_cbfunc; \ - mca_bml_base_send(bml_btl, des_new, MCA_BTL_TAG_PML); \ -} while(0) - -/** - * Start the specified request - */ - -int mca_pml_dr_send_request_start_buffered( - mca_pml_dr_send_request_t* sendreq, - mca_bml_base_btl_t* bml_btl, - size_t size); - -int mca_pml_dr_send_request_start_copy( - mca_pml_dr_send_request_t* sendreq, - mca_bml_base_btl_t* bml_btl, - size_t size); - -int mca_pml_dr_send_request_start_prepare( - mca_pml_dr_send_request_t* sendreq, - mca_bml_base_btl_t* bml_btl, - size_t size); - -int mca_pml_dr_send_request_start_rndv( - mca_pml_dr_send_request_t* sendreq, - mca_bml_base_btl_t* bml_btl, - size_t size, - int flags); - -/** - * Schedule additional fragments - */ -int mca_pml_dr_send_request_schedule( - mca_pml_dr_send_request_t* sendreq); - -int mca_pml_dr_send_request_reschedule( - mca_pml_dr_send_request_t* sendreq, - mca_pml_dr_vfrag_t* vfrag); - -/** - * Acknowledgment of vfrags - */ -void mca_pml_dr_send_request_match_ack( - mca_btl_base_module_t* btl, - mca_pml_dr_ack_hdr_t*); - -void mca_pml_dr_send_request_rndv_ack( - mca_btl_base_module_t* btl, - mca_pml_dr_ack_hdr_t*); - -void mca_pml_dr_send_request_frag_ack( - mca_btl_base_module_t* btl, - mca_pml_dr_ack_hdr_t*); - -void mca_pml_dr_sendreq_cleanup_active(mca_btl_base_module_t* btl); - - -END_C_DECLS -#endif - diff --git a/ompi/mca/pml/dr/pml_dr_start.c b/ompi/mca/pml/dr/pml_dr_start.c deleted file mode 100644 index 575411aa22..0000000000 --- a/ompi/mca/pml/dr/pml_dr_start.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-
-#include "ompi_config.h"
-
-#include "pml_dr.h"
-#include "pml_dr_recvreq.h"
-#include "pml_dr_sendreq.h"
-
-
-int mca_pml_dr_start(size_t count, ompi_request_t** requests)
-{
-    int rc;
-    size_t i;
-    for(i=0; i < count; i++) {
-        mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i];
-        if(OMPI_REQUEST_PML != requests[i]->req_type) {
-            continue;
-        }
-        if(NULL == pml_request)
-            continue;
-
-        /* If the persistent request is currently active - obtain the
-         * request lock and verify the status is incomplete. If the
-         * pml layer has not completed the request - mark the request
-         * as free called - so that it will be freed when the request
-         * completes - and create a new request.
-         */
-
-        switch(pml_request->req_ompi.req_state) {
-            case OMPI_REQUEST_INACTIVE:
-                if(pml_request->req_pml_complete == true)
-                    break;
-                /* otherwise fall through */
-            case OMPI_REQUEST_ACTIVE: {
-
-                ompi_request_t *request;
-                OPAL_THREAD_LOCK(&ompi_request_lock);
-                if (pml_request->req_pml_complete == false) {
-                    /* free request after it completes */
-                    pml_request->req_free_called = true;
-                } else {
-                    /* can reuse the existing request */
-                    OPAL_THREAD_UNLOCK(&ompi_request_lock);
-                    break;
-                }
-
-                /* allocate a new request */
-                switch(pml_request->req_type) {
-                    case MCA_PML_REQUEST_SEND: {
-                        mca_pml_base_send_mode_t sendmode =
-                            ((mca_pml_base_send_request_t*)pml_request)->req_send_mode;
-                        rc = mca_pml_dr_isend_init(
-                            pml_request->req_addr,
-                            pml_request->req_count,
-                            pml_request->req_datatype,
-                            pml_request->req_peer,
-                            pml_request->req_tag,
-                            sendmode,
-                            pml_request->req_comm,
-                            &request);
-                        break;
-                    }
-                    case MCA_PML_REQUEST_RECV:
-                        rc = mca_pml_dr_irecv_init(
-                            pml_request->req_addr,
-                            pml_request->req_count,
-                            pml_request->req_datatype,
-                            pml_request->req_peer,
-                            pml_request->req_tag,
-                            pml_request->req_comm,
-                            &request);
-                        break;
-                    default:
-                        rc = OMPI_ERR_REQUEST;
-                        break;
-                }
-                OPAL_THREAD_UNLOCK(&ompi_request_lock);
-                if(OMPI_SUCCESS != rc)
-                    return rc;
-                pml_request = (mca_pml_base_request_t*)request;
-                requests[i] = request;
-                break;
-            }
-            default:
-                return OMPI_ERR_REQUEST;
-        }
-
-        /* start the request */
-        switch(pml_request->req_type) {
-            case MCA_PML_REQUEST_SEND:
-            {
-                mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)pml_request;
-                MCA_PML_DR_SEND_REQUEST_START(sendreq, rc);
-                if(rc != OMPI_SUCCESS)
-                    return rc;
-                break;
-            }
-            case MCA_PML_REQUEST_RECV:
-            {
-                mca_pml_dr_recv_request_t* recvreq = (mca_pml_dr_recv_request_t*)pml_request;
-                MCA_PML_DR_RECV_REQUEST_START(recvreq);
-                break;
-            }
-            default:
-                return OMPI_ERR_REQUEST;
-        }
-    }
-    return OMPI_SUCCESS;
-}
-
diff --git a/ompi/mca/pml/dr/pml_dr_vfrag.c b/ompi/mca/pml/dr/pml_dr_vfrag.c
deleted file mode 100644
index 98646032f9..0000000000
--- a/ompi/mca/pml/dr/pml_dr_vfrag.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
- * University Research and Technology
- * Corporation. All rights reserved.
- * Copyright (c) 2004-2006 The University of Tennessee and The University
- * of Tennessee Research Foundation. All rights
- * reserved.
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
- * University of Stuttgart. All rights reserved.
- * Copyright (c) 2004-2006 The Regents of the University of California.
- * All rights reserved.
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "pml_dr_vfrag.h" -#include "pml_dr_sendreq.h" -#include "ompi/mca/bml/base/base.h" - -static void mca_pml_dr_vfrag_wdog_timeout(int fd, short event, void* vfrag); -static void mca_pml_dr_vfrag_ack_timeout(int fd, short event, void* vfrag); - -static void mca_pml_dr_vfrag_construct(mca_pml_dr_vfrag_t* vfrag) -{ - vfrag->vf_send.pval = NULL; - vfrag->vf_recv.pval = NULL; - vfrag->vf_id = 0; - vfrag->vf_idx = 0; - vfrag->vf_len = 0; - vfrag->vf_offset = 0; - vfrag->vf_size = 0; - vfrag->vf_max_send_size = 0; - vfrag->vf_ack = 0; - vfrag->vf_mask = 1; - vfrag->vf_state = 0; - vfrag->vf_wdog_tv = mca_pml_dr.wdog_timer; - vfrag->vf_ack_tv = mca_pml_dr.ack_timer; - vfrag->vf_wdog_cnt = 0; - vfrag->vf_ack_cnt = 0; - opal_event_evtimer_set(opal_event_base, &vfrag->vf_wdog_ev, mca_pml_dr_vfrag_wdog_timeout, (void*) vfrag); - opal_event_evtimer_set(opal_event_base, &vfrag->vf_ack_ev, mca_pml_dr_vfrag_ack_timeout, (void*) vfrag); -} - - -static void mca_pml_dr_vfrag_destruct(mca_pml_dr_vfrag_t* vfrag) -{ -} - - -OBJ_CLASS_INSTANCE( - mca_pml_dr_vfrag_t, - ompi_free_list_item_t, - mca_pml_dr_vfrag_construct, - mca_pml_dr_vfrag_destruct -); - - -/** - * The wdog timer expired, better do something about it, like resend the current part of the vfrag - */ -static void mca_pml_dr_vfrag_wdog_timeout(int fd, short event, void* data) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*) data; - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - - MCA_PML_DR_DEBUG(0,(0, "%s:%d:%s: wdog timeout: %p vid: %d", - __FILE__, __LINE__, __func__, (void*)vfrag, vfrag->vf_id)); - - /* update pending counts */ - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-vfrag->vf_pending); - OPAL_THREAD_ADD64(&vfrag->vf_pending,-vfrag->vf_pending); - - /* check for hung btl */ - if(++vfrag->vf_wdog_cnt == mca_pml_dr.wdog_retry_max) { - /* declare btl dead */ - if(vfrag->bml_btl->btl) { - opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__, - vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name); - mca_pml_dr_sendreq_cleanup_active(vfrag->bml_btl->btl); - mca_bml.bml_del_btl(vfrag->bml_btl->btl); - } else { - opal_output(0, "%s:%d:%s: failing already failed BTL", __FILE__, __LINE__, __func__); - } - mca_pml_dr_vfrag_reset(vfrag); - } else if(NULL == vfrag->bml_btl->btl) { - mca_pml_dr_vfrag_reset(vfrag); - } - - /* back off watchdog timer */ - vfrag->vf_wdog_tv.tv_sec = - mca_pml_dr.wdog_timer.tv_sec + - mca_pml_dr.wdog_timer.tv_sec * mca_pml_dr.wdog_timer_multiplier * - vfrag->vf_wdog_cnt; - vfrag->vf_wdog_tv.tv_usec = - mca_pml_dr.wdog_timer.tv_usec + - mca_pml_dr.wdog_timer.tv_usec * mca_pml_dr.wdog_timer_multiplier * - vfrag->vf_wdog_cnt; - - /* reschedule vfrag */ - mca_pml_dr_vfrag_reschedule(vfrag); -} - - -/** - * The ack timer expired, better do something about it, like resend the entire vfrag? 
- */ -static void mca_pml_dr_vfrag_ack_timeout(int fd, short event, void* data) -{ - mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*) data; - MCA_PML_DR_DEBUG(0,(0, "%s:%d:%s: ack timeout: %p", - __FILE__, __LINE__, __func__, (void*)vfrag)); - - /* stop ack timer */ - MCA_PML_DR_VFRAG_ACK_STOP(vfrag); - - /* check for hung btl */ - if(++vfrag->vf_ack_cnt == mca_pml_dr.ack_retry_max) { - /* declare btl dead */ - if(vfrag->bml_btl->btl) { - opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__, - vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name); - mca_pml_dr_sendreq_cleanup_active(vfrag->bml_btl->btl); - mca_bml.bml_del_btl(vfrag->bml_btl->btl); - } else { - opal_output(0, "%s:%d:%s: failing already failed BTL", __FILE__, __LINE__, __func__); - } - mca_pml_dr_vfrag_reset(vfrag); - } else if(NULL == vfrag->bml_btl->btl) { - mca_pml_dr_vfrag_reset(vfrag); - } - - /* back off ack timer */ - vfrag->vf_ack_tv.tv_sec = - mca_pml_dr.ack_timer.tv_sec + - mca_pml_dr.ack_timer.tv_sec * mca_pml_dr.ack_timer_multiplier * - vfrag->vf_ack_cnt; - vfrag->vf_ack_tv.tv_usec = - mca_pml_dr.ack_timer.tv_usec + - mca_pml_dr.ack_timer.tv_usec * mca_pml_dr.ack_timer_multiplier * - vfrag->vf_ack_cnt; - - /* reschedule vfrag */ - mca_pml_dr_vfrag_reschedule(vfrag); -} - -/** - * Vfrag failure - declare btl dead and try to resend on an alternate btl - */ - -void mca_pml_dr_vfrag_reset(mca_pml_dr_vfrag_t* vfrag) -{ - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - - /* update counters - give new BTL a fair chance :-) */ - vfrag->vf_ack_cnt = 0; - vfrag->vf_wdog_cnt = 0; - - /* lookup new bml_btl data structure */ - sendreq->req_endpoint = (mca_pml_dr_endpoint_t*)sendreq->req_send.req_base.req_proc->proc_pml; - - /* make sure a path is available */ - if(mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->bml_endpoint->btl_eager) == 0 || - mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->bml_endpoint->btl_eager) == 0) { - opal_output(0, "%s:%d:%s: no path to peer", __FILE__, __LINE__, __func__); - ompi_rte_abort(-1, NULL); - } - if(vfrag->vf_offset == 0) { - vfrag->bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->bml_endpoint->btl_eager); - } else { - vfrag->bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->bml_endpoint->btl_send); - } - opal_output(0, "%s:%d:%s: selected new BTL: %s", __FILE__, __LINE__, __func__, - vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name); -} - - -/** - * Reschedule vfrag that has timed out - */ - -void mca_pml_dr_vfrag_reschedule(mca_pml_dr_vfrag_t* vfrag) -{ - mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - - /* start wdog timer */ - MCA_PML_DR_VFRAG_WDOG_START(vfrag); - - /* first frag within send request */ - OPAL_THREAD_LOCK(&ompi_request_lock); - if(vfrag == &sendreq->req_vfrag0) { - if(vfrag->vf_state & MCA_PML_DR_VFRAG_RNDV) { - MCA_PML_DR_SEND_REQUEST_RNDV_PROBE(sendreq, vfrag); - } else { - MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag); - } - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - /* reschedule unacked portion of vfrag */ - } else { - MCA_PML_DR_SEND_REQUEST_VFRAG_RETRANS(sendreq, vfrag); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - mca_pml_dr_send_request_schedule(sendreq); - } -} - diff --git a/ompi/mca/pml/dr/pml_dr_vfrag.h b/ompi/mca/pml/dr/pml_dr_vfrag.h deleted file mode 100644 index 634212822d..0000000000 --- a/ompi/mca/pml/dr/pml_dr_vfrag.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * 
Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef OMPI_PML_DR_VFRAG_H_ -#define OMPI_PML_DR_VFRAG_H_ - -#include "ompi_config.h" -#include "opal/mca/event/event.h" -#include "opal/types.h" -#include "pml_dr.h" - -BEGIN_C_DECLS - -#define MCA_PML_DR_VFRAG_NACKED 0x01 -#define MCA_PML_DR_VFRAG_RNDV 0x02 -#define MCA_PML_DR_VFRAG_RETRANS 0x04 - -struct mca_pml_dr_vfrag_t { - ompi_free_list_item_t super; - ompi_ptr_t vf_send; - ompi_ptr_t vf_recv; - uint32_t vf_id; - uint16_t vf_idx; - uint16_t vf_len; - size_t vf_offset; - size_t vf_size; - size_t vf_max_send_size; - uint64_t vf_ack; - uint64_t vf_mask; - int64_t vf_pending; - uint32_t vf_state; - struct mca_bml_base_btl_t* bml_btl; - - /* we need a timer for the vfrag for: - 1) a watchdog timer for local completion of the current - operation - 2) a timeout for ACK of the VRAG - */ - struct timeval vf_wdog_tv; - opal_event_t vf_wdog_ev; - uint8_t vf_wdog_cnt; - - struct timeval vf_ack_tv; - opal_event_t vf_ack_ev; - uint8_t vf_ack_cnt; -}; -typedef struct mca_pml_dr_vfrag_t mca_pml_dr_vfrag_t; - -OBJ_CLASS_DECLARATION(mca_pml_dr_vfrag_t); - -#define MCA_PML_DR_VFRAG_ALLOC(vfrag,rc) \ -do { \ - ompi_free_list_item_t* item; \ - OMPI_FREE_LIST_WAIT(&mca_pml_dr.vfrags, item, rc); \ - vfrag = (mca_pml_dr_vfrag_t*)item; \ -} while(0) - - -#define MCA_PML_DR_VFRAG_RETURN(vfrag) \ -do { \ - OMPI_FREE_LIST_RETURN(&mca_pml_dr.vfrags, \ - (ompi_free_list_item_t*)vfrag); \ -} while(0) - -#define MCA_PML_DR_VFRAG_INIT(vfrag) \ -do { \ - (vfrag)->vf_idx = 0; \ - (vfrag)->vf_ack = 0; \ - (vfrag)->vf_wdog_cnt = 0; \ - (vfrag)->vf_ack_cnt = 0; \ - (vfrag)->vf_recv.pval = NULL; \ - (vfrag)->vf_state = 0; \ - (vfrag)->vf_pending = 0; \ - (vfrag)->vf_wdog_tv = mca_pml_dr.wdog_timer; \ - (vfrag)->vf_ack_tv = mca_pml_dr.ack_timer; \ -} while(0) - - -/* - * Watchdog Timer - */ - -#define MCA_PML_DR_VFRAG_WDOG_START(vfrag) \ -do { \ - opal_event_add(&(vfrag)->vf_wdog_ev, &(vfrag)->vf_wdog_tv); \ -} while(0) - -#define MCA_PML_DR_VFRAG_WDOG_STOP(vfrag) \ -do { \ - opal_event_del(&(vfrag)->vf_wdog_ev); \ -} while(0) - -#define MCA_PML_DR_VFRAG_WDOG_RESET(vfrag) \ -do { \ - opal_event_del(&(vfrag)->vf_wdog_ev); \ - opal_event_add(&(vfrag)->vf_wdog_ev, &vfrag->vf_wdog_tv); \ -} while(0) - - -/* - * Ack Timer - */ - -#define MCA_PML_DR_VFRAG_ACK_START(vfrag) \ -do { \ - opal_event_add(&(vfrag)->vf_ack_ev, &(vfrag)->vf_ack_tv); \ -} while(0) - -#define MCA_PML_DR_VFRAG_ACK_STOP(vfrag) \ -do { \ - opal_event_del(&vfrag->vf_ack_ev); \ -} while(0) - -#define MCA_PML_DR_VFRAG_ACK_RESET(vfrag) \ -do { \ - MCA_PML_DR_VFRAG_ACK_STOP(vfrag); \ - MCA_PML_DR_VFRAG_ACK_START(vfrag); \ -} while(0) - - -/** - * Reset a VFRAG to use a new BTL - */ - -void mca_pml_dr_vfrag_reset(mca_pml_dr_vfrag_t*); - -/** - * Reschedule a vfrag that has timed out - */ - -void mca_pml_dr_vfrag_reschedule(mca_pml_dr_vfrag_t*); - -END_C_DECLS -#endif -
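For context on the retransmission logic deleted above: the vfrag watchdog and ACK timers back off linearly with the retry count (timeout = base + base * multiplier * retries), and once the retry ceiling (wdog_retry_max / ack_retry_max) is reached the BTL is declared failed and the vfrag is reset onto another path. The standalone C sketch below illustrates only that back-off arithmetic as a minimal example; all names in it (backoff_policy_t, next_timeout, and the sample values) are hypothetical, and it deliberately uses no Open MPI API.

/* Standalone illustration of the linear timer back-off used by the deleted
 * vfrag watchdog/ACK logic: timeout = base + base * multiplier * retries,
 * retried until retry_max, at which point the path is declared failed.
 * All identifiers here are hypothetical, not part of Open MPI. */
#include <stdio.h>
#include <stdbool.h>
#include <sys/time.h>

typedef struct {
    struct timeval base;   /* initial timeout (analogous to wdog_timer / ack_timer) */
    int multiplier;        /* back-off multiplier applied per retry */
    int retry_max;         /* retries allowed before the path is declared dead */
} backoff_policy_t;

/* Compute the timeout to re-arm after `retries` unsuccessful attempts.
 * Like the original arithmetic, this scales tv_sec and tv_usec independently
 * and does not normalize microsecond overflow into seconds. */
static struct timeval next_timeout(const backoff_policy_t *p, int retries)
{
    struct timeval tv;
    tv.tv_sec  = p->base.tv_sec  + p->base.tv_sec  * p->multiplier * retries;
    tv.tv_usec = p->base.tv_usec + p->base.tv_usec * p->multiplier * retries;
    return tv;
}

int main(void)
{
    /* Sample policy: 1s base timeout, x1 growth per retry, give up after 4 tries. */
    backoff_policy_t wdog = { { 1, 0 }, 1, 4 };

    for (int retries = 1; retries <= wdog.retry_max; retries++) {
        struct timeval tv = next_timeout(&wdog, retries);
        bool give_up = (retries == wdog.retry_max);
        printf("retry %d: re-arm after %ld.%06ld s%s\n", retries,
               (long)tv.tv_sec, (long)tv.tv_usec,
               give_up ? " -> declare BTL failed, reset vfrag" : "");
    }
    return 0;
}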