From 4d4b0a022ae5dd670d48ffae5d5f63e6a5e332dd Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 13 Feb 2007 12:01:36 +0000 Subject: [PATCH] Add error callback to sm BTL. Call it when allocation of the initial circular buffer fails. If cb is already allocated, but it is full and allocation of additional cb fails, we spin waiting for receiver to free space in existing cb. This commit was SVN r13635. --- ompi/class/ompi_fifo.h | 6 +++--- ompi/mca/btl/sm/btl_sm.c | 15 +++++++++++++-- ompi/mca/btl/sm/btl_sm.h | 12 ++++++++++++ ompi/mca/btl/sm/btl_sm_component.c | 16 +++++++++++----- ompi/mca/btl/sm/btl_sm_fifo.h | 11 +++++------ 5 files changed, 44 insertions(+), 16 deletions(-) diff --git a/ompi/class/ompi_fifo.h b/ompi/class/ompi_fifo.h index d277dfe19b..f0a35bacdc 100644 --- a/ompi/class/ompi_fifo.h +++ b/ompi/class/ompi_fifo.h @@ -338,7 +338,7 @@ static inline int ompi_fifo_write_to_head_same_base_addr(void *data, if(error_code != OMPI_CB_ERROR) { fifo->head->cb_overflow = false; opal_atomic_unlock(&(fifo->fifo_lock)); - return error_code; + return OMPI_SUCCESS; } /* see if next queue is available - while the next queue @@ -384,11 +384,11 @@ static inline int ompi_fifo_write_to_head_same_base_addr(void *data, error_code=ompi_cb_fifo_write_to_head_same_base_addr(data, (ompi_cb_fifo_t *)&(fifo->head->cb_fifo)); if( OMPI_CB_ERROR == error_code ) { - return error_code; + return OMPI_ERROR; } } - return error_code; + return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index c5a1564840..e503f84eb4 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -108,7 +108,7 @@ mca_btl_sm_t mca_btl_sm[2] = { NULL, /* get */ mca_btl_base_dump, NULL, /* mpool */ - NULL /* register error */ + mca_btl_sm_register_error_cb /* register error */ } }, { @@ -136,7 +136,7 @@ mca_btl_sm_t mca_btl_sm[2] = { NULL, /* get function */ mca_btl_base_dump, NULL, /* mpool */ - NULL /* register error */ + mca_btl_sm_register_error_cb /* register error */ } } }; @@ -770,6 +770,17 @@ int mca_btl_sm_register( return OMPI_SUCCESS; } +/* + * Register callback function for error handling.. + */ +int mca_btl_sm_register_error_cb( + struct mca_btl_base_module_t* btl, + mca_btl_base_module_error_cb_fn_t cbfunc) +{ + mca_btl_sm_t *sm_btl = (mca_btl_sm_t *)btl; + sm_btl->error_cb = cbfunc; + return OMPI_SUCCESS; +} /** * Allocate a segment. diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index 3eaa5e6f78..d29ffcf535 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -179,11 +179,23 @@ struct mca_btl_sm_t { mca_btl_base_module_t super; /**< base BTL interface */ bool btl_inited; /**< flag indicating if btl has been inited */ mca_btl_sm_recv_reg_t sm_reg[256]; + mca_btl_base_module_error_cb_fn_t error_cb; }; typedef struct mca_btl_sm_t mca_btl_sm_t; extern mca_btl_sm_t mca_btl_sm[2]; +/** + * Register a callback function that is called on error.. + * + * @param btl (IN) BTL module + * @return Status indicating if cleanup was successful + */ + +int mca_btl_sm_register_error_cb( + struct mca_btl_base_module_t* btl, + mca_btl_base_module_error_cb_fn_t cbfunc +); /** * Cleanup any resources held by the BTL. diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index 3232e72d79..e73a7dad46 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -49,6 +49,7 @@ #include "ompi/mca/pml/base/pml_base_module_exchange.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/common/sm/common_sm_mmap.h" +#include "ompi/mca/btl/base/btl_base_error.h" #include "btl_sm.h" #include "btl_sm_frag.h" #include "btl_sm_fifo.h" @@ -343,7 +344,7 @@ int mca_btl_sm_component_progress(void) ompi_fifo_t *fifo = NULL; int my_smp_rank=mca_btl_sm_component.my_smp_rank; int proc; - int rc = 0; + int rc = 0, btl = 0; /* send progress is made by the PML */ @@ -405,7 +406,7 @@ int mca_btl_sm_component_progress(void) MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], my_smp_rank, peer_smp_rank, frag, rc ); if(OMPI_SUCCESS != rc) - return rc; + goto err; break; } default: @@ -416,7 +417,7 @@ int mca_btl_sm_component_progress(void) MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], my_smp_rank, peer_smp_rank, frag, rc ); if(OMPI_SUCCESS != rc) - return rc; + goto err; break; } } @@ -424,6 +425,7 @@ int mca_btl_sm_component_progress(void) } /* end peer_local_smp_rank loop */ + btl = 1; /* loop over fifo's - procs with different base shared memory * virtual address as this process */ for( proc=0 ; proc < mca_btl_sm_component.num_smp_procs_different_base_addr @@ -497,7 +499,7 @@ int mca_btl_sm_component_progress(void) MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], my_smp_rank, peer_smp_rank, frag, rc ); if(OMPI_SUCCESS != rc) - return rc; + goto err; break; } default: @@ -508,11 +510,15 @@ int mca_btl_sm_component_progress(void) MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], my_smp_rank, peer_smp_rank, frag, rc ); if(OMPI_SUCCESS != rc) - return rc; + goto err; break; } } rc++; } /* end peer_local_smp_rank loop */ return rc; +err: + BTL_ERROR(("SM faild to send message due to shortage of shared memory.\n")); + mca_btl_sm[btl].error_cb(&mca_btl_sm[btl].super, MCA_BTL_ERROR_FLAGS_FATAL); + return rc; } diff --git a/ompi/mca/btl/sm/btl_sm_fifo.h b/ompi/mca/btl/sm/btl_sm_fifo.h index 574c3b862f..b14df7a29c 100644 --- a/ompi/mca/btl/sm/btl_sm_fifo.h +++ b/ompi/mca/btl/sm/btl_sm_fifo.h @@ -29,12 +29,11 @@ do { \ } \ \ /* post fragment */ \ - rc=ompi_fifo_write_to_head_same_base_addr(frag, fifo, \ - mca_btl_sm_component.sm_mpool); \ - if( 0 <= rc ) { \ - MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \ - rc=OMPI_SUCCESS; \ - } \ + while(ompi_fifo_write_to_head_same_base_addr(frag, fifo, \ + mca_btl_sm_component.sm_mpool) != OMPI_SUCCESS) \ + opal_progress(); \ + MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \ + rc=OMPI_SUCCESS; \ if(opal_using_threads()) \ opal_atomic_unlock(&fifo->head_lock); \ } while(0)