From 9d5eeecb8a5d437ea875933048f4506a41c065b3 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm <hjelmn@lanl.gov> Date: Mon, 28 Mar 2016 10:52:40 -0600 Subject: [PATCH] pml/ob1: detect unreachable errors This commit adds code to detect when procs are unreachable when using the dynamic add_procs functionality. Fixes #1501 Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> --- ompi/mca/bml/r2/bml_r2.c | 2 ++ ompi/mca/pml/ob1/pml_ob1_isend.c | 8 ++++++++ ompi/mca/pml/ob1/pml_ob1_recvreq.h | 2 ++ 3 files changed, 12 insertions(+) diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 18fefe5254..e29928f673 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -410,6 +410,8 @@ static int mca_bml_r2_add_proc (struct ompi_proc_t *proc) } if (!btl_in_use) { + proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL; + OBJ_RELEASE(bml_endpoint); /* no btl is available for this proc */ if (mca_bml_r2.show_unreach_errors) { opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true, diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index 893e6cebec..bd5fd767e3 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -140,6 +140,10 @@ int mca_pml_ob1_isend(const void *buf, int16_t seqn; int rc; + if (OPAL_UNLIKELY(NULL == endpoint)) { + return OMPI_ERR_UNREACH; + } + seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) { @@ -189,6 +193,10 @@ int mca_pml_ob1_send(const void *buf, int16_t seqn; int rc; + if (OPAL_UNLIKELY(NULL == endpoint)) { + return OMPI_ERR_UNREACH; + } + if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_BUFFERED == sendmode)) { /* large buffered sends *need* a real request so use isend instead */ ompi_request_t *brequest; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 71fb8c3d5b..77300a8320 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -435,6 +435,8 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl; mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (proc); + assert (NULL != endpoint); + for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,