From a081fba0465e0e03472fc45c9a4a7154f539e3f6 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Wed, 18 Jul 2018 16:59:13 +0300 Subject: [PATCH] OSC/UCX: fixed hang on OSC init - worker progress was missed on startup, which caused a hang on one of the ranks Signed-off-by: Sergey Oblomov --- ompi/mca/osc/ucx/osc_ucx.h | 1 + ompi/mca/osc/ucx/osc_ucx_component.c | 20 ++++++++++---------- ompi/mca/osc/ucx/osc_ucx_request.c | 1 + 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx.h b/ompi/mca/osc/ucx/osc_ucx.h index 095de34c27..f4ffbf17bd 100644 --- a/ompi/mca/osc/ucx/osc_ucx.h +++ b/ompi/mca/osc/ucx/osc_ucx.h @@ -38,6 +38,7 @@ typedef struct ompi_osc_ucx_component { opal_free_list_t requests; /* request free list for the r* communication variants */ bool env_initialized; /* UCX environment is initialized or not */ int num_incomplete_req_ops; + int init_in_progress; unsigned int priority; } ompi_osc_ucx_component_t; diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index dc6c5f2e44..9090b9e677 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -45,7 +45,12 @@ ompi_osc_ucx_component_t mca_osc_ucx_component = { .osc_query = component_query, .osc_select = component_select, .osc_finalize = component_finalize, - } + }, + .ucp_context = NULL, + .ucp_worker = NULL, + .env_initialized = false, + .num_incomplete_req_ops = 0, + .init_in_progress = 1 }; ompi_osc_ucx_module_t ompi_osc_ucx_module_template = { @@ -105,24 +110,19 @@ static int component_register(void) { } static int progress_callback(void) { - if (mca_osc_ucx_component.ucp_worker != NULL && - mca_osc_ucx_component.num_incomplete_req_ops > 0) { + if ((mca_osc_ucx_component.ucp_worker != NULL) && + (mca_osc_ucx_component.num_incomplete_req_ops + + mca_osc_ucx_component.init_in_progress > 0)) { ucp_worker_progress(mca_osc_ucx_component.ucp_worker); } return 0; } static int component_init(bool 
enable_progress_threads, bool enable_mpi_threads) { - int ret = OMPI_SUCCESS; - - mca_osc_ucx_component.ucp_context = NULL; - mca_osc_ucx_component.ucp_worker = NULL; mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads; - mca_osc_ucx_component.env_initialized = false; - mca_osc_ucx_component.num_incomplete_req_ops = 0; opal_common_ucx_mca_register(); - return ret; + return OMPI_SUCCESS; } static int component_finalize(void) { diff --git a/ompi/mca/osc/ucx/osc_ucx_request.c b/ompi/mca/osc/ucx/osc_ucx_request.c index efbd9c38cc..146111f948 100644 --- a/ompi/mca/osc/ucx/osc_ucx_request.c +++ b/ompi/mca/osc/ucx/osc_ucx_request.c @@ -57,6 +57,7 @@ void req_completion(void *request, ucs_status_t status) { ompi_request_complete(&(req->external_req->super), true); ucp_request_release(req); mca_osc_ucx_component.num_incomplete_req_ops--; + mca_osc_ucx_component.init_in_progress = 0; assert(mca_osc_ucx_component.num_incomplete_req_ops >= 0); } }