OSC/UCX: fixed hang on OSC init
- UCX worker progress was not being driven during startup, which caused a hang on one of the ranks.
Signed-off-by: Sergey Oblomov <sergeyo@mellanox.com>
Этот коммит содержится в:
родитель
4447738098
Коммит
a081fba046
@ -38,6 +38,7 @@ typedef struct ompi_osc_ucx_component {
|
||||
opal_free_list_t requests; /* request free list for the r* communication variants */
|
||||
bool env_initialized; /* UCX environment is initialized or not */
|
||||
int num_incomplete_req_ops;
|
||||
int init_in_progress;
|
||||
unsigned int priority;
|
||||
} ompi_osc_ucx_component_t;
|
||||
|
||||
|
@ -45,7 +45,12 @@ ompi_osc_ucx_component_t mca_osc_ucx_component = {
|
||||
.osc_query = component_query,
|
||||
.osc_select = component_select,
|
||||
.osc_finalize = component_finalize,
|
||||
}
|
||||
},
|
||||
.ucp_context = NULL,
|
||||
.ucp_worker = NULL,
|
||||
.env_initialized = false,
|
||||
.num_incomplete_req_ops = 0,
|
||||
.init_in_progress = 1
|
||||
};
|
||||
|
||||
ompi_osc_ucx_module_t ompi_osc_ucx_module_template = {
|
||||
@ -105,24 +110,19 @@ static int component_register(void) {
|
||||
}
|
||||
|
||||
static int progress_callback(void) {
|
||||
if (mca_osc_ucx_component.ucp_worker != NULL &&
|
||||
mca_osc_ucx_component.num_incomplete_req_ops > 0) {
|
||||
if ((mca_osc_ucx_component.ucp_worker != NULL) &&
|
||||
(mca_osc_ucx_component.num_incomplete_req_ops +
|
||||
mca_osc_ucx_component.init_in_progress > 0)) {
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) {
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
mca_osc_ucx_component.ucp_context = NULL;
|
||||
mca_osc_ucx_component.ucp_worker = NULL;
|
||||
mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads;
|
||||
mca_osc_ucx_component.env_initialized = false;
|
||||
mca_osc_ucx_component.num_incomplete_req_ops = 0;
|
||||
|
||||
opal_common_ucx_mca_register();
|
||||
return ret;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int component_finalize(void) {
|
||||
|
@ -57,6 +57,7 @@ void req_completion(void *request, ucs_status_t status) {
|
||||
ompi_request_complete(&(req->external_req->super), true);
|
||||
ucp_request_release(req);
|
||||
mca_osc_ucx_component.num_incomplete_req_ops--;
|
||||
mca_osc_ucx_component.init_in_progress = 0;
|
||||
assert(mca_osc_ucx_component.num_incomplete_req_ops >= 0);
|
||||
}
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user