OSC/UCX: fixed hang on OSC init
- worker progress was missed on startup, which caused a hang on one of the ranks. Signed-off-by: Sergey Oblomov <sergeyo@mellanox.com>
Этот коммит содержится в:
родитель
4447738098
Коммит
a081fba046
@ -38,6 +38,7 @@ typedef struct ompi_osc_ucx_component {
|
|||||||
opal_free_list_t requests; /* request free list for the r* communication variants */
|
opal_free_list_t requests; /* request free list for the r* communication variants */
|
||||||
bool env_initialized; /* UCX environment is initialized or not */
|
bool env_initialized; /* UCX environment is initialized or not */
|
||||||
int num_incomplete_req_ops;
|
int num_incomplete_req_ops;
|
||||||
|
int init_in_progress;
|
||||||
unsigned int priority;
|
unsigned int priority;
|
||||||
} ompi_osc_ucx_component_t;
|
} ompi_osc_ucx_component_t;
|
||||||
|
|
||||||
|
@ -45,7 +45,12 @@ ompi_osc_ucx_component_t mca_osc_ucx_component = {
|
|||||||
.osc_query = component_query,
|
.osc_query = component_query,
|
||||||
.osc_select = component_select,
|
.osc_select = component_select,
|
||||||
.osc_finalize = component_finalize,
|
.osc_finalize = component_finalize,
|
||||||
}
|
},
|
||||||
|
.ucp_context = NULL,
|
||||||
|
.ucp_worker = NULL,
|
||||||
|
.env_initialized = false,
|
||||||
|
.num_incomplete_req_ops = 0,
|
||||||
|
.init_in_progress = 1
|
||||||
};
|
};
|
||||||
|
|
||||||
ompi_osc_ucx_module_t ompi_osc_ucx_module_template = {
|
ompi_osc_ucx_module_t ompi_osc_ucx_module_template = {
|
||||||
@ -105,24 +110,19 @@ static int component_register(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int progress_callback(void) {
|
static int progress_callback(void) {
|
||||||
if (mca_osc_ucx_component.ucp_worker != NULL &&
|
if ((mca_osc_ucx_component.ucp_worker != NULL) &&
|
||||||
mca_osc_ucx_component.num_incomplete_req_ops > 0) {
|
(mca_osc_ucx_component.num_incomplete_req_ops +
|
||||||
|
mca_osc_ucx_component.init_in_progress > 0)) {
|
||||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) {
|
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) {
|
||||||
int ret = OMPI_SUCCESS;
|
|
||||||
|
|
||||||
mca_osc_ucx_component.ucp_context = NULL;
|
|
||||||
mca_osc_ucx_component.ucp_worker = NULL;
|
|
||||||
mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads;
|
mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads;
|
||||||
mca_osc_ucx_component.env_initialized = false;
|
|
||||||
mca_osc_ucx_component.num_incomplete_req_ops = 0;
|
|
||||||
|
|
||||||
opal_common_ucx_mca_register();
|
opal_common_ucx_mca_register();
|
||||||
return ret;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int component_finalize(void) {
|
static int component_finalize(void) {
|
||||||
|
@ -57,6 +57,7 @@ void req_completion(void *request, ucs_status_t status) {
|
|||||||
ompi_request_complete(&(req->external_req->super), true);
|
ompi_request_complete(&(req->external_req->super), true);
|
||||||
ucp_request_release(req);
|
ucp_request_release(req);
|
||||||
mca_osc_ucx_component.num_incomplete_req_ops--;
|
mca_osc_ucx_component.num_incomplete_req_ops--;
|
||||||
|
mca_osc_ucx_component.init_in_progress = 0;
|
||||||
assert(mca_osc_ucx_component.num_incomplete_req_ops >= 0);
|
assert(mca_osc_ucx_component.num_incomplete_req_ops >= 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user