1
1

opal/common/ucx: Set of bug fixes in wpool

Signed-off-by: Xin Zhao <xinz@mellanox.com>
Этот коммит содержится в:
Xin Zhao 2018-12-01 12:12:39 -08:00 коммит произвёл Artem Polyakov
родитель 344bb641a1
Коммит 07cb4134be
2 изменённых файлов: 23 добавлений и 25 удалений

Просмотреть файл

@ -817,20 +817,19 @@ static void _common_ucx_tls_cleanup(_tlocal_table_t *tls)
// Cleanup memory table
size = tls->mem_tbl_size;
for (i = 0; i < size; i++) {
if (NULL == tls->mem_tbl[i]->gmem){
continue;
if (NULL != tls->mem_tbl[i]->gmem){
_tlocal_mem_record_cleanup(tls->mem_tbl[i]);
}
_tlocal_mem_record_cleanup(tls->mem_tbl[i]);
free(tls->mem_tbl[i]);
}
// Cleanup ctx table
size = tls->ctx_tbl_size;
for (i = 0; i < size; i++) {
if (NULL == tls->ctx_tbl[i]->gctx){
continue;
if (NULL != tls->ctx_tbl[i]->gctx){
_tlocal_ctx_record_cleanup(tls->ctx_tbl[i]);
}
_tlocal_ctx_record_cleanup(tls->ctx_tbl[i]);
free(tls->ctx_tbl[i]);
}
@ -918,7 +917,7 @@ static _tlocal_ctx_t *
_tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx)
{
size_t i, free_idx = -1;
int rc;
int rc, found = 0;
/* Try to find available record in the TLS table
* In parallel perform deferred cleanups */
@ -929,14 +928,15 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx)
_tlocal_ctx_record_cleanup(tls->ctx_tbl[i]);
}
}
if ((NULL != tls->ctx_tbl[i]->gctx) && (0 > free_idx)) {
if ((NULL == tls->ctx_tbl[i]->gctx) && !found) {
/* Found clean record */
free_idx = i;
found = 1;
}
}
/* if needed - extend the table */
if (0 > free_idx) {
if (!found) {
free_idx = tls->ctx_tbl_size;
rc = _tlocal_tls_ctxtbl_extend(tls, 4);
if (rc) {
@ -1025,15 +1025,6 @@ _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec)
size_t i;
WPOOL_DBG_OUT(_dbg_tls || _dbg_mem, "record=%p, is_freed = %d\n",
(void *)mem_rec, mem_rec->gmem->released);
if (mem_rec->gmem->released) {
return;
}
/* Remove myself from the memory context structure
* This may result in context release as we are using
* delayed cleanup */
_common_ucx_mem_signout(mem_rec->gmem);
WPOOL_DBG_OUT(_dbg_tls || _dbg_mem, "gmem = %p mem_rec = %p\n",
(void *)mem_rec->gmem, (void *)mem_rec);
for(i = 0; i < mem_rec->gmem->ctx->comm_size; i++) {
if (mem_rec->mem->rkeys[i]) {
@ -1044,6 +1035,13 @@ _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec)
}
free(mem_rec->mem->rkeys);
/* Remove myself from the memory context structure
* This may result in context release as we are using
* delayed cleanup */
_common_ucx_mem_signout(mem_rec->gmem);
WPOOL_DBG_OUT(_dbg_tls || _dbg_mem, "gmem = %p mem_rec = %p\n",
(void *)mem_rec->gmem, (void *)mem_rec);
/* Release fast-path pointers */
if (NULL != mem_rec->mem_tls_ptr) {
free(mem_rec->mem_tls_ptr);
@ -1059,24 +1057,24 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls,
{
size_t i, free_idx = -1;
_tlocal_ctx_t *ctx_rec = NULL;
int rc = OPAL_SUCCESS;
int rc = OPAL_SUCCESS, found = 0;
/* Try to find available spot in the table */
for (i=0; i<tls->mem_tbl_size; i++) {
if (NULL == tls->mem_tbl[i]->gmem) {
if (NULL != tls->mem_tbl[i]->gmem) {
if (tls->mem_tbl[i]->gmem->released) {
/* Found a dirty record. Need to clean it first */
_tlocal_mem_record_cleanup(tls->mem_tbl[i]);
break;
}
}
if ((NULL == tls->mem_tbl[i]->gmem) && (0 > free_idx)) {
if ((NULL == tls->mem_tbl[i]->gmem) && !found) {
/* Found a clear record */
free_idx = i;
found = 1;
}
}
if (0 > free_idx){
if (!found){
free_idx = tls->mem_tbl_size;
rc = _tlocal_tls_memtbl_extend(tls, 4);
if (rc != OPAL_SUCCESS) {

Просмотреть файл

@ -110,7 +110,7 @@ typedef int (*opal_common_ucx_exchange_func_t)(void *my_info, size_t my_info_len
void *metadata);
/* For developer use only */
#define OPAL_COMMON_UCX_WPOOL_DBG
//#define OPAL_COMMON_UCX_WPOOL_DBG
#ifdef OPAL_COMMON_UCX_WPOOL_DBG
extern __thread FILE *tls_pf;
extern __thread int initialized;
@ -168,7 +168,7 @@ static inline void opal_common_ucx_wpool_dbg_init(void)
}
#else
#define DBG_OUT(...)
#define WPOOL_DBG_OUT(...)
#endif