Mostly fixes to the verbose output. Also fix an issue where an existing memory handle
registration could block a new registration: keep evicting LRU registrations and retrying until it succeeds. This commit was SVN r26124.
Этот коммит содержится в:
родитель
b36b6639b2
Коммит
41870ce6ee
@ -373,8 +373,8 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
|
||||
return OMPI_ERROR;
|
||||
} else {
|
||||
opal_output_verbose(20, mca_common_cuda_output,
|
||||
"CUDA: cuIpcGetMemHandle passed: base=%p",
|
||||
base);
|
||||
"CUDA: cuIpcGetMemHandle passed: base=%p size=%d",
|
||||
base, (int)size);
|
||||
}
|
||||
|
||||
/* Need to get the real base and size of the memory handle. This is
|
||||
@ -416,10 +416,10 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
|
||||
*/
|
||||
int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
{
|
||||
CUDA_DUMP_EVTHANDLE((10, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
|
||||
opal_output_verbose(5, mca_common_cuda_output,
|
||||
"CUDA: cuda_ungetmemhandle: base=%p",
|
||||
reg_data);
|
||||
CUDA_DUMP_EVTHANDLE((100, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -449,7 +449,8 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
|
||||
* to clear them out. */
|
||||
if (CUDA_ERROR_ALREADY_MAPPED == result) {
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"Failed to get handle for p=%p, signal upper layer\n", base);
|
||||
"CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for "
|
||||
"p=%p,size=%d: notify memory pool\n", base, (int)size);
|
||||
return OMPI_ERR_WOULD_BLOCK;
|
||||
}
|
||||
if (CUDA_SUCCESS != result) {
|
||||
@ -459,8 +460,8 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
|
||||
return OMPI_ERROR;
|
||||
} else {
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: cuIpcOpenMemHandle passed: base=%p",
|
||||
newreg->alloc_base);
|
||||
"CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)",
|
||||
newreg->alloc_base, base, (int)size);
|
||||
CUDA_DUMP_MEMHANDLE((200, &memHandle, "cuIpcOpenMemHandle"));
|
||||
}
|
||||
|
||||
@ -484,7 +485,7 @@ int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: cuIpcCloseMemHandle passed: base=%p",
|
||||
cuda_reg->base.alloc_base);
|
||||
CUDA_DUMP_MEMHANDLE((10, cuda_reg->memHandle, "cuIpcCloseMemHandle"));
|
||||
CUDA_DUMP_MEMHANDLE((100, cuda_reg->memHandle, "cuIpcCloseMemHandle"));
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -533,7 +534,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
|
||||
CUresult result;
|
||||
|
||||
memcpy(&evtHandle, rget_reg->evtHandle, sizeof(evtHandle));
|
||||
CUDA_DUMP_EVTHANDLE((2, &evtHandle, "stream_synchronize"));
|
||||
CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize"));
|
||||
|
||||
result = cuIpcOpenEventHandle(&event, evtHandle);
|
||||
if (CUDA_SUCCESS != result){
|
||||
@ -705,7 +706,7 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
|
||||
}
|
||||
|
||||
*frag = cuda_event_frag_array[cuda_event_status_first_used];
|
||||
opal_output_verbose(5, mca_common_cuda_output,
|
||||
opal_output_verbose(10, mca_common_cuda_output,
|
||||
"CUDA: cuEventQuery returned %d", result);
|
||||
|
||||
/* Bump counters, loop around the circular buffer if necessary */
|
||||
@ -788,7 +789,7 @@ static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
|
||||
}
|
||||
memcpy(&evtH, evtHandle, sizeof(evtH));
|
||||
opal_output_verbose(verbose, mca_common_cuda_output,
|
||||
"%s:ctxId=%d, pid=%d, index=%d",
|
||||
"CUDA: %s:ctxId=%d, pid=%d, index=%d",
|
||||
str, (int)evtH.ctxId, evtH.pid, (int)evtH.index);
|
||||
}
|
||||
|
||||
|
@ -259,7 +259,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
if (*reg != NULL) {
|
||||
mpool_rgpusm->stat_cache_hit++;
|
||||
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
||||
"Found addr=%p, size=%d (base=%p,size=%d)in cache",
|
||||
"RGPUSM: Found addr=%p,size=%d (base=%p,size=%d) in cache",
|
||||
addr, (int)size, (*reg)->base,
|
||||
(int)((*reg)->bound - (*reg)->base));
|
||||
|
||||
@ -269,8 +269,11 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
} else {
|
||||
/* This is an old registration. Need to boot it. */
|
||||
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
||||
"Mismatched Handle: Evicting addr=%p, size=%d in cache",
|
||||
addr, (int)size);
|
||||
"RGPUSM: Mismatched Handle: Evicting/unregistering "
|
||||
"addr=%p,size=%d (base=%p,size=%d) from cache",
|
||||
addr, (int)size, (*reg)->base,
|
||||
(int)((*reg)->bound - (*reg)->base));
|
||||
|
||||
/* The ref_count has to be zero as this memory cannot possibly
|
||||
* be in use. Assert on that just to make sure. */
|
||||
assert(0 == (*reg)->ref_count);
|
||||
@ -295,13 +298,13 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
/* If we have a registration here, then we know it is valid. */
|
||||
if (*reg != NULL) {
|
||||
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
||||
"CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
|
||||
"RGPUSM: CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
|
||||
mypeer, addr, (int)size);
|
||||
|
||||
/* When using leave pinned, we keep an LRU list. */
|
||||
if ((0 == (*reg)->ref_count) && mca_mpool_rgpusm_component.leave_pinned) {
|
||||
opal_output_verbose(20, mca_mpool_rgpusm_component.output,
|
||||
"POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
|
||||
"RGPUSM: POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
|
||||
mypeer, addr, (int)size);
|
||||
opal_list_remove_item(&mpool_rgpusm->lru_list,
|
||||
(opal_list_item_t*)(*reg));
|
||||
@ -310,7 +313,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
||||
opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
|
||||
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
|
||||
"Found entry in cache addr=%p, size=%d", addr, (int)size);
|
||||
"RGPUSM: Found entry in cache addr=%p, size=%d", addr, (int)size);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -318,7 +321,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
* so this is a new one, and we are going to use the cache. */
|
||||
assert(NULL == *reg);
|
||||
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
||||
"New registration ep=%d, addr=%p, size=%d in cache",
|
||||
"RGPUSM: New registration ep=%d, addr=%p, size=%d. Need to register and insert in cache",
|
||||
mypeer, addr, (int)size);
|
||||
|
||||
OMPI_FREE_LIST_GET(&mpool_rgpusm->reg_list, item, rc);
|
||||
@ -362,24 +365,43 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
* ensure we get the hit in the cache. */
|
||||
mpool->rcache->rcache_find(mpool->rcache, addr, 4, &oldreg);
|
||||
RESTORE_PAGE_ALIGNMENT();
|
||||
/* The ref_count has to be zero as this memory cannot possibly
|
||||
* be in use. Assert on that just to make sure. */
|
||||
assert(0 == oldreg->ref_count);
|
||||
if (mca_mpool_rgpusm_component.leave_pinned) {
|
||||
opal_list_remove_item(&mpool_rgpusm->lru_list,
|
||||
(opal_list_item_t*)oldreg);
|
||||
|
||||
/* For most cases, we will find a registration that overlaps.
|
||||
* Removal of it should allow the registration we are
|
||||
* attempting to succeed. */
|
||||
if (NULL != oldreg) {
|
||||
/* The ref_count has to be zero as this memory cannot
|
||||
* possibly be in use. Assert on that just to make sure. */
|
||||
assert(0 == oldreg->ref_count);
|
||||
if (mca_mpool_rgpusm_component.leave_pinned) {
|
||||
opal_list_remove_item(&mpool_rgpusm->lru_list,
|
||||
(opal_list_item_t*)oldreg);
|
||||
}
|
||||
|
||||
/* Bump the reference count to keep things copacetic in deregister */
|
||||
oldreg->ref_count++;
|
||||
/* Invalidate the registration so it will get booted out. */
|
||||
oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
|
||||
mca_mpool_rgpusm_deregister(mpool, oldreg);
|
||||
mpool_rgpusm->stat_evicted++;
|
||||
|
||||
/* And try again. This one usually works. */
|
||||
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
|
||||
(mca_mpool_base_registration_t *)rget_reg);
|
||||
}
|
||||
|
||||
/* Bump the reference count to keep things copacetic in deregister */
|
||||
oldreg->ref_count++;
|
||||
/* Invalidate the registration so it will get booted out. */
|
||||
oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
|
||||
mca_mpool_rgpusm_deregister(mpool, oldreg);
|
||||
mpool_rgpusm->stat_evicted++;
|
||||
|
||||
/* And try again. This only needs to be attempted one other time. */
|
||||
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
|
||||
(mca_mpool_base_registration_t *)rget_reg);
|
||||
/* There is a chance that another registration is blocking our
|
||||
* ability to register. Check the rc to see if we still need
|
||||
* to try and clear out registrations. */
|
||||
while (OMPI_SUCCESS != rc) {
|
||||
if (true != mca_mpool_rgpusm_deregister_lru(mpool)) {
|
||||
rc = OMPI_ERROR;
|
||||
break;
|
||||
}
|
||||
/* Clear out one registration. */
|
||||
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
|
||||
(mca_mpool_base_registration_t *)rget_reg);
|
||||
}
|
||||
}
|
||||
|
||||
if(rc != OMPI_SUCCESS) {
|
||||
@ -389,7 +411,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
}
|
||||
|
||||
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
|
||||
"About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
|
||||
"RGPUSM: About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
|
||||
SET_PAGE_ALIGNMENT_TO_ZERO();
|
||||
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
|
||||
mca_mpool_rgpusm_component.rcache_size_limit)) ==
|
||||
@ -404,10 +426,11 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
||||
if(rc != OMPI_SUCCESS) {
|
||||
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
||||
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
|
||||
/* We cannot recover from this. We can be here if the size of the cache
|
||||
* is smaller than the amount of memory we are trying to register in a single
|
||||
* transfer. In that case, rc is MPI_ERR_OUT_OF_RESOURCES, but everything is
|
||||
* stuck at that point. Therefore, just error out completely.
|
||||
/* We cannot recover from this. We can be here if the size of
|
||||
* the cache is smaller than the amount of memory we are
|
||||
* trying to register in a single transfer. In that case, rc
|
||||
* is MPI_ERR_OUT_OF_RESOURCES, but everything is stuck at
|
||||
* that point. Therefore, just error out completely.
|
||||
*/
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user