1
1

Mostly fix some of the verbose output. Also fix issue

where memory handle was blocking other registration.

This commit was SVN r26124.
Этот коммит содержится в:
Rolf vandeVaart 2012-03-09 21:28:56 +00:00
родитель b36b6639b2
Коммит 41870ce6ee
2 изменённых файлов: 65 добавлений и 41 удалений

Просмотреть файл

@ -373,8 +373,8 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
return OMPI_ERROR;
} else {
opal_output_verbose(20, mca_common_cuda_output,
"CUDA: cuIpcGetMemHandle passed: base=%p",
base);
"CUDA: cuIpcGetMemHandle passed: base=%p size=%d",
base, (int)size);
}
/* Need to get the real base and size of the memory handle. This is
@ -416,10 +416,10 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
*/
int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg)
{
CUDA_DUMP_EVTHANDLE((10, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
opal_output_verbose(5, mca_common_cuda_output,
"CUDA: cuda_ungetmemhandle: base=%p",
reg_data);
CUDA_DUMP_EVTHANDLE((100, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base);
return OMPI_SUCCESS;
}
@ -449,7 +449,8 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
* to clear them out. */
if (CUDA_ERROR_ALREADY_MAPPED == result) {
opal_output_verbose(10, mca_common_cuda_output,
"Failed to get handle for p=%p, signal upper layer\n", base);
"CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for "
"p=%p,size=%d: notify memory pool\n", base, (int)size);
return OMPI_ERR_WOULD_BLOCK;
}
if (CUDA_SUCCESS != result) {
@ -459,8 +460,8 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
return OMPI_ERROR;
} else {
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: cuIpcOpenMemHandle passed: base=%p",
newreg->alloc_base);
"CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)",
newreg->alloc_base, base, (int)size);
CUDA_DUMP_MEMHANDLE((200, &memHandle, "cuIpcOpenMemHandle"));
}
@ -484,7 +485,7 @@ int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg)
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: cuIpcCloseMemHandle passed: base=%p",
cuda_reg->base.alloc_base);
CUDA_DUMP_MEMHANDLE((10, cuda_reg->memHandle, "cuIpcCloseMemHandle"));
CUDA_DUMP_MEMHANDLE((100, cuda_reg->memHandle, "cuIpcCloseMemHandle"));
}
return OMPI_SUCCESS;
@ -533,7 +534,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
CUresult result;
memcpy(&evtHandle, rget_reg->evtHandle, sizeof(evtHandle));
CUDA_DUMP_EVTHANDLE((2, &evtHandle, "stream_synchronize"));
CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize"));
result = cuIpcOpenEventHandle(&event, evtHandle);
if (CUDA_SUCCESS != result){
@ -705,7 +706,7 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
}
*frag = cuda_event_frag_array[cuda_event_status_first_used];
opal_output_verbose(5, mca_common_cuda_output,
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: cuEventQuery returned %d", result);
/* Bump counters, loop around the circular buffer if necessary */
@ -788,7 +789,7 @@ static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
}
memcpy(&evtH, evtHandle, sizeof(evtH));
opal_output_verbose(verbose, mca_common_cuda_output,
"%s:ctxId=%d, pid=%d, index=%d",
"CUDA: %s:ctxId=%d, pid=%d, index=%d",
str, (int)evtH.ctxId, evtH.pid, (int)evtH.index);
}

Просмотреть файл

@ -259,7 +259,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
if (*reg != NULL) {
mpool_rgpusm->stat_cache_hit++;
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"Found addr=%p, size=%d (base=%p,size=%d)in cache",
"RGPUSM: Found addr=%p,size=%d (base=%p,size=%d) in cache",
addr, (int)size, (*reg)->base,
(int)((*reg)->bound - (*reg)->base));
@ -269,8 +269,11 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
} else {
/* This is an old registration. Need to boot it. */
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"Mismatched Handle: Evicting addr=%p, size=%d in cache",
addr, (int)size);
"RGPUSM: Mismatched Handle: Evicting/unregistering "
"addr=%p,size=%d (base=%p,size=%d) from cache",
addr, (int)size, (*reg)->base,
(int)((*reg)->bound - (*reg)->base));
/* The ref_count has to be zero as this memory cannot possibly
* be in use. Assert on that just to make sure. */
assert(0 == (*reg)->ref_count);
@ -295,13 +298,13 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
/* If we have a registration here, then we know it is valid. */
if (*reg != NULL) {
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
"RGPUSM: CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
mypeer, addr, (int)size);
/* When using leave pinned, we keep an LRU list. */
if ((0 == (*reg)->ref_count) && mca_mpool_rgpusm_component.leave_pinned) {
opal_output_verbose(20, mca_mpool_rgpusm_component.output,
"POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
"RGPUSM: POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
mypeer, addr, (int)size);
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)(*reg));
@ -310,7 +313,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
"Found entry in cache addr=%p, size=%d", addr, (int)size);
"RGPUSM: Found entry in cache addr=%p, size=%d", addr, (int)size);
return OMPI_SUCCESS;
}
@ -318,7 +321,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
* so this is a new one, and we are going to use the cache. */
assert(NULL == *reg);
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"New registration ep=%d, addr=%p, size=%d in cache",
"RGPUSM: New registration ep=%d, addr=%p, size=%d. Need to register and insert in cache",
mypeer, addr, (int)size);
OMPI_FREE_LIST_GET(&mpool_rgpusm->reg_list, item, rc);
@ -362,24 +365,43 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
* ensure we get the hit in the cache. */
mpool->rcache->rcache_find(mpool->rcache, addr, 4, &oldreg);
RESTORE_PAGE_ALIGNMENT();
/* The ref_count has to be zero as this memory cannot possibly
* be in use. Assert on that just to make sure. */
assert(0 == oldreg->ref_count);
if (mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)oldreg);
/* For most cases, we will find a registration that overlaps.
* Removal of it should allow the registration we are
* attempting to succeed. */
if (NULL != oldreg) {
/* The ref_count has to be zero as this memory cannot
* possibly be in use. Assert on that just to make sure. */
assert(0 == oldreg->ref_count);
if (mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)oldreg);
}
/* Bump the reference count to keep things copacetic in deregister */
oldreg->ref_count++;
/* Invalidate the registration so it will get booted out. */
oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
mca_mpool_rgpusm_deregister(mpool, oldreg);
mpool_rgpusm->stat_evicted++;
/* And try again. This one usually works. */
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
}
/* Bump the reference count to keep things copacetic in deregister */
oldreg->ref_count++;
/* Invalidate the registration so it will get booted out. */
oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
mca_mpool_rgpusm_deregister(mpool, oldreg);
mpool_rgpusm->stat_evicted++;
/* And try again. This only needs to be attempted one other time. */
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
/* There is a chance that another registration is blocking our
* ability to register. Check the rc to see if we still need
* to try and clear out registrations. */
while (OMPI_SUCCESS != rc) {
if (true != mca_mpool_rgpusm_deregister_lru(mpool)) {
rc = OMPI_ERROR;
break;
}
/* Clear out one registration. */
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
}
}
if(rc != OMPI_SUCCESS) {
@ -389,7 +411,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
}
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
"About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
"RGPUSM: About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
SET_PAGE_ALIGNMENT_TO_ZERO();
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
mca_mpool_rgpusm_component.rcache_size_limit)) ==
@ -404,10 +426,11 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
if(rc != OMPI_SUCCESS) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
/* We cannot recover from this. We can be here if the size of the cache
* is smaller than the amount of memory we are trying to register in a single
* transfer. In that case, rc is MPI_ERR_OUT_OF_RESOURCES, but everything is
* stuck at that point. Therefore, just error out completely.
/* We cannot recover from this. We can be here if the size of
* the cache is smaller than the amount of memory we are
* trying to register in a single transfer. In that case, rc
* is MPI_ERR_OUT_OF_RESOURCES, but everything is stuck at
* that point. Therefore, just error out completely.
*/
return OMPI_ERROR;
}