Mostly fix some of the verbose output. Also fix an issue
where a memory handle was blocking another registration. This commit was SVN r26124.
Этот коммит содержится в:
родитель
b36b6639b2
Коммит
41870ce6ee
@ -373,8 +373,8 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
|
|||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
} else {
|
} else {
|
||||||
opal_output_verbose(20, mca_common_cuda_output,
|
opal_output_verbose(20, mca_common_cuda_output,
|
||||||
"CUDA: cuIpcGetMemHandle passed: base=%p",
|
"CUDA: cuIpcGetMemHandle passed: base=%p size=%d",
|
||||||
base);
|
base, (int)size);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Need to get the real base and size of the memory handle. This is
|
/* Need to get the real base and size of the memory handle. This is
|
||||||
@ -416,10 +416,10 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
|
|||||||
*/
|
*/
|
||||||
int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg)
|
int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg)
|
||||||
{
|
{
|
||||||
CUDA_DUMP_EVTHANDLE((10, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
|
CUDA_DUMP_EVTHANDLE((100, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
|
||||||
opal_output_verbose(5, mca_common_cuda_output,
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
"CUDA: cuda_ungetmemhandle: base=%p",
|
"CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base);
|
||||||
reg_data);
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -449,7 +449,8 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
|
|||||||
* to clear them out. */
|
* to clear them out. */
|
||||||
if (CUDA_ERROR_ALREADY_MAPPED == result) {
|
if (CUDA_ERROR_ALREADY_MAPPED == result) {
|
||||||
opal_output_verbose(10, mca_common_cuda_output,
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
"Failed to get handle for p=%p, signal upper layer\n", base);
|
"CUDA: cuIpcOpenMemHandle returned CUDA_ERROR_ALREADY_MAPPED for "
|
||||||
|
"p=%p,size=%d: notify memory pool\n", base, (int)size);
|
||||||
return OMPI_ERR_WOULD_BLOCK;
|
return OMPI_ERR_WOULD_BLOCK;
|
||||||
}
|
}
|
||||||
if (CUDA_SUCCESS != result) {
|
if (CUDA_SUCCESS != result) {
|
||||||
@ -459,8 +460,8 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
|
|||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
} else {
|
} else {
|
||||||
opal_output_verbose(10, mca_common_cuda_output,
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
"CUDA: cuIpcOpenMemHandle passed: base=%p",
|
"CUDA: cuIpcOpenMemHandle passed: base=%p (remote base=%p,size=%d)",
|
||||||
newreg->alloc_base);
|
newreg->alloc_base, base, (int)size);
|
||||||
CUDA_DUMP_MEMHANDLE((200, &memHandle, "cuIpcOpenMemHandle"));
|
CUDA_DUMP_MEMHANDLE((200, &memHandle, "cuIpcOpenMemHandle"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -484,7 +485,7 @@ int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg)
|
|||||||
opal_output_verbose(10, mca_common_cuda_output,
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
"CUDA: cuIpcCloseMemHandle passed: base=%p",
|
"CUDA: cuIpcCloseMemHandle passed: base=%p",
|
||||||
cuda_reg->base.alloc_base);
|
cuda_reg->base.alloc_base);
|
||||||
CUDA_DUMP_MEMHANDLE((10, cuda_reg->memHandle, "cuIpcCloseMemHandle"));
|
CUDA_DUMP_MEMHANDLE((100, cuda_reg->memHandle, "cuIpcCloseMemHandle"));
|
||||||
}
|
}
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -533,7 +534,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
|
|||||||
CUresult result;
|
CUresult result;
|
||||||
|
|
||||||
memcpy(&evtHandle, rget_reg->evtHandle, sizeof(evtHandle));
|
memcpy(&evtHandle, rget_reg->evtHandle, sizeof(evtHandle));
|
||||||
CUDA_DUMP_EVTHANDLE((2, &evtHandle, "stream_synchronize"));
|
CUDA_DUMP_EVTHANDLE((100, &evtHandle, "stream_synchronize"));
|
||||||
|
|
||||||
result = cuIpcOpenEventHandle(&event, evtHandle);
|
result = cuIpcOpenEventHandle(&event, evtHandle);
|
||||||
if (CUDA_SUCCESS != result){
|
if (CUDA_SUCCESS != result){
|
||||||
@ -705,7 +706,7 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
*frag = cuda_event_frag_array[cuda_event_status_first_used];
|
*frag = cuda_event_frag_array[cuda_event_status_first_used];
|
||||||
opal_output_verbose(5, mca_common_cuda_output,
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
"CUDA: cuEventQuery returned %d", result);
|
"CUDA: cuEventQuery returned %d", result);
|
||||||
|
|
||||||
/* Bump counters, loop around the circular buffer if necessary */
|
/* Bump counters, loop around the circular buffer if necessary */
|
||||||
@ -788,7 +789,7 @@ static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
|
|||||||
}
|
}
|
||||||
memcpy(&evtH, evtHandle, sizeof(evtH));
|
memcpy(&evtH, evtHandle, sizeof(evtH));
|
||||||
opal_output_verbose(verbose, mca_common_cuda_output,
|
opal_output_verbose(verbose, mca_common_cuda_output,
|
||||||
"%s:ctxId=%d, pid=%d, index=%d",
|
"CUDA: %s:ctxId=%d, pid=%d, index=%d",
|
||||||
str, (int)evtH.ctxId, evtH.pid, (int)evtH.index);
|
str, (int)evtH.ctxId, evtH.pid, (int)evtH.index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -259,7 +259,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
if (*reg != NULL) {
|
if (*reg != NULL) {
|
||||||
mpool_rgpusm->stat_cache_hit++;
|
mpool_rgpusm->stat_cache_hit++;
|
||||||
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
||||||
"Found addr=%p, size=%d (base=%p,size=%d)in cache",
|
"RGPUSM: Found addr=%p,size=%d (base=%p,size=%d) in cache",
|
||||||
addr, (int)size, (*reg)->base,
|
addr, (int)size, (*reg)->base,
|
||||||
(int)((*reg)->bound - (*reg)->base));
|
(int)((*reg)->bound - (*reg)->base));
|
||||||
|
|
||||||
@ -269,8 +269,11 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
} else {
|
} else {
|
||||||
/* This is an old registration. Need to boot it. */
|
/* This is an old registration. Need to boot it. */
|
||||||
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
||||||
"Mismatched Handle: Evicting addr=%p, size=%d in cache",
|
"RGPUSM: Mismatched Handle: Evicting/unregistering "
|
||||||
addr, (int)size);
|
"addr=%p,size=%d (base=%p,size=%d) from cache",
|
||||||
|
addr, (int)size, (*reg)->base,
|
||||||
|
(int)((*reg)->bound - (*reg)->base));
|
||||||
|
|
||||||
/* The ref_count has to be zero as this memory cannot possibly
|
/* The ref_count has to be zero as this memory cannot possibly
|
||||||
* be in use. Assert on that just to make sure. */
|
* be in use. Assert on that just to make sure. */
|
||||||
assert(0 == (*reg)->ref_count);
|
assert(0 == (*reg)->ref_count);
|
||||||
@ -295,13 +298,13 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
/* If we have a registration here, then we know it is valid. */
|
/* If we have a registration here, then we know it is valid. */
|
||||||
if (*reg != NULL) {
|
if (*reg != NULL) {
|
||||||
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
||||||
"CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
|
"RGPUSM: CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
|
||||||
mypeer, addr, (int)size);
|
mypeer, addr, (int)size);
|
||||||
|
|
||||||
/* When using leave pinned, we keep an LRU list. */
|
/* When using leave pinned, we keep an LRU list. */
|
||||||
if ((0 == (*reg)->ref_count) && mca_mpool_rgpusm_component.leave_pinned) {
|
if ((0 == (*reg)->ref_count) && mca_mpool_rgpusm_component.leave_pinned) {
|
||||||
opal_output_verbose(20, mca_mpool_rgpusm_component.output,
|
opal_output_verbose(20, mca_mpool_rgpusm_component.output,
|
||||||
"POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
|
"RGPUSM: POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
|
||||||
mypeer, addr, (int)size);
|
mypeer, addr, (int)size);
|
||||||
opal_list_remove_item(&mpool_rgpusm->lru_list,
|
opal_list_remove_item(&mpool_rgpusm->lru_list,
|
||||||
(opal_list_item_t*)(*reg));
|
(opal_list_item_t*)(*reg));
|
||||||
@ -310,7 +313,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
||||||
opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
|
opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
|
||||||
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
|
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
|
||||||
"Found entry in cache addr=%p, size=%d", addr, (int)size);
|
"RGPUSM: Found entry in cache addr=%p, size=%d", addr, (int)size);
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -318,7 +321,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
* so this is a new one, and we are going to use the cache. */
|
* so this is a new one, and we are going to use the cache. */
|
||||||
assert(NULL == *reg);
|
assert(NULL == *reg);
|
||||||
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
|
||||||
"New registration ep=%d, addr=%p, size=%d in cache",
|
"RGPUSM: New registration ep=%d, addr=%p, size=%d. Need to register and insert in cache",
|
||||||
mypeer, addr, (int)size);
|
mypeer, addr, (int)size);
|
||||||
|
|
||||||
OMPI_FREE_LIST_GET(&mpool_rgpusm->reg_list, item, rc);
|
OMPI_FREE_LIST_GET(&mpool_rgpusm->reg_list, item, rc);
|
||||||
@ -362,24 +365,43 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
* ensure we get the hit in the cache. */
|
* ensure we get the hit in the cache. */
|
||||||
mpool->rcache->rcache_find(mpool->rcache, addr, 4, &oldreg);
|
mpool->rcache->rcache_find(mpool->rcache, addr, 4, &oldreg);
|
||||||
RESTORE_PAGE_ALIGNMENT();
|
RESTORE_PAGE_ALIGNMENT();
|
||||||
/* The ref_count has to be zero as this memory cannot possibly
|
|
||||||
* be in use. Assert on that just to make sure. */
|
/* For most cases, we will find a registration that overlaps.
|
||||||
assert(0 == oldreg->ref_count);
|
* Removal of it should allow the registration we are
|
||||||
if (mca_mpool_rgpusm_component.leave_pinned) {
|
* attempting to succeed. */
|
||||||
opal_list_remove_item(&mpool_rgpusm->lru_list,
|
if (NULL != oldreg) {
|
||||||
(opal_list_item_t*)oldreg);
|
/* The ref_count has to be zero as this memory cannot
|
||||||
|
* possibly be in use. Assert on that just to make sure. */
|
||||||
|
assert(0 == oldreg->ref_count);
|
||||||
|
if (mca_mpool_rgpusm_component.leave_pinned) {
|
||||||
|
opal_list_remove_item(&mpool_rgpusm->lru_list,
|
||||||
|
(opal_list_item_t*)oldreg);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Bump the reference count to keep things copacetic in deregister */
|
||||||
|
oldreg->ref_count++;
|
||||||
|
/* Invalidate the registration so it will get booted out. */
|
||||||
|
oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
|
||||||
|
mca_mpool_rgpusm_deregister(mpool, oldreg);
|
||||||
|
mpool_rgpusm->stat_evicted++;
|
||||||
|
|
||||||
|
/* And try again. This one usually works. */
|
||||||
|
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
|
||||||
|
(mca_mpool_base_registration_t *)rget_reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Bump the reference count to keep things copacetic in deregister */
|
/* There is a chance that another registration is blocking our
|
||||||
oldreg->ref_count++;
|
* ability to register. Check the rc to see if we still need
|
||||||
/* Invalidate the registration so it will get booted out. */
|
* to try and clear out registrations. */
|
||||||
oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
|
while (OMPI_SUCCESS != rc) {
|
||||||
mca_mpool_rgpusm_deregister(mpool, oldreg);
|
if (true != mca_mpool_rgpusm_deregister_lru(mpool)) {
|
||||||
mpool_rgpusm->stat_evicted++;
|
rc = OMPI_ERROR;
|
||||||
|
break;
|
||||||
/* And try again. This only needs to be attempted one other time. */
|
}
|
||||||
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
|
/* Clear out one registration. */
|
||||||
(mca_mpool_base_registration_t *)rget_reg);
|
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
|
||||||
|
(mca_mpool_base_registration_t *)rget_reg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(rc != OMPI_SUCCESS) {
|
if(rc != OMPI_SUCCESS) {
|
||||||
@ -389,7 +411,7 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
}
|
}
|
||||||
|
|
||||||
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
|
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
|
||||||
"About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
|
"RGPUSM: About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
|
||||||
SET_PAGE_ALIGNMENT_TO_ZERO();
|
SET_PAGE_ALIGNMENT_TO_ZERO();
|
||||||
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
|
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
|
||||||
mca_mpool_rgpusm_component.rcache_size_limit)) ==
|
mca_mpool_rgpusm_component.rcache_size_limit)) ==
|
||||||
@ -404,10 +426,11 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
|
|||||||
if(rc != OMPI_SUCCESS) {
|
if(rc != OMPI_SUCCESS) {
|
||||||
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
||||||
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
|
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
|
||||||
/* We cannot recover from this. We can be here if the size of the cache
|
/* We cannot recover from this. We can be here if the size of
|
||||||
* is smaller than the amount of memory we are trying to register in a single
|
* the cache is smaller than the amount of memory we are
|
||||||
* transfer. In that case, rc is MPI_ERR_OUT_OF_RESOURCES, but everything is
|
* trying to register in a single transfer. In that case, rc
|
||||||
* stuck at that point. Therefore, just error out completely.
|
* is MPI_ERR_OUT_OF_RESOURCES, but everything is stuck at
|
||||||
|
* that point. Therefore, just error out completely.
|
||||||
*/
|
*/
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user