1
1

Don't allow any asynchronous CUDA operations.

There are 2 reasons for this:
- pending CUDA events are not progressed by this BTL, so anything that becomes
  asynchronous will never be completed.
- we use the packed data on the shared memory backing file, and this will be
  returned to the peer process upon return (thus if we copy asynchronously we
  might not copy the right data).

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Этот коммит содержится в:
George Bosilca 2020-07-15 01:37:09 -04:00
родитель 0e32b0acef
Коммит 8bc1f3d8fb
2 изменённых файлов: 39 добавлений и 33 удалений

Просмотреть файл

@ -1197,7 +1197,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b
if (endpoint->ipcstate != IPC_INIT) {
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return;
} else {
}
endpoint->ipctries++;
if (endpoint->ipctries > MAXTRIES) {
endpoint->ipcstate = IPC_BAD;
@ -1207,7 +1207,6 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b
/* All is good. Set up state and continue. */
endpoint->ipcstate = IPC_SENT;
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
}
if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) {
mca_btl_smcuda_component_progress();

Просмотреть файл

@ -136,8 +136,15 @@ static inline unsigned int mca_btl_smcuda_param_register_uint(
return *storage;
}
static int mca_btl_smcuda_component_verify(void) {
/*
 * Verify the smcuda BTL parameters.
 *
 * Unconditionally clears MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV and
 * MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND from the module's btl_flags, emitting a
 * verbose message first if either flag was set, and then returns whatever
 * mca_btl_base_param_verify() reports for the module.
 */
static int mca_btl_smcuda_component_verify(void)
{
/* We cannot support async memcpy right now */
if( (mca_btl_smcuda.super.btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV) ||
(mca_btl_smcuda.super.btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND) ) {
/* Only report when a flag was actually requested and is being dropped. */
opal_output_verbose(10, opal_btl_base_framework.framework_output,
"btl: smcuda: disable all asynchronous memcpy support");
}
/* Asynchronous copies are disabled because this BTL does not progress pending
 * CUDA events, and the packed data lives in the shared-memory backing file
 * that is handed back to the peer process on return. */
mca_btl_smcuda.super.btl_flags &= ~(MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV | MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND);
return mca_btl_base_param_verify(&mca_btl_smcuda.super);
}