From 8bc1f3d8fbe9a31179d160f43790fd38ce7f66e5 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 15 Jul 2020 01:37:09 -0400 Subject: [PATCH] Don't allow any asynchronous CUDA operations. There are 2 reasons for this: - pending CUDA events are not progressed by this BTL, so anything that becomes asynchronous will never be completed. - we use the packed data on the shared memory backing file, and this will be returned to the peer process upon return (thus if we copy asynchronously we might not copy the right data). Signed-off-by: George Bosilca --- opal/mca/btl/smcuda/btl_smcuda.c | 19 ++++---- opal/mca/btl/smcuda/btl_smcuda_component.c | 53 ++++++++++++---------- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 70499fcaa9..c41808fcf6 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1197,17 +1197,16 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b if (endpoint->ipcstate != IPC_INIT) { OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return; - } else { - endpoint->ipctries++; - if (endpoint->ipctries > MAXTRIES) { - endpoint->ipcstate = IPC_BAD; - OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); - return; - } - /* All is good. Set up state and continue. */ - endpoint->ipcstate = IPC_SENT; - OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); } + endpoint->ipctries++; + if (endpoint->ipctries > MAXTRIES) { + endpoint->ipcstate = IPC_BAD; + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + return; + } + /* All is good. Set up state and continue. 
*/ + endpoint->ipcstate = IPC_SENT; + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) { mca_btl_smcuda_component_progress(); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 3b770bb1a4..c31c6d9a93 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -136,8 +136,15 @@ static inline unsigned int mca_btl_smcuda_param_register_uint( return *storage; } -static int mca_btl_smcuda_component_verify(void) { - +static int mca_btl_smcuda_component_verify(void) +{ + /* We cannot support async memcpy right now */ + if( (mca_btl_smcuda.super.btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV) || + (mca_btl_smcuda.super.btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND) ) { + opal_output_verbose(10, opal_btl_base_framework.framework_output, + "btl: smcuda: disable all asynchronous memcpy support"); + } + mca_btl_smcuda.super.btl_flags &= ~(MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV | MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND); return mca_btl_base_param_verify(&mca_btl_smcuda.super); } @@ -1100,27 +1107,27 @@ int mca_btl_smcuda_component_progress(void) } goto recheck_peer; } - default: - /* unknown */ - /* - * This code path should presumably never be called. - * It's unclear if it should exist or, if so, how it should be written. - * If we want to return it to the sending process, - * we have to figure out who the sender is. - * It seems we need to subtract the mask bits. - * Then, hopefully this is an sm header that has an smp_rank field. - * Presumably that means the received header was relative. - * Or, maybe this code should just be removed. 
- */ - opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header"); - hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr); - peer_smp_rank = hdr->my_smp_rank; - hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag | - MCA_BTL_SMCUDA_FRAG_STATUS_MASK); - MCA_BTL_SMCUDA_FIFO_WRITE( - mca_btl_smcuda_component.sm_peers[peer_smp_rank], - my_smp_rank, peer_smp_rank, hdr, false, true, rc); - break; + default: + /* unknown */ + /* + * This code path should presumably never be called. + * It's unclear if it should exist or, if so, how it should be written. + * If we want to return it to the sending process, + * we have to figure out who the sender is. + * It seems we need to subtract the mask bits. + * Then, hopefully this is an sm header that has an smp_rank field. + * Presumably that means the received header was relative. + * Or, maybe this code should just be removed. + */ + opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header"); + hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr); + peer_smp_rank = hdr->my_smp_rank; + hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag | + MCA_BTL_SMCUDA_FRAG_STATUS_MASK); + MCA_BTL_SMCUDA_FIFO_WRITE( + mca_btl_smcuda_component.sm_peers[peer_smp_rank], + my_smp_rank, peer_smp_rank, hdr, false, true, rc); + break; } } (void)rc; /* this is safe to ignore as the message is requeued till success */