
Add runtime support to turn off CUDA IPC support.

This commit was SVN r29444.
This commit is contained in:
Rolf vandeVaart 2013-10-16 16:48:18 +00:00
parent 9f83405c78
commit 0cd1e8dfd9
3 changed files with 39 additions and 9 deletions
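With this change, CUDA IPC in the smcuda BTL can be switched off at run time through Open MPI's MCA parameter mechanism. A hedged usage sketch (the parameter names follow from the registration in smcuda_register() below; the process count and application name are placeholders):

    # Disable CUDA IPC in the smcuda BTL entirely
    mpirun --mca btl_smcuda_use_cuda_ipc 0 -np 2 ./my_app

    # Keep CUDA IPC, but do not assume it works between ranks selecting the same GPU
    mpirun --mca btl_smcuda_use_cuda_ipc_same_gpu 0 -np 2 ./my_app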

View File

@@ -921,7 +921,7 @@ int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
 
 #if OMPI_CUDA_SUPPORT
     /* Initiate setting up CUDA IPC support. */
-    if (mca_common_cuda_enabled && (IPC_INIT == endpoint->ipcstate)) {
+    if (mca_common_cuda_enabled && (IPC_INIT == endpoint->ipcstate) && mca_btl_smcuda_component.use_cuda_ipc) {
         mca_btl_smcuda_send_cuda_ipc_request(btl, endpoint);
     }
 #endif /* OMPI_CUDA_SUPPORT */
@@ -1004,10 +1004,12 @@ int mca_btl_smcuda_send( struct mca_btl_base_module_t* btl,
         mca_btl_smcuda_component_progress();
     }
 
+#if OMPI_CUDA_SUPPORT
     /* Initiate setting up CUDA IPC support */
-    if (mca_common_cuda_enabled && (IPC_INIT == endpoint->ipcstate)) {
+    if (mca_common_cuda_enabled && (IPC_INIT == endpoint->ipcstate) && mca_btl_smcuda_component.use_cuda_ipc) {
         mca_btl_smcuda_send_cuda_ipc_request(btl, endpoint);
     }
+#endif /* OMPI_CUDA_SUPPORT */
 
     /* available header space */
     frag->hdr->len = frag->segment.base.seg_len;

View File

@@ -13,7 +13,7 @@
  * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2010      Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2012-2013 NVIDIA Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -205,6 +205,8 @@ struct mca_btl_smcuda_component_t {
 #if OMPI_CUDA_SUPPORT
     int cuda_ipc_verbose;
     int cuda_ipc_output;
+    int use_cuda_ipc;
+    int use_cuda_ipc_same_gpu;
 #endif /* OMPI_CUDA_SUPPORT */
 };
 typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t;

View File

@@ -170,6 +170,8 @@ static int smcuda_register(void)
     } else {
         mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW;
     }
+    mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, &mca_btl_smcuda_component.use_cuda_ipc);
+    mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, &mca_btl_smcuda_component.use_cuda_ipc_same_gpu);
     mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, &mca_btl_smcuda_component.cuda_ipc_verbose);
     mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL);
     opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose);
@@ -734,12 +736,36 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl,
         return;
     }
 
-    /* Check for IPC support between devices.  If the CUDA API call fails, then
-     * just move endpoint into bad state.  No need to send a reply. */
-    res = mca_common_cuda_device_can_access_peer(&ipcaccess, mydevnum, ctrlhdr.cudev);
-    if (0 != res) {
-        endpoint->ipcstate = IPC_BAD;
-        return;
+    /* Check for IPC support between devices.  If they are the
+     * same device and use_cuda_ipc_same_gpu is 1 (default),
+     * then assume CUDA IPC is possible.  This could be a
+     * device running in DEFAULT mode or running under MPS.
+     * Otherwise, check peer access to determine CUDA IPC
+     * support.  If the CUDA API call fails, then just move
+     * endpoint into bad state.  No need to send a reply. */
+    if (mydevnum == ctrlhdr.cudev) {
+        if (mca_btl_smcuda_component.use_cuda_ipc_same_gpu) {
+            ipcaccess = 1;
+        } else {
+            opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
+                                "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
+                                "peerdev=%d --> Access is disabled by btl_smcuda_use_cuda_ipc_same_gpu",
+                                endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
+                                ctrlhdr.cudev);
+            endpoint->ipcstate = IPC_BAD;
+            return;
+        }
+    } else {
+        res = mca_common_cuda_device_can_access_peer(&ipcaccess, mydevnum, ctrlhdr.cudev);
+        if (0 != res) {
+            opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
+                                "Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
+                                "peerdev=%d --> Access is disabled because peer check failed with err=%d",
+                                endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
+                                ctrlhdr.cudev, res);
+            endpoint->ipcstate = IPC_BAD;
+            return;
+        }
     }
 
     assert(endpoint->peer_smp_rank == frag->hdr->my_smp_rank);