diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c index a1c7b84817..749ed2c163 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c @@ -120,6 +120,7 @@ typedef struct { mca_btl_openib_endpoint_t *endpoint; uint8_t qpnum; bool already_disconnected; + uint16_t route_retry_count; struct rdma_cm_id *id; } id_context_t; @@ -151,7 +152,8 @@ static struct rdma_event_channel *event_channel = NULL; static int rdmacm_priority = 30; static uint16_t rdmacm_port = 0; static uint32_t rdmacm_addr = 0; -static int rdmacm_resolve_timeout = 2000; +static int rdmacm_resolve_timeout = 1000; +static int rdmacm_resolve_max_retry_count = 20; static bool rdmacm_reject_causes_connect_error = false; static volatile int disconnect_callbacks = 0; static bool rdmacm_component_initialized = false; @@ -173,6 +175,7 @@ static void id_context_constructor(id_context_t *context) context->contents = NULL; context->endpoint = NULL; context->qpnum = 255; + context->route_retry_count = 0; } static void id_context_destructor(id_context_t *context) @@ -245,6 +248,17 @@ static void rdmacm_component_register(void) "illegal timeout", true, value); } + mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version, + "connect_rdmacm_retry_count", + "Maximum number of times rdmacm will retry route resolution", + false, false, rdmacm_resolve_max_retry_count, &value); + if (value > 0) { + rdmacm_resolve_max_retry_count = value; + } else { + orte_show_help("help-mpi-btl-openib-cpc-rdmacm.txt", + "illegal retry count", true, value); + } + mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version, "connect_rdmacm_reject_causes_connect_error", "The drivers for some devices are buggy such that an RDMA REJECT action may result in a CONNECT_ERROR event instead of a REJECTED event. 
Setting this MCA parameter to true tells Open MPI to treat CONNECT_ERROR events on connections where a REJECT is expected as a REJECT (default: false)", @@ -1542,12 +1556,26 @@ static int event_handler(struct rdma_cm_event *event) case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_CONNECT_RESPONSE: case RDMA_CM_EVENT_ADDR_ERROR: - case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_DEVICE_REMOVAL: ompi_btl_openib_fd_run_in_main(show_help_rdmacm_event_error, event); rc = OMPI_ERROR; break; + case RDMA_CM_EVENT_ROUTE_ERROR: + /* Route lookup does not necessarily handle retries, and there + appear to be cases where the subnet manager node can no + longer handle incoming requests. The RDMA connection + manager and lower-level code do not handle retries, so we + have to. */ + if (context->route_retry_count < rdmacm_resolve_max_retry_count) { + context->route_retry_count++; + rc = resolve_route(context); + break; + } + ompi_btl_openib_fd_run_in_main(show_help_rdmacm_event_error, event); + rc = OMPI_ERROR; + break; + default: /* Unknown error */ BTL_ERROR(("Unknown RDMA CM error event_handler: %s, status = %d",