RDMA CM doesn't retry if a packet is dropped; it just times out during route
discovery, and we don't recover from that timeout. Instead, try to recover by retrying route resolution a couple of times. This commit was SVN r21619.
Этот коммит содержится в:
родитель
d7d07e0720
Коммит
ac34b1de69
@ -120,6 +120,7 @@ typedef struct {
|
||||
mca_btl_openib_endpoint_t *endpoint;
|
||||
uint8_t qpnum;
|
||||
bool already_disconnected;
|
||||
uint16_t route_retry_count;
|
||||
struct rdma_cm_id *id;
|
||||
} id_context_t;
|
||||
|
||||
@ -151,7 +152,8 @@ static struct rdma_event_channel *event_channel = NULL;
|
||||
static int rdmacm_priority = 30;
|
||||
static uint16_t rdmacm_port = 0;
|
||||
static uint32_t rdmacm_addr = 0;
|
||||
static int rdmacm_resolve_timeout = 2000;
|
||||
static int rdmacm_resolve_timeout = 1000;
|
||||
static int rdmacm_resolve_max_retry_count = 20;
|
||||
static bool rdmacm_reject_causes_connect_error = false;
|
||||
static volatile int disconnect_callbacks = 0;
|
||||
static bool rdmacm_component_initialized = false;
|
||||
@ -173,6 +175,7 @@ static void id_context_constructor(id_context_t *context)
|
||||
context->contents = NULL;
|
||||
context->endpoint = NULL;
|
||||
context->qpnum = 255;
|
||||
context->route_retry_count = 0;
|
||||
}
|
||||
|
||||
static void id_context_destructor(id_context_t *context)
|
||||
@ -245,6 +248,17 @@ static void rdmacm_component_register(void)
|
||||
"illegal timeout", true, value);
|
||||
}
|
||||
|
||||
mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version,
|
||||
"connect_rdmacm_retry_count",
|
||||
"Maximum number of times rdmacm will retry route resolution",
|
||||
false, false, rdmacm_resolve_max_retry_count, &value);
|
||||
if (value > 0) {
|
||||
rdmacm_resolve_max_retry_count = value;
|
||||
} else {
|
||||
orte_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
|
||||
"illegal retry count", true, value);
|
||||
}
|
||||
|
||||
mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version,
|
||||
"connect_rdmacm_reject_causes_connect_error",
|
||||
"The drivers for some devices are buggy such that an RDMA REJECT action may result in a CONNECT_ERROR event instead of a REJECTED event. Setting this MCA parameter to true tells Open MPI to treat CONNECT_ERROR events on connections where a REJECT is expected as a REJECT (default: false)",
|
||||
@ -1542,12 +1556,26 @@ static int event_handler(struct rdma_cm_event *event)
|
||||
case RDMA_CM_EVENT_UNREACHABLE:
|
||||
case RDMA_CM_EVENT_CONNECT_RESPONSE:
|
||||
case RDMA_CM_EVENT_ADDR_ERROR:
|
||||
case RDMA_CM_EVENT_ROUTE_ERROR:
|
||||
case RDMA_CM_EVENT_DEVICE_REMOVAL:
|
||||
ompi_btl_openib_fd_run_in_main(show_help_rdmacm_event_error, event);
|
||||
rc = OMPI_ERROR;
|
||||
break;
|
||||
|
||||
case RDMA_CM_EVENT_ROUTE_ERROR:
|
||||
/* Route lookup does not necessarily handle retries, and there
|
||||
appear to be cases where the subnet manager node can no
|
||||
longer handle incoming requests. The rdma connection
|
||||
manager and lower level code doesn't handle retries, so we
|
||||
have to. */
|
||||
if (context->route_retry_count < rdmacm_resolve_max_retry_count) {
|
||||
context->route_retry_count++;
|
||||
rc = resolve_route(context);
|
||||
break;
|
||||
}
|
||||
ompi_btl_openib_fd_run_in_main(show_help_rdmacm_event_error, event);
|
||||
rc = OMPI_ERROR;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Unknown error */
|
||||
BTL_ERROR(("Unknown RDMA CM error event_handler: %s, status = %d",
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user