Fix up some of the failover code in the openib BTL.
Need to use the MCA_BTL_IB_FAILED state to signal failure, not MCA_BTL_IB_CLOSED. This commit was SVN r23883.
Этот коммит содержится в:
родитель
5fb2a2f2c9
Коммит
e9a7fea42d
@ -114,9 +114,9 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
* we change the state in the BTL layer. The change in the PML
|
* we change the state in the BTL layer. The change in the PML
|
||||||
* layer should prevent that we ever try to send on this BTL
|
* layer should prevent that we ever try to send on this BTL
|
||||||
* again. If we do, then this is an error case. */
|
* again. If we do, then this is an error case. */
|
||||||
if (MCA_BTL_IB_CLOSED != endpoint->endpoint_state) {
|
if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
|
||||||
|
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
||||||
endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
|
|
||||||
}
|
}
|
||||||
opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
|
||||||
"MCA_BTL_OPENIG_FRAG=%d, "
|
"MCA_BTL_OPENIG_FRAG=%d, "
|
||||||
@ -153,8 +153,8 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
* it know that this side is now broken. This is needed in the case
|
* it know that this side is now broken. This is needed in the case
|
||||||
* of a spurious error which may not cause the remote side to detect
|
* of a spurious error which may not cause the remote side to detect
|
||||||
* the error. */
|
* the error. */
|
||||||
if (MCA_BTL_IB_CLOSED != endpoint->endpoint_state) {
|
if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
|
||||||
endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
|
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -224,24 +224,27 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This functions allows a error to map out the entire BTL. First we
|
* This function allows an error to map out the entire BTL. First a
|
||||||
* call up into the PML. Then we send messages to all the endpoints
|
* call is made up to the PML to map out all connections from this BTL.
|
||||||
* connected to this BTL.
|
* Then a message is sent to all the endpoints connected to this BTL.
|
||||||
|
* This function is enabled by the btl_openib_port_error_failover
|
||||||
|
* MCA parameter. If that parameter is not set, then this function
|
||||||
|
* does not do anything.
|
||||||
* @param openib_btl Pointer to BTL that had the error
|
* @param openib_btl Pointer to BTL that had the error
|
||||||
*/
|
*/
|
||||||
void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
||||||
mca_btl_base_endpoint_t* endpoint;
|
mca_btl_base_endpoint_t* endpoint;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
/* Check to see that the flag is set for the entire map out. */
|
||||||
|
if(mca_btl_openib_component.port_error_failover) {
|
||||||
/* Since we are not specifying a specific connection to bring down,
|
/* Since we are not specifying a specific connection to bring down,
|
||||||
* the PML layer will may out the entire BTL for future communication. */
|
* the PML layer will may out the entire BTL for future communication. */
|
||||||
if(mca_btl_openib_component.port_error_failover) {
|
|
||||||
char btlname[IBV_SYSFS_NAME_MAX];
|
char btlname[IBV_SYSFS_NAME_MAX];
|
||||||
snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s",
|
snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s",
|
||||||
openib_btl->lid, openib_btl->device->ib_dev->name);
|
openib_btl->lid, openib_btl->device->ib_dev->name);
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
||||||
NULL, btlname);
|
NULL, btlname);
|
||||||
}
|
|
||||||
|
|
||||||
/* Now send out messages to all endpoints that we are disconnecting.
|
/* Now send out messages to all endpoints that we are disconnecting.
|
||||||
* Only do ths to endpoints that are connected. Otherwise, the
|
* Only do ths to endpoints that are connected. Otherwise, the
|
||||||
@ -254,7 +257,8 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
|||||||
}
|
}
|
||||||
if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
|
if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
|
||||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
||||||
endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
|
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -323,12 +327,13 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
|||||||
/* At this point, we have found the endpoint. Now decode the
|
/* At this point, we have found the endpoint. Now decode the
|
||||||
* message type and do the appropriate action. */
|
* message type and do the appropriate action. */
|
||||||
if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) {
|
if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) {
|
||||||
/* Now that we found a match, let us check to see
|
/* Now that we found a match, check the state of the
|
||||||
* notify the upper layer that it should no longer
|
* endpoint to see if it is already in a failed state.
|
||||||
* be used. Note that we do not check the endpont
|
* If not, then notify the upper layer and error out
|
||||||
* state since we may want to map out an endpoint
|
* any pending fragments. */
|
||||||
* that is not even connected yet and is still in
|
if (MCA_BTL_IB_FAILED == newep->endpoint_state) {
|
||||||
* the MCA_BTL_IB_CLOSED state. */
|
return;
|
||||||
|
} else {
|
||||||
char btlname[IBV_SYSFS_NAME_MAX];
|
char btlname[IBV_SYSFS_NAME_MAX];
|
||||||
ompi_proc_t* remote_proc = NULL;
|
ompi_proc_t* remote_proc = NULL;
|
||||||
|
|
||||||
@ -348,8 +353,9 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
|||||||
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
||||||
remote_proc, btlname);
|
remote_proc, btlname);
|
||||||
error_out_all_pending_frags(newep, &newbtl->super);
|
error_out_all_pending_frags(newep, &newbtl->super);
|
||||||
newep->endpoint_state = MCA_BTL_IB_CLOSED;
|
newep->endpoint_state = MCA_BTL_IB_FAILED;
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
} else { /* MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message */
|
} else { /* MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message */
|
||||||
/* If we are still pointing at the location where
|
/* If we are still pointing at the location where
|
||||||
* we detected an error on the remote side, then
|
* we detected an error on the remote side, then
|
||||||
|
@ -887,6 +887,12 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
|||||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case MCA_BTL_IB_FAILED:
|
||||||
|
/* This connection has been put in the failed state
|
||||||
|
* so just ignore the connection message. */
|
||||||
|
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||||
|
break;
|
||||||
|
|
||||||
default :
|
default :
|
||||||
BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
|
BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
|
||||||
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user