1
1

HCA failover support in openib BTL

This commit was SVN r21767.
Этот коммит содержится в:
Rolf vandeVaart 2009-08-05 21:53:02 +00:00
родитель 91e52d062b
Коммит 41f38110ff
29 изменённых файлов: 602 добавления и 20 удалений

Просмотреть файл

@ -39,6 +39,7 @@ int mca_bml_base_error_count;
int mca_bml_base_open(void)
{
int value;
/* See if we've already been here */
if (++mca_bml_base_already_opened > 1) {
return OMPI_SUCCESS;
@ -51,6 +52,15 @@ int mca_bml_base_open(void)
return OMPI_ERROR;
}
mca_base_param_reg_int_name("bml",
"base_verbose",
"Verbosity level of the BML framework",
false, false,
0,
&value);
mca_bml_base_output = opal_output_open(NULL);
opal_output_set_verbosity(mca_bml_base_output, value);
#if OPAL_ENABLE_DEBUG_RELIABILITY
do {
int param, value;

Просмотреть файл

@ -175,6 +175,7 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_index(mca_bml_base_
*
* @param index (OUT)
*/
extern int mca_bml_base_output;
static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t* array)
{
#if OPAL_ENABLE_DEBUG
@ -184,6 +185,8 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_b
}
#endif
if( 1 == array->arr_size ) {
opal_output_verbose(20, mca_bml_base_output,
"%s btl selected", array->bml_btls[0].btl->btl_ifname);
return &array->bml_btls[0]; /* force the return to avoid a jump */
} else {
size_t current_position = array->arr_index; /* force to always start from zero */
@ -192,6 +195,8 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_b
} else {
array->arr_index = current_position + 1; /* continue */
}
opal_output_verbose(20, mca_bml_base_output,
"%s btl selected", array->bml_btls[current_position].btl->btl_ifname);
return &array->bml_btls[current_position];
}
}

Просмотреть файл

@ -525,12 +525,15 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl
mca_btl_base_module_t* ep_btl;
double total_bandwidth = 0;
size_t b;
int rc = 0;
if(NULL == ep)
return OMPI_SUCCESS;
/* remove btl from eager list */
mca_bml_base_btl_array_remove(&ep->btl_eager, btl);
if (mca_bml_base_btl_array_remove(&ep->btl_eager, btl)) {
rc++;
}
/* remove btl from send list */
if(mca_bml_base_btl_array_remove(&ep->btl_send, btl)) {
@ -538,6 +541,7 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl
/* compute total_bandwidth and
reset max_send_size to the min of all btl's */
total_bandwidth = 0;
rc++;
for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
ep_btl = bml_btl->btl;
@ -563,9 +567,10 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl
/* remove btl from RDMA list */
if(mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) {
/* computer total bandwidth */
total_bandwidth = 0;
rc++;
for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
ep_btl = bml_btl->btl;
@ -593,7 +598,7 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl
}
}
return OMPI_SUCCESS;
return rc;
}
int mca_bml_r2_finalize( void )

Просмотреть файл

@ -197,6 +197,7 @@ typedef uint8_t mca_btl_base_tag_t;
/* error callback flags */
#define MCA_BTL_ERROR_FLAGS_FATAL 0x1
#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
/**
* Asynchronous callback function on completion of an operation.
@ -509,7 +510,9 @@ typedef int (*mca_btl_base_module_register_fn_t)(
typedef void (*mca_btl_base_module_error_cb_fn_t)(
struct mca_btl_base_module_t* btl,
int32_t flags
int32_t flags,
struct ompi_proc_t* ompi_proc,
struct mca_btl_base_endpoint_t** newep
);
@ -757,6 +760,7 @@ struct mca_btl_base_module_t {
/* BTL common attributes */
mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */
char btl_ifname[8]; /**< name of interface associated with btl */
size_t btl_eager_limit; /**< maximum size of first fragment -- eager send */
size_t btl_rndv_eager_limit; /**< the size of a data sent in a first fragment of rendezvous protocol */
size_t btl_max_send_size; /**< maximum send fragment size supported by the BTL */

Просмотреть файл

@ -631,6 +631,7 @@ mca_btl_elan_register_error( struct mca_btl_base_module_t* btl,
mca_btl_elan_module_t mca_btl_elan_module = {
{
&mca_btl_elan_component.super,
"unknown",
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */

Просмотреть файл

@ -57,6 +57,7 @@ static int mca_btl_gm_put_nl(
mca_btl_gm_module_t mca_btl_gm_module = {
{
&mca_btl_gm_component.super,
"unknown",
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */

Просмотреть файл

@ -672,6 +672,7 @@ int mca_btl_mx_ft_event(int state) {
mca_btl_mx_module_t mca_btl_mx_module = {
{
&mca_btl_mx_component.super,
"unknown",
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */

Просмотреть файл

@ -42,6 +42,7 @@
mca_btl_ud_module_t mca_btl_ofud_module = {
{
&mca_btl_ofud_component.super,
"unknown",
0, /* eager_limit */
0, /* min_send_size */
0, /* max_send_size */

Просмотреть файл

@ -67,6 +67,7 @@
mca_btl_openib_module_t mca_btl_openib_module = {
{
&mca_btl_openib_component.super,
"unknown",
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */
@ -638,7 +639,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
assert(qp != MCA_BTL_NO_ORDER);
if(mca_btl_openib_component.use_message_coalescing &&
(flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
(flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) && !(flags & MCA_BTL_IB_NO_COALESCE)) {
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
&ep->qps[qp].qp->lock, ep, size);
@ -1189,6 +1190,8 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
if(!ib_rc) {
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
return OMPI_SUCCESS;
} else {
opal_output(0, "Error from ibv_post_send()");
}
/* Failed to send, do clean up all allocated resources */
@ -1219,6 +1222,9 @@ cant_send:
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
/* We can not send the data directly, so we just return descriptor */
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
#if 0
opal_output(0, "Failed to send during sendi, send frag=%d back up", *descriptor);
#endif
return OMPI_ERR_RESOURCE_BUSY;
}
/*

Просмотреть файл

@ -54,6 +54,7 @@ BEGIN_C_DECLS
#define MCA_BTL_IB_LEAVE_PINNED 1
#define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll
#define MCA_BTL_IB_PKEY_MASK 0x7fff
#define MCA_BTL_IB_NO_COALESCE 0x4000
/*--------------------------------------------------------------------*/
@ -254,6 +255,8 @@ struct mca_btl_openib_component_t {
ompi_free_list_t recv_user_free;
/**< frags for coalesced massages */
ompi_free_list_t send_free_coalesced;
/** < whether to enable HCA failover mechanism */
bool enable_hca_failover;
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;

Просмотреть файл

@ -66,6 +66,7 @@ const char *ibv_get_sysfs_path(void);
#include "orte/runtime/orte_globals.h"
#include "orte/mca/notifier/notifier.h"
#include "ompi/mca/pml/ob1/pml_ob1_hdr.h" /* For debugging only */
#include "ompi/proc/proc.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/mpool/base/base.h"
@ -646,6 +647,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
openib_btl->device = device;
strncpy(openib_btl->super.btl_ifname, ibv_get_device_name(device->ib_dev), 7);
openib_btl->port_num = (uint8_t) port_num;
openib_btl->pkey_index = pkey_index;
openib_btl->lid = lid;
@ -2810,6 +2812,250 @@ static void progress_pending_frags_srq(mca_btl_openib_module_t* openib_btl,
}
}
/**
 * Take an existing fragment and move it to another endpoint (the HCA
 * failover path).  We first allocate a new fragment from the new
 * endpoint's btl.  We then copy over various fields from the old
 * fragment to the new one.  Then we copy the actual data that is to be
 * transferred.  This includes the openib header, the PML header, and
 * all the payload data.  If the old fragment carried coalesced
 * sub-fragments, each of those is unpacked into its own send fragment
 * and reposted first.
 *
 * NOTE(review): appears to be called only from the CQ error path when
 * failover is enabled -- confirm against handle_wc().
 *
 * @param ep      (IN) endpoint on the surviving btl to resend through
 * @param oldfrag (IN) fragment that failed on the broken endpoint
 */
static void mca_btl_openib_move_frag(mca_btl_openib_endpoint_t* ep,
mca_btl_openib_com_frag_t* oldfrag)
{
mca_btl_openib_com_frag_t* frag;
mca_btl_base_descriptor_t* olddes;
mca_btl_base_descriptor_t* des;
int coalesced_len, retval;
/* The new endpoint may still be connecting; endpoint_send() below
 * queues the fragment in that case, so this is informational only. */
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
OPAL_OUTPUT((-1, "INFO: Reposting to unconnected endpoint"));
}
olddes = (mca_btl_base_descriptor_t *)oldfrag;
/* Check to see if this was a coalesced fragment. If so, then
 * first walk through each coalesced fragment, turn it into a send
 * fragment, and repost. */
coalesced_len = opal_list_get_size(&to_send_frag(olddes)->coalesced_frags);
if (coalesced_len > 0) {
/* NOTE(review): ctrl_hdr is declared but never used in this branch. */
mca_btl_openib_control_header_t *ctrl_hdr;
mca_btl_openib_header_coalesced_t *clsc_hdr;
opal_list_item_t *i;
mca_btl_base_descriptor_t* coalesced_des;
OPAL_OUTPUT((-1, "INFO: Reposting coalesced fragments"));
while((i = opal_list_remove_first(&to_send_frag(olddes)->coalesced_frags))) {
/* Allocate a replacement send fragment on the new btl; pass
 * MCA_BTL_IB_NO_COALESCE so the copy below stays one-to-one. */
frag = (mca_btl_openib_com_frag_t *)
mca_btl_openib_alloc((mca_btl_base_module_t *)ep->endpoint_btl,
ep, to_base_frag(i)->base.order,
to_base_frag(i)->segment.seg_len,
to_base_frag(i)->base.des_flags | MCA_BTL_IB_NO_COALESCE);
coalesced_des = (mca_btl_base_descriptor_t *)i;
/* First adjust the values in the descriptor portion of the fragment */
des = (mca_btl_base_descriptor_t*)frag;
des->des_cbfunc = coalesced_des->des_cbfunc;
des->des_cbdata = coalesced_des->des_cbdata;
/* Now adjust fragment specific information */
frag->endpoint = ep;
/* Finally copy over the data that is actually being transmitted */
memcpy(to_base_frag(frag)->segment.seg_addr.pval, to_base_frag(i)->segment.seg_addr.pval,
to_base_frag(i)->segment.seg_len);
to_base_frag(frag)->segment.seg_len = to_base_frag(i)->segment.seg_len;
/* Restore the PML fragment type header used for callbacks */
clsc_hdr = (mca_btl_openib_header_coalesced_t *) to_coalesced_frag(i)->hdr;
to_send_frag(frag)->hdr->tag = clsc_hdr->tag;
OPAL_OUTPUT((0, "Tag pulled from old coalesced frag: tag=%d", clsc_hdr->tag));
/* Set to zero just to be safe */
to_send_frag(frag)->hdr->cm_seen = 0;
to_send_frag(frag)->hdr->credits = 0;
/* This function will either post the send or queue it up if the resource
 * is busy. The resource could be busy if it is out of credits or out of
 * wqe's. If we get something other then resource busy or success, then
 * we will error out entirely as an unrecoverable error. */
retval = mca_btl_openib_endpoint_send(ep, (mca_btl_openib_send_frag_t*)frag);
if ((OMPI_SUCCESS != retval) && (OMPI_ERR_RESOURCE_BUSY != retval)) {
ep->endpoint_btl->error_cb(&ep->endpoint_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
}
}
}
/* Now move the actual frag that caused the error */
frag = (mca_btl_openib_com_frag_t *)
mca_btl_openib_alloc((mca_btl_base_module_t *)ep->endpoint_btl,
ep, to_base_frag(oldfrag)->base.order,
to_base_frag(oldfrag)->segment.seg_len,
to_base_frag(oldfrag)->base.des_flags | MCA_BTL_IB_NO_COALESCE);
/* NOTE(review): "%lx" with pointer arguments assumes
 * sizeof(long) == sizeof(void*); %p would be portable. */
OPAL_OUTPUT((-1, "Changing frag=%lx,btl=%s to frag=%lx,btl=%s, copying %d bytes\n",
oldfrag, oldfrag->endpoint->endpoint_btl->super.btl_ifname,
frag, ep->endpoint_btl->super.btl_ifname,
to_base_frag(oldfrag)->segment.seg_len));
/* First adjust the values in the descriptor portion of the fragment.
 * Note that I do not currently set the des_context value. This field
 * is often set to the bml endpoint when the frag is created. Not sure
 * if I will ultimately need that. */
des = (mca_btl_base_descriptor_t*)frag;
des->des_cbfunc = olddes->des_cbfunc;
des->des_cbdata = olddes->des_cbdata;
/* Now adjust fragment specific information */
frag->endpoint = ep;
/* Finally copy over the data that is actually being transmitted */
memcpy(to_base_frag(frag)->segment.seg_addr.pval, to_base_frag(oldfrag)->segment.seg_addr.pval,
to_base_frag(oldfrag)->segment.seg_len);
to_base_frag(frag)->segment.seg_len = to_base_frag(oldfrag)->segment.seg_len;
/* Set the fields in the mca_btl_openib_header_t. The fields consist of:
 *     mca_btl_base_tag_t tag
 *     uint8_t cm_seen;
 *     uint16_t credits;
 * The tag field gets the tag from the old fragment. The other two fields
 * are set to zero. */
if (coalesced_len > 0) {
/* A coalesced fragment has the tag field in a different location */
mca_btl_openib_control_header_t *ctrl_hdr;
mca_btl_openib_header_coalesced_t *clsc_hdr;
/* Peel off the old PML tag from the header information. Need to work past
 * the openib_header and control_header to get to coalesce_header */
ctrl_hdr = (mca_btl_openib_control_header_t*)(to_send_frag(oldfrag)->hdr + 1);
clsc_hdr = (mca_btl_openib_header_coalesced_t*)(ctrl_hdr + 1);
to_send_frag(frag)->hdr->tag = clsc_hdr->tag;
} else {
/* For normal send headers, copy over the tag. */
to_send_frag(frag)->hdr->tag = to_send_frag(oldfrag)->hdr->tag;
}
to_send_frag(frag)->hdr->cm_seen = 0;
to_send_frag(frag)->hdr->credits = 0;
/* This function will either post the send or queue it up if the resource
 * is busy. The resource could be busy if it is out of credits or out of
 * wqe's. If we get something other then resource busy or success, then
 * we will error out entirely as an unrecoverable error. */
retval = mca_btl_openib_endpoint_send(ep, (mca_btl_openib_send_frag_t*)frag);
if ((OMPI_SUCCESS != retval) && (OMPI_ERR_RESOURCE_BUSY != retval)) {
ep->endpoint_btl->error_cb(&ep->endpoint_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
}
/* Some extra debugging tool. Should be removed eventually. This prints
 * out the PML header that is in the newly created fragment.
 * NOTE(review): several OPAL_OUTPUT calls below pass pointers for "%d"
 * or "%lx" conversions -- works only on LP64, should be %p. */
{
mca_pml_ob1_common_hdr_t* hdr;
mca_pml_ob1_match_hdr_t* mhdr;
mca_pml_ob1_frag_hdr_t* fhdr;
uint8_t type;
hdr = (mca_pml_ob1_common_hdr_t*)des->des_src->seg_addr.pval;
type = hdr->hdr_type;
switch (type) {
case MCA_PML_OB1_HDR_TYPE_MATCH:
mhdr = (mca_pml_ob1_match_hdr_t*)hdr;
OPAL_OUTPUT((-1, "MATCH,frag=%d,tag=%d,src=%d,seq=%d",
frag, mhdr->hdr_tag, mhdr->hdr_src, mhdr->hdr_seq));
break;
case MCA_PML_OB1_HDR_TYPE_FRAG:
fhdr = (mca_pml_ob1_frag_hdr_t*)hdr;
OPAL_OUTPUT((-1, "FRAG,frag=%lx,rreq=%lx,len=%d,offset=%d",
frag, fhdr->hdr_dst_req.pval, to_base_frag(frag)->segment.seg_len,
fhdr->hdr_frag_offset));
break;
case MCA_PML_OB1_HDR_TYPE_RNDV:
OPAL_OUTPUT((-1, "RNDV,frag=%lx", frag));
break;
case MCA_PML_OB1_HDR_TYPE_ACK:
OPAL_OUTPUT((-1, "ACK,frag=%lx", frag));
break;
default:
OPAL_OUTPUT((-1, "OTHER,frag=%lx", frag));
}
}
}
/**
 * This function will move all the pending fragments from one endpoint
 * to another.  It walks through each qp at each priority and looks
 * for both no_credits_pending_frags and no_wqe_pending_frags and
 * moves any it finds (plus, for SRQ qps, the per-btl srq pending
 * list, and finally the endpoint's pending_lazy_frags).  This is
 * called when we detect an error on a btl and we are trying to
 * recover by failing over to another endpoint.
 *
 * @param old_ep (IN) broken endpoint whose queued fragments are drained
 * @param new_ep (IN) surviving endpoint that receives the fragments
 */
static void move_all_pending_frags(mca_btl_base_endpoint_t *old_ep,
mca_btl_base_endpoint_t *new_ep)
{
/* NOTE(review): rc is declared but never used in this function. */
int qp, pri, rc, len, total;
opal_list_item_t *item;
mca_btl_openib_com_frag_t* frag;
/* Running count of fragments moved, reported at the end. */
total = 0;
/* Traverse all QPs and all priorities and move to other endpoint */
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
/* pri: 0/1 -- presumably high/low priority lists; verify against
 * the qp structure definition. */
for (pri = 0; pri < 2; ++pri) {
/* All types of qp's have a no_wqe_pending_frags list */
len = opal_list_get_size(&old_ep->qps[qp].no_wqe_pending_frags[pri]);
if (len > 0) {
total += len;
opal_output(0, "Checking for no_wqe_pending_frags qp=%d, pri=%d, list size=%d",
qp, pri, len);
while (NULL != (item = opal_list_remove_first(&old_ep->qps[qp].
no_wqe_pending_frags[pri]))) {
frag = (mca_btl_openib_com_frag_t *) item;
mca_btl_openib_move_frag(new_ep, frag);
}
}
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
/* Per-peer qps additionally queue frags waiting for credits. */
len = opal_list_get_size(&old_ep->qps[qp].no_credits_pending_frags[pri]);
if (len > 0) {
total += len;
opal_output(0, "Checking for no_credits_pending_frags qp=%d, pri=%d, list size=%d",
qp, pri, len);
while (NULL != (item = opal_list_remove_first(&old_ep->qps[qp].
no_credits_pending_frags[pri]))) {
frag = (mca_btl_openib_com_frag_t *) item;
mca_btl_openib_move_frag(new_ep, frag);
}
}
} else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
/* SRQ pending frags live on the btl, not the endpoint.
 * NOTE(review): this drains the btl-wide srq list, which may
 * hold frags destined for OTHER endpoints on the same btl --
 * confirm that is intended. */
len = opal_list_get_size(&old_ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]);
if (len > 0) {
total += len;
opal_output(0, "Checking for srq pending_frags qp=%d, pri=%d, list size=%d",
qp, pri, len);
while (NULL != (item = opal_list_remove_first(&old_ep->endpoint_btl->qps[qp].
u.srq_qp.pending_frags[pri]))) {
frag = (mca_btl_openib_com_frag_t *) item;
mca_btl_openib_move_frag(new_ep, frag);
}
}
}
}
}
/* Check for any frags from a connection that was never made. Not sure if this
 * can actually happen. */
len = opal_list_get_size(&old_ep->pending_lazy_frags);
if (len > 0) {
total += len;
opal_output(0, "Checking for pending_lazy_frags, list size=%d", len);
while (NULL != (item = opal_list_remove_first(&(old_ep->pending_lazy_frags)))) {
frag = (mca_btl_openib_com_frag_t *) item;
mca_btl_openib_move_frag(new_ep, frag);
}
}
OPAL_OUTPUT((-1, "Finished checking for pending_frags, total moved=%d",
total));
}
static char *cq_name[] = {"HP CQ", "LP CQ"};
static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
struct ibv_wc *wc)
@ -2818,9 +3064,11 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
mca_btl_openib_com_frag_t* frag;
mca_btl_base_descriptor_t *des;
mca_btl_openib_endpoint_t* endpoint;
mca_btl_openib_endpoint_t* newep;
mca_btl_openib_module_t *openib_btl = NULL;
ompi_proc_t* remote_proc = NULL;
int qp, btl_ownership;
int holdon = 1;
des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id;
frag = to_com_frag(des);
@ -2834,6 +3082,32 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
if(endpoint)
openib_btl = endpoint->endpoint_btl;
/* These are the three types of fragments we have seen so far */
if ((openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) &&
(openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_SEND) &&
(openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_CONTROL)) {
OPAL_OUTPUT((0, "Fragment is type %d, size=%d", openib_frag_type(des), (int)wc->byte_len));
}
/* Quiet some of the receive frag errors */
if (openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) {
OPAL_OUTPUT((-1, "Fragment is type %d, size=%d", openib_frag_type(des), (int)wc->byte_len));
OPAL_OUTPUT((-1, "\nCQ btl=%s: status=%s(%d),wr_id=%d,opcode=%d",
openib_btl->super.btl_ifname,
btl_openib_component_status_to_string(wc->status),
wc->status, (void *)(uintptr_t)wc->wr_id, wc->opcode));
if (des->des_src) {
mca_pml_ob1_frag_hdr_t* hdr = (mca_pml_ob1_frag_hdr_t*)des->des_src->seg_addr.pval;
if (MCA_PML_OB1_HDR_TYPE_FRAG == hdr->hdr_common.hdr_type) {
OPAL_OUTPUT((-1, "frag=TYPE_FRAG,offset=%d", hdr->hdr_frag_offset));
} else if (MCA_PML_OB1_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type) {
OPAL_OUTPUT((-1, "frag=TYPE_RNDV"));
} else {
OPAL_OUTPUT((-1, "frag=OTHER"));
}
}
}
if(wc->status != IBV_WC_SUCCESS) {
OPAL_OUTPUT((-1, "Got WC: ERROR"));
goto error;
@ -2899,7 +3173,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
/* Process a RECV */
if(btl_openib_handle_incoming(openib_btl, endpoint, to_recv_frag(frag),
wc->byte_len) != OMPI_SUCCESS) {
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
break;
}
@ -2916,7 +3190,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
default:
BTL_ERROR(("Unhandled work completion opcode is %d", wc->opcode));
if(openib_btl)
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
break;
}
@ -2942,6 +3216,7 @@ error:
}
#endif
#if 0
if(IBV_WC_WR_FLUSH_ERR != wc->status || !flush_err_printed[cq]++) {
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
"status number %d for wr_id %" PRIx64 " opcode %d vendor error %d qp_idx %d",
@ -2957,7 +3232,13 @@ error:
wc->status, wc->wr_id,
wc->opcode, wc->vendor_err, qp);
}
if (openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) {
OPAL_OUTPUT((0, "Error on btl=%s: wc->status=%s(%d), wc->wr_id=%d",
openib_btl->super.btl_ifname,
btl_openib_component_status_to_string(wc->status),
wc->status, (void *)(uintptr_t)wc->wr_id));
}
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
IBV_WC_RETRY_EXC_ERR == wc->status) {
char *peer_hostname =
@ -2993,9 +3274,97 @@ error:
device_name, peer_hostname);
}
}
#endif
/* If failover is not enabled, just error out like we always did */
if(!mca_btl_openib_component.enable_hca_failover) {
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
}
if(openib_btl)
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
/* Here is where we figure out what to do with the unsent fragment. To keep
* things clear, I handle each one differently.
* Note: In the wc struct, these are the only valid fields with an error:
* wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
* This means we cannot key off of the wc->opcode to see what operation we did.
/* Drop any errors receiving on a PP connection. There is nothing else to do */
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && (BTL_OPENIB_QP_TYPE_PP(qp))) {
OPAL_OUTPUT((-1, "RECV or CONTROL, dropping since connection is broken (des=%d)", des));
return;
}
/* Drop any CONTROL messages as they are only valid on this connection. */
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) {
OPAL_OUTPUT((-1, "RECV or CONTROL, dropping since connection is broken (des=%d)", des));
return;
}
/* MCA_BTL_OPENIB_FRAG_EAGER_RDMA is a openib specific control message
* used to set up eager RDMA on a connection. Since the connection
* is broken, just drop it. */
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA) {
OPAL_OUTPUT((-1, "OPENIB_FRAG_EAGER_RDMA, dropping since connection is broken (des=%d)", des));
}
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_OUTPUT((0, "SRQ RECV type=%d, size=%d", openib_frag_type(des), (int)wc->byte_len));
return;
#if 0
while (holdon) {
holdon++;
opal_output(0, "SRQ RECV DETECTED - ATTACH DEBUGGER");
sleep(5);
}
#endif
}
#if 0
/* If we get an error on a receive then just map out the interface
* for any future sends. There is nothin to retransmit.
* NOTE: Not sure what to do with this yet */
if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && (BTL_OPENIB_QP_TYPE_PP(qp))) ||
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL)) {
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, &newep);
return;
}
#endif
#if 0
/* For shared receive queues, we need to return the fragments and
* repost the receives since they are a shared resource. For
* peer-to-peer queues, we do nothing.
* NOTE: Not sure what to do here yet. I cannot get the btl or the endpoint
* from the fragment that is returned. Usually, the endpoint is retrieved via
* the immediate data, but obviously the immediate data is non-existant on an
* error. All I really need is the btl but I am not sure where I get that
* from. I have observed that I am not getting many errors on the receive
* so I will not worry now about reposting them. */
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_OUTPUT((0, "SRQ RECV type=%d, size=%d", openib_frag_type(des), (int)wc->byte_len));
MCA_BTL_IB_FRAG_RETURN(frag);
mca_btl_openib_module_t *btl = endpoint->endpoint_btl;
OPAL_THREAD_ADD32(&btl->qps[qp].u.srq_qp.rd_posted, -1);
mca_btl_openib_post_srr(btl, qp);
return;
}
#endif
/* Need to keep calling this to get the alternative endpoint back.
* However, subsequent calls will not actually map anything out.
* Note that we do not call this on a SRQ receive error or any
* type of receive error. */
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, &newep);
/* Move all the pending frags to the new endpoint as they can no
* longer go out the broken endpoint. OPTIMIZATION: Like the PML
* callback, this really only needs to be called once. However, it
* does not hurt anything to keep calling it. Subsequent calls will
* just have nothing to move over. */
move_all_pending_frags(endpoint, newep);
/* Now move the fragment that triggered the error over to the
* other endpoint */
mca_btl_openib_move_frag(newep, frag);
}
static int poll_device(mca_btl_openib_device_t* device, int count)
@ -3029,6 +3398,7 @@ static int poll_device(mca_btl_openib_device_t* device, int count)
device->hp_cq_polls--;
}
OPAL_OUTPUT((-1, "ibv_poll_cq found CQ event on %s", device->ib_dev->name));
handle_wc(device, cq, &wc);
}
@ -3125,7 +3495,7 @@ static int progress_one_device(mca_btl_openib_device_t *device)
ret = btl_openib_handle_incoming(btl, to_com_frag(frag)->endpoint,
frag, size - sizeof(mca_btl_openib_footer_t));
if (ret != MPI_SUCCESS) {
btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
return 0;
}
@ -3144,6 +3514,26 @@ static int progress_one_device(mca_btl_openib_device_t *device)
return count;
}
/*
 * Debugging aid: for every known openib device, print the length of
 * each endpoint's pending_lazy_frags list.  Intended to be called from
 * a debugger, hence non-static.
 *
 * NOTE(review): only the first MAX_DUMP_ENDPOINTS endpoint slots of
 * each device are probed -- TODO iterate the real endpoint count if
 * the device exposes one.
 */
void btl_dump_pending_lists(void)  /* (void): proper C prototype, was old-style () */
{
    /* Arbitrary cap on how many endpoint slots to probe per device. */
    enum { MAX_DUMP_ENDPOINTS = 10 };
    int i, j;
    mca_btl_openib_endpoint_t* endpoint;

    for (i = 0; i < mca_btl_openib_component.devices_count; i++) {
        mca_btl_openib_device_t *device =
            opal_pointer_array_get_item(&mca_btl_openib_component.devices, i);
        for (j = 0; j < MAX_DUMP_ENDPOINTS; j++) {
            endpoint = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(device->endpoints, j);
            if (endpoint != NULL) {
                /* Cast to int: opal_list_length is presumably size_t, and
                 * the original passed it straight to %d -- TODO confirm. */
                opal_output(0, "pending_lazy_frags size = %d",
                            (int)endpoint->pending_lazy_frags.opal_list_length);
            }
        }
    }
}
/*
* IB component progress.
*/
@ -3176,7 +3566,7 @@ error:
mca_btl_openib_module_t* openib_btl =
mca_btl_openib_component.openib_btls[i];
if(openib_btl->device->got_fatal_event) {
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
}
}
return count;

Просмотреть файл

@ -1018,7 +1018,7 @@ void *mca_btl_openib_endpoint_invoke_error(void *context)
}
/* Invoke the callback to the upper layer */
btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL);
btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
/* Will likely never get here */
return NULL;

Просмотреть файл

@ -24,6 +24,7 @@
#ifndef MCA_BTL_IB_ENDPOINT_H
#define MCA_BTL_IB_ENDPOINT_H
#include <unistd.h>
#include "opal/class/opal_list.h"
#include "opal/event/event.h"
#include "opal/util/output.h"
@ -35,6 +36,7 @@
#include <string.h>
#include "ompi/mca/btl/base/btl_base_error.h"
#include "connect/base.h"
#include "ompi/mca/pml/ob1/pml_ob1_hdr.h" /* For debugging only */
BEGIN_C_DECLS
@ -420,6 +422,7 @@ static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep,
mca_btl_base_descriptor_t *des, opal_list_t *pending_list)
{
int rc = OMPI_ERR_RESOURCE_BUSY;
int holdon = 1;
switch(ep->endpoint_state) {
case MCA_BTL_IB_CLOSED:
@ -437,6 +440,13 @@ static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep,
/* fall through */
default:
opal_list_append(pending_list, (opal_list_item_t *)des);
#if 0
while (holdon) {
holdon++;
opal_output(0, "STARTING CONNECTION on %d - ATTACH DEBUGGER", getpid());
sleep(5);
}
#endif
break;
case MCA_BTL_IB_FAILED:
rc = OMPI_ERR_UNREACH;
@ -476,6 +486,40 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc;
struct ibv_send_wr *bad_wr;
int qp = to_base_frag(frag)->base.order;
static int printstuff = 0;
if (printstuff == 1) {
/* Some extra debugging tool. Should be removed eventually. This prints
* out the PML header that is in the newly created fragment. */
mca_pml_ob1_common_hdr_t* hdr;
mca_pml_ob1_match_hdr_t* mhdr;
mca_pml_ob1_frag_hdr_t* fhdr;
uint8_t type;
hdr = (mca_pml_ob1_common_hdr_t*)seg->seg_addr.pval;
type = hdr->hdr_type;
switch (type) {
case MCA_PML_OB1_HDR_TYPE_MATCH:
mhdr = (mca_pml_ob1_match_hdr_t*)hdr;
OPAL_OUTPUT((-1, "MATCH,frag=%d,tag=%d,src=%d,seq=%d",
frag, mhdr->hdr_tag, mhdr->hdr_src, mhdr->hdr_seq));
break;
case MCA_PML_OB1_HDR_TYPE_FRAG:
fhdr = (mca_pml_ob1_frag_hdr_t*)hdr;
OPAL_OUTPUT((-1, "FRAG,frag=%lx,rreq=%lx,len=%d,offset=%d",
frag, fhdr->hdr_dst_req.pval, to_base_frag(frag)->segment.seg_len,
fhdr->hdr_frag_offset));
break;
case MCA_PML_OB1_HDR_TYPE_RNDV:
OPAL_OUTPUT((-1, "RNDV,frag=%lx", frag));
break;
case MCA_PML_OB1_HDR_TYPE_ACK:
OPAL_OUTPUT((-1, "ACK,frag=%lx", frag));
break;
default:
OPAL_OUTPUT((-1, "OTHER,frag=%lx", frag));
}
}
sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) +
(rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length;
@ -527,7 +571,18 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
#endif
assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr);
return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
{
int retval;
retval = ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
if (0 == retval) {
OPAL_OUTPUT((-1, "SUCCESS: Posted %d frag on %s\n", frag,
ep->endpoint_btl->super.btl_ifname));
} else {
OPAL_OUTPUT((0, "FAILURE: Did not posted %d frag on %s\n", frag,
ep->endpoint_btl->super.btl_ifname));
}
return retval;
}
}
END_C_DECLS

Просмотреть файл

@ -166,6 +166,7 @@ do { \
} while (0)
enum mca_btl_openib_frag_type_t {
MCA_BTL_OPENIB_FRAG_UNUSED, /* For debugging: Makes FRAG_RECV=1 */
MCA_BTL_OPENIB_FRAG_RECV,
MCA_BTL_OPENIB_FRAG_RECV_USER,
MCA_BTL_OPENIB_FRAG_SEND,

Просмотреть файл

@ -555,6 +555,11 @@ int btl_openib_register_mca_params(void)
NULL, &mca_btl_openib_component.ipaddr_exclude,
0));
CHECK(reg_int("enable_hca_failover", NULL,
"Enable failover from one HCA to another", 1, &ival, 0));
mca_btl_openib_component.enable_hca_failover = (0 != ival);
/* Register any MCA params for the connect pseudo-components */
if (OMPI_SUCCESS == ret) {
ret = ompi_btl_openib_connect_base_register();

Просмотреть файл

@ -36,6 +36,7 @@
mca_btl_pcie_module_t mca_btl_pcie_module = {
{
&mca_btl_pcie_component.super,
"unknown",
0, /* max size of first fragment */
0, /* Threshold below which BTL should not fragment */
0, /* max send fragment size */

Просмотреть файл

@ -38,6 +38,7 @@
mca_btl_portals_module_t mca_btl_portals_module = {
{
&mca_btl_portals_component.super,
"unknown",
/* NOTE: All these default values are set in
component_open() */

Просмотреть файл

@ -34,6 +34,7 @@
mca_btl_sctp_module_t mca_btl_sctp_module = {
{
&mca_btl_sctp_component.super,
"unknown",
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */

Просмотреть файл

@ -36,6 +36,7 @@
mca_btl_base_module_t mca_btl_self = {
&mca_btl_self_component.super,
"unknown",
0, /* btl_eager_limit */
0, /* btl_rndv_eager_limit */
0, /* btl_max_send_size */

Просмотреть файл

@ -56,6 +56,7 @@
mca_btl_sm_t mca_btl_sm = {
{
&mca_btl_sm_component.super,
"sm",
0, /* btl_eager_limit */
0, /* btl_rndv_eager_limit */
0, /* btl_max_send_size */

Просмотреть файл

@ -36,6 +36,7 @@
mca_btl_tcp_module_t mca_btl_tcp_module = {
{
&mca_btl_tcp_component.super,
"unknown",
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */

Просмотреть файл

@ -33,6 +33,7 @@
mca_btl_template_module_t mca_btl_template_module = {
{
&mca_btl_template_component.super,
"unknown",
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */

Просмотреть файл

@ -50,6 +50,7 @@ static int mca_btl_udapl_assign_netmask(mca_btl_udapl_module_t* udapl_btl);
mca_btl_udapl_module_t mca_btl_udapl_module = {
{
&mca_btl_udapl_component.super,
"unknown",
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */

Просмотреть файл

@ -41,6 +41,7 @@
#include "ompi/mca/bml/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/notifier/notifier.h"
#include "ompi/runtime/ompi_cr.h"
@ -70,7 +71,8 @@ mca_pml_ob1_t mca_pml_ob1 = {
void mca_pml_ob1_error_handler( struct mca_btl_base_module_t* btl,
int32_t flags );
int32_t flags, ompi_proc_t* ompi_proc,
struct mca_btl_base_endpoint_t** btl_endpoint);
int mca_pml_ob1_enable(bool enable)
{
@ -565,10 +567,70 @@ void mca_pml_ob1_process_pending_rdma(void)
}
void mca_pml_ob1_error_handler(
struct mca_btl_base_module_t* btl,
int32_t flags) {
orte_errmgr.abort(-1, NULL);
/**
 * BTL error callback for the ob1 PML (registered with each BTL module).
 *
 * On a fatal error (MCA_BTL_ERROR_FLAGS_FATAL) the job is aborted.
 * Otherwise the failing BTL is mapped out of the offending peer's
 * endpoint so traffic fails over to any remaining interface; if no
 * interfaces remain for that peer, the job is aborted.
 *
 * @param btl           BTL module that reported the error.
 * @param flags         Error flags; MCA_BTL_ERROR_FLAGS_FATAL forces abort.
 * @param errproc       Peer process associated with the error.
 * @param btl_endpoint  [OUT] if non-NULL, receives the endpoint of the
 *                      first remaining send BTL for errproc.
 */
void mca_pml_ob1_error_handler( struct mca_btl_base_module_t* btl,
int32_t flags, ompi_proc_t *errproc,
struct mca_btl_base_endpoint_t** btl_endpoint)
{
ompi_proc_t** procs;
size_t p, num_procs; /* NOTE(review): 'p' is only used inside the #if 0 debug block below */
mca_bml_base_endpoint_t* ep;
/* Fatal errors cannot be failed over -- abort immediately. */
if (flags & MCA_BTL_ERROR_FLAGS_FATAL) {
orte_errmgr.abort(-1, NULL);
}
/**
* Just remove the offending bml_btl corresponding to the btl with the
* error. Let the other errors remove the other ones.
*/
procs = ompi_proc_all(&num_procs);
/* NOTE(review): 'procs' is only NULL-checked here; the array returned by
* ompi_proc_all() appears to be neither iterated (outside #if 0) nor
* released -- confirm whether it must be freed/OBJ_RELEASEd. */
if(NULL != procs) {
/* bml_del_proc_btl returns the count of bml_btls removed; log only if
* something was actually mapped out for this peer. */
if (0 < mca_bml.bml_del_proc_btl(errproc, btl)) {
opal_output(0, "PML error handler: rank=%d mapping out btl:name=%s,if=%s to rank=%d on node=%s",
ORTE_PROC_MY_NAME->vpid,
btl->btl_component->btl_version.mca_component_name,
btl->btl_ifname,
errproc->proc_name.vpid,
errproc->proc_hostname);
}
/* Disabled debug dump of every peer endpoint's btl array sizes. */
#if 0
for( p = 0; p < num_procs; p++ ) {
ompi_proc_t* proc = procs[p];
ep = (mca_bml_base_endpoint_t*)proc->proc_bml;
opal_output(0, "p=%d, eager=%d, send=%d, rdma=%d, proc=%s",
p,
ep->btl_eager.arr_size,
ep->btl_send.arr_size,
ep->btl_rdma.arr_size,
proc->proc_hostname);
}
#endif
/* If the peer now has no eager, send, or rdma BTLs left, there is no
* path to it at all -- give up and abort. */
ep = (mca_bml_base_endpoint_t*)errproc->proc_bml;
if ((ep->btl_eager.arr_size == 0) &&
(ep->btl_send.arr_size == 0) &&
(ep->btl_rdma.arr_size == 0)) {
opal_output(0, "NO MORE INTERFACES - BYE BYE");
orte_errmgr.abort(-1, NULL);
}
}
/**
* Now return the first one in the list. Odds are there were only
* two to start with and now we are down to one.
*/
if (NULL != btl_endpoint) {
*btl_endpoint = errproc->proc_bml->btl_send.bml_btls[0].btl_endpoint;
}
/* Report the map-out through the ORTE notifier framework as an
* infrastructure-level communication failure. */
orte_notifier.log(ORTE_NOTIFIER_INFRA, ORTE_ERR_COMM_FAILURE,
"Mapping out btl component %s with interface %s",
btl->btl_component->btl_version.mca_component_name,
btl->btl_ifname);
}
#if OPAL_ENABLE_FT == 0

Просмотреть файл

@ -81,6 +81,7 @@ struct mca_pml_ob1_t {
typedef struct mca_pml_ob1_t mca_pml_ob1_t;
extern mca_pml_ob1_t mca_pml_ob1;
extern int mca_pml_ob1_output;
/*
* PML interface functions.

Просмотреть файл

@ -47,6 +47,7 @@ static mca_pml_base_module_t*
mca_pml_ob1_component_init( int* priority, bool enable_progress_threads,
bool enable_mpi_threads );
static int mca_pml_ob1_component_fini(void);
int mca_pml_ob1_output = 0;
mca_pml_base_component_2_0_0_t mca_pml_ob1_component = {
@ -93,6 +94,11 @@ static inline int mca_pml_ob1_param_register_int(
static int mca_pml_ob1_component_open(void)
{
mca_allocator_base_component_t* allocator_component;
int value;
value = mca_pml_ob1_param_register_int("verbose", 0);
mca_pml_ob1_output = opal_output_open(NULL);
opal_output_set_verbosity(mca_pml_ob1_output, value);
mca_pml_ob1.free_list_num =
mca_pml_ob1_param_register_int("free_list_num", 4);

Просмотреть файл

@ -317,6 +317,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
int triperr = 1;
mca_btl_base_segment_t* segments = des->des_dst;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_recv_request_t* recvreq;
@ -326,6 +327,15 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
if (recvreq->req_recv.req_base.req_ompi.req_state == OMPI_REQUEST_INVALID) {
while (triperr) {
triperr++;
opal_output(0, "ERROR DETECTED - ATTACH DEBUGGER");
sleep(5);
}
return;
}
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
return;
@ -592,6 +602,9 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
/* get sequence number of next message that can be processed */
next_msg_seq_expected = (uint16_t)proc->expected_sequence;
opal_output_verbose(20, mca_pml_ob1_output,
"frag_msg_seq=%d, next_msg_seq_expected=%d",
frag_msg_seq, next_msg_seq_expected);
if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected))
goto wrong_seq;
@ -671,5 +684,5 @@ wrong_seq:
num_segments, NULL);
OPAL_THREAD_UNLOCK(&comm->matching_lock);
return OMPI_SUCCESS;
}
}

Просмотреть файл

@ -422,6 +422,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
0, bytes_received );
bytes_received -= sizeof(mca_pml_ob1_frag_hdr_t);
data_offset = hdr->hdr_frag.hdr_frag_offset;
OPAL_OUTPUT((-1, " Received SEND_FRAG, offset=%d", data_offset));
/*
* Make user buffer accessable(defined) before unpacking.
*/

Просмотреть файл

@ -36,6 +36,7 @@
OBJ_CLASS_INSTANCE(mca_pml_ob1_send_range_t, ompi_free_list_item_t,
NULL, NULL);
void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
{
int i, s = opal_list_get_size(&mca_pml_ob1.send_pending);
@ -544,9 +545,11 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
switch(rc) {
case OMPI_ERR_RESOURCE_BUSY:
/* No more resources. Allow the upper level to queue the send */
opal_output(0, "OMPI_ERR_RESOURCE_BUSY returned from mca_pml_ob1_send_request_start_copy");
rc = OMPI_ERR_OUT_OF_RESOURCE;
break;
default:
opal_output(0, "ERROR ERROR ERROR ERROR ERROR in start_copy");
mca_bml_base_free(bml_btl, des);
break;
}