HCA failover support in openib BTL
This commit was SVN r21767.
Этот коммит содержится в:
родитель
91e52d062b
Коммит
41f38110ff
@ -39,6 +39,7 @@ int mca_bml_base_error_count;
|
||||
|
||||
int mca_bml_base_open(void)
|
||||
{
|
||||
int value;
|
||||
/* See if we've already been here */
|
||||
if (++mca_bml_base_already_opened > 1) {
|
||||
return OMPI_SUCCESS;
|
||||
@ -51,6 +52,15 @@ int mca_bml_base_open(void)
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int_name("bml",
|
||||
"base_verbose",
|
||||
"Verbosity level of the BML framework",
|
||||
false, false,
|
||||
0,
|
||||
&value);
|
||||
mca_bml_base_output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_bml_base_output, value);
|
||||
|
||||
#if OPAL_ENABLE_DEBUG_RELIABILITY
|
||||
do {
|
||||
int param, value;
|
||||
|
@ -175,6 +175,7 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_index(mca_bml_base_
|
||||
*
|
||||
* @param index (OUT)
|
||||
*/
|
||||
extern int mca_bml_base_output;
|
||||
static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t* array)
|
||||
{
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
@ -184,6 +185,8 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_b
|
||||
}
|
||||
#endif
|
||||
if( 1 == array->arr_size ) {
|
||||
opal_output_verbose(20, mca_bml_base_output,
|
||||
"%s btl selected", array->bml_btls[0].btl->btl_ifname);
|
||||
return &array->bml_btls[0]; /* force the return to avoid a jump */
|
||||
} else {
|
||||
size_t current_position = array->arr_index; /* force to always start from zero */
|
||||
@ -192,6 +195,8 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_b
|
||||
} else {
|
||||
array->arr_index = current_position + 1; /* continue */
|
||||
}
|
||||
opal_output_verbose(20, mca_bml_base_output,
|
||||
"%s btl selected", array->bml_btls[current_position].btl->btl_ifname);
|
||||
return &array->bml_btls[current_position];
|
||||
}
|
||||
}
|
||||
|
@ -525,12 +525,15 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl
|
||||
mca_btl_base_module_t* ep_btl;
|
||||
double total_bandwidth = 0;
|
||||
size_t b;
|
||||
int rc = 0;
|
||||
|
||||
if(NULL == ep)
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
/* remove btl from eager list */
|
||||
mca_bml_base_btl_array_remove(&ep->btl_eager, btl);
|
||||
if (mca_bml_base_btl_array_remove(&ep->btl_eager, btl)) {
|
||||
rc++;
|
||||
}
|
||||
|
||||
/* remove btl from send list */
|
||||
if(mca_bml_base_btl_array_remove(&ep->btl_send, btl)) {
|
||||
@ -538,6 +541,7 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl
|
||||
/* compute total_bandwidth and
|
||||
reset max_send_size to the min of all btl's */
|
||||
total_bandwidth = 0;
|
||||
rc++;
|
||||
for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
|
||||
bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
|
||||
ep_btl = bml_btl->btl;
|
||||
@ -563,9 +567,10 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl
|
||||
|
||||
/* remove btl from RDMA list */
|
||||
if(mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) {
|
||||
|
||||
|
||||
/* compute total bandwidth */
|
||||
total_bandwidth = 0;
|
||||
rc++;
|
||||
for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
|
||||
bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
|
||||
ep_btl = bml_btl->btl;
|
||||
@ -593,7 +598,7 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mca_bml_r2_finalize( void )
|
||||
|
@ -197,6 +197,7 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
|
||||
/* error callback flags */
|
||||
#define MCA_BTL_ERROR_FLAGS_FATAL 0x1
|
||||
#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
|
||||
|
||||
/**
|
||||
* Asynchronous callback function on completion of an operation.
|
||||
@ -509,7 +510,9 @@ typedef int (*mca_btl_base_module_register_fn_t)(
|
||||
|
||||
typedef void (*mca_btl_base_module_error_cb_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
int32_t flags
|
||||
int32_t flags,
|
||||
struct ompi_proc_t* ompi_proc,
|
||||
struct mca_btl_base_endpoint_t** newep
|
||||
);
|
||||
|
||||
|
||||
@ -757,6 +760,7 @@ struct mca_btl_base_module_t {
|
||||
|
||||
/* BTL common attributes */
|
||||
mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */
|
||||
char btl_ifname[8]; /**< name of interface associated with btl */
|
||||
size_t btl_eager_limit; /**< maximum size of first fragment -- eager send */
|
||||
size_t btl_rndv_eager_limit; /**< the size of a data sent in a first fragment of rendezvous protocol */
|
||||
size_t btl_max_send_size; /**< maximum send fragment size supported by the BTL */
|
||||
|
@ -631,6 +631,7 @@ mca_btl_elan_register_error( struct mca_btl_base_module_t* btl,
|
||||
mca_btl_elan_module_t mca_btl_elan_module = {
|
||||
{
|
||||
&mca_btl_elan_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
|
@ -57,6 +57,7 @@ static int mca_btl_gm_put_nl(
|
||||
mca_btl_gm_module_t mca_btl_gm_module = {
|
||||
{
|
||||
&mca_btl_gm_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
|
@ -672,6 +672,7 @@ int mca_btl_mx_ft_event(int state) {
|
||||
mca_btl_mx_module_t mca_btl_mx_module = {
|
||||
{
|
||||
&mca_btl_mx_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
|
@ -42,6 +42,7 @@
|
||||
mca_btl_ud_module_t mca_btl_ofud_module = {
|
||||
{
|
||||
&mca_btl_ofud_component.super,
|
||||
"unknown",
|
||||
0, /* eager_limit */
|
||||
0, /* min_send_size */
|
||||
0, /* max_send_size */
|
||||
|
@ -67,6 +67,7 @@
|
||||
mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
{
|
||||
&mca_btl_openib_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
@ -638,7 +639,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
|
||||
assert(qp != MCA_BTL_NO_ORDER);
|
||||
|
||||
if(mca_btl_openib_component.use_message_coalescing &&
|
||||
(flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
|
||||
(flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) && !(flags & MCA_BTL_IB_NO_COALESCE)) {
|
||||
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
|
||||
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
|
||||
&ep->qps[qp].qp->lock, ep, size);
|
||||
@ -1189,6 +1190,8 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
if(!ib_rc) {
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return OMPI_SUCCESS;
|
||||
} else {
|
||||
opal_output(0, "Error from ibv_post_send()");
|
||||
}
|
||||
|
||||
/* Failed to send, do clean up all allocated resources */
|
||||
@ -1219,6 +1222,9 @@ cant_send:
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
/* We can not send the data directly, so we just return descriptor */
|
||||
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
|
||||
#if 0
|
||||
opal_output(0, "Failed to send during sendi, send frag=%d back up", *descriptor);
|
||||
#endif
|
||||
return OMPI_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
/*
|
||||
|
@ -54,6 +54,7 @@ BEGIN_C_DECLS
|
||||
#define MCA_BTL_IB_LEAVE_PINNED 1
|
||||
#define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll
|
||||
#define MCA_BTL_IB_PKEY_MASK 0x7fff
|
||||
#define MCA_BTL_IB_NO_COALESCE 0x4000
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------*/
|
||||
@ -254,6 +255,8 @@ struct mca_btl_openib_component_t {
|
||||
ompi_free_list_t recv_user_free;
|
||||
/**< frags for coalesced messages */
|
||||
ompi_free_list_t send_free_coalesced;
|
||||
/** < whether to enable HCA failover mechanism */
|
||||
bool enable_hca_failover;
|
||||
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
|
||||
|
||||
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
|
||||
|
@ -66,6 +66,7 @@ const char *ibv_get_sysfs_path(void);
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
|
||||
#include "ompi/mca/pml/ob1/pml_ob1_hdr.h" /* For debugging only */
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
@ -646,6 +647,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
||||
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
||||
openib_btl->device = device;
|
||||
strncpy(openib_btl->super.btl_ifname, ibv_get_device_name(device->ib_dev), 7);
|
||||
openib_btl->port_num = (uint8_t) port_num;
|
||||
openib_btl->pkey_index = pkey_index;
|
||||
openib_btl->lid = lid;
|
||||
@ -2810,6 +2812,250 @@ static void progress_pending_frags_srq(mca_btl_openib_module_t* openib_btl,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Take an existing frag and move it to another endpoint. We first
|
||||
* allocate a new fragment from the new btl. We then copy over various
|
||||
* fields from the old fragment to the new one. Then we copy the
|
||||
* actually data that is to be transferred. This includes the openib
|
||||
* header, the PML header, and all the data.
|
||||
*/
|
||||
static void mca_btl_openib_move_frag(mca_btl_openib_endpoint_t* ep,
|
||||
mca_btl_openib_com_frag_t* oldfrag)
|
||||
{
|
||||
mca_btl_openib_com_frag_t* frag;
|
||||
mca_btl_base_descriptor_t* olddes;
|
||||
mca_btl_base_descriptor_t* des;
|
||||
int coalesced_len, retval;
|
||||
|
||||
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
OPAL_OUTPUT((-1, "INFO: Reposting to unconnected endpoint"));
|
||||
}
|
||||
|
||||
olddes = (mca_btl_base_descriptor_t *)oldfrag;
|
||||
|
||||
/* Check to see if this was a coalesced fragment. If so, then
|
||||
* first walk through each coalesced fragment, turn it into a send
|
||||
* fragment, and repost. */
|
||||
coalesced_len = opal_list_get_size(&to_send_frag(olddes)->coalesced_frags);
|
||||
if (coalesced_len > 0) {
|
||||
mca_btl_openib_control_header_t *ctrl_hdr;
|
||||
mca_btl_openib_header_coalesced_t *clsc_hdr;
|
||||
opal_list_item_t *i;
|
||||
mca_btl_base_descriptor_t* coalesced_des;
|
||||
OPAL_OUTPUT((-1, "INFO: Reposting coalesced fragments"));
|
||||
while((i = opal_list_remove_first(&to_send_frag(olddes)->coalesced_frags))) {
|
||||
|
||||
frag = (mca_btl_openib_com_frag_t *)
|
||||
mca_btl_openib_alloc((mca_btl_base_module_t *)ep->endpoint_btl,
|
||||
ep, to_base_frag(i)->base.order,
|
||||
to_base_frag(i)->segment.seg_len,
|
||||
to_base_frag(i)->base.des_flags | MCA_BTL_IB_NO_COALESCE);
|
||||
|
||||
coalesced_des = (mca_btl_base_descriptor_t *)i;
|
||||
|
||||
/* First adjust the values in the descriptor portion of the fragment */
|
||||
des = (mca_btl_base_descriptor_t*)frag;
|
||||
des->des_cbfunc = coalesced_des->des_cbfunc;
|
||||
des->des_cbdata = coalesced_des->des_cbdata;
|
||||
|
||||
/* Now adjust fragment specific information */
|
||||
frag->endpoint = ep;
|
||||
|
||||
/* Finally copy over the data that is actually being transmitted */
|
||||
memcpy(to_base_frag(frag)->segment.seg_addr.pval, to_base_frag(i)->segment.seg_addr.pval,
|
||||
to_base_frag(i)->segment.seg_len);
|
||||
to_base_frag(frag)->segment.seg_len = to_base_frag(i)->segment.seg_len;
|
||||
|
||||
/* Restore the PML fragment type header used for callbacks */
|
||||
clsc_hdr = (mca_btl_openib_header_coalesced_t *) to_coalesced_frag(i)->hdr;
|
||||
to_send_frag(frag)->hdr->tag = clsc_hdr->tag;
|
||||
|
||||
OPAL_OUTPUT((0, "Tag pulled from old coalesced frag: tag=%d", clsc_hdr->tag));
|
||||
|
||||
/* Set to zero just to be safe */
|
||||
to_send_frag(frag)->hdr->cm_seen = 0;
|
||||
to_send_frag(frag)->hdr->credits = 0;
|
||||
|
||||
/* This function will either post the send or queue it up if the resource
|
||||
* is busy. The resource could be busy if it is out of credits or out of
|
||||
* wqe's. If we get something other then resource busy or success, then
|
||||
* we will error out entirely as an unrecoverable error. */
|
||||
retval = mca_btl_openib_endpoint_send(ep, (mca_btl_openib_send_frag_t*)frag);
|
||||
if ((OMPI_SUCCESS != retval) && (OMPI_ERR_RESOURCE_BUSY != retval)) {
|
||||
ep->endpoint_btl->error_cb(&ep->endpoint_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Now move the actual frag that caused the error */
|
||||
frag = (mca_btl_openib_com_frag_t *)
|
||||
mca_btl_openib_alloc((mca_btl_base_module_t *)ep->endpoint_btl,
|
||||
ep, to_base_frag(oldfrag)->base.order,
|
||||
to_base_frag(oldfrag)->segment.seg_len,
|
||||
to_base_frag(oldfrag)->base.des_flags | MCA_BTL_IB_NO_COALESCE);
|
||||
|
||||
OPAL_OUTPUT((-1, "Changing frag=%lx,btl=%s to frag=%lx,btl=%s, copying %d bytes\n",
|
||||
oldfrag, oldfrag->endpoint->endpoint_btl->super.btl_ifname,
|
||||
frag, ep->endpoint_btl->super.btl_ifname,
|
||||
to_base_frag(oldfrag)->segment.seg_len));
|
||||
|
||||
/* First adjust the values in the descriptor portion of the fragment.
|
||||
* Note that I do not currently set the des_context value. This field
|
||||
* is often set to the bml endpoint when the frag is created. Not sure
|
||||
* if I will ultimately need that. */
|
||||
des = (mca_btl_base_descriptor_t*)frag;
|
||||
des->des_cbfunc = olddes->des_cbfunc;
|
||||
des->des_cbdata = olddes->des_cbdata;
|
||||
|
||||
/* Now adjust fragment specific information */
|
||||
frag->endpoint = ep;
|
||||
|
||||
/* Finally copy over the data that is actually being transmitted */
|
||||
memcpy(to_base_frag(frag)->segment.seg_addr.pval, to_base_frag(oldfrag)->segment.seg_addr.pval,
|
||||
to_base_frag(oldfrag)->segment.seg_len);
|
||||
to_base_frag(frag)->segment.seg_len = to_base_frag(oldfrag)->segment.seg_len;
|
||||
|
||||
/* Set the fields in the mca_btl_openib_header_t. The fields consist of:
|
||||
* mca_btl_base_tag_t tag
|
||||
* uint8_t cm_seen;
|
||||
* uint16_t credits;
|
||||
* The tag field gets the tag from the old fragment. The other two fields
|
||||
* are set to zero. */
|
||||
|
||||
if (coalesced_len > 0) {
|
||||
/* A coalesced fragment has the tag field in a different location */
|
||||
mca_btl_openib_control_header_t *ctrl_hdr;
|
||||
mca_btl_openib_header_coalesced_t *clsc_hdr;
|
||||
/* Peel off the old PML tag from the header information. Need to work past
|
||||
* the openib_header and control_header to get to coalesce_header */
|
||||
ctrl_hdr = (mca_btl_openib_control_header_t*)(to_send_frag(oldfrag)->hdr + 1);
|
||||
clsc_hdr = (mca_btl_openib_header_coalesced_t*)(ctrl_hdr + 1);
|
||||
to_send_frag(frag)->hdr->tag = clsc_hdr->tag;
|
||||
} else {
|
||||
/* For normal send headers, copy over the tag. */
|
||||
to_send_frag(frag)->hdr->tag = to_send_frag(oldfrag)->hdr->tag;
|
||||
}
|
||||
to_send_frag(frag)->hdr->cm_seen = 0;
|
||||
to_send_frag(frag)->hdr->credits = 0;
|
||||
|
||||
/* This function will either post the send or queue it up if the resource
|
||||
* is busy. The resource could be busy if it is out of credits or out of
|
||||
* wqe's. If we get something other then resource busy or success, then
|
||||
* we will error out entirely as an unrecoverable error. */
|
||||
retval = mca_btl_openib_endpoint_send(ep, (mca_btl_openib_send_frag_t*)frag);
|
||||
if ((OMPI_SUCCESS != retval) && (OMPI_ERR_RESOURCE_BUSY != retval)) {
|
||||
ep->endpoint_btl->error_cb(&ep->endpoint_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
|
||||
}
|
||||
|
||||
/* Some extra debugging tool. Should be removed eventually. This prints
|
||||
* out the PML header that is in the newly created fragment. */
|
||||
{
|
||||
mca_pml_ob1_common_hdr_t* hdr;
|
||||
mca_pml_ob1_match_hdr_t* mhdr;
|
||||
mca_pml_ob1_frag_hdr_t* fhdr;
|
||||
uint8_t type;
|
||||
|
||||
hdr = (mca_pml_ob1_common_hdr_t*)des->des_src->seg_addr.pval;
|
||||
type = hdr->hdr_type;
|
||||
switch (type) {
|
||||
case MCA_PML_OB1_HDR_TYPE_MATCH:
|
||||
mhdr = (mca_pml_ob1_match_hdr_t*)hdr;
|
||||
OPAL_OUTPUT((-1, "MATCH,frag=%d,tag=%d,src=%d,seq=%d",
|
||||
frag, mhdr->hdr_tag, mhdr->hdr_src, mhdr->hdr_seq));
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_FRAG:
|
||||
fhdr = (mca_pml_ob1_frag_hdr_t*)hdr;
|
||||
OPAL_OUTPUT((-1, "FRAG,frag=%lx,rreq=%lx,len=%d,offset=%d",
|
||||
frag, fhdr->hdr_dst_req.pval, to_base_frag(frag)->segment.seg_len,
|
||||
fhdr->hdr_frag_offset));
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_RNDV:
|
||||
OPAL_OUTPUT((-1, "RNDV,frag=%lx", frag));
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_ACK:
|
||||
OPAL_OUTPUT((-1, "ACK,frag=%lx", frag));
|
||||
break;
|
||||
default:
|
||||
OPAL_OUTPUT((-1, "OTHER,frag=%lx", frag));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This function will move all the pending fragments from one endpoint
|
||||
* to another. It walks through each qp with each priority and looks
|
||||
* for both no_credits_pending_frags and no_wqe_pending_frags and
|
||||
* moves any it finds. This is called when we detect an error on a
|
||||
* btl and we are trying to recover.
|
||||
*/
|
||||
static void move_all_pending_frags(mca_btl_base_endpoint_t *old_ep,
|
||||
mca_btl_base_endpoint_t *new_ep)
|
||||
{
|
||||
int qp, pri, rc, len, total;
|
||||
opal_list_item_t *item;
|
||||
mca_btl_openib_com_frag_t* frag;
|
||||
|
||||
total = 0;
|
||||
/* Traverse all QPs and all priorities and move to other endpoint */
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
|
||||
for (pri = 0; pri < 2; ++pri) {
|
||||
/* All types of qp's have a no_wqe_pending_frags list */
|
||||
len = opal_list_get_size(&old_ep->qps[qp].no_wqe_pending_frags[pri]);
|
||||
if (len > 0) {
|
||||
total += len;
|
||||
opal_output(0, "Checking for no_wqe_pending_frags qp=%d, pri=%d, list size=%d",
|
||||
qp, pri, len);
|
||||
while (NULL != (item = opal_list_remove_first(&old_ep->qps[qp].
|
||||
no_wqe_pending_frags[pri]))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
mca_btl_openib_move_frag(new_ep, frag);
|
||||
}
|
||||
}
|
||||
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
len = opal_list_get_size(&old_ep->qps[qp].no_credits_pending_frags[pri]);
|
||||
if (len > 0) {
|
||||
total += len;
|
||||
opal_output(0, "Checking for no_credits_pending_frags qp=%d, pri=%d, list size=%d",
|
||||
qp, pri, len);
|
||||
while (NULL != (item = opal_list_remove_first(&old_ep->qps[qp].
|
||||
no_credits_pending_frags[pri]))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
mca_btl_openib_move_frag(new_ep, frag);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
|
||||
len = opal_list_get_size(&old_ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]);
|
||||
if (len > 0) {
|
||||
total += len;
|
||||
opal_output(0, "Checking for srq pending_frags qp=%d, pri=%d, list size=%d",
|
||||
qp, pri, len);
|
||||
while (NULL != (item = opal_list_remove_first(&old_ep->endpoint_btl->qps[qp].
|
||||
u.srq_qp.pending_frags[pri]))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
mca_btl_openib_move_frag(new_ep, frag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Check for any frags from a connection that was never made. Not sure if this
|
||||
* can actually happen. */
|
||||
len = opal_list_get_size(&old_ep->pending_lazy_frags);
|
||||
if (len > 0) {
|
||||
total += len;
|
||||
opal_output(0, "Checking for pending_lazy_frags, list size=%d", len);
|
||||
while (NULL != (item = opal_list_remove_first(&(old_ep->pending_lazy_frags)))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
mca_btl_openib_move_frag(new_ep, frag);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((-1, "Finished checking for pending_frags, total moved=%d",
|
||||
total));
|
||||
}
|
||||
|
||||
static char *cq_name[] = {"HP CQ", "LP CQ"};
|
||||
static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
struct ibv_wc *wc)
|
||||
@ -2818,9 +3064,11 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
mca_btl_openib_com_frag_t* frag;
|
||||
mca_btl_base_descriptor_t *des;
|
||||
mca_btl_openib_endpoint_t* endpoint;
|
||||
mca_btl_openib_endpoint_t* newep;
|
||||
mca_btl_openib_module_t *openib_btl = NULL;
|
||||
ompi_proc_t* remote_proc = NULL;
|
||||
int qp, btl_ownership;
|
||||
int holdon = 1;
|
||||
|
||||
des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id;
|
||||
frag = to_com_frag(des);
|
||||
@ -2834,6 +3082,32 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
if(endpoint)
|
||||
openib_btl = endpoint->endpoint_btl;
|
||||
|
||||
/* These are the three types of fragments we have seen so far */
|
||||
if ((openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) &&
|
||||
(openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_SEND) &&
|
||||
(openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_CONTROL)) {
|
||||
OPAL_OUTPUT((0, "Fragment is type %d, size=%d", openib_frag_type(des), (int)wc->byte_len));
|
||||
}
|
||||
|
||||
/* Quiet some of the receive frag errors */
|
||||
if (openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) {
|
||||
OPAL_OUTPUT((-1, "Fragment is type %d, size=%d", openib_frag_type(des), (int)wc->byte_len));
|
||||
OPAL_OUTPUT((-1, "\nCQ btl=%s: status=%s(%d),wr_id=%d,opcode=%d",
|
||||
openib_btl->super.btl_ifname,
|
||||
btl_openib_component_status_to_string(wc->status),
|
||||
wc->status, (void *)(uintptr_t)wc->wr_id, wc->opcode));
|
||||
if (des->des_src) {
|
||||
mca_pml_ob1_frag_hdr_t* hdr = (mca_pml_ob1_frag_hdr_t*)des->des_src->seg_addr.pval;
|
||||
if (MCA_PML_OB1_HDR_TYPE_FRAG == hdr->hdr_common.hdr_type) {
|
||||
OPAL_OUTPUT((-1, "frag=TYPE_FRAG,offset=%d", hdr->hdr_frag_offset));
|
||||
} else if (MCA_PML_OB1_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type) {
|
||||
OPAL_OUTPUT((-1, "frag=TYPE_RNDV"));
|
||||
} else {
|
||||
OPAL_OUTPUT((-1, "frag=OTHER"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(wc->status != IBV_WC_SUCCESS) {
|
||||
OPAL_OUTPUT((-1, "Got WC: ERROR"));
|
||||
goto error;
|
||||
@ -2899,7 +3173,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
/* Process a RECV */
|
||||
if(btl_openib_handle_incoming(openib_btl, endpoint, to_recv_frag(frag),
|
||||
wc->byte_len) != OMPI_SUCCESS) {
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -2916,7 +3190,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
default:
|
||||
BTL_ERROR(("Unhandled work completion opcode is %d", wc->opcode));
|
||||
if(openib_btl)
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -2942,6 +3216,7 @@ error:
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
if(IBV_WC_WR_FLUSH_ERR != wc->status || !flush_err_printed[cq]++) {
|
||||
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
|
||||
"status number %d for wr_id %" PRIx64 " opcode %d vendor error %d qp_idx %d",
|
||||
@ -2957,7 +3232,13 @@ error:
|
||||
wc->status, wc->wr_id,
|
||||
wc->opcode, wc->vendor_err, qp);
|
||||
}
|
||||
|
||||
if (openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) {
|
||||
OPAL_OUTPUT((0, "Error on btl=%s: wc->status=%s(%d), wc->wr_id=%d",
|
||||
openib_btl->super.btl_ifname,
|
||||
btl_openib_component_status_to_string(wc->status),
|
||||
wc->status, (void *)(uintptr_t)wc->wr_id));
|
||||
}
|
||||
|
||||
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
|
||||
IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||
char *peer_hostname =
|
||||
@ -2993,9 +3274,97 @@ error:
|
||||
device_name, peer_hostname);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* If failover is not enabled, just error out like we always did */
|
||||
if(!mca_btl_openib_component.enable_hca_failover) {
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
|
||||
}
|
||||
|
||||
if(openib_btl)
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
||||
/* Here is where we figure out what to do with the unsent fragment. To keep
|
||||
* things clear, I handle each one differently.
|
||||
* Note: In the wc struct, these are the only valid fields with an error:
|
||||
* wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
|
||||
* This means we cannot key off of the wc->opcode to see what operation we did. */
|
||||
|
||||
/* Drop any errors receiving on a PP connection. There is nothing else to do */
|
||||
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && (BTL_OPENIB_QP_TYPE_PP(qp))) {
|
||||
OPAL_OUTPUT((-1, "RECV or CONTROL, dropping since connection is broken (des=%d)", des));
|
||||
return;
|
||||
}
|
||||
|
||||
/* Drop any CONTROL messages as they are only valid on this connection. */
|
||||
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) {
|
||||
OPAL_OUTPUT((-1, "RECV or CONTROL, dropping since connection is broken (des=%d)", des));
|
||||
return;
|
||||
}
|
||||
|
||||
/* MCA_BTL_OPENIB_FRAG_EAGER_RDMA is a openib specific control message
|
||||
* used to set up eager RDMA on a connection. Since the connection
|
||||
* is broken, just drop it. */
|
||||
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA) {
|
||||
OPAL_OUTPUT((-1, "OPENIB_FRAG_EAGER_RDMA, dropping since connection is broken (des=%d)", des));
|
||||
}
|
||||
|
||||
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
OPAL_OUTPUT((0, "SRQ RECV type=%d, size=%d", openib_frag_type(des), (int)wc->byte_len));
|
||||
return;
|
||||
#if 0
|
||||
while (holdon) {
|
||||
holdon++;
|
||||
opal_output(0, "SRQ RECV DETECTED - ATTACH DEBUGGER");
|
||||
sleep(5);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* If we get an error on a receive then just map out the interface
|
||||
* for any future sends. There is nothing to retransmit.
|
||||
* NOTE: Not sure what to do with this yet */
|
||||
if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && (BTL_OPENIB_QP_TYPE_PP(qp))) ||
|
||||
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL)) {
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, &newep);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
/* For shared receive queues, we need to return the fragments and
|
||||
* repost the receives since they are a shared resource. For
|
||||
* peer-to-peer queues, we do nothing.
|
||||
* NOTE: Not sure what to do here yet. I cannot get the btl or the endpoint
|
||||
* from the fragment that is returned. Usually, the endpoint is retrieved via
|
||||
* the immediate data, but obviously the immediate data is non-existent on an
|
||||
* error. All I really need is the btl but I am not sure where I get that
|
||||
* from. I have observed that I am not getting many errors on the receive
|
||||
* so I will not worry now about reposting them. */
|
||||
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
OPAL_OUTPUT((0, "SRQ RECV type=%d, size=%d", openib_frag_type(des), (int)wc->byte_len));
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
mca_btl_openib_module_t *btl = endpoint->endpoint_btl;
|
||||
OPAL_THREAD_ADD32(&btl->qps[qp].u.srq_qp.rd_posted, -1);
|
||||
mca_btl_openib_post_srr(btl, qp);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Need to keep calling this to get the alternative endpoint back.
|
||||
* However, subsequent calls will not actually map anything out.
|
||||
* Note that we do not call this on a SRQ receive error or any
|
||||
* type of receive error. */
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, &newep);
|
||||
|
||||
/* Move all the pending frags to the new endpoint as they can no
|
||||
* longer go out the broken endpoint. OPTIMIZATION: Like the PML
|
||||
* callback, this really only needs to be called once. However, it
|
||||
* does not hurt anything to keep calling it. Subsequent calls will
|
||||
* just have nothing to move over. */
|
||||
move_all_pending_frags(endpoint, newep);
|
||||
|
||||
/* Now move the fragment that triggered the error over to the
|
||||
* other endpoint */
|
||||
mca_btl_openib_move_frag(newep, frag);
|
||||
|
||||
}
|
||||
|
||||
static int poll_device(mca_btl_openib_device_t* device, int count)
|
||||
@ -3029,6 +3398,7 @@ static int poll_device(mca_btl_openib_device_t* device, int count)
|
||||
device->hp_cq_polls--;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((-1, "ibv_poll_cq found CQ event on %s", device->ib_dev->name));
|
||||
handle_wc(device, cq, &wc);
|
||||
}
|
||||
|
||||
@ -3125,7 +3495,7 @@ static int progress_one_device(mca_btl_openib_device_t *device)
|
||||
ret = btl_openib_handle_incoming(btl, to_com_frag(frag)->endpoint,
|
||||
frag, size - sizeof(mca_btl_openib_footer_t));
|
||||
if (ret != MPI_SUCCESS) {
|
||||
btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
||||
btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -3144,6 +3514,26 @@ static int progress_one_device(mca_btl_openib_device_t *device)
|
||||
return count;
|
||||
}
|
||||
|
||||
void btl_dump_pending_lists() {
|
||||
int i,j;
|
||||
mca_btl_openib_endpoint_t* endpoint;
|
||||
|
||||
for(i = 0; i < mca_btl_openib_component.devices_count; i++) {
|
||||
mca_btl_openib_device_t *device =
|
||||
opal_pointer_array_get_item(&mca_btl_openib_component.devices, i);
|
||||
for (j = 0; j < 10; j++) {
|
||||
endpoint = (mca_btl_openib_endpoint_t*)
|
||||
opal_pointer_array_get_item(device->endpoints, j);
|
||||
if (endpoint != NULL) {
|
||||
opal_output(0, "pending_lazy_frags size = %d",
|
||||
endpoint->pending_lazy_frags.opal_list_length);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* IB component progress.
|
||||
*/
|
||||
@ -3176,7 +3566,7 @@ error:
|
||||
mca_btl_openib_module_t* openib_btl =
|
||||
mca_btl_openib_component.openib_btls[i];
|
||||
if(openib_btl->device->got_fatal_event) {
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
|
||||
}
|
||||
}
|
||||
return count;
|
||||
|
@ -1018,7 +1018,7 @@ void *mca_btl_openib_endpoint_invoke_error(void *context)
|
||||
}
|
||||
|
||||
/* Invoke the callback to the upper layer */
|
||||
btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL);
|
||||
btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
|
||||
|
||||
/* Will likely never get here */
|
||||
return NULL;
|
||||
|
@ -24,6 +24,7 @@
|
||||
#ifndef MCA_BTL_IB_ENDPOINT_H
|
||||
#define MCA_BTL_IB_ENDPOINT_H
|
||||
|
||||
#include <unistd.h>
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/util/output.h"
|
||||
@ -35,6 +36,7 @@
|
||||
#include <string.h>
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
#include "connect/base.h"
|
||||
#include "ompi/mca/pml/ob1/pml_ob1_hdr.h" /* For debugging only */
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -420,6 +422,7 @@ static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep,
|
||||
mca_btl_base_descriptor_t *des, opal_list_t *pending_list)
|
||||
{
|
||||
int rc = OMPI_ERR_RESOURCE_BUSY;
|
||||
int holdon = 1;
|
||||
|
||||
switch(ep->endpoint_state) {
|
||||
case MCA_BTL_IB_CLOSED:
|
||||
@ -437,6 +440,13 @@ static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep,
|
||||
/* fall through */
|
||||
default:
|
||||
opal_list_append(pending_list, (opal_list_item_t *)des);
|
||||
#if 0
|
||||
while (holdon) {
|
||||
holdon++;
|
||||
opal_output(0, "STARTING CONNECTION on %d - ATTACH DEBUGGER", getpid());
|
||||
sleep(5);
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
case MCA_BTL_IB_FAILED:
|
||||
rc = OMPI_ERR_UNREACH;
|
||||
@ -476,6 +486,40 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
|
||||
struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc;
|
||||
struct ibv_send_wr *bad_wr;
|
||||
int qp = to_base_frag(frag)->base.order;
|
||||
static int printstuff = 0;
|
||||
|
||||
if (printstuff == 1) {
|
||||
/* Some extra debugging tool. Should be removed eventually. This prints
|
||||
* out the PML header that is in the newly created fragment. */
|
||||
mca_pml_ob1_common_hdr_t* hdr;
|
||||
mca_pml_ob1_match_hdr_t* mhdr;
|
||||
mca_pml_ob1_frag_hdr_t* fhdr;
|
||||
uint8_t type;
|
||||
|
||||
hdr = (mca_pml_ob1_common_hdr_t*)seg->seg_addr.pval;
|
||||
type = hdr->hdr_type;
|
||||
switch (type) {
|
||||
case MCA_PML_OB1_HDR_TYPE_MATCH:
|
||||
mhdr = (mca_pml_ob1_match_hdr_t*)hdr;
|
||||
OPAL_OUTPUT((-1, "MATCH,frag=%d,tag=%d,src=%d,seq=%d",
|
||||
frag, mhdr->hdr_tag, mhdr->hdr_src, mhdr->hdr_seq));
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_FRAG:
|
||||
fhdr = (mca_pml_ob1_frag_hdr_t*)hdr;
|
||||
OPAL_OUTPUT((-1, "FRAG,frag=%lx,rreq=%lx,len=%d,offset=%d",
|
||||
frag, fhdr->hdr_dst_req.pval, to_base_frag(frag)->segment.seg_len,
|
||||
fhdr->hdr_frag_offset));
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_RNDV:
|
||||
OPAL_OUTPUT((-1, "RNDV,frag=%lx", frag));
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_ACK:
|
||||
OPAL_OUTPUT((-1, "ACK,frag=%lx", frag));
|
||||
break;
|
||||
default:
|
||||
OPAL_OUTPUT((-1, "OTHER,frag=%lx", frag));
|
||||
}
|
||||
}
|
||||
|
||||
sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) +
|
||||
(rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length;
|
||||
@ -527,7 +571,18 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
|
||||
#endif
|
||||
assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr);
|
||||
|
||||
return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
|
||||
{
|
||||
int retval;
|
||||
retval = ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
|
||||
if (0 == retval) {
|
||||
OPAL_OUTPUT((-1, "SUCCESS: Posted %d frag on %s\n", frag,
|
||||
ep->endpoint_btl->super.btl_ifname));
|
||||
} else {
|
||||
OPAL_OUTPUT((0, "FAILURE: Did not posted %d frag on %s\n", frag,
|
||||
ep->endpoint_btl->super.btl_ifname));
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -166,6 +166,7 @@ do { \
|
||||
} while (0)
|
||||
|
||||
enum mca_btl_openib_frag_type_t {
|
||||
MCA_BTL_OPENIB_FRAG_UNUSED, /* For debugging: Makes FRAG_RECV=1 */
|
||||
MCA_BTL_OPENIB_FRAG_RECV,
|
||||
MCA_BTL_OPENIB_FRAG_RECV_USER,
|
||||
MCA_BTL_OPENIB_FRAG_SEND,
|
||||
|
@ -555,6 +555,11 @@ int btl_openib_register_mca_params(void)
|
||||
NULL, &mca_btl_openib_component.ipaddr_exclude,
|
||||
0));
|
||||
|
||||
CHECK(reg_int("enable_hca_failover", NULL,
|
||||
"Enable failover from one HCA to another", 1, &ival, 0));
|
||||
mca_btl_openib_component.enable_hca_failover = (0 != ival);
|
||||
|
||||
|
||||
/* Register any MCA params for the connect pseudo-components */
|
||||
if (OMPI_SUCCESS == ret) {
|
||||
ret = ompi_btl_openib_connect_base_register();
|
||||
|
@ -36,6 +36,7 @@
|
||||
mca_btl_pcie_module_t mca_btl_pcie_module = {
|
||||
{
|
||||
&mca_btl_pcie_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* Threshold below which BTL should not fragment */
|
||||
0, /* max send fragment size */
|
||||
|
@ -38,6 +38,7 @@
|
||||
mca_btl_portals_module_t mca_btl_portals_module = {
|
||||
{
|
||||
&mca_btl_portals_component.super,
|
||||
"unknown",
|
||||
|
||||
/* NOTE: All these default values are set in
|
||||
component_open() */
|
||||
|
@ -34,6 +34,7 @@
|
||||
mca_btl_sctp_module_t mca_btl_sctp_module = {
|
||||
{
|
||||
&mca_btl_sctp_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
|
@ -36,6 +36,7 @@
|
||||
|
||||
mca_btl_base_module_t mca_btl_self = {
|
||||
&mca_btl_self_component.super,
|
||||
"unknown",
|
||||
0, /* btl_eager_limit */
|
||||
0, /* btl_rndv_eager_limit */
|
||||
0, /* btl_max_send_size */
|
||||
|
@ -56,6 +56,7 @@
|
||||
mca_btl_sm_t mca_btl_sm = {
|
||||
{
|
||||
&mca_btl_sm_component.super,
|
||||
"sm",
|
||||
0, /* btl_eager_limit */
|
||||
0, /* btl_rndv_eager_limit */
|
||||
0, /* btl_max_send_size */
|
||||
|
@ -36,6 +36,7 @@
|
||||
mca_btl_tcp_module_t mca_btl_tcp_module = {
|
||||
{
|
||||
&mca_btl_tcp_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
|
@ -33,6 +33,7 @@
|
||||
mca_btl_template_module_t mca_btl_template_module = {
|
||||
{
|
||||
&mca_btl_template_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
|
@ -50,6 +50,7 @@ static int mca_btl_udapl_assign_netmask(mca_btl_udapl_module_t* udapl_btl);
|
||||
mca_btl_udapl_module_t mca_btl_udapl_module = {
|
||||
{
|
||||
&mca_btl_udapl_component.super,
|
||||
"unknown",
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
|
@ -41,6 +41,7 @@
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
|
||||
@ -70,7 +71,8 @@ mca_pml_ob1_t mca_pml_ob1 = {
|
||||
|
||||
|
||||
void mca_pml_ob1_error_handler( struct mca_btl_base_module_t* btl,
|
||||
int32_t flags );
|
||||
int32_t flags, ompi_proc_t* ompi_proc,
|
||||
struct mca_btl_base_endpoint_t** btl_endpoint);
|
||||
|
||||
int mca_pml_ob1_enable(bool enable)
|
||||
{
|
||||
@ -565,10 +567,70 @@ void mca_pml_ob1_process_pending_rdma(void)
|
||||
}
|
||||
|
||||
|
||||
void mca_pml_ob1_error_handler(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
int32_t flags) {
|
||||
orte_errmgr.abort(-1, NULL);
|
||||
/**
 * PML-level reaction to an error reported by a BTL (presumably installed
 * as the BTL error callback -- confirm against the registration code).
 *
 * Fatal errors abort the job.  For non-fatal errors the failing BTL is
 * removed from the BML endpoint of the affected peer so that traffic can
 * continue over the remaining BTLs; if the peer has no BTL left in any of
 * its eager/send/rdma arrays the job is aborted.
 *
 * @param btl           BTL module that reported the error.
 * @param flags         Error flags; MCA_BTL_ERROR_FLAGS_FATAL forces an abort.
 * @param errproc       Peer process reached through the failing BTL.  May be
 *                      NULL when the BTL cannot name a peer; failover is
 *                      impossible in that case and the job is aborted.
 * @param btl_endpoint  (OUT, optional) receives the endpoint of the first
 *                      remaining send BTL for errproc, or NULL if no send
 *                      BTL remains.  Ignored when NULL.
 */
void mca_pml_ob1_error_handler( struct mca_btl_base_module_t* btl,
                                int32_t flags, ompi_proc_t *errproc,
                                struct mca_btl_base_endpoint_t** btl_endpoint)
{
    mca_bml_base_endpoint_t* ep;

    if (flags & MCA_BTL_ERROR_FLAGS_FATAL) {
        orte_errmgr.abort(-1, NULL);
    }

    /* Without a peer there is nothing to map out; dereferencing errproc
     * below would crash, so give up explicitly instead. */
    if (NULL == errproc) {
        orte_errmgr.abort(-1, NULL);
        return;
    }

    /**
     * Just remove the offending bml_btl corresponding to the btl with the
     * error.  Let the other errors remove the other ones.
     */
    if (0 < mca_bml.bml_del_proc_btl(errproc, btl)) {
        opal_output(0, "PML error handler: rank=%d mapping out btl:name=%s,if=%s to rank=%d on node=%s",
                    ORTE_PROC_MY_NAME->vpid,
                    btl->btl_component->btl_version.mca_component_name,
                    btl->btl_ifname,
                    errproc->proc_name.vpid,
                    errproc->proc_hostname);
    }

    ep = (mca_bml_base_endpoint_t*)errproc->proc_bml;

    /* No BTL of any kind is left for this peer: it is unreachable. */
    if ((ep->btl_eager.arr_size == 0) &&
        (ep->btl_send.arr_size == 0) &&
        (ep->btl_rdma.arr_size == 0)) {
        opal_output(0, "NO MORE INTERFACES - BYE BYE");
        orte_errmgr.abort(-1, NULL);
    }

    /**
     * Now return the first one in the list.  Odds are there were only
     * two to start with and now we are down to one.  The send array can be
     * empty while the eager or rdma arrays are not, so check its size
     * before indexing instead of reading a non-existent entry.
     */
    if (NULL != btl_endpoint) {
        *btl_endpoint = (0 < ep->btl_send.arr_size)
            ? ep->btl_send.bml_btls[0].btl_endpoint
            : NULL;
    }

    orte_notifier.log(ORTE_NOTIFIER_INFRA, ORTE_ERR_COMM_FAILURE,
                      "Mapping out btl component %s with interface %s",
                      btl->btl_component->btl_version.mca_component_name,
                      btl->btl_ifname);
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 0
|
||||
|
@ -81,6 +81,7 @@ struct mca_pml_ob1_t {
|
||||
typedef struct mca_pml_ob1_t mca_pml_ob1_t;
|
||||
|
||||
extern mca_pml_ob1_t mca_pml_ob1;
|
||||
extern int mca_pml_ob1_output;
|
||||
|
||||
/*
|
||||
* PML interface functions.
|
||||
|
@ -47,6 +47,7 @@ static mca_pml_base_module_t*
|
||||
mca_pml_ob1_component_init( int* priority, bool enable_progress_threads,
|
||||
bool enable_mpi_threads );
|
||||
static int mca_pml_ob1_component_fini(void);
|
||||
int mca_pml_ob1_output = 0;
|
||||
|
||||
mca_pml_base_component_2_0_0_t mca_pml_ob1_component = {
|
||||
|
||||
@ -93,6 +94,11 @@ static inline int mca_pml_ob1_param_register_int(
|
||||
static int mca_pml_ob1_component_open(void)
|
||||
{
|
||||
mca_allocator_base_component_t* allocator_component;
|
||||
int value;
|
||||
|
||||
value = mca_pml_ob1_param_register_int("verbose", 0);
|
||||
mca_pml_ob1_output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_pml_ob1_output, value);
|
||||
|
||||
mca_pml_ob1.free_list_num =
|
||||
mca_pml_ob1_param_register_int("free_list_num", 4);
|
||||
|
@ -317,6 +317,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
int triperr = 1;
|
||||
mca_btl_base_segment_t* segments = des->des_dst;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_ob1_recv_request_t* recvreq;
|
||||
@ -326,6 +327,15 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
}
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
|
||||
recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
|
||||
if (recvreq->req_recv.req_base.req_ompi.req_state == OMPI_REQUEST_INVALID) {
|
||||
while (triperr) {
|
||||
triperr++;
|
||||
opal_output(0, "ERROR DETECTED - ATTACH DEBUGGER");
|
||||
sleep(5);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
|
||||
|
||||
return;
|
||||
@ -592,6 +602,9 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
|
||||
|
||||
/* get sequence number of next message that can be processed */
|
||||
next_msg_seq_expected = (uint16_t)proc->expected_sequence;
|
||||
opal_output_verbose(20, mca_pml_ob1_output,
|
||||
"frag_msg_seq=%d, next_msg_seq_expected=%d",
|
||||
frag_msg_seq, next_msg_seq_expected);
|
||||
if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected))
|
||||
goto wrong_seq;
|
||||
|
||||
@ -671,5 +684,5 @@ wrong_seq:
|
||||
num_segments, NULL);
|
||||
OPAL_THREAD_UNLOCK(&comm->matching_lock);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -422,6 +422,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
|
||||
0, bytes_received );
|
||||
bytes_received -= sizeof(mca_pml_ob1_frag_hdr_t);
|
||||
data_offset = hdr->hdr_frag.hdr_frag_offset;
|
||||
OPAL_OUTPUT((-1, " Received SEND_FRAG, offset=%d", data_offset));
|
||||
/*
|
||||
* Make user buffer accessible (defined) before unpacking.
|
||||
*/
|
||||
|
@ -36,6 +36,7 @@
|
||||
OBJ_CLASS_INSTANCE(mca_pml_ob1_send_range_t, ompi_free_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
|
||||
void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
|
||||
{
|
||||
int i, s = opal_list_get_size(&mca_pml_ob1.send_pending);
|
||||
@ -544,9 +545,11 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
||||
switch(rc) {
|
||||
case OMPI_ERR_RESOURCE_BUSY:
|
||||
/* No more resources. Allow the upper level to queue the send */
|
||||
opal_output(0, "OMPI_ERR_RESOURCE_BUSY returned from mca_pml_ob1_send_request_start_copy");
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
break;
|
||||
default:
|
||||
opal_output(0, "ERROR ERROR ERROR ERROR ERROR in start_copy");
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
break;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user