From 28a391730650244097e41021ff8b5b6657d7b5f2 Mon Sep 17 00:00:00 2001 From: Pavel Shamis Date: Mon, 28 Jan 2008 10:38:08 +0000 Subject: [PATCH] Adding APM support (over different lids). This commit was SVN r17280. --- ompi/mca/btl/openib/btl_openib.h | 2 + ompi/mca/btl/openib/btl_openib_async.c | 46 ++++++++++++++++++++- ompi/mca/btl/openib/btl_openib_async.h | 2 +- ompi/mca/btl/openib/btl_openib_component.c | 25 ++++++++++- ompi/mca/btl/openib/btl_openib_endpoint.c | 13 ++++++ ompi/mca/btl/openib/btl_openib_mca.c | 5 +++ ompi/mca/btl/openib/help-mpi-btl-openib.txt | 7 ++++ 7 files changed, 96 insertions(+), 4 deletions(-) diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 38d60051a4..4c355cfb43 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -169,6 +169,7 @@ struct mca_btl_openib_component_t { int32_t max_eager_rdma; uint32_t btls_per_lid; uint32_t max_lmc; + int32_t apm; uint32_t buffer_alignment; /**< Preferred communication buffer alignment in Bytes (must be power of two) */ #if OMPI_HAVE_THREADS int32_t fatal_counter; /**< Counts number on fatal events that we got on all hcas */ @@ -334,6 +335,7 @@ struct mca_btl_openib_module_t { uint16_t pkey_index; struct ibv_port_attr ib_port_attr; uint16_t lid; /**< lid that is actually used (for LMC) */ + uint16_t apm_lmc_max; /**< the maximal lmc that can be used for apm */ uint8_t src_path_bits; /**< offset from base lid (for LMC) */ int32_t num_peers; diff --git a/ompi/mca/btl/openib/btl_openib_async.c b/ompi/mca/btl/openib/btl_openib_async.c index a352d62b4d..fa45bcea3c 100644 --- a/ompi/mca/btl/openib/btl_openib_async.c +++ b/ompi/mca/btl/openib/btl_openib_async.c @@ -206,6 +206,13 @@ static int btl_openib_async_hcah(struct mca_btl_openib_async_poll *hcas_poll, in } } switch(event.event_type) { + case IBV_EVENT_PATH_MIG: + if (0 != mca_btl_openib_component.apm) { + BTL_ERROR(("APM: Alternative path migration reported.")); + mca_btl_openib_load_apm(event.element.qp, + mca_btl_openib_component.openib_btls[j]); + } + break; case IBV_EVENT_DEVICE_FATAL: /* Set the flag to fatal */ hca->got_fatal_event = true; @@ -215,7 +222,6 @@ static int btl_openib_async_hcah(struct mca_btl_openib_async_poll *hcas_poll, in case IBV_EVENT_QP_FATAL: case IBV_EVENT_QP_REQ_ERR: case IBV_EVENT_QP_ACCESS_ERR: - case IBV_EVENT_PATH_MIG: case IBV_EVENT_PATH_MIG_ERR: case IBV_EVENT_SRQ_ERR: case IBV_EVENT_PORT_ERR: @@ -315,4 +321,42 @@ void* btl_openib_async_thread(void * async) } return PTHREAD_CANCELED; } + +/* Load new dlid to the QP */ +void mca_btl_openib_load_apm(struct ibv_qp *qp, struct mca_btl_openib_module_t *btl) +{ + struct ibv_qp_init_attr qp_init_attr; + struct ibv_qp_attr attr; + enum ibv_qp_attr_mask mask; + + BTL_VERBOSE(("APM: Loading alternative path")); + + if (mca_btl_openib_component.num_xrc_qps > 0) { + /* XRC API is not ready */ + } else { + if (ibv_query_qp(qp, &attr, mask, &qp_init_attr)) + BTL_ERROR(("Failed to ibv_query_qp, qp num: %d", qp->qp_num)); + } + + if (attr.ah_attr.src_path_bits - btl->src_path_bits < btl->apm_lmc_max) { + attr.alt_ah_attr.src_path_bits = attr.ah_attr.src_path_bits + 1; + } else { + BTL_ERROR(("Failed to load alternative path, all %d were used", + attr.ah_attr.src_path_bits - btl->src_path_bits)); + } + + attr.alt_ah_attr.dlid = attr.ah_attr.dlid + 1; + mask = IBV_QP_ALT_PATH|IBV_QP_PATH_MIG_STATE; + attr.alt_ah_attr.static_rate = attr.ah_attr.static_rate; + attr.alt_ah_attr.sl = attr.ah_attr.sl; + attr.alt_pkey_index = attr.pkey_index; + attr.alt_port_num = attr.port_num; + + if (mca_btl_openib_component.num_xrc_qps > 0) { + /* XRC API is not ready */ + } else { + if (ibv_modify_qp(qp, &attr, mask)) + BTL_ERROR(("Failed to ibv_query_qp, qp num: %d", qp->qp_num)); + } +} #endif diff --git a/ompi/mca/btl/openib/btl_openib_async.h b/ompi/mca/btl/openib/btl_openib_async.h index 09735bb5ab..d84573bba1 100644 --- a/ompi/mca/btl/openib/btl_openib_async.h +++ b/ompi/mca/btl/openib/btl_openib_async.h @@ -13,5 +13,5 @@ #define MCA_BTL_OPENIB_ASYNC_H void* btl_openib_async_thread(void *one_hca); - +void mca_btl_openib_load_apm(struct ibv_qp *qp, struct mca_btl_openib_module_t *btl); #endif diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 559a26adb1..9e73214c5b 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -370,7 +370,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca, uint8_t port_num, uint16_t pkey_index, struct ibv_port_attr *ib_port_attr) { - uint16_t lid, i, lmc; + uint16_t lid, i, lmc, lmc_step; mca_btl_openib_module_t *openib_btl; mca_btl_base_selected_module_t *ib_selected; union ibv_gid gid; @@ -388,14 +388,34 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca, } lmc = (1 << ib_port_attr->lmc); + lmc_step = 1; if (0 != mca_btl_openib_component.max_lmc && mca_btl_openib_component.max_lmc < lmc) { lmc = mca_btl_openib_component.max_lmc; } + /* APM support */ + if (lmc > 1){ + if (-1 == mca_btl_openib_component.apm) { + lmc_step = lmc; + } else if (0 == lmc % (mca_btl_openib_component.apm + 1)) { + lmc_step = mca_btl_openib_component.apm + 1; + } else { + opal_show_help("help-mpi-btl-openib.txt", "apm with wrong lmc",true, + mca_btl_openib_component.apm, lmc); + return OMPI_ERROR; + } + } else { + if (mca_btl_openib_component.apm) { + /* Disable apm and report warning */ + mca_btl_openib_component.apm = 0; + opal_show_help("help-mpi-btl-openib.txt", "apm without lmc",true); + } + } + for(lid = ib_port_attr->lid; - lid < ib_port_attr->lid + lmc; lid++){ + lid < ib_port_attr->lid + lmc; lid += lmc_step){ for(i = 0; i < mca_btl_openib_component.btls_per_lid; i++){ char param[40]; int rc; @@ -415,6 +435,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca, openib_btl->port_num = (uint8_t) port_num; openib_btl->pkey_index = pkey_index; openib_btl->lid = lid; + openib_btl->apm_lmc_max = lmc_step ; openib_btl->src_path_bits = lid - ib_port_attr->lid; /* store the subnet for multi-nic support */ openib_btl->port_info.subnet_id = subnet_id; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index 29fa674a64..d820bc785c 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -43,6 +43,7 @@ #include "btl_openib_endpoint.h" #include "btl_openib_proc.h" #include "btl_openib_xrc.h" +#include "btl_openib_async.h" static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint); static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); @@ -539,6 +540,18 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) } } + /* Run over all qps and load alternative path */ + if (0 != mca_btl_openib_component.apm) { + int i; + if (MCA_BTL_XRC_ENABLED) { + mca_btl_openib_load_apm(endpoint->ib_addr->qp->lcl_qp, endpoint->endpoint_btl); + } else { + for(i = 0; i < mca_btl_openib_component.num_qps; i++) { + mca_btl_openib_load_apm(endpoint->qps[i].qp->lcl_qp, endpoint->endpoint_btl); + } + } + } + endpoint->endpoint_state = MCA_BTL_IB_CONNECTED; endpoint->endpoint_btl->hca->non_eager_rdma_endpoints++; diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index 813d701109..e964d23009 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -402,6 +402,11 @@ int btl_openib_register_mca_params(void) 0, &ival, REGINT_GE_ZERO)); mca_btl_openib_component.max_lmc = (uint32_t) ival; + CHECK(reg_int("enable_apm", "Maximum number of alterative paths for each HCA port " + "(must be >= -1, where 0 = disable apm, -1 = all availible alternative paths )", + 0, &ival, REGINT_NEG_ONE_OK|REGINT_GE_ZERO)); + mca_btl_openib_component.apm = (uint32_t) ival; + #if OMPI_HAVE_THREADS CHECK(reg_int("use_async_event_thread", "If nonzero, use the thread that will handle InfiniBand asyncihronous events ", diff --git a/ompi/mca/btl/openib/help-mpi-btl-openib.txt b/ompi/mca/btl/openib/help-mpi-btl-openib.txt index b0eb5f87f3..12e33a194e 100644 --- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt +++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt @@ -437,3 +437,10 @@ num_pp_qps: %d WARNING: rd_win specification is non optimal. For maximum performance it is advisable to configure rd_win smaller then (rd_num - rd_low), but currently rd_win = %d and (rd_num - rd_low) = %d. +# +[apm without lmc] +WARNING: You can't enable APM support with LMC bit configured to 0. +APM support will be disabled. +# +[apm with wrong lmc] +Can not provide %d alternative paths with LMC bit configured to %d.