1
1

btl/ofi: progress now happens after a threshold.

This commit changes the way btl/ofi calls progress. Previously, we forced
progress on every rdma/atomic call, which gave a performance boost in
some cases and a slowdown in others. Now we only force progress after
a threshold number of rdma calls, which results in better performance overall.

Also added a new MCA parameter 'mca_btl_ofi_progress_threshold' to set
the threshold. The default is 64.

Also:
Added FI_DELIVERY_COMPLETE to the tx_attr op_flags to ensure that the completion
is generated only after the message has been received on the remote side.

Signed-off-by: Thananon Patinyasakdikul <thananon.patinyasakdikul@intel.com>
Этот коммит содержится в:
Thananon Patinyasakdikul 2018-06-25 12:23:57 -07:00
родитель e59f58a57d
Коммит be76896f7c
6 изменённых файлов: 36 добавлений и 21 удалений

Просмотреть файл

@ -48,6 +48,8 @@
BEGIN_C_DECLS
#define MCA_BTL_OFI_MAX_MODULES 16
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
#define MCA_BTL_OFI_NUM_CQE_READ 64
#define MCA_BTL_OFI_PROGRESS_THRESHOLD 64
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
@ -129,6 +131,7 @@ struct mca_btl_ofi_component_t {
int module_count;
int num_contexts_per_module;
int num_cqe_read;
int progress_threshold;
size_t namelen;

Просмотреть файл

@ -81,9 +81,6 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
/* force a bit of progress. */
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS;
}
@ -135,7 +132,6 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
}
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS;
}
@ -192,8 +188,5 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
/* force a bit of progress. */
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS;
}

Просмотреть файл

@ -49,13 +49,17 @@ static int validate_info(struct fi_info *info)
{
int mr_mode;
BTL_VERBOSE(("validating device: %s", info->domain_attr->name));
/* we need exactly all the required bits */
if ((info->caps & MCA_BTL_OFI_REQUIRED_CAPS) != MCA_BTL_OFI_REQUIRED_CAPS) {
BTL_VERBOSE(("unsupported caps"));
return OPAL_ERROR;
}
/* we need FI_EP_RDM */
if (info->ep_attr->type != FI_EP_RDM) {
BTL_VERBOSE(("unsupported EP type"));
return OPAL_ERROR;
}
@ -63,9 +67,16 @@ static int validate_info(struct fi_info *info)
if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE ||
(mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) {
BTL_VERBOSE(("unsupported MR mode"));
return OPAL_ERROR;
}
/* Reject the device unless FI_DELIVERY_COMPLETE is set in the tx op_flags.
 * The bit must be tested with '&': OR-ing with a non-zero constant always
 * yields a non-zero value, so the previous '|' check could never fail. */
if (!(info->tx_attr->op_flags & FI_DELIVERY_COMPLETE)) {
    BTL_VERBOSE(("the endpoint tx_ctx does not support FI_DELIVERY_COMPLETE"));
    return OPAL_ERROR;
}
BTL_VERBOSE(("device: %s is good to go.", info->domain_attr->name));
return OPAL_SUCCESS;
}
@ -102,14 +113,10 @@ static int mca_btl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&prov_exclude);
/* Note: better leave it at 1 for now. osc rdma module is designed for 1 completion
* at a time. Dealing with more than 1 completion in 1 read will confuse the osc rdma.
* source: 8 hours of debugging. :(*/
mca_btl_ofi_component.num_cqe_read = 1;
mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"num_cq_read",
"Number of completion entries to read from a single cq_read. "
"(default: 1)",
"Number of completion entries to read from a single cq_read. ",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
@ -135,6 +142,7 @@ static int mca_btl_ofi_component_register(void)
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.num_contexts_per_module);
disable_sep = false;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"disable_sep",
@ -144,6 +152,17 @@ static int mca_btl_ofi_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&disable_sep);
mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_PROGRESS_THRESHOLD;
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
"progress_threshold",
"number of outstanding operation before btl will progress "
"automatically. Tuning this might improve performance on "
"certain type of application.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_btl_ofi_component.progress_threshold);
/* for now we want this component to lose to btl/ugni and btl/vader */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
@ -241,6 +260,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
tx_attr.iov_limit = 1;
rx_attr.iov_limit = 1;
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
mca_btl_ofi_component.module_count = 0;
/* do the query. */

Просмотреть файл

@ -176,6 +176,7 @@ mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
struct fi_rx_attr rx_attr = {0};
mca_btl_ofi_context_t *contexts;
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
if (NULL == contexts) {

Просмотреть файл

@ -95,9 +95,6 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
/* force a bit of progress */
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS;
}
@ -143,9 +140,6 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
/* force a bit of progress */
mca_btl_ofi_component.super.btl_progress();
return OPAL_SUCCESS;
}

Просмотреть файл

@ -29,8 +29,11 @@ mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
void *cbcontext, void *cbdata,
int type);
#define MCA_BTL_OFI_NUM_RDMA_INC(module) \
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1);
/*
 * Add one to the module's outstanding_rdma counter and, when the updated
 * count exceeds mca_btl_ofi_component.progress_threshold, call the
 * component's btl_progress() function.
 *
 * The value returned by OPAL_THREAD_ADD_FETCH64 is compared directly so the
 * counter is not read a second time after the update.  The body is wrapped
 * in do { } while (0) so the macro expands to a single statement and is safe
 * inside an unbraced if/else; callers terminate it with ';' as usual.
 */
#define MCA_BTL_OFI_NUM_RDMA_INC(module)                                    \
    do {                                                                    \
        if (OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1) >       \
            mca_btl_ofi_component.progress_threshold) {                     \
            mca_btl_ofi_component.super.btl_progress();                     \
        }                                                                   \
    } while (0)
/* Subtract one from the module's outstanding_rdma counter through
 * OPAL_THREAD_ADD_FETCH64.  NOTE(review): the expansion already ends with
 * ';', so a caller's own ';' produces an extra empty statement. */
#define MCA_BTL_OFI_NUM_RDMA_DEC(module) \
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, -1);