Adding support for on-demand SRQ pre-post (receive wqe allocation)
This commit was SVN r22313.
Этот коммит содержится в:
родитель
354bfe527f
Коммит
c036c6ef95
@ -223,6 +223,7 @@ static int adjust_cq(mca_btl_openib_device_t *device, const int cq)
|
||||
static int create_srq(mca_btl_openib_module_t *openib_btl)
|
||||
{
|
||||
int qp;
|
||||
int32_t rd_num, rd_curr_num;
|
||||
|
||||
/* create the SRQ's */
|
||||
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
@ -251,6 +252,24 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
|
||||
ibv_get_device_name(openib_btl->device->ib_dev));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
|
||||
rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init;
|
||||
|
||||
if(true == mca_btl_openib_component.enable_srq_resize) {
|
||||
if(0 == rd_curr_num) {
|
||||
openib_btl->qps[qp].u.srq_qp.rd_curr_num = 1;
|
||||
}
|
||||
|
||||
openib_btl->qps[qp].u.srq_qp.rd_low_local = rd_curr_num - (rd_curr_num >> 2);
|
||||
openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
|
||||
} else {
|
||||
openib_btl->qps[qp].u.srq_qp.rd_curr_num = rd_num;
|
||||
openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;
|
||||
/* Not used in this case, but do not leave a garbage value behind */
|
||||
mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = 0;
|
||||
openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,6 +96,12 @@ struct mca_btl_openib_pp_qp_info_t {
|
||||
|
||||
/* SRQ-specific part of a queue pair specification; the fields are filled in
   by setup_qps() from the btl_openib_receive_queues MCA parameter. */
struct mca_btl_openib_srq_qp_info_t {
|
||||
/* Maximum number of outstanding sends a sender can have (parameter #4) */
int32_t sd_max;
|
||||
/* The initial value for the rd_curr_num variables of all SRQs (parameter #5) */
|
||||
int32_t rd_init;
|
||||
/* The watermark (threshold): if the number of WQEs in the SRQ is less than this value,
|
||||
the SRQ limit event (IBV_EVENT_SRQ_LIMIT_REACHED) will be generated on the corresponding SRQ.
|
||||
As a result, the maximal number of pre-posted WQEs on the SRQ will be increased (parameter #6) */
|
||||
int32_t srq_limit;
|
||||
}; typedef struct mca_btl_openib_srq_qp_info_t mca_btl_openib_srq_qp_info_t;
|
||||
|
||||
struct mca_btl_openib_qp_info_t {
|
||||
@ -263,6 +269,8 @@ struct mca_btl_openib_component_t {
|
||||
ompi_free_list_t send_free_coalesced;
|
||||
/** Default receive queues */
|
||||
char* default_recv_qps;
|
||||
/** Whether we want a dynamically resizing srq, enabled by default */
|
||||
bool enable_srq_resize;
|
||||
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
|
||||
|
||||
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
|
||||
@ -363,6 +371,16 @@ struct mca_btl_openib_module_srq_qp_t {
|
||||
int32_t sd_credits; /* the max number of outstanding sends on a QP when using SRQ */
|
||||
/* i.e. the number of frags that can be outstanding (down counter) */
|
||||
opal_list_t pending_frags[2]; /**< list of high/low prio frags */
|
||||
/** The number of receive buffers that may be posted at the current time.
|
||||
The value may be increased in the IBV_EVENT_SRQ_LIMIT_REACHED
|
||||
event handler. The value starts from (rd_num / 4) and is increased up to rd_num */
|
||||
int32_t rd_curr_num;
|
||||
/** We post additional WQEs only if the number of WQEs (in the specific SRQ) does not exceed this value.
|
||||
The value is increased together with rd_curr_num. The value is unique for every SRQ. */
|
||||
int32_t rd_low_local;
|
||||
/** The flag indicates whether we want to receive the
|
||||
IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing the SRQ */
|
||||
bool srq_limit_event_flag;
|
||||
}; typedef struct mca_btl_openib_module_srq_qp_t mca_btl_openib_module_srq_qp_t;
|
||||
|
||||
struct mca_btl_openib_module_qp_t {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -226,10 +226,53 @@ static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *devices_p
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* Handler for the IBV_EVENT_SRQ_LIMIT_REACHED event raised on one SRQ.
|
||||
The main idea of the SRQ resizing algorithm: we create an SRQ with size = rd_num,
|
||||
but for efficient usage of resources the number of WQEs that we post is
|
||||
rd_curr_num < rd_num, and this value is increased (as needed) in this function.
|
||||
The event is raised by the device when the number of WQEs in the SRQ drops below srq_limit.
|
||||
Returns OMPI_SUCCESS, or OMPI_ERROR when no QP of openib_btl owns the given srq. */
|
||||
static int btl_openib_async_srq_limit_event(struct ibv_srq* srq,
|
||||
mca_btl_openib_module_t *openib_btl)
|
||||
{
|
||||
int qp;
|
||||
|
||||
/* Find the QP whose SRQ raised the event; QPs for which
   BTL_OPENIB_QP_TYPE_PP() is true are skipped (presumably per-peer QPs
   that have no SRQ - confirm against the macro definition) */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
if (!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
if(openib_btl->qps[qp].u.srq_qp.srq == srq) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* The loop ran to completion, so no QP of this module owns the SRQ */
if(qp >= mca_btl_openib_component.num_qps) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "SRQ doesn't found",
|
||||
true,orte_process_info.nodename,
|
||||
ibv_get_device_name(openib_btl->device->ib_dev));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* Dynamically re-size the SRQ to be larger: double the number of WQEs that may be posted */
|
||||
openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1;
|
||||
|
||||
/* Full size reached: clamp to rd_num, switch to the configured low
   watermark and stop requesting further limit events */
if(openib_btl->qps[qp].u.srq_qp.rd_curr_num >= mca_btl_openib_component.qp_infos[qp].rd_num) {
|
||||
openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].rd_num;
|
||||
openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;
|
||||
|
||||
openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* Still below rd_num: grow the local low watermark together with
   rd_curr_num and set the flag so that mca_btl_openib_post_srr() requests
   the limit event again (via ibv_modify_srq) after its next successful post */
openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1;
|
||||
openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* Function handle async device events */
|
||||
static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index)
|
||||
{
|
||||
int j;
|
||||
int j, btl_index = 0;
|
||||
mca_btl_openib_device_t *device = NULL;
|
||||
struct ibv_async_event event;
|
||||
bool xrc_event = false;
|
||||
@ -240,6 +283,8 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_po
|
||||
if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
|
||||
devices_poll->async_pollfd[index].fd ) {
|
||||
device = mca_btl_openib_component.openib_btls[j]->device;
|
||||
btl_index = j;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -306,7 +351,15 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_po
|
||||
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
|
||||
case IBV_EVENT_CLIENT_REREGISTER:
|
||||
#endif
|
||||
break;
|
||||
/* The event is signaled when the number of pre-posted receive WQEs drops
|
||||
below the predefined threshold - srq_limit */
|
||||
case IBV_EVENT_SRQ_LIMIT_REACHED:
|
||||
if(OMPI_SUCCESS != btl_openib_async_srq_limit_event(event.element.srq,
|
||||
mca_btl_openib_component.openib_btls[btl_index])) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
break;
|
||||
default:
|
||||
orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
|
||||
|
@ -1376,8 +1376,8 @@ static int setup_qps(void)
|
||||
true, rd_win, rd_num - rd_low);
|
||||
}
|
||||
} else {
|
||||
int32_t sd_max;
|
||||
if (count < 3 || count > 5) {
|
||||
int32_t sd_max, rd_init, srq_limit;
|
||||
if (count < 3 || count > 7) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"invalid srq specification", true,
|
||||
orte_process_info.nodename, queues[qp]);
|
||||
@ -1391,15 +1391,47 @@ static int setup_qps(void)
|
||||
/* by default set rd_low to be 3/4 of rd_num */
|
||||
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
|
||||
sd_max = atoi_param(P(4), rd_low / 4);
|
||||
BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
|
||||
rd_num, rd_low, sd_max));
|
||||
/* rd_init is initial value for rd_curr_num of all SRQs, 1/4 of rd_num by default */
|
||||
rd_init = atoi_param(P(5), rd_num / 4);
|
||||
/* by default set srq_limit to be 3/16 of rd_init (it's 1/4 of rd_low_local,
|
||||
the value of rd_low_local we calculate in create_srq function) */
|
||||
srq_limit = atoi_param(P(6), (rd_init - (rd_init / 4)) / 4);
|
||||
|
||||
/* If we set srq_limit less or greater than rd_init
|
||||
(init value for rd_curr_num) => we receive the IBV_EVENT_SRQ_LIMIT_REACHED
|
||||
event immediately and the value of rd_curr_num will be increased */
|
||||
|
||||
/* If we set srq_limit to zero, but size of SRQ greater than 1 and
|
||||
it is not a user request (param number 6 in --mca btl_openib_receive_queues) => set it to be 1 */
|
||||
if((0 == srq_limit) && (1 < rd_num) && (0 != P(6))) {
|
||||
srq_limit = 1;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d rd_max is %d srq_limit is %d",
|
||||
rd_num, rd_low, sd_max, rd_init, srq_limit));
|
||||
|
||||
/* Calculate the smallest freelist size that can be allowed */
|
||||
if (rd_num > min_freelist_size) {
|
||||
min_freelist_size = rd_num;
|
||||
}
|
||||
|
||||
if (rd_num < rd_init) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "rd_num must be >= rd_init",
|
||||
true, orte_process_info.nodename, queues[qp]);
|
||||
ret = OMPI_ERR_BAD_PARAM;
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (rd_num < srq_limit) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "srq_limit must be > rd_num",
|
||||
true, orte_process_info.nodename, queues[qp]);
|
||||
ret = OMPI_ERR_BAD_PARAM;
|
||||
goto error;
|
||||
}
|
||||
|
||||
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
|
||||
mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init = rd_init;
|
||||
mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = srq_limit;
|
||||
}
|
||||
|
||||
if (rd_num <= rd_low) {
|
||||
@ -3200,19 +3232,19 @@ error:
|
||||
|
||||
int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
|
||||
{
|
||||
int rd_low = mca_btl_openib_component.qp_infos[qp].rd_low;
|
||||
int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
|
||||
int rd_low_local = openib_btl->qps[qp].u.srq_qp.rd_low_local;
|
||||
int rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
|
||||
int num_post, i, rc;
|
||||
struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL;
|
||||
|
||||
assert(!BTL_OPENIB_QP_TYPE_PP(qp));
|
||||
|
||||
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
|
||||
if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low) {
|
||||
if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low_local) {
|
||||
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
num_post = rd_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
|
||||
num_post = rd_curr_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
|
||||
|
||||
for(i = 0; i < num_post; i++) {
|
||||
ompi_free_list_item_t* item;
|
||||
@ -3229,7 +3261,26 @@ int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
|
||||
|
||||
rc = ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, wr_list, &bad_wr);
|
||||
if(OPAL_LIKELY(0 == rc)) {
|
||||
struct ibv_srq_attr srq_attr;
|
||||
|
||||
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);
|
||||
|
||||
if(true == openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag) {
|
||||
srq_attr.max_wr = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
|
||||
srq_attr.max_sge = 1;
|
||||
srq_attr.srq_limit = mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit;
|
||||
|
||||
openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
|
||||
if(ibv_modify_srq(openib_btl->qps[qp].u.srq_qp.srq, &srq_attr, IBV_SRQ_LIMIT)) {
|
||||
BTL_ERROR(("Failed to request limit event for srq on %s. "
|
||||
"Fatal error, stoping asynch event thread",
|
||||
ibv_get_device_name(openib_btl->device->ib_dev)));
|
||||
|
||||
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -163,6 +163,11 @@ int btl_openib_register_mca_params(void)
|
||||
1, &ival, 0));
|
||||
mca_btl_openib_component.warn_nonexistent_if = (0 != ival);
|
||||
|
||||
CHECK(reg_int("enable_srq_resize", NULL,
|
||||
"Enable/Disable on demand SRQ resize. "
|
||||
"(0 = without resizing, nonzero = with resizing)", 1, &ival, 0));
|
||||
mca_btl_openib_component.enable_srq_resize = (0 != ival);
|
||||
|
||||
if (OMPI_HAVE_IBV_FORK_INIT) {
|
||||
ival2 = -1;
|
||||
} else {
|
||||
|
@ -168,6 +168,13 @@ peer to which it was connected:
|
||||
You may need to consult with your system administrator to get this
|
||||
problem fixed.
|
||||
#
|
||||
[SRQ doesn't found]
|
||||
The shared receive queue (SRQ) that signaled an event was not found.
|
||||
Below is some information about the host that raised the error:
|
||||
|
||||
Local host: %s
|
||||
Local device: %s
|
||||
#
|
||||
[srq rnr retry exceeded]
|
||||
The OpenFabrics "receiver not ready" retry count on a shared receive
|
||||
queue or XRC receive queue has been exceeded. This error can occur if
|
||||
@ -386,21 +393,27 @@ WARNING: An invalid shared receive queue specification was detected as
|
||||
part of the btl_openib_receive_queues MCA parameter. The OpenFabrics
|
||||
(openib) BTL will therefore be deactivated for this run.
|
||||
|
||||
Shared receive queues can take between 2 and 4 parameters:
|
||||
Shared receive queues can take between 2 and 6 parameters:
|
||||
|
||||
1. Buffer size in bytes (mandatory)
|
||||
2. Number of buffers (mandatory)
|
||||
3. Low buffer count watermark (optional; defaults to 3/4 of num_buffers)
|
||||
4. Maximum number of outstanding sends a sender can have (optional;
|
||||
defaults to (low_watermark / 4))
|
||||
5. Start value of number of receive buffers that will be pre-posted (optional; defaults to (num_buffers / 4))
|
||||
6. Event limit buffer count watermark (optional; defaults to (3/16 of start value of buffers number))
|
||||
|
||||
Example: S,1024,256,128,32
|
||||
Example: S,1024,256,128,32,32,8
|
||||
- 1024 byte buffers
|
||||
- 256 buffers to receive incoming MPI messages
|
||||
- When the number of available buffers reaches 128, re-post 128 more
|
||||
buffers to reach a total of 256
|
||||
- A sender will not send to a peer unless it has less than 32
|
||||
outstanding sends to that peer.
|
||||
- 32 receive buffers will be preposted.
|
||||
- When the number of unused receive buffers decreases to 8,
|
||||
the IBV_EVENT_SRQ_LIMIT_REACHED event will be signaled and the number
|
||||
of receive buffers that we can pre-post will be increased.
|
||||
|
||||
Local host: %s
|
||||
Bad queue specification: %s
|
||||
@ -414,6 +427,24 @@ be deactivated for this run.
|
||||
Local host: %s
|
||||
Bad queue specification: %s
|
||||
#
|
||||
[rd_num must be >= rd_init]
|
||||
WARNING: The number of buffers for a queue pair specified via the
|
||||
btl_openib_receive_queues MCA parameter (parameter #2) must be
|
||||
greater than or equal to the initial SRQ size (parameter #5).
|
||||
The OpenFabrics (openib) BTL will therefore be deactivated for this run.
|
||||
|
||||
Local host: %s
|
||||
Bad queue specification: %s
|
||||
#
|
||||
[srq_limit must be > rd_num]
|
||||
WARNING: The number of buffers for a queue pair specified via the
|
||||
btl_openib_receive_queues MCA parameter (parameter #2) must be greater than or equal to the limit
|
||||
buffer count (parameter #6). The OpenFabrics (openib) BTL will therefore
|
||||
be deactivated for this run.
|
||||
|
||||
Local host: %s
|
||||
Bad queue specification: %s
|
||||
#
|
||||
[biggest qp size is too small]
|
||||
WARNING: The largest queue pair buffer size specified in the
|
||||
btl_openib_receive_queues MCA parameter is smaller than the maximum
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user