
Adding support for on-demand SRQ pre-post (receive WQE allocation)

This commit was SVN r22313.
This commit is contained in:
Vasily Filipov 2009-12-15 15:52:10 +00:00
parent 354bfe527f
commit c036c6ef95
6 changed files with 189 additions and 12 deletions

View file

@@ -223,6 +223,7 @@ static int adjust_cq(mca_btl_openib_device_t *device, const int cq)
 static int create_srq(mca_btl_openib_module_t *openib_btl)
 {
     int qp;
+    int32_t rd_num, rd_curr_num;
 
     /* create the SRQ's */
     for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
@@ -251,6 +252,24 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
                 ibv_get_device_name(openib_btl->device->ib_dev));
             return OMPI_ERROR;
         }
+
+        rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
+        rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init;
+
+        if(true == mca_btl_openib_component.enable_srq_resize) {
+            if(0 == rd_curr_num) {
+                openib_btl->qps[qp].u.srq_qp.rd_curr_num = 1;
+            }
+
+            openib_btl->qps[qp].u.srq_qp.rd_low_local = rd_curr_num - (rd_curr_num >> 2);
+            openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
+        } else {
+            openib_btl->qps[qp].u.srq_qp.rd_curr_num = rd_num;
+            openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;
+            /* Not used in this case, but we don't want garbage here */
+            mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = 0;
+            openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+        }
     }
 }
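
To make the sizing concrete: a minimal standalone sketch of the arithmetic in create_srq() above (rd_num = 256 is an assumed example value, not taken from the commit). With resizing enabled, only rd_init of the rd_num WQEs are pre-posted, and the local refill watermark is 3/4 of that:

    /* Standalone sketch of the pre-post sizing math; example values assumed. */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    int main(void)
    {
        int32_t rd_num  = 256;          /* total SRQ size (assumed example) */
        int32_t rd_init = rd_num / 4;   /* default initial pre-post count */
        bool enable_srq_resize = true;

        int32_t rd_curr_num = enable_srq_resize ? rd_init : rd_num;
        if (enable_srq_resize && 0 == rd_curr_num) {
            rd_curr_num = 1;            /* never start from zero */
        }
        /* refill watermark: 3/4 of the current pre-post level */
        int32_t rd_low_local = rd_curr_num - (rd_curr_num >> 2);

        printf("rd_num=%d rd_curr_num=%d rd_low_local=%d\n",
               rd_num, rd_curr_num, rd_low_local);   /* 256, 64, 48 */
        return 0;
    }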

View file

@@ -96,6 +96,12 @@ struct mca_btl_openib_pp_qp_info_t {
 struct mca_btl_openib_srq_qp_info_t {
     int32_t sd_max;
+    /* The initial value for the rd_curr_num variables of all SRQs */
+    int32_t rd_init;
+    /* The watermark (threshold): if the number of WQEs in the SRQ drops below this value,
+       the SRQ limit event (IBV_EVENT_SRQ_LIMIT_REACHED) is generated on the corresponding SRQ.
+       As a result, the maximal number of pre-posted WQEs on the SRQ will be increased */
+    int32_t srq_limit;
 }; typedef struct mca_btl_openib_srq_qp_info_t mca_btl_openib_srq_qp_info_t;
 
 struct mca_btl_openib_qp_info_t {
@@ -263,6 +269,8 @@ struct mca_btl_openib_component_t {
     ompi_free_list_t send_free_coalesced;
     /** Default receive queues */
     char* default_recv_qps;
+    /** Whether we want a dynamically resizing SRQ; enabled by default */
+    bool enable_srq_resize;
 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
 
 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
@@ -363,6 +371,16 @@ struct mca_btl_openib_module_srq_qp_t {
     int32_t sd_credits;  /* the max number of outstanding sends on a QP when using SRQ */
                          /* i.e. the number of frags that can be outstanding (down counter) */
     opal_list_t pending_frags[2]; /**< list of high/low prio frags */
+    /** The number of receive buffers that can be posted at the current time.
+        The value may be increased in the IBV_EVENT_SRQ_LIMIT_REACHED
+        event handler. It starts from (rd_num / 4) and is increased up to rd_num */
+    int32_t rd_curr_num;
+    /** We post additional WQEs only if the number of WQEs (in the specific SRQ) is less
+        than this value. It is increased together with rd_curr_num and is unique for every SRQ. */
+    int32_t rd_low_local;
+    /** The flag indicates whether we want to get the
+        IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing the SRQ */
+    bool srq_limit_event_flag;
 }; typedef struct mca_btl_openib_module_srq_qp_t mca_btl_openib_module_srq_qp_t;
 
 struct mca_btl_openib_module_qp_t {

View file

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2006-2007 Voltaire All rights reserved.
  * $COPYRIGHT$
@@ -226,10 +226,53 @@ static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *devices_poll)
     return OMPI_SUCCESS;
 }
 
+/* The main idea of the SRQ resizing algorithm:
+   we create an SRQ with size = rd_num, but for efficient use of resources
+   the number of WQEs that we post is rd_curr_num < rd_num, and this value is
+   increased (as needed) in the IBV_EVENT_SRQ_LIMIT_REACHED event handler
+   (i.e. in this function); the event is thrown by the device when the number
+   of WQEs in the SRQ drops below srq_limit */
+static int btl_openib_async_srq_limit_event(struct ibv_srq* srq,
+                                            mca_btl_openib_module_t *openib_btl)
+{
+    int qp;
+
+    for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
+        if (!BTL_OPENIB_QP_TYPE_PP(qp)) {
+            if(openib_btl->qps[qp].u.srq_qp.srq == srq) {
+                break;
+            }
+        }
+    }
+
+    if(qp >= mca_btl_openib_component.num_qps) {
+        orte_show_help("help-mpi-btl-openib.txt", "SRQ not found",
+                       true, orte_process_info.nodename,
+                       ibv_get_device_name(openib_btl->device->ib_dev));
+        return OMPI_ERROR;
+    }
+
+    /* dynamically re-size the SRQ to be larger */
+    openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1;
+    if(openib_btl->qps[qp].u.srq_qp.rd_curr_num >= mca_btl_openib_component.qp_infos[qp].rd_num) {
+        openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].rd_num;
+        openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;
+        openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+        return OMPI_SUCCESS;
+    }
+
+    openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1;
+    openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
+
+    return OMPI_SUCCESS;
+}
+
 /* Function handle async device events */
 static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index)
 {
-    int j;
+    int j, btl_index = 0;
     mca_btl_openib_device_t *device = NULL;
     struct ibv_async_event event;
     bool xrc_event = false;
@@ -240,6 +283,8 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index)
         if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
                              devices_poll->async_pollfd[index].fd ) {
             device = mca_btl_openib_component.openib_btls[j]->device;
+            btl_index = j;
             break;
         }
     }
@@ -306,7 +351,15 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index)
 #if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
             case IBV_EVENT_CLIENT_REREGISTER:
 #endif
+                break;
+            /* The event is signaled when the number of pre-posted receive WQEs
+               drops below the predefined threshold, srq_limit */
             case IBV_EVENT_SRQ_LIMIT_REACHED:
+                if(OMPI_SUCCESS != btl_openib_async_srq_limit_event(event.element.srq,
+                            mca_btl_openib_component.openib_btls[btl_index])) {
+                    return OMPI_ERROR;
+                }
+
                 break;
             default:
                 orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
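
The growth schedule that btl_openib_async_srq_limit_event() implements can be traced with a minimal standalone sketch (starting values below are assumed, matching the default rd_init = rd_num / 4): each SRQ-limit event doubles rd_curr_num and rd_low_local until rd_curr_num reaches rd_num, at which point resizing stops.

    /* Standalone sketch of the SRQ growth schedule; example values assumed. */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    int main(void)
    {
        const int32_t rd_num = 256, rd_low = 192;   /* assumed queue spec */
        int32_t rd_curr_num  = 64;                  /* rd_init */
        int32_t rd_low_local = rd_curr_num - (rd_curr_num >> 2);
        bool srq_limit_event_flag = true;

        while (srq_limit_event_flag) {              /* one iteration per event */
            rd_curr_num <<= 1;
            if (rd_curr_num >= rd_num) {            /* fully grown: stop resizing */
                rd_curr_num = rd_num;
                rd_low_local = rd_low;
                srq_limit_event_flag = false;
            } else {
                rd_low_local <<= 1;
            }
            printf("rd_curr_num=%d rd_low_local=%d\n", rd_curr_num, rd_low_local);
        }
        return 0;   /* prints 128/96, then 256/192 */
    }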

View file

@@ -1376,8 +1376,8 @@ static int setup_qps(void)
                                true, rd_win, rd_num - rd_low);
             }
         } else {
-            int32_t sd_max;
-            if (count < 3 || count > 5) {
+            int32_t sd_max, rd_init, srq_limit;
+            if (count < 3 || count > 7) {
                 orte_show_help("help-mpi-btl-openib.txt",
                                "invalid srq specification", true,
                                orte_process_info.nodename, queues[qp]);
@@ -1391,15 +1391,47 @@ static int setup_qps(void)
             /* by default set rd_low to be 3/4 of rd_num */
             rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
             sd_max = atoi_param(P(4), rd_low / 4);
-            BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
-                         rd_num, rd_low, sd_max));
+            /* rd_init is the initial value for rd_curr_num of all SRQs; 1/4 of rd_num by default */
+            rd_init = atoi_param(P(5), rd_num / 4);
+            /* by default set srq_limit to be 3/16 of rd_init (that is 1/4 of rd_low_local;
+               the value of rd_low_local is calculated in the create_srq function) */
+            srq_limit = atoi_param(P(6), (rd_init - (rd_init / 4)) / 4);
+
+            /* If srq_limit is set equal to or greater than rd_init (the initial
+               value for rd_curr_num), we receive the IBV_EVENT_SRQ_LIMIT_REACHED
+               event immediately and the value of rd_curr_num will be increased */
+
+            /* If srq_limit came out as zero, but the size of the SRQ is greater
+               than 1 and it was not an explicit user request (parameter number 6
+               in --mca btl_openib_receive_queues), set it to 1 */
+            if((0 == srq_limit) && (1 < rd_num) && (0 != P(6))) {
+                srq_limit = 1;
+            }
+
+            BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d rd_init is %d srq_limit is %d",
+                         rd_num, rd_low, sd_max, rd_init, srq_limit));
 
             /* Calculate the smallest freelist size that can be allowed */
             if (rd_num > min_freelist_size) {
                 min_freelist_size = rd_num;
             }
 
+            if (rd_num < rd_init) {
+                orte_show_help("help-mpi-btl-openib.txt", "rd_num must be >= rd_init",
+                               true, orte_process_info.nodename, queues[qp]);
+                ret = OMPI_ERR_BAD_PARAM;
+                goto error;
+            }
+
+            if (rd_num < srq_limit) {
+                orte_show_help("help-mpi-btl-openib.txt", "rd_num must be >= srq_limit",
+                               true, orte_process_info.nodename, queues[qp]);
+                ret = OMPI_ERR_BAD_PARAM;
+                goto error;
+            }
+
             mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
+            mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init = rd_init;
+            mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = srq_limit;
         }
 
         if (rd_num <= rd_low) {
@@ -3200,19 +3232,19 @@ error:
 int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
 {
-    int rd_low = mca_btl_openib_component.qp_infos[qp].rd_low;
-    int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
+    int rd_low_local = openib_btl->qps[qp].u.srq_qp.rd_low_local;
+    int rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
     int num_post, i, rc;
     struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL;
 
     assert(!BTL_OPENIB_QP_TYPE_PP(qp));
 
     OPAL_THREAD_LOCK(&openib_btl->ib_lock);
-    if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low) {
+    if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low_local) {
         OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
         return OMPI_SUCCESS;
     }
-    num_post = rd_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
+    num_post = rd_curr_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
 
     for(i = 0; i < num_post; i++) {
         ompi_free_list_item_t* item;
@@ -3229,7 +3261,26 @@ int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
 
     rc = ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, wr_list, &bad_wr);
     if(OPAL_LIKELY(0 == rc)) {
+        struct ibv_srq_attr srq_attr;
+
         OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);
+
+        if(true == openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag) {
+            srq_attr.max_wr = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
+            srq_attr.max_sge = 1;
+            srq_attr.srq_limit = mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit;
+
+            openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+
+            if(ibv_modify_srq(openib_btl->qps[qp].u.srq_qp.srq, &srq_attr, IBV_SRQ_LIMIT)) {
+                BTL_ERROR(("Failed to request limit event for srq on %s. "
+                           "Fatal error, stopping the async event thread",
+                           ibv_get_device_name(openib_btl->device->ib_dev)));
+
+                OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
+                return OMPI_ERROR;
+            }
+        }
+
         OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
         return OMPI_SUCCESS;
     }
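
For a sense of the defaults above: with an assumed rd_num of 256 and no optional parameters given, rd_init = 256 / 4 = 64 and srq_limit = (64 - 64/4) / 4 = 12, i.e. 3/16 of rd_init, so the limit event is armed well below the initial pre-post level.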

View file

@@ -163,6 +163,11 @@ int btl_openib_register_mca_params(void)
                   1, &ival, 0));
     mca_btl_openib_component.warn_nonexistent_if = (0 != ival);
 
+    CHECK(reg_int("enable_srq_resize", NULL,
+                  "Enable/disable on-demand SRQ resize "
+                  "(0 = without resizing, nonzero = with resizing)", 1, &ival, 0));
+    mca_btl_openib_component.enable_srq_resize = (0 != ival);
+
     if (OMPI_HAVE_IBV_FORK_INIT) {
         ival2 = -1;
     } else {
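
Since reg_int() registers the parameter under the component prefix, the new knob should be reachable as btl_openib_enable_srq_resize (assumed full name); for example, to turn resizing off for a run:

    shell$ mpirun --mca btl_openib_enable_srq_resize 0 ...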

View file

@@ -168,6 +168,13 @@ peer to which it was connected:
 You may need to consult with your system administrator to get this
 problem fixed.
 #
+[SRQ not found]
+The SRQ was not found.  Below is some information about the host that
+raised the error:
+
+  Local host:   %s
+  Local device: %s
+#
 [srq rnr retry exceeded]
 The OpenFabrics "receiver not ready" retry count on a shared receive
 queue or XRC receive queue has been exceeded.  This error can occur if
@@ -386,21 +393,27 @@ WARNING: An invalid shared receive queue specification was detected as
 part of the btl_openib_receive_queues MCA parameter.  The OpenFabrics
 (openib) BTL will therefore be deactivated for this run.
 
-Shared receive queues can take between 2 and 4 parameters:
+Shared receive queues can take between 2 and 6 parameters:
 
 1. Buffer size in bytes (mandatory)
 2. Number of buffers (mandatory)
 3. Low buffer count watermark (optional; defaults to (num_buffers / 2))
 4. Maximum number of outstanding sends a sender can have (optional;
    defaults to (low_watermark / 4)
+5. Initial number of receive buffers that will be pre-posted (optional; defaults to (num_buffers / 4))
+6. Event limit buffer count watermark (optional; defaults to 3/16 of the initial pre-post count)
 
-Example: S,1024,256,128,32
+Example: S,1024,256,128,32,32,8
 
 - 1024 byte buffers
 - 256 buffers to receive incoming MPI messages
 - When the number of available buffers reaches 128, re-post 128 more
   buffers to reach a total of 256
 - A sender will not send to a peer unless it has less than 32
   outstanding sends to that peer.
+- 32 receive buffers will be pre-posted.
+- When the number of unused receive buffers drops to 8, the
+  IBV_EVENT_SRQ_LIMIT_REACHED event will be signaled and the number of
+  receive buffers that can be pre-posted will be increased.
 
   Local host: %s
   Bad queue specification: %s
@@ -414,6 +427,24 @@ be deactivated for this run.
   Local host: %s
   Bad queue specification: %s
 #
+[rd_num must be >= rd_init]
+WARNING: The number of buffers for a queue pair specified via the
+btl_openib_receive_queues MCA parameter (parameter #2) must be
+greater than or equal to the initial SRQ size (parameter #5).
+The OpenFabrics (openib) BTL will therefore be deactivated for this run.
+
+  Local host: %s
+  Bad queue specification: %s
+#
+[rd_num must be >= srq_limit]
+WARNING: The number of buffers for a queue pair specified via the
+btl_openib_receive_queues MCA parameter (parameter #2) must be greater
+than or equal to the limit buffer count (parameter #6).  The OpenFabrics
+(openib) BTL will therefore be deactivated for this run.
+
+  Local host: %s
+  Bad queue specification: %s
+#
 [biggest qp size is too small]
 WARNING: The largest queue pair buffer size specified in the
 btl_openib_receive_queues MCA parameter is smaller than the maximum
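
Tying the help text back to a command line, an illustrative invocation (queue values are examples only, matching the help text above):

    shell$ mpirun --mca btl_openib_receive_queues S,1024,256,128,32,32,8 ...

With this specification each SRQ is created with 256 buffers, 32 are pre-posted at startup, and the limit event is armed at 8 unused buffers, so the async thread can grow the pre-post level on demand.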