Merge pull request #3837 from tkordenbrock/topic/master/get.retry.timeout
master: mtl-portals4: add timeout to rendezvous get fragments
Этот коммит содержится в:
Коммит
0ce8590e7c
@ -73,6 +73,7 @@ struct mca_mtl_portals4_module_t {
|
||||
|
||||
/* free list of rendezvous get fragments */
|
||||
opal_free_list_t fl_rndv_get_frag;
|
||||
int get_retransmit_timeout;
|
||||
|
||||
/** Network interface handle for matched interface */
|
||||
ptl_handle_ni_t ni_h;
|
||||
|
@ -202,6 +202,16 @@ ompi_mtl_portals4_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_mtl_portals4.max_msg_size_mtl);
|
||||
|
||||
ompi_mtl_portals4.get_retransmit_timeout=10000;
|
||||
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
|
||||
"get_retransmit_timeout",
|
||||
"PtlGET retransmission timeout in usec",
|
||||
MCA_BASE_VAR_TYPE_INT,
|
||||
NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_mtl_portals4.get_retransmit_timeout);
|
||||
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (0 > ret) {
|
||||
return OMPI_ERR_NOT_SUPPORTED;
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "ompi/mca/mtl/base/base.h"
|
||||
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
|
||||
#include "ompi/message/message.h"
|
||||
#include "opal/mca/timer/base/base.h"
|
||||
|
||||
#include "mtl_portals4.h"
|
||||
#include "mtl_portals4_endpoint.h"
|
||||
@ -81,6 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
|
||||
frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl;
|
||||
|
||||
frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress;
|
||||
frag->frag_abs_timeout_usec = 0;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send",
|
||||
i + 1, frag_count, frag->frag_length));
|
||||
@ -322,17 +324,41 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
|
||||
ompi_mtl_portals4_recv_request_t* ptl_request =
|
||||
(ompi_mtl_portals4_recv_request_t*) rndv_get_frag->request;
|
||||
|
||||
assert(ev->type==PTL_EVENT_REPLY);
|
||||
assert(PTL_EVENT_REPLY == ev->type);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
|
||||
"Recv %lu (0x%lx) got reply event",
|
||||
ptl_request->opcount, ptl_request->hdr_data));
|
||||
|
||||
|
||||
if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
|
||||
__FILE__, __LINE__, ev->ni_fail_type);
|
||||
|
||||
if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry",
|
||||
(uint32_t)ev->ni_fail_type);
|
||||
ret = PTL_FAIL;
|
||||
goto callback_error;
|
||||
}
|
||||
|
||||
if (0 == rndv_get_frag->frag_abs_timeout_usec) {
|
||||
/* This is the first retry of this frag, so start its timeout timer. */
|
||||
/* instead of recording the start time, record the end time
|
||||
* and avoid addition on each retry. */
|
||||
rndv_get_frag->frag_abs_timeout_usec = opal_timer_base_get_usec() + ompi_mtl_portals4.get_retransmit_timeout;
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"setting frag timeout at %lu",
|
||||
rndv_get_frag->frag_abs_timeout_usec);
|
||||
} else if (opal_timer_base_get_usec() >= rndv_get_frag->frag_abs_timeout_usec) {
|
||||
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
||||
"timeout retrying GET");
|
||||
ret = PTL_FAIL;
|
||||
goto callback_error;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
|
||||
"Rendezvous Get Failed: Reissuing frag #%u", rndv_get_frag->frag_num));
|
||||
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "ompi/mca/mtl/mtl.h"
|
||||
#include "opal/mca/timer/base/base.h"
|
||||
|
||||
struct ompi_mtl_portals4_message_t;
|
||||
struct ompi_mtl_portals4_pending_request_t;
|
||||
@ -93,6 +94,8 @@ struct ompi_mtl_portals4_rndv_get_frag_t {
|
||||
ptl_process_t frag_target;
|
||||
ptl_hdr_data_t frag_match_bits;
|
||||
ptl_size_t frag_remote_offset;
|
||||
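/* the absolute time, in usec as reported by opal_timer_base_get_usec(), at which this frag times out; 0 means the retry timer has not been started yet */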
|
||||
opal_timer_t frag_abs_timeout_usec;
|
||||
|
||||
int (*event_callback)(ptl_event_t *ev, struct ompi_mtl_portals4_rndv_get_frag_t*);
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user