1
1

Merge pull request #3837 from tkordenbrock/topic/master/get.retry.timeout

master: mtl-portals4: add timeout to rendezvous get fragments
Этот коммит содержится в:
Ryan Grant 2017-07-13 09:59:54 -06:00 коммит произвёл GitHub
родитель 6fb81f20e4 5ecd905358
Коммит 0ce8590e7c
4 изменённых файла: 41 добавление и 1 удаление

Просмотреть файл

@ -73,6 +73,7 @@ struct mca_mtl_portals4_module_t {
/* free list of rendezvous get fragments */
opal_free_list_t fl_rndv_get_frag;
int get_retransmit_timeout;
/** Network interface handle for matched interface */
ptl_handle_ni_t ni_h;

Просмотреть файл

@ -202,6 +202,16 @@ ompi_mtl_portals4_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.max_msg_size_mtl);
ompi_mtl_portals4.get_retransmit_timeout=10000;
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
"get_retransmit_timeout",
"PtlGET retransmission timeout in usec",
MCA_BASE_VAR_TYPE_INT,
NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.get_retransmit_timeout);
OBJ_RELEASE(new_enum);
if (0 > ret) {
return OMPI_ERR_NOT_SUPPORTED;

Просмотреть файл

@ -27,6 +27,7 @@
#include "ompi/mca/mtl/base/base.h"
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
#include "ompi/message/message.h"
#include "opal/mca/timer/base/base.h"
#include "mtl_portals4.h"
#include "mtl_portals4_endpoint.h"
@ -81,6 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl;
frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress;
frag->frag_abs_timeout_usec = 0;
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send",
i + 1, frag_count, frag->frag_length));
@ -322,17 +324,41 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
ompi_mtl_portals4_recv_request_t* ptl_request =
(ompi_mtl_portals4_recv_request_t*) rndv_get_frag->request;
assert(ev->type==PTL_EVENT_REPLY);
assert(PTL_EVENT_REPLY == ev->type);
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) got reply event",
ptl_request->opcount, ptl_request->hdr_data));
if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
__FILE__, __LINE__, ev->ni_fail_type);
if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry",
(uint32_t)ev->ni_fail_type);
ret = PTL_FAIL;
goto callback_error;
}
if (0 == rndv_get_frag->frag_abs_timeout_usec) {
/* this is the first retry of the frag. start the timer. */
/* instead of recording the start time, record the end time
* and avoid addition on each retry. */
rndv_get_frag->frag_abs_timeout_usec = opal_timer_base_get_usec() + ompi_mtl_portals4.get_retransmit_timeout;
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"setting frag timeout at %lu",
rndv_get_frag->frag_abs_timeout_usec);
} else if (opal_timer_base_get_usec() >= rndv_get_frag->frag_abs_timeout_usec) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"timeout retrying GET");
ret = PTL_FAIL;
goto callback_error;
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Rendezvous Get Failed: Reissuing frag #%u", rndv_get_frag->frag_num));

Просмотреть файл

@ -22,6 +22,7 @@
#include "opal/datatype/opal_convertor.h"
#include "ompi/mca/mtl/mtl.h"
#include "opal/mca/timer/base/base.h"
struct ompi_mtl_portals4_message_t;
struct ompi_mtl_portals4_pending_request_t;
@ -93,6 +94,8 @@ struct ompi_mtl_portals4_rndv_get_frag_t {
ptl_process_t frag_target;
ptl_hdr_data_t frag_match_bits;
ptl_size_t frag_remote_offset;
/* the absolute time at which this frag times out */
opal_timer_t frag_abs_timeout_usec;
int (*event_callback)(ptl_event_t *ev, struct ompi_mtl_portals4_rndv_get_frag_t*);