1
1

mtl/portals4: add timeout to get retransmit

Signed-off-by: Todd Kordenbrock <thkgcode@gmail.com>
Этот коммит содержится в:
Piotr Lesnicki 2017-05-23 20:32:55 +02:00 коммит произвёл Todd Kordenbrock
родитель c632784ca3
Коммит 06b15cebbf
4 изменённых файла: 23 добавления и 1 удаление

Просмотреть файл

@@ -73,6 +73,7 @@ struct mca_mtl_portals4_module_t {
/* free list of rendezvous get fragments */
opal_free_list_t fl_rndv_get_frag;
int get_retransmit_timeout;
/** Network interface handle for matched interface */
ptl_handle_ni_t ni_h;

Просмотреть файл

@@ -202,6 +202,16 @@ ompi_mtl_portals4_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.max_msg_size_mtl);
ompi_mtl_portals4.get_retransmit_timeout=10000;
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
"get_retransmit_timeout",
"PtlGET retransmission timeout in usec",
MCA_BASE_VAR_TYPE_INT,
NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.get_retransmit_timeout);
OBJ_RELEASE(new_enum);
if (0 > ret) {
return OMPI_ERR_NOT_SUPPORTED;

Просмотреть файл

@@ -27,6 +27,7 @@
#include "ompi/mca/mtl/base/base.h"
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
#include "ompi/message/message.h"
#include "opal/mca/timer/base/base.h"
#include "mtl_portals4.h"
#include "mtl_portals4_endpoint.h"
@@ -81,6 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl;
frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress;
frag->frag_start_time_usec = opal_timer_base_get_usec();
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send",
i + 1, frag_count, frag->frag_length));
@@ -322,17 +324,24 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
ompi_mtl_portals4_recv_request_t* ptl_request =
(ompi_mtl_portals4_recv_request_t*) rndv_get_frag->request;
assert(ev->type==PTL_EVENT_REPLY);
assert(PTL_EVENT_REPLY == ev->type);
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) got reply event",
ptl_request->opcount, ptl_request->hdr_data));
if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
__FILE__, __LINE__, ev->ni_fail_type);
opal_timer_t time = opal_timer_base_get_usec() - rndv_get_frag->frag_start_time_usec;
if (time > (unsigned int) ompi_mtl_portals4.get_retransmit_timeout) {
mtl_ptl_error(1, "timeout retrying GET");
return OMPI_ERROR;
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Rendezvous Get Failed: Reissuing frag #%u", rndv_get_frag->frag_num));

Просмотреть файл

@@ -22,6 +22,7 @@
#include "opal/datatype/opal_convertor.h"
#include "ompi/mca/mtl/mtl.h"
#include "opal/mca/timer/base/base.h"
struct ompi_mtl_portals4_message_t;
struct ompi_mtl_portals4_pending_request_t;
@@ -93,6 +94,7 @@ struct ompi_mtl_portals4_rndv_get_frag_t {
ptl_process_t frag_target;
ptl_hdr_data_t frag_match_bits;
ptl_size_t frag_remote_offset;
opal_timer_t frag_start_time_usec;
int (*event_callback)(ptl_event_t *ev, struct ompi_mtl_portals4_rndv_get_frag_t*);