1
1

Adjust the timeout for direct modex requests to reflect the size of the job. It can take several seconds to start all the procs, and we don't want to time out due to differences in the start times of the various procs.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-04-04 18:20:51 -07:00
родитель 9cb18b8348
Коммит 734b90aa6b
3 изменённых файлов: 20 добавлений и 1 удалений

Просмотреть файл

@ -536,6 +536,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
req->proxy = *sender; req->proxy = *sender;
req->target = idreq; req->target = idreq;
req->remote_room_num = room_num; req->remote_room_num = room_num;
/* adjust the timeout to reflect the size of the job as it can take some
* amount of time to start the job */
ORTE_ADJUST_TIMEOUT(req);
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
OBJ_RELEASE(req); OBJ_RELEASE(req);
send_error(rc, &idreq, sender); send_error(rc, &idreq, sender);
@ -558,6 +561,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
req->proxy = *sender; req->proxy = *sender;
req->target = idreq; req->target = idreq;
req->remote_room_num = room_num; req->remote_room_num = room_num;
/* adjust the timeout to reflect the size of the job as it can take some
* amount of time to start the job */
ORTE_ADJUST_TIMEOUT(req);
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
OBJ_RELEASE(req); OBJ_RELEASE(req);
send_error(rc, &idreq, sender); send_error(rc, &idreq, sender);

Просмотреть файл

@ -13,7 +13,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc. * Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science * Copyright (c) 2014-2017 Research Organization for Information Science
@ -148,6 +148,10 @@ static void dmodex_req(int sd, short args, void *cbdata)
return; return;
} }
/* adjust the timeout to reflect the size of the job as it can take some
* amount of time to start the job */
ORTE_ADJUST_TIMEOUT(req);
/* has anyone already requested data for this target? If so, /* has anyone already requested data for this target? If so,
* then the data is already on its way */ * then the data is already on its way */
for (rnum=0; rnum < orte_pmix_server_globals.reqs.num_rooms; rnum++) { for (rnum=0; rnum < orte_pmix_server_globals.reqs.num_rooms; rnum++) {

Просмотреть файл

@ -48,6 +48,15 @@
BEGIN_C_DECLS BEGIN_C_DECLS
/* Smallest timeout value ORTE_ADJUST_TIMEOUT will ever leave in a request. */
#define ORTED_PMIX_MIN_DMX_TIMEOUT 10

/*
 * Derive the timeout of request (a) from the number of daemons in the job:
 * the value (2 * orte_process_info.num_daemons) / 1000 is stored in
 * (a)->timeout and then raised to ORTED_PMIX_MIN_DMX_TIMEOUT whenever the
 * stored value is below that minimum.
 *
 * (a) must be a pointer to an object with an assignable "timeout" member.
 * The argument is expanded several times, so pass an expression that has
 * no side effects.
 *
 * NOTE(review): the unit of the timeout is not visible here (presumably
 * seconds) - confirm against the code that consumes the field.
 */
#define ORTE_ADJUST_TIMEOUT(a)                                          \
    do {                                                                \
        (a)->timeout = (2 * orte_process_info.num_daemons) / 1000;      \
        if (ORTED_PMIX_MIN_DMX_TIMEOUT > (a)->timeout) {                \
            (a)->timeout = ORTED_PMIX_MIN_DMX_TIMEOUT;                  \
        }                                                               \
    } while (0)
/* object for tracking requests so we can /* object for tracking requests so we can
* correctly route the eventual reply */ * correctly route the eventual reply */
typedef struct { typedef struct {