Adjust the timeout for direct modex requests to reflect the size of the job. It can take several seconds to start all the procs, and we don't want to time out due to differences in the start times of the various procs.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
9cb18b8348
Коммит
734b90aa6b
@ -536,6 +536,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
req->proxy = *sender;
|
||||
req->target = idreq;
|
||||
req->remote_room_num = room_num;
|
||||
/* adjust the timeout to reflect the size of the job as it can take some
|
||||
* amount of time to start the job */
|
||||
ORTE_ADJUST_TIMEOUT(req);
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
OBJ_RELEASE(req);
|
||||
send_error(rc, &idreq, sender);
|
||||
@ -558,6 +561,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
req->proxy = *sender;
|
||||
req->target = idreq;
|
||||
req->remote_room_num = room_num;
|
||||
/* adjust the timeout to reflect the size of the job as it can take some
|
||||
* amount of time to start the job */
|
||||
ORTE_ADJUST_TIMEOUT(req);
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
OBJ_RELEASE(req);
|
||||
send_error(rc, &idreq, sender);
|
||||
|
@ -13,7 +13,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2017 Research Organization for Information Science
|
||||
@ -148,6 +148,10 @@ static void dmodex_req(int sd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
/* adjust the timeout to reflect the size of the job as it can take some
|
||||
* amount of time to start the job */
|
||||
ORTE_ADJUST_TIMEOUT(req);
|
||||
|
||||
/* has anyone already requested data for this target? If so,
|
||||
* then the data is already on its way */
|
||||
for (rnum=0; rnum < orte_pmix_server_globals.reqs.num_rooms; rnum++) {
|
||||
|
@ -48,6 +48,15 @@
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Lower bound applied to every direct-modex request timeout
 * (presumably in seconds -- confirm against the consumer of ->timeout). */
#define ORTED_PMIX_MIN_DMX_TIMEOUT 10

/**
 * Scale a request's timeout with the size of the job.
 *
 * Sets (a)->timeout to 2 units per 1000 daemons, computed with integer
 * division (so any daemon count below 5000 yields a value under the
 * floor), and then clamps the result so that it is never smaller than
 * ORTED_PMIX_MIN_DMX_TIMEOUT.
 *
 * @param a  pointer to an object with an assignable integer `timeout`
 *           member.  NOTE: `a` is expanded several times, so the
 *           argument must not have side effects.
 *
 * Reads the global orte_process_info.num_daemons at the point of use.
 * Wrapped in do { } while(0) so it behaves as a single statement.
 */
#define ORTE_ADJUST_TIMEOUT(a)                                          \
    do {                                                                \
        (a)->timeout = (2 * orte_process_info.num_daemons) / 1000;      \
        if ((a)->timeout < ORTED_PMIX_MIN_DMX_TIMEOUT) {                \
            (a)->timeout = ORTED_PMIX_MIN_DMX_TIMEOUT;                  \
        }                                                               \
    } while(0)
|
||||
|
||||
/* object for tracking requests so we can
|
||||
* correctly route the eventual reply */
|
||||
typedef struct {
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user