From 734b90aa6b1c1b873cc85e11b74db6fe58df97ec Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 4 Apr 2017 18:20:51 -0700 Subject: [PATCH] Adjust the timeout for direct modex requests to reflect the size of the job. It can take several seconds to start all the procs, and we don't want to time out due to differences in the start times of the various procs. The timeout is computed as (2 * num_daemons) / 1000 using integer division, with a minimum of 10, so it only exceeds that minimum once there are at least 5500 daemons. Signed-off-by: Ralph Castain --- orte/orted/pmix/pmix_server.c | 6 ++++++ orte/orted/pmix/pmix_server_fence.c | 6 +++++- orte/orted/pmix/pmix_server_internal.h | 9 +++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index 7cf0f5659a..8754ded276 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -536,6 +536,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender, req->proxy = *sender; req->target = idreq; req->remote_room_num = room_num; + /* adjust the timeout to reflect the size of the job as it can take some + * amount of time to start the job */ + ORTE_ADJUST_TIMEOUT(req); if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { OBJ_RELEASE(req); send_error(rc, &idreq, sender); @@ -558,6 +561,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender, req->proxy = *sender; req->target = idreq; req->remote_room_num = room_num; + /* adjust the timeout to reflect the size of the job as it can take some + * amount of time to start the job */ + ORTE_ADJUST_TIMEOUT(req); if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { OBJ_RELEASE(req); send_error(rc, &idreq, sender); diff --git a/orte/orted/pmix/pmix_server_fence.c b/orte/orted/pmix/pmix_server_fence.c index 10f750e9ca..59caa1469e 100644 --- a/orte/orted/pmix/pmix_server_fence.c +++ b/orte/orted/pmix/pmix_server_fence.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. 
All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science @@ -148,6 +148,10 @@ static void dmodex_req(int sd, short args, void *cbdata) return; } + /* adjust the timeout to reflect the size of the job as it can take some + * amount of time to start the job */ + ORTE_ADJUST_TIMEOUT(req); + /* has anyone already requested data for this target? If so, * then the data is already on its way */ for (rnum=0; rnum < orte_pmix_server_globals.reqs.num_rooms; rnum++) { diff --git a/orte/orted/pmix/pmix_server_internal.h b/orte/orted/pmix/pmix_server_internal.h index 3f232e7f42..53da91595c 100644 --- a/orte/orted/pmix/pmix_server_internal.h +++ b/orte/orted/pmix/pmix_server_internal.h @@ -48,6 +48,15 @@ BEGIN_C_DECLS +#define ORTED_PMIX_MIN_DMX_TIMEOUT 10 +#define ORTE_ADJUST_TIMEOUT(a) \ + do { \ + (a)->timeout = (2 * orte_process_info.num_daemons) / 1000; \ + if ((a)->timeout < ORTED_PMIX_MIN_DMX_TIMEOUT) { \ + (a)->timeout = ORTED_PMIX_MIN_DMX_TIMEOUT; \ + } \ + } while(0) + /* object for tracking requests so we can * correctly route the eventual reply */ typedef struct {