From db8943cedda1d4b1887085cb80133c6057c545a0 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 5 Apr 2017 04:27:32 -0700 Subject: [PATCH] Provide further (hopefully) helpful messages about the hotel size Signed-off-by: Ralph Castain --- orte/orted/help-orted.txt | 12 ++++++++++++ orte/orted/pmix/pmix_server.c | 4 ++-- orte/orted/pmix/pmix_server_dyn.c | 2 +- orte/orted/pmix/pmix_server_fence.c | 7 ++++--- orte/orted/pmix/pmix_server_pub.c | 3 ++- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/orte/orted/help-orted.txt b/orte/orted/help-orted.txt index 6ab53cba82..c89d4e1015 100644 --- a/orte/orted/help-orted.txt +++ b/orte/orted/help-orted.txt @@ -68,3 +68,15 @@ A request has timed out and will therefore fail: Your job may terminate as a result of this problem. You may want to adjust the MCA parameter pmix_server_max_wait and try again. +# +[noroom] +A request for an asynchronous runtime operation cannot be fulfilled +because of a lack of room in the tracking array: + + Operation: %s + Number of rooms: %d + +This is usually caused by a large job that encounters significant +delays across the cluster when starting the application processes. +Your job may terminate as a result of this problem. You may want to +adjust the MCA parameter pmix_server_max_reqs and try again. diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index f8d81025ec..63b4dbfdd3 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -556,7 +556,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender, * amount of time to start the job */ ORTE_ADJUST_TIMEOUT(req); if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); OBJ_RELEASE(req); send_error(rc, &idreq, sender); } @@ -583,7 +583,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender, * amount of time to start the job */ ORTE_ADJUST_TIMEOUT(req); if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); OBJ_RELEASE(req); send_error(rc, &idreq, sender); return; diff --git a/orte/orted/pmix/pmix_server_dyn.c b/orte/orted/pmix/pmix_server_dyn.c index 15f51e1155..389c65a5fc 100644 --- a/orte/orted/pmix/pmix_server_dyn.c +++ b/orte/orted/pmix/pmix_server_dyn.c @@ -105,7 +105,7 @@ static void spawn(int sd, short args, void *cbdata) /* add this request to our tracker hotel */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); goto callback; } diff --git a/orte/orted/pmix/pmix_server_fence.c b/orte/orted/pmix/pmix_server_fence.c index 59caa1469e..750ad09b39 100644 --- a/orte/orted/pmix/pmix_server_fence.c +++ b/orte/orted/pmix/pmix_server_fence.c @@ -37,6 +37,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" +#include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" @@ -164,7 +165,7 @@ static void dmodex_req(int sd, short args, void *cbdata) /* save the request in the hotel until the * data is returned */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); /* can't just return as that would cause the requestor * to hang, so instead execute the callback */ goto callback; @@ -180,7 +181,7 @@ static void dmodex_req(int sd, short args, void *cbdata) * that we don't know about yet. In this case, just * record the request and we will process it later */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); /* can't just return as that would cause the requestor * to hang, so instead execute the callback */ goto callback; @@ -209,7 +210,7 @@ static void dmodex_req(int sd, short args, void *cbdata) /* track the request so we know the function and cbdata * to callback upon completion */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); goto callback; } diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index 86d07cccb7..4dcb9cfb75 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -38,6 +38,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" +#include "orte/util/show_help.h" #include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/rml/rml.h" @@ -52,7 +53,7 @@ static void execute(int sd, short args, void *cbdata) /* add this request to our tracker hotel */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); goto callback; }