Provide further (hopefully) helpful messages about the hotel size
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
840d6c9a1d
Коммит
db8943cedd
@@ -68,3 +68,15 @@ A request has timed out and will therefore fail:

Your job may terminate as a result of this problem. You may want to
adjust the MCA parameter pmix_server_max_wait and try again.
#
[noroom]
A request for an asynchronous runtime operation cannot be fulfilled
because of a lack of room in the tracking array:

Operation: %s
Number of rooms: %d

This is usually caused by a large job that encounters significant
delays across the cluster when starting the application processes.
Your job may terminate as a result of this problem. You may want to
adjust the MCA parameter pmix_server_max_reqs and try again.

@@ -556,7 +556,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
* amount of time to start the job */
|
||||
ORTE_ADJUST_TIMEOUT(req);
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||
OBJ_RELEASE(req);
|
||||
send_error(rc, &idreq, sender);
|
||||
}
|
||||
@@ -583,7 +583,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
* amount of time to start the job */
|
||||
ORTE_ADJUST_TIMEOUT(req);
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||
OBJ_RELEASE(req);
|
||||
send_error(rc, &idreq, sender);
|
||||
return;
|
||||
|
@@ -105,7 +105,7 @@ static void spawn(int sd, short args, void *cbdata)
|
||||
|
||||
/* add this request to our tracker hotel */
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||
goto callback;
|
||||
}
|
||||
|
||||
|
@@ -37,6 +37,7 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@@ -164,7 +165,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
|
||||
/* save the request in the hotel until the
|
||||
* data is returned */
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||
/* can't just return as that would cause the requestor
|
||||
* to hang, so instead execute the callback */
|
||||
goto callback;
|
||||
@@ -180,7 +181,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
|
||||
* that we don't know about yet. In this case, just
|
||||
* record the request and we will process it later */
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||
/* can't just return as that would cause the requestor
|
||||
* to hang, so instead execute the callback */
|
||||
goto callback;
|
||||
@@ -209,7 +210,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
|
||||
/* track the request so we know the function and cbdata
|
||||
* to callback upon completion */
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||
goto callback;
|
||||
}
|
||||
|
||||
|
@@ -38,6 +38,7 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_data_server.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@@ -52,7 +53,7 @@ static void execute(int sd, short args, void *cbdata)
|
||||
|
||||
/* add this request to our tracker hotel */
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||
goto callback;
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user