1
1

Provide further (hopefully) helpful messages about the hotel size

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-04-05 04:27:32 -07:00
родитель 840d6c9a1d
Коммит db8943cedd
5 изменённых файлов: 21 добавлений и 7 удалений

Просмотреть файл

@ -68,3 +68,15 @@ A request has timed out and will therefore fail:
Your job may terminate as a result of this problem. You may want to
adjust the MCA parameter pmix_server_max_wait and try again.
#
[noroom]
A request for an asynchronous runtime operation cannot be fulfilled
because of a lack of room in the tracking array:
Operation: %s
Number of rooms: %d
This is usually caused by a large job that encounters significant
delays across the cluster when starting the application processes.
Your job may terminate as a result of this problem. You may want to
adjust the MCA parameter pmix_server_max_reqs and try again.

Просмотреть файл

@ -556,7 +556,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
* amount of time to start the job */ * amount of time to start the job */
ORTE_ADJUST_TIMEOUT(req); ORTE_ADJUST_TIMEOUT(req);
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc); orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
OBJ_RELEASE(req); OBJ_RELEASE(req);
send_error(rc, &idreq, sender); send_error(rc, &idreq, sender);
} }
@ -583,7 +583,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
* amount of time to start the job */ * amount of time to start the job */
ORTE_ADJUST_TIMEOUT(req); ORTE_ADJUST_TIMEOUT(req);
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc); orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
OBJ_RELEASE(req); OBJ_RELEASE(req);
send_error(rc, &idreq, sender); send_error(rc, &idreq, sender);
return; return;

Просмотреть файл

@ -105,7 +105,7 @@ static void spawn(int sd, short args, void *cbdata)
/* add this request to our tracker hotel */ /* add this request to our tracker hotel */
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc); orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
goto callback; goto callback;
} }

Просмотреть файл

@ -37,6 +37,7 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
@ -164,7 +165,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
/* save the request in the hotel until the /* save the request in the hotel until the
* data is returned */ * data is returned */
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc); orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
/* can't just return as that would cause the requestor /* can't just return as that would cause the requestor
* to hang, so instead execute the callback */ * to hang, so instead execute the callback */
goto callback; goto callback;
@ -180,7 +181,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
* that we don't know about yet. In this case, just * that we don't know about yet. In this case, just
* record the request and we will process it later */ * record the request and we will process it later */
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc); orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
/* can't just return as that would cause the requestor /* can't just return as that would cause the requestor
* to hang, so instead execute the callback */ * to hang, so instead execute the callback */
goto callback; goto callback;
@ -209,7 +210,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
/* track the request so we know the function and cbdata /* track the request so we know the function and cbdata
* to callback upon completion */ * to callback upon completion */
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc); orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
goto callback; goto callback;
} }

Просмотреть файл

@ -38,6 +38,7 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_data_server.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
@ -52,7 +53,7 @@ static void execute(int sd, short args, void *cbdata)
/* add this request to our tracker hotel */ /* add this request to our tracker hotel */
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc); orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
goto callback; goto callback;
} }