Resolve the direct modex race condition. The request hotel was running out of rooms, thereby returning an error upon check-in - and we had missed error-logging in a couple of those places. Hence there was no error message and things just hung.
Output a (hopefully) helpful message when we time out an operation. Thanks to Nathan for tracking it down. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
9a69b20d09
Коммит
b7e9711f45
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -60,3 +60,11 @@ info key:
|
||||
key: %s
|
||||
|
||||
The operation will continue, but may not behave completely as expected.
|
||||
#
|
||||
[timedout]
|
||||
A request has timed out and will therefore fail:
|
||||
|
||||
Operation: %s
|
||||
|
||||
Your job may terminate as a result of this problem. You may want to
|
||||
adjust the MCA parameter pmix_server_max_wait and try again.
|
||||
|
@ -83,6 +83,8 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tg, void *cbdata);
|
||||
|
||||
#define ORTE_PMIX_SERVER_MIN_ROOMS 4096
|
||||
|
||||
pmix_server_globals_t orte_pmix_server_globals = {0};
|
||||
|
||||
static opal_pmix_server_module_t pmix_server = {
|
||||
@ -122,7 +124,7 @@ void pmix_server_register_params(void)
|
||||
orte_pmix_server_globals.verbosity);
|
||||
}
|
||||
/* specify the size of the hotel */
|
||||
orte_pmix_server_globals.num_rooms = 256;
|
||||
orte_pmix_server_globals.num_rooms = -1;
|
||||
(void) mca_base_var_register ("orte", "pmix", NULL, "server_max_reqs",
|
||||
"Maximum number of backlogged PMIx server direct modex requests",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
@ -158,7 +160,7 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
|
||||
{
|
||||
pmix_server_req_t *req = (pmix_server_req_t*)occupant;
|
||||
bool timeout = false;
|
||||
int rc;
|
||||
int rc=OPAL_ERR_TIMEOUT;
|
||||
|
||||
/* decrement the request timeout */
|
||||
req->timeout -= orte_pmix_server_globals.timeout;
|
||||
@ -175,6 +177,8 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
|
||||
}
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* fall thru and return an error so the caller doesn't hang */
|
||||
} else {
|
||||
orte_show_help("help-orted.txt", "timedout", true, req->operation);
|
||||
}
|
||||
/* don't let the caller hang */
|
||||
if (NULL != req->opcbfunc) {
|
||||
@ -205,6 +209,17 @@ int pmix_server_init(void)
|
||||
|
||||
/* setup the server's state variables */
|
||||
OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t);
|
||||
/* by the time we init the server, we should know how many nodes we
|
||||
* have in our environment - with the exception of mpirun. If the
|
||||
* user specified the size of the hotel, then use that value. Otherwise,
|
||||
* set the value to something large to avoid running out of rooms on
|
||||
* large machines */
|
||||
if (-1 == orte_pmix_server_globals.num_rooms) {
|
||||
orte_pmix_server_globals.num_rooms = orte_process_info.num_procs * 2;
|
||||
if (orte_pmix_server_globals.num_rooms < ORTE_PMIX_SERVER_MIN_ROOMS) {
|
||||
orte_pmix_server_globals.num_rooms = ORTE_PMIX_SERVER_MIN_ROOMS;
|
||||
}
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs,
|
||||
orte_pmix_server_globals.num_rooms,
|
||||
orte_event_base, orte_pmix_server_globals.timeout*1000000,
|
||||
@ -533,6 +548,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
* condition, so just log the request and we will fill
|
||||
* it later */
|
||||
req = OBJ_NEW(pmix_server_req_t);
|
||||
(void)asprintf(&req->operation, "DMDX: %s:%d", __FILE__, __LINE__);
|
||||
req->proxy = *sender;
|
||||
req->target = idreq;
|
||||
req->remote_room_num = room_num;
|
||||
@ -540,6 +556,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
* amount of time to start the job */
|
||||
ORTE_ADJUST_TIMEOUT(req);
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(req);
|
||||
send_error(rc, &idreq, sender);
|
||||
}
|
||||
@ -558,6 +575,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
/* track the request since the call down to the PMIx server
|
||||
* is asynchronous */
|
||||
req = OBJ_NEW(pmix_server_req_t);
|
||||
(void)asprintf(&req->operation, "DMDX: %s:%d", __FILE__, __LINE__);
|
||||
req->proxy = *sender;
|
||||
req->target = idreq;
|
||||
req->remote_room_num = room_num;
|
||||
@ -565,6 +583,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
* amount of time to start the job */
|
||||
ORTE_ADJUST_TIMEOUT(req);
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(req);
|
||||
send_error(rc, &idreq, sender);
|
||||
return;
|
||||
@ -696,6 +715,7 @@ OBJ_CLASS_INSTANCE(orte_pmix_server_op_caddy_t,
|
||||
|
||||
static void rqcon(pmix_server_req_t *p)
|
||||
{
|
||||
p->operation = NULL;
|
||||
p->target = *ORTE_NAME_INVALID;
|
||||
p->proxy = *ORTE_NAME_INVALID;
|
||||
p->timeout = orte_pmix_server_globals.timeout;
|
||||
@ -710,6 +730,9 @@ static void rqcon(pmix_server_req_t *p)
|
||||
}
|
||||
static void rqdes(pmix_server_req_t *p)
|
||||
{
|
||||
if (NULL != p->operation) {
|
||||
free(p->operation);
|
||||
}
|
||||
if (NULL != p->jdata) {
|
||||
OBJ_RELEASE(p->jdata);
|
||||
}
|
||||
|
@ -62,6 +62,7 @@
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
opal_event_t ev;
|
||||
char *operation;
|
||||
int status;
|
||||
int timeout;
|
||||
int room_num;
|
||||
@ -109,6 +110,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
||||
do { \
|
||||
pmix_server_req_t *_req; \
|
||||
_req = OBJ_NEW(pmix_server_req_t); \
|
||||
(void)asprintf(&_req->operation, "DMDX: %s:%d", __FILE__, __LINE__); \
|
||||
_req->target = (p); \
|
||||
_req->mdxcbfunc = (ocf); \
|
||||
_req->cbdata = (ocd); \
|
||||
@ -122,6 +124,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
||||
do { \
|
||||
pmix_server_req_t *_req; \
|
||||
_req = OBJ_NEW(pmix_server_req_t); \
|
||||
(void)asprintf(&_req->operation, "SPAWN: %s:%d", __FILE__, __LINE__); \
|
||||
_req->jdata = (j); \
|
||||
_req->spcbfunc = (ocf); \
|
||||
_req->cbdata = (ocd); \
|
||||
|
@ -13,7 +13,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Research Organization for Information Science
|
||||
@ -100,6 +100,7 @@ int pmix_server_publish_fn(opal_process_name_t *proc,
|
||||
|
||||
/* create the caddy */
|
||||
req = OBJ_NEW(pmix_server_req_t);
|
||||
(void)asprintf(&req->operation, "PUBLISH: %s:%d", __FILE__, __LINE__);
|
||||
req->opcbfunc = cbfunc;
|
||||
req->cbdata = cbdata;
|
||||
|
||||
@ -207,6 +208,7 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys,
|
||||
|
||||
/* create the caddy */
|
||||
req = OBJ_NEW(pmix_server_req_t);
|
||||
(void)asprintf(&req->operation, "LOOKUP: %s:%d", __FILE__, __LINE__);
|
||||
req->lkcbfunc = cbfunc;
|
||||
req->cbdata = cbdata;
|
||||
|
||||
@ -302,6 +304,7 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys,
|
||||
|
||||
/* create the caddy */
|
||||
req = OBJ_NEW(pmix_server_req_t);
|
||||
(void)asprintf(&req->operation, "UNPUBLISH: %s:%d", __FILE__, __LINE__);
|
||||
req->opcbfunc = cbfunc;
|
||||
req->cbdata = cbdata;
|
||||
|
||||
@ -468,4 +471,3 @@ void pmix_server_keyval_client(int status, orte_process_name_t* sender,
|
||||
OBJ_RELEASE(req);
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user