1
1

Resolve the direct modex race condition. The request hotel was running out of rooms and therefore returning an error upon check-in, and we had failed to log the error in a couple of those places. As a result, no error message was printed and things just hung.

Output a (hopefully) helpful message when an operation times out.

Thanks to Nathan for tracking it down.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-04-04 21:09:02 -07:00
родитель 9a69b20d09
Коммит b7e9711f45
4 изменённых файлов: 41 добавлений и 5 удалений

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -60,3 +60,11 @@ info key:
key: %s
The operation will continue, but may not behave completely as expected.
#
[timedout]
A request has timed out and will therefore fail:
Operation: %s
Your job may terminate as a result of this problem. You may want to
adjust the MCA parameter pmix_server_max_wait and try again.

Просмотреть файл

@ -83,6 +83,8 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tg, void *cbdata);
#define ORTE_PMIX_SERVER_MIN_ROOMS 4096
pmix_server_globals_t orte_pmix_server_globals = {0};
static opal_pmix_server_module_t pmix_server = {
@ -122,7 +124,7 @@ void pmix_server_register_params(void)
orte_pmix_server_globals.verbosity);
}
/* specify the size of the hotel */
orte_pmix_server_globals.num_rooms = 256;
orte_pmix_server_globals.num_rooms = -1;
(void) mca_base_var_register ("orte", "pmix", NULL, "server_max_reqs",
"Maximum number of backlogged PMIx server direct modex requests",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
@ -158,7 +160,7 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
{
pmix_server_req_t *req = (pmix_server_req_t*)occupant;
bool timeout = false;
int rc;
int rc=OPAL_ERR_TIMEOUT;
/* decrement the request timeout */
req->timeout -= orte_pmix_server_globals.timeout;
@ -175,6 +177,8 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
}
ORTE_ERROR_LOG(rc);
/* fall thru and return an error so the caller doesn't hang */
} else {
orte_show_help("help-orted.txt", "timedout", true, req->operation);
}
/* don't let the caller hang */
if (NULL != req->opcbfunc) {
@ -205,6 +209,17 @@ int pmix_server_init(void)
/* setup the server's state variables */
OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t);
/* by the time we init the server, we should know how many nodes we
* have in our environment - with the exception of mpirun. If the
* user specified the size of the hotel, then use that value. Otherwise,
* set the value to something large to avoid running out of rooms on
* large machines */
if (-1 == orte_pmix_server_globals.num_rooms) {
orte_pmix_server_globals.num_rooms = orte_process_info.num_procs * 2;
if (orte_pmix_server_globals.num_rooms < ORTE_PMIX_SERVER_MIN_ROOMS) {
orte_pmix_server_globals.num_rooms = ORTE_PMIX_SERVER_MIN_ROOMS;
}
}
if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs,
orte_pmix_server_globals.num_rooms,
orte_event_base, orte_pmix_server_globals.timeout*1000000,
@ -533,6 +548,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
* condition, so just log the request and we will fill
* it later */
req = OBJ_NEW(pmix_server_req_t);
(void)asprintf(&req->operation, "DMDX: %s:%d", __FILE__, __LINE__);
req->proxy = *sender;
req->target = idreq;
req->remote_room_num = room_num;
@ -540,6 +556,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
* amount of time to start the job */
ORTE_ADJUST_TIMEOUT(req);
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(req);
send_error(rc, &idreq, sender);
}
@ -558,6 +575,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
/* track the request since the call down to the PMIx server
* is asynchronous */
req = OBJ_NEW(pmix_server_req_t);
(void)asprintf(&req->operation, "DMDX: %s:%d", __FILE__, __LINE__);
req->proxy = *sender;
req->target = idreq;
req->remote_room_num = room_num;
@ -565,6 +583,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
* amount of time to start the job */
ORTE_ADJUST_TIMEOUT(req);
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(req);
send_error(rc, &idreq, sender);
return;
@ -696,6 +715,7 @@ OBJ_CLASS_INSTANCE(orte_pmix_server_op_caddy_t,
static void rqcon(pmix_server_req_t *p)
{
p->operation = NULL;
p->target = *ORTE_NAME_INVALID;
p->proxy = *ORTE_NAME_INVALID;
p->timeout = orte_pmix_server_globals.timeout;
@ -710,6 +730,9 @@ static void rqcon(pmix_server_req_t *p)
}
static void rqdes(pmix_server_req_t *p)
{
if (NULL != p->operation) {
free(p->operation);
}
if (NULL != p->jdata) {
OBJ_RELEASE(p->jdata);
}

Просмотреть файл

@ -62,6 +62,7 @@
typedef struct {
opal_object_t super;
opal_event_t ev;
char *operation;
int status;
int timeout;
int room_num;
@ -109,6 +110,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
do { \
pmix_server_req_t *_req; \
_req = OBJ_NEW(pmix_server_req_t); \
(void)asprintf(&_req->operation, "DMDX: %s:%d", __FILE__, __LINE__); \
_req->target = (p); \
_req->mdxcbfunc = (ocf); \
_req->cbdata = (ocd); \
@ -122,6 +124,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
do { \
pmix_server_req_t *_req; \
_req = OBJ_NEW(pmix_server_req_t); \
(void)asprintf(&_req->operation, "SPAWN: %s:%d", __FILE__, __LINE__); \
_req->jdata = (j); \
_req->spcbfunc = (ocf); \
_req->cbdata = (ocd); \

Просмотреть файл

@ -13,7 +13,7 @@
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
@ -100,6 +100,7 @@ int pmix_server_publish_fn(opal_process_name_t *proc,
/* create the caddy */
req = OBJ_NEW(pmix_server_req_t);
(void)asprintf(&req->operation, "PUBLISH: %s:%d", __FILE__, __LINE__);
req->opcbfunc = cbfunc;
req->cbdata = cbdata;
@ -207,6 +208,7 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys,
/* create the caddy */
req = OBJ_NEW(pmix_server_req_t);
(void)asprintf(&req->operation, "LOOKUP: %s:%d", __FILE__, __LINE__);
req->lkcbfunc = cbfunc;
req->cbdata = cbdata;
@ -302,6 +304,7 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys,
/* create the caddy */
req = OBJ_NEW(pmix_server_req_t);
(void)asprintf(&req->operation, "UNPUBLISH: %s:%d", __FILE__, __LINE__);
req->opcbfunc = cbfunc;
req->cbdata = cbdata;
@ -468,4 +471,3 @@ void pmix_server_keyval_client(int status, orte_process_name_t* sender,
OBJ_RELEASE(req);
}
}