Lesson to be learned: if you send an ack to a recv'd command, better not send it to the same tag it came from - at least, not if there is a persistent recv on that tag!
Fix the persistent daemon problem where it was exiting when a job completed. The problem was that the persistent daemon would order the job daemons to exit. They would then send an 'ack' back to the persistent daemon - but the ack consisted of an echo of the "exit" command, which was received by the wrong listener, which treated it as a properly sent command... and exited. This commit was SVN r12243.
Этот коммит содержится в:
родитель
6b697ad3dd
Коммит
153e38ffc9
@ -224,6 +224,7 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job)
|
||||
if (NULL != nodename) {
|
||||
dmn->nodename = strdup(nodename);
|
||||
}
|
||||
|
||||
/* add this daemon to the list */
|
||||
opal_list_append(daemons, &dmn->super);
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ int orte_pls_base_orted_exit(opal_list_t *daemons)
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED)) {
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
@ -116,7 +116,7 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED)) {
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
@ -168,7 +168,7 @@ int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal)
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED)) {
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
@ -219,7 +219,7 @@ int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_da
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED)) {
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
|
@ -49,10 +49,13 @@ typedef uint32_t orte_rml_tag_t;
|
||||
#define ORTE_RML_TAG_RMAPS 12
|
||||
#define ORTE_RML_TAG_PLS 13
|
||||
#define ORTE_RML_TAG_PLS_ORTED 14
|
||||
#define ORTE_RML_TAG_ERRMGR 15
|
||||
#define ORTE_RML_TAG_BPROC 16
|
||||
#define ORTE_RML_TAG_BPROC_ABORT 17
|
||||
#define ORTE_RML_TAG_PLS_ORTED_ACK 15
|
||||
#define ORTE_RML_TAG_ERRMGR 16
|
||||
#define ORTE_RML_TAG_BPROC 17
|
||||
#define ORTE_RML_TAG_BPROC_ABORT 18
|
||||
|
||||
#define ORTE_RML_TAG_DYNAMIC 2000
|
||||
|
||||
#define ORTE_RML_TAG_MAX UINT32_MAX
|
||||
|
||||
|
||||
|
@ -262,6 +262,16 @@ int orte_init_stage1(bool infrastructure)
|
||||
/* all done with sds - clean up and call it a day */
|
||||
orte_sds_base_close();
|
||||
|
||||
/* initialize the rml module so it can open its interfaces - this
|
||||
* is needed so that we can get a uri for ourselves if we are an
|
||||
* HNP
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_rml.init";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* if I'm the seed, set the seed uri to be me! */
|
||||
if (orte_process_info.seed) {
|
||||
if (NULL != orte_universe_info.seed_uri) {
|
||||
|
@ -42,12 +42,6 @@ int orte_init_stage2(void)
|
||||
* Initialize the selected modules now that all components/name are available.
|
||||
*/
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error_str = "orte_rml.init";
|
||||
goto return_error;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_ns.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error_str = "orte_ns.init";
|
||||
|
@ -581,7 +581,9 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
|
||||
OPAL_THREAD_LOCK(&orted_globals.mutex);
|
||||
|
||||
if (orted_globals.debug_daemons) {
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received message", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received message from [%ld,%ld,%ld]",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(sender));
|
||||
}
|
||||
|
||||
/* unpack the command */
|
||||
@ -611,6 +613,10 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
|
||||
|
||||
/**** KILL_LOCAL_PROCS ****/
|
||||
case ORTE_DAEMON_KILL_LOCAL_PROCS:
|
||||
if (orted_globals.debug_daemons) {
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
/* unpack the jobid - could be JOBID_WILDCARD, which would indicate
|
||||
* we should kill all local procs. Otherwise, only kill those within
|
||||
* the specified jobid
|
||||
@ -628,6 +634,10 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
|
||||
|
||||
/**** SIGNAL_LOCAL_PROCS ****/
|
||||
case ORTE_DAEMON_SIGNAL_LOCAL_PROCS:
|
||||
if (orted_globals.debug_daemons) {
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received signal_local_procs",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
/* get the signal */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &signal, &n, ORTE_INT32))) {
|
||||
@ -648,6 +658,10 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
|
||||
|
||||
/**** ADD_LOCAL_PROCS ****/
|
||||
case ORTE_DAEMON_ADD_LOCAL_PROCS:
|
||||
if (orted_globals.debug_daemons) {
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received add_local_procs",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
/* unpack the notify data object */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &ndat, &n, ORTE_GPR_NOTIFY_DATA))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -665,10 +679,14 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
|
||||
|
||||
/**** EXIT COMMAND ****/
|
||||
case ORTE_DAEMON_EXIT_CMD:
|
||||
if (orted_globals.debug_daemons) {
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received exit",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
/* send the response before we wakeup because otherwise
|
||||
* we'll depart before it gets out!
|
||||
*/
|
||||
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED, 0)) {
|
||||
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
orted_globals.exit_condition = true;
|
||||
@ -682,7 +700,7 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
|
||||
|
||||
DONE:
|
||||
/* send the response */
|
||||
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED, 0)) {
|
||||
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_RELEASE(answer);
|
||||
@ -714,7 +732,9 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
OPAL_THREAD_LOCK(&orted_globals.mutex);
|
||||
|
||||
if (orted_globals.debug_daemons) {
|
||||
opal_output(0, "[%lu,%lu,%lu] ompid: received message", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv: received message from [%ld,%ld,%ld]",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(sender));
|
||||
}
|
||||
|
||||
answer = OBJ_NEW(orte_buffer_t);
|
||||
@ -731,6 +751,11 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
|
||||
/**** EXIT COMMAND ****/
|
||||
if (ORTE_DAEMON_EXIT_CMD == command) {
|
||||
if (orted_globals.debug_daemons) {
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv: received exit",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
orted_globals.exit_condition = true;
|
||||
opal_condition_signal(&orted_globals.condition);
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user