
Lesson to be learned: if you send an ack to a recv'd command, better not send it to the same tag it came from - at least, not if there is a persistent recv on that tag!

Fix the persistent daemon problem where it was exiting when a job completed. The problem was that the persistent daemon would order the job daemons to exit. They would then send an 'ack' back to the persistent daemon - but the ack consisted of an echo of the "exit" command, which was recv'd by the wrong listener, which treated it as a properly sent cmd... and exited.

This commit was SVN r12243.
Ralph Castain 2006-10-21 02:53:19 +00:00
parent 6b697ad3dd
commit 153e38ffc9
6 changed files with 50 additions and 17 deletions
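To make the failure mode concrete, here is a minimal, self-contained C sketch of the pattern this commit fixes. It does not use the real ORTE RML API: the tag values, post_persistent_recv(), and send_buffer() below are hypothetical stand-ins for a persistent per-tag receive and a buffered send. The point is simply that echoing the ack on the command tag re-triggers the persistent command handler, while a dedicated ack tag does not.

/* Toy model: one persistent callback per tag, delivery is synchronous. */
#include <stdio.h>
#include <string.h>

#define TAG_CMD      14   /* hypothetical command tag with a persistent recv posted */
#define TAG_CMD_ACK  15   /* hypothetical tag reserved for acks only */
#define CMD_EXIT     "exit"

typedef void (*handler_fn)(int tag, const char *msg);

static handler_fn handlers[32];   /* persistent per-tag callbacks */

static void post_persistent_recv(int tag, handler_fn fn) { handlers[tag] = fn; }

static void send_buffer(int tag, const char *msg)
{
    /* a message lands on whatever persistent recv is posted for 'tag';
     * otherwise only the sender's blocking recv on that tag would see it */
    if (handlers[tag]) handlers[tag](tag, msg);
    else               printf("tag %d: ack consumed by the waiting recv only\n", tag);
}

/* the persistent daemon's command listener: everything arriving on
 * TAG_CMD is treated as a command to execute */
static void cmd_listener(int tag, const char *msg)
{
    printf("cmd listener got \"%s\" on tag %d%s\n",
           msg, tag, 0 == strcmp(msg, CMD_EXIT) ? " -> exiting!" : "");
}

int main(void)
{
    post_persistent_recv(TAG_CMD, cmd_listener);

    /* buggy pattern: the job daemon acks the exit order by echoing it back
     * on the tag it arrived on - the persistent listener "receives" its own
     * command and shuts down */
    send_buffer(TAG_CMD, CMD_EXIT);

    /* fixed pattern: the ack travels on a tag nobody has a persistent recv
     * on, so only the blocking recv in the ordering process consumes it */
    send_buffer(TAG_CMD_ACK, CMD_EXIT);
    return 0;
}

The real fix follows the same idea: the orteds keep their persistent recv on ORTE_RML_TAG_PLS_ORTED for commands and send their acks on the new ORTE_RML_TAG_PLS_ORTED_ACK, which the pls base functions now block on.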


@@ -224,6 +224,7 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job)
         if (NULL != nodename) {
             dmn->nodename = strdup(nodename);
         }
+        /* add this daemon to the list */
         opal_list_append(daemons, &dmn->super);
     }


@@ -65,7 +65,7 @@ int orte_pls_base_orted_exit(opal_list_t *daemons)
         }
         OBJ_CONSTRUCT(&answer, orte_buffer_t);
-        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED)) {
+        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
         }
         OBJ_DESTRUCT(&answer);
@@ -116,7 +116,7 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
         }
         OBJ_CONSTRUCT(&answer, orte_buffer_t);
-        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED)) {
+        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
         }
         OBJ_DESTRUCT(&answer);
@@ -168,7 +168,7 @@ int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal)
         }
         OBJ_CONSTRUCT(&answer, orte_buffer_t);
-        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED)) {
+        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
         }
         OBJ_DESTRUCT(&answer);
@@ -219,7 +219,7 @@ int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_da
         }
         OBJ_CONSTRUCT(&answer, orte_buffer_t);
-        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED)) {
+        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
         }
         OBJ_DESTRUCT(&answer);


@@ -49,10 +49,13 @@ typedef uint32_t orte_rml_tag_t;
 #define ORTE_RML_TAG_RMAPS 12
 #define ORTE_RML_TAG_PLS 13
 #define ORTE_RML_TAG_PLS_ORTED 14
-#define ORTE_RML_TAG_ERRMGR 15
-#define ORTE_RML_TAG_BPROC 16
-#define ORTE_RML_TAG_BPROC_ABORT 17
+#define ORTE_RML_TAG_PLS_ORTED_ACK 15
+#define ORTE_RML_TAG_ERRMGR 16
+#define ORTE_RML_TAG_BPROC 17
+#define ORTE_RML_TAG_BPROC_ABORT 18
 #define ORTE_RML_TAG_DYNAMIC 2000
 #define ORTE_RML_TAG_MAX UINT32_MAX


@@ -262,6 +262,16 @@ int orte_init_stage1(bool infrastructure)
     /* all done with sds - clean up and call it a day */
     orte_sds_base_close();
 
+    /* initialize the rml module so it can open its interfaces - this
+     * is needed so that we can get a uri for ourselves if we are an
+     * HNP
+     */
+    if (ORTE_SUCCESS != (ret = orte_rml.init())) {
+        ORTE_ERROR_LOG(ret);
+        error = "orte_rml.init";
+        goto error;
+    }
+
     /* if I'm the seed, set the seed uri to be me! */
     if (orte_process_info.seed) {
         if (NULL != orte_universe_info.seed_uri) {


@@ -42,12 +42,6 @@ int orte_init_stage2(void)
      * Initialize the selected modules now that all components/name are available.
      */
-    if (ORTE_SUCCESS != (ret = orte_rml.init())) {
-        ORTE_ERROR_LOG(ret);
-        error_str = "orte_rml.init";
-        goto return_error;
-    }
-
     if (ORTE_SUCCESS != (ret = orte_ns.init())) {
         ORTE_ERROR_LOG(ret);
         error_str = "orte_ns.init";

@@ -581,7 +581,9 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
     OPAL_THREAD_LOCK(&orted_globals.mutex);
 
     if (orted_globals.debug_daemons) {
-        opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received message", ORTE_NAME_ARGS(orte_process_info.my_name));
+        opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received message from [%ld,%ld,%ld]",
+                    ORTE_NAME_ARGS(orte_process_info.my_name),
+                    ORTE_NAME_ARGS(sender));
     }
 
     /* unpack the command */
@@ -611,6 +613,10 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
 
         /**** KILL_LOCAL_PROCS ****/
         case ORTE_DAEMON_KILL_LOCAL_PROCS:
+            if (orted_globals.debug_daemons) {
+                opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs",
+                            ORTE_NAME_ARGS(orte_process_info.my_name));
+            }
             /* unpack the jobid - could be JOBID_WILDCARD, which would indicatge
              * we should kill all local procs. Otherwise, only kill those within
              * the specified jobid
@@ -628,6 +634,10 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
 
         /**** SIGNAL_LOCAL_PROCS ****/
         case ORTE_DAEMON_SIGNAL_LOCAL_PROCS:
+            if (orted_globals.debug_daemons) {
+                opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received signal_local_procs",
+                            ORTE_NAME_ARGS(orte_process_info.my_name));
+            }
             /* get the signal */
             n = 1;
             if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &signal, &n, ORTE_INT32))) {
@@ -648,6 +658,10 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
 
         /**** ADD_LOCAL_PROCS ****/
        case ORTE_DAEMON_ADD_LOCAL_PROCS:
+            if (orted_globals.debug_daemons) {
+                opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received add_local_procs",
+                            ORTE_NAME_ARGS(orte_process_info.my_name));
+            }
             /* unpack the notify data object */
             if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &ndat, &n, ORTE_GPR_NOTIFY_DATA))) {
                 ORTE_ERROR_LOG(ret);
@@ -665,10 +679,14 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
 
         /**** EXIT COMMAND ****/
         case ORTE_DAEMON_EXIT_CMD:
+            if (orted_globals.debug_daemons) {
+                opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received exit",
+                            ORTE_NAME_ARGS(orte_process_info.my_name));
+            }
             /* send the response before we wakeup because otherwise
              * we'll depart before it gets out!
              */
-            if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED, 0)) {
+            if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
                 ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
             }
             orted_globals.exit_condition = true;
@@ -682,7 +700,7 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
 
 DONE:
     /* send the response */
-    if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED, 0)) {
+    if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
         ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
     }
     OBJ_RELEASE(answer);
@@ -714,7 +732,9 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
     OPAL_THREAD_LOCK(&orted_globals.mutex);
 
     if (orted_globals.debug_daemons) {
-        opal_output(0, "[%lu,%lu,%lu] ompid: received message", ORTE_NAME_ARGS(orte_process_info.my_name));
+        opal_output(0, "[%lu,%lu,%lu] orted_recv: received message from [%ld,%ld,%ld]",
+                    ORTE_NAME_ARGS(orte_process_info.my_name),
+                    ORTE_NAME_ARGS(sender));
     }
 
     answer = OBJ_NEW(orte_buffer_t);
@@ -731,6 +751,11 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
 
         /**** EXIT COMMAND ****/
         if (ORTE_DAEMON_EXIT_CMD == command) {
+            if (orted_globals.debug_daemons) {
+                opal_output(0, "[%lu,%lu,%lu] orted_recv: received exit",
+                            ORTE_NAME_ARGS(orte_process_info.my_name));
+            }
             orted_globals.exit_condition = true;
             opal_condition_signal(&orted_globals.condition);