1
1

Attempt to solve a race condition showing up in some MTT runs. There were three entry points for proc termination info into the ODLS:

1. a direct callback from waitpid - this set the waitpid_fired flag

2. a notify event callback from the IOF - this set the iof complete flag

3. a message via the daemon cmd processor from the proc "de-registering" the sync, thus indicating it was going through MPI_Finalize.

The problem is that these could overlap, with the first two allowing the orted to declare the proc complete before the daemon had responded to #3.

This change forces all three events to flow through the daemon cmd processor, thus ensuring that they are handled in order. I'm not certain this will solve the problem, so I will await further MTT reports to find out. Unfortunately, the problem doesn't show up in any manual or script-based tests I have been able to run, even when I duplicate the exact cmd that fails under MTT.

This commit was SVN r20074.
Этот коммит содержится в:
Ralph Castain 2008-12-05 04:20:00 +00:00
родитель a16de4ba54
Коммит 2940309613
6 изменённых файлов: 186 добавлений и 48 удалений

Просмотреть файл

@ -28,14 +28,16 @@
#include <string.h>
#endif /* HAVE_STRING_H */
#include "orte/util/show_help.h"
#include "opal/dss/dss.h"
#include "orte/util/show_help.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/ess/ess.h"
#include "orte/orted/orted.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
@ -83,6 +85,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
int32_t numbytes;
opal_list_item_t *item;
orte_iof_proc_t *proct;
int rc;
OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock);
@ -238,9 +241,24 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
if (NULL == proct->revstdout &&
NULL == proct->revstderr &&
NULL == proct->revstddiag) {
opal_buffer_t cmdbuf;
orte_daemon_cmd_flag_t command;
/* this proc's iof is complete */
opal_list_remove_item(&mca_iof_hnp_component.procs, item);
ORTE_NOTIFY_EVENT(orte_odls_base_notify_iof_complete, &proct->name);
/* setup a cmd to notify that the iof is complete */
OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
command = ORTE_DAEMON_IOF_COMPLETE;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
CLEANUP:
OBJ_DESTRUCT(&cmdbuf);
OBJ_RELEASE(proct);
}
break;

Просмотреть файл

@ -28,13 +28,15 @@
#include <string.h>
#endif /* HAVE_STRING_H */
#include "orte/util/show_help.h"
#include "opal/dss/dss.h"
#include "orte/util/show_help.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/orted/orted.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
@ -163,9 +165,24 @@ CLEAN_RETURN:
if (NULL == proct->revstdout &&
NULL == proct->revstderr &&
NULL == proct->revstddiag) {
opal_buffer_t cmdbuf;
orte_daemon_cmd_flag_t command;
/* this proc's iof is complete */
opal_list_remove_item(&mca_iof_orted_component.procs, item);
ORTE_NOTIFY_EVENT(orte_odls_base_notify_iof_complete, &proct->name);
/* setup a cmd to notify that the iof is complete */
OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
command = ORTE_DAEMON_IOF_COMPLETE;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
CLEANUP:
OBJ_DESTRUCT(&cmdbuf);
OBJ_RELEASE(proct);
}
break;

Просмотреть файл

@ -75,10 +75,9 @@ ORTE_DECLSPEC int orte_odls_base_select(void);
ORTE_DECLSPEC int orte_odls_base_finalize(void);
ORTE_DECLSPEC int orte_odls_base_close(void);
/* declare that external-to-odls completion criteria for a
* proc have been met
*/
ORTE_DECLSPEC void orte_odls_base_notify_iof_complete(int fd, short event, void *proc);
/* proc termination entry points */
ORTE_DECLSPEC void orte_odls_base_notify_iof_complete(orte_process_name_t *proc);
ORTE_DECLSPEC void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status);
#endif /* ORTE_DISABLE_FULL_SUPPORT */

Просмотреть файл

@ -61,6 +61,7 @@
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/orted/orted.h"
#if OPAL_ENABLE_FT == 1
#include "orte/mca/snapc/snapc.h"
@ -1796,22 +1797,20 @@ static void check_proc_complete(orte_odls_child_t *child)
unlock:
OBJ_DESTRUCT(&alert);
}
/* receive external-to-odls notification that a proc has met some completion
* requirements
*/
void orte_odls_base_notify_iof_complete(int fd, short event, void *data)
void orte_odls_base_notify_iof_complete(orte_process_name_t *proc)
{
orte_notify_event_t *nev = (orte_notify_event_t*)data;
orte_odls_child_t *child;
opal_list_item_t *item;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:notify_iof_complete for child %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&nev->proc)));
ORTE_NAME_PRINT(proc)));
/* since we are going to be working with the global list of
* children, we need to protect that list from modification
@ -1826,8 +1825,8 @@ void orte_odls_base_notify_iof_complete(int fd, short event, void *data)
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == nev->proc.jobid &&
child->name->vpid == nev->proc.vpid) { /* found it */
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) { /* found it */
goto GOTCHILD;
}
}
@ -1839,10 +1838,8 @@ void orte_odls_base_notify_iof_complete(int fd, short event, void *data)
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:proc_complete did not find child %s in table!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&nev->proc)));
ORTE_NAME_PRINT(proc)));
/* release the event */
OBJ_RELEASE(nev);
/* it's just a race condition - don't error log it */
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
@ -1851,30 +1848,20 @@ void orte_odls_base_notify_iof_complete(int fd, short event, void *data)
GOTCHILD:
/* flag the iof as complete */
child->iof_complete = true;
/* release the event */
OBJ_RELEASE(nev);
/* now check to see if the proc is truly done */
check_proc_complete(child);
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
}
/*
* Wait for a callback indicating the child has completed.
*/
void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
{
orte_odls_child_t *child;
opal_list_item_t *item;
char *job, *vpid, *abort_file;
struct stat buf;
int rc;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc child process %ld terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)pid));
/* since we are going to be working with the global list of
* children, we need to protect that list from modification
* by other threads. This will also be used to protect us
@ -1888,7 +1875,8 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (pid == child->pid) { /* found it */
if (proc->jobid == child->name->jobid &&
proc->vpid == child->name->vpid) { /* found it */
goto GOTCHILD;
}
}
@ -1898,9 +1886,9 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc did not find pid %ld in table!",
"%s odls:waitpid_fired did not find child %s in table!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)pid));
ORTE_NAME_PRINT(proc)));
/* it's just a race condition - don't error log it */
opal_condition_signal(&orte_odls_globals.cond);
@ -1913,18 +1901,12 @@ GOTCHILD:
*/
if (!child->alive) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc child %s was already dead",
"%s odls:waitpid_fired child %s was already dead",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
goto MOVEON;
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc pid %ld corresponds to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)pid,
ORTE_NAME_PRINT(child->name)));
/* determine the state of this process */
if(WIFEXITED(status)) {
/* set the exit status appropriately */
@ -1953,7 +1935,7 @@ GOTCHILD:
orte_process_info.top_session_dir,
job, vpid, "abort", NULL );
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc checking abort file %s",
"%s odls:waitpid_fired checking abort file %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), abort_file));
free(job);
@ -1964,7 +1946,7 @@ GOTCHILD:
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc child %s died by abort",
"%s odls:waitpid_fired child %s died by abort",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
@ -1981,7 +1963,7 @@ GOTCHILD:
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc child process %s terminated normally "
"%s odls:waitpid_fired child process %s terminated normally "
"but did not provide a required sync - it "
"will be treated as an abnormal termination",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1993,7 +1975,7 @@ GOTCHILD:
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc child process %s terminated normally",
"%s odls:waitpid_fired child process %s terminated normally",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
@ -2015,7 +1997,7 @@ GOTCHILD:
child->exit_code = WTERMSIG(status) + 128;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:wait_local_proc child process %s terminated with signal",
"%s odls:waitpid_fired child process %s terminated with signal",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
}
@ -2032,6 +2014,82 @@ MOVEON:
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
}
/*
 * Callback fired when a local child process has terminated.
 *
 * This is an independent entry point from the event library. To avoid
 * race conditions with the other proc-termination notifications, the
 * event is not processed here. Instead, the child's name and exit status
 * are packed into an ORTE_DAEMON_WAITPID_FIRED command and re-posted to
 * the daemon cmd processor, so that it is handled in order with the other
 * messages and commands the daemon receives.
 *
 * pid     the pid of the terminated process; used to look up the child
 *         in orte_odls_globals.children
 * status  the termination status; forwarded as an int32
 * cbdata  not used
 *
 * If no child with a matching pid is found, the event is silently dropped
 * (only a verbose message is emitted).
 */
void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
{
    orte_odls_child_t *child = NULL;
    opal_list_item_t *item;
    int rc;
    opal_buffer_t cmdbuf;
    orte_daemon_cmd_flag_t command;
    int32_t istatus;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
                         "%s odls:wait_local_proc child process %ld terminated",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (long)pid));

    /* since we are going to be working with the global list of
     * children, we need to protect that list from modification
     * by other threads. This will also be used to protect us
     * from race conditions on any abort situation
     */
    OPAL_THREAD_LOCK(&orte_odls_globals.mutex);

    /* find this child */
    for (item = opal_list_get_first(&orte_odls_globals.children);
         item != opal_list_get_end(&orte_odls_globals.children);
         item = opal_list_get_next(item)) {
        orte_odls_child_t *candidate = (orte_odls_child_t*)item;

        if (pid == candidate->pid) { /* found it */
            child = candidate;
            break;
        }
    }

    if (NULL == child) {
        /* get here only if we didn't find the child */
        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
                             "%s odls:wait_local_proc did not find pid %ld in table!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (long)pid));
        /* it's just a race condition - don't error log it */
        goto UNLOCK;
    }

    /* build the cmd: cmd flag, name of the proc, termination status */
    OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
    command = ORTE_DAEMON_WAITPID_FIRED;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, child->name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    /* the status is packed as a fixed-width int32 */
    istatus = status;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &istatus, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* re-post the event into the daemon cmd processor */
    ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);

CLEANUP:
    /* cmdbuf lives on our stack and we are about to return, so it must be
     * released here on both the success and the failure paths - this
     * mirrors what the IOF completion handlers do after posting their
     * message event
     */
    OBJ_DESTRUCT(&cmdbuf);

UNLOCK:
    opal_condition_signal(&orte_odls_globals.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
    return;
}
int orte_odls_base_default_kill_local_procs(orte_jobid_t job, bool set_state,
orte_odls_base_kill_local_fn_t kill_local,
orte_odls_base_child_died_fn_t child_died)

Просмотреть файл

@ -61,6 +61,10 @@ typedef uint8_t orte_daemon_cmd_flag_t;
/* collective-based cmds */
#define ORTE_DAEMON_COLL_CMD (orte_daemon_cmd_flag_t) 24
/* proc termination sync cmds */
#define ORTE_DAEMON_WAITPID_FIRED (orte_daemon_cmd_flag_t) 25
#define ORTE_DAEMON_IOF_COMPLETE (orte_daemon_cmd_flag_t) 26
END_C_DECLS
#endif

Просмотреть файл

@ -67,6 +67,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/routed/routed.h"
@ -436,6 +437,8 @@ static int process_commands(orte_process_name_t* sender,
opal_buffer_t *answer;
orte_rml_cmd_flag_t rml_cmd;
orte_job_t *jdata;
orte_process_name_t proc;
int32_t status;
/* unpack the command */
n = 1;
@ -615,6 +618,45 @@ static int process_commands(orte_process_name_t* sender,
}
break;
/**** WAITPID_FIRED COMMAND ****/
case ORTE_DAEMON_WAITPID_FIRED:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received waitpid_fired cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* unpack the name of the proc that terminated */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* unpack the termination status */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &status, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* pass it down for processing */
orte_base_default_waitpid_fired(&proc, status);
break;
/**** IOF_COMPLETE COMMAND ****/
case ORTE_DAEMON_IOF_COMPLETE:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received iof_complete cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* unpack the name of the proc that completed */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* pass it down for processing */
orte_odls_base_notify_iof_complete(&proc);
break;
/**** EXIT COMMAND ****/
case ORTE_DAEMON_EXIT_WITH_REPLY_CMD:
if (orte_debug_daemons_flag) {