Take another stab at resolving the "called-abort" requirement without getting stuck. Return to "drop a turd" mode, perhaps with a little more intelligence behind it. Don't worry about catching it if session dirs weren't created.
cmr=v1.8.2:reviewer=jsquyres:subject=cleanup MPI_Abort hangs
This commit was SVN r31543.
Этот коммит содержится в:
родитель
908e178d08
Коммит
e05b88fd18
@ -392,36 +392,19 @@ int orte_ess_base_app_finalize(void)
|
|||||||
*
|
*
|
||||||
* However, this causes a problem for OpenRTE as the system truly
|
* However, this causes a problem for OpenRTE as the system truly
|
||||||
* needs to know that this actually IS an abnormal termination.
|
* needs to know that this actually IS an abnormal termination.
|
||||||
* To get around the problem, we create a file in the session
|
* To get around the problem, we drop a marker in the proc-level
|
||||||
* directory - we don't need to put anything in it, though, as its
|
* session dir. If session dirs were not allowed, then we just
|
||||||
* very existence simply alerts us that this was an abnormal
|
* ignore this question.
|
||||||
* termination.
|
|
||||||
*
|
|
||||||
* The session directory finalize system will clean this file up
|
|
||||||
* for us automagically. However, it needs to stick around long
|
|
||||||
* enough for our local daemon to find it! So, we do NOT call
|
|
||||||
* session_dir_finalize here!!! Someone will clean up for us.
|
|
||||||
*
|
*
|
||||||
* In some cases, however, we DON'T want to create that alert. For
|
* In some cases, however, we DON'T want to create that alert. For
|
||||||
* example, if an orted detects that the HNP has died, then there
|
* example, if an orted detects that the HNP has died, then there
|
||||||
* is truly nobody to alert! In these cases, we pass report=false
|
* is truly nobody to alert! In these cases, we pass report=false
|
||||||
* to prevent the abort file from being created. This allows the
|
* to indicate that we don't want the marker dropped.
|
||||||
* session directory tree to cleanly be eliminated.
|
|
||||||
*/
|
*/
|
||||||
static void report_sync(int status, orte_process_name_t* sender,
|
|
||||||
opal_buffer_t *buffer,
|
|
||||||
orte_rml_tag_t tag, void *cbdata)
|
|
||||||
{
|
|
||||||
bool *sync_waiting = (bool*)cbdata;
|
|
||||||
/* flag as complete */
|
|
||||||
*sync_waiting = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void orte_ess_base_app_abort(int status, bool report)
|
void orte_ess_base_app_abort(int status, bool report)
|
||||||
{
|
{
|
||||||
orte_daemon_cmd_flag_t cmd=ORTE_DAEMON_ABORT_CALLED;
|
int fd;
|
||||||
opal_buffer_t *buf;
|
char *myfile;
|
||||||
bool sync_waiting = true;
|
|
||||||
|
|
||||||
/* Exit - do NOT do a normal finalize as this will very likely
|
/* Exit - do NOT do a normal finalize as this will very likely
|
||||||
* hang the process. We are aborting due to an abnormal condition
|
* hang the process. We are aborting due to an abnormal condition
|
||||||
@ -435,30 +418,16 @@ void orte_ess_base_app_abort(int status, bool report)
|
|||||||
/* CRS cleanup since it may have a named pipe and thread active */
|
/* CRS cleanup since it may have a named pipe and thread active */
|
||||||
orte_cr_finalize();
|
orte_cr_finalize();
|
||||||
|
|
||||||
/* If we were asked to report this termination, do so - except
|
/* If we were asked to report this termination, do so.
|
||||||
* in cases of abnormal termination ordered by the RTE as
|
* Since singletons don't start an HNP unless necessary, and
|
||||||
* this means we can't rely on being able to communicate. Also,
|
|
||||||
* since singletons don't start an HNP unless necessary, and
|
|
||||||
* direct-launched procs don't have daemons at all, only send
|
* direct-launched procs don't have daemons at all, only send
|
||||||
* the message if routing is enabled as this indicates we
|
* the message if routing is enabled as this indicates we
|
||||||
* have someone to send to
|
* have someone to send to
|
||||||
*/
|
*/
|
||||||
if (report && !orte_abnormal_term_ordered && orte_routing_is_enabled) {
|
if (report && orte_routing_is_enabled && orte_create_session_dirs) {
|
||||||
buf = OBJ_NEW(opal_buffer_t);
|
myfile = opal_os_path(false, orte_process_info.proc_session_dir, "aborted", NULL);
|
||||||
opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD);
|
fd = open(myfile, O_CREAT);
|
||||||
orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL);
|
close(fd);
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
|
|
||||||
"%s orte_ess_app_abort: sent abort msg to %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));
|
|
||||||
/* get the ack - need this to ensure that the sync communication
|
|
||||||
* gets serviced by the event library on the orted prior to the
|
|
||||||
* process exiting
|
|
||||||
*/
|
|
||||||
sync_waiting = true;
|
|
||||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT,
|
|
||||||
ORTE_RML_NON_PERSISTENT, report_sync, &sync_waiting);
|
|
||||||
ORTE_WAIT_FOR_COMPLETION(sync_waiting);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* - Clean out the global structures
|
/* - Clean out the global structures
|
||||||
|
@ -1869,39 +1869,6 @@ CLEANUP:
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_odls_base_default_report_abort(orte_process_name_t *proc)
|
|
||||||
{
|
|
||||||
orte_proc_t *child;
|
|
||||||
opal_buffer_t *buffer;
|
|
||||||
int rc, i;
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
|
||||||
"%s GOT ABORT REPORT FOR %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc)));
|
|
||||||
|
|
||||||
/* find this child */
|
|
||||||
for (i=0; i < orte_local_children->size; i++) {
|
|
||||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (proc->jobid == child->name.jobid &&
|
|
||||||
proc->vpid == child->name.vpid) { /* found it */
|
|
||||||
child->aborted = true;
|
|
||||||
/* send ack */
|
|
||||||
buffer = OBJ_NEW(opal_buffer_t);
|
|
||||||
if (0 > (rc = orte_rml.send_buffer_nb(proc, buffer,
|
|
||||||
ORTE_RML_TAG_ABORT,
|
|
||||||
orte_rml_send_callback, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buffer);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Wait for a callback indicating the child has completed.
|
* Wait for a callback indicating the child has completed.
|
||||||
*/
|
*/
|
||||||
@ -1912,6 +1879,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
|||||||
int i;
|
int i;
|
||||||
orte_job_t *jobdat;
|
orte_job_t *jobdat;
|
||||||
orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
|
orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
|
||||||
|
char *abortfile, *jobfam, *job, *vpidstr;
|
||||||
|
|
||||||
/* find this child */
|
/* find this child */
|
||||||
for (i=0; i < orte_local_children->size; i++) {
|
for (i=0; i < orte_local_children->size; i++) {
|
||||||
@ -1936,8 +1904,8 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||||
"%s odls:wait_local_proc child process %s pid %ld terminated",
|
"%s odls:wait_local_proc child process %s pid %ld terminated",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&proc->name), (long)pid);
|
ORTE_NAME_PRINT(&proc->name), (long)pid);
|
||||||
|
|
||||||
/* if the child was previously flagged as dead, then just
|
/* if the child was previously flagged as dead, then just
|
||||||
@ -1976,7 +1944,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* determine the state of this process */
|
/* determine the state of this process */
|
||||||
if(WIFEXITED(status)) {
|
if (WIFEXITED(status)) {
|
||||||
/* set the exit status appropriately */
|
/* set the exit status appropriately */
|
||||||
proc->exit_code = WEXITSTATUS(status);
|
proc->exit_code = WEXITSTATUS(status);
|
||||||
|
|
||||||
@ -1985,19 +1953,56 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&proc->name), proc->exit_code));
|
ORTE_NAME_PRINT(&proc->name), proc->exit_code));
|
||||||
|
|
||||||
if (proc->aborted) {
|
/* provide a default state */
|
||||||
|
state = ORTE_PROC_STATE_WAITPID_FIRED;
|
||||||
|
|
||||||
|
/* check for the abort marker */
|
||||||
|
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->name.jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
goto MOVEON;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->name.jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
free(jobfam);
|
||||||
|
goto MOVEON;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->name.vpid)) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
free(jobfam);
|
||||||
|
free(job);
|
||||||
|
goto MOVEON;
|
||||||
|
}
|
||||||
|
|
||||||
|
abortfile = opal_os_path(false, orte_process_info.top_session_dir, jobfam, job, vpidstr, NULL );
|
||||||
|
if (NULL == abortfile ) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
free(jobfam);
|
||||||
|
free(job);
|
||||||
|
free(vpidstr);
|
||||||
|
goto MOVEON;
|
||||||
|
}
|
||||||
|
free(jobfam);
|
||||||
|
free(job);
|
||||||
|
free(vpidstr);
|
||||||
|
|
||||||
|
if (access(abortfile, F_OK)) {
|
||||||
|
unlink(abortfile);
|
||||||
|
proc->aborted = true;
|
||||||
/* even though the process exited "normally", it happened
|
/* even though the process exited "normally", it happened
|
||||||
* via an orte_abort call, so we need to indicate this was
|
* via an orte_abort call
|
||||||
* an "abnormal" termination.
|
|
||||||
*/
|
*/
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||||
"%s odls:waitpid_fired child %s died by call to abort",
|
"%s odls:waitpid_fired child %s died by call to abort",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&proc->name)));
|
ORTE_NAME_PRINT(&proc->name)));
|
||||||
state = ORTE_PROC_STATE_CALLED_ABORT;
|
state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||||
|
free(abortfile);
|
||||||
goto MOVEON;
|
goto MOVEON;
|
||||||
}
|
}
|
||||||
|
free(abortfile);
|
||||||
|
|
||||||
/* check to see if a sync was required and if it was received */
|
/* check to see if a sync was required and if it was received */
|
||||||
if (proc->registered) {
|
if (proc->registered) {
|
||||||
if (proc->deregistered || orte_allowed_exit_without_sync || 0 != proc->exit_code) {
|
if (proc->deregistered || orte_allowed_exit_without_sync || 0 != proc->exit_code) {
|
||||||
|
@ -109,8 +109,6 @@ orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer,
|
|||||||
|
|
||||||
ORTE_DECLSPEC void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata);
|
ORTE_DECLSPEC void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata);
|
||||||
|
|
||||||
ORTE_DECLSPEC void orte_odls_base_default_report_abort(orte_process_name_t *proc);
|
|
||||||
|
|
||||||
/* define a function type to signal a local proc */
|
/* define a function type to signal a local proc */
|
||||||
typedef int (*orte_odls_base_signal_local_fn_t)(pid_t pid, int signum);
|
typedef int (*orte_odls_base_signal_local_fn_t)(pid_t pid, int signum);
|
||||||
|
|
||||||
|
@ -239,14 +239,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_DAEMON_ABORT_CALLED:
|
|
||||||
if (orte_debug_daemons_flag) {
|
|
||||||
opal_output(0, "%s orted_cmd: received abort report",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
||||||
}
|
|
||||||
orte_odls_base_default_report_abort(sender);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_DAEMON_ABORT_PROCS_CALLED:
|
case ORTE_DAEMON_ABORT_PROCS_CALLED:
|
||||||
if (orte_debug_daemons_flag) {
|
if (orte_debug_daemons_flag) {
|
||||||
opal_output(0, "%s orted_cmd: received abort_procs report",
|
opal_output(0, "%s orted_cmd: received abort_procs report",
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user