Take another stab at resolving the "called-abort" requirement without getting stuck. Return to "drop a turd" mode, perhaps with a little more intelligence behind it. Don't worry about catching it if session dirs weren't created.
cmr=v1.8.2:reviewer=jsquyres:subject=cleanup MPI_Abort hangs

This commit was SVN r31543.
Этот коммит содержится в:
родитель
908e178d08
Коммит
e05b88fd18
@ -392,36 +392,19 @@ int orte_ess_base_app_finalize(void)
|
||||
*
|
||||
* However, this causes a problem for OpenRTE as the system truly
|
||||
* needs to know that this actually IS an abnormal termination.
|
||||
* To get around the problem, we create a file in the session
|
||||
* directory - we don't need to put anything in it, though, as its
|
||||
* very existence simply alerts us that this was an abnormal
|
||||
* termination.
|
||||
*
|
||||
* The session directory finalize system will clean this file up
|
||||
* for us automagically. However, it needs to stick around long
|
||||
* enough for our local daemon to find it! So, we do NOT call
|
||||
* session_dir_finalize here!!! Someone will clean up for us.
|
||||
* To get around the problem, we drop a marker in the proc-level
|
||||
* session dir. If session dirs were not allowed, then we just
|
||||
* ignore this question.
|
||||
*
|
||||
* In some cases, however, we DON'T want to create that alert. For
|
||||
* example, if an orted detects that the HNP has died, then there
|
||||
* is truly nobody to alert! In these cases, we pass report=false
|
||||
* to prevent the abort file from being created. This allows the
|
||||
* session directory tree to cleanly be eliminated.
|
||||
* to indicate that we don't want the marker dropped.
|
||||
*/
|
||||
/* Receive callback that marks a pending wait as finished.
 *
 * cbdata must point to the bool flag the caller is waiting on; it is
 * set to false here. The status, sender, buffer and tag arguments are
 * not examined by this function.
 */
static void report_sync(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
{
|
||||
/* cbdata carries the address of the caller's wait flag */
bool *sync_waiting = (bool*)cbdata;
|
||||
/* flag as complete: clearing the flag tells the waiter it may proceed */
|
||||
*sync_waiting = false;
|
||||
}
|
||||
|
||||
void orte_ess_base_app_abort(int status, bool report)
|
||||
{
|
||||
orte_daemon_cmd_flag_t cmd=ORTE_DAEMON_ABORT_CALLED;
|
||||
opal_buffer_t *buf;
|
||||
bool sync_waiting = true;
|
||||
int fd;
|
||||
char *myfile;
|
||||
|
||||
/* Exit - do NOT do a normal finalize as this will very likely
|
||||
* hang the process. We are aborting due to an abnormal condition
|
||||
@ -435,30 +418,16 @@ void orte_ess_base_app_abort(int status, bool report)
|
||||
/* CRS cleanup since it may have a named pipe and thread active */
|
||||
orte_cr_finalize();
|
||||
|
||||
/* If we were asked to report this termination, do so - except
|
||||
* in cases of abnormal termination ordered by the RTE as
|
||||
* this means we can't rely on being able to communicate. Also,
|
||||
* since singletons don't start an HNP unless necessary, and
|
||||
/* If we were asked to report this termination, do so.
|
||||
* Since singletons don't start an HNP unless necessary, and
|
||||
* direct-launched procs don't have daemons at all, only send
|
||||
* the message if routing is enabled as this indicates we
|
||||
* have someone to send to
|
||||
*/
|
||||
if (report && !orte_abnormal_term_ordered && orte_routing_is_enabled) {
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD);
|
||||
orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
|
||||
"%s orte_ess_app_abort: sent abort msg to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));
|
||||
/* get the ack - need this to ensure that the sync communication
|
||||
* gets serviced by the event library on the orted prior to the
|
||||
* process exiting
|
||||
*/
|
||||
sync_waiting = true;
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT,
|
||||
ORTE_RML_NON_PERSISTENT, report_sync, &sync_waiting);
|
||||
ORTE_WAIT_FOR_COMPLETION(sync_waiting);
|
||||
if (report && orte_routing_is_enabled && orte_create_session_dirs) {
|
||||
myfile = opal_os_path(false, orte_process_info.proc_session_dir, "aborted", NULL);
|
||||
fd = open(myfile, O_CREAT);
|
||||
close(fd);
|
||||
}
|
||||
|
||||
/* - Clean out the global structures
|
||||
|
@ -1869,39 +1869,6 @@ CLEANUP:
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Record that a local child reported calling abort, and acknowledge it.
 *
 * Searches orte_local_children for the entry whose jobid and vpid match
 * proc. If found, its aborted flag is set and an empty buffer is sent
 * back to proc on ORTE_RML_TAG_ABORT as the ack. If no entry matches,
 * nothing is marked and no ack is sent.
 *
 * proc: name of the process that reported the abort.
 */
void orte_odls_base_default_report_abort(orte_process_name_t *proc)
|
||||
{
|
||||
orte_proc_t *child;
|
||||
opal_buffer_t *buffer;
|
||||
int rc, i;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s GOT ABORT REPORT FOR %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
|
||||
/* find this child: scan every slot of the local children array,
 * skipping slots that are empty (NULL)
 */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (proc->jobid == child->name.jobid &&
|
||||
proc->vpid == child->name.vpid) { /* found it */
|
||||
/* remember that this child reported an abort */
child->aborted = true;
|
||||
/* send ack: an empty buffer on the abort tag, non-blocking */
|
||||
buffer = OBJ_NEW(opal_buffer_t);
|
||||
if (0 > (rc = orte_rml.send_buffer_nb(proc, buffer,
|
||||
ORTE_RML_TAG_ABORT,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
/* the send was rejected: log it and release the buffer here.
 * NOTE(review): presumably orte_rml_send_callback releases the
 * buffer on the success path - confirm
 */
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buffer);
|
||||
}
|
||||
/* only one child can match, so stop searching */
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for a callback indicating the child has completed.
|
||||
*/
|
||||
@ -1912,6 +1879,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
int i;
|
||||
orte_job_t *jobdat;
|
||||
orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
|
||||
char *abortfile, *jobfam, *job, *vpidstr;
|
||||
|
||||
/* find this child */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
@ -1936,8 +1904,8 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
}
|
||||
|
||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:wait_local_proc child process %s pid %ld terminated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
"%s odls:wait_local_proc child process %s pid %ld terminated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), (long)pid);
|
||||
|
||||
/* if the child was previously flagged as dead, then just
|
||||
@ -1976,7 +1944,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
}
|
||||
|
||||
/* determine the state of this process */
|
||||
if(WIFEXITED(status)) {
|
||||
if (WIFEXITED(status)) {
|
||||
/* set the exit status appropriately */
|
||||
proc->exit_code = WEXITSTATUS(status);
|
||||
|
||||
@ -1985,19 +1953,56 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), proc->exit_code));
|
||||
|
||||
if (proc->aborted) {
|
||||
/* provide a default state */
|
||||
state = ORTE_PROC_STATE_WAITPID_FIRED;
|
||||
|
||||
/* check for the abort marker */
|
||||
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->name.jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->name.jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
free(jobfam);
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->name.vpid)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
free(jobfam);
|
||||
free(job);
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
abortfile = opal_os_path(false, orte_process_info.top_session_dir, jobfam, job, vpidstr, NULL );
|
||||
if (NULL == abortfile ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
free(jobfam);
|
||||
free(job);
|
||||
free(vpidstr);
|
||||
goto MOVEON;
|
||||
}
|
||||
free(jobfam);
|
||||
free(job);
|
||||
free(vpidstr);
|
||||
|
||||
if (access(abortfile, F_OK)) {
|
||||
unlink(abortfile);
|
||||
proc->aborted = true;
|
||||
/* even though the process exited "normally", it happened
|
||||
* via an orte_abort call, so we need to indicate this was
|
||||
* an "abnormal" termination.
|
||||
* via an orte_abort call
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:waitpid_fired child %s died by call to abort",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||
free(abortfile);
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
free(abortfile);
|
||||
|
||||
/* check to see if a sync was required and if it was received */
|
||||
if (proc->registered) {
|
||||
if (proc->deregistered || orte_allowed_exit_without_sync || 0 != proc->exit_code) {
|
||||
|
@ -109,8 +109,6 @@ orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer,
|
||||
|
||||
ORTE_DECLSPEC void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata);
|
||||
|
||||
ORTE_DECLSPEC void orte_odls_base_default_report_abort(orte_process_name_t *proc);
|
||||
|
||||
/* define a function type to signal a local proc */
|
||||
typedef int (*orte_odls_base_signal_local_fn_t)(pid_t pid, int signum);
|
||||
|
||||
|
@ -239,14 +239,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_DAEMON_ABORT_CALLED:
|
||||
if (orte_debug_daemons_flag) {
|
||||
opal_output(0, "%s orted_cmd: received abort report",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
orte_odls_base_default_report_abort(sender);
|
||||
break;
|
||||
|
||||
case ORTE_DAEMON_ABORT_PROCS_CALLED:
|
||||
if (orte_debug_daemons_flag) {
|
||||
opal_output(0, "%s orted_cmd: received abort_procs report",
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user