1
1

Take another stab at resolving the "called-abort" requirement without getting stuck. Return to "drop a turd" mode, perhaps with a little more intelligence behind it. Don't worry about catching it if session dirs weren't created.

cmr=v1.8.2:reviewer=jsquyres:subject=cleanup MPI_Abort hangs

This commit was SVN r31543.
Этот коммит содержится в:
Ralph Castain 2014-04-29 17:29:46 +00:00
родитель 908e178d08
Коммит e05b88fd18
4 изменённых файлов: 57 добавлений и 93 удалений

Просмотреть файл

@ -392,36 +392,19 @@ int orte_ess_base_app_finalize(void)
*
* However, this causes a problem for OpenRTE as the system truly
* needs to know that this actually IS an abnormal termination.
* To get around the problem, we create a file in the session
* directory - we don't need to put anything in it, though, as its
* very existence simply alerts us that this was an abnormal
* termination.
*
* The session directory finalize system will clean this file up
* for us automagically. However, it needs to stick around long
* enough for our local daemon to find it! So, we do NOT call
* session_dir_finalize here!!! Someone will clean up for us.
* To get around the problem, we drop a marker in the proc-level
* session dir. If session dirs were not allowed, then we just
* ignore this question.
*
* In some cases, however, we DON'T want to create that alert. For
* example, if an orted detects that the HNP has died, then there
* is truly nobody to alert! In these cases, we pass report=false
* to prevent the abort file from being created. This allows the
* session directory tree to cleanly be eliminated.
* to indicate that we don't want the marker dropped.
*/
/*
 * Receive callback registered for the abort acknowledgement.
 *
 * cbdata points at the caller's boolean "still waiting" flag; clearing
 * it lets the waiter proceed. The status, sender, buffer and tag
 * arguments are not examined.
 */
static void report_sync(int status, orte_process_name_t* sender,
                        opal_buffer_t *buffer,
                        orte_rml_tag_t tag, void *cbdata)
{
    /* the ack has arrived - nothing left to wait for */
    *(bool*)cbdata = false;
}
void orte_ess_base_app_abort(int status, bool report)
{
orte_daemon_cmd_flag_t cmd=ORTE_DAEMON_ABORT_CALLED;
opal_buffer_t *buf;
bool sync_waiting = true;
int fd;
char *myfile;
/* Exit - do NOT do a normal finalize as this will very likely
* hang the process. We are aborting due to an abnormal condition
@ -435,30 +418,16 @@ void orte_ess_base_app_abort(int status, bool report)
/* CRS cleanup since it may have a named pipe and thread active */
orte_cr_finalize();
/* If we were asked to report this termination, do so - except
* in cases of abnormal termination ordered by the RTE as
* this means we can't rely on being able to communicate. Also,
* since singletons don't start an HNP unless necessary, and
/* If we were asked to report this termination, do so.
* Since singletons don't start an HNP unless necessary, and
* direct-launched procs don't have daemons at all, only send
* the message if routing is enabled as this indicates we
* have someone to send to
*/
if (report && !orte_abnormal_term_ordered && orte_routing_is_enabled) {
buf = OBJ_NEW(opal_buffer_t);
opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD);
orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL);
OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
"%s orte_ess_app_abort: sent abort msg to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));
/* get the ack - need this to ensure that the sync communication
* gets serviced by the event library on the orted prior to the
* process exiting
*/
sync_waiting = true;
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT,
ORTE_RML_NON_PERSISTENT, report_sync, &sync_waiting);
ORTE_WAIT_FOR_COMPLETION(sync_waiting);
if (report && orte_routing_is_enabled && orte_create_session_dirs) {
myfile = opal_os_path(false, orte_process_info.proc_session_dir, "aborted", NULL);
fd = open(myfile, O_CREAT);
close(fd);
}
/* - Clean out the global structures

Просмотреть файл

@ -1869,39 +1869,6 @@ CLEANUP:
return rc;
}
/*
 * Handle an abort report for the given process.
 *
 * Scans the local children for an entry whose jobid and vpid match
 * proc. On the first match, the child is flagged as aborted and an
 * empty buffer is sent back to proc on ORTE_RML_TAG_ABORT as the
 * acknowledgement. If no child matches, nothing is done.
 *
 * @param proc  name of the process the abort report is for
 */
void orte_odls_base_default_report_abort(orte_process_name_t *proc)
{
    orte_proc_t *kid;
    opal_buffer_t *ack;
    int idx, ret;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s GOT ABORT REPORT FOR %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    for (idx = 0; idx < orte_local_children->size; idx++) {
        kid = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);
        if (NULL == kid) {
            /* empty slot in the pointer array */
            continue;
        }
        if (kid->name.jobid != proc->jobid ||
            kid->name.vpid != proc->vpid) {
            /* some other child */
            continue;
        }

        /* this is the one - remember that it aborted */
        kid->aborted = true;

        /* acknowledge the report */
        ack = OBJ_NEW(opal_buffer_t);
        ret = orte_rml.send_buffer_nb(proc, ack,
                                      ORTE_RML_TAG_ABORT,
                                      orte_rml_send_callback, NULL);
        if (ret < 0) {
            /* the send was not accepted, so the buffer is still ours to release */
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(ack);
        }
        return;
    }
}
/*
* Wait for a callback indicating the child has completed.
*/
@ -1912,6 +1879,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
int i;
orte_job_t *jobdat;
orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
char *abortfile, *jobfam, *job, *vpidstr;
/* find this child */
for (i=0; i < orte_local_children->size; i++) {
@ -1936,8 +1904,8 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
}
opal_output_verbose(5, orte_odls_base_framework.framework_output,
"%s odls:wait_local_proc child process %s pid %ld terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
"%s odls:wait_local_proc child process %s pid %ld terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), (long)pid);
/* if the child was previously flagged as dead, then just
@ -1976,7 +1944,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
}
/* determine the state of this process */
if(WIFEXITED(status)) {
if (WIFEXITED(status)) {
/* set the exit status appropriately */
proc->exit_code = WEXITSTATUS(status);
@ -1985,19 +1953,56 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), proc->exit_code));
if (proc->aborted) {
/* provide a default state */
state = ORTE_PROC_STATE_WAITPID_FIRED;
/* check for the abort marker */
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->name.jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto MOVEON;
}
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->name.jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(jobfam);
goto MOVEON;
}
if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->name.vpid)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(jobfam);
free(job);
goto MOVEON;
}
abortfile = opal_os_path(false, orte_process_info.top_session_dir, jobfam, job, vpidstr, NULL );
if (NULL == abortfile ) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(jobfam);
free(job);
free(vpidstr);
goto MOVEON;
}
free(jobfam);
free(job);
free(vpidstr);
if (access(abortfile, F_OK)) {
unlink(abortfile);
proc->aborted = true;
/* even though the process exited "normally", it happened
* via an orte_abort call, so we need to indicate this was
* an "abnormal" termination.
* via an orte_abort call
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:waitpid_fired child %s died by call to abort",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
state = ORTE_PROC_STATE_CALLED_ABORT;
free(abortfile);
goto MOVEON;
}
free(abortfile);
/* check to see if a sync was required and if it was received */
if (proc->registered) {
if (proc->deregistered || orte_allowed_exit_without_sync || 0 != proc->exit_code) {

Просмотреть файл

@ -109,8 +109,6 @@ orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer,
ORTE_DECLSPEC void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata);
ORTE_DECLSPEC void orte_odls_base_default_report_abort(orte_process_name_t *proc);
/* define a function type to signal a local proc */
typedef int (*orte_odls_base_signal_local_fn_t)(pid_t pid, int signum);

Просмотреть файл

@ -239,14 +239,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
}
break;
case ORTE_DAEMON_ABORT_CALLED:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received abort report",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
orte_odls_base_default_report_abort(sender);
break;
case ORTE_DAEMON_ABORT_PROCS_CALLED:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received abort_procs report",