1
1

Some minor cleanups. Get singletons working. Clean up abort handling so it gets properly identified.

This commit was SVN r26261.
Этот коммит содержится в:
Ralph Castain 2012-04-10 19:08:54 +00:00
родитель 53bbcf4b5b
Коммит 14d5525fb1
7 изменённых файлов: 33 добавлений и 13 удалений

Просмотреть файл

@ -390,6 +390,10 @@ void orte_ess_base_app_abort(int status, bool report)
ORTE_RML_NON_PERSISTENT, report_sync, NULL)) {
return;
}
while (!sync_recvd) {
opal_progress();
}
return;
}
/* - Clean out the global structures

Просмотреть файл

@ -242,7 +242,12 @@ static int rte_init(void)
ORTE_ERROR_LOG(rc);
return rc;
}
/* set the collective ids */
orte_process_info.peer_modex = 0;
orte_process_info.peer_init_barrier = 1;
orte_process_info.peer_fini_barrier = 2;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1889,7 +1889,11 @@ void orte_odls_base_default_report_abort(orte_process_name_t *proc)
orte_proc_t *child;
opal_buffer_t *buffer;
int rc, i;
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s GOT ABORT REPORT FOR %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/* find this child */
for (i=0; i < orte_local_children->size; i++) {
@ -1897,11 +1901,9 @@ void orte_odls_base_default_report_abort(orte_process_name_t *proc)
continue;
}
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, proc, &child->name)) { /* found it */
child->state = ORTE_PROC_STATE_CALLED_ABORT;
if (proc->jobid == child->name.jobid &&
proc->vpid == child->name.vpid) { /* found it */
child->aborted = true;
/* send ack */
buffer = OBJ_NEW(opal_buffer_t);
if (0 > (rc = orte_rml.send_buffer_nb(proc, buffer,
@ -1993,7 +1995,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
/* set the exit status appropriately */
proc->exit_code = WEXITSTATUS(status);
if (ORTE_PROC_STATE_CALLED_ABORT == proc->state) {
if (proc->aborted) {
/* even though the process exited "normally", it happened
* via an orte_abort call, so we need to indicate this was
* an "abnormal" termination.

Просмотреть файл

@ -72,6 +72,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/odls/odls.h"
@ -510,7 +511,7 @@ int orte_daemon(int argc, char *argv[])
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = jdata->jobid;
proc->name.vpid = 0;
proc->alive = true;
proc->state = ORTE_PROC_STATE_RUNNING;
proc->app_idx = 0;
/* obviously, they are on my node */
@ -519,8 +520,16 @@ int orte_daemon(int argc, char *argv[])
OBJ_RETAIN(node); /* keep accounting straight */
opal_pointer_array_add(jdata->procs, proc);
jdata->num_procs = 1;
/* and obviously they are one of my local procs */
OBJ_RETAIN(proc);
opal_pointer_array_add(orte_local_children, proc);
jdata->num_local_procs = 1;
/* the singleton will use the first three collectives
* for its modex/barriers
*/
orte_grpcomm_base.coll_id += 3;
/* need to setup a pidmap for it */
buffer = OBJ_NEW(opal_buffer_t);
opal_dss.pack(buffer, &jdata->jobid, 1, ORTE_JOBID); /* jobid */
@ -603,7 +612,7 @@ int orte_daemon(int argc, char *argv[])
orted_globals.singleton_died_pipe,
OPAL_EV_READ,
pipe_closed,
&orted_globals.singleton_died_pipe);
pipe_handler);
opal_event_add(pipe_handler, NULL);
}

Просмотреть файл

@ -845,6 +845,7 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
proc->state = ORTE_PROC_STATE_UNDEF;
proc->alive = false;
proc->aborted = false;
proc->app_idx = 0;
#if OPAL_HAVE_HWLOC
proc->locale = NULL;

Просмотреть файл

@ -442,6 +442,8 @@ struct orte_proc_t {
* and has not yet terminated
*/
bool alive;
/* flag if it called abort */
bool aborted;
/* exit code */
orte_exit_code_t exit_code;
/* the app_context that generated this proc */

Просмотреть файл

@ -935,9 +935,6 @@ int orterun(int argc, char *argv[])
}
DONE:
/* update the exit status, in case it wasn't done */
ORTE_UPDATE_EXIT_STATUS(orte_exit_status);
/* cleanup and leave */
orte_finalize();