Some minor cleanups. Get singletons working. Clean up abort handling so that an abort gets properly identified.
This commit was SVN r26261.
Этот коммит содержится в:
родитель
53bbcf4b5b
Коммит
14d5525fb1
@ -390,6 +390,10 @@ void orte_ess_base_app_abort(int status, bool report)
|
||||
ORTE_RML_NON_PERSISTENT, report_sync, NULL)) {
|
||||
return;
|
||||
}
|
||||
while (!sync_recvd) {
|
||||
opal_progress();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* - Clean out the global structures
|
||||
|
@ -242,7 +242,12 @@ static int rte_init(void)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/* set the collective ids */
|
||||
orte_process_info.peer_modex = 0;
|
||||
orte_process_info.peer_init_barrier = 1;
|
||||
orte_process_info.peer_fini_barrier = 2;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1889,7 +1889,11 @@ void orte_odls_base_default_report_abort(orte_process_name_t *proc)
|
||||
orte_proc_t *child;
|
||||
opal_buffer_t *buffer;
|
||||
int rc, i;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s GOT ABORT REPORT FOR %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
|
||||
/* find this child */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
@ -1897,11 +1901,9 @@ void orte_odls_base_default_report_abort(orte_process_name_t *proc)
|
||||
continue;
|
||||
}
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, proc, &child->name)) { /* found it */
|
||||
child->state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||
if (proc->jobid == child->name.jobid &&
|
||||
proc->vpid == child->name.vpid) { /* found it */
|
||||
child->aborted = true;
|
||||
/* send ack */
|
||||
buffer = OBJ_NEW(opal_buffer_t);
|
||||
if (0 > (rc = orte_rml.send_buffer_nb(proc, buffer,
|
||||
@ -1993,7 +1995,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
/* set the exit status appropriately */
|
||||
proc->exit_code = WEXITSTATUS(status);
|
||||
|
||||
if (ORTE_PROC_STATE_CALLED_ABORT == proc->state) {
|
||||
if (proc->aborted) {
|
||||
/* even though the process exited "normally", it happened
|
||||
* via an orte_abort call, so we need to indicate this was
|
||||
* an "abnormal" termination.
|
||||
|
@ -72,6 +72,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/grpcomm/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
@ -510,7 +511,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
proc = OBJ_NEW(orte_proc_t);
|
||||
proc->name.jobid = jdata->jobid;
|
||||
proc->name.vpid = 0;
|
||||
|
||||
proc->alive = true;
|
||||
proc->state = ORTE_PROC_STATE_RUNNING;
|
||||
proc->app_idx = 0;
|
||||
/* obviously, they are on my node */
|
||||
@ -519,8 +520,16 @@ int orte_daemon(int argc, char *argv[])
|
||||
OBJ_RETAIN(node); /* keep accounting straight */
|
||||
opal_pointer_array_add(jdata->procs, proc);
|
||||
jdata->num_procs = 1;
|
||||
/* and obviously they are one of my local procs */
|
||||
OBJ_RETAIN(proc);
|
||||
opal_pointer_array_add(orte_local_children, proc);
|
||||
jdata->num_local_procs = 1;
|
||||
|
||||
/* the singleton will use the first three collectives
|
||||
* for its modex/barriers
|
||||
*/
|
||||
orte_grpcomm_base.coll_id += 3;
|
||||
|
||||
/* need to setup a pidmap for it */
|
||||
buffer = OBJ_NEW(opal_buffer_t);
|
||||
opal_dss.pack(buffer, &jdata->jobid, 1, ORTE_JOBID); /* jobid */
|
||||
@ -603,7 +612,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
orted_globals.singleton_died_pipe,
|
||||
OPAL_EV_READ,
|
||||
pipe_closed,
|
||||
&orted_globals.singleton_died_pipe);
|
||||
pipe_handler);
|
||||
opal_event_add(pipe_handler, NULL);
|
||||
}
|
||||
|
||||
|
@ -845,6 +845,7 @@ static void orte_proc_construct(orte_proc_t* proc)
|
||||
proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF;
|
||||
proc->state = ORTE_PROC_STATE_UNDEF;
|
||||
proc->alive = false;
|
||||
proc->aborted = false;
|
||||
proc->app_idx = 0;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
proc->locale = NULL;
|
||||
|
@ -442,6 +442,8 @@ struct orte_proc_t {
|
||||
* and has not yet terminated
|
||||
*/
|
||||
bool alive;
|
||||
/* flag if it called abort */
|
||||
bool aborted;
|
||||
/* exit code */
|
||||
orte_exit_code_t exit_code;
|
||||
/* the app_context that generated this proc */
|
||||
|
@ -935,9 +935,6 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
|
||||
DONE:
|
||||
/* update the exit status, in case it wasn't done */
|
||||
ORTE_UPDATE_EXIT_STATUS(orte_exit_status);
|
||||
|
||||
/* cleanup and leave */
|
||||
orte_finalize();
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user