1
1
- better error message if the daemon dies

This commit was SVN r6687.
Этот коммит содержится в:
Tim Prins 2005-07-29 20:02:56 +00:00
родитель 835dad20d5
Коммит 40bf905e8e
2 изменённых файлов: 48 добавлений и 13 удалений

Просмотреть файл

@ -289,11 +289,19 @@ static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data)
if(!mca_pls_bproc_component.done_launching) {
/* if a daemon exits before we are done launching the user apps we send a
* message to ourselves so we will break out of the receive loop and exit */
int rc;
int32_t src = -1;
orte_buffer_t ack;
int rc;
int src[4] = {-1, -1};
src[2] = wpid;
src[3] = *(int *) data;
if(WIFSIGNALED(status)) {
src[1] = WTERMSIG(status);
}
OBJ_CONSTRUCT(&ack, orte_buffer_t);
orte_dps.pack(&ack, &src, 1, ORTE_INT32);
rc = orte_dps.pack(&ack, &src, 4, ORTE_INT);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
rc = mca_oob_send_packed(MCA_OOB_NAME_SELF, &ack, MCA_OOB_TAG_BPROC, 0);
if(0 > rc) {
ORTE_ERROR_LOG(rc);
@ -366,7 +374,7 @@ static int orte_pls_bproc_launch_app(orte_cellid_t cellid, orte_jobid_t jobid,
int rc, i, j;
int * pids = NULL;
int argc;
int32_t src;
int src[4];
char ** argv = NULL;
char * var, * param;
char * orted_path;
@ -582,7 +590,9 @@ static int orte_pls_bproc_launch_app(orte_cellid_t cellid, orte_jobid_t jobid,
ORTE_ERROR_LOG(rc);
goto cleanup;
}
rc = orte_wait_cb(pids[i], orte_pls_bproc_waitpid_daemon_cb,proc_name);
free(var);
rc = orte_wait_cb(pids[i], orte_pls_bproc_waitpid_daemon_cb,
&node_list[i]);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -598,10 +608,19 @@ static int orte_pls_bproc_launch_app(orte_cellid_t cellid, orte_jobid_t jobid,
ORTE_ERROR_LOG(rc);
goto cleanup;
}
idx = 1;
orte_dps.unpack(&ack, &src, &idx, ORTE_INT32);
if(-1 == src) {
opal_output(0, "pls_bproc: daemon exited unexpectedly\n");
idx = 4;
rc = orte_dps.unpack(&ack, &src, &idx, ORTE_INT);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
if(-1 == src[0]) {
if(-1 == src[1]) {
opal_output(0, "pls_bproc: daemon pid %d exited unexpectedly on "
"node %d\n",src[2], src[3]);
} else {
opal_output(0, "pls_bproc: daemon pid %d exited unexpectedly on "
"node %d on signal %d\n",src[2], src[3], src[1]);
}
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
orte_pls_bproc_terminate_job(daemon_jobid);

Просмотреть файл

@ -65,6 +65,8 @@ static void pls_bproc_orted_delete_dir_tree(char * path);
static int pls_bproc_orted_remove_dir(void);
static void pls_bproc_orted_kill_cb(int status, orte_process_name_t * peer,
orte_buffer_t* buffer, int tag, void* cbdata);
static void pls_bproc_orted_send_cb(int status, orte_process_name_t * peer,
orte_buffer_t* buffer, int tag, void* cbdata);
/**
* Creates the passed directory. If the directory already exists, it and its
@ -338,10 +340,20 @@ static int pls_bproc_orted_remove_dir() {
/**
* Callback function for when mpirun sends us a message saying all the child
* procs are done */
* procs are done
*/
static void pls_bproc_orted_kill_cb(int status, orte_process_name_t * peer,
orte_buffer_t* buffer, int tag, void* cbdata) {
/* None of the callback arguments (status, peer, buffer, tag, cbdata) are
 * read here; the callback only signals the component's condition variable. */
/* Take the component lock around the signal. */
OPAL_THREAD_LOCK(&mca_pls_bproc_orted_component.lock);
/* NOTE(review): presumably wakes a thread waiting on this condition
 * elsewhere in the component -- the waiter is not visible here; confirm. */
opal_condition_signal(&mca_pls_bproc_orted_component.condition);
OPAL_THREAD_UNLOCK(&mca_pls_bproc_orted_component.lock);
}
/**
* Callback function for when we tell mpirun we are ready
*/
static void pls_bproc_orted_send_cb(int status, orte_process_name_t * peer,
orte_buffer_t* buffer, int tag, void* cbdata) {
/* No-op: the body is empty, so nothing is done when this callback is invoked
 * and all arguments (status, peer, buffer, tag, cbdata) are ignored.
 * NOTE(review): status is not inspected, so a failed send would go
 * unreported here -- confirm that is acceptable. */
}
/**
@ -358,7 +370,7 @@ int orte_pls_bproc_orted_launch(orte_jobid_t jobid) {
int num_procs = 0;
size_t i;
size_t app_context;
int32_t src = 0;
int src = 0;
orte_buffer_t ack;
char * param;
bool connect_stdin;
@ -478,8 +490,12 @@ int orte_pls_bproc_orted_launch(orte_jobid_t jobid) {
goto cleanup;
}
/* do callback to say we are ready */
orte_dps.pack(&ack, &src, 1, ORTE_INT32);
rc = mca_oob_send_packed(MCA_OOB_NAME_SEED, &ack, MCA_OOB_TAG_BPROC, 0);
rc = orte_dps.pack(&ack, &src, 1, ORTE_INT);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
rc = mca_oob_send_packed_nb(MCA_OOB_NAME_SEED, &ack, MCA_OOB_TAG_BPROC, 0,
pls_bproc_orted_send_cb, NULL);
if (0 > rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;