diff --git a/orte/mca/pls/bproc/pls_bproc.c b/orte/mca/pls/bproc/pls_bproc.c index 3f581462aa..b08c0147d5 100644 --- a/orte/mca/pls/bproc/pls_bproc.c +++ b/orte/mca/pls/bproc/pls_bproc.c @@ -289,11 +289,19 @@ static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data) if(!mca_pls_bproc_component.done_launching) { /* if a daemon exits before we are done launching the user apps we send a * message to ourself so we will break out of the recieve loop and exit */ - int rc; - int32_t src = -1; orte_buffer_t ack; + int rc; + int src[4] = {-1, -1}; + src[2] = wpid; + src[3] = *(int *) data; + if(WIFSIGNALED(status)) { + src[1] = WTERMSIG(status); + } OBJ_CONSTRUCT(&ack, orte_buffer_t); - orte_dps.pack(&ack, &src, 1, ORTE_INT32); + rc = orte_dps.pack(&ack, &src, 4, ORTE_INT); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + } rc = mca_oob_send_packed(MCA_OOB_NAME_SELF, &ack, MCA_OOB_TAG_BPROC, 0); if(0 > rc) { ORTE_ERROR_LOG(rc); @@ -366,7 +374,7 @@ static int orte_pls_bproc_launch_app(orte_cellid_t cellid, orte_jobid_t jobid, int rc, i, j; int * pids = NULL; int argc; - int32_t src; + int src[4]; char ** argv = NULL; char * var, * param; char * orted_path; @@ -582,7 +590,9 @@ static int orte_pls_bproc_launch_app(orte_cellid_t cellid, orte_jobid_t jobid, ORTE_ERROR_LOG(rc); goto cleanup; } - rc = orte_wait_cb(pids[i], orte_pls_bproc_waitpid_daemon_cb,proc_name); + free(var); + rc = orte_wait_cb(pids[i], orte_pls_bproc_waitpid_daemon_cb, + &node_list[i]); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; @@ -598,10 +608,19 @@ static int orte_pls_bproc_launch_app(orte_cellid_t cellid, orte_jobid_t jobid, ORTE_ERROR_LOG(rc); goto cleanup; } - idx = 1; - orte_dps.unpack(&ack, &src, &idx, ORTE_INT32); - if(-1 == src) { - opal_output(0, "pls_bproc: daemon exited unexpectedly\n"); + idx = 4; + rc = orte_dps.unpack(&ack, &src, &idx, ORTE_INT); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + } + if(-1 == src[0]) { + if(-1 == src[1]) { + opal_output(0, "pls_bproc: daemon pid %d exited unexpectedly on " + "node %d\n",src[2], src[3]); + } else { + opal_output(0, "pls_bproc: daemon pid %d exited unexpectedly on " + "node %d on signal %d\n",src[2], src[3], src[1]); + } rc = ORTE_ERROR; ORTE_ERROR_LOG(rc); orte_pls_bproc_terminate_job(daemon_jobid); diff --git a/orte/mca/pls/bproc_orted/pls_bproc_orted.c b/orte/mca/pls/bproc_orted/pls_bproc_orted.c index 837129252d..8110800821 100644 --- a/orte/mca/pls/bproc_orted/pls_bproc_orted.c +++ b/orte/mca/pls/bproc_orted/pls_bproc_orted.c @@ -65,6 +65,8 @@ static void pls_bproc_orted_delete_dir_tree(char * path); static int pls_bproc_orted_remove_dir(void); static void pls_bproc_orted_kill_cb(int status, orte_process_name_t * peer, orte_buffer_t* buffer, int tag, void* cbdata); +static void pls_bproc_orted_send_cb(int status, orte_process_name_t * peer, + orte_buffer_t* buffer, int tag, void* cbdata); /** * Creates the passed directory. If the directory already exists, it and its @@ -338,10 +340,20 @@ static int pls_bproc_orted_remove_dir() { /** * Callback function for when mpirun sends us a message saying all the child - * procs are done */ + * procs are done + */ static void pls_bproc_orted_kill_cb(int status, orte_process_name_t * peer, orte_buffer_t* buffer, int tag, void* cbdata) { + OPAL_THREAD_LOCK(&mca_pls_bproc_orted_component.lock); opal_condition_signal(&mca_pls_bproc_orted_component.condition); + OPAL_THREAD_UNLOCK(&mca_pls_bproc_orted_component.lock); +} + +/** + * Callback function for when we tell mpirun we are ready + */ +static void pls_bproc_orted_send_cb(int status, orte_process_name_t * peer, + orte_buffer_t* buffer, int tag, void* cbdata) { } /** @@ -358,7 +370,7 @@ int orte_pls_bproc_orted_launch(orte_jobid_t jobid) { int num_procs = 0; size_t i; size_t app_context; - int32_t src = 0; + int src = 0; orte_buffer_t ack; char * param; bool connect_stdin; @@ -478,8 +490,12 @@ int orte_pls_bproc_orted_launch(orte_jobid_t jobid) { goto cleanup; } /* do callback to say we are ready */ - orte_dps.pack(&ack, &src, 1, ORTE_INT32); - rc = mca_oob_send_packed(MCA_OOB_NAME_SEED, &ack, MCA_OOB_TAG_BPROC, 0); + rc = orte_dps.pack(&ack, &src, 1, ORTE_INT); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + } + rc = mca_oob_send_packed_nb(MCA_OOB_NAME_SEED, &ack, MCA_OOB_TAG_BPROC, 0, + pls_bproc_orted_send_cb, NULL); if (0 > rc) { ORTE_ERROR_LOG(rc); goto cleanup;