properly kill off the daemons.
This commit was SVN r6486.
Этот коммит содержится в:
родитель
5ebfad2f7d
Коммит
3295975cea
@ -59,7 +59,7 @@ orte_pls_base_module_t orte_pls_bproc_module = {
|
||||
orte_pls_bproc_finalize
|
||||
};
|
||||
|
||||
static int * orte_pls_bproc_daemon_pids = NULL;
|
||||
static orte_process_name_t ** orte_pls_bproc_daemon_names = NULL;
|
||||
static int orte_pls_bproc_num_daemons = 0;
|
||||
static int orte_pls_bproc_node_array(orte_rmaps_base_map_t* map,
|
||||
int ** node_array, int * node_array_len);
|
||||
@ -262,14 +262,20 @@ static void orte_pls_bproc_waitpid_cb(pid_t wpid, int status, void *data) {
|
||||
opal_output(0, "pls_bproc_waitpid_cb: error: process count is less than 0.\n");
|
||||
} else if(0 == mca_pls_bproc_component.num_procs &&
|
||||
mca_pls_bproc_component.done_launching) {
|
||||
orte_buffer_t ack;
|
||||
OBJ_CONSTRUCT(&ack, orte_buffer_t);
|
||||
rc = orte_dps.pack(&ack, &i, 1, ORTE_BYTE);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
for(i = 0; i < orte_pls_bproc_num_daemons; i++) {
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
printf("killing daemon pid %d\n", orte_pls_bproc_daemon_pids[i]);
|
||||
}
|
||||
if(0 != kill(orte_pls_bproc_daemon_pids[i], 15)) {
|
||||
perror("kill of orted failed");
|
||||
rc = mca_oob_send_packed(orte_pls_bproc_daemon_names[i], &ack,
|
||||
MCA_OOB_TAG_BPROC, 0);
|
||||
if (0 > rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&ack);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&mca_pls_bproc_component.lock);
|
||||
}
|
||||
@ -323,8 +329,9 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(NULL == (orte_pls_bproc_daemon_pids = (int*)malloc(sizeof(int) * num_daemons))) {
|
||||
orte_pls_bproc_daemon_names = (orte_process_name_t **)
|
||||
malloc(sizeof(orte_process_name_t*)*num_daemons);
|
||||
if(NULL == orte_pls_bproc_daemon_names) {
|
||||
ORTE_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
||||
goto cleanup;
|
||||
}
|
||||
@ -475,8 +482,8 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid,
|
||||
}
|
||||
|
||||
/* launch the daemons */
|
||||
rc = bproc_vexecmove(num_daemons, node_list, orte_pls_bproc_daemon_pids,
|
||||
exec_path, argv, map->app->env);
|
||||
rc = bproc_vexecmove(num_daemons, node_list, pids, exec_path, argv,
|
||||
map->app->env);
|
||||
|
||||
if(rc != num_daemons) {
|
||||
opal_output(0, "Failed to launch proper number of daemons.");
|
||||
@ -484,15 +491,22 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid,
|
||||
goto cleanup;
|
||||
}
|
||||
for(i = 0; i < num_daemons; i++) {
|
||||
if(0 >= orte_pls_bproc_daemon_pids[i]) {
|
||||
if(0 >= pids[i]) {
|
||||
opal_output(0, "pls_bproc: failed to launch all daemons. "
|
||||
"Daemon pid was %d on node %d and errno %d\n"
|
||||
"You may need to set the pls_bproc_orted paramater to "
|
||||
"point to where orted is installed",
|
||||
orte_pls_bproc_daemon_pids[i], node_list[i], errno);
|
||||
pids[i], node_list[i], errno);
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
} else {
|
||||
rc = orte_ns.create_process_name(&orte_pls_bproc_daemon_names[i],
|
||||
cellid, daemon_jobid, daemon_vpid_start + j);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
orte_pls_bproc_num_daemons = num_daemons;
|
||||
@ -501,7 +515,7 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid,
|
||||
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "PLS_BPROC DEBUG: %d daemons launched. First pid: %d\n",
|
||||
rc, *orte_pls_bproc_daemon_pids);
|
||||
rc, *pids);
|
||||
}
|
||||
|
||||
/* wait for communication back */
|
||||
|
@ -26,6 +26,7 @@
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/util/os_create_dirpath.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/output.h"
|
||||
@ -59,7 +60,8 @@ static int pls_bproc_orted_link_pipes(int proc_rank, orte_jobid_t jobid, int * f
|
||||
bool connect_stdin, size_t app_context);
|
||||
static void pls_bproc_orted_delete_dir_tree(char * path);
|
||||
static int pls_bproc_orted_remove_dir(void);
|
||||
|
||||
static void pls_bproc_orted_kill_cb(int status, orte_process_name_t * peer,
|
||||
orte_buffer_t* buffer, int tag, void* cbdata);
|
||||
/**
|
||||
* Creates the passed directory. If the directory already exists, it and its
|
||||
* contents will be deleted then the directory will be created.
|
||||
@ -328,6 +330,13 @@ static int pls_bproc_orted_remove_dir() {
|
||||
free(frontend);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
/**
|
||||
* Callback function for when mpirun sends us a message saying all the child
|
||||
* procs are done.
|
||||
|
||||
* It is posted with mca_oob_recv_packed_nb() for MCA_OOB_TAG_BPROC messages
|
||||
* from MCA_OOB_NAME_SEED. The only action taken is to signal
|
||||
* mca_pls_bproc_orted_component.condition; the status, peer, buffer, tag
|
||||
* and cbdata arguments are not read.
|
||||
* NOTE(review): presumably this wakes a thread waiting on that condition;
|
||||
* the waiter is not visible here -- confirm. */
|
||||
static void pls_bproc_orted_kill_cb(int status, orte_process_name_t * peer,
|
||||
orte_buffer_t* buffer, int tag, void* cbdata) {
|
||||
opal_condition_signal(&mca_pls_bproc_orted_component.condition);
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup io for the current node, then tell orterun we are ready for the actual
|
||||
@ -442,7 +451,14 @@ int orte_pls_bproc_orted_launch(orte_jobid_t jobid)
|
||||
}
|
||||
|
||||
mca_pls_bproc_orted_component.num_procs = num_procs;
|
||||
|
||||
|
||||
/* post receive for termination signal */
|
||||
rc = mca_oob_recv_packed_nb(MCA_OOB_NAME_SEED, MCA_OOB_TAG_BPROC, 0,
|
||||
pls_bproc_orted_kill_cb, NULL);
|
||||
if (0 > rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* do callback to say we are ready */
|
||||
orte_dps.pack(&ack, &src, 1, ORTE_INT32);
|
||||
rc = mca_oob_send_packed(MCA_OOB_NAME_SEED, &ack, MCA_OOB_TAG_BPROC, 0);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user