1
1

Clean up some call sites that missed the update to orte_wait_cb after its parameters changed

Refs trac:4717

This commit was SVN r32025.

The following Trac tickets were found above:
  Ticket 4717 --> https://svn.open-mpi.org/trac/ompi/ticket/4717
Этот коммит содержится в:
Ralph Castain 2014-06-17 23:40:31 +00:00
родитель 5dbf4a62c4
Коммит 8e7c0257f0
3 изменённых файлов: 37 добавлений и 21 удалений

Просмотреть файл

@ -107,7 +107,7 @@ orte_plm_base_module_t orte_plm_alps_module = {
/*
* Local variables
*/
static pid_t alps_pid = 0;
static orte_proc_t *alpsrun = NULL;
static bool failed_launch;
static void launch_daemons(int fd, short args, void *cbdata);
@ -462,8 +462,10 @@ static int plm_alps_terminate_orteds(void)
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
* messages
*/
orte_wait_cb_cancel(alps_pid);
if (NULL != alpsrun) {
orte_wait_cb_cancel(alpsrun);
}
/* now tell them to die */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
@ -478,8 +480,8 @@ static int plm_alps_terminate_orteds(void)
*/
static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal)
{
if (0 != alps_pid) {
kill(alps_pid, (int)signal);
if (NULL != alpsrun && 0 != alpsrun->pid) {
kill(alpsrun->pid, (int)signal);
}
return ORTE_SUCCESS;
}
@ -489,6 +491,10 @@ static int plm_alps_finalize(void)
{
int rc;
if (NULL != alpsrun) {
OBJ_RELEASE(alpsrun);
}
/* cleanup any pending recvs */
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
ORTE_ERROR_LOG(rc);
@ -498,7 +504,7 @@ static int plm_alps_finalize(void)
}
static void alps_wait_cb(pid_t pid, int status, void* cbdata){
static void alps_wait_cb(orte_proc_t *proc, void* cbdata){
orte_job_t *jdata;
/* According to the ALPS folks, alps always returns the highest exit
@ -518,7 +524,7 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
*/
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (0 != status) {
if (0 != proc->exit_code) {
if (failed_launch) {
/* report that the daemon has failed so we break out of the daemon
* callback receive and exit
@ -531,7 +537,6 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
}
}
}
@ -551,6 +556,11 @@ static int plm_alps_start_proc(int argc, char **argv, char **env,
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
alpsrun = OBJ_NEW(orte_proc_t);
alpsrun->pid = alps_pid;
/* setup the waitpid so we can find out if alps succeeds! */
orte_wait_cb(alpsrun, alps_wait_cb, NULL);
if (0 == alps_pid) { /* child */
char *bin_base = NULL, *lib_base = NULL;
@ -635,8 +645,6 @@ static int plm_alps_start_proc(int argc, char **argv, char **env,
sides of the fork... */
setpgid(alps_pid, alps_pid);
/* setup the waitpid so we can find out if alps succeeds! */
orte_wait_cb(alps_pid, alps_wait_cb, NULL);
free(exec_argv);
}

Просмотреть файл

@ -508,7 +508,7 @@ static int plm_slurm_finalize(void)
}
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
orte_job_t *jdata;
/* According to the SLURM folks, srun always returns the highest exit
@ -545,7 +545,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
/* if this is after launch, then we need to abort only if the status
* returned is non-zero - i.e., if the orteds exited with an error
*/
if (0 != status) {
if (0 != proc->exit_code) {
/* an orted must have died unexpectedly after launch - report
* that the daemon has failed so we exit
*/
@ -555,7 +555,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
}
/* otherwise, check to see if this is the primary pid */
if (primary_srun_pid == pid) {
if (primary_srun_pid == proc->pid) {
/* in this case, we just want to fire the proper trigger so
* mpirun can exit
*/
@ -567,6 +567,8 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
}
/* done with this dummy */
OBJ_RELEASE(proc);
}
@ -576,6 +578,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
int fd;
int srun_pid;
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
orte_proc_t *dummy;
if (NULL == exec_argv) {
return ORTE_ERR_NOT_FOUND;
@ -588,6 +591,12 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
/* setup a dummy proc object to track the srun */
dummy = OBJ_NEW(orte_proc_t);
dummy->pid = srun_pid;
/* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(dummy, srun_wait_cb, NULL);
if (0 == srun_pid) { /* child */
char *bin_base = NULL, *lib_base = NULL;
@ -677,8 +686,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
primary_pid_set = true;
}
/* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
free(exec_argv);
}

Просмотреть файл

@ -1490,6 +1490,7 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
{
int ret, exit_status = ORTE_SUCCESS;
char * postfix = NULL;
orte_proc_t *proc;
/* Sanity Check */
if( !orte_sstore_stage_enabled_compression ) {
@ -1531,11 +1532,10 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
app_info->compress_pid,
ORTE_NAME_PRINT(&(app_info->name)) ));
if( ORTE_SUCCESS != (ret = orte_wait_cb(app_info->compress_pid, sstore_stage_local_compress_waitpid_cb, app_info) ) ) {
ORTE_ERROR_LOG(ret);
exit_status = ret;
goto cleanup;
}
proc = OBJ_NEW(orte_proc_t);
proc->pid = app_info->compress_pid;
orte_wait_cb(proc, sstore_stage_local_compress_waitpid_cb, app_info);
cleanup:
if( NULL != postfix ) {
@ -1546,7 +1546,7 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
return exit_status;
}
static void sstore_stage_local_compress_waitpid_cb(pid_t pid, int status, void* cbdata)
static void sstore_stage_local_compress_waitpid_cb(orte_proc_t *proc, void* cbdata)
{
orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL;
@ -1558,6 +1558,7 @@ static void sstore_stage_local_compress_waitpid_cb(pid_t pid, int status, void*
ORTE_NAME_PRINT(&(app_info->name)) ));
app_info->compress_pid = 0;
OBJ_RELEASE(proc);
}
static int wait_all_compressed(orte_sstore_stage_local_snapshot_info_t *handle_info)