1
1

Clean up some missed updates to orte_wait_cb callers now that its parameters have changed

Refs trac:4717

This commit was SVN r32025.

The following Trac tickets were found above:
  Ticket 4717 --> https://svn.open-mpi.org/trac/ompi/ticket/4717
Этот коммит содержится в:
Ralph Castain 2014-06-17 23:40:31 +00:00
родитель 5dbf4a62c4
Коммит 8e7c0257f0
3 изменённых файла: 37 добавлений и 21 удаление

Просмотреть файл

@ -107,7 +107,7 @@ orte_plm_base_module_t orte_plm_alps_module = {
/* /*
* Local variables * Local variables
*/ */
static pid_t alps_pid = 0; static orte_proc_t *alpsrun = NULL;
static bool failed_launch; static bool failed_launch;
static void launch_daemons(int fd, short args, void *cbdata); static void launch_daemons(int fd, short args, void *cbdata);
@ -462,7 +462,9 @@ static int plm_alps_terminate_orteds(void)
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error * do NOT ERROR_LOG any return code to avoid confusing, duplicate error
* messages * messages
*/ */
orte_wait_cb_cancel(alps_pid); if (NULL != alpsrun) {
orte_wait_cb_cancel(alpsrun);
}
/* now tell them to die */ /* now tell them to die */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) { if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
@ -478,8 +480,8 @@ static int plm_alps_terminate_orteds(void)
*/ */
static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal) static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal)
{ {
if (0 != alps_pid) { if (NULL != alpsrun && 0 != alpsrun->pid) {
kill(alps_pid, (int)signal); kill(alpsrun->pid, (int)signal);
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -489,6 +491,10 @@ static int plm_alps_finalize(void)
{ {
int rc; int rc;
if (NULL != alpsrun) {
OBJ_RELEASE(alpsrun);
}
/* cleanup any pending recvs */ /* cleanup any pending recvs */
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) { if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -498,7 +504,7 @@ static int plm_alps_finalize(void)
} }
static void alps_wait_cb(pid_t pid, int status, void* cbdata){ static void alps_wait_cb(orte_proc_t *proc, void* cbdata){
orte_job_t *jdata; orte_job_t *jdata;
/* According to the ALPS folks, alps always returns the highest exit /* According to the ALPS folks, alps always returns the highest exit
@ -518,7 +524,7 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
*/ */
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (0 != status) { if (0 != proc->exit_code) {
if (failed_launch) { if (failed_launch) {
/* report that the daemon has failed so we break out of the daemon /* report that the daemon has failed so we break out of the daemon
* callback receive and exit * callback receive and exit
@ -531,7 +537,6 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
} }
} }
} }
@ -551,6 +556,11 @@ static int plm_alps_start_proc(int argc, char **argv, char **env,
return ORTE_ERR_SYS_LIMITS_CHILDREN; return ORTE_ERR_SYS_LIMITS_CHILDREN;
} }
alpsrun = OBJ_NEW(orte_proc_t);
alpsrun->pid = alps_pid;
/* setup the waitpid so we can find out if alps succeeds! */
orte_wait_cb(alpsrun, alps_wait_cb, NULL);
if (0 == alps_pid) { /* child */ if (0 == alps_pid) { /* child */
char *bin_base = NULL, *lib_base = NULL; char *bin_base = NULL, *lib_base = NULL;
@ -635,8 +645,6 @@ static int plm_alps_start_proc(int argc, char **argv, char **env,
sides of the fork... */ sides of the fork... */
setpgid(alps_pid, alps_pid); setpgid(alps_pid, alps_pid);
/* setup the waitpid so we can find out if alps succeeds! */
orte_wait_cb(alps_pid, alps_wait_cb, NULL);
free(exec_argv); free(exec_argv);
} }

Просмотреть файл

@ -508,7 +508,7 @@ static int plm_slurm_finalize(void)
} }
static void srun_wait_cb(pid_t pid, int status, void* cbdata){ static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
orte_job_t *jdata; orte_job_t *jdata;
/* According to the SLURM folks, srun always returns the highest exit /* According to the SLURM folks, srun always returns the highest exit
@ -545,7 +545,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
/* if this is after launch, then we need to abort only if the status /* if this is after launch, then we need to abort only if the status
* returned is non-zero - i.e., if the orteds exited with an error * returned is non-zero - i.e., if the orteds exited with an error
*/ */
if (0 != status) { if (0 != proc->exit_code) {
/* an orted must have died unexpectedly after launch - report /* an orted must have died unexpectedly after launch - report
* that the daemon has failed so we exit * that the daemon has failed so we exit
*/ */
@ -555,7 +555,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
} }
/* otherwise, check to see if this is the primary pid */ /* otherwise, check to see if this is the primary pid */
if (primary_srun_pid == pid) { if (primary_srun_pid == proc->pid) {
/* in this case, we just want to fire the proper trigger so /* in this case, we just want to fire the proper trigger so
* mpirun can exit * mpirun can exit
*/ */
@ -567,6 +567,8 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
} }
} }
/* done with this dummy */
OBJ_RELEASE(proc);
} }
@ -576,6 +578,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
int fd; int fd;
int srun_pid; int srun_pid;
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL); char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
orte_proc_t *dummy;
if (NULL == exec_argv) { if (NULL == exec_argv) {
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
@ -588,6 +591,12 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
return ORTE_ERR_SYS_LIMITS_CHILDREN; return ORTE_ERR_SYS_LIMITS_CHILDREN;
} }
/* setup a dummy proc object to track the srun */
dummy = OBJ_NEW(orte_proc_t);
dummy->pid = srun_pid;
/* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(dummy, srun_wait_cb, NULL);
if (0 == srun_pid) { /* child */ if (0 == srun_pid) { /* child */
char *bin_base = NULL, *lib_base = NULL; char *bin_base = NULL, *lib_base = NULL;
@ -677,8 +686,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
primary_pid_set = true; primary_pid_set = true;
} }
/* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
free(exec_argv); free(exec_argv);
} }

Просмотреть файл

@ -1490,6 +1490,7 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
{ {
int ret, exit_status = ORTE_SUCCESS; int ret, exit_status = ORTE_SUCCESS;
char * postfix = NULL; char * postfix = NULL;
orte_proc_t *proc;
/* Sanity Check */ /* Sanity Check */
if( !orte_sstore_stage_enabled_compression ) { if( !orte_sstore_stage_enabled_compression ) {
@ -1531,11 +1532,10 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
app_info->compress_pid, app_info->compress_pid,
ORTE_NAME_PRINT(&(app_info->name)) )); ORTE_NAME_PRINT(&(app_info->name)) ));
if( ORTE_SUCCESS != (ret = orte_wait_cb(app_info->compress_pid, sstore_stage_local_compress_waitpid_cb, app_info) ) ) { proc = OBJ_NEW(orte_proc_t);
ORTE_ERROR_LOG(ret); proc->pid = app_info->compress_pid;
exit_status = ret;
goto cleanup; orte_wait_cb(proc, sstore_stage_local_compress_waitpid_cb, app_info);
}
cleanup: cleanup:
if( NULL != postfix ) { if( NULL != postfix ) {
@ -1546,7 +1546,7 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
return exit_status; return exit_status;
} }
static void sstore_stage_local_compress_waitpid_cb(pid_t pid, int status, void* cbdata) static void sstore_stage_local_compress_waitpid_cb(orte_proc_t *proc, void* cbdata)
{ {
orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL; orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL;
@ -1558,6 +1558,7 @@ static void sstore_stage_local_compress_waitpid_cb(pid_t pid, int status, void*
ORTE_NAME_PRINT(&(app_info->name)) )); ORTE_NAME_PRINT(&(app_info->name)) ));
app_info->compress_pid = 0; app_info->compress_pid = 0;
OBJ_RELEASE(proc);
} }
static int wait_all_compressed(orte_sstore_stage_local_snapshot_info_t *handle_info) static int wait_all_compressed(orte_sstore_stage_local_snapshot_info_t *handle_info)