Clean up some missed updates to orte_wait_cb usage, as its parameters have changed
Refs trac:4717. This commit was SVN r32025. The following Trac tickets were found above: Ticket 4717 --> https://svn.open-mpi.org/trac/ompi/ticket/4717
Этот коммит содержится в:
родитель
5dbf4a62c4
Коммит
8e7c0257f0
@ -107,7 +107,7 @@ orte_plm_base_module_t orte_plm_alps_module = {
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
static pid_t alps_pid = 0;
|
||||
static orte_proc_t *alpsrun = NULL;
|
||||
static bool failed_launch;
|
||||
static void launch_daemons(int fd, short args, void *cbdata);
|
||||
|
||||
@ -462,8 +462,10 @@ static int plm_alps_terminate_orteds(void)
|
||||
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
|
||||
* messages
|
||||
*/
|
||||
orte_wait_cb_cancel(alps_pid);
|
||||
|
||||
if (NULL != alpsrun) {
|
||||
orte_wait_cb_cancel(alpsrun);
|
||||
}
|
||||
|
||||
/* now tell them to die */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -478,8 +480,8 @@ static int plm_alps_terminate_orteds(void)
|
||||
*/
|
||||
static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
{
|
||||
if (0 != alps_pid) {
|
||||
kill(alps_pid, (int)signal);
|
||||
if (NULL != alpsrun && 0 != alpsrun->pid) {
|
||||
kill(alpsrun->pid, (int)signal);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -489,6 +491,10 @@ static int plm_alps_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (NULL != alpsrun) {
|
||||
OBJ_RELEASE(alpsrun);
|
||||
}
|
||||
|
||||
/* cleanup any pending recvs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -498,7 +504,7 @@ static int plm_alps_finalize(void)
|
||||
}
|
||||
|
||||
|
||||
static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
static void alps_wait_cb(orte_proc_t *proc, void* cbdata){
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* According to the ALPS folks, alps always returns the highest exit
|
||||
@ -518,7 +524,7 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
*/
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
if (0 != status) {
|
||||
if (0 != proc->exit_code) {
|
||||
if (failed_launch) {
|
||||
/* report that the daemon has failed so we break out of the daemon
|
||||
* callback receive and exit
|
||||
@ -531,7 +537,6 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -551,6 +556,11 @@ static int plm_alps_start_proc(int argc, char **argv, char **env,
|
||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
}
|
||||
|
||||
alpsrun = OBJ_NEW(orte_proc_t);
|
||||
alpsrun->pid = alps_pid;
|
||||
/* setup the waitpid so we can find out if alps succeeds! */
|
||||
orte_wait_cb(alpsrun, alps_wait_cb, NULL);
|
||||
|
||||
if (0 == alps_pid) { /* child */
|
||||
char *bin_base = NULL, *lib_base = NULL;
|
||||
|
||||
@ -635,8 +645,6 @@ static int plm_alps_start_proc(int argc, char **argv, char **env,
|
||||
sides of the fork... */
|
||||
setpgid(alps_pid, alps_pid);
|
||||
|
||||
/* setup the waitpid so we can find out if alps succeeds! */
|
||||
orte_wait_cb(alps_pid, alps_wait_cb, NULL);
|
||||
free(exec_argv);
|
||||
}
|
||||
|
||||
|
@ -508,7 +508,7 @@ static int plm_slurm_finalize(void)
|
||||
}
|
||||
|
||||
|
||||
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* According to the SLURM folks, srun always returns the highest exit
|
||||
@ -545,7 +545,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
/* if this is after launch, then we need to abort only if the status
|
||||
* returned is non-zero - i.e., if the orteds exited with an error
|
||||
*/
|
||||
if (0 != status) {
|
||||
if (0 != proc->exit_code) {
|
||||
/* an orted must have died unexpectedly after launch - report
|
||||
* that the daemon has failed so we exit
|
||||
*/
|
||||
@ -555,7 +555,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
||||
}
|
||||
/* otherwise, check to see if this is the primary pid */
|
||||
if (primary_srun_pid == pid) {
|
||||
if (primary_srun_pid == proc->pid) {
|
||||
/* in this case, we just want to fire the proper trigger so
|
||||
* mpirun can exit
|
||||
*/
|
||||
@ -567,6 +567,8 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
}
|
||||
/* done with this dummy */
|
||||
OBJ_RELEASE(proc);
|
||||
}
|
||||
|
||||
|
||||
@ -576,6 +578,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
||||
int fd;
|
||||
int srun_pid;
|
||||
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
|
||||
orte_proc_t *dummy;
|
||||
|
||||
if (NULL == exec_argv) {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
@ -588,6 +591,12 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
}
|
||||
|
||||
/* setup a dummy proc object to track the srun */
|
||||
dummy = OBJ_NEW(orte_proc_t);
|
||||
dummy->pid = srun_pid;
|
||||
/* setup the waitpid so we can find out if srun succeeds! */
|
||||
orte_wait_cb(dummy, srun_wait_cb, NULL);
|
||||
|
||||
if (0 == srun_pid) { /* child */
|
||||
char *bin_base = NULL, *lib_base = NULL;
|
||||
|
||||
@ -677,8 +686,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
||||
primary_pid_set = true;
|
||||
}
|
||||
|
||||
/* setup the waitpid so we can find out if srun succeeds! */
|
||||
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
|
||||
free(exec_argv);
|
||||
}
|
||||
|
||||
|
@ -1490,6 +1490,7 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
char * postfix = NULL;
|
||||
orte_proc_t *proc;
|
||||
|
||||
/* Sanity Check */
|
||||
if( !orte_sstore_stage_enabled_compression ) {
|
||||
@ -1531,11 +1532,10 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
|
||||
app_info->compress_pid,
|
||||
ORTE_NAME_PRINT(&(app_info->name)) ));
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_wait_cb(app_info->compress_pid, sstore_stage_local_compress_waitpid_cb, app_info) ) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
proc = OBJ_NEW(orte_proc_t);
|
||||
proc->pid = app_info->compress_pid;
|
||||
|
||||
orte_wait_cb(proc, sstore_stage_local_compress_waitpid_cb, app_info);
|
||||
|
||||
cleanup:
|
||||
if( NULL != postfix ) {
|
||||
@ -1546,7 +1546,7 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
static void sstore_stage_local_compress_waitpid_cb(pid_t pid, int status, void* cbdata)
|
||||
static void sstore_stage_local_compress_waitpid_cb(orte_proc_t *proc, void* cbdata)
|
||||
{
|
||||
orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL;
|
||||
|
||||
@ -1558,6 +1558,7 @@ static void sstore_stage_local_compress_waitpid_cb(pid_t pid, int status, void*
|
||||
ORTE_NAME_PRINT(&(app_info->name)) ));
|
||||
|
||||
app_info->compress_pid = 0;
|
||||
OBJ_RELEASE(proc);
|
||||
}
|
||||
|
||||
static int wait_all_compressed(orte_sstore_stage_local_snapshot_info_t *handle_info)
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user