Clean up some missed updates to orte_wait_cb, as its parameters have changed
Refs trac:4717. This commit was SVN r32025. The following Trac tickets were found above: Ticket 4717 --> https://svn.open-mpi.org/trac/ompi/ticket/4717
Этот коммит содержится в:
родитель
5dbf4a62c4
Коммит
8e7c0257f0
@ -107,7 +107,7 @@ orte_plm_base_module_t orte_plm_alps_module = {
|
|||||||
/*
|
/*
|
||||||
* Local variables
|
* Local variables
|
||||||
*/
|
*/
|
||||||
static pid_t alps_pid = 0;
|
static orte_proc_t *alpsrun = NULL;
|
||||||
static bool failed_launch;
|
static bool failed_launch;
|
||||||
static void launch_daemons(int fd, short args, void *cbdata);
|
static void launch_daemons(int fd, short args, void *cbdata);
|
||||||
|
|
||||||
@ -462,8 +462,10 @@ static int plm_alps_terminate_orteds(void)
|
|||||||
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
|
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
|
||||||
* messages
|
* messages
|
||||||
*/
|
*/
|
||||||
orte_wait_cb_cancel(alps_pid);
|
if (NULL != alpsrun) {
|
||||||
|
orte_wait_cb_cancel(alpsrun);
|
||||||
|
}
|
||||||
|
|
||||||
/* now tell them to die */
|
/* now tell them to die */
|
||||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -478,8 +480,8 @@ static int plm_alps_terminate_orteds(void)
|
|||||||
*/
|
*/
|
||||||
static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal)
|
static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||||
{
|
{
|
||||||
if (0 != alps_pid) {
|
if (NULL != alpsrun && 0 != alpsrun->pid) {
|
||||||
kill(alps_pid, (int)signal);
|
kill(alpsrun->pid, (int)signal);
|
||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -489,6 +491,10 @@ static int plm_alps_finalize(void)
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
if (NULL != alpsrun) {
|
||||||
|
OBJ_RELEASE(alpsrun);
|
||||||
|
}
|
||||||
|
|
||||||
/* cleanup any pending recvs */
|
/* cleanup any pending recvs */
|
||||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
|
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -498,7 +504,7 @@ static int plm_alps_finalize(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
static void alps_wait_cb(orte_proc_t *proc, void* cbdata){
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
/* According to the ALPS folks, alps always returns the highest exit
|
/* According to the ALPS folks, alps always returns the highest exit
|
||||||
@ -518,7 +524,7 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
|||||||
*/
|
*/
|
||||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||||
|
|
||||||
if (0 != status) {
|
if (0 != proc->exit_code) {
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
/* report that the daemon has failed so we break out of the daemon
|
/* report that the daemon has failed so we break out of the daemon
|
||||||
* callback receive and exit
|
* callback receive and exit
|
||||||
@ -531,7 +537,6 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
|||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -551,6 +556,11 @@ static int plm_alps_start_proc(int argc, char **argv, char **env,
|
|||||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
alpsrun = OBJ_NEW(orte_proc_t);
|
||||||
|
alpsrun->pid = alps_pid;
|
||||||
|
/* setup the waitpid so we can find out if alps succeeds! */
|
||||||
|
orte_wait_cb(alpsrun, alps_wait_cb, NULL);
|
||||||
|
|
||||||
if (0 == alps_pid) { /* child */
|
if (0 == alps_pid) { /* child */
|
||||||
char *bin_base = NULL, *lib_base = NULL;
|
char *bin_base = NULL, *lib_base = NULL;
|
||||||
|
|
||||||
@ -635,8 +645,6 @@ static int plm_alps_start_proc(int argc, char **argv, char **env,
|
|||||||
sides of the fork... */
|
sides of the fork... */
|
||||||
setpgid(alps_pid, alps_pid);
|
setpgid(alps_pid, alps_pid);
|
||||||
|
|
||||||
/* setup the waitpid so we can find out if alps succeeds! */
|
|
||||||
orte_wait_cb(alps_pid, alps_wait_cb, NULL);
|
|
||||||
free(exec_argv);
|
free(exec_argv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -508,7 +508,7 @@ static int plm_slurm_finalize(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
/* According to the SLURM folks, srun always returns the highest exit
|
/* According to the SLURM folks, srun always returns the highest exit
|
||||||
@ -545,7 +545,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
|||||||
/* if this is after launch, then we need to abort only if the status
|
/* if this is after launch, then we need to abort only if the status
|
||||||
* returned is non-zero - i.e., if the orteds exited with an error
|
* returned is non-zero - i.e., if the orteds exited with an error
|
||||||
*/
|
*/
|
||||||
if (0 != status) {
|
if (0 != proc->exit_code) {
|
||||||
/* an orted must have died unexpectedly after launch - report
|
/* an orted must have died unexpectedly after launch - report
|
||||||
* that the daemon has failed so we exit
|
* that the daemon has failed so we exit
|
||||||
*/
|
*/
|
||||||
@ -555,7 +555,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
|||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
||||||
}
|
}
|
||||||
/* otherwise, check to see if this is the primary pid */
|
/* otherwise, check to see if this is the primary pid */
|
||||||
if (primary_srun_pid == pid) {
|
if (primary_srun_pid == proc->pid) {
|
||||||
/* in this case, we just want to fire the proper trigger so
|
/* in this case, we just want to fire the proper trigger so
|
||||||
* mpirun can exit
|
* mpirun can exit
|
||||||
*/
|
*/
|
||||||
@ -567,6 +567,8 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
|||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* done with this dummy */
|
||||||
|
OBJ_RELEASE(proc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -576,6 +578,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
|||||||
int fd;
|
int fd;
|
||||||
int srun_pid;
|
int srun_pid;
|
||||||
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
|
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
|
||||||
|
orte_proc_t *dummy;
|
||||||
|
|
||||||
if (NULL == exec_argv) {
|
if (NULL == exec_argv) {
|
||||||
return ORTE_ERR_NOT_FOUND;
|
return ORTE_ERR_NOT_FOUND;
|
||||||
@ -588,6 +591,12 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
|||||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* setup a dummy proc object to track the srun */
|
||||||
|
dummy = OBJ_NEW(orte_proc_t);
|
||||||
|
dummy->pid = srun_pid;
|
||||||
|
/* setup the waitpid so we can find out if srun succeeds! */
|
||||||
|
orte_wait_cb(dummy, srun_wait_cb, NULL);
|
||||||
|
|
||||||
if (0 == srun_pid) { /* child */
|
if (0 == srun_pid) { /* child */
|
||||||
char *bin_base = NULL, *lib_base = NULL;
|
char *bin_base = NULL, *lib_base = NULL;
|
||||||
|
|
||||||
@ -677,8 +686,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
|||||||
primary_pid_set = true;
|
primary_pid_set = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup the waitpid so we can find out if srun succeeds! */
|
|
||||||
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
|
|
||||||
free(exec_argv);
|
free(exec_argv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1490,6 +1490,7 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
|
|||||||
{
|
{
|
||||||
int ret, exit_status = ORTE_SUCCESS;
|
int ret, exit_status = ORTE_SUCCESS;
|
||||||
char * postfix = NULL;
|
char * postfix = NULL;
|
||||||
|
orte_proc_t *proc;
|
||||||
|
|
||||||
/* Sanity Check */
|
/* Sanity Check */
|
||||||
if( !orte_sstore_stage_enabled_compression ) {
|
if( !orte_sstore_stage_enabled_compression ) {
|
||||||
@ -1531,11 +1532,10 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
|
|||||||
app_info->compress_pid,
|
app_info->compress_pid,
|
||||||
ORTE_NAME_PRINT(&(app_info->name)) ));
|
ORTE_NAME_PRINT(&(app_info->name)) ));
|
||||||
|
|
||||||
if( ORTE_SUCCESS != (ret = orte_wait_cb(app_info->compress_pid, sstore_stage_local_compress_waitpid_cb, app_info) ) ) {
|
proc = OBJ_NEW(orte_proc_t);
|
||||||
ORTE_ERROR_LOG(ret);
|
proc->pid = app_info->compress_pid;
|
||||||
exit_status = ret;
|
|
||||||
goto cleanup;
|
orte_wait_cb(proc, sstore_stage_local_compress_waitpid_cb, app_info);
|
||||||
}
|
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if( NULL != postfix ) {
|
if( NULL != postfix ) {
|
||||||
@ -1546,7 +1546,7 @@ static int start_compression(orte_sstore_stage_local_snapshot_info_t *handle_inf
|
|||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void sstore_stage_local_compress_waitpid_cb(pid_t pid, int status, void* cbdata)
|
static void sstore_stage_local_compress_waitpid_cb(orte_proc_t *proc, void* cbdata)
|
||||||
{
|
{
|
||||||
orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL;
|
orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL;
|
||||||
|
|
||||||
@ -1558,6 +1558,7 @@ static void sstore_stage_local_compress_waitpid_cb(pid_t pid, int status, void*
|
|||||||
ORTE_NAME_PRINT(&(app_info->name)) ));
|
ORTE_NAME_PRINT(&(app_info->name)) ));
|
||||||
|
|
||||||
app_info->compress_pid = 0;
|
app_info->compress_pid = 0;
|
||||||
|
OBJ_RELEASE(proc);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int wait_all_compressed(orte_sstore_stage_local_snapshot_info_t *handle_info)
|
static int wait_all_compressed(orte_sstore_stage_local_snapshot_info_t *handle_info)
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user