Now that we understand why it failed before (thanks George!), go back to relaying cmds before processing them on the orteds, so that we pick up some speed by executing cmds in parallel. Modify the termination system accordingly.
Clean up the termination in orterun when abnormally commanded via ctrl-c. We can just call terminate_orteds directly, as the orteds always kill any lingering local procs before exiting - no need to do the two-step cha-cha. This commit was SVN r21123.
Этот коммит содержится в:
родитель
80a1ae45ba
Коммит
a74b74a68c
@ -81,8 +81,6 @@ static int process_commands(orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag);
|
||||
|
||||
static bool exit_reqd;
|
||||
|
||||
/* instantiate this - it is shared via orted.h */
|
||||
struct timeval orte_daemon_msg_recvd;
|
||||
|
||||
@ -123,7 +121,6 @@ static void send_relay(opal_buffer_t *buf)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_VPID_PRINT(nm->vpid)));
|
||||
|
||||
/* retain buffer so callback function can release it */
|
||||
target.vpid = nm->vpid;
|
||||
if (0 > (ret = orte_rml.send_buffer(&target, buf, ORTE_RML_TAG_DAEMON, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -175,6 +172,7 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
|
||||
orte_message_event_t *mev = (orte_message_event_t*)data;
|
||||
orte_process_name_t *sender = &(mev->sender);
|
||||
opal_buffer_t *buffer = mev->buffer;
|
||||
opal_buffer_t relay_buf;
|
||||
orte_rml_tag_t tag = mev->tag, target_tag;
|
||||
orte_jobid_t job;
|
||||
int ret;
|
||||
@ -273,26 +271,34 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
|
||||
}
|
||||
/* is this an add-procs cmd? */
|
||||
if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
|
||||
/* yes - then it contains daemon update info - process it */
|
||||
/* store the time the cmd was recvd */
|
||||
if (orte_timing) {
|
||||
orte_daemon_msg_recvd.tv_sec = mesg_recvd.tv_sec;
|
||||
orte_daemon_msg_recvd.tv_usec = mesg_recvd.tv_usec;
|
||||
}
|
||||
/* cmd contains daemon update info - process it */
|
||||
if (ORTE_SUCCESS != (ret = orte_odls_base_default_update_daemon_info(buffer))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* flag this location */
|
||||
save_rel = buffer->unpack_ptr - buffer->base_ptr;
|
||||
/* store the time the cmd was recvd */
|
||||
if (orte_timing) {
|
||||
orte_daemon_msg_recvd.tv_sec = mesg_recvd.tv_sec;
|
||||
orte_daemon_msg_recvd.tv_usec = mesg_recvd.tv_usec;
|
||||
}
|
||||
}
|
||||
|
||||
/* setup the relay buffer */
|
||||
OBJ_CONSTRUCT(&relay_buf, opal_buffer_t);
|
||||
/* rewind the buffer to the beginning */
|
||||
buffer->unpack_ptr = buffer->base_ptr + unpack_rel;
|
||||
/* copy everything to the relay buffer */
|
||||
opal_dss.copy_payload(&relay_buf, buffer);
|
||||
/* do the relay */
|
||||
send_relay(&relay_buf);
|
||||
/* cleanup */
|
||||
OBJ_DESTRUCT(&relay_buf);
|
||||
|
||||
/* rewind the buffer to the right place for processing the cmd */
|
||||
buffer->unpack_ptr = buffer->base_ptr + save_rel;
|
||||
|
||||
/* init flag */
|
||||
exit_reqd = false;
|
||||
|
||||
/* process the command */
|
||||
if (ORTE_SUCCESS != (ret = process_commands(sender, buffer, tag))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
@ -300,16 +306,6 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret)));
|
||||
}
|
||||
|
||||
/* rewind the buffer to the beginning */
|
||||
buffer->unpack_ptr = buffer->base_ptr + unpack_rel;
|
||||
/* do the relay */
|
||||
send_relay(buffer);
|
||||
|
||||
/* if we need to exit, do so now */
|
||||
if (exit_reqd) {
|
||||
orte_trigger_event(&orte_exit);
|
||||
}
|
||||
|
||||
/* done */
|
||||
goto CLEANUP;
|
||||
|
||||
@ -633,7 +629,7 @@ static int process_commands(orte_process_name_t* sender,
|
||||
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0);
|
||||
OBJ_DESTRUCT(&ack);
|
||||
}
|
||||
exit_reqd = true;
|
||||
orte_trigger_event(&orte_exit);
|
||||
return ORTE_SUCCESS;
|
||||
break;
|
||||
|
||||
@ -671,7 +667,7 @@ static int process_commands(orte_process_name_t* sender,
|
||||
*/
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
exit_reqd = true;
|
||||
orte_trigger_event(&orte_exit);
|
||||
return ORTE_SUCCESS;
|
||||
break;
|
||||
|
||||
|
@ -369,6 +369,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line);
|
||||
static int parse_locals(int argc, char* argv[]);
|
||||
static int parse_appfile(char *filename, char ***env);
|
||||
static void dump_aborted_procs(void);
|
||||
static void just_quit(int fd, short ign, void *arg);
|
||||
|
||||
|
||||
int orterun(int argc, char *argv[])
|
||||
@ -754,29 +755,8 @@ int orterun(int argc, char *argv[])
|
||||
* to an error - so just cleanup and leave
|
||||
*/
|
||||
DONE:
|
||||
if (signals_set) {
|
||||
/* Remove the TERM and INT signal handlers */
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
/** Remove the USR signal handlers */
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
if (orte_forward_job_control) {
|
||||
opal_signal_del(&sigtstp_handler);
|
||||
opal_signal_del(&sigcont_handler);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
}
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
ORTE_UPDATE_EXIT_STATUS(orte_exit_status);
|
||||
just_quit(0,0,NULL);
|
||||
return orte_exit_status;
|
||||
}
|
||||
|
||||
@ -851,31 +831,8 @@ static void job_completed(int trigpipe, short event, void *arg)
|
||||
* all we can do is cleanly exit ourselves
|
||||
*/
|
||||
DONE:
|
||||
if (signals_set) {
|
||||
/* Remove the TERM and INT signal handlers */
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
/** Remove the USR signal handlers */
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
if (orte_forward_job_control) {
|
||||
opal_signal_del(&sigtstp_handler);
|
||||
opal_signal_del(&sigcont_handler);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
}
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
exit(rc);
|
||||
|
||||
ORTE_UPDATE_EXIT_STATUS(rc);
|
||||
just_quit(0, 0, NULL);
|
||||
}
|
||||
|
||||
static void terminated(int trigpipe, short event, void *arg)
|
||||
@ -903,6 +860,7 @@ static void terminated(int trigpipe, short event, void *arg)
|
||||
opal_signal_del(&sigcont_handler);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
/* get the daemon job object */
|
||||
@ -949,13 +907,34 @@ static void terminated(int trigpipe, short event, void *arg)
|
||||
|
||||
finish:
|
||||
/* now clean ourselves up and exit */
|
||||
|
||||
just_quit(0, 0, NULL);
|
||||
}
|
||||
|
||||
static void just_quit(int fd, short ign, void *arg)
|
||||
{
|
||||
if (signals_set) {
|
||||
/* Remove the TERM and INT signal handlers */
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
/** Remove the USR signal handlers */
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
if (orte_forward_job_control) {
|
||||
opal_signal_del(&sigtstp_handler);
|
||||
opal_signal_del(&sigcont_handler);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
/* cleanup and leave */
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
if (orte_debug_flag) {
|
||||
@ -964,6 +943,7 @@ finish:
|
||||
exit(orte_exit_status);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* On abnormal termination - dump the
|
||||
* exit status of the aborted procs.
|
||||
@ -1138,23 +1118,21 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
* hit ctrl-c before we had a chance to setup the
|
||||
* job in the system - in which case there is nothing
|
||||
* to terminate!
|
||||
*
|
||||
* NOTE: we don't have to worry about jdata being NULL
|
||||
* because we don't setup to trap the signals until
|
||||
* after jdata has been OBJ_NEW'd
|
||||
*/
|
||||
if (jdata->jobid != ORTE_JOBID_INVALID) {
|
||||
/* terminate the job - this will wake us up and
|
||||
* call the "terminated" function so we clean up
|
||||
* and exit
|
||||
if (NULL != jdata &&
|
||||
jdata->jobid != ORTE_JOBID_INVALID &&
|
||||
!orte_never_launched) {
|
||||
/* if the debuggers were run, clean up */
|
||||
orte_debugger_finalize();
|
||||
/* terminate the orteds - they will automatically kill
|
||||
* their local procs
|
||||
*/
|
||||
ret = orte_plm.terminate_job(ORTE_JOBID_WILDCARD);
|
||||
ret = orte_plm.terminate_orteds();
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
/* If we failed the terminate_job() above, then we
|
||||
* need to explicitly wake ourselves up to exit
|
||||
/* If we failed the terminate_orteds() above, then we
|
||||
* need to just die
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ret);
|
||||
orte_trigger_event(&orte_exit);
|
||||
just_quit(fd, ign, arg);
|
||||
}
|
||||
/* give ourselves a time limit on how long to wait
|
||||
* for the job to die, just in case we can't make it go
|
||||
@ -1165,39 +1143,14 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
ORTE_DETECT_TIMEOUT(&abort_exit_event, jdata->num_procs,
|
||||
orte_timeout_usec_per_proc,
|
||||
orte_max_timeout,
|
||||
timeout_callback);
|
||||
just_quit);
|
||||
|
||||
} else {
|
||||
/* if the jobid is invalid, then we didn't get to
|
||||
* the point of setting the job up, so there is nothing
|
||||
* to do but just clean ourselves up and exit
|
||||
/* if the jobid is invalid or we never launched,
|
||||
* there is nothing to do but just clean ourselves
|
||||
* up and exit
|
||||
*/
|
||||
if (signals_set) {
|
||||
/* Remove the TERM and INT signal handlers */
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
#ifndef __WINDOWS__
|
||||
/** Remove the USR signal handlers */
|
||||
opal_signal_del(&sigusr1_handler);
|
||||
opal_signal_del(&sigusr2_handler);
|
||||
if (orte_forward_job_control) {
|
||||
opal_signal_del(&sigtstp_handler);
|
||||
opal_signal_del(&sigcont_handler);
|
||||
}
|
||||
#endif /* __WINDOWS__ */
|
||||
}
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* need to release jdata separately as it won't be
|
||||
* in the global array, and so won't be released
|
||||
* during finalize
|
||||
*/
|
||||
OBJ_RELEASE(jdata);
|
||||
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
exit(orte_exit_status);
|
||||
just_quit(fd, ign, arg);
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user