Resolve a race condition when responding to a SIGTERM to ensure that any final message from the application is correctly output. Remove a duplicate command and reduce the priority of the daemon exit command to MSG so that the IOF has a chance to output cached messages. Update the signal trapping test.
Thanks to Paul Kapinos for reporting the problem.
cmr=v1.7.5:reviewer=jsquyres:subject=resolve a race condition
This commit was SVN r30942.
Этот коммит содержится в:
родитель
a2b539c763
Коммит
c9465d97b4
@ -2299,8 +2299,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
* has happened
|
||||
*/
|
||||
child->waitpid_recvd = true;
|
||||
/* ensure the process is flagged as "not alive" */
|
||||
child->alive = false;
|
||||
child->pid = 0;
|
||||
|
||||
CLEANUP:
|
||||
|
@ -200,6 +200,10 @@ static bool odls_default_child_died(orte_proc_t *child)
|
||||
* the default 1s actually means 'somewhere between 0 and 1s'. */
|
||||
end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1;
|
||||
do {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid),
|
||||
orte_odls_globals.timeout_before_sigkill + 1));
|
||||
ret = waitpid(child->pid, &child->exit_code, WNOHANG);
|
||||
if (child->pid == ret) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
@ -222,7 +226,9 @@ static bool odls_default_child_died(orte_proc_t *child)
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*/
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* Do nothing, process still alive */
|
||||
} else if (-1 == ret && ECHILD == errno) {
|
||||
/* The pid no longer exists, so we'll call this "good
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -449,21 +450,8 @@ static int plm_alps_terminate_orteds(void)
|
||||
orte_wait_cb_cancel(alps_pid);
|
||||
|
||||
/* now tell them to die */
|
||||
if (orte_abnormal_term_ordered) {
|
||||
/* cannot know if a daemon is able to
|
||||
* tell us it died, so just ensure they
|
||||
* all terminate
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else {
|
||||
/* we need them to "phone home", though,
|
||||
* so we can know that they have exited
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -126,7 +126,7 @@ static int isolated_terminate_orteds(void)
|
||||
int rc;
|
||||
|
||||
/* send ourselves the halt command */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
|
@ -14,6 +14,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2008 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -380,22 +381,8 @@ static int plm_lsf_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* now tell them to die */
|
||||
if (orte_abnormal_term_ordered) {
|
||||
/* cannot know if a daemon is able to
|
||||
* tell us it died, so just ensure they
|
||||
* all terminate
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else {
|
||||
/* we need them to "phone home", though,
|
||||
* so we can know that they have exited
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -14,6 +14,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -1244,22 +1245,8 @@ static int rsh_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* now tell them to die */
|
||||
if (orte_abnormal_term_ordered) {
|
||||
/* cannot know if a daemon is able to
|
||||
* tell us it died, so just ensure they
|
||||
* all terminate
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else {
|
||||
/* we need them to "phone home", though,
|
||||
* so we can know that they have exited
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -451,21 +452,8 @@ static int plm_slurm_terminate_orteds(void)
|
||||
* exit. Instead, we simply trigger an exit for ourselves
|
||||
*/
|
||||
if (primary_pid_set) {
|
||||
if (orte_abnormal_term_ordered) {
|
||||
/* cannot know if a daemon is able to
|
||||
* tell us it died, so just ensure they
|
||||
* all terminate
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else {
|
||||
/* we need them to "phone home"
|
||||
* so we can know that they have exited
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -484,22 +485,8 @@ int plm_tm_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* now tell them to die */
|
||||
if (orte_abnormal_term_ordered) {
|
||||
/* cannot know if a daemon is able to
|
||||
* tell us it died, so just ensure they
|
||||
* all terminate
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else {
|
||||
/* we need them to "phone home", though,
|
||||
* so we can know that they have exited
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -152,7 +152,7 @@ static int init(void)
|
||||
}
|
||||
/* add the termination response */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
|
||||
orte_quit, ORTE_ERROR_PRI))) {
|
||||
orte_quit, ORTE_MSG_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* add a default error response */
|
||||
|
@ -453,19 +453,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
return;
|
||||
break;
|
||||
|
||||
/**** HALT VM COMMAND ****/
|
||||
case ORTE_DAEMON_HALT_VM_CMD:
|
||||
if (orte_debug_daemons_flag) {
|
||||
opal_output(0, "%s orted_cmd: received halt vm",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
/* kill the local procs */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
/* call our appropriate exit procedure */
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
return;
|
||||
break;
|
||||
|
||||
/**** SPAWN JOB COMMAND ****/
|
||||
case ORTE_DAEMON_SPAWN_JOB_CMD:
|
||||
if (orte_debug_daemons_flag) {
|
||||
@ -1095,8 +1082,6 @@ static char *get_orted_comm_cmd_str(int command)
|
||||
return strdup("ORTE_DAEMON_MESSAGE_LOCAL_PROCS");
|
||||
case ORTE_DAEMON_EXIT_CMD:
|
||||
return strdup("ORTE_DAEMON_EXIT_CMD");
|
||||
case ORTE_DAEMON_HALT_VM_CMD:
|
||||
return strdup("ORTE_DAEMON_HALT_VM_CMD");
|
||||
case ORTE_DAEMON_SPAWN_JOB_CMD:
|
||||
return strdup("ORTE_DAEMON_SPAWN_JOB_CMD");
|
||||
case ORTE_DAEMON_CONTACT_QUERY_CMD:
|
||||
|
@ -35,7 +35,25 @@ void exit_handler(int signum)
|
||||
{
|
||||
int rc;
|
||||
|
||||
exit(0);
|
||||
switch (signum) {
|
||||
case SIGINT:
|
||||
fprintf(stderr, "%s Trapped SIGINT\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
break;
|
||||
|
||||
case SIGHUP:
|
||||
fprintf(stderr, "%s Trapped SIGHUP\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
break;
|
||||
|
||||
case SIGTERM:
|
||||
fprintf(stderr, "%s Trapped SIGTERM\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
break;
|
||||
|
||||
default:
|
||||
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
|
||||
break;
|
||||
}
|
||||
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user