1
1

Resolve a race condition when responding to a SIGTERM, ensuring that any final message from the application is correctly output. Remove a duplicate command, and reduce the priority of the daemon exit command to MSG so that the IOF has a chance to output cached messages. Update the signal trapping test.

Thanks to Paul Kapinos for reporting the problem.

cmr=v1.7.5:reviewer=jsquyres:subject=resolve a race condition

This commit was SVN r30942.
Этот коммит содержится в:
Ralph Castain 2014-03-05 04:38:17 +00:00
родитель a2b539c763
Коммит c9465d97b4
11 изменённых файлов: 43 добавлений и 99 удалений

Просмотреть файл

@ -2299,8 +2299,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
* has happened
*/
child->waitpid_recvd = true;
/* ensure the process is flagged as "not alive" */
child->alive = false;
child->pid = 0;
CLEANUP:

Просмотреть файл

@ -200,6 +200,10 @@ static bool odls_default_child_died(orte_proc_t *child)
* the default 1s actually means 'somewhere between 0 and 1s'. */
end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1;
do {
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
"%s odls:default:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid),
orte_odls_globals.timeout_before_sigkill + 1));
ret = waitpid(child->pid, &child->exit_code, WNOHANG);
if (child->pid == ret) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
@ -222,7 +226,9 @@ static bool odls_default_child_died(orte_proc_t *child)
* which will occasionally trip the timeout for cases that
* are right on the edge.)
*/
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
/* Do nothing, process still alive */
} else if (-1 == ret && ECHILD == errno) {
/* The pid no longer exists, so we'll call this "good

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -449,21 +450,8 @@ static int plm_alps_terminate_orteds(void)
orte_wait_cb_cancel(alps_pid);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
return rc;

Просмотреть файл

@ -126,7 +126,7 @@ static int isolated_terminate_orteds(void)
int rc;
/* send ourselves the halt command */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
return rc;

Просмотреть файл

@ -14,6 +14,7 @@
* reserved.
* Copyright (c) 2008 Institut National de Recherche en Informatique
* et Automatique. All rights reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -380,22 +381,8 @@ static int plm_lsf_terminate_orteds(void)
{
int rc;
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
return rc;

Просмотреть файл

@ -14,6 +14,7 @@
* reserved.
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011 IBM Corporation. All rights reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -1244,22 +1245,8 @@ static int rsh_terminate_orteds(void)
{
int rc;
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
return rc;

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -451,21 +452,8 @@ static int plm_slurm_terminate_orteds(void)
* exit. Instead, we simply trigger an exit for ourselves
*/
if (primary_pid_set) {
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home"
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -484,22 +485,8 @@ int plm_tm_terminate_orteds(void)
{
int rc;
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
return rc;

Просмотреть файл

@ -152,7 +152,7 @@ static int init(void)
}
/* add the termination response */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
orte_quit, ORTE_ERROR_PRI))) {
orte_quit, ORTE_MSG_PRI))) {
ORTE_ERROR_LOG(rc);
}
/* add a default error response */

Просмотреть файл

@ -453,19 +453,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
return;
break;
/**** HALT VM COMMAND ****/
case ORTE_DAEMON_HALT_VM_CMD:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received halt vm",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* kill the local procs */
orte_odls.kill_local_procs(NULL);
/* call our appropriate exit procedure */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
return;
break;
/**** SPAWN JOB COMMAND ****/
case ORTE_DAEMON_SPAWN_JOB_CMD:
if (orte_debug_daemons_flag) {
@ -1095,8 +1082,6 @@ static char *get_orted_comm_cmd_str(int command)
return strdup("ORTE_DAEMON_MESSAGE_LOCAL_PROCS");
case ORTE_DAEMON_EXIT_CMD:
return strdup("ORTE_DAEMON_EXIT_CMD");
case ORTE_DAEMON_HALT_VM_CMD:
return strdup("ORTE_DAEMON_HALT_VM_CMD");
case ORTE_DAEMON_SPAWN_JOB_CMD:
return strdup("ORTE_DAEMON_SPAWN_JOB_CMD");
case ORTE_DAEMON_CONTACT_QUERY_CMD:

Просмотреть файл

@ -35,7 +35,25 @@ void exit_handler(int signum)
{
int rc;
exit(0);
switch (signum) {
case SIGINT:
fprintf(stderr, "%s Trapped SIGINT\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
break;
case SIGHUP:
fprintf(stderr, "%s Trapped SIGHUP\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
break;
case SIGTERM:
fprintf(stderr, "%s Trapped SIGTERM\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
break;
default:
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
break;
}
exit(1);
}