1
1

Merge pull request #1724 from rhc54/topic/timeout

Add a timeout cmd line option and an option to report state info upon timeout to assist with debugging Jenkins tests
Этот коммит содержится в:
rhc54 2016-05-28 08:36:41 -07:00
родитель 59f4a765b3 ebe159acef
Коммит a93c01d4f4
7 изменённых файлов: 278 добавлений и 15 удалений

Просмотреть файл

@ -9,7 +9,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
@ -80,6 +80,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
/* add procs for the DVM */
#define ORTE_DAEMON_DVM_ADD_PROCS (orte_daemon_cmd_flag_t) 30
/* for debug purposes, get stack traces from all application procs */
#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31
/*
* Struct written up the pipe from the child to the parent.
*/

Просмотреть файл

@ -163,6 +163,9 @@ BEGIN_C_DECLS
/* error notifications */
#define ORTE_RML_TAG_NOTIFICATION 59
/* stacktrace for debug */
#define ORTE_RML_TAG_STACK_TRACE 60
#define ORTE_RML_TAG_MAX 100
#define ORTE_RML_TAG_NTOH(t) ntohl(t)

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
@ -92,6 +92,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
&orte_cmd_options.report_uri, OPAL_CMD_LINE_TYPE_STRING,
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
/* testing options */
{ NULL, '\0', "timeout", "timeout", 1,
&orte_cmd_options.timeout, OPAL_CMD_LINE_TYPE_INT,
"Timeout the job after the specified number of seconds" },
{ NULL, '\0', "report-state-on-timeout", "report-state-on-timeout", 0,
&orte_cmd_options.report_state_on_timeout, OPAL_CMD_LINE_TYPE_BOOL,
"Report all job and process states upon timeout" },
{ NULL, '\0', "get-stack-traces", "get-stack-traces", 0,
&orte_cmd_options.get_stack_traces, OPAL_CMD_LINE_TYPE_BOOL,
"Get stack traces of all application procs on timeout" },
/* exit status reporting */
{ "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
@ -47,6 +47,7 @@
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_progress.h"
#include "opal/dss/dss.h"
@ -111,6 +112,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
bool found = false;
orte_node_t *node;
orte_grpcomm_signature_t *sig;
FILE *fp;
char gscmd[256], path[1035], *pathptr;
char string[256], *string_ptr = string;
/* unpack the command */
n = 1;
@ -1071,6 +1075,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
}
break;
case ORTE_DAEMON_GET_STACK_TRACES:
/* prep the response */
answer = OBJ_NEW(opal_buffer_t);
pathptr = path;
// Try to find the "gstack" executable. Failure to find the
// executable will be handled below, because the receiver
// expects to have the process name, hostname, and PID in the
// buffer before finding an error message.
char *gstack_exec;
gstack_exec = opal_find_absolute_path("gstack");
/* hit each local process with a gstack command */
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
relay_msg = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) ||
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) ||
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) {
OBJ_RELEASE(relay_msg);
break;
}
// If we were able to find the gstack executable,
// above, then run the command here.
fp = NULL;
if (NULL != gstack_exec) {
(void) snprintf(gscmd, sizeof(gscmd), "%s %lu",
gstack_exec, (unsigned long) proct->pid);
fp = popen(gscmd, "r");
}
// If either we weren't able to find or run the gstack
// exectuable, send back a nice error message here.
if (NULL == gstack_exec || NULL == fp) {
(void) snprintf(string, sizeof(string),
"Failed to %s \"%s\" on %s to obtain stack traces",
(NULL == gstack_exec) ? "find" : "run",
(NULL == gstack_exec) ? "gstack" : gstack_exec,
proct->node->name);
if (OPAL_SUCCESS ==
opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) {
opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER);
}
OBJ_RELEASE(relay_msg);
break;
}
/* Read the output a line at a time and pack it for transmission */
memset(path, 0, sizeof(path));
while (fgets(path, sizeof(path)-1, fp) != NULL) {
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) {
OBJ_RELEASE(relay_msg);
break;
}
memset(path, 0, sizeof(path));
}
/* close */
pclose(fp);
/* transfer this load */
if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) {
OBJ_RELEASE(relay_msg);
break;
}
OBJ_RELEASE(relay_msg);
}
}
/* always send our response */
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
ORTE_RML_TAG_STACK_TRACE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
}
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
}
@ -1139,6 +1219,9 @@ static char *get_orted_comm_cmd_str(int command)
case ORTE_DAEMON_DVM_ADD_PROCS:
return strdup("ORTE_DAEMON_DVM_ADD_PROCS");
case ORTE_DAEMON_GET_STACK_TRACES:
return strdup("ORTE_DAEMON_GET_STACK_TRACES");
default:
return strdup("Unknown Command!");
}

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
@ -88,6 +88,7 @@
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -114,6 +115,8 @@ static char **global_mca_env = NULL;
static orte_std_cntr_t total_num_apps = 0;
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
static opal_pointer_array_t tool_jobs;
static int timeout_seconds;
int orte_debugger_attach_fd = -1;
bool orte_debugger_fifo_active=false;
opal_event_t *orte_debugger_attach=NULL;
@ -853,13 +856,26 @@ int orte_submit_job(char *argv[], int *index,
/* check for a job timeout specification, to be provided in seconds
* as that is what MPICH used
*/
if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) {
param = NULL;
if (0 < orte_cmd_options.timeout ||
NULL != (param = getenv("MPIEXEC_TIMEOUT"))) {
if (NULL != param) {
timeout_seconds = strtol(param, NULL, 10);
/* both cannot be present, or they must agree */
if (0 < orte_cmd_options.timeout && timeout_seconds != orte_cmd_options.timeout) {
orte_show_help("help-orterun.txt", "orterun:timeoutconflict", false,
orte_basename, orte_cmd_options.timeout, param);
exit(1);
}
} else {
timeout_seconds = orte_cmd_options.timeout;
}
if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE);
//goto DONE;
}
orte_mpiexec_timeout->tv.tv_sec = strtol(param, NULL, 10);
orte_mpiexec_timeout->tv.tv_sec = timeout_seconds;
orte_mpiexec_timeout->tv.tv_usec = 0;
opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev,
orte_timeout_wakeup, jdata);
@ -887,7 +903,7 @@ int orte_submit_job(char *argv[], int *index,
if (NULL == fp) {
orte_show_help("help-orterun.txt", "orterun:write_file", false,
orte_basename, "uri", orte_cmd_options.report_uri);
exit(0);
exit(1);
}
fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
fclose(fp);
@ -2824,16 +2840,77 @@ void orte_debugger_detached(int fd, short event, void *cbdata)
mpir_breakpoint_fired = false;
}
/* number of stack-trace replies received so far */
static uint32_t ntraces = 0;
/* backstop timer armed by orte_timeout_wakeup() while waiting for replies */
static orte_timer_t stack_trace_timer;

/*
 * RML receive callback for ORTE_RML_TAG_STACK_TRACE messages.
 *
 * The incoming buffer holds zero or more OPAL_BUFFER blobs, one per
 * process.  Each blob starts with the process name, host name and PID,
 * followed by any number of OPAL_STRING lines, which are printed to
 * stderr verbatim.
 *
 * Every call counts as one reply.  Once the number of replies equals
 * orte_process_info.num_procs (presumably one reply per daemon -- TODO
 * confirm), the backstop timer is destructed and the job is terminated.
 *
 * status, sender, tag and cbdata are unused.
 */
static void stack_trace_recv(int status, orte_process_name_t* sender,
                             opal_buffer_t *buffer, orte_rml_tag_t tag,
                             void* cbdata)
{
    opal_buffer_t *blob;
    char *st;
    int32_t cnt;
    orte_process_name_t name;
    char *hostname;
    pid_t pid;

    /* unpack the stack_trace blob */
    cnt = 1;
    while (OPAL_SUCCESS == opal_dss.unpack(buffer, &blob, &cnt, OPAL_BUFFER)) {
        /* first piece is the name of the process */
        hostname = NULL;
        cnt = 1;
        if (OPAL_SUCCESS != opal_dss.unpack(blob, &name, &cnt, ORTE_NAME) ||
            OPAL_SUCCESS != opal_dss.unpack(blob, &hostname, &cnt, OPAL_STRING) ||
            OPAL_SUCCESS != opal_dss.unpack(blob, &pid, &cnt, OPAL_PID)) {
            /* the host name may already have been unpacked when a later
             * field failed; free(NULL) is a no-op otherwise */
            free(hostname);
            OBJ_RELEASE(blob);
            /* unpack() takes cnt by pointer and may have changed it on
             * failure; reset it so the next blob can still be read */
            cnt = 1;
            continue;
        }
        fprintf(stderr, "STACK TRACE FOR PROC %s (%s, PID %lu)\n", ORTE_NAME_PRINT(&name),
                (NULL == hostname) ? "UNKNOWN" : hostname, (unsigned long) pid);
        free(hostname);
        /* unpack the stack_trace until complete */
        cnt = 1;
        while (OPAL_SUCCESS == opal_dss.unpack(blob, &st, &cnt, OPAL_STRING)) {
            fprintf(stderr, "\t%s", st);  // has its own newline
            free(st);
            cnt = 1;
        }
        fprintf(stderr, "\n");
        OBJ_RELEASE(blob);
        cnt = 1;
    }
    ++ntraces;
    if (orte_process_info.num_procs == ntraces) {
        /* cancel the timeout */
        OBJ_DESTRUCT(&stack_trace_timer);
        /* abort the job */
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
        /* set the global abnormal exit flag */
        orte_abnormal_term_ordered = true;
    }
}
/*
 * Event callback for stack_trace_timer: the wait for stack-trace
 * replies has run out, so terminate the job without them.
 */
static void stack_trace_timeout(int sd, short args, void *cbdata)
{
    /* none of the event-callback arguments are needed here */
    (void) sd;
    (void) args;
    (void) cbdata;

    /* push all jobs to the "complete" state so the job is torn down ... */
    ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
    /* ... and record that this termination was ordered abnormally */
    orte_abnormal_term_ordered = true;
}
void orte_timeout_wakeup(int sd, short args, void *cbdata)
{
char *tm;
orte_job_t *jdata;
orte_proc_t *proc;
int i;
int rc;
uint32_t key;
void *nptr;
/* this function gets called when the job execution time
* has hit a prescribed limit - so just abort
*/
tm = getenv("MPIEXEC_TIMEOUT");
orte_show_help("help-orterun.txt", "orterun:timeout",
true, (NULL == tm) ? "NULL" : tm);
true, timeout_seconds);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
/* if we are testing HNP suicide, then just exit */
if (ORTE_PROC_IS_HNP &&
@ -2841,6 +2918,79 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
opal_output(0, "HNP exiting w/o cleanup");
exit(1);
}
if (orte_cmd_options.report_state_on_timeout) {
/* cycle across all the jobs and report their state */
rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
while (OPAL_SUCCESS == rc) {
/* don't use the opal_output system as it may be borked */
fprintf(stderr, "DATA FOR JOB: %s\n", ORTE_JOBID_PRINT(jdata->jobid));
fprintf(stderr, "\tNum apps: %d\tNum procs: %d\tJobState: %s\tAbort: %s\n",
(int)jdata->num_apps, (int)jdata->num_procs,
orte_job_state_to_str(jdata->state),
(ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) ? "True" : "False");
fprintf(stderr, "\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld\n",
(long)jdata->num_launched, (long)jdata->num_reported, (long)jdata->num_terminated);
fprintf(stderr, "\n\tProcs:\n");
for (i=0; i < jdata->procs->size; i++) {
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
fprintf(stderr, "\t\tRank: %s\tNode: %s\tPID: %u\tState: %s\tExitCode %d\n",
ORTE_VPID_PRINT(proc->name.vpid),
(NULL == proc->node) ? "UNKNOWN" : proc->node->name,
(unsigned int)proc->pid,
orte_proc_state_to_str(proc->state), proc->exit_code);
}
}
fprintf(stderr, "\n");
rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
}
}
/* if they asked for stack_traces, attempt to get them, but timeout
* if we cannot do so */
if (orte_cmd_options.get_stack_traces) {
orte_daemon_cmd_flag_t command = ORTE_DAEMON_GET_STACK_TRACES;
opal_buffer_t *buffer;
orte_grpcomm_signature_t *sig;
fprintf(stderr, "Waiting for stack traces (this may take a few moments)...\n");
/* set the recv */
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_STACK_TRACE,
ORTE_RML_PERSISTENT, stack_trace_recv, NULL);
/* setup the buffer */
buffer = OBJ_NEW(opal_buffer_t);
/* pack the command */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
goto giveup;
}
/* goes to all daemons */
sig = OBJ_NEW(orte_grpcomm_signature_t);
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
sig->sz = 1;
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buffer))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
OBJ_RELEASE(sig);
goto giveup;
}
OBJ_RELEASE(buffer);
/* maintain accounting */
OBJ_RELEASE(sig);
/* we will terminate after we get the stack_traces, but set a timeout
* just in case we never hear back from everyone */
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
opal_event_evtimer_set(orte_event_base,
stack_trace_timer.ev, stack_trace_timeout, NULL);
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
stack_trace_timer.tv.tv_sec = 30;
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
return;
}
giveup:
/* abort the job */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
/* set the global abnormal exit flag */

Просмотреть файл

@ -1,5 +1,6 @@
/*
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -92,6 +93,9 @@ struct orte_cmd_options_t {
bool enable_recovery;
char *hnp;
bool staged_exec;
int timeout;
bool report_state_on_timeout;
bool get_stack_traces;
};
typedef struct orte_cmd_options_t orte_cmd_options_t;
ORTE_DECLSPEC extern orte_cmd_options_t orte_cmd_options;

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
# $COPYRIGHT$
#
@ -645,11 +645,11 @@ Please correct this value and try again.
The user-provided time limit for job execution has been
reached:
MPIEXEC_TIMEOUT: %s seconds
MPIEXEC_TIMEOUT: %d seconds
The job will now be aborted. Please check your code and/or
adjust/remove the job execution time limit (as specified
by MPIEXEC_TIMEOUT in your environment).
The job will now be aborted. Please check your code and/or
adjust/remove the job execution time limit (as specified by
MPIEXEC_TIMEOUT in your environment or --timeout on the command line).
#
[orterun:conflict-env-set]
ERROR: You have attempted to pass environment variables to Open MPI
@ -669,3 +669,11 @@ system's restriction for Unix's socket's path-length.
Please try to set TMPDIR to something short (like /tmp) or change
Your computer's name (see uname -n).
#
[orterun:timeoutconflict]
Conflicting requests for timeout were given:
--timeout command line option: %d
MPIEXEC_TIMEOUT envar: %s
Only one method should be provided, or else they must agree. Please
correct and retry.