Merge pull request #1724 from rhc54/topic/timeout
Add a timeout cmd line option and an option to report state info upon timeout to assist with debugging Jenkins tests
Этот коммит содержится в:
Коммит
a93c01d4f4
@ -9,7 +9,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
@ -80,6 +80,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
|
||||
/* add procs for the DVM */
|
||||
#define ORTE_DAEMON_DVM_ADD_PROCS (orte_daemon_cmd_flag_t) 30
|
||||
|
||||
/* for debug purposes, get stack traces from all application procs */
|
||||
#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31
|
||||
|
||||
/*
|
||||
* Struct written up the pipe from the child to the parent.
|
||||
*/
|
||||
|
@ -163,6 +163,9 @@ BEGIN_C_DECLS
|
||||
/* error notifications */
|
||||
#define ORTE_RML_TAG_NOTIFICATION 59
|
||||
|
||||
/* stacktrace for debug */
|
||||
#define ORTE_RML_TAG_STACK_TRACE 60
|
||||
|
||||
#define ORTE_RML_TAG_MAX 100
|
||||
|
||||
#define ORTE_RML_TAG_NTOH(t) ntohl(t)
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
@ -92,6 +92,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
&orte_cmd_options.report_uri, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
|
||||
|
||||
/* testing options */
|
||||
{ NULL, '\0', "timeout", "timeout", 1,
|
||||
&orte_cmd_options.timeout, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Timeout the job after the specified number of seconds" },
|
||||
{ NULL, '\0', "report-state-on-timeout", "report-state-on-timeout", 0,
|
||||
&orte_cmd_options.report_state_on_timeout, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Report all job and process states upon timeout" },
|
||||
{ NULL, '\0', "get-stack-traces", "get-stack-traces", 0,
|
||||
&orte_cmd_options.get_stack_traces, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Get stack traces of all application procs on timeout" },
|
||||
|
||||
|
||||
/* exit status reporting */
|
||||
{ "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
@ -47,6 +47,7 @@
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/dss/dss.h"
|
||||
@ -111,6 +112,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
bool found = false;
|
||||
orte_node_t *node;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
FILE *fp;
|
||||
char gscmd[256], path[1035], *pathptr;
|
||||
char string[256], *string_ptr = string;
|
||||
|
||||
/* unpack the command */
|
||||
n = 1;
|
||||
@ -1071,6 +1075,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_DAEMON_GET_STACK_TRACES:
|
||||
/* prep the response */
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
pathptr = path;
|
||||
|
||||
// Try to find the "gstack" executable. Failure to find the
|
||||
// executable will be handled below, because the receiver
|
||||
// expects to have the process name, hostname, and PID in the
|
||||
// buffer before finding an error message.
|
||||
char *gstack_exec;
|
||||
gstack_exec = opal_find_absolute_path("gstack");
|
||||
|
||||
/* hit each local process with a gstack command */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
|
||||
relay_msg = OBJ_NEW(opal_buffer_t);
|
||||
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) ||
|
||||
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) ||
|
||||
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) {
|
||||
OBJ_RELEASE(relay_msg);
|
||||
break;
|
||||
}
|
||||
|
||||
// If we were able to find the gstack executable,
|
||||
// above, then run the command here.
|
||||
fp = NULL;
|
||||
if (NULL != gstack_exec) {
|
||||
(void) snprintf(gscmd, sizeof(gscmd), "%s %lu",
|
||||
gstack_exec, (unsigned long) proct->pid);
|
||||
fp = popen(gscmd, "r");
|
||||
}
|
||||
|
||||
// If either we weren't able to find or run the gstack
|
||||
// exectuable, send back a nice error message here.
|
||||
if (NULL == gstack_exec || NULL == fp) {
|
||||
(void) snprintf(string, sizeof(string),
|
||||
"Failed to %s \"%s\" on %s to obtain stack traces",
|
||||
(NULL == gstack_exec) ? "find" : "run",
|
||||
(NULL == gstack_exec) ? "gstack" : gstack_exec,
|
||||
proct->node->name);
|
||||
if (OPAL_SUCCESS ==
|
||||
opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) {
|
||||
opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER);
|
||||
}
|
||||
OBJ_RELEASE(relay_msg);
|
||||
break;
|
||||
}
|
||||
/* Read the output a line at a time and pack it for transmission */
|
||||
memset(path, 0, sizeof(path));
|
||||
while (fgets(path, sizeof(path)-1, fp) != NULL) {
|
||||
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) {
|
||||
OBJ_RELEASE(relay_msg);
|
||||
break;
|
||||
}
|
||||
memset(path, 0, sizeof(path));
|
||||
}
|
||||
/* close */
|
||||
pclose(fp);
|
||||
/* transfer this load */
|
||||
if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) {
|
||||
OBJ_RELEASE(relay_msg);
|
||||
break;
|
||||
}
|
||||
OBJ_RELEASE(relay_msg);
|
||||
}
|
||||
}
|
||||
/* always send our response */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
|
||||
ORTE_RML_TAG_STACK_TRACE,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
}
|
||||
@ -1139,6 +1219,9 @@ static char *get_orted_comm_cmd_str(int command)
|
||||
case ORTE_DAEMON_DVM_ADD_PROCS:
|
||||
return strdup("ORTE_DAEMON_DVM_ADD_PROCS");
|
||||
|
||||
case ORTE_DAEMON_GET_STACK_TRACES:
|
||||
return strdup("ORTE_DAEMON_GET_STACK_TRACES");
|
||||
|
||||
default:
|
||||
return strdup("Unknown Command!");
|
||||
}
|
||||
|
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
@ -88,6 +88,7 @@
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
@ -114,6 +115,8 @@ static char **global_mca_env = NULL;
|
||||
static orte_std_cntr_t total_num_apps = 0;
|
||||
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
|
||||
static opal_pointer_array_t tool_jobs;
|
||||
static int timeout_seconds;
|
||||
|
||||
int orte_debugger_attach_fd = -1;
|
||||
bool orte_debugger_fifo_active=false;
|
||||
opal_event_t *orte_debugger_attach=NULL;
|
||||
@ -853,13 +856,26 @@ int orte_submit_job(char *argv[], int *index,
|
||||
/* check for a job timeout specification, to be provided in seconds
|
||||
* as that is what MPICH used
|
||||
*/
|
||||
if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) {
|
||||
param = NULL;
|
||||
if (0 < orte_cmd_options.timeout ||
|
||||
NULL != (param = getenv("MPIEXEC_TIMEOUT"))) {
|
||||
if (NULL != param) {
|
||||
timeout_seconds = strtol(param, NULL, 10);
|
||||
/* both cannot be present, or they must agree */
|
||||
if (0 < orte_cmd_options.timeout && timeout_seconds != orte_cmd_options.timeout) {
|
||||
orte_show_help("help-orterun.txt", "orterun:timeoutconflict", false,
|
||||
orte_basename, orte_cmd_options.timeout, param);
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
timeout_seconds = orte_cmd_options.timeout;
|
||||
}
|
||||
if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
//goto DONE;
|
||||
}
|
||||
orte_mpiexec_timeout->tv.tv_sec = strtol(param, NULL, 10);
|
||||
orte_mpiexec_timeout->tv.tv_sec = timeout_seconds;
|
||||
orte_mpiexec_timeout->tv.tv_usec = 0;
|
||||
opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev,
|
||||
orte_timeout_wakeup, jdata);
|
||||
@ -887,7 +903,7 @@ int orte_submit_job(char *argv[], int *index,
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orterun.txt", "orterun:write_file", false,
|
||||
orte_basename, "uri", orte_cmd_options.report_uri);
|
||||
exit(0);
|
||||
exit(1);
|
||||
}
|
||||
fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
|
||||
fclose(fp);
|
||||
@ -2824,16 +2840,77 @@ void orte_debugger_detached(int fd, short event, void *cbdata)
|
||||
mpir_breakpoint_fired = false;
|
||||
}
|
||||
|
||||
static uint32_t ntraces = 0;
|
||||
static orte_timer_t stack_trace_timer;
|
||||
|
||||
static void stack_trace_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
opal_buffer_t *blob;
|
||||
char *st;
|
||||
int32_t cnt;
|
||||
orte_process_name_t name;
|
||||
char *hostname;
|
||||
pid_t pid;
|
||||
|
||||
/* unpack the stack_trace blob */
|
||||
cnt = 1;
|
||||
while (OPAL_SUCCESS == opal_dss.unpack(buffer, &blob, &cnt, OPAL_BUFFER)) {
|
||||
/* first piece is the name of the process */
|
||||
cnt = 1;
|
||||
if (OPAL_SUCCESS != opal_dss.unpack(blob, &name, &cnt, ORTE_NAME) ||
|
||||
OPAL_SUCCESS != opal_dss.unpack(blob, &hostname, &cnt, OPAL_STRING) ||
|
||||
OPAL_SUCCESS != opal_dss.unpack(blob, &pid, &cnt, OPAL_PID)) {
|
||||
OBJ_RELEASE(blob);
|
||||
continue;
|
||||
}
|
||||
fprintf(stderr, "STACK TRACE FOR PROC %s (%s, PID %lu)\n", ORTE_NAME_PRINT(&name), hostname, (unsigned long) pid);
|
||||
free(hostname);
|
||||
/* unpack the stack_trace until complete */
|
||||
cnt = 1;
|
||||
while (OPAL_SUCCESS == opal_dss.unpack(blob, &st, &cnt, OPAL_STRING)) {
|
||||
fprintf(stderr, "\t%s", st); // has its own newline
|
||||
free(st);
|
||||
cnt = 1;
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
OBJ_RELEASE(blob);
|
||||
cnt = 1;
|
||||
}
|
||||
++ntraces;
|
||||
if (orte_process_info.num_procs == ntraces) {
|
||||
/* cancel the timeout */
|
||||
OBJ_DESTRUCT(&stack_trace_timer);
|
||||
/* abort the job */
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
||||
/* set the global abnormal exit flag */
|
||||
orte_abnormal_term_ordered = true;
|
||||
}
|
||||
}
|
||||
|
||||
static void stack_trace_timeout(int sd, short args, void *cbdata)
|
||||
{
|
||||
/* abort the job */
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
||||
/* set the global abnormal exit flag */
|
||||
orte_abnormal_term_ordered = true;
|
||||
}
|
||||
|
||||
void orte_timeout_wakeup(int sd, short args, void *cbdata)
|
||||
{
|
||||
char *tm;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
int i;
|
||||
int rc;
|
||||
uint32_t key;
|
||||
void *nptr;
|
||||
|
||||
/* this function gets called when the job execution time
|
||||
* has hit a prescribed limit - so just abort
|
||||
*/
|
||||
tm = getenv("MPIEXEC_TIMEOUT");
|
||||
orte_show_help("help-orterun.txt", "orterun:timeout",
|
||||
true, (NULL == tm) ? "NULL" : tm);
|
||||
true, timeout_seconds);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
/* if we are testing HNP suicide, then just exit */
|
||||
if (ORTE_PROC_IS_HNP &&
|
||||
@ -2841,6 +2918,79 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
|
||||
opal_output(0, "HNP exiting w/o cleanup");
|
||||
exit(1);
|
||||
}
|
||||
if (orte_cmd_options.report_state_on_timeout) {
|
||||
/* cycle across all the jobs and report their state */
|
||||
rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
|
||||
while (OPAL_SUCCESS == rc) {
|
||||
/* don't use the opal_output system as it may be borked */
|
||||
fprintf(stderr, "DATA FOR JOB: %s\n", ORTE_JOBID_PRINT(jdata->jobid));
|
||||
fprintf(stderr, "\tNum apps: %d\tNum procs: %d\tJobState: %s\tAbort: %s\n",
|
||||
(int)jdata->num_apps, (int)jdata->num_procs,
|
||||
orte_job_state_to_str(jdata->state),
|
||||
(ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) ? "True" : "False");
|
||||
fprintf(stderr, "\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld\n",
|
||||
(long)jdata->num_launched, (long)jdata->num_reported, (long)jdata->num_terminated);
|
||||
fprintf(stderr, "\n\tProcs:\n");
|
||||
for (i=0; i < jdata->procs->size; i++) {
|
||||
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
|
||||
fprintf(stderr, "\t\tRank: %s\tNode: %s\tPID: %u\tState: %s\tExitCode %d\n",
|
||||
ORTE_VPID_PRINT(proc->name.vpid),
|
||||
(NULL == proc->node) ? "UNKNOWN" : proc->node->name,
|
||||
(unsigned int)proc->pid,
|
||||
orte_proc_state_to_str(proc->state), proc->exit_code);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
|
||||
}
|
||||
}
|
||||
/* if they asked for stack_traces, attempt to get them, but timeout
|
||||
* if we cannot do so */
|
||||
if (orte_cmd_options.get_stack_traces) {
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_GET_STACK_TRACES;
|
||||
opal_buffer_t *buffer;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
|
||||
fprintf(stderr, "Waiting for stack traces (this may take a few moments)...\n");
|
||||
|
||||
/* set the recv */
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_STACK_TRACE,
|
||||
ORTE_RML_PERSISTENT, stack_trace_recv, NULL);
|
||||
|
||||
/* setup the buffer */
|
||||
buffer = OBJ_NEW(opal_buffer_t);
|
||||
/* pack the command */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buffer);
|
||||
goto giveup;
|
||||
}
|
||||
/* goes to all daemons */
|
||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
||||
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
|
||||
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
|
||||
sig->sz = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buffer);
|
||||
OBJ_RELEASE(sig);
|
||||
goto giveup;
|
||||
}
|
||||
OBJ_RELEASE(buffer);
|
||||
/* maintain accounting */
|
||||
OBJ_RELEASE(sig);
|
||||
/* we will terminate after we get the stack_traces, but set a timeout
|
||||
* just in case we never hear back from everyone */
|
||||
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
|
||||
opal_event_evtimer_set(orte_event_base,
|
||||
stack_trace_timer.ev, stack_trace_timeout, NULL);
|
||||
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
|
||||
stack_trace_timer.tv.tv_sec = 30;
|
||||
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
|
||||
return;
|
||||
}
|
||||
giveup:
|
||||
/* abort the job */
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
||||
/* set the global abnormal exit flag */
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -92,6 +93,9 @@ struct orte_cmd_options_t {
|
||||
bool enable_recovery;
|
||||
char *hnp;
|
||||
bool staged_exec;
|
||||
int timeout;
|
||||
bool report_state_on_timeout;
|
||||
bool get_stack_traces;
|
||||
};
|
||||
typedef struct orte_cmd_options_t orte_cmd_options_t;
|
||||
ORTE_DECLSPEC extern orte_cmd_options_t orte_cmd_options;
|
||||
|
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
@ -645,11 +645,11 @@ Please correct this value and try again.
|
||||
The user-provided time limit for job execution has been
|
||||
reached:
|
||||
|
||||
MPIEXEC_TIMEOUT: %s seconds
|
||||
MPIEXEC_TIMEOUT: %d seconds
|
||||
|
||||
The job will now be aborted. Please check your code and/or
|
||||
adjust/remove the job execution time limit (as specified
|
||||
by MPIEXEC_TIMEOUT in your environment).
|
||||
The job will now be aborted. Please check your code and/or
|
||||
adjust/remove the job execution time limit (as specified by
|
||||
MPIEXEC_TIMEOUT in your environment or --timeout on the command line).
|
||||
#
|
||||
[orterun:conflict-env-set]
|
||||
ERROR: You have attempted to pass environment variables to Open MPI
|
||||
@ -669,3 +669,11 @@ system's restriction for Unix's socket's path-length.
|
||||
Please try to set TMPDIR to something short (like /tmp) or change
|
||||
Your computer's name (see uname -n).
|
||||
#
|
||||
[orterun:timeoutconflict]
|
||||
Conflicting requests for timeout were given:
|
||||
|
||||
--timeout command line option: %d
|
||||
MPIEXEC_TIMEOUT envar: %s
|
||||
|
||||
Only one method should be provided, or else they must agree. Please
|
||||
correct and retry.
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user