1
1

Merge pull request #3197 from rhc54/topic/errors

Provide a little more help on the error messages when an executable i…
Этот коммит содержится в:
Ralph Castain 2017-03-17 11:29:39 -07:00 коммит произвёл GitHub
родитель 45b46dc446 dc85e7fde7
Коммит afcc33862e
7 изменённых файлов: 33 добавлений и 33 удалений

Просмотреть файл

@ -6,6 +6,7 @@
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Research Organization for Information Science # Copyright (c) 2014 Research Organization for Information Science
# and Technology (RIST). All rights reserved. # and Technology (RIST). All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -46,6 +47,7 @@ Will continue attempting to launch the process.
The xterm option was asked to display a rank that is larger The xterm option was asked to display a rank that is larger
than the number of procs in the job: than the number of procs in the job:
Node: %s
Rank: %d Rank: %d
Num procs: %d Num procs: %d

Просмотреть файл

@ -634,21 +634,24 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
char **env = NULL, **argv = NULL, *cmd = NULL; char **env = NULL, **argv = NULL, *cmd = NULL;
int rc, i; int rc, i;
bool found; bool found;
orte_proc_state_t state;
/* thread-protect common values */ /* thread-protect common values */
env = opal_argv_copy(app->env); env = opal_argv_copy(app->env);
/* setup the pmix environment */
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
ORTE_ERROR_LOG(rc);
goto errorout;
}
/* ensure we clear any prior info regarding state or exit status in /* ensure we clear any prior info regarding state or exit status in
* case this is a restart * case this is a restart
*/ */
child->exit_code = 0; child->exit_code = 0;
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID); ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
/* setup the pmix environment */
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
ORTE_ERROR_LOG(rc);
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
}
/* if we are not forwarding output for this job, then /* if we are not forwarding output for this job, then
* flag iof as complete * flag iof as complete
*/ */
@ -693,8 +696,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
/* can't be done! */ /* can't be done! */
orte_show_help("help-orte-odls-base.txt", orte_show_help("help-orte-odls-base.txt",
"orte-odls-base:xterm-rank-out-of-bounds", "orte-odls-base:xterm-rank-out-of-bounds",
true, nm->name.vpid, jobdat->num_procs); true, orte_process_info.nodename,
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH; nm->name.vpid, jobdat->num_procs);
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout; goto errorout;
} }
} }
@ -717,7 +721,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
orte_show_help("help-orte-odls-base.txt", orte_show_help("help-orte-odls-base.txt",
"orte-odls-base:fork-agent-not-found", "orte-odls-base:fork-agent-not-found",
true, orte_process_info.nodename, orte_fork_agent[0]); true, orte_process_info.nodename, orte_fork_agent[0]);
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH; state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout; goto errorout;
} }
} else { } else {
@ -730,7 +734,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
*/ */
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) { if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
child->exit_code = rc; state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout; goto errorout;
} }
@ -754,17 +758,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
} }
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) { if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
child->exit_code = rc; /* error message already output */ /* error message already output */
goto errorout; state = ORTE_PROC_STATE_FAILED_TO_START;
}
if (ORTE_SUCCESS != rc) {
/* do NOT ERROR_LOG this error - it generates
* a message/node as most errors will be common
* across the entire cluster. Instead, we let orterun
* output a consolidated error message for us
*/
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
child->exit_code = rc; /* error message already output */
goto errorout; goto errorout;
} }
@ -782,7 +777,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
return; return;
errorout: errorout:
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START); ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
if (NULL != env) { if (NULL != env) {
opal_argv_free(env); opal_argv_free(env);
} }

Просмотреть файл

@ -12,6 +12,7 @@
# All rights reserved. # All rights reserved.
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -29,6 +30,7 @@ having specified a directory for your application. Your job will now
abort. abort.
Local host: %s Local host: %s
Working dir: %s
Application name: %s Application name: %s
Error: %s Error: %s
# #

Просмотреть файл

@ -328,6 +328,7 @@ static int do_child(orte_proc_t *child,
int i; int i;
sigset_t sigs; sigset_t sigs;
long fd, fdmax = sysconf(_SC_OPEN_MAX); long fd, fdmax = sysconf(_SC_OPEN_MAX);
char dir[MAXPATHLEN];
#if HAVE_SETPGID #if HAVE_SETPGID
/* Set a new process group for this child, so that any /* Set a new process group for this child, so that any
@ -425,9 +426,10 @@ static int do_child(orte_proc_t *child,
/* Exec the new executable */ /* Exec the new executable */
execve(app, argv, environ_copy); execve(app, argv, environ_copy);
getcwd(dir, sizeof(dir));
send_error_show_help(write_fd, 1, send_error_show_help(write_fd, 1,
"help-orte-odls-default.txt", "execve error", "help-orte-odls-default.txt", "execve error",
orte_process_info.nodename, app, strerror(errno)); orte_process_info.nodename, dir, app, strerror(errno));
/* Does not return */ /* Does not return */
} }

Просмотреть файл

@ -15,7 +15,7 @@
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -258,8 +258,8 @@ int orte_print_aborted_job(orte_job_t *job,
default: default:
if (0 != proc->exit_code) { if (0 != proc->exit_code) {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true, orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name, orte_basename, proc->exit_code, ORTE_ERROR_NAME(proc->exit_code),
(unsigned long)proc->name.vpid); node->name, (unsigned long)proc->name.vpid);
} else { } else {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true, orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
orte_basename, node->name); orte_basename, node->name);

Просмотреть файл

@ -12,6 +12,7 @@
# All rights reserved. # All rights reserved.
# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. # Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -296,6 +297,7 @@ while attempting to start process rank %lu.
%s was unable to start the specified application as it encountered an %s was unable to start the specified application as it encountered an
error: error:
Error code: %d
Error name: %s Error name: %s
Node: %s Node: %s

Просмотреть файл

@ -89,7 +89,7 @@ int orte_err2str(int errnum, const char **errmsg)
if (orte_report_silent_errors) { if (orte_report_silent_errors) {
retval = "Silent error"; retval = "Silent error";
} else { } else {
retval = NULL; retval = "";
} }
break; break;
case ORTE_ERR_ADDRESSEE_UNKNOWN: case ORTE_ERR_ADDRESSEE_UNKNOWN:
@ -174,7 +174,7 @@ int orte_err2str(int errnum, const char **errmsg)
if (orte_report_silent_errors) { if (orte_report_silent_errors) {
retval = "Next option"; retval = "Next option";
} else { } else {
retval = NULL; retval = "";
} }
break; break;
case ORTE_ERR_SENSOR_LIMIT_EXCEEDED: case ORTE_ERR_SENSOR_LIMIT_EXCEEDED:
@ -244,11 +244,7 @@ int orte_err2str(int errnum, const char **errmsg)
retval = "Partial success"; retval = "Partial success";
break; break;
default: default:
if (orte_report_silent_errors) { retval = "Unknown error";
retval = "Unknown error";
} else {
retval = NULL;
}
} }
*errmsg = retval; *errmsg = retval;