Provide a little more help in the error messages when an executable isn't found, so we have a better idea of where we were looking for it. Don't double-report such errors. Ensure that ORTE_ERROR_NAME doesn't get a NULL back for the string name of an error code, as that might cause some systems to segfault.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
45b46dc446
Коммит
dc85e7fde7
@ -6,6 +6,7 @@
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2014 Research Organization for Information Science
|
||||
# and Technology (RIST). All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -46,6 +47,7 @@ Will continue attempting to launch the process.
|
||||
The xterm option was asked to display a rank that is larger
|
||||
than the number of procs in the job:
|
||||
|
||||
Node: %s
|
||||
Rank: %d
|
||||
Num procs: %d
|
||||
|
||||
|
@ -634,21 +634,24 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
char **env = NULL, **argv = NULL, *cmd = NULL;
|
||||
int rc, i;
|
||||
bool found;
|
||||
orte_proc_state_t state;
|
||||
|
||||
/* thread-protect common values */
|
||||
env = opal_argv_copy(app->env);
|
||||
|
||||
/* setup the pmix environment */
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto errorout;
|
||||
}
|
||||
|
||||
/* ensure we clear any prior info regarding state or exit status in
|
||||
* case this is a restart
|
||||
*/
|
||||
child->exit_code = 0;
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
|
||||
|
||||
/* setup the pmix environment */
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
goto errorout;
|
||||
}
|
||||
|
||||
/* if we are not forwarding output for this job, then
|
||||
* flag iof as complete
|
||||
*/
|
||||
@ -693,8 +696,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
/* can't be done! */
|
||||
orte_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:xterm-rank-out-of-bounds",
|
||||
true, nm->name.vpid, jobdat->num_procs);
|
||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
true, orte_process_info.nodename,
|
||||
nm->name.vpid, jobdat->num_procs);
|
||||
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
goto errorout;
|
||||
}
|
||||
}
|
||||
@ -717,7 +721,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
orte_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:fork-agent-not-found",
|
||||
true, orte_process_info.nodename, orte_fork_agent[0]);
|
||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
goto errorout;
|
||||
}
|
||||
} else {
|
||||
@ -730,7 +734,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
child->exit_code = rc;
|
||||
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
goto errorout;
|
||||
}
|
||||
|
||||
@ -754,17 +758,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = cd->fork_local(child, cmd, argv, env, jobdat, cd->opts))) {
|
||||
child->exit_code = rc; /* error message already output */
|
||||
goto errorout;
|
||||
}
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
/* do NOT ERROR_LOG this error - it generates
|
||||
* a message/node as most errors will be common
|
||||
* across the entire cluster. Instead, we let orterun
|
||||
* output a consolidated error message for us
|
||||
*/
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
child->exit_code = rc; /* error message already output */
|
||||
/* error message already output */
|
||||
state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
goto errorout;
|
||||
}
|
||||
|
||||
@ -782,7 +777,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
return;
|
||||
|
||||
errorout:
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, state);
|
||||
if (NULL != env) {
|
||||
opal_argv_free(env);
|
||||
}
|
||||
|
@ -12,6 +12,7 @@
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -29,6 +30,7 @@ having specified a directory for your application. Your job will now
|
||||
abort.
|
||||
|
||||
Local host: %s
|
||||
Working dir: %s
|
||||
Application name: %s
|
||||
Error: %s
|
||||
#
|
||||
|
@ -328,6 +328,7 @@ static int do_child(orte_proc_t *child,
|
||||
int i;
|
||||
sigset_t sigs;
|
||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||
char dir[MAXPATHLEN];
|
||||
|
||||
#if HAVE_SETPGID
|
||||
/* Set a new process group for this child, so that any
|
||||
@ -425,9 +426,10 @@ static int do_child(orte_proc_t *child,
|
||||
/* Exec the new executable */
|
||||
|
||||
execve(app, argv, environ_copy);
|
||||
getcwd(dir, sizeof(dir));
|
||||
send_error_show_help(write_fd, 1,
|
||||
"help-orte-odls-default.txt", "execve error",
|
||||
orte_process_info.nodename, app, strerror(errno));
|
||||
orte_process_info.nodename, dir, app, strerror(errno));
|
||||
/* Does not return */
|
||||
}
|
||||
|
||||
|
@ -15,7 +15,7 @@
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -258,8 +258,8 @@ int orte_print_aborted_job(orte_job_t *job,
|
||||
default:
|
||||
if (0 != proc->exit_code) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
orte_basename, proc->exit_code, ORTE_ERROR_NAME(proc->exit_code),
|
||||
node->name, (unsigned long)proc->name.vpid);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
||||
orte_basename, node->name);
|
||||
|
@ -12,6 +12,7 @@
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -296,6 +297,7 @@ while attempting to start process rank %lu.
|
||||
%s was unable to start the specified application as it encountered an
|
||||
error:
|
||||
|
||||
Error code: %d
|
||||
Error name: %s
|
||||
Node: %s
|
||||
|
||||
|
@ -89,7 +89,7 @@ int orte_err2str(int errnum, const char **errmsg)
|
||||
if (orte_report_silent_errors) {
|
||||
retval = "Silent error";
|
||||
} else {
|
||||
retval = NULL;
|
||||
retval = "";
|
||||
}
|
||||
break;
|
||||
case ORTE_ERR_ADDRESSEE_UNKNOWN:
|
||||
@ -174,7 +174,7 @@ int orte_err2str(int errnum, const char **errmsg)
|
||||
if (orte_report_silent_errors) {
|
||||
retval = "Next option";
|
||||
} else {
|
||||
retval = NULL;
|
||||
retval = "";
|
||||
}
|
||||
break;
|
||||
case ORTE_ERR_SENSOR_LIMIT_EXCEEDED:
|
||||
@ -244,11 +244,7 @@ int orte_err2str(int errnum, const char **errmsg)
|
||||
retval = "Partial success";
|
||||
break;
|
||||
default:
|
||||
if (orte_report_silent_errors) {
|
||||
retval = "Unknown error";
|
||||
} else {
|
||||
retval = NULL;
|
||||
}
|
||||
retval = "Unknown error";
|
||||
}
|
||||
|
||||
*errmsg = retval;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user