plm/lsf: Improve error message if lsb_launch fails
Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
Этот коммит содержится в:
родитель
f69466d633
Коммит
89c1aaf646
@ -10,27 +10,23 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[tm-bad-launchid]
|
||||
The TM (PBS / Torque) process starter cannot spawn the specified
|
||||
application on a remote node due to an invalid launch_id.
|
||||
[lsb_launch-failed]
|
||||
The LSF process starter (lsb_launch) failed to start the daemons on
|
||||
the nodes in the allocation.
|
||||
Returned : %d
|
||||
lsberrno : (%d) %s
|
||||
|
||||
Node name: %s
|
||||
Launch id: %d
|
||||
|
||||
This is most likely due to use of the "--hostfile" option to the
|
||||
command line. At this time, Open MPI/OpenRTE do not support this
|
||||
method of operation. Instead, the system expects to directly read
|
||||
information regarding the nodes to be used from the environment.
|
||||
|
||||
Removing "--hostfile" from the command line will likely allow the
|
||||
application to be launched. This will be fixed in a future release
|
||||
to support the use of "--hostfile" on the command line.
|
||||
This may mean that one or more of the nodes in the LSF allocation is
|
||||
not set up properly. Below is a list of the %d nodes that were passed
|
||||
to lsb_launch:
|
||||
%s
|
||||
#
|
||||
[multiple-prefixes]
|
||||
Multiple different --prefix options were specified to mpirun for the
|
||||
@ -40,18 +36,3 @@ starter in Open MPI.
|
||||
The first two prefix values supplied for node %s were:
|
||||
%s
|
||||
and %s
|
||||
#
|
||||
[tm-spawn-failed]
|
||||
The TM (PBS / Torque) process starter failed to spawn a daemon (orted)
|
||||
on a remote node.
|
||||
|
||||
Command line: %s
|
||||
Node name: %s
|
||||
Launch id: %d
|
||||
|
||||
If you do not understand this error message, please try the following:
|
||||
|
||||
1. Ensure that the executable "orted" is in your PATH
|
||||
2. Use the --prefix option to indicate where we can
|
||||
find that executable
|
||||
3. Talk to your local system administrator
|
||||
|
@ -15,6 +15,7 @@
|
||||
* Copyright (c) 2008 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -342,9 +343,14 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
* orterun can do the rest of its stuff. Instead, we'll catch any
|
||||
* failures and deal with them elsewhere
|
||||
*/
|
||||
if (lsb_launch(nodelist_argv, argv, LSF_DJOB_REPLACE_ENV | LSF_DJOB_NOWAIT, env) < 0) {
|
||||
if ( (rc = lsb_launch(nodelist_argv, argv, LSF_DJOB_REPLACE_ENV | LSF_DJOB_NOWAIT, env)) < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
|
||||
opal_output(0, "lsb_launch failed: %d", rc);
|
||||
char *flattened_nodelist = NULL;
|
||||
flattened_nodelist = opal_argv_join(nodelist_argv, '\n');
|
||||
orte_show_help("help-plm-lsf.txt", "lsb_launch-failed",
|
||||
true, rc, lsberrno, lsb_sysmsg(),
|
||||
opal_argv_count(nodelist_argv), flattened_nodelist);
|
||||
free(flattened_nodelist);
|
||||
rc = ORTE_ERR_FAILED_TO_START;
|
||||
orte_wait_enable(); /* re-enable our SIGCHLD handler */
|
||||
goto cleanup;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user