From 96b3c503f999b4052161489d633b23f1c833e3ef Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 8 Oct 2004 16:22:35 +0000 Subject: [PATCH] * Add useful error messages to bootproxy for when it fails * don't push pcmclient env variables to clients, since they don't have useful information in them * add check to comm_spawn code to make sure we can spawn This commit was SVN r2993. --- src/communicator/comm_dyn.c | 7 +++- src/mca/pcm/base/pcm_base_util.c | 4 +- src/tools/bootproxy/Makefile.am | 4 ++ src/tools/bootproxy/bootproxy.c | 16 ++++++-- src/tools/bootproxy/help-bootproxy.txt | 53 ++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 src/tools/bootproxy/help-bootproxy.txt diff --git a/src/communicator/comm_dyn.c b/src/communicator/comm_dyn.c index f66a32abaa..3a4078ba41 100644 --- a/src/communicator/comm_dyn.c +++ b/src/communicator/comm_dyn.c @@ -273,7 +273,12 @@ int ompi_comm_start_processes (char *command, char **argv, int maxprocs, new_jobid = ompi_name_server.create_jobid(); /* get the spawn handle to start spawning stuff */ - spawn_handle = ompi_rte_get_spawn_handle(OMPI_RTE_SPAWN_FROM_MPI, true); + spawn_handle = + ompi_rte_get_spawn_handle(OMPI_RTE_SPAWN_FROM_MPI|OMPI_RTE_SPAWN_HIGH_QOS, true); + if (NULL == spawn_handle) { + printf("show_help: get_spawn_handle failed\n"); + return -1; + } /* BWB - fix jobid, procs, and nodes */ nodelist = ompi_rte_allocate_resources(spawn_handle, new_jobid, 0, maxprocs); diff --git a/src/mca/pcm/base/pcm_base_util.c b/src/mca/pcm/base/pcm_base_util.c index e806637a73..cae9b59a60 100644 --- a/src/mca/pcm/base/pcm_base_util.c +++ b/src/mca/pcm/base/pcm_base_util.c @@ -68,7 +68,9 @@ mca_pcm_base_build_base_env(char **in_env, int *envc, char ***out_envp) int ret; for (i = 0 ; in_env[i] != NULL ; ++i) { - if (0 == strncmp("OMPI_", in_env[i], strlen("OMPI_"))) { + if ((0 == strncmp("OMPI_", in_env[i], strlen("OMPI_"))) && + (0 != strncmp("OMPI_MCA_pcmclient", in_env[i], + 
strlen("OMPI_MCA_pcmclient")))) { ret = ompi_argv_append(envc, &env, in_env[i]); if (OMPI_SUCCESS != ret) { ompi_argv_free(env); diff --git a/src/tools/bootproxy/Makefile.am b/src/tools/bootproxy/Makefile.am index 325c63217e..97244e47bc 100644 --- a/src/tools/bootproxy/Makefile.am +++ b/src/tools/bootproxy/Makefile.am @@ -7,6 +7,10 @@ include $(top_srcdir)/config/Makefile.options libs = $(top_builddir)/src/libmpi.la +EXTRA_DIST = $(pkgdata_DATA) + +pkgdata_DATA = help-bootproxy.txt + bin_PROGRAMS = mca_pcm_rsh_bootproxy mca_pcm_rsh_bootproxy_SOURCES = \ diff --git a/src/tools/bootproxy/bootproxy.c b/src/tools/bootproxy/bootproxy.c index 2dd817c62f..211c1b16d7 100644 --- a/src/tools/bootproxy/bootproxy.c +++ b/src/tools/bootproxy/bootproxy.c @@ -16,6 +16,7 @@ #include "runtime/runtime.h" #include "mca/pcm/base/base.h" #include "class/ompi_list.h" +#include "util/show_help.h" struct pid_item_t { @@ -107,7 +108,8 @@ main(int argc, char *argv[]) ret = mca_pcm_base_recv_schedule(stdin, &jobid, sched, &fork_num_procs); if (ret != OMPI_SUCCESS) { - fprintf(stderr, "Failure in receiving schedule information\n"); + ompi_show_help("help-bootproxy.txt", "could-not-receive-schedule", + true, ret); exit(1); } @@ -130,7 +132,8 @@ main(int argc, char *argv[]) if (sched->cwd != NULL) { ret = chdir(sched->cwd); if (ret != 0) { - perror("chdir"); + ompi_show_help("help-bootproxy.txt", "could-not-chdir", + true, sched->cwd, strerror(errno)); exit(1); } } @@ -142,7 +145,9 @@ main(int argc, char *argv[]) pid = fork(); if (pid < 0) { /* error :( */ - perror("fork"); + ompi_show_help("help-bootproxy.txt", "could-not-fork", + true, sched->argv[0], strerror(errno)); + exit(errno); } else if (pid == 0) { /* child */ @@ -158,7 +163,10 @@ main(int argc, char *argv[]) } execvp(sched->argv[0], sched->argv); - perror("exec"); + ompi_show_help("help-bootproxy.txt", "could-not-exec", + true, sched->argv[0], sched->cwd, + strerror(errno)); + exit(errno); } else { /* parent */ diff --git 
a/src/tools/bootproxy/help-bootproxy.txt b/src/tools/bootproxy/help-bootproxy.txt new file mode 100644 index 0000000000..77532bf84a --- /dev/null +++ b/src/tools/bootproxy/help-bootproxy.txt @@ -0,0 +1,53 @@ +# -*- text -*- +# +# $HEADER$ +# +# This is the US/English bootproxy help file for Open MPI. +# +[usage] +Usage: %s --local-offset --global_start_vpid + --num_procs [--high-qos] + + --local_offset Starting vpid from which to assign for all + processes started by this bootproxy + --global_start_vpid Starting vpid for the call to + spawn_procs that resulted in this + bootproxy being launched. + --num_procs Total number of processes started in the + call to spawn_procs that resulted in + this bootproxy being launched + --high-qos Do not exit until all processes started + have exited. Do not close + std{in,out,err}. If one process exits + with non-zero status, kill the remaining + started processes. +[could-not-receive-schedule] +Open MPI was unable to start a process on one or more nodes using the +bootproxy startup mechanism. The bootproxy client was able to +start but was not able to receive the scheduling information from the +starting process. The error returned was %d. +[could-not-chdir] +Open MPI was unable to start a process on one or more nodes using the +bootproxy startup mechanism. The bootproxy client was able to +start and receive startup information, but was unable to set the +current working directory of the new process. + + Current working directory was: %s + The error from chdir was: %s +[could-not-fork] +Open MPI was unable to start a process on one or more nodes using the +bootproxy startup mechanism. The bootproxy client was able to +start and receive startup information, but was unable to fork the new +process. + + I was trying to start: %s + The error from fork was: %s +[could-not-exec] +Open MPI was unable to start a process on one or more nodes using the +bootproxy startup mechanism. 
The bootproxy client was able to +start and receive startup information, but was unable to exec the new +process. + + I was trying to start: %s + Current working directory was: %s + The error from exec was: %s