From adf9c7cb81cb084a827d767c456ed66927880403 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 15 Dec 2004 18:52:15 +0000 Subject: [PATCH] * fix book keeping error that caused badness when 1) processes were started on multiple nodes and 2) the processes unexpectedly quit This commit was SVN r3823. --- src/mca/pcm/rsh/help-mca-pcm-rsh.txt | 5 +++++ src/mca/pcm/rsh/pcm_rsh_spawn.c | 15 +++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/mca/pcm/rsh/help-mca-pcm-rsh.txt b/src/mca/pcm/rsh/help-mca-pcm-rsh.txt index 9a1b66425f..15e68f7d3f 100644 --- a/src/mca/pcm/rsh/help-mca-pcm-rsh.txt +++ b/src/mca/pcm/rsh/help-mca-pcm-rsh.txt @@ -42,3 +42,8 @@ Open MPI was unable to start a process on one or more nodes using the rsh startup mechanism. Open MPI was able to start the bootproxy on the remote node, but was unable to send the application schema to the remote bootproxy. The error was %s. +[spawn:no-process-status] +Open MPI was unable to find the process status entry for process name +%s. This is usually indicative of an internal bug. Please let +the developers know how you came across this message so that it can +be fixed. 
diff --git a/src/mca/pcm/rsh/pcm_rsh_spawn.c b/src/mca/pcm/rsh/pcm_rsh_spawn.c index 849a0273f7..4709261396 100644 --- a/src/mca/pcm/rsh/pcm_rsh_spawn.c +++ b/src/mca/pcm/rsh/pcm_rsh_spawn.c @@ -538,7 +538,7 @@ internal_spawn_proc(mca_pcm_rsh_module_t *me, proc_cleanup: if (high_qos) { - for (i = 0 ; i < num_procs ; ++i) { + for (i = 0 ; i < start_node->count ; ++i) { ompi_process_name_t *name; name = ompi_name_server.create_process_name(0, jobid, my_start_vpid + i); @@ -612,9 +612,16 @@ internal_wait_cb(pid_t pid, int status, void *data) /* unregister all the procs */ for (i = 0 ; i < procs_len ; ++i) { proc_status = ompi_rte_get_process_status(procs[i]); - proc_status->status_key = OMPI_PROC_KILLED; - proc_status->exit_code = (ompi_exit_code_t)status; - ompi_rte_set_process_status(proc_status, procs[i]); + if (NULL == proc_status) { + char *name = ompi_name_server.get_proc_name_string(procs[i]); + ompi_show_help("help-mca-pcm-rsh.txt", + "spawn:no-process-status", true, name, status); + free(name); + } else { + proc_status->status_key = OMPI_PROC_KILLED; + proc_status->exit_code = (ompi_exit_code_t)status; + ompi_rte_set_process_status(proc_status, procs[i]); + } free(procs[i]); }