1
1
openmpi/orte/tools/orteprobe/orteprobe.c
Ralph Castain ab5ea61100 Bring over the rest of the ctrl-c fixes. This commit includes:
1. add a "cancel_operation" API to the pls components that allows orterun to demand that an orted operation (e.g., terminate_job) be immediately cancelled and abandoned.

2. changes the pls orted commands from blocking to non-blocking. This allows us to interrupt those operations should an orted be non-responsive. The change also adds an orte_abort_timeout that limits how long orterun will automatically wait for the orteds to respond - if the terminate command, for example, doesn't see an orted response within that time, then we print out an appropriate error message and just give up.

3. modifies orterun to allow multiple ctrl-c's to simply abort the program even if the orteds have not responded

4. does some cleanup on the orte-level mca params so that their implementation looks a lot more like that of ompi - makes it easier to maintain. This change also includes the definition of an orte_abort_timeout struct and associated MCA param (can't have too many!) so you can set the time after which orterun gives up on waiting for orteds to respond

This needs more testing before migrating to 1.2.

This commit was SVN r13304.
2007-01-25 14:17:44 +00:00

478 строки
15 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <stdio.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include "orte/orte_constants.h"
#include "opal/runtime/opal.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/event/event.h"
#include "opal/util/argv.h"
#include "opal/util/path.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/os_path.h"
#include "opal/util/cmd_line.h"
#include "opal/util/printf.h"
#include "opal/util/daemon_init.h"
#include "opal/util/malloc.h"
#include "opal/memoryhooks/memory.h"
#include "opal/mca/timer/base/base.h"
#include "opal/mca/memory/base/base.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/dss/dss.h"
#include "orte/util/sys_info.h"
#include "orte/util/proc_info.h"
#include "orte/util/univ_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"
#include "orte/mca/rml/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/gpr/base/base.h"
#include "orte/mca/schema/base/base.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"
#include "orte/runtime/orte_wait.h"
#include "orte/tools/orteprobe/orteprobe.h"
#if 0
/* Destination for the option values parsed from the orteprobe command line. */
orteprobe_globals_t orteprobe_globals;

/*
 * Command-line option table for orteprobe; main() hands it to
 * opal_cmd_line_create().
 *
 * Each entry supplies, among other fields, a single-character option
 * ('\0' when there is none), the long option name, the number of
 * arguments the option consumes, the address that receives the parsed
 * value, the type of that value, and the help text shown by --help.
 * NOTE(review): the exact field layout comes from opal_cmd_line_init_t
 * (opal/util/cmd_line.h), which is not visible here -- the leading
 * string fields look like MCA parameter name components; confirm.
 *
 * The whole table is currently compiled out together with the body of
 * main().
 */
opal_cmd_line_init_t orte_cmd_line_opts[] = {
    /* Various "obvious" options */
    { NULL, NULL, NULL, 'h', NULL, "help", 0,
      &orteprobe_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },

    { NULL, NULL, NULL, '\0', NULL, "verbose", 0,
      &orteprobe_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
      "Toggle Verbosity" },

    { NULL, NULL, NULL, 'd', NULL, "debug", 0,
      &orteprobe_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
      "Run in debug mode (not generally intended for users)" },

    { NULL, NULL, NULL, '\0', NULL, "name", 1,
      &orteprobe_globals.name_string, OPAL_CMD_LINE_TYPE_STRING,
      "Set the orte process name"},

    { NULL, NULL, NULL, '\0', NULL, "nsreplica", 1,
      &orte_process_info.ns_replica_uri, OPAL_CMD_LINE_TYPE_STRING,
      "Name service contact information."},

    { NULL, NULL, NULL, '\0', NULL, "gprreplica", 1,
      &orte_process_info.gpr_replica_uri, OPAL_CMD_LINE_TYPE_STRING,
      "Registry contact information."},

    { NULL, NULL, NULL, '\0', NULL, "nodename", 1,
      &orte_system_info.nodename, OPAL_CMD_LINE_TYPE_STRING,
      "Node name as specified by host/resource description." },

    /* main() parses this value with orte_rml.parse_uris()/set_uri(), so it
     * is the requestor's contact URI rather than a process name; the help
     * text used to be a copy of the one for --name. */
    { NULL, NULL, NULL, '\0', NULL, "requestor", 1,
      &orteprobe_globals.requestor_string, OPAL_CMD_LINE_TYPE_STRING,
      "Contact information (URI) of the requesting process"},

    { "seed", NULL, NULL, '\0', NULL, "seed", 0,
      &orte_process_info.seed, OPAL_CMD_LINE_TYPE_BOOL,
      "seed"},

    { "universe", "persistence", NULL, '\0', NULL, "persistent", 0,
      &orte_universe_info.persistence, OPAL_CMD_LINE_TYPE_BOOL,
      "persistent"},

    { "universe", "scope", NULL, '\0', NULL, "scope", 1,
      &orte_universe_info.scope, OPAL_CMD_LINE_TYPE_STRING,
      "scope"},

    /* End of list */
    { NULL, NULL, NULL, '\0', NULL, NULL, 0,
      NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
};
#endif
/*
 * Hand-written declaration of the process environment vector for every
 * platform except Windows.  The (currently disabled) code in main() passes
 * it to opal_path_findv() when locating the orted executable.
 */
#ifndef __WINDOWS__
extern char **environ;
#endif /* !__WINDOWS__ */
/*
 * orteprobe entry point.
 *
 * The entire implementation is compiled out with "#if 0", so the program
 * as built does nothing except exit with status 1.  The disabled code is
 * kept for reference.  When enabled (and when fork() and pipe() are
 * available) it:
 *
 *   1. initializes OPAL, registers MCA parameters and parses the command
 *      line (printing usage and returning 1 for --help);
 *   2. opens the ORTE subsystems it needs (dss, ns, errmgr, event library,
 *      progress engine, wait, rml, gpr, schema);
 *   3. parses and records the requestor's contact URI (mandatory);
 *   4. if orte_universe_exists() succeeds, sends that universe's seed URI
 *      back to the requestor;
 *   5. otherwise fork/execs an orted with "--seed --scope <scope>
 *      [--persistent] --report-uri <fd>", reads the daemon's URI from a
 *      pipe and sends it back to the requestor;
 *   6. finalizes ORTE/OPAL and exits 0.
 *
 * argc/argv are the usual program arguments; they are only consulted by
 * the disabled code.
 *
 * Returns/exits: 1 in the current build.  In the disabled code, early
 * failures return the failing call's status (or 1), later failures are
 * reported through orte_errmgr.error_detected(), and success exits 0.
 */
int main(int argc, char *argv[])
{
#if 0
    int ret = 0, ortedargc;
    opal_cmd_line_t *cmd_line = NULL;
    char *contact_path = NULL, *orted = NULL;
    char *log_path = NULL, **ortedargv;
    char *universe, orted_uri[256], **orted_uri_ptr, *path, *param;
    orte_universe_t univ;
    orte_buffer_t buffer;
    orte_process_name_t requestor;
    int id, orted_pipe[2];
    pid_t pid;

#if defined(HAVE_FORK) && defined(HAVE_PIPE)
    if (ORTE_SUCCESS != (ret = opal_init())) {
        return ret;
    }

    /* Setup MCA params */
    mca_base_param_init();
    orte_register_params(false);

    /* setup to check common command line options that just report and die */
    memset(&orteprobe_globals, 0, sizeof(orteprobe_globals));
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    opal_cmd_line_create(cmd_line, orte_cmd_line_opts);
    ret = opal_cmd_line_parse(cmd_line, true, argc, argv);
    if (ORTE_SUCCESS != ret) {
        return ret;
    }

    /* check for help request */
    if (orteprobe_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        opal_show_help("help-orteprobe.txt", "orteprobe:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }

    /*
     * Attempt to parse the probe's name and save in proc_info
     */
    if (orteprobe_globals.name_string) {
        ret = orte_ns.convert_string_to_process_name(
            &orte_process_info.my_name, orteprobe_globals.name_string);
        if (ORTE_SUCCESS != ret) {
            fprintf(stderr, "orteprobe: Couldn't convert environmental string to probe's process name\n");
            return 1;
        }
    }

    /*
     * If threads are supported - assume that we are using threads - and reset otherwise.
     */
    opal_set_using_threads(OMPI_HAVE_THREAD_SUPPORT);

    /* Ensure the system_info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_sys_info())) {
        return ret;
    }

    /* Ensure the process info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        return ret;
    }

    /* Ensure the universe_info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_univ_info())) {
        return ret;
    }

    /*
     * Initialize the data storage service.
     */
    if (ORTE_SUCCESS != (ret = orte_dss_open())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Open the name services to ensure access to local functions
     */
    if (ORTE_SUCCESS != (ret = orte_ns_base_open())) {
        return ret;
    }

    /* Open the error manager to activate error logging - needs local name services */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
        return ret;
    }

    /***** ERROR LOGGING NOW AVAILABLE *****/

    /*
     * Initialize the event library
     */
    if (ORTE_SUCCESS != (ret = opal_event_init())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Initialize the general progress engine
     */
    if (ORTE_SUCCESS != (ret = opal_progress_init())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Internal startup
     */
    if (ORTE_SUCCESS != (ret = orte_wait_init())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Runtime Messaging Layer - open the framework
     */
    if (ORTE_SUCCESS != (ret = orte_rml_base_open())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Runtime Messaging Layer - select a component
     */
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Registry
     */
    if (ORTE_SUCCESS != (ret = orte_gpr_base_open())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Initialize schema utilities
     */
    if (ORTE_SUCCESS != (ret = orte_schema_base_open())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Attempt to parse the requestor's name and contact info.  Without it
     * there is nobody to report to, so its absence is fatal.
     */
    if (orteprobe_globals.requestor_string) {
        if (ORTE_SUCCESS != (ret = orte_rml.parse_uris(
                orteprobe_globals.requestor_string, &requestor, NULL))) {
            fprintf(stderr, "orteprobe: Couldn't parse environmental string for requestor's contact info\n");
            return 1;
        }
        /* set the contact info */
        if (ORTE_SUCCESS != (ret = orte_rml.set_uri(orteprobe_globals.requestor_string))) {
            fprintf(stderr, "orteprobe: Couldn't set contact info for requestor\n");
            return ret;
        }
    } else {
        fprintf(stderr, "orteprobe: No contact info received for requestor\n");
        return 1;
    }

    /* see if a universe already exists on this machine and
     * will allow contact with us
     */
    if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) {
        /* universe is here! send info back and die */
        if (orteprobe_globals.verbose) {
            fprintf(stderr, "orteprobe: Contacted existing universe - sending contact info back\n");
        }

        OBJ_CONSTRUCT(&buffer, orte_buffer_t);
        orted_uri_ptr = &(univ.seed_uri);
        /* Pass the char** itself, exactly as the fork/exec branch below
         * does; this call used to pass &orted_uri_ptr (a char***), i.e. one
         * level of indirection more than the other pack() call.
         * NOTE(review): assumes the fork/exec branch's form is the correct
         * one for ORTE_STRING -- confirm against the pack() declaration. */
        if (ORTE_SUCCESS != (ret = orte_dss.pack(&buffer, orted_uri_ptr, 1, ORTE_STRING))) {
            fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n");
            orte_errmgr.error_detected(1, NULL);
        }
        if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
            fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n");
            OBJ_DESTRUCT(&buffer);
            orte_errmgr.error_detected(1, NULL);
        }
        OBJ_DESTRUCT(&buffer);
    } else {
        /* existing universe is not here or does not allow contact.
         * ensure we have a unique universe name, fork/exec an appropriate
         * daemon, and then tell whomever spawned us how to talk to the new
         * daemon
         */
        if (orteprobe_globals.verbose) {
            fprintf(stderr, "orteprobe: Could not connect to existing universe\n");
        }

        if (ORTE_ERR_NOT_FOUND != ret) {
            if (orteprobe_globals.verbose) {
                fprintf(stderr, "orteprobe: Existing universe did not respond\n");
            }
            /* if it exists but no contact could be established,
             * define unique name based on current one.
             *
             * Take ownership of the old name instead of duplicating it, and
             * release it once the new name has been built (the duplicate
             * used to be leaked).
             */
            universe = orte_universe_info.name;
            orte_universe_info.name = NULL;
            pid = getpid();
            if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, (int) pid)) {
                fprintf(stderr, "orteprobe: failed to create unique universe name\n");
                orte_errmgr.error_detected(1, NULL);
            }
            free(universe);
        }

        /* setup to fork/exec the new universe */

        /* setup the pipe to get the contact info back */
        if (pipe(orted_pipe)) {
            fprintf(stderr, "orteprobe: Pipe failed\n");
            orte_errmgr.error_detected(1, NULL);
        }

        /* get name of orted application - just in case user specified something different */
        id = mca_base_param_register_string("orted", NULL, NULL, NULL, "orted");
        mca_base_param_lookup_string(id, &orted);
        if (orteprobe_globals.verbose) {
            fprintf(stderr, "orteprobe: Using \"%s\" for orted command\n", orted);
        }

        /* Initialize the argv array */
        ortedargv = opal_argv_split(orted, ' ');
        ortedargc = opal_argv_count(ortedargv);
        if (ortedargc <= 0) {
            fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n");
            orte_errmgr.error_detected(1, NULL);
        }

        /* setup the path */
        path = opal_path_findv(ortedargv[0], 0, environ, NULL);
        if (NULL == path) {
            /* Without an executable path there is nothing to exec, and a
             * NULL must not reach the "%s" below or execv().  It is not
             * visible here whether error_detected() returns, so bail out
             * explicitly as well. */
            fprintf(stderr, "orteprobe: could not locate the orted executable \"%s\"\n",
                    ortedargv[0]);
            orte_errmgr.error_detected(1, NULL);
            return 1;
        }
        if (orteprobe_globals.verbose) {
            fprintf(stderr, "orteprobe: Path setup as \"%s\"\n", path);
        }

        /* tell the daemon it's the seed */
        opal_argv_append(&ortedargc, &ortedargv, "--seed");

        /* tell the daemon its scope */
        opal_argv_append(&ortedargc, &ortedargv, "--scope");
        opal_argv_append(&ortedargc, &ortedargv, orte_universe_info.scope);

        /* tell the daemon if it's to be persistent */
        if (orte_universe_info.persistence) {
            opal_argv_append(&ortedargc, &ortedargv, "--persistent");
        }

        /* tell the daemon to report its uri to us: pass the number of the
         * pipe's write end, which the exec'd daemon inherits */
        if (0 > asprintf(&param, "%d", orted_pipe[1])) {
            /* param is indeterminate after a failed asprintf(); never
             * append it to the argv */
            fprintf(stderr, "orteprobe: failed to create report-uri argument\n");
            orte_errmgr.error_detected(1, NULL);
            return 1;
        }
        opal_argv_append(&ortedargc, &ortedargv, "--report-uri");
        opal_argv_append(&ortedargc, &ortedargv, param);
        free(param);

        if (orteprobe_globals.verbose) {
            fprintf(stderr, "orteprobe: Forking now\n");
        }

        /* Create the child process. */
        pid = fork();
        if (pid == (pid_t) 0) {
            /* This is the child process.
               Close read end first - only the write end is handed to the
               daemon. */
            close(orted_pipe[0]);
            execv(path, ortedargv);
            /* only reached when execv() itself failed */
            fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno);
            orte_errmgr.error_detected(1, NULL);
        } else if (pid < (pid_t) 0) {
            /* The fork failed. */
            fprintf(stderr, "orteprobe: Fork failed\n");
            orte_errmgr.error_detected(1, NULL);
        } else {
            /* This is the parent process.
               Close write end first, so that read() below sees end-of-file
               instead of blocking forever if the child dies without
               reporting. */
            ssize_t nread;

            close(orted_pipe[1]);
            if (orteprobe_globals.verbose) {
                fprintf(stderr, "orteprobe: Attempting to read from daemon\n");
            }

            /* A single read of at most sizeof(orted_uri) - 1 bytes, as
             * before: reading until end-of-file could block if the daemon
             * keeps its end of the pipe open. */
            nread = read(orted_pipe[0], orted_uri, sizeof(orted_uri) - 1);
            close(orted_pipe[0]);
            if (nread <= 0) {
                fprintf(stderr, "orteprobe: failed to read contact info from daemon\n");
                orte_errmgr.error_detected(1, NULL);
                return 1;
            }
            /* read() does not terminate the data, but it is packed as a
             * string below */
            orted_uri[nread] = '\0';

            /* send back the info */
            OBJ_CONSTRUCT(&buffer, orte_buffer_t);
            param = orted_uri;
            orted_uri_ptr = &param;
            if (ORTE_SUCCESS != (ret = orte_dss.pack(&buffer, orted_uri_ptr, 1, ORTE_STRING))) {
                fprintf(stderr, "orteprobe: failed to pack daemon uri\n");
                orte_errmgr.error_detected(1, NULL);
            }
            if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
                fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n");
                OBJ_DESTRUCT(&buffer);
                orte_errmgr.error_detected(1, NULL);
            }
            OBJ_DESTRUCT(&buffer);
        }
    }

    if (orteprobe_globals.verbose) {
        fprintf(stderr, "orteprobe: All finished!\n");
    }

    /* cleanup - neither path is ever assigned in this function, so these
     * unlink() calls are currently never executed */
    if (NULL != contact_path) {
        unlink(contact_path);
    }
    if (NULL != log_path) {
        unlink(log_path);
    }

    /* finalize the system */
    orte_finalize();
    opal_finalize();
    exit(0);
#else /* HAVE_FORK && HAVE_PIPE */
    fprintf(stderr, "orteprobe: system appears to not support remote probes\n");
    exit(1);
#endif
#endif
    /* The implementation above is disabled, so the tool always fails. */
    exit(1);
}