/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include #include #ifdef HAVE_UNISTD_H #include #endif #ifdef HAVE_NETDB_H #include #endif #ifdef HAVE_SYS_PARAM_H #include #endif #include #include #include "include/orte_constants.h" #include "opal/runtime/opal.h" #include "opal/threads/mutex.h" #include "opal/threads/condition.h" #include "dps/dps.h" #include "opal/event/event.h" #include "opal/util/argv.h" #include "opal/util/path.h" #include "opal/util/output.h" #include "opal/util/show_help.h" #include "util/sys_info.h" #include "opal/util/os_path.h" #include "opal/util/cmd_line.h" #include "util/proc_info.h" #include "util/univ_info.h" #include "util/session_dir.h" #include "opal/util/printf.h" #include "opal/util/daemon_init.h" #include "util/universe_setup_file_io.h" #include "opal/util/malloc.h" #include "opal/memoryhooks/memory.h" #include "opal/mca/timer/base/base.h" #include "opal/mca/memory/base/base.h" #include "mca/base/base.h" #include "mca/base/mca_base_param.h" #include "mca/rml/base/base.h" #include "mca/rml/rml.h" #include "mca/errmgr/base/base.h" #include "mca/ns/base/base.h" #include "mca/gpr/base/base.h" #include "mca/schema/base/base.h" #include "mca/soh/base/base.h" #include "runtime/runtime.h" #include "runtime/orte_wait.h" #include "tools/orteprobe/orteprobe.h" orteprobe_globals_t orteprobe_globals; /* * define the orteprobe context table for obtaining parameters */ opal_cmd_line_init_t orte_cmd_line_opts[] = { /* Various "obvious" options */ { NULL, NULL, NULL, 'h', NULL, "help", 0, &orteprobe_globals.help, OPAL_CMD_LINE_TYPE_BOOL, "This help message" }, { NULL, NULL, NULL, '\0', NULL, "verbose", 0, &orteprobe_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, "Toggle Verbosity" }, { NULL, NULL, NULL, 'd', NULL, "debug", 0, &orteprobe_globals.debug, OPAL_CMD_LINE_TYPE_BOOL, "Run in debug mode (not generally intended for users)" }, { NULL, NULL, NULL, '\0', NULL, "name", 1, &orteprobe_globals.name_string, OPAL_CMD_LINE_TYPE_STRING, "Set the orte process name"}, { NULL, NULL, NULL, '\0', NULL, "nsreplica", 1, &orte_process_info.ns_replica_uri, OPAL_CMD_LINE_TYPE_STRING, "Name service contact information."}, { NULL, NULL, NULL, '\0', NULL, "gprreplica", 1, &orte_process_info.gpr_replica_uri, OPAL_CMD_LINE_TYPE_STRING, "Registry contact information."}, { NULL, NULL, NULL, '\0', NULL, "nodename", 1, &orte_system_info.nodename, OPAL_CMD_LINE_TYPE_STRING, "Node name as specified by host/resource description." }, { NULL, NULL, NULL, '\0', NULL, "requestor", 1, &orteprobe_globals.requestor_string, OPAL_CMD_LINE_TYPE_STRING, "Set the orte process name"}, { "seed", NULL, NULL, '\0', NULL, "seed", 0, &orte_process_info.seed, OPAL_CMD_LINE_TYPE_BOOL, "seed"}, { "universe", "persistence", NULL, '\0', NULL, "persistent", 0, &orte_universe_info.persistence, OPAL_CMD_LINE_TYPE_BOOL, "persistent"}, { "universe", "scope", NULL, '\0', NULL, "scope", 1, &orte_universe_info.scope, OPAL_CMD_LINE_TYPE_STRING, "scope"}, /* End of list */ { NULL, NULL, NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } }; extern char **environ; int main(int argc, char *argv[]) { int ret = 0, ortedargc; opal_cmd_line_t *cmd_line = NULL; char *contact_path = NULL, *orted=NULL; char *log_path = NULL, **ortedargv; char *universe, orted_uri[256], **orted_uri_ptr, *path, *param; orte_universe_t univ; orte_buffer_t buffer; orte_process_name_t requestor; int id, orted_pipe[2]; pid_t pid; #if defined(HAVE_FORK) && defined(HAVE_PIPE) if (ORTE_SUCCESS != (ret = opal_init())) { return ret; } /* setup to check common command line options that just report and die */ memset(&orteprobe_globals, 0, sizeof(orteprobe_globals)); cmd_line = OBJ_NEW(opal_cmd_line_t); opal_cmd_line_create(cmd_line, orte_cmd_line_opts); ret = opal_cmd_line_parse(cmd_line, true, argc, argv); if (OMPI_SUCCESS != ret) { return ret; } /* check for help request */ if (orteprobe_globals.help) { char *args = NULL; args = opal_cmd_line_get_usage_msg(cmd_line); opal_show_help("help-orteprobe.txt", "orteprobe:usage", false, argv[0], args); free(args); return 1; } /* * Attempt to parse the probe's name and save in proc_info */ if (orteprobe_globals.name_string) { ret = orte_ns_base_convert_string_to_process_name( &orte_process_info.my_name, orteprobe_globals.name_string); if(ORTE_SUCCESS != ret) { fprintf(stderr, "orteprobe: Couldn't convert environmental string to probe's process name\n"); return 1; } } /* * If threads are supported - assume that we are using threads - and reset otherwise. */ opal_set_using_threads(OMPI_HAVE_THREAD_SUPPORT); /* Ensure the system_info structure is instantiated and initialized */ if (ORTE_SUCCESS != (ret = orte_sys_info())) { return ret; } /* Ensure the process info structure is instantiated and initialized */ if (ORTE_SUCCESS != (ret = orte_proc_info())) { return ret; } /* Ensure the universe_info structure is instantiated and initialized */ if (ORTE_SUCCESS != (ret = orte_univ_info())) { return ret; } /* * Initialize the data packing service. */ if (ORTE_SUCCESS != (ret = orte_dps_open())) { ORTE_ERROR_LOG(ret); return ret; } /* * Open the name services to ensure access to local functions */ if (OMPI_SUCCESS != (ret = orte_ns_base_open())) { return ret; } /* Open the error manager to activate error logging - needs local name services */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { return ret; } /***** ERROR LOGGING NOW AVAILABLE *****/ /* * Initialize the event library */ if (OMPI_SUCCESS != (ret = opal_event_init())) { ORTE_ERROR_LOG(ret); return ret; } /* * Intialize the general progress engine */ if (OMPI_SUCCESS != (ret = opal_progress_init())) { ORTE_ERROR_LOG(ret); return ret; } /* * Internal startup */ if (OMPI_SUCCESS != (ret = orte_wait_init())) { ORTE_ERROR_LOG(ret); return ret; } /* * Runtime Messaging Layer */ if (OMPI_SUCCESS != (ret = orte_rml_base_open())) { ORTE_ERROR_LOG(ret); return ret; } /* * Runtime Messaging Layer */ if (OMPI_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); return ret; } /* * Registry */ if (ORTE_SUCCESS != (ret = orte_gpr_base_open())) { ORTE_ERROR_LOG(ret); return ret; } /* * Initialize schema utilities */ if (ORTE_SUCCESS != (ret = orte_schema_base_open())) { ORTE_ERROR_LOG(ret); return ret; } /* * Attempt to parse the requestor's name and contact info */ if (orteprobe_globals.requestor_string) { if(ORTE_SUCCESS != (ret = orte_rml.parse_uris( orteprobe_globals.requestor_string, &requestor, NULL))) { fprintf(stderr, "orteprobe: Couldn't parse environmental string for requestor's contact info\n"); return 1; } /* set the contact info */ if (ORTE_SUCCESS != (ret = orte_rml.set_uri(orteprobe_globals.requestor_string))) { fprintf(stderr, "orteprobe: Couldn't set contact info for requestor\n"); return ret; } } else { fprintf(stderr, "orteprobe: No contact info received for requestor\n"); return 1; } /* see if a universe already exists on this machine and * will allow contact with us */ if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) { /* universe is here! send info back and die */ if(orteprobe_globals.verbose) fprintf(stderr, "orteprobe: Contacted existing universe - sending contact info back\n"); OBJ_CONSTRUCT(&buffer, orte_buffer_t); orted_uri_ptr = &(univ.seed_uri); if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) { fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n"); orte_abort(1, NULL); } if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) { fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n"); OBJ_DESTRUCT(&buffer); orte_abort(1, NULL); } OBJ_DESTRUCT(&buffer); } else { /* existing universe is not here or does not allow contact. * ensure we have a unique universe name, fork/exec an appropriate * daemon, and then tell whomever spawned us how to talk to the new * daemon */ if(orteprobe_globals.verbose) fprintf(stderr, "orteprobe: Could not connect to existing universe\n"); if (ORTE_ERR_NOT_FOUND != ret) { if(orteprobe_globals.verbose) fprintf(stderr, "orteprobe: Existing universe did not respond\n"); /* if it exists but no contact could be established, * define unique name based on current one. */ universe = strdup(orte_universe_info.name); free(orte_universe_info.name); orte_universe_info.name = NULL; pid = getpid(); if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) { fprintf(stderr, "orteprobe: failed to create unique universe name"); orte_abort(1, NULL); } } /* setup to fork/exec the new universe */ /* setup the pipe to get the contact info back */ if (pipe(orted_pipe)) { fprintf (stderr, "orteprobe: Pipe failed\n"); orte_abort(1, NULL); } /* get name of orted application - just in case user specified something different */ id = mca_base_param_register_string("orted",NULL,NULL,NULL,"orted"); mca_base_param_lookup_string(id, &orted); if(orteprobe_globals.verbose) fprintf(stderr, "orteprobe: Using \"%s\" for orted command\n", orted); /* Initialize the argv array */ ortedargv = opal_argv_split(orted, ' '); ortedargc = opal_argv_count(ortedargv); if (ortedargc <= 0) { fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n"); orte_abort(1, NULL); } /* setup the path */ path = opal_path_findv(ortedargv[0], 0, environ, NULL); if(orteprobe_globals.verbose) fprintf(stderr, "orteprobe: Path setup as \"%s\"\n", path); /* tell the daemon it's the seed */ opal_argv_append(&ortedargc, &ortedargv, "--seed"); /* tell the daemon it's scope */ opal_argv_append(&ortedargc, &ortedargv, "--scope"); opal_argv_append(&ortedargc, &ortedargv, orte_universe_info.scope); /* tell the daemon if it's to be persistent */ if (orte_universe_info.persistence) { opal_argv_append(&ortedargc, &ortedargv, "--persistent"); } /* tell the daemon to report its uri to us */ asprintf(¶m, "%d", orted_pipe[1]); opal_argv_append(&ortedargc, &ortedargv, "--report-uri"); opal_argv_append(&ortedargc, &ortedargv, param); free(param); if(orteprobe_globals.verbose) fprintf(stderr, "orteprobe: Forking now\n"); /* Create the child process. */ pid = fork (); if (pid == (pid_t) 0) { /* This is the child process. Close read end first. */ execv(path, ortedargv); fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno); orte_abort(1, NULL); } else if (pid < (pid_t) 0) { /* The fork failed. */ fprintf (stderr, "orteprobe: Fork failed\n"); orte_abort(1, NULL); } else { /* This is the parent process. Close write end first. */ if(orteprobe_globals.verbose) fprintf(stderr, "orteprobe: Attempting to read from daemon\n"); read(orted_pipe[0], orted_uri, 255); close(orted_pipe[0]); /* send back the info */ OBJ_CONSTRUCT(&buffer, orte_buffer_t); param = orted_uri; orted_uri_ptr = ¶m; if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) { fprintf(stderr, "orteprobe: failed to pack daemon uri\n"); orte_abort(1, NULL); } if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) { fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n"); OBJ_DESTRUCT(&buffer); orte_abort(1, NULL); } OBJ_DESTRUCT(&buffer); } } if(orteprobe_globals.verbose) fprintf(stderr, "orteprobe: All finished!\n"); /* cleanup */ if (NULL != contact_path) { unlink(contact_path); } if (NULL != log_path) { unlink(log_path); } /* finalize the system */ orte_finalize(); opal_finalize(); exit(0); #else /* HAVE_FORK && HAVE_PIPE */ fprintf(stderr, "orteprobe: system appears to not support remote probes\n"); exit(1); #endif }