openmpi/orte/tools/orterun/orterun.h

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Cisco, Inc.  All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */

#ifndef ORTERUN_ORTERUN_H
#define ORTERUN_ORTERUN_H

#include "orte_config.h"

#include "opal/threads/condition.h"
#include "opal/util/cmd_line.h"

#include "orte/runtime/orte_globals.h"

BEGIN_C_DECLS

/**
 * Main body of orterun functionality
 *
 * Only the prototype is visible in this header; the notes below are
 * assumptions based on the conventional (argc, argv) signature and
 * should be confirmed against the implementation.
 *
 * @param argc  presumably the number of entries in argv -- TODO confirm
 * @param argv  presumably the orterun command line as received by
 *              main() -- TODO confirm
 * @return      int status; the meaning of individual values is defined
 *              by the implementation and cannot be told from here.
 */
int orterun(int argc, char *argv[]);

/**
 * Global struct for catching orterun command line options.
 *
 * NOTE(review): the code that fills in and reads these members is not
 * part of this header.  The per-member notes below are derived from
 * the member names and types only; verify them against the command
 * line table and the orterun implementation before relying on them.
 *
 * The members named "exit" and "sleep" share their names with C
 * library functions.  Member names live in their own namespace, so
 * this is legal, but it would break if either name were ever defined
 * as a macro by an included header.
 */
struct globals_t {
    /* Boolean switches; presumably one per on/off command line
     * option -- TODO confirm which option sets each one. */
    bool help;
    bool version;
    bool verbose;
    bool quiet;
    bool exit;
    bool no_wait_for_job_completion;
    bool by_node;
    bool by_slot;
    bool do_not_launch;
    bool debugger;

    /* Integer-valued option; presumably a process count -- TODO confirm. */
    int num_procs;

    /* String-valued options.  Ownership of the pointed-to storage
     * (who allocates, who frees) is not visible from this header. */
    char *env_val;
    char *appfile;
    char *wdir;
    char *path;

    /* Presumably related to staging files before launch -- TODO confirm. */
    bool preload_binary;
    char *preload_files;
    char *preload_files_dest_dir;

    /* Mutex object; what it protects is not visible from this header. */
    opal_mutex_t lock;

    bool sleep;
    char *ompi_server;
};

/**
 * Struct holding values gleaned from the orterun command line
 *
 * This is an extern declaration only; the single definition has to be
 * provided by a source file outside this header.  ORTE_DECLSPEC is
 * defined elsewhere (presumably a symbol export/visibility attribute
 * -- TODO confirm).
 */
ORTE_DECLSPEC extern struct globals_t orterun_globals;

/**
 * Whether orterun_globals has been initialized yet or not
 *
 * Extern declaration only; the definition (and therefore the initial
 * value) lives outside this header.
 */
ORTE_DECLSPEC extern bool globals_init;

/**
 * Array of opal_cmd_line_init_t entries holding the list of allowable
 * command line parameters
 *
 * Declared here with an unspecified bound, so code that only sees this
 * header cannot apply sizeof to it to obtain the number of entries.
 * How the end of the table is marked is not visible from here
 * (presumably a terminating sentinel entry -- TODO confirm against the
 * definition).
 */
ORTE_DECLSPEC extern opal_cmd_line_init_t cmd_line_init[];

END_C_DECLS

#endif /* ORTERUN_ORTERUN_H */
A little more clean-up. TotalView now works with --enable-debug build. Tested with: pls = rsh totalview.6.6.0-2 Linux cadillac82.ccstar.lanl.gov 2.4.24 #1 SMP Thu Jul 1 15:28:04 MDT 2004 i686 i686 i386 GNU/Linux This commit was SVN r7108. 2005-08-31 16:15:59 +00:00			`/*`
Update the copyright notices for IU and UTK. This commit was SVN r7999. 2005-11-05 19:57:48 +00:00			`* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana`
			`* University Research and Technology`
			`* Corporation. All rights reserved.`
			`* Copyright (c) 2004-2005 The University of Tennessee and The University`
			`* of Tennessee Research Foundation. All rights`
			`* reserved.`
A little more clean-up. TotalView now works with --enable-debug build. Tested with: pls = rsh totalview.6.6.0-2 Linux cadillac82.ccstar.lanl.gov 2.4.24 #1 SMP Thu Jul 1 15:28:04 MDT 2004 i686 i686 i386 GNU/Linux This commit was SVN r7108. 2005-08-31 16:15:59 +00:00			`* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,`
			`* University of Stuttgart. All rights reserved.`
			`* Copyright (c) 2004-2005 The Regents of the University of California.`
			`* All rights reserved.`
Add support for DDT parallel debugger, which required several things: * Making some symbols and types be global (vs. static) in orterun * Adding a "ddt" entry in the MCA parameter orte_base_user_debugger default value * Add support for @executable@, @executable_argv@, and @single_app@ tokens in the orte_base_user_debugger MCA parameter. * Added various error checks and corresponding help messages after finding a debugger in the PATH Fixes trac:1081 This commit was SVN r15323. The following Trac tickets were found above: Ticket 1081 --> https://svn.open-mpi.org/trac/ompi/ticket/1081 2007-07-10 12:53:48 +00:00			`* Copyright (c) 2007 Cisco, Inc. All rights reserved.`
A little more clean-up. TotalView now works with --enable-debug build. Tested with: pls = rsh totalview.6.6.0-2 Linux cadillac82.ccstar.lanl.gov 2.4.24 #1 SMP Thu Jul 1 15:28:04 MDT 2004 i686 i686 i386 GNU/Linux This commit was SVN r7108. 2005-08-31 16:15:59 +00:00			`* $COPYRIGHT$`
			`*`
			`* Additional copyrights may follow`
			`*`
			`* $HEADER$`
			`*/`

			`#ifndef ORTERUN_ORTERUN_H`
			`#define ORTERUN_ORTERUN_H`

			`#include "orte_config.h"`

Add support for DDT parallel debugger, which required several things: * Making some symbols and types be global (vs. static) in orterun * Adding a "ddt" entry in the MCA parameter orte_base_user_debugger default value * Add support for @executable@, @executable_argv@, and @single_app@ tokens in the orte_base_user_debugger MCA parameter. * Added various error checks and corresponding help messages after finding a debugger in the PATH Fixes trac:1081 This commit was SVN r15323. The following Trac tickets were found above: Ticket 1081 --> https://svn.open-mpi.org/trac/ompi/ticket/1081 2007-07-10 12:53:48 +00:00			`#include "opal/threads/condition.h"`
This commit contains the following: * Fix some missing includes in a few places. * Add the cr_request() functionality to the BLCR CRS component. We are now dependent upon the 0.6.* series of BLCR. * Made the CR notification mechanism a registered function. This way we can have an OPAL-only version and it can be replaced at runtime with the ORTE version. * Add a 'opal_cr_allow_opal_only' parameter that will enable OPAL-only CR functionality when the user wants it. Default: Disabled. * Fix the placement of a checkpoint request check in MPI_Init * Pull the OPAL notification mechanism into the SnapC framework. * We no longer fork/exec the 'opal-checkpoint' command for local checkpointing, the Local coordinator in the orted does this directly. * The Local and Application coordinator talk together bypassing the OPAL notifiation mechanism. * Optimized the Local <-> App Coordinator communication. * Improved the structure used to track vpid_snapshots in the local coord. * Fix a race condition in which an application under heavy communication load may produce an inconsistent global checkpoint. This commit was SVN r16389. 2007-10-08 20:53:02 +00:00			`#include "opal/util/cmd_line.h"`
Add support for DDT parallel debugger, which required several things: * Making some symbols and types be global (vs. static) in orterun * Adding a "ddt" entry in the MCA parameter orte_base_user_debugger default value * Add support for @executable@, @executable_argv@, and @single_app@ tokens in the orte_base_user_debugger MCA parameter. * Added various error checks and corresponding help messages after finding a debugger in the PATH Fixes trac:1081 This commit was SVN r15323. The following Trac tickets were found above: Ticket 1081 --> https://svn.open-mpi.org/trac/ompi/ticket/1081 2007-07-10 12:53:48 +00:00
Merge the ORTE devel branch into the main trunk. Details of what this means will be circulated separately. Remains to be tested to ensure everything came over cleanly, so please continue to withhold commits a little longer This commit was SVN r17632. 2008-02-28 01:57:57 +00:00			`#include "orte/runtime/orte_globals.h"`

Add support for DDT parallel debugger, which required several things: * Making some symbols and types be global (vs. static) in orterun * Adding a "ddt" entry in the MCA parameter orte_base_user_debugger default value * Add support for @executable@, @executable_argv@, and @single_app@ tokens in the orte_base_user_debugger MCA parameter. * Added various error checks and corresponding help messages after finding a debugger in the PATH Fixes trac:1081 This commit was SVN r15323. The following Trac tickets were found above: Ticket 1081 --> https://svn.open-mpi.org/trac/ompi/ticket/1081 2007-07-10 12:53:48 +00:00			`BEGIN_C_DECLS`

			`/**`
			`* Main body of orterun functionality`
			`*/`
A little more clean-up. TotalView now works with --enable-debug build. Tested with: pls = rsh totalview.6.6.0-2 Linux cadillac82.ccstar.lanl.gov 2.4.24 #1 SMP Thu Jul 1 15:28:04 MDT 2004 i686 i686 i386 GNU/Linux This commit was SVN r7108. 2005-08-31 16:15:59 +00:00			`int orterun(int argc, char *argv[]);`

Add support for DDT parallel debugger, which required several things: * Making some symbols and types be global (vs. static) in orterun * Adding a "ddt" entry in the MCA parameter orte_base_user_debugger default value * Add support for @executable@, @executable_argv@, and @single_app@ tokens in the orte_base_user_debugger MCA parameter. * Added various error checks and corresponding help messages after finding a debugger in the PATH Fixes trac:1081 This commit was SVN r15323. The following Trac tickets were found above: Ticket 1081 --> https://svn.open-mpi.org/trac/ompi/ticket/1081 2007-07-10 12:53:48 +00:00			`/**`
			`* Global struct for catching orterun command line options.`
			`*/`
			`struct globals_t {`
			`bool help;`
			`bool version;`
			`bool verbose;`
			`bool quiet;`
			`bool exit;`
			`bool no_wait_for_job_completion;`
			`bool by_node;`
			`bool by_slot;`
			`bool do_not_launch;`
			`bool debugger;`
			`int num_procs;`
			`char *env_val;`
			`char *appfile;`
			`char *wdir;`
			`char *path;`
			`bool preload_binary;`
			`char *preload_files;`
			`char *preload_files_dest_dir;`
			`opal_mutex_t lock;`
When we can detect that a daemon has failed, then we would like to terminate the system without having it lock up. The "hang" is currently caused by the system attempting to send messages to the daemons (specifically, ordering them to kill their local procs and then terminate). Unfortunately, without some idea of which daemon has died, the system hangs while attempting to send a message to someone who is no longer alive. This commit introduces the necessary logic to avoid that conflict. If a PLS component can identify that a daemon has failed, then we will set a flag indicating that fact. The xcast system will subsequently check that flag and, if it is set, will send all messages direct to the recipient. In the case of "kill local procs" and "terminate", the messages will go directly to each orted, thus bypassing any orted that has failed. In addition, the xcast system will -not- wait for the messages to complete, but will return immediately (i.e., operate in non-blocking mode). Orterun will wait (via an event timer) for a period of time based on the number of daemons in the system to allow the messages to attempt to be delivered - at the end of that time, orterun will simply exit, alerting the user to the problem and -strongly- recommending they run orte-clean. I could only test this on slurm for the case where all daemons unexpectedly died - srun apparently only executes its waitpid callback when all launched functions terminate. I have asked that Jeff integrate this capability into the OOB as he is working on it so that we execute it whenever a socket to an orted is unexpectedly closed. Meantime, the functionality will rarely get called, but at least the logic is available for anyone whose environment can support it. This commit was SVN r16451. 2007-10-15 18:00:30 +00:00			`bool sleep;`
Merge the ORTE devel branch into the main trunk. Details of what this means will be circulated separately. Remains to be tested to ensure everything came over cleanly, so please continue to withhold commits a little longer This commit was SVN r17632. 2008-02-28 01:57:57 +00:00			`char *ompi_server;`
Add support for DDT parallel debugger, which required several things: * Making some symbols and types be global (vs. static) in orterun * Adding a "ddt" entry in the MCA parameter orte_base_user_debugger default value * Add support for @executable@, @executable_argv@, and @single_app@ tokens in the orte_base_user_debugger MCA parameter. * Added various error checks and corresponding help messages after finding a debugger in the PATH Fixes trac:1081 This commit was SVN r15323. The following Trac tickets were found above: Ticket 1081 --> https://svn.open-mpi.org/trac/ompi/ticket/1081 2007-07-10 12:53:48 +00:00			`};`

			`/**`
			`* Struct holding values gleaned from the orterun command line`
			`*/`
			`ORTE_DECLSPEC extern struct globals_t orterun_globals;`

			`/**`
			`* Whether orterun_globals has been initialized yet or not`
			`*/`
			`ORTE_DECLSPEC extern bool globals_init;`

			`/**`
			`* Struct holding list of allowable command line parameters`
			`*/`
			`ORTE_DECLSPEC extern opal_cmd_line_init_t cmd_line_init[];`

			`END_C_DECLS`

A little more clean-up. TotalView now works with --enable-debug build. Tested with: pls = rsh totalview.6.6.0-2 Linux cadillac82.ccstar.lanl.gov 2.4.24 #1 SMP Thu Jul 1 15:28:04 MDT 2004 i686 i686 i386 GNU/Linux This commit was SVN r7108. 2005-08-31 16:15:59 +00:00			`#endif /* ORTERUN_ORTERUN_H */`