2008-02-28 04:57:57 +03:00
|
|
|
/*
|
2010-03-13 02:57:50 +03:00
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
2008-02-28 04:57:57 +03:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2008-02-28 08:32:23 +03:00
|
|
|
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
2008-02-28 04:57:57 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2009-01-30 21:50:10 +03:00
|
|
|
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
2008-02-28 04:57:57 +03:00
|
|
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*
|
|
|
|
* Global params for OpenRTE
|
|
|
|
*/
|
|
|
|
#ifndef ORTE_RUNTIME_ORTE_GLOBALS_H
|
|
|
|
#define ORTE_RUNTIME_ORTE_GLOBALS_H
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/types.h"
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#ifdef HAVE_SYS_TIME_H
|
|
|
|
#include <sys/time.h>
|
|
|
|
#endif
|
|
|
|
|
2008-02-28 08:32:23 +03:00
|
|
|
#include "opal/class/opal_pointer_array.h"
|
2009-06-17 06:54:20 +04:00
|
|
|
#include "opal/class/opal_value_array.h"
|
2010-04-23 08:44:41 +04:00
|
|
|
#include "opal/threads/threads.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
#include "orte/mca/plm/plm_types.h"
|
2010-04-23 08:44:41 +04:00
|
|
|
#include "orte/mca/rml/rml_types.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/util/proc_info.h"
|
2009-03-18 00:34:30 +03:00
|
|
|
#include "orte/util/name_fns.h"
|
2008-06-18 07:15:56 +04:00
|
|
|
#include "orte/runtime/runtime.h"
|
2008-08-05 19:09:29 +04:00
|
|
|
#include "orte/runtime/orte_wait.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
|
2008-11-24 22:57:08 +03:00
|
|
|
/*
 * Global run-time settings. These are declarations only; the comment on
 * each line names the source file that holds the actual definition.
 */
ORTE_DECLSPEC extern int orte_debug_verbosity;  /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern char *orte_prohibited_session_dirs;  /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern bool orte_xml_output;  /* instantiated in orte/runtime/orte_globals.c */
ORTE_DECLSPEC extern FILE *orte_xml_fp;   /* instantiated in orte/runtime/orte_globals.c */
ORTE_DECLSPEC extern bool orte_help_want_aggregate;  /* instantiated in orte/util/show_help.c */
ORTE_DECLSPEC extern char *orte_job_ident;  /* instantiated in orte/runtime/orte_globals.c */
ORTE_DECLSPEC extern bool orte_create_session_dirs;  /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern bool orte_execute_quiet;  /* instantiated in orte/runtime/orte_globals.c */
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
/* Shortcut for some commonly used names */
|
|
|
|
#define ORTE_NAME_WILDCARD (&orte_name_wildcard)
|
2009-04-29 06:13:14 +04:00
|
|
|
ORTE_DECLSPEC extern orte_process_name_t orte_name_wildcard; /** instantiated in orte/runtime/orte_init.c */
|
2008-09-01 21:15:01 +04:00
|
|
|
#define ORTE_NAME_INVALID (&orte_name_invalid)
|
2009-04-29 06:13:14 +04:00
|
|
|
ORTE_DECLSPEC extern orte_process_name_t orte_name_invalid; /** instantiated in orte/runtime/orte_init.c */
|
2008-09-01 21:15:01 +04:00
|
|
|
|
2009-03-06 00:56:03 +03:00
|
|
|
#define ORTE_PROC_MY_NAME (&orte_process_info.my_name)
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
/* define a special name that belongs to orterun */
|
2009-03-06 00:56:03 +03:00
|
|
|
#define ORTE_PROC_MY_HNP (&orte_process_info.my_hnp)
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
/* define the name of my daemon */
|
2009-03-06 00:56:03 +03:00
|
|
|
#define ORTE_PROC_MY_DAEMON (&orte_process_info.my_daemon)
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
/* See comment in orte/tools/orterun/debuggers.c about this MCA
|
|
|
|
param */
|
|
|
|
ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
|
|
|
|
|
2010-01-07 21:14:03 +03:00
|
|
|
/* error manager callback function */
|
|
|
|
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
|
|
|
|
|
|
|
|
typedef uint16_t orte_mapping_policy_t;
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
#if ORTE_DISABLE_FULL_SUPPORT
|
|
|
|
|
|
|
|
/* These types are used in interface functions that should never be
   used or implemented in the non-full interface, but need to be
   declared for various reasons.  So have a dummy type to keep things
   simple (and throw an error if someone does try to use them).
   Only the incomplete type is declared, so pointers to it are usable
   but it cannot be instantiated or dereferenced. */
struct orte_job_t;
typedef struct orte_job_t orte_job_t;
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/* sizing values for the global arrays.
 * NOTE: INT_MAX comes from <limits.h>; that header must be visible
 * wherever ORTE_GLOBAL_ARRAY_MAX_SIZE is expanded. */
#define ORTE_GLOBAL_ARRAY_BLOCK_SIZE    64
#define ORTE_GLOBAL_ARRAY_MAX_SIZE      INT_MAX

/* define a default error return code for ORTE */
#define ORTE_ERROR_DEFAULT_EXIT_CODE    1
|
2008-02-28 04:57:57 +03:00
|
|
|
|
2008-08-05 19:09:29 +04:00
|
|
|
/**
 * Define a macro for updating the orte_exit_status
 * The macro provides a convenient way of doing this
 * so that we can add thread locking at some point
 * since the orte_exit_status is a global variable.
 *
 * Ensure that we do not overwrite the exit status if it has
 * already been set to some non-zero value. If we don't make
 * this check, then different parts of the code could overwrite
 * each other's exit status in the case of abnormal termination.
 *
 * For example, if a process aborts, we would record the initial
 * exit code from the aborted process. However, subsequent processes
 * will have been aborted by signal as we kill the job. We don't want
 * the subsequent processes to overwrite the original exit code so
 * we can tell the user the exit code from the process that caused
 * the whole thing to happen.
 *
 * Usage notes:
 *  - newstatus is parenthesized on every use so that an argument such
 *    as "a & b" is compared and stored as a whole.
 *  - newstatus may be evaluated more than once; do not pass an
 *    expression with side effects.
 *  - the expansion does not end in a semicolon: write
 *    ORTE_UPDATE_EXIT_STATUS(x); so the macro behaves as a single
 *    statement, including inside an unbraced if/else.
 */
#define ORTE_UPDATE_EXIT_STATUS(newstatus)                                  \
    do {                                                                    \
        if (0 == orte_exit_status && 0 != (newstatus)) {                    \
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                                 "%s:%s(%d) updating exit status to %d",    \
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),        \
                                 __FILE__, __LINE__, (newstatus)));         \
            orte_exit_status = (newstatus);                                 \
        }                                                                   \
    } while(0)
|
|
|
|
|
2009-09-09 09:28:45 +04:00
|
|
|
/* sometimes we need to reset the exit status - for example, when we
 * are restarting a failed process
 *
 * Emits a verbose debug message and then unconditionally sets
 * orte_exit_status back to 0. The expansion does not end in a
 * semicolon: write ORTE_RESET_EXIT_STATUS(); so it behaves as a single
 * statement, including inside an unbraced if/else.
 */
#define ORTE_RESET_EXIT_STATUS()                                \
    do {                                                        \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,              \
                             "%s:%s(%d) reseting exit status",  \
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
                             __FILE__, __LINE__));              \
        orte_exit_status = 0;                                   \
    } while(0)
|
|
|
|
|
2008-08-05 19:09:29 +04:00
|
|
|
|
2009-01-12 22:12:58 +03:00
|
|
|
/* define a macro for computing time differences - used for timing tests
 * across the code base
 *
 * Computes (s2, us2) - (s1, us1), where each pair is a seconds value and
 * a microseconds value, and stores the seconds part in r and the
 * microseconds part in ur. When us2 < us1 one second is borrowed
 * (1000000 microseconds) so that ur is not negative.
 *
 * r and ur must be assignable; us1 and us2 are evaluated more than once,
 * so do not pass expressions with side effects. The expansion does not
 * end in a semicolon: write ORTE_COMPUTE_TIME_DIFF(...); so it behaves
 * as a single statement, including inside an unbraced if/else.
 */
#define ORTE_COMPUTE_TIME_DIFF(r, ur, s1, us1, s2, us2)     \
    do {                                                    \
        (r) = (s2) - (s1);                                  \
        if ((us2) >= (us1)) {                               \
            (ur) = (us2) - (us1);                           \
        } else {                                            \
            /* borrow one second from the seconds part */   \
            (r)--;                                          \
            (ur) = 1000000 - (us1) + (us2);                 \
        }                                                   \
    } while(0)
|
|
|
|
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/* global type definitions used by RTE - instanced in orte_globals.c */
|
|
|
|
|
|
|
|
/************
 * Declare these to allow us to use them before fully
 * defining them - resolves potential circular definition
 * (the structures below refer to them through pointers only)
 */
struct orte_proc_t;
struct orte_job_map_t;
/************/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Information about a specific application to be launched in the RTE.
|
|
|
|
*/
|
|
|
|
typedef struct {
|
|
|
|
/** Parent object */
|
|
|
|
opal_object_t super;
|
|
|
|
/** Unique index when multiple apps per job */
|
2010-02-27 20:37:34 +03:00
|
|
|
orte_app_idx_t idx;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** Absolute pathname of argv[0] */
|
|
|
|
char *app;
|
|
|
|
/** Number of copies of this process that are to be launched */
|
|
|
|
orte_std_cntr_t num_procs;
|
|
|
|
/** Standard argv-style array, including a final NULL pointer */
|
|
|
|
char **argv;
|
|
|
|
/** Standard environ-style array, including a final NULL pointer */
|
|
|
|
char **env;
|
|
|
|
/** Current working directory for this app */
|
|
|
|
char *cwd;
|
|
|
|
/** Whether the cwd was set by the user or by the system */
|
|
|
|
bool user_specified_cwd;
|
|
|
|
/* Any hostfile that was specified */
|
|
|
|
char *hostfile;
|
|
|
|
/* Hostfile for adding hosts to an existing allocation */
|
|
|
|
char *add_hostfile;
|
2009-07-14 18:34:11 +04:00
|
|
|
/* Hosts to be added to an existing allocation - analagous to -host */
|
|
|
|
char **add_host;
|
2008-03-06 01:12:27 +03:00
|
|
|
/** argv of hosts passed in to -host */
|
|
|
|
char ** dash_host;
|
|
|
|
/** Prefix directory for this app (or NULL if no override necessary) */
|
2008-02-28 04:57:57 +03:00
|
|
|
char *prefix_dir;
|
2009-02-05 01:37:24 +03:00
|
|
|
/** Preload the binary on the remote machine (in PLM via FileM) */
|
2008-02-28 04:57:57 +03:00
|
|
|
bool preload_binary;
|
2009-06-12 21:53:13 +04:00
|
|
|
/** Preload the libraries on the remote machine (in PLM via FileM) */
|
|
|
|
bool preload_libs;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** Preload the comma separated list of files to the remote machines cwd */
|
|
|
|
char * preload_files;
|
|
|
|
/** Destination directory for the preloaded files
|
|
|
|
* If NULL then the absolute and relative paths are obeyed */
|
2009-02-12 18:56:45 +03:00
|
|
|
char *preload_files_dest_dir;
|
|
|
|
/** Source directory for the preloaded files
|
|
|
|
* If NULL then the absolute and relative paths are obeyed */
|
|
|
|
char *preload_files_src_dir;
|
2009-01-25 15:39:24 +03:00
|
|
|
/* is being used on the local node */
|
|
|
|
bool used_on_node;
|
2008-02-28 04:57:57 +03:00
|
|
|
} orte_app_context_t;
|
|
|
|
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
/** Base object so this can be put on a list */
|
|
|
|
opal_list_item_t super;
|
|
|
|
/* index of this node object in global array */
|
|
|
|
orte_std_cntr_t index;
|
|
|
|
/** String node name */
|
|
|
|
char *name;
|
2009-01-15 21:11:50 +03:00
|
|
|
/* argv-like array of aliases for this node */
|
|
|
|
char **alias;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* daemon on this node */
|
2008-02-28 04:57:57 +03:00
|
|
|
struct orte_proc_t *daemon;
|
|
|
|
/* whether or not this daemon has been launched */
|
|
|
|
bool daemon_launched;
|
|
|
|
/** Launch id - needed by some systems to launch a proc on this node */
|
|
|
|
int32_t launch_id;
|
|
|
|
/** number of procs on this node */
|
|
|
|
orte_vpid_t num_procs;
|
|
|
|
/* array of pointers to procs on this node */
|
2008-02-28 08:32:23 +03:00
|
|
|
opal_pointer_array_t *procs;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* next node rank on this node */
|
2008-09-25 17:39:08 +04:00
|
|
|
orte_node_rank_t next_node_rank;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* whether or not we are oversubscribed */
|
|
|
|
bool oversubscribed;
|
|
|
|
/** State of this node */
|
|
|
|
orte_node_state_t state;
|
|
|
|
/** A "soft" limit on the number of slots available on the node.
|
|
|
|
This will typically correspond to the number of physical CPUs
|
|
|
|
that we have been allocated on this note and would be the
|
|
|
|
"ideal" number of processes for us to launch. */
|
|
|
|
orte_std_cntr_t slots;
|
|
|
|
/** How many processes have already been launched, used by one or
|
|
|
|
more jobs on this node. */
|
|
|
|
orte_std_cntr_t slots_inuse;
|
|
|
|
/** This represents the number of slots we (the allocator) are
|
|
|
|
attempting to allocate to the current job - or the number of
|
|
|
|
slots allocated to a specific job on a query for the jobs
|
|
|
|
allocations */
|
|
|
|
orte_std_cntr_t slots_alloc;
|
|
|
|
/** A "hard" limit (if set -- a value of 0 implies no hard limit)
|
|
|
|
on the number of slots that can be allocated on a given
|
|
|
|
node. This is for some environments (e.g. grid) there may be
|
|
|
|
fixed limits on the number of slots that can be used.
|
|
|
|
|
|
|
|
This value also could have been a boolean - but we may want to
|
|
|
|
allow the hard limit be different than the soft limit - in
|
|
|
|
other words allow the node to be oversubscribed up to a
|
|
|
|
specified limit. For example, if we have two processors, we
|
|
|
|
may want to allow up to four processes but no more. */
|
|
|
|
orte_std_cntr_t slots_max;
|
2009-08-11 06:51:27 +04:00
|
|
|
/* number of physical boards in the node - defaults to 1 */
|
|
|
|
uint8_t boards;
|
|
|
|
/* number of sockets on each board - defaults to 1 */
|
|
|
|
uint8_t sockets_per_board;
|
|
|
|
/* number of cores per socket - defaults to 1 */
|
|
|
|
uint8_t cores_per_socket;
|
|
|
|
/* cpus on this node that are assigned for our use */
|
|
|
|
char *cpu_set;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** Username on this node, if specified */
|
|
|
|
char *username;
|
2009-12-01 02:11:25 +03:00
|
|
|
/* list of known system resources for this node */
|
|
|
|
opal_list_t resources;
|
2008-02-28 04:57:57 +03:00
|
|
|
} orte_node_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
|
|
|
|
|
2008-03-07 00:56:00 +03:00
|
|
|
/* define a set of flags to control the launch of a job
 *
 * orte_job_controls_t is an 8-bit mask; each ORTE_JOB_CONTROL_xxx value
 * below occupies one distinct bit, and together they use all eight.
 */
typedef uint8_t orte_job_controls_t;
#define ORTE_JOB_CONTROL    OPAL_UINT8

#define ORTE_JOB_CONTROL_LOCAL_SLAVE        0x01
#define ORTE_JOB_CONTROL_NON_ORTE_JOB       0x02
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON    0x04
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT     0x08
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR     0x10
#define ORTE_JOB_CONTROL_FORWARD_COMM       0x20
#define ORTE_JOB_CONTROL_CONTINUOUS_OP      0x40
#define ORTE_JOB_CONTROL_RECOVERABLE        0x80
|
2008-03-07 00:56:00 +03:00
|
|
|
|
2009-08-11 06:51:27 +04:00
|
|
|
#define ORTE_MAPPING_POLICY OPAL_UINT16
/* put the rank assignment method in the upper 8 bits
 * (each value below is a single bit in the 0xff00 range) */
#define ORTE_MAPPING_USE_VM         0x0100
#define ORTE_MAPPING_BYNODE         0x0200
#define ORTE_MAPPING_BYSLOT         0x0400
#define ORTE_MAPPING_BYSOCKET       0x0800
#define ORTE_MAPPING_BYBOARD        0x1000
#define ORTE_MAPPING_NO_USE_LOCAL   0x2000
#define ORTE_MAPPING_NPERXXX        0x4000
#define ORTE_MAPPING_BYUSER         0x8000
|
2009-08-11 06:51:27 +04:00
|
|
|
/* nice macro for setting these
 *
 * Replaces the upper 8 bits (the mapping part) of
 * orte_default_mapping_policy with pol while keeping the lower 8 bits
 * unchanged. The expansion is a parenthesized assignment expression with
 * no trailing semicolon, so write ORTE_SET_MAPPING_POLICY(x); at the
 * call site; this keeps it safe inside an unbraced if/else.
 */
#define ORTE_SET_MAPPING_POLICY(pol) \
    (orte_default_mapping_policy = (orte_default_mapping_policy & 0x00ff) | (pol))
|
2009-09-22 22:44:53 +04:00
|
|
|
/* macro to detect if some other policy has been set
 *
 * Sets the mapping policy to pol only when the upper 8 bits of
 * orte_default_mapping_policy contain no bits other than those in pol;
 * otherwise the current policy is left untouched.
 *
 * The scratch variable has a macro-specific name so that it cannot
 * capture an identifier used in the caller's pol expression. pol is
 * evaluated more than once. The expansion does not end in a semicolon:
 * write ORTE_XSET_MAPPING_POLICY(x); at the call site.
 */
#define ORTE_XSET_MAPPING_POLICY(pol)                                       \
    do {                                                                    \
        orte_mapping_policy_t orte_xset_map_tmp;                            \
        orte_xset_map_tmp = (orte_default_mapping_policy & 0xff00) & ~(pol); \
        if (0 == orte_xset_map_tmp) {                                       \
            ORTE_SET_MAPPING_POLICY((pol));                                 \
        }                                                                   \
    } while(0)
|
|
|
|
/* macro to add another mapping policy
 *
 * ORs pol into orte_default_mapping_policy without clearing any bit that
 * is already set. The expansion is a parenthesized expression with no
 * trailing semicolon, so write ORTE_ADD_MAPPING_POLICY(x); at the call
 * site; this keeps it safe inside an unbraced if/else.
 */
#define ORTE_ADD_MAPPING_POLICY(pol) \
    (orte_default_mapping_policy |= (pol))
|
|
|
|
|
|
|
|
/* put the binding policy in the lower 8 bits, using the paffinity values
 *
 * Each definition is fully parenthesized (both the source value and the
 * cast as a whole) so it expands as one operand in any surrounding
 * expression.
 */
#define ORTE_BIND_TO_NONE           ((uint16_t)(OPAL_PAFFINITY_DO_NOT_BIND))
#define ORTE_BIND_TO_CORE           ((uint16_t)(OPAL_PAFFINITY_BIND_TO_CORE))
#define ORTE_BIND_TO_SOCKET         ((uint16_t)(OPAL_PAFFINITY_BIND_TO_SOCKET))
#define ORTE_BIND_TO_BOARD          ((uint16_t)(OPAL_PAFFINITY_BIND_TO_BOARD))
#define ORTE_BIND_IF_SUPPORTED      ((uint16_t)(OPAL_PAFFINITY_BIND_IF_SUPPORTED))
|
2009-08-11 06:51:27 +04:00
|
|
|
/* nice macro for setting these
 *
 * Replaces the lower 8 bits (the binding part) of
 * orte_default_mapping_policy with pol while keeping the upper 8 bits
 * unchanged. The expansion is a parenthesized assignment expression with
 * no trailing semicolon, so write ORTE_SET_BINDING_POLICY(x); at the
 * call site; this keeps it safe inside an unbraced if/else.
 */
#define ORTE_SET_BINDING_POLICY(pol) \
    (orte_default_mapping_policy = (orte_default_mapping_policy & 0xff00) | (pol))
|
2009-08-26 06:01:49 +04:00
|
|
|
/* macro to detect if some other policy has been set
 *
 * Sets the binding policy to pol only when the lower 8 bits of
 * orte_default_mapping_policy contain no bits other than those in pol;
 * otherwise the current policy is left untouched.
 *
 * The scratch variable has a macro-specific name so that it cannot
 * capture an identifier used in the caller's pol expression. pol is
 * evaluated more than once. The expansion does not end in a semicolon:
 * write ORTE_XSET_BINDING_POLICY(x); at the call site.
 */
#define ORTE_XSET_BINDING_POLICY(pol)                                        \
    do {                                                                     \
        orte_mapping_policy_t orte_xset_bind_tmp;                            \
        orte_xset_bind_tmp = (orte_default_mapping_policy & 0x00ff) & ~(pol); \
        if (0 == orte_xset_bind_tmp) {                                       \
            ORTE_SET_BINDING_POLICY((pol));                                  \
        }                                                                    \
    } while(0)
|
2009-09-18 23:48:42 +04:00
|
|
|
/* macro to detect if binding was qualified
 *
 * Evaluates to a non-zero value when the ORTE_BIND_IF_SUPPORTED bit is
 * set in n, and to 0 otherwise.
 */
#define ORTE_BINDING_NOT_REQUIRED(n) \
    (ORTE_BIND_IF_SUPPORTED & (n))
|
2009-08-11 06:51:27 +04:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
typedef struct {
|
|
|
|
/** Base object so this can be put on a list */
|
|
|
|
opal_list_item_t super;
|
|
|
|
/* jobid for this job */
|
|
|
|
orte_jobid_t jobid;
|
|
|
|
/* app_context array for this job */
|
2008-02-28 08:32:23 +03:00
|
|
|
opal_pointer_array_t *apps;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* number of app_contexts in the array */
|
2010-02-27 20:37:34 +03:00
|
|
|
orte_app_idx_t num_apps;
|
2008-03-07 00:56:00 +03:00
|
|
|
/* flags to control the launch of this job - see above
|
|
|
|
* for description of supported flags
|
|
|
|
*/
|
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 04:00:49 +04:00
|
|
|
orte_job_controls_t controls;
|
|
|
|
/* rank desiring stdin - for now, either one rank, all ranks
|
|
|
|
* (wildcard), or none (invalid)
|
|
|
|
*/
|
|
|
|
orte_vpid_t stdin_target;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* total slots allocated to this job */
|
|
|
|
orte_std_cntr_t total_slots_alloc;
|
|
|
|
/* number of procs in this job */
|
|
|
|
orte_vpid_t num_procs;
|
|
|
|
/* array of pointers to procs in this job */
|
2008-02-28 08:32:23 +03:00
|
|
|
opal_pointer_array_t *procs;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* map of the job */
|
2009-08-11 06:51:27 +04:00
|
|
|
struct orte_job_map_t *map;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* bookmark for where we are in mapping - this
|
|
|
|
* indicates the node where we stopped
|
|
|
|
*/
|
|
|
|
orte_node_t *bookmark;
|
|
|
|
/** Whether or not to override oversubscription based on local
|
|
|
|
* hardware - used to indicate uncertainty in number of
|
|
|
|
* actual processors available on this node
|
|
|
|
*/
|
|
|
|
bool oversubscribe_override;
|
|
|
|
/* state of the overall job */
|
|
|
|
orte_job_state_t state;
|
|
|
|
/* number of procs launched */
|
|
|
|
orte_vpid_t num_launched;
|
|
|
|
/* number of procs reporting contact info */
|
|
|
|
orte_vpid_t num_reported;
|
|
|
|
/* number of procs terminated */
|
|
|
|
orte_vpid_t num_terminated;
|
2010-04-23 08:44:41 +04:00
|
|
|
/* number of daemons reported launched so we can track progress */
|
|
|
|
orte_vpid_t num_daemons_reported;
|
|
|
|
/* lock/cond/flag for tracking when all procs reported */
|
|
|
|
opal_mutex_t reported_lock;
|
|
|
|
opal_condition_t reported_cond;
|
|
|
|
bool not_reported;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* did this job abort? */
|
|
|
|
bool abort;
|
|
|
|
/* proc that caused that to happen */
|
|
|
|
struct orte_proc_t *aborted_proc;
|
2010-04-28 08:06:57 +04:00
|
|
|
/* enable recovery of these processes */
|
|
|
|
bool enable_recovery;
|
2010-04-27 02:15:57 +04:00
|
|
|
/* max number of times a process can be restarted locally */
|
|
|
|
int32_t max_local_restarts;
|
|
|
|
/* max number of times a process can be relocated to another node */
|
|
|
|
int32_t max_global_restarts;
|
2010-04-23 08:44:41 +04:00
|
|
|
/* time launch message was sent */
|
|
|
|
struct timeval launch_msg_sent;
|
|
|
|
/* max time for launch msg to be received */
|
|
|
|
struct timeval max_launch_msg_recvd;
|
2010-03-13 02:57:50 +03:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2008-02-28 04:57:57 +03:00
|
|
|
/* ckpt state */
|
|
|
|
size_t ckpt_state;
|
|
|
|
/* snapshot reference */
|
|
|
|
char *ckpt_snapshot_ref;
|
|
|
|
/* snapshot location */
|
|
|
|
char *ckpt_snapshot_loc;
|
|
|
|
#endif
|
|
|
|
} orte_job_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_t);
|
|
|
|
|
|
|
|
struct orte_proc_t {
|
|
|
|
/** Base object so this can be put on a list */
|
|
|
|
opal_list_item_t super;
|
|
|
|
/* process name */
|
|
|
|
orte_process_name_t name;
|
|
|
|
/* pid */
|
|
|
|
pid_t pid;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* local rank amongst my peers on the node
|
|
|
|
* where this is running - this value is
|
|
|
|
* needed by MPI procs so that the lowest
|
|
|
|
* rank on a node can perform certain fns -
|
|
|
|
* e.g., open an sm backing file
|
|
|
|
*/
|
2008-09-25 17:39:08 +04:00
|
|
|
orte_local_rank_t local_rank;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* local rank on the node across all procs
|
|
|
|
* and jobs known to this HNP - this is
|
|
|
|
* needed so that procs can do things like
|
|
|
|
* know which static IP port to use
|
|
|
|
*/
|
2008-09-25 17:39:08 +04:00
|
|
|
orte_node_rank_t node_rank;
|
2010-03-24 00:28:02 +03:00
|
|
|
/* Last state used to trigger the errmgr for this proc */
|
|
|
|
orte_proc_state_t last_errmgr_state;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* process state */
|
|
|
|
orte_proc_state_t state;
|
|
|
|
/* exit code */
|
|
|
|
orte_exit_code_t exit_code;
|
|
|
|
/* the app_context that generated this proc */
|
2010-02-27 20:37:34 +03:00
|
|
|
orte_app_idx_t app_idx;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* a cpu list, if specified by the user */
|
|
|
|
char *slot_list;
|
|
|
|
/* pointer to the node where this proc is executing */
|
|
|
|
orte_node_t *node;
|
|
|
|
/* name of the node where this proc is executing - this
|
|
|
|
* is used simply to pass that info to a calling
|
|
|
|
* tool since it may not have a node array available
|
|
|
|
*/
|
|
|
|
char *nodename;
|
|
|
|
/* RML contact info */
|
|
|
|
char *rml_uri;
|
2010-01-07 04:19:44 +03:00
|
|
|
/* number of times this process has been restarted */
|
|
|
|
int32_t restarts;
|
2010-04-27 02:15:57 +04:00
|
|
|
/* number of times this process has been relocated */
|
|
|
|
int32_t relocates;
|
2010-03-13 02:57:50 +03:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2008-02-28 04:57:57 +03:00
|
|
|
/* ckpt state */
|
|
|
|
size_t ckpt_state;
|
|
|
|
/* snapshot reference */
|
|
|
|
char *ckpt_snapshot_ref;
|
|
|
|
/* snapshot location */
|
|
|
|
char *ckpt_snapshot_loc;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
typedef struct orte_proc_t orte_proc_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_proc_t);
|
|
|
|
|
2009-01-07 17:58:38 +03:00
|
|
|
typedef struct {
|
|
|
|
opal_list_item_t super;
|
|
|
|
char *name;
|
|
|
|
int32_t size;
|
|
|
|
uint8_t *bytes;
|
|
|
|
} orte_attr_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_attr_t);
|
2008-04-30 23:49:53 +04:00
|
|
|
|
|
|
|
typedef struct {
|
Repair the MPI-2 dynamic operations. This includes:
1. repair of the linear and direct routed modules
2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI
3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature
4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes.
Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB
5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept.
Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules.
6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch
7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch
8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies
9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc.
10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon
There is still more cleanup to be done for efficiency, but this at least works.
Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn.
Fixes ticket #1256
This commit was SVN r18804.
2008-07-03 21:53:37 +04:00
|
|
|
/* base object */
|
|
|
|
opal_object_t super;
|
2009-02-06 18:29:33 +03:00
|
|
|
/* index in the array */
|
|
|
|
int index;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* nodename */
|
|
|
|
char *name;
|
Repair the MPI-2 dynamic operations. This includes:
1. repair of the linear and direct routed modules
2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI
3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature
4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes.
Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB
5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept.
Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules.
6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch
7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch
8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies
9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc.
10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon
There is still more cleanup to be done for efficiency, but this at least works.
Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn.
Fixes ticket #1256
This commit was SVN r18804.
2008-07-03 21:53:37 +04:00
|
|
|
/* vpid of this job family's daemon on this node */
|
2008-05-28 22:38:47 +04:00
|
|
|
orte_vpid_t daemon;
|
2010-03-23 23:47:41 +03:00
|
|
|
/* list of interface attributes */
|
2009-01-07 17:58:38 +03:00
|
|
|
opal_list_t attrs;
|
2010-03-23 23:47:41 +03:00
|
|
|
/* list of system info */
|
|
|
|
opal_list_t sysinfo;
|
2010-05-05 04:48:43 +04:00
|
|
|
#if ORTE_ENABLE_HEARTBEAT
|
|
|
|
/* seconds when last heartbeat was detected */
|
|
|
|
double beat;
|
|
|
|
/* number of missed heartbeats */
|
|
|
|
int missed;
|
|
|
|
#endif
|
2008-04-30 23:49:53 +04:00
|
|
|
} orte_nid_t;
|
Repair the MPI-2 dynamic operations. This includes:
1. repair of the linear and direct routed modules
2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI
3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature
4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes.
Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB
5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept.
Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules.
6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch
7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch
8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies
9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc.
10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon
There is still more cleanup to be done for efficiency, but this at least works.
Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn.
Fixes ticket #1256
This commit was SVN r18804.
2008-07-03 21:53:37 +04:00
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t);
|
2008-04-30 23:49:53 +04:00
|
|
|
|
|
|
|
/*
 * orte_pmap_t - process map entry.
 *
 * Holds a node index together with a local rank and a node rank.
 * NOTE(review): presumably one entry exists per proc and entries are
 * stored in the "pmap" pointer array of orte_jmap_t - confirm.
 */
typedef struct {
|
2009-02-06 18:29:33 +03:00
|
|
|
/* base object; orte_pmap_t is declared as an OPAL class via OBJ_CLASS_DECLARATION */
|
|
|
|
opal_object_t super;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* index to node (NOTE(review): presumably an index into orte_nidmap - confirm) */
|
|
|
|
int32_t node;
|
|
|
|
/* local rank of the proc */
|
2008-09-25 17:39:08 +04:00
|
|
|
orte_local_rank_t local_rank;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* node rank of the proc */
|
2008-09-25 17:39:08 +04:00
|
|
|
orte_node_rank_t node_rank;
|
2008-04-30 23:49:53 +04:00
|
|
|
} orte_pmap_t;
|
2009-02-06 18:29:33 +03:00
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_pmap_t);
|
2008-04-30 23:49:53 +04:00
|
|
|
|
Repair the MPI-2 dynamic operations. This includes:
1. repair of the linear and direct routed modules
2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI
3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature
4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes.
Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB
5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept.
Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules.
6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch
7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch
8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies
9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc.
10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon
There is still more cleanup to be done for efficiency, but this at least works.
Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn.
Fixes ticket #1256
This commit was SVN r18804.
2008-07-03 21:53:37 +04:00
|
|
|
/*
 * orte_jmap_t - job map entry.
 *
 * Records a jobid, the number of procs in that job, and a pointer
 * array ("pmap") holding the per-proc data.
 * NOTE(review): the pmap array presumably holds orte_pmap_t objects,
 * one per proc - confirm.
 */
typedef struct {
|
|
|
|
/* base object; orte_jmap_t is declared as an OPAL class via OBJ_CLASS_DECLARATION */
|
|
|
|
opal_object_t super;
|
|
|
|
/* jobid of the job described by this entry */
|
|
|
|
orte_jobid_t job;
|
2008-11-01 00:10:00 +03:00
|
|
|
/* number of procs in this job */
|
|
|
|
orte_vpid_t num_procs;
|
Repair the MPI-2 dynamic operations. This includes:
1. repair of the linear and direct routed modules
2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI
3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature
4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes.
Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB
5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept.
Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules.
6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch
7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch
8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies
9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc.
10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon
There is still more cleanup to be done for efficiency, but this at least works.
Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn.
Fixes ticket #1256
This commit was SVN r18804.
2008-07-03 21:53:37 +04:00
|
|
|
/* array of data for procs */
|
2009-02-25 05:43:22 +03:00
|
|
|
opal_pointer_array_t pmap;
|
Repair the MPI-2 dynamic operations. This includes:
1. repair of the linear and direct routed modules
2. repair of the ompi/pubsub/orte module to correctly init routes to the ompi-server, and correctly handle failure to correctly parse the provided ompi-server URI
3. modification of orterun to accept both "file" and "FILE" for designating where the ompi-server URI is to be found - purely a convenience feature
4. resolution of a message ordering problem during the connect/accept handshake that allowed the "send-first" proc to attempt to send to the "recv-first" proc before the HNP had actually updated its routes.
Let this be a further reminder to all - message ordering is NOT guaranteed in the OOB
5. Repair the ompi/dpm/orte module to correctly init routes during connect/accept.
Reminder to all: messages sent to procs in another job family (i.e., started by a different mpirun) are ALWAYS routed through the respective HNPs. As per the comments in orte/routed, this is REQUIRED to maintain connect/accept (where only the root proc on each side is capable of init'ing the routes), allow communication between mpirun's using different routing modules, and to minimize connections on tools such as ompi-server. It is all taken care of "under the covers" by the OOB to ensure that a route back to the sender is maintained, even when the different mpirun's are using different routed modules.
6. corrections in the orte/odls to ensure proper identification of daemons participating in a dynamic launch
7. corrections in build/nidmap to support update of an existing nidmap during dynamic launch
8. corrected implementation of the update_arch function in the ESS, along with consolidation of a number of ESS operations into base functions for easier maintenance. The ability to support info from multiple jobs was added, although we don't currently do so - this will come later to support further fault recovery strategies
9. minor updates to several functions to remove unnecessary and/or no longer used variables and envar's, add some debugging output, etc.
10. addition of a new macro ORTE_PROC_IS_DAEMON that resolves to true if the provided proc is a daemon
There is still more cleanup to be done for efficiency, but this at least works.
Tested on single-node Mac, multi-node SLURM via odin. Tests included connect/accept, publish/lookup/unpublish, comm_spawn, comm_spawn_multiple, and singleton comm_spawn.
Fixes ticket #1256
This commit was SVN r18804.
2008-07-03 21:53:37 +04:00
|
|
|
} orte_jmap_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_jmap_t);
|
|
|
|
|
2009-06-17 06:54:20 +04:00
|
|
|
/*
 * orte_regex_node_t - list item carrying a string prefix and six
 * opal_value_array_t members (suffix, nodes, cnt, starting_vpid, ppn,
 * nrank).
 *
 * NOTE(review): the field meanings are not documented here. The names
 * suggest a node-name prefix plus per-range arrays of name suffixes,
 * node ids, counts, starting vpids, procs-per-node and node ranks used
 * for regular-expression node descriptions - confirm against the code
 * that fills this structure in.
 */
typedef struct {
|
|
|
|
/* list object - lets instances be placed on an opal_list_t */
|
|
|
|
opal_list_item_t super;
|
|
|
|
char *prefix;
|
2009-06-18 08:36:00 +04:00
|
|
|
opal_value_array_t suffix;
|
2009-06-17 06:54:20 +04:00
|
|
|
opal_value_array_t nodes;
|
|
|
|
opal_value_array_t cnt;
|
|
|
|
opal_value_array_t starting_vpid;
|
|
|
|
opal_value_array_t ppn;
|
|
|
|
opal_value_array_t nrank;
|
|
|
|
} orte_regex_node_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_regex_node_t);
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/**
|
|
|
|
* Get a job data object
|
|
|
|
* We cannot just reference a job data object with its jobid as
|
|
|
|
* the jobid is no longer an index into the array. This change
|
|
|
|
* was necessitated by modification of the jobid to include
|
|
|
|
* an mpirun-unique qualifier to eliminate any global name
|
|
|
|
* service
|
|
|
|
*/
|
|
|
|
ORTE_DECLSPEC orte_job_t* orte_get_job_data_object(orte_jobid_t job);
|
|
|
|
|
|
|
|
/* global variables used by RTE - instanced in orte_globals.c */
|
2008-11-01 00:10:00 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_timing;
|
2009-01-08 17:25:56 +03:00
|
|
|
ORTE_DECLSPEC extern FILE *orte_timing_output;
|
2009-01-12 22:12:58 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_timing_details;
|
2009-08-20 15:12:45 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_debug_daemons_flag;
|
|
|
|
ORTE_DECLSPEC extern bool orte_debug_daemons_file_flag;
|
2008-08-14 22:59:01 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_leave_session_attached;
|
2008-04-17 17:50:59 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_do_not_launch;
|
2008-02-28 04:57:57 +03:00
|
|
|
ORTE_DECLSPEC extern bool orted_spin_flag;
|
2009-09-22 06:16:40 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_daemon_bootstrap;
|
2009-12-01 02:11:25 +03:00
|
|
|
ORTE_DECLSPEC extern char *orte_local_cpu_model;
|
2009-08-21 22:03:34 +04:00
|
|
|
|
|
|
|
/* ORTE OOB port flags */
|
2008-03-28 05:20:37 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_static_ports;
|
2009-08-21 22:03:34 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_oob_static_ports;
|
2009-08-22 06:58:20 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_standalone_operation;
|
2009-08-21 22:03:34 +04:00
|
|
|
|
2008-04-02 00:32:17 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
|
2008-11-24 22:57:08 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
|
2008-05-29 17:38:27 +04:00
|
|
|
ORTE_DECLSPEC extern int orted_debug_failure;
|
2008-06-03 01:46:34 +04:00
|
|
|
ORTE_DECLSPEC extern int orted_debug_failure_delay;
|
2008-06-24 21:50:56 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_homogeneous_nodes;
|
|
|
|
ORTE_DECLSPEC extern bool orte_hetero_apps;
|
2008-08-19 19:19:30 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_never_launched;
|
2008-09-23 19:46:34 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_devel_level_output;
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
ORTE_DECLSPEC extern char **orte_launch_environ;
|
2008-04-14 22:26:08 +04:00
|
|
|
|
2008-07-25 21:13:22 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
|
2008-08-04 18:25:19 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_allocation_required;
|
2008-07-25 21:13:22 +04:00
|
|
|
|
Per the July technical meeting:
Standardize the handling of the orte launch agent option across PLMs. This has been a consistent complaint I have received - each PLM would register its own MCA param to get input on the launch agent for remote nodes (in fact, one or two didn't, but most did). This would then get handled in various and contradictory ways.
Some PLMs would accept only a one-word input. Others accepted multi-word args such as "valgrind orted", but then some would error by putting any prefix specified on the cmd line in front of the incorrect argument.
For example, while using the rsh launcher, if you specified "valgrind orted" as your launch agent and had "--prefix foo" on your cmd line, you would attempt to execute "ssh foo/valgrind orted" - which obviously wouldn't work.
This was all -very- confusing to users, who had to know which PLM was being used so they could even set the right mca param in the first place! And since we don't warn about non-recognized or non-used mca params, half of the time they would wind up not doing what they thought they were telling us to do.
To solve this problem, we did the following:
1. removed all mca params from the individual plms for the launch agent
2. added a new mca param "orte_launch_agent" for this purpose. To further simplify for users, this comes with a new cmd line option "--launch-agent" that can take a multi-word string argument. The value of the param defaults to "orted".
3. added a PLM base function that processes the orte_launch_agent value and adds the contents to a provided argv array. This can subsequently be harvested at-will to handle multi-word values
4. modified the PLMs to use this new function. All the PLMs except for the rsh PLM required very minor change - just called the function and moved on. The rsh PLM required much larger changes as - because of the rsh/ssh cmd line limitations - we had to correctly prepend any provided prefix to the correct argv entry.
5. added a new opal_argv_join_range function that allows the caller to "join" argv entries between two specified indices
Please let me know of any problems. I tried to make this as clean as possible, but cannot compile all PLMs to ensure all is correct.
This commit was SVN r19097.
2008-07-30 22:26:24 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_launch_agent;
|
2008-02-28 04:57:57 +03:00
|
|
|
ORTE_DECLSPEC extern char **orted_cmd_line;
|
2008-08-05 19:09:29 +04:00
|
|
|
|
2010-02-27 11:32:25 +03:00
|
|
|
/* debugger flags */
|
2008-08-13 21:47:24 +04:00
|
|
|
ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
|
|
|
|
ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running;
|
|
|
|
ORTE_DECLSPEC extern int orte_debugger_check_rate;
|
2010-02-27 11:32:25 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_output_debugger_proctable;
|
2008-08-13 21:47:24 +04:00
|
|
|
|
2008-08-05 19:09:29 +04:00
|
|
|
/* exit triggers and flags */
|
2009-08-20 15:12:45 +04:00
|
|
|
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit;
|
|
|
|
ORTE_DECLSPEC extern orte_trigger_event_t orteds_exit;
|
2008-02-28 04:57:57 +03:00
|
|
|
ORTE_DECLSPEC extern int orte_exit_status;
|
|
|
|
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
|
2008-11-01 00:10:00 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_routing_is_enabled;
|
2009-02-27 13:16:25 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_job_term_ordered;
|
2008-06-03 01:46:34 +04:00
|
|
|
|
|
|
|
ORTE_DECLSPEC extern int orte_startup_timeout;
|
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 20:58:59 +03:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;
|
|
|
|
ORTE_DECLSPEC extern float orte_max_timeout;
|
|
|
|
|
2008-05-01 23:19:34 +04:00
|
|
|
ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/* global arrays for data storage */
|
2008-02-28 08:32:23 +03:00
|
|
|
ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data;
|
|
|
|
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
|
2008-02-28 04:57:57 +03:00
|
|
|
|
2008-12-08 22:00:36 +03:00
|
|
|
/* a clean output channel without prefix */
|
|
|
|
ORTE_DECLSPEC extern int orte_clean_output;
|
|
|
|
|
2009-01-07 17:58:38 +03:00
|
|
|
/* RHC: FLAG TO SELECT WHETHER OR NOT TO SEND PROFILE FILE IN NIDMAP */
|
|
|
|
ORTE_DECLSPEC extern bool orte_send_profile;
|
|
|
|
|
|
|
|
/* Nidmap and job maps */
|
|
|
|
ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap;
|
|
|
|
ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap;
|
2009-06-24 00:25:38 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_use_regexp;
|
|
|
|
ORTE_DECLSPEC extern char *orted_launch_cmd;
|
2009-01-07 17:58:38 +03:00
|
|
|
|
2009-01-08 17:25:56 +03:00
|
|
|
/* list of local children on a daemon */
|
|
|
|
ORTE_DECLSPEC extern opal_list_t orte_local_children;
|
2010-04-23 08:44:41 +04:00
|
|
|
ORTE_DECLSPEC extern opal_mutex_t orte_local_children_lock;
|
|
|
|
ORTE_DECLSPEC extern opal_condition_t orte_local_children_cond;
|
|
|
|
|
2009-01-08 17:25:56 +03:00
|
|
|
/* list of job data for local children on a daemon */
|
|
|
|
ORTE_DECLSPEC extern opal_list_t orte_local_jobdata;
|
2010-04-23 08:44:41 +04:00
|
|
|
ORTE_DECLSPEC extern opal_mutex_t orte_local_jobdata_lock;
|
|
|
|
ORTE_DECLSPEC extern opal_condition_t orte_local_jobdata_cond;
|
|
|
|
|
2009-01-30 21:50:10 +03:00
|
|
|
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
|
|
|
ORTE_DECLSPEC extern bool orte_forward_job_control;
|
2009-01-08 17:25:56 +03:00
|
|
|
|
2009-01-31 01:47:30 +03:00
|
|
|
/* IOF controls */
|
|
|
|
ORTE_DECLSPEC extern bool orte_tag_output;
|
|
|
|
ORTE_DECLSPEC extern bool orte_timestamp_output;
|
|
|
|
ORTE_DECLSPEC extern char *orte_output_filename;
|
|
|
|
/* generate new xterm windows to display output from specified ranks */
|
|
|
|
ORTE_DECLSPEC extern char *orte_xterm;
|
2009-01-07 17:58:38 +03:00
|
|
|
|
2009-02-09 23:44:44 +03:00
|
|
|
/* rsh support */
|
|
|
|
ORTE_DECLSPEC extern char *orte_rsh_agent;
|
2009-05-30 05:10:25 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_assume_same_shell;
|
2009-02-09 23:44:44 +03:00
|
|
|
|
2009-05-11 18:03:07 +04:00
|
|
|
/* whether or not to barrier the orteds upon exit */
|
|
|
|
ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
|
|
|
|
|
2009-06-03 03:52:02 +04:00
|
|
|
/* whether or not to report launch progress */
|
|
|
|
ORTE_DECLSPEC extern bool orte_report_launch_progress;
|
|
|
|
|
2009-08-11 06:51:27 +04:00
|
|
|
/* cluster hardware info */
|
|
|
|
ORTE_DECLSPEC extern uint8_t orte_default_num_boards;
|
|
|
|
ORTE_DECLSPEC extern uint8_t orte_default_num_sockets_per_board;
|
|
|
|
ORTE_DECLSPEC extern uint8_t orte_default_num_cores_per_socket;
|
|
|
|
|
|
|
|
/* allocation specification */
|
|
|
|
ORTE_DECLSPEC extern char *orte_default_cpu_set;
|
2009-08-13 20:08:43 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_default_hostfile;
|
|
|
|
ORTE_DECLSPEC extern char *orte_rankfile;
|
2010-02-24 11:50:03 +03:00
|
|
|
#ifdef __WINDOWS__
|
2010-02-23 22:42:51 +03:00
|
|
|
ORTE_DECLSPEC extern char *orte_ccp_headnode;
|
2010-02-24 11:50:03 +03:00
|
|
|
#endif
|
2009-08-11 06:51:27 +04:00
|
|
|
|
|
|
|
/* default rank assignment and binding policy */
|
|
|
|
ORTE_DECLSPEC extern orte_mapping_policy_t orte_default_mapping_policy;
|
|
|
|
|
2009-09-09 09:28:45 +04:00
|
|
|
/* tool communication controls */
|
|
|
|
ORTE_DECLSPEC extern bool orte_report_events;
|
|
|
|
ORTE_DECLSPEC extern char *orte_report_events_uri;
|
|
|
|
|
2009-09-28 07:17:15 +04:00
|
|
|
/* report bindings */
|
|
|
|
ORTE_DECLSPEC extern bool orte_report_bindings;
|
|
|
|
|
2010-01-14 20:59:42 +03:00
|
|
|
/* barrier control */
|
|
|
|
ORTE_DECLSPEC extern bool orte_do_not_barrier;
|
|
|
|
|
2010-04-28 08:06:57 +04:00
|
|
|
/* process recovery */
|
|
|
|
ORTE_DECLSPEC extern bool orte_enable_recovery;
|
|
|
|
ORTE_DECLSPEC extern int32_t orte_max_global_restarts;
|
|
|
|
ORTE_DECLSPEC extern int32_t orte_max_local_restarts;
|
|
|
|
|
2010-04-23 08:44:41 +04:00
|
|
|
/* comm interface */
|
|
|
|
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);
|
|
|
|
|
|
|
|
typedef int (*orte_default_comm_fn_t)(orte_process_name_t *recipient,
|
|
|
|
opal_buffer_t *buf,
|
|
|
|
orte_rml_tag_t tag,
|
|
|
|
orte_default_cbfunc_t cbfunc);
|
|
|
|
/* comm fn for updating state */
|
|
|
|
ORTE_DECLSPEC extern orte_default_comm_fn_t orte_comm;
|
|
|
|
ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient,
|
|
|
|
opal_buffer_t *buf, orte_rml_tag_t tag,
|
|
|
|
orte_default_cbfunc_t cbfunc);
|
|
|
|
|
|
|
|
|
2008-06-18 07:15:56 +04:00
|
|
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
END_C_DECLS
|
|
|
|
|
|
|
|
#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */
|