2008-02-28 04:57:57 +03:00
|
|
|
/*
|
2010-03-13 02:57:50 +03:00
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
2008-02-28 04:57:57 +03:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2011-06-24 00:38:02 +04:00
|
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
2008-02-28 04:57:57 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2010-08-09 23:28:56 +04:00
|
|
|
* Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved.
|
Per RFC, bring in the following changes:
* Remove paffinity, maffinity, and carto frameworks -- they've been
wholly replaced by hwloc.
* Move ompi_mpi_init() affinity-setting/checking code down to ORTE.
* Update sm, smcuda, wv, and openib components to no longer use carto.
Instead, use hwloc data. There are still optimizations possible in
the sm/smcuda BTLs (i.e., making multiple mpools). Also, the old
carto-based code found out how many NUMA nodes were ''available''
-- not how many were used ''in this job''. The new hwloc-using
code computes the same value -- it was not updated to calculate how
many NUMA nodes are used ''by this job.''
* Note that I cannot compile the smcuda and wv BTLs -- I ''think''
they're right, but they need to be verified by their owners.
* The openib component now does a bunch of stuff to figure out where
"near" OpenFabrics devices are. '''THIS IS A CHANGE IN DEFAULT
BEHAVIOR!!''' and still needs to be verified by OpenFabrics vendors
(I do not have a NUMA machine with an OpenFabrics device that is a
non-uniform distance from multiple different NUMA nodes).
* Completely rewrite the OMPI_Affinity_str() routine from the
"affinity" mpiext extension. This extension now understands
hyperthreads; the output format of it has changed a bit to reflect
this new information.
* Bunches of minor changes around the code base to update names/types
from maffinity/paffinity-based names to hwloc-based names.
* Add some helper functions into the hwloc base, mainly having to do
with the fact that we have the hwloc data reporting ''all''
topology information, but sometimes you really only want the
(online | available) data.
This commit was SVN r26391.
2012-05-07 18:52:54 +04:00
|
|
|
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
|
2013-01-18 09:00:05 +04:00
|
|
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
2012-04-06 18:23:13 +04:00
|
|
|
* All rights reserved.
|
2008-02-28 04:57:57 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*
|
|
|
|
* Global params for OpenRTE
|
|
|
|
*/
|
|
|
|
#ifndef ORTE_RUNTIME_ORTE_GLOBALS_H
|
|
|
|
#define ORTE_RUNTIME_ORTE_GLOBALS_H
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/types.h"
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#ifdef HAVE_SYS_TIME_H
|
|
|
|
#include <sys/time.h>
|
|
|
|
#endif
|
|
|
|
|
2008-02-28 08:32:23 +03:00
|
|
|
#include "opal/class/opal_pointer_array.h"
|
2009-06-17 06:54:20 +04:00
|
|
|
#include "opal/class/opal_value_array.h"
|
2011-06-30 07:12:38 +04:00
|
|
|
#include "opal/class/opal_ring_buffer.h"
|
2010-04-23 08:44:41 +04:00
|
|
|
#include "opal/threads/threads.h"
|
2012-04-06 18:23:13 +04:00
|
|
|
#include "opal/mca/event/event.h"
|
2011-09-11 23:02:24 +04:00
|
|
|
#include "opal/mca/hwloc/hwloc.h"
|
Per RFC, bring in the following changes:
* Remove paffinity, maffinity, and carto frameworks -- they've been
wholly replaced by hwloc.
* Move ompi_mpi_init() affinity-setting/checking code down to ORTE.
* Update sm, smcuda, wv, and openib components to no longer use carto.
Instead, use hwloc data. There are still optimizations possible in
the sm/smcuda BTLs (i.e., making multiple mpools). Also, the old
carto-based code found out how many NUMA nodes were ''available''
-- not how many were used ''in this job''. The new hwloc-using
code computes the same value -- it was not updated to calculate how
many NUMA nodes are used ''by this job.''
* Note that I cannot compile the smcuda and wv BTLs -- I ''think''
they're right, but they need to be verified by their owners.
* The openib component now does a bunch of stuff to figure out where
"near" OpenFabrics devices are. '''THIS IS A CHANGE IN DEFAULT
BEHAVIOR!!''' and still needs to be verified by OpenFabrics vendors
(I do not have a NUMA machine with an OpenFabrics device that is a
non-uniform distance from multiple different NUMA nodes).
* Completely rewrite the OMPI_Affinity_str() routine from the
"affinity" mpiext extension. This extension now understands
hyperthreads; the output format of it has changed a bit to reflect
this new information.
* Bunches of minor changes around the code base to update names/types
from maffinity/paffinity-based names to hwloc-based names.
* Add some helper functions into the hwloc base, mainly having to do
with the fact that we have the hwloc data reporting ''all''
topology information, but sometimes you really only want the
(online | available) data.
This commit was SVN r26391.
2012-05-07 18:52:54 +04:00
|
|
|
#include "opal/mca/hwloc/base/base.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
#include "orte/mca/plm/plm_types.h"
|
2010-04-23 08:44:41 +04:00
|
|
|
#include "orte/mca/rml/rml_types.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/util/proc_info.h"
|
2009-03-18 00:34:30 +03:00
|
|
|
#include "orte/util/name_fns.h"
|
2012-04-06 18:23:13 +04:00
|
|
|
#include "orte/util/error_strings.h"
|
2008-06-18 07:15:56 +04:00
|
|
|
#include "orte/runtime/runtime.h"
|
2008-08-05 19:09:29 +04:00
|
|
|
#include "orte/runtime/orte_wait.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
|
2008-11-24 22:57:08 +03:00
|
|
|
ORTE_DECLSPEC extern int orte_debug_verbosity; /* instantiated in orte/runtime/orte_init.c */
|
|
|
|
ORTE_DECLSPEC extern char *orte_prohibited_session_dirs; /* instantiated in orte/runtime/orte_init.c */
|
|
|
|
ORTE_DECLSPEC extern bool orte_xml_output; /* instantiated in orte/runtime/orte_globals.c */
|
2009-09-02 22:03:10 +04:00
|
|
|
ORTE_DECLSPEC extern FILE *orte_xml_fp; /* instantiated in orte/runtime/orte_globals.c */
|
2008-11-24 22:57:08 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_help_want_aggregate; /* instantiated in orte/util/show_help.c */
|
2009-07-15 23:43:26 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_job_ident; /* instantiated in orte/runtime/orte_globals.c */
|
2010-03-02 18:18:33 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_create_session_dirs; /* instantiated in orte/runtime/orte_init.c */
|
2010-04-02 18:19:38 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_execute_quiet; /* instantiated in orte/runtime/orte_globals.c */
|
2010-10-16 07:29:47 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_report_silent_errors; /* instantiated in orte/runtime/orte_globals.c */
|
2012-04-06 18:23:13 +04:00
|
|
|
ORTE_DECLSPEC extern opal_event_base_t *orte_event_base; /* instantiated in orte/runtime/orte_init.c */
|
Per RFC, bring in the following changes:
* Remove paffinity, maffinity, and carto frameworks -- they've been
wholly replaced by hwloc.
* Move ompi_mpi_init() affinity-setting/checking code down to ORTE.
* Update sm, smcuda, wv, and openib components to no longer use carto.
Instead, use hwloc data. There are still optimizations possible in
the sm/smcuda BTLs (i.e., making multiple mpools). Also, the old
carto-based code found out how many NUMA nodes were ''available''
-- not how many were used ''in this job''. The new hwloc-using
code computes the same value -- it was not updated to calculate how
many NUMA nodes are used ''by this job.''
* Note that I cannot compile the smcuda and wv BTLs -- I ''think''
they're right, but they need to be verified by their owners.
* The openib component now does a bunch of stuff to figure out where
"near" OpenFabrics devices are. '''THIS IS A CHANGE IN DEFAULT
BEHAVIOR!!''' and still needs to be verified by OpenFabrics vendors
(I do not have a NUMA machine with an OpenFabrics device that is a
non-uniform distance from multiple different NUMA nodes).
* Completely rewrite the OMPI_Affinity_str() routine from the
"affinity" mpiext extension. This extension now understands
hyperthreads; the output format of it has changed a bit to reflect
this new information.
* Bunches of minor changes around the code base to update names/types
from maffinity/paffinity-based names to hwloc-based names.
* Add some helper functions into the hwloc base, mainly having to do
with the fact that we have the hwloc data reporting ''all''
topology information, but sometimes you really only want the
(online | available) data.
This commit was SVN r26391.
2012-05-07 18:52:54 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_event_base_active;  /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern bool orte_proc_is_bound;  /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern int orte_progress_thread_debug;  /* instantiated in orte/runtime/orte_init.c */
|
|
|
|
|
|
|
|
|
Per RFC, bring in the following changes:
* Remove paffinity, maffinity, and carto frameworks -- they've been
wholly replaced by hwloc.
* Move ompi_mpi_init() affinity-setting/checking code down to ORTE.
* Update sm, smcuda, wv, and openib components to no longer use carto.
Instead, use hwloc data. There are still optimizations possible in
the sm/smcuda BTLs (i.e., making multiple mpools). Also, the old
carto-based code found out how many NUMA nodes were ''available''
-- not how many were used ''in this job''. The new hwloc-using
code computes the same value -- it was not updated to calculate how
many NUMA nodes are used ''by this job.''
* Note that I cannot compile the smcuda and wv BTLs -- I ''think''
they're right, but they need to be verified by their owners.
* The openib component now does a bunch of stuff to figure out where
"near" OpenFabrics devices are. '''THIS IS A CHANGE IN DEFAULT
BEHAVIOR!!''' and still needs to be verified by OpenFabrics vendors
(I do not have a NUMA machine with an OpenFabrics device that is a
non-uniform distance from multiple different NUMA nodes).
* Completely rewrite the OMPI_Affinity_str() routine from the
"affinity" mpiext extension. This extension now understands
hyperthreads; the output format of it has changed a bit to reflect
this new information.
* Bunches of minor changes around the code base to update names/types
from maffinity/paffinity-based names to hwloc-based names.
* Add some helper functions into the hwloc base, mainly having to do
with the fact that we have the hwloc data reporting ''all''
topology information, but sometimes you really only want the
(online | available) data.
This commit was SVN r26391.
2012-05-07 18:52:54 +04:00
|
|
|
#if OPAL_HAVE_HWLOC
/**
 * Global indicating where this process was bound to at launch (will
 * be NULL if !orte_proc_is_bound)
 *
 * NOTE(review): this is the only declaration in this part of the header
 * that uses OPAL_DECLSPEC instead of ORTE_DECLSPEC even though the
 * variable is instantiated under orte/ - confirm that is intended.
 */
OPAL_DECLSPEC extern hwloc_cpuset_t orte_proc_applied_binding;  /* instantiated in orte/runtime/orte_init.c */
#endif
|
|
|
|
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
/* Shortcut for some commonly used names */
|
|
|
|
#define ORTE_NAME_WILDCARD (&orte_name_wildcard)
|
2009-04-29 06:13:14 +04:00
|
|
|
ORTE_DECLSPEC extern orte_process_name_t orte_name_wildcard; /** instantiated in orte/runtime/orte_init.c */
|
2008-09-01 21:15:01 +04:00
|
|
|
#define ORTE_NAME_INVALID (&orte_name_invalid)
|
2009-04-29 06:13:14 +04:00
|
|
|
ORTE_DECLSPEC extern orte_process_name_t orte_name_invalid; /** instantiated in orte/runtime/orte_init.c */
|
2008-09-01 21:15:01 +04:00
|
|
|
|
2009-03-06 00:56:03 +03:00
|
|
|
#define ORTE_PROC_MY_NAME (&orte_process_info.my_name)
|
2008-09-01 21:15:01 +04:00
|
|
|
|
2011-06-24 00:38:02 +04:00
|
|
|
/* define a special name that point to my parent (aka the process that spawned me) */
|
|
|
|
#define ORTE_PROC_MY_PARENT (&orte_process_info.my_parent)
|
|
|
|
|
2008-09-01 21:15:01 +04:00
|
|
|
/* define a special name that belongs to orterun */
|
2009-03-06 00:56:03 +03:00
|
|
|
#define ORTE_PROC_MY_HNP (&orte_process_info.my_hnp)
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
/* define the name of my daemon */
|
2009-03-06 00:56:03 +03:00
|
|
|
#define ORTE_PROC_MY_DAEMON (&orte_process_info.my_daemon)
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
|
|
|
|
|
2010-01-07 21:14:03 +03:00
|
|
|
/* error manager callback function */
|
|
|
|
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
|
|
|
|
|
2010-08-18 01:51:38 +04:00
|
|
|
ORTE_DECLSPEC extern int orte_exit_status;
|
|
|
|
|
2012-04-11 01:50:01 +04:00
|
|
|
/* ORTE event priorities - we define these
 * at levels that permit higher layers such as
 * OMPI to handle their events at higher priority,
 * with the exception of errors. Errors generally
 * require exception handling (e.g., ctrl-c termination)
 * that overrides the need to process MPI messages
 *
 * Each ORTE priority is a plain alias for the OPAL event priority
 * named on the same line.
 */
#define ORTE_ERROR_PRI  OPAL_EV_ERROR_PRI
#define ORTE_MSG_PRI    OPAL_EV_MSG_LO_PRI
#define ORTE_SYS_PRI    OPAL_EV_SYS_LO_PRI
#define ORTE_INFO_PRI   OPAL_EV_INFO_LO_PRI
|
|
|
|
|
|
|
|
/* State Machine lists */
|
|
|
|
ORTE_DECLSPEC extern opal_list_t orte_job_states;
|
|
|
|
ORTE_DECLSPEC extern opal_list_t orte_proc_states;
|
|
|
|
|
|
|
|
/* a clean output channel without prefix */
|
|
|
|
ORTE_DECLSPEC extern int orte_clean_output;
|
|
|
|
|
2008-09-01 21:15:01 +04:00
|
|
|
#if ORTE_DISABLE_FULL_SUPPORT
|
|
|
|
|
|
|
|
/* These types are used in interface functions that should never be
   used or implemented in the non-full interface, but need to be
   declared for various reasons.  So have a dummy type to keep things
   simple (and throw an error if someone does try to use them).

   Only the struct tags are declared: the types stay incomplete, so
   pointers to them can be passed around but nothing can be
   instantiated or dereferenced. */
struct orte_job_t;
struct orte_proc_t;
struct orte_node_t;
struct orte_app_context_t;

typedef struct orte_job_t orte_job_t;
typedef struct orte_proc_t orte_proc_t;
typedef struct orte_node_t orte_node_t;
typedef struct orte_app_context_t orte_app_context_t;
|
2008-09-01 21:15:01 +04:00
|
|
|
|
|
|
|
#else
|
|
|
|
|
2012-05-20 19:14:43 +04:00
|
|
|
#if ORTE_ENABLE_PROGRESS_THREADS
ORTE_DECLSPEC extern opal_thread_t orte_progress_thread;
#endif

/* Sizing constants for the global arrays.
 * NOTE(review): INT_MAX comes from <limits.h>, which is not among the
 * headers this file includes directly - presumably it arrives
 * transitively; confirm before using ORTE_GLOBAL_ARRAY_MAX_SIZE in a
 * new translation unit.
 */
#define ORTE_GLOBAL_ARRAY_BLOCK_SIZE    64
#define ORTE_GLOBAL_ARRAY_MAX_SIZE      INT_MAX

/* define a default error return code for ORTE */
#define ORTE_ERROR_DEFAULT_EXIT_CODE    1
|
2008-02-28 04:57:57 +03:00
|
|
|
|
2008-08-05 19:09:29 +04:00
|
|
|
/**
 * Define a macro for updating the orte_exit_status
 * The macro provides a convenient way of doing this
 * so that we can add thread locking at some point
 * since the orte_exit_status is a global variable.
 *
 * Ensure that we do not overwrite the exit status if it has
 * already been set to some non-zero value. If we don't make
 * this check, then different parts of the code could overwrite
 * each other's exit status in the case of abnormal termination.
 *
 * For example, if a process aborts, we would record the initial
 * exit code from the aborted process. However, subsequent processes
 * will have been aborted by signal as we kill the job. We don't want
 * the subsequent processes to overwrite the original exit code so
 * we can tell the user the exit code from the process that caused
 * the whole thing to happen.
 *
 * @param newstatus  Any integer expression.  It is not evaluated at
 *                   all when orte_exit_status is already non-zero, and
 *                   is evaluated exactly once otherwise; the value is
 *                   captured in a local int so that expressions such as
 *                   "a & b" are compared and stored as a whole.
 *
 * NOTE: the expansion ends in a semicolon, so
 * "if (x) ORTE_UPDATE_EXIT_STATUS(y); else ..." will not compile
 * unless the call is wrapped in braces.
 */
#define ORTE_UPDATE_EXIT_STATUS(newstatus)                                  \
    do {                                                                    \
        if (0 == orte_exit_status) {                                        \
            const int orte_ues_status_ = (newstatus);                       \
            if (0 != orte_ues_status_) {                                    \
                OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                  \
                                     "%s:%s(%d) updating exit status to %d", \
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),    \
                                     __FILE__, __LINE__, orte_ues_status_)); \
                orte_exit_status = orte_ues_status_;                        \
            }                                                               \
        }                                                                   \
    } while(0);
|
|
|
|
|
2009-09-09 09:28:45 +04:00
|
|
|
/* sometimes we need to reset the exit status - for example, when we
 * are restarting a failed process
 *
 * Emits a level-1 verbose message and then unconditionally sets
 * orte_exit_status back to 0.
 *
 * NOTE: the expansion ends in a semicolon, so the macro cannot be the
 * sole unbraced statement of an "if" that has an "else".
 */
#define ORTE_RESET_EXIT_STATUS()                                        \
    do {                                                                \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                            "%s:%s(%d) resetting exit status",          \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),         \
                            __FILE__, __LINE__));                       \
        orte_exit_status = 0;                                           \
    } while(0);
|
|
|
|
|
2008-08-05 19:09:29 +04:00
|
|
|
|
2009-01-12 22:12:58 +03:00
|
|
|
/* define a macro for computing time differences - used for timing tests
 * across the code base
 *
 * Computes (r, ur) = (s2, us2) - (s1, us1), where the second member of
 * each pair counts millionths of the first (i.e. seconds and
 * microseconds):
 *
 *   r, ur   - lvalues that receive the whole and fractional difference
 *   s1, us1 - time being subtracted
 *   s2, us2 - time being subtracted from
 *
 * When us2 < us1 one unit is borrowed from r, so ur stays non-negative
 * provided us1 and us2 are themselves within [0, 1000000).
 *
 * NOTE: us1, us2 and r may be evaluated more than once, so pass plain
 * variables rather than expressions with side effects.  The expansion
 * also ends in a semicolon.
 */
#define ORTE_COMPUTE_TIME_DIFF(r, ur, s1, us1, s2, us2)     \
    do {                                                    \
        (r) = (s2) - (s1);                                  \
        if ((us2) >= (us1)) {                               \
            (ur) = (us2) - (us1);                           \
        } else {                                            \
            (r)--;                                          \
            (ur) = 1000000 - (us1) + (us2);                 \
        }                                                   \
    } while(0);
|
|
|
|
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admins of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
/* define a set of flags to control the launch of a job
 *
 * Every flag is a single, distinct bit so that any combination can be
 * OR'd together in an orte_job_controls_t and tested independently.
 * Together the flags use bits 0x0002 through 0x8000 of the 16-bit
 * type; bit 0x0001 is not assigned.
 */
typedef uint16_t orte_job_controls_t;
#define ORTE_JOB_CONTROL    OPAL_UINT16

#define ORTE_JOB_CONTROL_NON_ORTE_JOB       0x0002
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON    0x0004
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT     0x0008
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR     0x0010
#define ORTE_JOB_CONTROL_FORWARD_COMM       0x0020
#define ORTE_JOB_CONTROL_CONTINUOUS_OP      0x0040
#define ORTE_JOB_CONTROL_RECOVERABLE        0x0080
#define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG     0x0100
#define ORTE_JOB_CONTROL_RESTART            0x0200
#define ORTE_JOB_CONTROL_PROCS_MIGRATING    0x0400
#define ORTE_JOB_CONTROL_MAPPER             0x0800
#define ORTE_JOB_CONTROL_REDUCER            0x1000
#define ORTE_JOB_CONTROL_COMBINER           0x2000
#define ORTE_JOB_CONTROL_NO_VM              0x4000
#define ORTE_JOB_CONTROL_INDEX_ARGV         0x8000
|
2012-05-03 01:00:22 +04:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/* global type definitions used by RTE - instanced in orte_globals.c */

/************
* Declare this to allow us to use it before fully
* defining it - resolves potential circular definition
*/
struct orte_proc_t;
struct orte_job_map_t;
/************/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Information about a specific application to be launched in the RTE.
|
|
|
|
*/
|
|
|
|
typedef struct {
|
|
|
|
/** Parent object */
|
|
|
|
opal_object_t super;
|
|
|
|
/** Unique index when multiple apps per job */
|
2010-02-27 20:37:34 +03:00
|
|
|
orte_app_idx_t idx;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** Absolute pathname of argv[0] */
|
|
|
|
char *app;
|
|
|
|
/** Number of copies of this process that are to be launched */
|
|
|
|
orte_std_cntr_t num_procs;
|
2012-08-29 01:20:17 +04:00
|
|
|
/** Array of pointers to the proc objects for procs of this app_context
|
|
|
|
* NOTE - not always used
|
|
|
|
*/
|
|
|
|
opal_pointer_array_t procs;
|
|
|
|
/** State of the app_context */
|
|
|
|
orte_app_state_t state;
|
2012-08-12 05:28:23 +04:00
|
|
|
/** First MPI rank of this app_context in the job */
|
|
|
|
orte_vpid_t first_rank;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** Standard argv-style array, including a final NULL pointer */
|
|
|
|
char **argv;
|
|
|
|
/** Standard environ-style array, including a final NULL pointer */
|
|
|
|
char **env;
|
|
|
|
/** Current working directory for this app */
|
|
|
|
char *cwd;
|
|
|
|
/** Whether the cwd was set by the user or by the system */
|
|
|
|
bool user_specified_cwd;
|
2012-08-24 01:28:05 +04:00
|
|
|
/** Whether to set the current working directory to the proc session dir */
|
|
|
|
bool set_cwd_to_session_dir;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* Any hostfile that was specified */
|
|
|
|
char *hostfile;
|
|
|
|
/* Hostfile for adding hosts to an existing allocation */
|
|
|
|
char *add_hostfile;
|
2009-07-14 18:34:11 +04:00
|
|
|
/* Hosts to be added to an existing allocation - analagous to -host */
|
|
|
|
char **add_host;
|
2008-03-06 01:12:27 +03:00
|
|
|
/** argv of hosts passed in to -host */
|
|
|
|
char ** dash_host;
|
2011-05-29 02:18:19 +04:00
|
|
|
/** list of resource constraints to be applied
|
|
|
|
* when selecting hosts for this app
|
|
|
|
*/
|
|
|
|
opal_list_t resource_constraints;
|
2008-03-06 01:12:27 +03:00
|
|
|
/** Prefix directory for this app (or NULL if no override necessary) */
|
2008-02-28 04:57:57 +03:00
|
|
|
char *prefix_dir;
|
2009-02-05 01:37:24 +03:00
|
|
|
/** Preload the binary on the remote machine (in PLM via FileM) */
|
2008-02-28 04:57:57 +03:00
|
|
|
bool preload_binary;
|
|
|
|
/** Preload the comma separated list of files to the remote machines cwd */
|
|
|
|
char * preload_files;
|
2009-01-25 15:39:24 +03:00
|
|
|
/* is being used on the local node */
|
|
|
|
bool used_on_node;
|
A number of C/R enhancements per RFC below:
http://www.open-mpi.org/community/lists/devel/2010/07/8240.php
Documentation:
http://osl.iu.edu/research/ft/
Major Changes:
--------------
* Added C/R-enabled Debugging support.
Enabled with the --enable-crdebug flag. See the following website for more information:
http://osl.iu.edu/research/ft/crdebug/
* Added Stable Storage (SStore) framework for checkpoint storage
* 'central' component does a direct to central storage save
* 'stage' component stages checkpoints to central storage while the application continues execution.
* 'stage' supports offline compression of checkpoints before moving (sstore_stage_compress)
* 'stage' supports local caching of checkpoints to improve automatic recovery (sstore_stage_caching)
* Added Compression (compress) framework to support
* Add two new ErrMgr recovery policies
* {{{crmig}}} C/R Process Migration
* {{{autor}}} C/R Automatic Recovery
* Added the {{{ompi-migrate}}} command line tool to support the {{{crmig}}} ErrMgr component
* Added CR MPI Ext functions (enable them with {{{--enable-mpi-ext=cr}}} configure option)
* {{{OMPI_CR_Checkpoint}}} (Fixes trac:2342)
* {{{OMPI_CR_Restart}}}
* {{{OMPI_CR_Migrate}}} (may need some more work for mapping rules)
* {{{OMPI_CR_INC_register_callback}}} (Fixes trac:2192)
* {{{OMPI_CR_Quiesce_start}}}
* {{{OMPI_CR_Quiesce_checkpoint}}}
* {{{OMPI_CR_Quiesce_end}}}
* {{{OMPI_CR_self_register_checkpoint_callback}}}
* {{{OMPI_CR_self_register_restart_callback}}}
* {{{OMPI_CR_self_register_continue_callback}}}
* The ErrMgr predicted_fault() interface has been changed to take an opal_list_t of ErrMgr defined types. This will allow us to better support a wider range of fault prediction services in the future.
* Add a progress meter to:
* FileM rsh (filem_rsh_process_meter)
* SnapC full (snapc_full_progress_meter)
* SStore stage (sstore_stage_progress_meter)
* Added 2 new command line options to ompi-restart
* --showme : Display the full command line that would have been exec'ed.
* --mpirun_opts : Command line options to pass directly to mpirun. (Fixes trac:2413)
* Deprecated some MCA params:
* crs_base_snapshot_dir deprecated, use sstore_stage_local_snapshot_dir
* snapc_base_global_snapshot_dir deprecated, use sstore_base_global_snapshot_dir
* snapc_base_global_shared deprecated, use sstore_stage_global_is_shared
* snapc_base_store_in_place deprecated, replaced with different components of SStore
* snapc_base_global_snapshot_ref deprecated, use sstore_base_global_snapshot_ref
* snapc_base_establish_global_snapshot_dir deprecated, never well supported
* snapc_full_skip_filem deprecated, use sstore_stage_skip_filem
Minor Changes:
--------------
* Fixes trac:1924 : {{{ompi-restart}}} now recognizes path prefixed checkpoint handles and does the right thing.
* Fixes trac:2097 : {{{ompi-info}}} should now report all available CRS components
* Fixes trac:2161 : Manual checkpoint movement. A user can 'mv' a checkpoint directory from the original location to another and still restart from it.
* Fixes trac:2208 : Honor various TMPDIR varaibles instead of forcing {{{/tmp}}}
* Move {{{ompi_cr_continue_like_restart}}} to {{{orte_cr_continue_like_restart}}} to be more flexible in where this should be set.
* opal_crs_base_metadata_write* functions have been moved to SStore to support a wider range of metadata handling functionality.
* Cleanup the CRS framework and components to work with the SStore framework.
* Cleanup the SnapC framework and components to work with the SStore framework (cleans up these code paths considerably).
* Add 'quiesce' hook to CRCP for a future enhancement.
* We now require a BLCR version that supports {{{cr_request_file()}}} or {{{cr_request_checkpoint()}}} in order to make the code more maintainable. Note that {{{cr_request_file}}} has been deprecated since 0.7.0, so we prefer to use {{{cr_request_checkpoint()}}}.
* Add optional application level INC callbacks (registered through the CR MPI Ext interface).
* Increase the {{{opal_cr_thread_sleep_wait}}} parameter to 1000 microseconds to make the C/R thread less aggressive.
* {{{opal-restart}}} now looks for cache directories before falling back on stable storage when asked.
* {{{opal-restart}}} also support local decompression before restarting
* {{{orte-checkpoint}}} now uses the SStore framework to work with the metadata
* {{{orte-restart}}} now uses the SStore framework to work with the metadata
* Remove the {{{orte-restart}}} preload option. This was removed since the user only needs to select the 'stage' component in order to support this functionality.
* Since the '-am' parameter is saved in the metadata, {{{ompi-restart}}} no longer hard codes {{{-am ft-enable-cr}}}.
* Fix {{{hnp}}} ErrMgr so that if a previous component in the stack has 'fixed' the problem, then it should be skipped.
* Make sure to decrement the number of 'num_local_procs' in the orted when one goes away.
* odls now checks the SStore framework to see if it needs to load any checkpoint files before launching (to support 'stage'). This separates the SStore logic from the --preload-[binary|files] options.
* Add unique IDs to the named pipes established between the orted and the app in SnapC. This is to better support migration and automatic recovery activities.
* Improve the checks for 'already checkpointing' error path.
* A a recovery output timer, to show how long it takes to restart a job
* Do a better job of cleaning up the old session directory on restart.
* Add a local module to the autor and crmig ErrMgr components. These small modules prevent the 'orted' component from attempting a local recovery (Which does not work for MPI apps at the moment)
* Add a fix for bounding the checkpointable region between MPI_Init and MPI_Finalize.
This commit was SVN r23587.
The following Trac tickets were found above:
Ticket 1924 --> https://svn.open-mpi.org/trac/ompi/ticket/1924
Ticket 2097 --> https://svn.open-mpi.org/trac/ompi/ticket/2097
Ticket 2161 --> https://svn.open-mpi.org/trac/ompi/ticket/2161
Ticket 2192 --> https://svn.open-mpi.org/trac/ompi/ticket/2192
Ticket 2208 --> https://svn.open-mpi.org/trac/ompi/ticket/2208
Ticket 2342 --> https://svn.open-mpi.org/trac/ompi/ticket/2342
Ticket 2413 --> https://svn.open-mpi.org/trac/ompi/ticket/2413
2010-08-11 00:51:11 +04:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
|
|
/** What files SStore should load before local launch, if any */
|
|
|
|
char *sstore_load;
|
|
|
|
#endif
|
2011-02-14 23:49:12 +03:00
|
|
|
/* recovery policy has been defined */
|
|
|
|
bool recovery_defined;
|
|
|
|
/* max number of times a process can be restarted */
|
|
|
|
int32_t max_restarts;
|
2012-10-14 07:31:32 +04:00
|
|
|
/* maximum number of procs/node for this app */
|
2012-10-14 07:45:28 +04:00
|
|
|
orte_vpid_t max_procs_per_node;
|
2013-01-20 04:33:42 +04:00
|
|
|
/* flag if nodes requested in -host are "mandatory" vs "optional" */
|
|
|
|
bool mandatory;
|
|
|
|
/* min number of nodes required */
|
|
|
|
int64_t min_number_of_nodes;
|
2008-02-28 04:57:57 +03:00
|
|
|
} orte_app_context_t;
|
|
|
|
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
/** Base object so this can be put on a list */
|
|
|
|
opal_list_item_t super;
|
|
|
|
/* index of this node object in global array */
|
|
|
|
orte_std_cntr_t index;
|
|
|
|
/** String node name */
|
|
|
|
char *name;
|
2009-01-15 21:11:50 +03:00
|
|
|
/* argv-like array of aliases for this node */
|
|
|
|
char **alias;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* daemon on this node */
|
2008-02-28 04:57:57 +03:00
|
|
|
struct orte_proc_t *daemon;
|
|
|
|
/* whether or not this daemon has been launched */
|
|
|
|
bool daemon_launched;
|
2012-05-03 01:00:22 +04:00
|
|
|
/* whether or not the location has been verified - used
|
|
|
|
* for environments where the daemon's final destination
|
|
|
|
* is uncertain
|
|
|
|
*/
|
|
|
|
bool location_verified;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** Launch id - needed by some systems to launch a proc on this node */
|
|
|
|
int32_t launch_id;
|
|
|
|
/** number of procs on this node */
|
|
|
|
orte_vpid_t num_procs;
|
|
|
|
/* array of pointers to procs on this node */
|
2008-02-28 08:32:23 +03:00
|
|
|
opal_pointer_array_t *procs;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* next node rank on this node */
|
2008-09-25 17:39:08 +04:00
|
|
|
orte_node_rank_t next_node_rank;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* whether or not we are oversubscribed */
|
|
|
|
bool oversubscribed;
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
/* whether we have been added to the current map */
|
|
|
|
bool mapped;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** State of this node */
|
|
|
|
orte_node_state_t state;
|
|
|
|
/** A "soft" limit on the number of slots available on the node.
|
|
|
|
This will typically correspond to the number of physical CPUs
|
|
|
|
that we have been allocated on this node and would be the
|
|
|
|
"ideal" number of processes for us to launch. */
|
|
|
|
orte_std_cntr_t slots;
|
If (and only if) a user requests, set the default number of slots on any node to the number of objects of the specified type. This *only* takes effect in an unmanaged environment - i.e., if an external resource manager assigns us a number of slots, then that is what we use. However, if we are using a hostfile, then the user may or may not have given us a value for the number of slots on each node.
For those nodes (and *only* those nodes) where the user does *not* specify a slot count, we will set the number of slots according to their direction: either to the number of cores, numas, sockets, or hwthreads. Otherwise, the slot count is set to 1.
Note that the default behavior remains unchanged: in the absence of any value for #slots, and in the absence of any directive to set #slots, we will set #slots=1.
This commit was SVN r27236.
2012-09-05 00:58:26 +04:00
|
|
|
/* a flag indicating that the number of slots was specified - used
|
|
|
|
* only in non-managed environments
|
|
|
|
*/
|
|
|
|
bool slots_given;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** How many processes have already been launched, used by one or
|
|
|
|
more jobs on this node. */
|
|
|
|
orte_std_cntr_t slots_inuse;
|
|
|
|
/** A "hard" limit (if set -- a value of 0 implies no hard limit)
|
|
|
|
on the number of slots that can be allocated on a given
|
|
|
|
node. This is because in some environments (e.g. grid) there may be
|
|
|
|
fixed limits on the number of slots that can be used.
|
|
|
|
|
|
|
|
This value also could have been a boolean - but we may want to
|
|
|
|
allow the hard limit to be different from the soft limit - in
|
|
|
|
other words allow the node to be oversubscribed up to a
|
|
|
|
specified limit. For example, if we have two processors, we
|
|
|
|
may want to allow up to four processes but no more. */
|
|
|
|
orte_std_cntr_t slots_max;
|
|
|
|
/** Username on this node, if specified */
|
|
|
|
char *username;
|
2011-09-11 23:02:24 +04:00
|
|
|
#if OPAL_HAVE_HWLOC
|
|
|
|
/* system topology for this node */
|
|
|
|
hwloc_topology_t topology;
|
|
|
|
#endif
|
2011-06-30 07:12:38 +04:00
|
|
|
/* history of resource usage - sized by sensor framework */
|
|
|
|
opal_ring_buffer_t stats;
|
2008-02-28 04:57:57 +03:00
|
|
|
} orte_node_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
/** Base object so this can be put on a list */
|
|
|
|
opal_list_item_t super;
|
|
|
|
/* jobid for this job */
|
|
|
|
orte_jobid_t jobid;
|
2012-08-29 01:20:17 +04:00
|
|
|
/* flag indicating that the job has been updated
|
|
|
|
* and needs to be included in the pidmap message
|
|
|
|
*/
|
|
|
|
bool updated;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* app_context array for this job */
|
2008-02-28 08:32:23 +03:00
|
|
|
opal_pointer_array_t *apps;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* number of app_contexts in the array */
|
2010-02-27 20:37:34 +03:00
|
|
|
orte_app_idx_t num_apps;
|
2008-03-07 00:56:00 +03:00
|
|
|
/* flags to control the launch of this job - see above
|
|
|
|
* for description of supported flags
|
|
|
|
*/
|
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 04:00:49 +04:00
|
|
|
orte_job_controls_t controls;
|
2012-09-05 05:30:39 +04:00
|
|
|
/* flag to indicate that MPI is allowed on this job - i.e.,
|
|
|
|
* that all members of the job are being simultaneously
|
|
|
|
* launched
|
|
|
|
*/
|
|
|
|
bool gang_launched;
|
Roll in the revamped IOF subsystem. Per the devel mailing list email, this is a complete rewrite of the iof framework designed to simplify the code for maintainability, and to support features we had planned to do, but were too difficult to implement in the old code. Specifically, the new code:
1. completely and cleanly separates responsibilities between the HNP, orted, and tool components.
2. removes all wireup messaging during launch and shutdown.
3. maintains flow control for stdin to avoid large-scale consumption of memory by orteds when large input files are forwarded. This is done using an xon/xoff protocol.
4. enables specification of stdin recipients on the mpirun cmd line. Allowed options include rank, "all", or "none". Default is rank 0.
5. creates a new MPI_Info key "ompi_stdin_target" that supports the above options for child jobs. Default is "none".
6. adds a new tool "orte-iof" that can connect to a running mpirun and display the output. Cmd line options allow selection of any combination of stdout, stderr, and stddiag. Default is stdout.
7. adds a new mpirun and orte-iof cmd line option "tag-output" that will tag each line of output with process name and stream ident. For example, "[1,0]<stdout>this is output"
This is not intended for the 1.3 release as it is a major change requiring considerable soak time.
This commit was SVN r19767.
2008-10-18 04:00:49 +04:00
|
|
|
/* rank desiring stdin - for now, either one rank, all ranks
|
|
|
|
* (wildcard), or none (invalid)
|
|
|
|
*/
|
|
|
|
orte_vpid_t stdin_target;
|
2012-05-03 01:00:22 +04:00
|
|
|
/* job that is to receive the stdout (on its stdin) from this one */
|
|
|
|
orte_jobid_t stdout_target;
|
2012-04-13 04:38:47 +04:00
|
|
|
/* collective ids */
|
|
|
|
orte_grpcomm_coll_id_t peer_modex;
|
|
|
|
orte_grpcomm_coll_id_t peer_init_barrier;
|
|
|
|
orte_grpcomm_coll_id_t peer_fini_barrier;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* total slots allocated to this job */
|
|
|
|
orte_std_cntr_t total_slots_alloc;
|
|
|
|
/* number of procs in this job */
|
|
|
|
orte_vpid_t num_procs;
|
|
|
|
/* array of pointers to procs in this job */
|
2008-02-28 08:32:23 +03:00
|
|
|
opal_pointer_array_t *procs;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* map of the job */
|
2009-08-11 06:51:27 +04:00
|
|
|
struct orte_job_map_t *map;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* bookmark for where we are in mapping - this
|
|
|
|
* indicates the node where we stopped
|
|
|
|
*/
|
|
|
|
orte_node_t *bookmark;
|
|
|
|
/* state of the overall job */
|
|
|
|
orte_job_state_t state;
|
2012-04-06 18:23:13 +04:00
|
|
|
/* some procs in this job are being restarted */
|
|
|
|
bool restart;
|
2012-08-30 00:35:52 +04:00
|
|
|
/* number of procs mapped */
|
|
|
|
orte_vpid_t num_mapped;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* number of procs launched */
|
|
|
|
orte_vpid_t num_launched;
|
|
|
|
/* number of procs reporting contact info */
|
|
|
|
orte_vpid_t num_reported;
|
|
|
|
/* number of procs terminated */
|
|
|
|
orte_vpid_t num_terminated;
|
2010-04-23 08:44:41 +04:00
|
|
|
/* number of daemons reported launched so we can track progress */
|
|
|
|
orte_vpid_t num_daemons_reported;
|
2012-04-06 18:23:13 +04:00
|
|
|
/* number of procs with non-zero exit codes */
|
|
|
|
int32_t num_non_zero_exit;
|
|
|
|
/* originator of a dynamic spawn */
|
|
|
|
orte_process_name_t originator;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* did this job abort? */
|
|
|
|
bool abort;
|
|
|
|
/* proc that caused that to happen */
|
|
|
|
struct orte_proc_t *aborted_proc;
|
2011-02-14 23:49:12 +03:00
|
|
|
/* recovery policy has been defined */
|
|
|
|
bool recovery_defined;
|
2010-04-28 08:06:57 +04:00
|
|
|
/* enable recovery of these processes */
|
|
|
|
bool enable_recovery;
|
2010-04-23 08:44:41 +04:00
|
|
|
/* time launch message was sent */
|
|
|
|
struct timeval launch_msg_sent;
|
2012-04-06 18:23:13 +04:00
|
|
|
/* time launch message was recvd */
|
|
|
|
struct timeval launch_msg_recvd;
|
2010-04-23 08:44:41 +04:00
|
|
|
/* max time for launch msg to be received */
|
|
|
|
struct timeval max_launch_msg_recvd;
|
2012-04-06 18:23:13 +04:00
|
|
|
orte_vpid_t num_local_procs;
|
2012-10-30 03:11:30 +04:00
|
|
|
/* file maps associated with this job */
|
2012-11-10 18:09:12 +04:00
|
|
|
opal_buffer_t *file_maps;
|
2010-03-13 02:57:50 +03:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2008-02-28 04:57:57 +03:00
|
|
|
/* ckpt state */
|
|
|
|
size_t ckpt_state;
|
|
|
|
/* snapshot reference */
|
|
|
|
char *ckpt_snapshot_ref;
|
|
|
|
/* snapshot location */
|
|
|
|
char *ckpt_snapshot_loc;
|
|
|
|
#endif
|
|
|
|
} orte_job_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_t);
|
|
|
|
|
|
|
|
struct orte_proc_t {
|
|
|
|
/** Base object so this can be put on a list */
|
|
|
|
opal_list_item_t super;
|
|
|
|
/* process name */
|
|
|
|
orte_process_name_t name;
|
2013-01-08 08:41:12 +04:00
|
|
|
/* the vpid of my parent */
|
|
|
|
orte_vpid_t parent;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* pid */
|
|
|
|
pid_t pid;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* local rank amongst my peers on the node
|
|
|
|
* where this is running - this value is
|
|
|
|
* needed by MPI procs so that the lowest
|
|
|
|
* rank on a node can perform certain fns -
|
|
|
|
* e.g., open an sm backing file
|
|
|
|
*/
|
2008-09-25 17:39:08 +04:00
|
|
|
orte_local_rank_t local_rank;
|
2008-04-30 23:49:53 +04:00
|
|
|
/* local rank on the node across all procs
|
|
|
|
* and jobs known to this HNP - this is
|
|
|
|
* needed so that procs can do things like
|
|
|
|
* know which static IP port to use
|
|
|
|
*/
|
2008-09-25 17:39:08 +04:00
|
|
|
orte_node_rank_t node_rank;
|
2011-06-17 00:31:30 +04:00
|
|
|
/* rank of this proc within its app context - this
|
|
|
|
* will just equal its vpid for single app_context
|
|
|
|
* applications
|
|
|
|
*/
|
|
|
|
int32_t app_rank;
|
2010-03-24 00:28:02 +03:00
|
|
|
/* Last state used to trigger the errmgr for this proc */
|
|
|
|
orte_proc_state_t last_errmgr_state;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* process state */
|
|
|
|
orte_proc_state_t state;
|
2012-04-06 18:23:13 +04:00
|
|
|
/* shortcut for determining that the proc has been launched
|
|
|
|
* and has not yet terminated
|
|
|
|
*/
|
|
|
|
bool alive;
|
2012-04-10 23:08:54 +04:00
|
|
|
/* flag if it called abort */
|
|
|
|
bool aborted;
|
2012-08-29 01:20:17 +04:00
|
|
|
/* flag that the proc has been updated and needs to be
|
|
|
|
* included in the next pidmap message
|
|
|
|
*/
|
|
|
|
bool updated;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* exit code */
|
|
|
|
orte_exit_code_t exit_code;
|
|
|
|
/* the app_context that generated this proc */
|
2010-02-27 20:37:34 +03:00
|
|
|
orte_app_idx_t app_idx;
|
2011-10-29 19:12:45 +04:00
|
|
|
#if OPAL_HAVE_HWLOC
|
|
|
|
/* hwloc object to which this process was mapped */
|
|
|
|
hwloc_obj_t locale;
|
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here:
https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement
The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation.
In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions:
1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior.
2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation.
3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so.
As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes.
This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
|
|
|
/* where the proc was bound */
|
|
|
|
unsigned int bind_idx;
|
|
|
|
/* string representation of cpu bindings */
|
|
|
|
char *cpu_bitmap;
|
2011-10-29 19:12:45 +04:00
|
|
|
#endif
|
2008-02-28 04:57:57 +03:00
|
|
|
/* pointer to the node where this proc is executing */
|
|
|
|
orte_node_t *node;
|
2012-04-06 18:23:13 +04:00
|
|
|
/* indicate that this proc is local */
|
|
|
|
bool local_proc;
|
|
|
|
/* indicate that this proc should not barrier - used
|
|
|
|
* for restarting processes
|
|
|
|
*/
|
|
|
|
bool do_not_barrier;
|
2011-02-20 21:46:21 +03:00
|
|
|
/* pointer to the node where this proc last executed */
|
|
|
|
orte_node_t *prior_node;
|
2008-02-28 04:57:57 +03:00
|
|
|
/* name of the node where this proc is executing - this
|
|
|
|
* is used simply to pass that info to a calling
|
|
|
|
* tool since it may not have a node array available
|
|
|
|
*/
|
|
|
|
char *nodename;
|
|
|
|
/* RML contact info */
|
|
|
|
char *rml_uri;
|
2010-01-07 04:19:44 +03:00
|
|
|
/* number of times this process has been restarted */
|
|
|
|
int32_t restarts;
|
2011-06-21 20:08:41 +04:00
|
|
|
/* time of last restart */
|
|
|
|
struct timeval last_failure;
|
|
|
|
/* number of failures in "fast" window */
|
|
|
|
int32_t fast_failures;
|
2011-02-16 21:50:51 +03:00
|
|
|
/* flag to indicate proc has reported in */
|
|
|
|
bool reported;
|
|
|
|
/* if heartbeat recvd during last time period */
|
2011-05-20 04:21:33 +04:00
|
|
|
int beat;
|
2011-06-30 07:12:38 +04:00
|
|
|
/* history of resource usage - sized by sensor framework */
|
|
|
|
opal_ring_buffer_t stats;
|
2012-04-06 18:23:13 +04:00
|
|
|
/* track finalization */
|
|
|
|
bool registered;
|
2012-08-29 01:20:17 +04:00
|
|
|
bool mpi_proc;
|
2012-04-06 18:23:13 +04:00
|
|
|
bool deregistered;
|
|
|
|
bool iof_complete;
|
|
|
|
bool waitpid_recvd;
|
2010-03-13 02:57:50 +03:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2008-02-28 04:57:57 +03:00
|
|
|
/* ckpt state */
|
|
|
|
size_t ckpt_state;
|
|
|
|
/* snapshot reference */
|
|
|
|
char *ckpt_snapshot_ref;
|
|
|
|
/* snapshot location */
|
|
|
|
char *ckpt_snapshot_loc;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
typedef struct orte_proc_t orte_proc_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_proc_t);
|
|
|
|
|
|
|
|
/**
|
2012-06-27 18:53:55 +04:00
|
|
|
* Get a job data object
|
2008-02-28 04:57:57 +03:00
|
|
|
* We cannot just reference a job data object with its jobid as
|
|
|
|
* the jobid is no longer an index into the array. This change
|
|
|
|
* was necessitated by modification of the jobid to include
|
|
|
|
* an mpirun-unique qualifier to eliminate any global name
|
|
|
|
* service
|
|
|
|
*/
|
|
|
|
ORTE_DECLSPEC orte_job_t* orte_get_job_data_object(orte_jobid_t job);
|
|
|
|
|
2012-06-27 18:53:55 +04:00
|
|
|
/**
|
|
|
|
* Get a proc data object
|
|
|
|
*/
|
|
|
|
ORTE_DECLSPEC orte_proc_t* orte_get_proc_object(orte_process_name_t *proc);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get the daemon vpid hosting a given proc
|
|
|
|
*/
|
|
|
|
ORTE_DECLSPEC orte_vpid_t orte_get_proc_daemon_vpid(orte_process_name_t *proc);
|
|
|
|
|
|
|
|
/* Get the hostname of a proc */
|
|
|
|
ORTE_DECLSPEC char* orte_get_proc_hostname(orte_process_name_t *proc);
|
|
|
|
|
|
|
|
/* get the node rank of a proc */
|
|
|
|
ORTE_DECLSPEC orte_node_rank_t orte_get_proc_node_rank(orte_process_name_t *proc);
|
|
|
|
|
2011-02-14 22:45:59 +03:00
|
|
|
/* Find the lowest vpid alive in a given job */
|
|
|
|
ORTE_DECLSPEC orte_vpid_t orte_get_lowest_vpid_alive(orte_jobid_t job);
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/* global variables used by RTE - instanced in orte_globals.c */
|
2008-11-01 00:10:00 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_timing;
|
2009-01-08 17:25:56 +03:00
|
|
|
ORTE_DECLSPEC extern FILE *orte_timing_output;
|
2009-01-12 22:12:58 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_timing_details;
|
2009-08-20 15:12:45 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_debug_daemons_flag;
|
|
|
|
ORTE_DECLSPEC extern bool orte_debug_daemons_file_flag;
|
2008-08-14 22:59:01 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_leave_session_attached;
|
2008-04-17 17:50:59 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_do_not_launch;
|
2008-02-28 04:57:57 +03:00
|
|
|
ORTE_DECLSPEC extern bool orted_spin_flag;
|
2010-08-09 23:28:56 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_local_cpu_type;
|
2009-12-01 02:11:25 +03:00
|
|
|
ORTE_DECLSPEC extern char *orte_local_cpu_model;
|
2010-07-18 01:03:27 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_basename;
|
2009-08-21 22:03:34 +04:00
|
|
|
|
|
|
|
/* ORTE OOB port flags */
|
2008-03-28 05:20:37 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_static_ports;
|
2009-08-21 22:03:34 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_oob_static_ports;
|
2009-08-22 06:58:20 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_standalone_operation;
|
2012-06-15 14:15:07 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_use_common_port;
|
2009-08-21 22:03:34 +04:00
|
|
|
|
2012-11-16 08:04:29 +04:00
|
|
|
/* nodename flags */
|
2008-04-02 00:32:17 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
|
2011-12-01 18:24:43 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_have_fqdn_allocation;
|
2008-11-24 22:57:08 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
|
2012-11-16 08:04:29 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_retain_aliases;
|
2013-01-18 09:00:05 +04:00
|
|
|
ORTE_DECLSPEC extern int orte_use_hostname_alias;
|
2012-11-16 08:04:29 +04:00
|
|
|
|
|
|
|
/* debug flags */
|
2008-05-29 17:38:27 +04:00
|
|
|
ORTE_DECLSPEC extern int orted_debug_failure;
|
2008-06-03 01:46:34 +04:00
|
|
|
ORTE_DECLSPEC extern int orted_debug_failure_delay;
|
2012-11-16 08:04:29 +04:00
|
|
|
|
|
|
|
/* homogeneity flags */
|
2008-06-24 21:50:56 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_homogeneous_nodes;
|
|
|
|
ORTE_DECLSPEC extern bool orte_hetero_apps;
|
2011-11-01 22:43:10 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_hetero_nodes;
|
2012-11-16 08:04:29 +04:00
|
|
|
|
2008-08-19 19:19:30 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_never_launched;
|
2008-09-23 19:46:34 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_devel_level_output;
|
2011-10-29 19:12:45 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_display_topo_with_map;
|
2011-11-03 18:22:07 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_display_diffable_output;
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
ORTE_DECLSPEC extern char **orte_launch_environ;
|
2008-04-14 22:26:08 +04:00
|
|
|
|
2008-07-25 21:13:22 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
|
2008-08-04 18:25:19 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_allocation_required;
|
2012-09-04 20:34:05 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_managed_allocation;
|
If (and only if) a user requests, set the default number of slots on any node to the number of objects of the specified type. This *only* takes effect in an unmanaged environment - i.e., if an external resource manager assigns us a number of slots, then that is what we use. However, if we are using a hostfile, then the user may or may not have given us a value for the number of slots on each node.
For those nodes (and *only* those nodes) where the user does *not* specify a slot count, we will set the number of slots according to their direction: either to the number of cores, numas, sockets, or hwthreads. Otherwise, the slot count is set to 1.
Note that the default behavior remains unchanged: in the absence of any value for #slots, and in the absence of any directive to set #slots, we will set #slots=1.
This commit was SVN r27236.
2012-09-05 00:58:26 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_set_slots;
|
|
|
|
ORTE_DECLSPEC extern bool orte_display_allocation;
|
|
|
|
ORTE_DECLSPEC extern bool orte_display_devel_allocation;
|
2012-09-05 22:42:09 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_soft_locations;
|
2008-07-25 21:13:22 +04:00
|
|
|
|
2011-06-30 07:12:38 +04:00
|
|
|
/* launch agents */
|
Per the July technical meeting:
Standardize the handling of the orte launch agent option across PLMs. This has been a consistent complaint I have received - each PLM would register its own MCA param to get input on the launch agent for remote nodes (in fact, one or two didn't, but most did). This would then get handled in various and contradictory ways.
Some PLMs would accept only a one-word input. Others accepted multi-word args such as "valgrind orted", but then some would error by putting any prefix specified on the cmd line in front of the incorrect argument.
For example, while using the rsh launcher, if you specified "valgrind orted" as your launch agent and had "--prefix foo" on you cmd line, you would attempt to execute "ssh foo/valgrind orted" - which obviously wouldn't work.
This was all -very- confusing to users, who had to know which PLM was being used so they could even set the right mca param in the first place! And since we don't warn about non-recognized or non-used mca params, half of the time they would wind up not doing what they thought they were telling us to do.
To solve this problem, we did the following:
1. removed all mca params from the individual plms for the launch agent
2. added a new mca param "orte_launch_agent" for this purpose. To further simplify for users, this comes with a new cmd line option "--launch-agent" that can take a multi-word string argument. The value of the param defaults to "orted".
3. added a PLM base function that processes the orte_launch_agent value and adds the contents to a provided argv array. This can subsequently be harvested at-will to handle multi-word values
4. modified the PLMs to use this new function. All the PLMs except for the rsh PLM required very minor change - just called the function and moved on. The rsh PLM required much larger changes as - because of the rsh/ssh cmd line limitations - we had to correctly prepend any provided prefix to the correct argv entry.
5. added a new opal_argv_join_range function that allows the caller to "join" argv entries between two specified indices
Please let me know of any problems. I tried to make this as clean as possible, but cannot compile all PLMs to ensure all is correct.
This commit was SVN r19097.
2008-07-30 22:26:24 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_launch_agent;
|
2008-02-28 04:57:57 +03:00
|
|
|
ORTE_DECLSPEC extern char **orted_cmd_line;
|
2011-06-30 07:12:38 +04:00
|
|
|
ORTE_DECLSPEC extern char **orte_fork_agent;
|
2008-08-05 19:09:29 +04:00
|
|
|
|
2010-10-23 00:07:24 +04:00
|
|
|
/* debugger job */
|
2012-01-11 19:53:09 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_debugger_dump_proctable;
|
|
|
|
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
|
|
|
|
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
|
|
|
|
ORTE_DECLSPEC extern int orte_debugger_check_rate;
|
2008-08-13 21:47:24 +04:00
|
|
|
|
2010-07-18 01:03:27 +04:00
|
|
|
/* exit flags */
|
2008-02-28 04:57:57 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
|
2008-11-01 00:10:00 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_routing_is_enabled;
|
2009-02-27 13:16:25 +03:00
|
|
|
ORTE_DECLSPEC extern bool orte_job_term_ordered;
|
2010-05-23 06:57:03 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_orteds_term_ordered;
|
2012-11-10 18:09:12 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_allowed_exit_without_sync;
|
2008-06-03 01:46:34 +04:00
|
|
|
ORTE_DECLSPEC extern int orte_startup_timeout;
|
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 20:58:59 +03:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;
|
|
|
|
ORTE_DECLSPEC extern float orte_max_timeout;
|
|
|
|
|
2008-05-01 23:19:34 +04:00
|
|
|
ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/* global arrays for data storage */
|
2008-02-28 08:32:23 +03:00
|
|
|
ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data;
|
|
|
|
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
|
2011-09-11 23:02:24 +04:00
|
|
|
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
|
2012-04-06 18:23:13 +04:00
|
|
|
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
|
2012-05-03 01:00:22 +04:00
|
|
|
ORTE_DECLSPEC extern uint16_t orte_num_jobs;
|
2008-02-28 04:57:57 +03:00
|
|
|
|
2009-01-30 21:50:10 +03:00
|
|
|
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
|
|
|
ORTE_DECLSPEC extern bool orte_forward_job_control;
|
2009-01-08 17:25:56 +03:00
|
|
|
|
2009-01-31 01:47:30 +03:00
|
|
|
/* IOF controls */
|
|
|
|
ORTE_DECLSPEC extern bool orte_tag_output;
|
|
|
|
ORTE_DECLSPEC extern bool orte_timestamp_output;
|
|
|
|
ORTE_DECLSPEC extern char *orte_output_filename;
|
|
|
|
/* generate new xterm windows to display output from specified ranks */
|
|
|
|
ORTE_DECLSPEC extern char *orte_xterm;
|
2009-01-07 17:58:38 +03:00
|
|
|
|
2009-06-03 03:52:02 +04:00
|
|
|
/* whether or not to report launch progress */
|
|
|
|
ORTE_DECLSPEC extern bool orte_report_launch_progress;
|
|
|
|
|
2009-08-11 06:51:27 +04:00
|
|
|
/* allocation specification */
|
2009-08-13 20:08:43 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_default_hostfile;
|
2012-02-15 08:16:05 +04:00
|
|
|
ORTE_DECLSPEC extern bool orte_default_hostfile_given;
|
2009-08-13 20:08:43 +04:00
|
|
|
ORTE_DECLSPEC extern char *orte_rankfile;
|
2010-02-24 11:50:03 +03:00
|
|
|
#ifdef __WINDOWS__
|
2010-02-23 22:42:51 +03:00
|
|
|
ORTE_DECLSPEC extern char *orte_ccp_headnode;
|
2010-02-24 11:50:03 +03:00
|
|
|
#endif
|
2011-07-07 22:54:30 +04:00
|
|
|
/* Integer count; the name suggests the number of nodes in the allocation
 * -- TODO confirm when it is set. */
ORTE_DECLSPEC extern int orte_num_allocated_nodes;
|
|
|
|
/* String; the name suggests a regular expression describing node names.
 * NOTE(review): syntax and ownership are not visible here -- confirm. */
ORTE_DECLSPEC extern char *orte_node_regex;
|
2009-08-11 06:51:27 +04:00
|
|
|
|
2009-09-09 09:28:45 +04:00
|
|
|
/* tool communication controls */
|
|
|
|
/* Tool communication control flag; presumably enables reporting of events
 * to an attached tool -- TODO confirm. */
ORTE_DECLSPEC extern bool orte_report_events;
|
|
|
|
/* Tool communication control string; presumably the URI that events are
 * reported to -- TODO confirm format and ownership. */
ORTE_DECLSPEC extern char *orte_report_events_uri;
|
|
|
|
|
2010-04-28 08:06:57 +04:00
|
|
|
/* process recovery */
|
|
|
|
/* Process recovery flag; presumably turns on recovery of failed processes
 * when true -- TODO confirm. */
ORTE_DECLSPEC extern bool orte_enable_recovery;
|
2011-02-14 23:49:12 +03:00
|
|
|
/* Signed 32-bit value grouped under process recovery; the name suggests a
 * cap on restarts. NOTE(review): confirm the meaning of zero/negative. */
ORTE_DECLSPEC extern int32_t orte_max_restarts;
|
2012-04-06 18:23:13 +04:00
|
|
|
/* barrier control */
|
|
|
|
/* Barrier control flag; the name suggests barriers are skipped when true
 * -- TODO confirm which barriers are affected. */
ORTE_DECLSPEC extern bool orte_do_not_barrier;
|
2010-04-28 08:06:57 +04:00
|
|
|
|
2010-05-12 22:11:58 +04:00
|
|
|
/* exit status reporting */
|
|
|
|
/* Exit status reporting flag; the name suggests child jobs are reported
 * separately when true -- TODO confirm. */
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
|
|
|
|
/* struct timeval grouped under exit status reporting; presumably the time
 * allowed for child processes to exit -- TODO confirm. */
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
|
2011-04-14 19:04:21 +04:00
|
|
|
/* Exit status reporting flag; the name suggests the job is aborted when a
 * process exits with a non-zero status -- TODO confirm. */
ORTE_DECLSPEC extern bool orte_abort_non_zero_exit;
|
2011-03-13 03:46:42 +03:00
|
|
|
|
2011-06-30 07:12:38 +04:00
|
|
|
/* Length of stat history to keep.
 * NOTE(review): the unit is presumably a number of samples -- confirm. */
|
|
|
|
ORTE_DECLSPEC extern int orte_stat_history_size;
|
|
|
|
|
2011-12-07 01:31:22 +04:00
|
|
|
/* Environment variables (envars) to forward.
 * NOTE(review): the string's delimiter/format is not visible here --
 * confirm at the code that parses it. */
|
|
|
|
ORTE_DECLSPEC extern char *orte_forward_envars;
|
|
|
|
|
2012-05-03 01:00:22 +04:00
|
|
|
/* map-reduce mode */
|
|
|
|
/* Map-reduce mode flag; presumably true when the runtime operates in
 * map-reduce mode -- TODO confirm. */
ORTE_DECLSPEC extern bool orte_map_reduce;
|
2012-11-10 18:09:12 +04:00
|
|
|
/* Boolean flag grouped under map-reduce mode; the name suggests staged
 * execution is enabled when true -- TODO confirm. */
ORTE_DECLSPEC extern bool orte_staged_execution;
|
2012-05-03 01:00:22 +04:00
|
|
|
|
2012-04-27 18:39:34 +04:00
|
|
|
/* Map stddiag output to stderr so it isn't forwarded to mpirun.
 * Extern declaration only -- the flag itself is defined elsewhere. */
|
|
|
|
ORTE_DECLSPEC extern bool orte_map_stddiag_to_stderr;
|
|
|
|
|
2012-05-27 20:48:19 +04:00
|
|
|
/* Maximum size of the virtual machine - used to subdivide an allocation.
 * NOTE(review): the unit is presumably a node/daemon count, and the value
 * meaning "no limit" is not visible here -- confirm at the definition. */
|
|
|
|
ORTE_DECLSPEC extern int orte_max_vm_size;
|
|
|
|
|
2012-06-08 05:23:08 +04:00
|
|
|
/* Record of the selected OOB component.
 * NOTE(review): presumably holds the component's name as a string --
 * confirm who sets and frees it. */
|
|
|
|
ORTE_DECLSPEC extern char *orte_selected_oob_component;
|
|
|
|
|
2012-08-29 01:20:17 +04:00
|
|
|
/* global nidmap/pidmap for daemons to give to apps */
|
|
|
|
/* Byte object holding the global nidmap that daemons give to apps.
 * NOTE(review): the encoding of the payload is not visible here. */
ORTE_DECLSPEC extern opal_byte_object_t orte_nidmap;
|
|
|
|
/* Byte object holding the global pidmap that daemons give to apps.
 * NOTE(review): the encoding of the payload is not visible here. */
ORTE_DECLSPEC extern opal_byte_object_t orte_pidmap;
|
|
|
|
|
2008-06-18 07:15:56 +04:00
|
|
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
END_C_DECLS
|
|
|
|
|
|
|
|
#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */
|