1
1
openmpi/orte/runtime/orte_globals.h

589 строки
22 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
2015-06-24 06:59:57 +03:00
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2007-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
2015-06-24 06:59:57 +03:00
*
* Additional copyrights may follow
2015-06-24 06:59:57 +03:00
*
* $HEADER$
*/
/**
* @file
*
* Global params for OpenRTE
*/
#ifndef ORTE_RUNTIME_ORTE_GLOBALS_H
#define ORTE_RUNTIME_ORTE_GLOBALS_H
#include "orte_config.h"
#include "orte/types.h"
#include <sys/types.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/class/opal_hash_table.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_value_array.h"
#include "opal/class/opal_ring_buffer.h"
#include "opal/threads/threads.h"
#include "opal/mca/event/event.h"
#include "opal/mca/hwloc/hwloc-internal.h"
Per RFC, bring in the following changes: * Remove paffinity, maffinity, and carto frameworks -- they've been wholly replaced by hwloc. * Move ompi_mpi_init() affinity-setting/checking code down to ORTE. * Update sm, smcuda, wv, and openib components to no longer use carto. Instead, use hwloc data. There are still optimizations possible in the sm/smcuda BTLs (i.e., making multiple mpools). Also, the old carto-based code found out how many NUMA nodes were ''available'' -- not how many were used ''in this job''. The new hwloc-using code computes the same value -- it was not updated to calculate how many NUMA nodes are used ''by this job.'' * Note that I cannot compile the smcuda and wv BTLs -- I ''think'' they're right, but they need to be verified by their owners. * The openib component now does a bunch of stuff to figure out where "near" OpenFabrics devices are. '''THIS IS A CHANGE IN DEFAULT BEHAVIOR!!''' and still needs to be verified by OpenFabrics vendors (I do not have a NUMA machine with an OpenFabrics device that is a non-uniform distance from multiple different NUMA nodes). * Completely rewrite the OMPI_Affinity_str() routine from the "affinity" mpiext extension. This extension now understands hyperthreads; the output format of it has changed a bit to reflect this new information. * Bunches of minor changes around the code base to update names/types from maffinity/paffinity-based names to hwloc-based names. * Add some helper functions into the hwloc base, mainly having to do with the fact that we have the hwloc data reporting ''all'' topology information, but sometimes you really only want the (online | available) data. This commit was SVN r26391.
2012-05-07 18:52:54 +04:00
#include "opal/mca/hwloc/base/base.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/util/attr.h"
#include "orte/util/proc_info.h"
Preparation work for another commit (after RFC): - This patch solely _adds_ required headers and is rather localized The next patch (after RFC) heavily removes headers (based on script) - ompi/communicator/communicator.h: For sources that use ompi_mpi_comm_world, don't require them to include "mpi.h" - ompi/debuggers/ompi_common_dll.c: mca_topo_base_comm_1_0_0_t needs #include "ompi/mca/topo/topo.h" - ompi/errhandler/errhandler_predefined.h: ompi/communicator/communicator.h depends on this header file! To prevent recursion just have fwd declarations. #include "ompi/types.h" for fwd declarations of the main structs. - ompi/mca/btl/btl.h: #include "opal/types.h" for ompi_ptr_t - ompi/mca/mpool/base/mpool_base_tree.c: We use ompi_free_list_t and ompi_rb_tree_t, so have the proper classes - ompi/mca/op/op.h: Op is pretty self-contained: Nobody up to now has done #include "opal/class/opal_object.h" - ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.h: #include "opal/types.h" for ompi_ptr_t - ompi/mca/pml/base/base.h: We use opal_lists - ompi/mca/pml/dr/pml_dr_vfrag.h: #include "opal/types.h" for ompi_ptr_t - ompi/mca/pml/ob1/pml_ob1_hdr.h: #include "ompi/mca/btl/btl.h" for mca_btl_base_segment_t - opal/dss/dss_unpack.c: #include "opal/types.h" - opal/mca/base/base.h: #include "opal/util/cmd_line.h" for opal_cmd_line_t - orte/mca/oob/tcp/oob_tcp.c: #include "opal/types.h" for opal_socklen_t - orte/mca/oob/tcp/oob_tcp.h: #include "opal/threads/threads.h" for opal_thread_t - orte/mca/oob/tcp/oob_tcp_msg.c: #include "opal/types.h" - orte/mca/oob/tcp/oob_tcp_peer.c: #include "opal/types.h" for opal_socklen_t - orte/mca/oob/tcp/oob_tcp_send.c: #include "opal/types.h" - orte/mca/plm/base/plm_base_proxy.c: #include "orte/util/name_fns.h" for ORTE_NAME_PRINT - orte/mca/rml/base/rml_base_receive.c: #include "opal/util/output.h" for OPAL_OUTPUT_VERBOSE - orte/mca/rml/oob/rml_oob_recv.c: #include "opal/types.h" for ompi_iov_base_ptr_t - orte/mca/rml/oob/rml_oob_send.c: #include "opal/types.h" for ompi_iov_base_ptr_t - orte/runtime/orte_data_server.c #include "opal/util/output.h" for OPAL_OUTPUT_VERBOSE - orte/runtime/orte_globals.h: #include "orte/util/name_fns.h" for ORTE_NAME_PRINT Tested on Linux/x86-64 This commit was SVN r20817.
2009-03-18 00:34:30 +03:00
#include "orte/util/name_fns.h"
#include "orte/util/error_strings.h"
#include "orte/runtime/runtime.h"
BEGIN_C_DECLS
ORTE_DECLSPEC extern int orte_debug_verbosity; /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern char *orte_prohibited_session_dirs; /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern bool orte_xml_output; /* instantiated in orte/runtime/orte_globals.c */
ORTE_DECLSPEC extern FILE *orte_xml_fp; /* instantiated in orte/runtime/orte_globals.c */
ORTE_DECLSPEC extern bool orte_help_want_aggregate; /* instantiated in orte/util/show_help.c */
ORTE_DECLSPEC extern char *orte_job_ident; /* instantiated in orte/runtime/orte_globals.c */
ORTE_DECLSPEC extern bool orte_create_session_dirs; /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern bool orte_execute_quiet; /* instantiated in orte/runtime/orte_globals.c */
ORTE_DECLSPEC extern bool orte_report_silent_errors; /* instantiated in orte/runtime/orte_globals.c */
ORTE_DECLSPEC extern opal_event_base_t *orte_event_base; /* instantiated in orte/runtime/orte_init.c */
Per RFC, bring in the following changes: * Remove paffinity, maffinity, and carto frameworks -- they've been wholly replaced by hwloc. * Move ompi_mpi_init() affinity-setting/checking code down to ORTE. * Update sm, smcuda, wv, and openib components to no longer use carto. Instead, use hwloc data. There are still optimizations possible in the sm/smcuda BTLs (i.e., making multiple mpools). Also, the old carto-based code found out how many NUMA nodes were ''available'' -- not how many were used ''in this job''. The new hwloc-using code computes the same value -- it was not updated to calculate how many NUMA nodes are used ''by this job.'' * Note that I cannot compile the smcuda and wv BTLs -- I ''think'' they're right, but they need to be verified by their owners. * The openib component now does a bunch of stuff to figure out where "near" OpenFabrics devices are. '''THIS IS A CHANGE IN DEFAULT BEHAVIOR!!''' and still needs to be verified by OpenFabrics vendors (I do not have a NUMA machine with an OpenFabrics device that is a non-uniform distance from multiple different NUMA nodes). * Completely rewrite the OMPI_Affinity_str() routine from the "affinity" mpiext extension. This extension now understands hyperthreads; the output format of it has changed a bit to reflect this new information. * Bunches of minor changes around the code base to update names/types from maffinity/paffinity-based names to hwloc-based names. * Add some helper functions into the hwloc base, mainly having to do with the fact that we have the hwloc data reporting ''all'' topology information, but sometimes you really only want the (online | available) data. This commit was SVN r26391.
2012-05-07 18:52:54 +04:00
ORTE_DECLSPEC extern bool orte_event_base_active; /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern bool orte_proc_is_bound; /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern int orte_progress_thread_debug; /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern char *orte_mgmt_transport;
ORTE_DECLSPEC extern char *orte_coll_transport;
ORTE_DECLSPEC extern int orte_mgmt_conduit;
ORTE_DECLSPEC extern int orte_coll_conduit;
Per RFC, bring in the following changes: * Remove paffinity, maffinity, and carto frameworks -- they've been wholly replaced by hwloc. * Move ompi_mpi_init() affinity-setting/checking code down to ORTE. * Update sm, smcuda, wv, and openib components to no longer use carto. Instead, use hwloc data. There are still optimizations possible in the sm/smcuda BTLs (i.e., making multiple mpools). Also, the old carto-based code found out how many NUMA nodes were ''available'' -- not how many were used ''in this job''. The new hwloc-using code computes the same value -- it was not updated to calculate how many NUMA nodes are used ''by this job.'' * Note that I cannot compile the smcuda and wv BTLs -- I ''think'' they're right, but they need to be verified by their owners. * The openib component now does a bunch of stuff to figure out where "near" OpenFabrics devices are. '''THIS IS A CHANGE IN DEFAULT BEHAVIOR!!''' and still needs to be verified by OpenFabrics vendors (I do not have a NUMA machine with an OpenFabrics device that is a non-uniform distance from multiple different NUMA nodes). * Completely rewrite the OMPI_Affinity_str() routine from the "affinity" mpiext extension. This extension now understands hyperthreads; the output format of it has changed a bit to reflect this new information. * Bunches of minor changes around the code base to update names/types from maffinity/paffinity-based names to hwloc-based names. * Add some helper functions into the hwloc base, mainly having to do with the fact that we have the hwloc data reporting ''all'' topology information, but sometimes you really only want the (online | available) data. This commit was SVN r26391.
2012-05-07 18:52:54 +04:00
/**
* Global indicating where this process was bound to at launch (will
* be NULL if !orte_proc_is_bound)
*/
OPAL_DECLSPEC extern hwloc_cpuset_t orte_proc_applied_binding; /* instantiated in orte/runtime/orte_init.c */
/* Shortcut for some commonly used names */
#define ORTE_NAME_WILDCARD (&orte_name_wildcard)
ORTE_DECLSPEC extern orte_process_name_t orte_name_wildcard; /** instantiated in orte/runtime/orte_init.c */
#define ORTE_NAME_INVALID (&orte_name_invalid)
ORTE_DECLSPEC extern orte_process_name_t orte_name_invalid; /** instantiated in orte/runtime/orte_init.c */
#define ORTE_PROC_MY_NAME (&orte_process_info.my_name)
/* define a special name that point to my parent (aka the process that spawned me) */
#define ORTE_PROC_MY_PARENT (&orte_process_info.my_parent)
/* define a special name that belongs to orterun */
#define ORTE_PROC_MY_HNP (&orte_process_info.my_hnp)
/* define the name of my daemon */
#define ORTE_PROC_MY_DAEMON (&orte_process_info.my_daemon)
ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
/* error manager callback function */
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
/* define an object for timer events */
typedef struct {
opal_object_t super;
struct timeval tv;
opal_event_t *ev;
void *payload;
} orte_timer_t;
OBJ_CLASS_DECLARATION(orte_timer_t);
ORTE_DECLSPEC extern int orte_exit_status;
/* ORTE event priorities - we define these
* at levels that permit higher layers such as
* OMPI to handle their events at higher priority,
* with the exception of errors. Errors generally
* require exception handling (e.g., ctrl-c termination)
* that overrides the need to process MPI messages
*/
#define ORTE_ERROR_PRI OPAL_EV_ERROR_PRI
#define ORTE_MSG_PRI OPAL_EV_MSG_LO_PRI
#define ORTE_SYS_PRI OPAL_EV_SYS_LO_PRI
#define ORTE_INFO_PRI OPAL_EV_INFO_LO_PRI
Per the meeting on moving the BTLs to OPAL, move the ORTE database "db" framework to OPAL so the relocated BTLs can access it. Because the data is indexed by process, this requires that we define a new "opal_identifier_t" that corresponds to the orte_process_name_t struct. In order to support multiple run-times, this is defined in opal/mca/db/db_types.h as a uint64_t without identifying the meaning of any part of that data. A few changes were required to support this move: 1. the PMI component used to identify rte-related data (e.g., host name, bind level) and package them as a unit to reduce the number of PMI keys. This code was moved up to the ORTE layer as the OPAL layer has no understanding of these concepts. In addition, the component locally stored data based on process jobid/vpid - this could no longer be supported (see below for the solution). 2. the hash component was updated to use the new opal_identifier_t instead of orte_process_name_t as its index for storing data in the hash tables. Previously, we did a hash on the vpid and stored the data in a 32-bit hash table. In the revised system, we don't see a separate "vpid" field - we only have a 64-bit opaque value. The orte_process_name_t hash turned out to do nothing useful, so we now store the data in a 64-bit hash table. Preliminary tests didn't show any identifiable change in behavior or performance, but we'll have to see if a move back to the 32-bit table is required at some later time. 3. the db framework was a "select one" system. However, since the PMI component could no longer use its internal storage system, the framework has now been changed to a "select many" mode of operation. This allows the hash component to handle all internal storage, while the PMI component only handles pushing/pulling things from the PMI system. This was something we had planned for some time - when fetching data, we first check internal storage to see if we already have it, and then automatically go to the global system to look for it if we don't. Accordingly, the framework was provided with a custom query function used during "select" that lets you seperately specify the "store" and "fetch" ordering. 4. the ORTE grpcomm and ess/pmi components, and the nidmap code, were updated to work with the new db framework and to specify internal/global storage options. No changes were made to the MPI layer, except for modifying the ORTE component of the OMPI/rte framework to support the new db framework. This commit was SVN r28112.
2013-02-26 21:50:04 +04:00
/* define some common keys used in ORTE */
#define ORTE_DB_DAEMON_VPID "orte.daemon.vpid"
/* State Machine lists */
ORTE_DECLSPEC extern opal_list_t orte_job_states;
ORTE_DECLSPEC extern opal_list_t orte_proc_states;
/* a clean output channel without prefix */
ORTE_DECLSPEC extern int orte_clean_output;
#define ORTE_GLOBAL_ARRAY_BLOCK_SIZE 64
#define ORTE_GLOBAL_ARRAY_MAX_SIZE INT_MAX
/* define a default error return code for ORTE */
#define ORTE_ERROR_DEFAULT_EXIT_CODE 1
/**
* Define a macro for updating the orte_exit_status
* The macro provides a convenient way of doing this
* so that we can add thread locking at some point
* since the orte_exit_status is a global variable.
*
* Ensure that we do not overwrite the exit status if it has
* already been set to some non-zero value. If we don't make
* this check, then different parts of the code could overwrite
* each other's exit status in the case of abnormal termination.
*
* For example, if a process aborts, we would record the initial
* exit code from the aborted process. However, subsequent processes
* will have been aborted by signal as we kill the job. We don't want
* the subsequent processes to overwrite the original exit code so
* we can tell the user the exit code from the process that caused
* the whole thing to happen.
*/
#define ORTE_UPDATE_EXIT_STATUS(newstatus) \
do { \
if (0 == orte_exit_status && 0 != newstatus) { \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"%s:%s(%d) updating exit status to %d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
__FILE__, __LINE__, newstatus)); \
orte_exit_status = newstatus; \
} \
} while(0);
/* sometimes we need to reset the exit status - for example, when we
* are restarting a failed process
*/
#define ORTE_RESET_EXIT_STATUS() \
do { \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"%s:%s(%d) reseting exit status", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
__FILE__, __LINE__)); \
orte_exit_status = 0; \
} while(0);
/* define a macro for computing time differences - used for timing tests
* across the code base
*/
#define ORTE_COMPUTE_TIME_DIFF(r, ur, s1, us1, s2, us2) \
do { \
(r) = (s2) - (s1); \
if ((us2) >= (us1)) { \
(ur) = (us2) - (us1); \
} else { \
(r)--; \
(ur) = 1000000 - (us1) + (us2); \
} \
} while(0);
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
/* define a set of flags to control the launch of a job */
typedef uint16_t orte_job_controls_t;
#define ORTE_JOB_CONTROL OPAL_UINT16
/* global type definitions used by RTE - instanced in orte_globals.c */
/************
* Declare this to allow us to use it before fully
* defining it - resolves potential circular definition
*/
struct orte_proc_t;
struct orte_job_map_t;
/************/
/* define an object for storing node topologies */
typedef struct {
opal_object_t super;
hwloc_topology_t topo;
char *sig;
} orte_topology_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_topology_t);
/**
* Information about a specific application to be launched in the RTE.
*/
typedef struct {
/** Parent object */
opal_object_t super;
/** Unique index when multiple apps per job */
orte_app_idx_t idx;
/** Absolute pathname of argv[0] */
char *app;
/** Number of copies of this process that are to be launched */
orte_std_cntr_t num_procs;
/** Array of pointers to the proc objects for procs of this app_context
* NOTE - not always used
*/
opal_pointer_array_t procs;
/** State of the app_context */
orte_app_state_t state;
/** First MPI rank of this app_context in the job */
orte_vpid_t first_rank;
/** Standard argv-style array, including a final NULL pointer */
char **argv;
/** Standard environ-style array, including a final NULL pointer */
char **env;
/** Current working directory for this app */
char *cwd;
/* flags */
orte_app_context_flags_t flags;
/* provide a list of attributes for this app_context in place
* of having a continually-expanding list of fixed-use values.
* This is a list of opal_value_t's, with the intent of providing
* flexibility without constantly expanding the memory footprint
* every time we want some new (rarely used) option
*/
opal_list_t attributes;
} orte_app_context_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
typedef struct {
/** Base object so this can be put on a list */
opal_list_item_t super;
/* index of this node object in global array */
orte_std_cntr_t index;
/** String node name */
char *name;
/* daemon on this node */
struct orte_proc_t *daemon;
/** number of procs on this node */
orte_vpid_t num_procs;
/* array of pointers to procs on this node */
opal_pointer_array_t *procs;
/* next node rank on this node */
orte_node_rank_t next_node_rank;
/** State of this node */
orte_node_state_t state;
/** A "soft" limit on the number of slots available on the node.
This will typically correspond to the number of physical CPUs
that we have been allocated on this note and would be the
"ideal" number of processes for us to launch. */
orte_std_cntr_t slots;
/** How many processes have already been launched, used by one or
more jobs on this node. */
orte_std_cntr_t slots_inuse;
/** A "hard" limit (if set -- a value of 0 implies no hard limit)
on the number of slots that can be allocated on a given
node. This is for some environments (e.g. grid) there may be
fixed limits on the number of slots that can be used.
2015-06-24 06:59:57 +03:00
This value also could have been a boolean - but we may want to
allow the hard limit be different than the soft limit - in
other words allow the node to be oversubscribed up to a
specified limit. For example, if we have two processors, we
may want to allow up to four processes but no more. */
orte_std_cntr_t slots_max;
/* system topology for this node */
orte_topology_t *topology;
/* flags */
orte_node_flags_t flags;
/* list of orte_attribute_t */
opal_list_t attributes;
} orte_node_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
typedef struct {
/** Base object so this can be put on a list */
opal_list_item_t super;
/* personality for this job */
char **personality;
/* jobid for this job */
orte_jobid_t jobid;
Per the PMIx RFC: WHAT: Merge the PMIx branch into the devel repo, creating a new OPAL “lmix” framework to abstract PMI support for all RTEs. Replace the ORTE daemon-level collectives with a new PMIx server and update the ORTE grpcomm framework to support server-to-server collectives WHY: We’ve had problems dealing with variations in PMI implementations, and need to extend the existing PMI definitions to meet exascale requirements. WHEN: Mon, Aug 25 WHERE: https://github.com/rhc54/ompi-svn-mirror.git Several community members have been working on a refactoring of the current PMI support within OMPI. Although the APIs are common, Slurm and Cray implement a different range of capabilities, and package them differently. For example, Cray provides an integrated PMI-1/2 library, while Slurm separates the two and requires the user to specify the one to be used at runtime. In addition, several bugs in the Slurm implementations have caused problems requiring extra coding. All this has led to a slew of #if’s in the PMI code and bugs when the corner-case logic for one implementation accidentally traps the other. Extending this support to other implementations would have increased this complexity to an unacceptable level. Accordingly, we have: * created a new OPAL “pmix” framework to abstract the PMI support, with separate components for Cray, Slurm PMI-1, and Slurm PMI-2 implementations. * Replaced the current ORTE grpcomm daemon-based collective operation with an integrated PMIx server, and updated the grpcomm APIs to provide more flexible, multi-algorithm support for collective operations. At this time, only the xcast and allgather operations are supported. * Replaced the current global collective id with a signature based on the names of the participating procs. The allows an unlimited number of collectives to be executed by any group of processes, subject to the requirement that only one collective can be active at a time for a unique combination of procs. Note that a proc can be involved in any number of simultaneous collectives - it is the specific combination of procs that is subject to the constraint * removed the prior OMPI/OPAL modex code * added new macros for executing modex send/recv to simplify use of the new APIs. The send macros allow the caller to specify whether or not the BTL supports async modex operations - if so, then the non-blocking “fence” operation is used, if the active PMIx component supports it. Otherwise, the default is a full blocking modex exchange as we currently perform. * retained the current flag that directs us to use a blocking fence operation, but only to retrieve data upon demand This commit was SVN r32570.
2014-08-21 22:56:47 +04:00
/* offset to the total number of procs so shared memory
* components can potentially connect to any spawned jobs*/
orte_vpid_t offset;
/* app_context array for this job */
opal_pointer_array_t *apps;
/* number of app_contexts in the array */
orte_app_idx_t num_apps;
/* rank desiring stdin - for now, either one rank, all ranks
* (wildcard), or none (invalid)
*/
orte_vpid_t stdin_target;
/* total slots allocated to this job */
orte_std_cntr_t total_slots_alloc;
/* number of procs in this job */
orte_vpid_t num_procs;
/* array of pointers to procs in this job */
opal_pointer_array_t *procs;
/* map of the job */
struct orte_job_map_t *map;
/* bookmark for where we are in mapping - this
* indicates the node where we stopped
*/
orte_node_t *bookmark;
/* if we are binding, bookmark the index of the
* last object we bound to */
unsigned int bkmark_obj;
/* state of the overall job */
orte_job_state_t state;
/* number of procs mapped */
orte_vpid_t num_mapped;
/* number of procs launched */
orte_vpid_t num_launched;
/* number of procs reporting contact info */
orte_vpid_t num_reported;
/* number of procs terminated */
orte_vpid_t num_terminated;
/* number of daemons reported launched so we can track progress */
orte_vpid_t num_daemons_reported;
/* originator of a dynamic spawn */
orte_process_name_t originator;
/* number of local procs */
orte_vpid_t num_local_procs;
/* flags */
orte_job_flags_t flags;
/* attributes */
opal_list_t attributes;
} orte_job_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_t);
struct orte_proc_t {
/** Base object so this can be put on a list */
opal_list_item_t super;
/* process name */
orte_process_name_t name;
/* the vpid of my parent - the daemon vpid for an app
* or the vpid of the parent in the routing tree of
* a daemon */
orte_vpid_t parent;
/* pid */
pid_t pid;
/* local rank amongst my peers on the node
* where this is running - this value is
* needed by MPI procs so that the lowest
* rank on a node can perform certain fns -
* e.g., open an sm backing file
*/
orte_local_rank_t local_rank;
/* local rank on the node across all procs
* and jobs known to this HNP - this is
* needed so that procs can do things like
* know which static IP port to use
*/
orte_node_rank_t node_rank;
/* rank of this proc within its app context - this
* will just equal its vpid for single app_context
* applications
*/
int32_t app_rank;
/* Last state used to trigger the errmgr for this proc */
orte_proc_state_t last_errmgr_state;
/* process state */
orte_proc_state_t state;
/* exit code */
orte_exit_code_t exit_code;
/* the app_context that generated this proc */
orte_app_idx_t app_idx;
/* pointer to the node where this proc is executing */
orte_node_t *node;
/* RML contact info */
char *rml_uri;
/* some boolean flags */
orte_proc_flags_t flags;
/* list of opal_value_t attributes */
opal_list_t attributes;
};
typedef struct orte_proc_t orte_proc_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_proc_t);
/**
* Get a job data object
* We cannot just reference a job data object with its jobid as
* the jobid is no longer an index into the array. This change
* was necessitated by modification of the jobid to include
* an mpirun-unique qualifer to eliminate any global name
* service
*/
ORTE_DECLSPEC orte_job_t* orte_get_job_data_object(orte_jobid_t job);
/**
* Get a proc data object
*/
ORTE_DECLSPEC orte_proc_t* orte_get_proc_object(orte_process_name_t *proc);
/**
* Get the daemon vpid hosting a given proc
*/
ORTE_DECLSPEC orte_vpid_t orte_get_proc_daemon_vpid(orte_process_name_t *proc);
/* Get the hostname of a proc */
ORTE_DECLSPEC char* orte_get_proc_hostname(orte_process_name_t *proc);
/* get the node rank of a proc */
ORTE_DECLSPEC orte_node_rank_t orte_get_proc_node_rank(orte_process_name_t *proc);
/* Find the lowest vpid alive in a given job */
ORTE_DECLSPEC orte_vpid_t orte_get_lowest_vpid_alive(orte_jobid_t job);
/* global variables used by RTE - instanced in orte_globals.c */
ORTE_DECLSPEC extern bool orte_debug_daemons_flag;
ORTE_DECLSPEC extern bool orte_debug_daemons_file_flag;
ORTE_DECLSPEC extern bool orte_leave_session_attached;
ORTE_DECLSPEC extern bool orte_do_not_launch;
ORTE_DECLSPEC extern bool orted_spin_flag;
ORTE_DECLSPEC extern char *orte_local_cpu_type;
ORTE_DECLSPEC extern char *orte_local_cpu_model;
Start reducing our dependency on the event library by removing at least one instance where we use it to redirect the program counter. Rolf reported occasional hangs of mpirun in very specific circumstances after all daemons were done. A review of MTT results indicates this may have been happening more generally in a small fraction of cases. The problem was tracked to use of the grpcomm.onesided_barrier to control daemon/mpirun termination. This relied on messaging -and- required that the program counter jump from the errmgr back to grpcomm. On rare occasions, this jump did not occur, causing mpirun to hang. This patch looks more invasive than it is - most of the affected files simply had one or two lines removed. The essence of the change is: * pulled the job_complete and quit routines out of orterun and orted_main and put them in a common place * modified the errmgr to directly call the new routines when termination is detected * removed the grpcomm.onesided_barrier and its associated RML tag * add a new "num_routes" API to the routed framework that reports back the number of dependent routes. When route_lost is called, the daemon's list of "children" is checked and adjusted if that route went to a "leaf" in the routing tree * use connection termination between daemons to track rollup of the daemon tree. Daemons and HNP now terminate once num_routes returns zero Also picked up in this commit is the addition of a new bool flag to the app_context struct, and increasing the job_control field from 8 to 16 bits. Both trivial. This commit was SVN r23429.
2010-07-18 01:03:27 +04:00
ORTE_DECLSPEC extern char *orte_basename;
**************************************************************** This change contains a non-mandatory modification of the MPI-RTE interface. Anyone wishing to support coprocessors such as the Xeon Phi may wish to add the required definition and underlying support **************************************************************** Add locality support for coprocessors such as the Intel Xeon Phi. Detecting that we are on a coprocessor inside of a host node isn't straightforward. There are no good "hooks" provided for programmatically detecting that "we are on a coprocessor running its own OS", and the ORTE daemon just thinks it is on another node. However, in order to properly use the Phi's public interface for MPI transport, it is necessary that the daemon detect that it is colocated with procs on the host. So we have to split the locality to separately record "on the same host" vs "on the same board". We already have the board-level locality flag, but not quite enough flexibility to handle this use-case. Thus, do the following: 1. add OPAL_PROC_ON_HOST flag to indicate we share a host, but not necessarily the same board 2. modify OPAL_PROC_ON_NODE to indicate we share both a host AND the same board. Note that we have to modify the OPAL_PROC_ON_LOCAL_NODE macro to explicitly check both conditions 3. add support in opal/mca/hwloc/base/hwloc_base_util.c for the host to check for coprocessors, and for daemons to check to see if they are on a coprocessor. The former is done via hwloc, but support for the latter is not yet provided by hwloc. So the code for detecting we are on a coprocessor currently is Xeon Phi specific - hopefully, we will find more generic methods in the future. 4. modify the orted and the hnp startup so they check for coprocessors and to see if they are on a coprocessor, and have the orteds pass that info back in their callback message. Automatically detect that coprocessors have been found and identify which coprocessors are on which hosts. Note that this algo isn't scalable at the moment - this will hopefully be improved over time. 5. modify the ompi proc locality detection function to look for coprocessor host info IF the OMPI_RTE_HOST_ID database key has been defined. RTE's that choose not to provide this support do not have to do anything - the associated code will simply be ignored. 6. include some cleanup of the hwloc open/close code so it conforms to how we did things in other frameworks (e.g., having a single "frame" file instead of open/close). Also, fix the locality flags - e.g., being on the same node means you must also be on the same cluster/cu, so ensure those flags are also set. cmr:v1.7.4:reviewer=hjelmn This commit was SVN r29435.
2013-10-14 20:52:58 +04:00
ORTE_DECLSPEC extern bool orte_coprocessors_detected;
ORTE_DECLSPEC extern opal_hash_table_t *orte_coprocessors;
ORTE_DECLSPEC extern char *orte_topo_signature;
Create an alternative mapping method that pushes responsibility onto the backend daemons. By default, let mpirun only pack the app_context info and send that to the backend daemons where the mapping will be done. This significantly reduces the computational time on mpirun as it isn't running up/down the topology tree computing thousands of binding locations, and it reduces the launch message to a very small number of bytes. When running -novm, fall back to the old way of doing things where mpirun computes the entire map and binding, and then sends the full info to the backend daemon. Add a new cmd line option/mca param --fwd-mpirun-port that allows mpirun to dynamically select a port, but then passes that back to all the other daemons so they will use that port as a static port for their own wireup. In this mode, we no longer "phone home" directly to mpirun, but instead use the static port to wireup at daemon start. We then use the routing tree to rollup the initial launch report, and limit the number of open sockets on mpirun's node. Update ras simulator to track the new nidmap code Cleanup some bugs in the nidmap regex code, and enhance the error message for not enough slots to include the host on which the problem is found. Update gadget platform file Initialize the range count when starting a new range Fix the no-np case in managed allocation Ensure DVM node usage gets cleaned up after each job Update scaling.pl script to use --fwd-mpirun-port. Pre-connect the daemon to its parent during launch while we are otherwise waiting for the daemon's children to send their "phone home" rollup messages Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-02-02 03:33:14 +03:00
ORTE_DECLSPEC extern bool orte_no_vm;
/* ORTE OOB port flags */
ORTE_DECLSPEC extern bool orte_static_ports;
ORTE_DECLSPEC extern char *orte_oob_static_ports;
ORTE_DECLSPEC extern bool orte_standalone_operation;
Create an alternative mapping method that pushes responsibility onto the backend daemons. By default, let mpirun only pack the app_context info and send that to the backend daemons where the mapping will be done. This significantly reduces the computational time on mpirun as it isn't running up/down the topology tree computing thousands of binding locations, and it reduces the launch message to a very small number of bytes. When running -novm, fall back to the old way of doing things where mpirun computes the entire map and binding, and then sends the full info to the backend daemon. Add a new cmd line option/mca param --fwd-mpirun-port that allows mpirun to dynamically select a port, but then passes that back to all the other daemons so they will use that port as a static port for their own wireup. In this mode, we no longer "phone home" directly to mpirun, but instead use the static port to wireup at daemon start. We then use the routing tree to rollup the initial launch report, and limit the number of open sockets on mpirun's node. Update ras simulator to track the new nidmap code Cleanup some bugs in the nidmap regex code, and enhance the error message for not enough slots to include the host on which the problem is found. Update gadget platform file Initialize the range count when starting a new range Fix the no-np case in managed allocation Ensure DVM node usage gets cleaned up after each job Update scaling.pl script to use --fwd-mpirun-port. Pre-connect the daemon to its parent during launch while we are otherwise waiting for the daemon's children to send their "phone home" rollup messages Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-02-02 03:33:14 +03:00
ORTE_DECLSPEC extern bool orte_fwd_mpirun_port;
/* nodename flags */
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
ORTE_DECLSPEC extern bool orte_have_fqdn_allocation;
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
ORTE_DECLSPEC extern bool orte_retain_aliases;
ORTE_DECLSPEC extern int orte_use_hostname_alias;
ORTE_DECLSPEC extern int orte_hostname_cutoff;
/* debug flags */
ORTE_DECLSPEC extern int orted_debug_failure;
ORTE_DECLSPEC extern int orted_debug_failure_delay;
ORTE_DECLSPEC extern bool orte_never_launched;
ORTE_DECLSPEC extern bool orte_devel_level_output;
ORTE_DECLSPEC extern bool orte_display_topo_with_map;
ORTE_DECLSPEC extern bool orte_display_diffable_output;
ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
ORTE_DECLSPEC extern bool orte_allocation_required;
ORTE_DECLSPEC extern bool orte_managed_allocation;
ORTE_DECLSPEC extern char *orte_set_slots;
ORTE_DECLSPEC extern bool orte_display_allocation;
ORTE_DECLSPEC extern bool orte_display_devel_allocation;
ORTE_DECLSPEC extern bool orte_soft_locations;
ORTE_DECLSPEC extern bool orte_hnp_connected;
/* launch agents */
Per the July technical meeting: Standardize the handling of the orte launch agent option across PLMs. This has been a consistent complaint I have received - each PLM would register its own MCA param to get input on the launch agent for remote nodes (in fact, one or two didn't, but most did). This would then get handled in various and contradictory ways. Some PLMs would accept only a one-word input. Others accepted multi-word args such as "valgrind orted", but then some would error by putting any prefix specified on the cmd line in front of the incorrect argument. For example, while using the rsh launcher, if you specified "valgrind orted" as your launch agent and had "--prefix foo" on you cmd line, you would attempt to execute "ssh foo/valgrind orted" - which obviously wouldn't work. This was all -very- confusing to users, who had to know which PLM was being used so they could even set the right mca param in the first place! And since we don't warn about non-recognized or non-used mca params, half of the time they would wind up not doing what they thought they were telling us to do. To solve this problem, we did the following: 1. removed all mca params from the individual plms for the launch agent 2. added a new mca param "orte_launch_agent" for this purpose. To further simplify for users, this comes with a new cmd line option "--launch-agent" that can take a multi-word string argument. The value of the param defaults to "orted". 3. added a PLM base function that processes the orte_launch_agent value and adds the contents to a provided argv array. This can subsequently be harvested at-will to handle multi-word values 4. modified the PLMs to use this new function. All the PLMs except for the rsh PLM required very minor change - just called the function and moved on. The rsh PLM required much larger changes as - because of the rsh/ssh cmd line limitations - we had to correctly prepend any provided prefix to the correct argv entry. 5. added a new opal_argv_join_range function that allows the caller to "join" argv entries between two specified indices Please let me know of any problems. I tried to make this as clean as possible, but cannot compile all PLMs to ensure all is correct. This commit was SVN r19097.
2008-07-30 22:26:24 +04:00
ORTE_DECLSPEC extern char *orte_launch_agent;
ORTE_DECLSPEC extern char **orted_cmd_line;
ORTE_DECLSPEC extern char **orte_fork_agent;
/* debugger job */
ORTE_DECLSPEC extern bool orte_debugger_dump_proctable;
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
ORTE_DECLSPEC extern int orte_debugger_check_rate;
Start reducing our dependency on the event library by removing at least one instance where we use it to redirect the program counter. Rolf reported occasional hangs of mpirun in very specific circumstances after all daemons were done. A review of MTT results indicates this may have been happening more generally in a small fraction of cases. The problem was tracked to use of the grpcomm.onesided_barrier to control daemon/mpirun termination. This relied on messaging -and- required that the program counter jump from the errmgr back to grpcomm. On rare occasions, this jump did not occur, causing mpirun to hang. This patch looks more invasive than it is - most of the affected files simply had one or two lines removed. The essence of the change is: * pulled the job_complete and quit routines out of orterun and orted_main and put them in a common place * modified the errmgr to directly call the new routines when termination is detected * removed the grpcomm.onesided_barrier and its associated RML tag * add a new "num_routes" API to the routed framework that reports back the number of dependent routes. When route_lost is called, the daemon's list of "children" is checked and adjusted if that route went to a "leaf" in the routing tree * use connection termination between daemons to track rollup of the daemon tree. Daemons and HNP now terminate once num_routes returns zero Also picked up in this commit is the addition of a new bool flag to the app_context struct, and increasing the job_control field from 8 to 16 bits. Both trivial. This commit was SVN r23429.
2010-07-18 01:03:27 +04:00
/* exit flags */
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
ORTE_DECLSPEC extern bool orte_routing_is_enabled;
ORTE_DECLSPEC extern bool orte_job_term_ordered;
ORTE_DECLSPEC extern bool orte_orteds_term_ordered;
ORTE_DECLSPEC extern bool orte_allowed_exit_without_sync;
ORTE_DECLSPEC extern int orte_startup_timeout;
ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;
ORTE_DECLSPEC extern float orte_max_timeout;
ORTE_DECLSPEC extern orte_timer_t *orte_mpiexec_timeout;
ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
/* global arrays for data storage */
ORTE_DECLSPEC extern opal_hash_table_t *orte_job_data;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
ORTE_DECLSPEC extern orte_vpid_t orte_total_procs;
/* IOF controls */
ORTE_DECLSPEC extern bool orte_tag_output;
ORTE_DECLSPEC extern bool orte_timestamp_output;
ORTE_DECLSPEC extern char *orte_output_filename;
/* generate new xterm windows to display output from specified ranks */
ORTE_DECLSPEC extern char *orte_xterm;
/* whether or not to report launch progress */
ORTE_DECLSPEC extern bool orte_report_launch_progress;
/* allocation specification */
ORTE_DECLSPEC extern char *orte_default_hostfile;
ORTE_DECLSPEC extern bool orte_default_hostfile_given;
ORTE_DECLSPEC extern char *orte_rankfile;
ORTE_DECLSPEC extern int orte_num_allocated_nodes;
ORTE_DECLSPEC extern char *orte_node_regex;
ORTE_DECLSPEC extern char *orte_default_dash_host;
/* PMI version control */
ORTE_DECLSPEC extern int orted_pmi_version;
/* tool communication controls */
ORTE_DECLSPEC extern bool orte_report_events;
ORTE_DECLSPEC extern char *orte_report_events_uri;
/* process recovery */
ORTE_DECLSPEC extern bool orte_enable_recovery;
ORTE_DECLSPEC extern int32_t orte_max_restarts;
/* barrier control */
ORTE_DECLSPEC extern bool orte_do_not_barrier;
/* exit status reporting */
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
ORTE_DECLSPEC extern bool orte_abort_non_zero_exit;
/* length of stat history to keep */
ORTE_DECLSPEC extern int orte_stat_history_size;
/* envars to forward */
ORTE_DECLSPEC extern char **orte_forwarded_envars;
/* map stddiag output to stderr so it isn't forwarded to mpirun */
ORTE_DECLSPEC extern bool orte_map_stddiag_to_stderr;
ORTE_DECLSPEC extern bool orte_map_stddiag_to_stdout;
/* maximum size of virtual machine - used to subdivide allocation */
ORTE_DECLSPEC extern int orte_max_vm_size;
MCA/base: Add new MCA variable system Features: - Support for an override parameter file (openmpi-mca-param-override.conf). Variable values in this file can not be overridden by any file or environment value. - Support for boolean, unsigned, and unsigned long long variables. - Support for true/false values. - Support for enumerations on integer variables. - Support for MPIT scope, verbosity, and binding. - Support for command line source. - Support for setting variable source via the environment using OMPI_MCA_SOURCE_<var name>=source (either command or file:filename) - Cleaner API. - Support for variable groups (equivalent to MPIT categories). Notes: - Variables must be created with a backing store (char **, int *, or bool *) that must live at least as long as the variable. - Creating a variable with the MCA_BASE_VAR_FLAG_SETTABLE enables the use of mca_base_var_set_value() to change the value. - String values are duplicated when the variable is registered. It is up to the caller to free the original value if necessary. The new value will be freed by the mca_base_var system and must not be freed by the user. - Variables with constant scope may not be settable. - Variable groups (and all associated variables) are deregistered when the component is closed or the component repository item is freed. This prevents a segmentation fault from accessing a variable after its component is unloaded. - After some discussion we decided we should remove the automatic registration of component priority variables. Few component actually made use of this feature. - The enumerator interface was updated to be general enough to handle future uses of the interface. - The code to generate ompi_info output has been moved into the MCA variable system. See mca_base_var_dump(). opal: update core and components to mca_base_var system orte: update core and components to mca_base_var system ompi: update core and components to mca_base_var system This commit also modifies the rmaps framework. The following variables were moved from ppr and lama: rmaps_base_pernode, rmaps_base_n_pernode, rmaps_base_n_persocket. Both lama and ppr create synonyms for these variables. This commit was SVN r28236.
2013-03-28 01:09:41 +04:00
/* user debugger */
ORTE_DECLSPEC extern char *orte_base_user_debugger;
/* binding directives for daemons to restrict them
* to certain cores
*/
ORTE_DECLSPEC extern char *orte_daemon_cores;
/* Max time to wait for stack straces to return */
ORTE_DECLSPEC extern int orte_stack_trace_wait_timeout;
END_C_DECLS
#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */