/*
 * Commit a1d296ae03
 *
 * Fix a few bugs in the mappers:
 *   1. Ensure that bynode with no -np fills all available slots - it just
 *      does so with the ranks set bynode instead of byslot.
 *   2. Fix --nolocal behavior so it works correctly in all cases. We still
 *      have to test the host's name using opal_ifislocal in the mapper
 *      because the name returned by gethostname to
 *      orte_process_info.hostname can be an FQDN, but a hostfile may
 *      contain a non-FQDN version.
 *   3. Add missing --nolocal logic to the seq mapper.
 *
 * Oversubscribed mapping seemed to be working okay without repair, so I
 * couldn't verify my own bug report in that regard.
 *
 * Also included are some preliminary changes to support the modified
 * hostfile behavior, which will be committed shortly:
 *   1. Removed the totally useless "allocate" field in the orte_node_t
 *      object since every node is automatically allocated for use - and
 *      everything ignored the field anyway.
 *   2. Correctly initialize the slots_alloc field when the allocation is
 *      read.
 *
 * This commit was SVN r19030.
 *
 * 376 lines, 12 KiB, C
 */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2008 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2007      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 *
 * Global parameters for OpenRTE
 */

#ifndef ORTE_RUNTIME_ORTE_GLOBALS_H
#define ORTE_RUNTIME_ORTE_GLOBALS_H

#include "orte_config.h"
#include "orte/types.h"

#include <limits.h>
#include <sys/types.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif

#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/class/opal_pointer_array.h"

#include "orte/mca/plm/plm_types.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/runtime.h"

/*
 * Sizing constants for the global arrays.
 * NOTE(review): presumably the block (growth) size and the maximum size
 * used when the global pointer arrays are set up - confirm against
 * orte_globals.c.  ORTE_GLOBAL_ARRAY_MAX_SIZE expands to INT_MAX, so
 * <limits.h> must be in scope wherever it is used.
 */
#define ORTE_GLOBAL_ARRAY_BLOCK_SIZE    64
#define ORTE_GLOBAL_ARRAY_MAX_SIZE      INT_MAX

/* define a default error return code for ORTE */
#define ORTE_ERROR_DEFAULT_EXIT_CODE    1

BEGIN_C_DECLS
|
|
|
|
ORTE_DECLSPEC extern bool orte_help_want_aggregate;
|
|
ORTE_DECLSPEC extern char *orte_prohibited_session_dirs;
|
|
|
|
#define ORTE_PROC_MY_NAME (&orte_process_info.my_name)
|
|
|
|
/* define a special name that belongs to orterun */
|
|
#define ORTE_PROC_MY_HNP (&orte_process_info.my_hnp)
|
|
|
|
/* define the name of my daemon */
|
|
#define ORTE_PROC_MY_DAEMON (&orte_process_info.my_daemon)
|
|
|
|
/*
|
|
* Shortcut for some commonly used names
|
|
*/
|
|
#define ORTE_NAME_WILDCARD (&orte_name_wildcard)
|
|
ORTE_DECLSPEC extern orte_process_name_t orte_name_wildcard; /** instantiated in orte/runtime/orte_init.c */
|
|
|
|
#define ORTE_NAME_INVALID (&orte_name_invalid)
|
|
ORTE_DECLSPEC extern orte_process_name_t orte_name_invalid; /** instantiated in orte/runtime/orte_init.c */
|
|
|
|
/* global type definitions used by RTE - instanced in orte_globals.c */

/************
 * Declare this to allow us to use it before fully
 * defining it - resolves potential circular definition
 * (other types in this header hold pointers to struct orte_proc_t
 * before its full definition appears)
 */
struct orte_proc_t;
/************/

/**
|
|
* Information about a specific application to be launched in the RTE.
|
|
*/
|
|
typedef struct {
|
|
/** Parent object */
|
|
opal_object_t super;
|
|
/** Unique index when multiple apps per job */
|
|
int8_t idx;
|
|
/** Absolute pathname of argv[0] */
|
|
char *app;
|
|
/** Number of copies of this process that are to be launched */
|
|
orte_std_cntr_t num_procs;
|
|
/** Standard argv-style array, including a final NULL pointer */
|
|
char **argv;
|
|
/** Standard environ-style array, including a final NULL pointer */
|
|
char **env;
|
|
/** Current working directory for this app */
|
|
char *cwd;
|
|
/** Whether the cwd was set by the user or by the system */
|
|
bool user_specified_cwd;
|
|
/* Any hostfile that was specified */
|
|
char *hostfile;
|
|
/* Hostfile for adding hosts to an existing allocation */
|
|
char *add_hostfile;
|
|
/** argv of hosts passed in to -host */
|
|
char ** dash_host;
|
|
/** Prefix directory for this app (or NULL if no override necessary) */
|
|
char *prefix_dir;
|
|
/** Preload the binary on the remote machine (in PLS via FileM) */
|
|
bool preload_binary;
|
|
/** Preload the comma separated list of files to the remote machines cwd */
|
|
char * preload_files;
|
|
/** Destination directory for the preloaded files
|
|
* If NULL then the absolute and relative paths are obeyed */
|
|
char * preload_files_dest_dir;
|
|
} orte_app_context_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_app_context_t);
|
|
|
|
|
|
typedef struct {
|
|
/** Base object so this can be put on a list */
|
|
opal_list_item_t super;
|
|
/* index of this node object in global array */
|
|
orte_std_cntr_t index;
|
|
/** String node name */
|
|
char *name;
|
|
/* daemon on this node */
|
|
struct orte_proc_t *daemon;
|
|
/* whether or not this daemon has been launched */
|
|
bool daemon_launched;
|
|
/** Launch id - needed by some systems to launch a proc on this node */
|
|
int32_t launch_id;
|
|
/** number of procs on this node */
|
|
orte_vpid_t num_procs;
|
|
/* array of pointers to procs on this node */
|
|
opal_pointer_array_t *procs;
|
|
/* next node rank on this node */
|
|
uint8_t next_node_rank;
|
|
/* whether or not we are oversubscribed */
|
|
bool oversubscribed;
|
|
/** The node architecture, as reported by the remote node. This
|
|
* value is a bit-map that identifies whether or not the node
|
|
* is big/little endian, etc.
|
|
*/
|
|
int32_t arch;
|
|
/** State of this node */
|
|
orte_node_state_t state;
|
|
/** A "soft" limit on the number of slots available on the node.
|
|
This will typically correspond to the number of physical CPUs
|
|
that we have been allocated on this note and would be the
|
|
"ideal" number of processes for us to launch. */
|
|
orte_std_cntr_t slots;
|
|
/** How many processes have already been launched, used by one or
|
|
more jobs on this node. */
|
|
orte_std_cntr_t slots_inuse;
|
|
/** This represents the number of slots we (the allocator) are
|
|
attempting to allocate to the current job - or the number of
|
|
slots allocated to a specific job on a query for the jobs
|
|
allocations */
|
|
orte_std_cntr_t slots_alloc;
|
|
/** A "hard" limit (if set -- a value of 0 implies no hard limit)
|
|
on the number of slots that can be allocated on a given
|
|
node. This is for some environments (e.g. grid) there may be
|
|
fixed limits on the number of slots that can be used.
|
|
|
|
This value also could have been a boolean - but we may want to
|
|
allow the hard limit be different than the soft limit - in
|
|
other words allow the node to be oversubscribed up to a
|
|
specified limit. For example, if we have two processors, we
|
|
may want to allow up to four processes but no more. */
|
|
orte_std_cntr_t slots_max;
|
|
/** Username on this node, if specified */
|
|
char *username;
|
|
char *slot_list;
|
|
} orte_node_t;
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
|
|
|
|
/* define a set of flags to control the launch of a job - these are the
 * bit values stored in the uint16_t "controls" field of orte_job_t.
 * Each expansion is fully parenthesized so the macro behaves as a single
 * operand in any expression (e.g. as the operand of sizeof).
 */
#define ORTE_JOB_CONTROL_LOCAL_SPAWN    ((uint16_t) 0x01)
#define ORTE_JOB_CONTROL_NON_ORTE_JOB   ((uint16_t) 0x02)

typedef struct {
|
|
/** Base object so this can be put on a list */
|
|
opal_list_item_t super;
|
|
/* jobid for this job */
|
|
orte_jobid_t jobid;
|
|
/* app_context array for this job */
|
|
opal_pointer_array_t *apps;
|
|
/* number of app_contexts in the array */
|
|
orte_std_cntr_t num_apps;
|
|
/* flags to control the launch of this job - see above
|
|
* for description of supported flags
|
|
*/
|
|
uint16_t controls;
|
|
/* total slots allocated to this job */
|
|
orte_std_cntr_t total_slots_alloc;
|
|
/* number of procs in this job */
|
|
orte_vpid_t num_procs;
|
|
/* array of pointers to procs in this job */
|
|
opal_pointer_array_t *procs;
|
|
/* map of the job */
|
|
orte_job_map_t *map;
|
|
/* bookmark for where we are in mapping - this
|
|
* indicates the node where we stopped
|
|
*/
|
|
orte_node_t *bookmark;
|
|
/** Whether or not to override oversubscription based on local
|
|
* hardware - used to indicate uncertainty in number of
|
|
* actual processors available on this node
|
|
*/
|
|
bool oversubscribe_override;
|
|
/* state of the overall job */
|
|
orte_job_state_t state;
|
|
/* number of procs launched */
|
|
orte_vpid_t num_launched;
|
|
/* number of procs reporting contact info */
|
|
orte_vpid_t num_reported;
|
|
/* number of procs terminated */
|
|
orte_vpid_t num_terminated;
|
|
/* did this job abort? */
|
|
bool abort;
|
|
/* proc that caused that to happen */
|
|
struct orte_proc_t *aborted_proc;
|
|
#if OPAL_ENABLE_FT == 1
|
|
/* ckpt state */
|
|
size_t ckpt_state;
|
|
/* snapshot reference */
|
|
char *ckpt_snapshot_ref;
|
|
/* snapshot location */
|
|
char *ckpt_snapshot_loc;
|
|
#endif
|
|
} orte_job_t;
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_t);
|
|
|
|
struct orte_proc_t {
|
|
/** Base object so this can be put on a list */
|
|
opal_list_item_t super;
|
|
/* process name */
|
|
orte_process_name_t name;
|
|
/* pid */
|
|
pid_t pid;
|
|
/* local rank amongst my peers on the node
|
|
* where this is running - this value is
|
|
* needed by MPI procs so that the lowest
|
|
* rank on a node can perform certain fns -
|
|
* e.g., open an sm backing file
|
|
*/
|
|
uint8_t local_rank;
|
|
/* local rank on the node across all procs
|
|
* and jobs known to this HNP - this is
|
|
* needed so that procs can do things like
|
|
* know which static IP port to use
|
|
*/
|
|
uint8_t node_rank;
|
|
/* process state */
|
|
orte_proc_state_t state;
|
|
/* exit code */
|
|
orte_exit_code_t exit_code;
|
|
/* the app_context that generated this proc */
|
|
int8_t app_idx;
|
|
/* a cpu list, if specified by the user */
|
|
char *slot_list;
|
|
/* pointer to the node where this proc is executing */
|
|
orte_node_t *node;
|
|
/* name of the node where this proc is executing - this
|
|
* is used simply to pass that info to a calling
|
|
* tool since it may not have a node array available
|
|
*/
|
|
char *nodename;
|
|
/* RML contact info */
|
|
char *rml_uri;
|
|
/* seconds when last heartbeat was detected */
|
|
int beat;
|
|
#if OPAL_ENABLE_FT == 1
|
|
/* ckpt state */
|
|
size_t ckpt_state;
|
|
/* snapshot reference */
|
|
char *ckpt_snapshot_ref;
|
|
/* snapshot location */
|
|
char *ckpt_snapshot_loc;
|
|
#endif
|
|
};
|
|
typedef struct orte_proc_t orte_proc_t;
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_proc_t);
|
|
|
|
|
|
typedef struct {
|
|
/* base object */
|
|
opal_object_t super;
|
|
/* nodename */
|
|
char *name;
|
|
/* vpid of this job family's daemon on this node */
|
|
orte_vpid_t daemon;
|
|
/* arch of node */
|
|
uint32_t arch;
|
|
} orte_nid_t;
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t);
|
|
|
|
/**
 * Per-process map entry: the node a proc is on and its ranks there.
 */
typedef struct {
    /** Index to node */
    int32_t node;
    /** Local rank */
    uint8_t local_rank;
    /** Node rank */
    uint8_t node_rank;
} orte_pmap_t;

typedef struct {
|
|
/* base object */
|
|
opal_object_t super;
|
|
/* jobid */
|
|
orte_jobid_t job;
|
|
/* array of data for procs */
|
|
opal_value_array_t pmap;
|
|
} orte_jmap_t;
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_jmap_t);
|
|
|
|
#if !ORTE_DISABLE_FULL_SUPPORT
|
|
|
|
/**
|
|
* Get a job data object
|
|
* We cannot just reference a job data object with its jobid as
|
|
* the jobid is no longer an index into the array. This change
|
|
* was necessitated by modification of the jobid to include
|
|
* an mpirun-unique qualifer to eliminate any global name
|
|
* service
|
|
*/
|
|
ORTE_DECLSPEC orte_job_t* orte_get_job_data_object(orte_jobid_t job);
|
|
|
|
/* global variables used by RTE - instanced in orte_globals.c */
|
|
ORTE_DECLSPEC extern bool orte_reuse_daemons, orte_timing;
|
|
ORTE_DECLSPEC extern bool orte_debug_daemons_flag, orte_debug_daemons_file_flag;
|
|
ORTE_DECLSPEC extern bool orte_do_not_launch;
|
|
ORTE_DECLSPEC extern bool orted_spin_flag;
|
|
ORTE_DECLSPEC extern bool orte_static_ports;
|
|
ORTE_DECLSPEC extern int32_t orte_contiguous_nodes;
|
|
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
|
|
ORTE_DECLSPEC extern bool orte_xml_output;
|
|
ORTE_DECLSPEC extern int orte_debug_verbosity;
|
|
ORTE_DECLSPEC extern int orted_debug_failure;
|
|
ORTE_DECLSPEC extern int orted_debug_failure_delay;
|
|
ORTE_DECLSPEC extern bool orte_homogeneous_nodes;
|
|
ORTE_DECLSPEC extern bool orte_hetero_apps;
|
|
|
|
ORTE_DECLSPEC extern char **orte_launch_environ;
|
|
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
|
|
|
|
ORTE_DECLSPEC extern char **orted_cmd_line;
|
|
ORTE_DECLSPEC extern int orte_exit, orteds_exit;
|
|
ORTE_DECLSPEC extern int orte_exit_status;
|
|
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
|
|
ORTE_DECLSPEC extern bool orte_shutdown_in_progress;
|
|
|
|
ORTE_DECLSPEC extern int orte_heartbeat_rate;
|
|
ORTE_DECLSPEC extern int orte_startup_timeout;
|
|
|
|
ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;
|
|
ORTE_DECLSPEC extern float orte_max_timeout;
|
|
|
|
ORTE_DECLSPEC extern char *orte_default_hostfile;
|
|
|
|
ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
|
|
|
|
/* global arrays for data storage */
|
|
ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data;
|
|
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
|
|
|
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
|
|
|
END_C_DECLS
|
|
|
|
#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */
|