2015-01-26 07:16:45 +03:00
/*
* Copyright ( c ) 2004 - 2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation . All rights reserved .
* Copyright ( c ) 2004 - 2011 The University of Tennessee and The University
* of Tennessee Research Foundation . All rights
* reserved .
* Copyright ( c ) 2004 - 2005 High Performance Computing Center Stuttgart ,
* University of Stuttgart . All rights reserved .
* Copyright ( c ) 2004 - 2005 The Regents of the University of California .
* All rights reserved .
2015-06-24 06:59:57 +03:00
* Copyright ( c ) 2006 - 2013 Los Alamos National Security , LLC .
2015-01-26 07:16:45 +03:00
* All rights reserved .
2016-05-27 19:45:34 +03:00
* Copyright ( c ) 2009 - 2016 Cisco Systems , Inc . All rights reserved .
2015-01-26 07:16:45 +03:00
* Copyright ( c ) 2011 Oak Ridge National Labs . All rights reserved .
2016-04-02 18:50:05 +03:00
* Copyright ( c ) 2013 - 2016 Intel , Inc . All rights reserved .
2015-03-03 06:36:21 +03:00
* Copyright ( c ) 2015 Research Organization for Information Science
* and Technology ( RIST ) . All rights reserved .
2015-01-26 07:16:45 +03:00
* $ COPYRIGHT $
*
* Additional copyrights may follow
*
* $ HEADER $
*
*/
# include "orte_config.h"
# include "orte/types.h"
# include "opal/types.h"
# ifdef HAVE_UNISTD_H
# include <unistd.h>
# endif
# include <ctype.h>
2015-02-10 18:22:10 +03:00
# include "opal/util/argv.h"
2015-01-27 05:15:57 +03:00
# include "opal/util/opal_environ.h"
# include "opal/util/os_dirpath.h"
2015-01-26 07:16:45 +03:00
# include "opal/util/show_help.h"
2015-01-27 05:15:57 +03:00
# include "opal/mca/shmem/base/base.h"
2015-01-26 07:16:45 +03:00
# include "orte/mca/errmgr/errmgr.h"
2015-01-27 05:15:57 +03:00
# include "orte/mca/ess/base/base.h"
# include "orte/mca/rmaps/rmaps_types.h"
2016-04-02 18:50:05 +03:00
# include "orte/orted/orted_submit.h"
2015-01-26 07:16:45 +03:00
# include "orte/util/name_fns.h"
2015-01-27 05:15:57 +03:00
# include "orte/util/session_dir.h"
2015-01-26 07:16:45 +03:00
# include "orte/util/show_help.h"
# include "orte/runtime/orte_globals.h"
2016-04-02 18:50:05 +03:00
# include "orte/mca/schizo/base/base.h"
2015-01-27 05:15:57 +03:00
2016-04-02 18:50:05 +03:00
/*
 * Forward declarations of the module entry points defined in this file;
 * they are referenced by the orte_schizo_ompi_module function table.
 */
static int define_cli(opal_cmd_line_t *cli);

static int parse_cli(int argc, int start, char **argv);

static int parse_env(char *path, opal_cmd_line_t *cmd_line,
                     char **srcenv, char ***dstenv);

static int setup_fork(orte_job_t *jdata, orte_app_context_t *context);

static int setup_child(orte_job_t *jobdat, orte_proc_t *child,
                       orte_app_context_t *app);
orte_schizo_base_module_t orte_schizo_ompi_module = {
2016-04-02 18:50:05 +03:00
. define_cli = define_cli ,
2016-02-25 20:05:38 +03:00
. parse_cli = parse_cli ,
. parse_env = parse_env ,
. setup_fork = setup_fork ,
. setup_child = setup_child
2015-01-26 07:16:45 +03:00
} ;
2016-04-02 18:50:05 +03:00
static opal_cmd_line_init_t cmd_line_init [ ] = {
/* Various "obvious" options */
{ NULL , ' h ' , NULL , " help " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . help , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" This help message " } ,
{ NULL , ' V ' , NULL , " version " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . version , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Print version and exit " } ,
{ NULL , ' v ' , NULL , " verbose " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . verbose , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Be verbose " } ,
{ " orte_execute_quiet " , ' q ' , NULL , " quiet " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Suppress helpful messages " } ,
{ NULL , ' \0 ' , " report-pid " , " report-pid " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . report_pid , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Printout pid on stdout [-], stderr [+], or a file [anything else] " } ,
{ NULL , ' \0 ' , " report-uri " , " report-uri " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . report_uri , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Printout URI on stdout [-], stderr [+], or a file [anything else] " } ,
2016-05-27 19:45:34 +03:00
/* testing options */
{ NULL , ' \0 ' , " timeout " , " timeout " , 1 ,
& orte_cmd_options . timeout , OPAL_CMD_LINE_TYPE_INT ,
" Timeout the job after the specified number of seconds " } ,
{ NULL , ' \0 ' , " report-state-on-timeout " , " report-state-on-timeout " , 0 ,
& orte_cmd_options . report_state_on_timeout , OPAL_CMD_LINE_TYPE_BOOL ,
" Report all job and process states upon timeout " } ,
{ NULL , ' \0 ' , " get-stack-traces " , " get-stack-traces " , 0 ,
& orte_cmd_options . get_stack_traces , OPAL_CMD_LINE_TYPE_BOOL ,
" Get stack traces of all application procs on timeout " } ,
2016-04-02 18:50:05 +03:00
/* exit status reporting */
{ " orte_report_child_jobs_separately " , ' \0 ' , " report-child-jobs-separately " , " report-child-jobs-separately " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Return the exit status of the primary job only " } ,
/* uri of the dvm, or at least where to get it */
{ NULL , ' \0 ' , " hnp " , " hnp " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . hnp , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Specify the URI of the HNP, or the name of the file (specified as file:filename) that contains that info " } ,
/* hetero apps */
{ " orte_hetero_apps " , ' \0 ' , NULL , " hetero-apps " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries " } ,
/* select XML output */
{ " orte_xml_output " , ' \0 ' , " xml " , " xml " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Provide all output in XML format " } ,
{ " orte_xml_file " , ' \0 ' , " xml-file " , " xml-file " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide all output in XML format to the specified file " } ,
/* tag output */
{ " orte_tag_output " , ' \0 ' , " tag-output " , " tag-output " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . tag_output , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Tag all output with [job,rank] " } ,
{ " orte_timestamp_output " , ' \0 ' , " timestamp-output " , " timestamp-output " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . timestamp_output , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Timestamp all application process output " } ,
{ " orte_output_filename " , ' \0 ' , " output-filename " , " output-filename " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . output_filename , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Redirect output from application processes into filename/job/rank/std[out,err,diag] " } ,
{ NULL , ' \0 ' , " merge-stderr-to-stdout " , " merge-stderr-to-stdout " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . merge , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Merge stderr to stdout for each process " } ,
{ " orte_xterm " , ' \0 ' , " xterm " , " xterm " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Create a new xterm window and display output from the specified ranks there " } ,
/* select stdin option */
{ NULL , ' \0 ' , " stdin " , " stdin " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . stdin_target , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0) " } ,
/* request that argv[0] be indexed */
{ NULL , ' \0 ' , " index-argv-by-rank " , " index-argv-by-rank " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . index_argv , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Uniquely index argv[0] for each process using its rank " } ,
/* Specify the launch agent to be used */
{ " orte_launch_agent " , ' \0 ' , " launch-agent " , " launch-agent " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Command used to start processes on remote nodes (default: orted) " } ,
/* Preload the binary on the remote machine */
{ NULL , ' s ' , NULL , " preload-binary " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . preload_binaries , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Preload the binary on the remote machine before starting the remote process. " } ,
/* Preload files on the remote machine */
{ NULL , ' \0 ' , NULL , " preload-files " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . preload_files , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Preload the comma separated list of files to the remote machines current working directory before starting the remote process. " } ,
# if OPAL_ENABLE_FT_CR == 1
/* Tell SStore to preload a snapshot before launch */
{ NULL , ' \0 ' , NULL , " sstore-load " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . sstore_load , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Internal Use Only! Tell SStore to preload a snapshot before launch. " } ,
# endif
/* Use an appfile */
{ NULL , ' \0 ' , NULL , " app " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . appfile , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Provide an appfile; ignore all other command line options " } ,
/* Number of processes; -c, -n, --n, -np, and --np are all
synonyms */
{ NULL , ' c ' , " np " , " np " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . num_procs , OPAL_CMD_LINE_TYPE_INT ,
2016-04-02 18:50:05 +03:00
" Number of processes to run " } ,
{ NULL , ' \0 ' , " n " , " n " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . num_procs , OPAL_CMD_LINE_TYPE_INT ,
2016-04-02 18:50:05 +03:00
" Number of processes to run " } ,
/* maximum size of VM - typically used to subdivide an allocation */
{ " orte_max_vm_size " , ' \0 ' , " max-vm-size " , " max-vm-size " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Number of processes to run " } ,
/* Set a hostfile */
{ NULL , ' \0 ' , " hostfile " , " hostfile " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide a hostfile " } ,
{ NULL , ' \0 ' , " machinefile " , " machinefile " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide a hostfile " } ,
{ " orte_default_hostfile " , ' \0 ' , " default-hostfile " , " default-hostfile " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide a default hostfile " } ,
{ " opal_if_do_not_resolve " , ' \0 ' , " do-not-resolve " , " do-not-resolve " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Do not attempt to resolve interfaces " } ,
/* uri of PMIx publish/lookup server, or at least where to get it */
{ " pmix_server_uri " , ' \0 ' , " ompi-server " , " ompi-server " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Specify the URI of the publish/lookup server, or the name of the file (specified as file:filename) that contains that info " } ,
{ " carto_file_path " , ' \0 ' , " cf " , " cartofile " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide a cartography file " } ,
{ " orte_rankfile " , ' \0 ' , " rf " , " rankfile " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Provide a rankfile file " } ,
/* Export environment variables; potentially used multiple times,
so it does not make sense to set into a variable */
{ NULL , ' x ' , NULL , NULL , 1 ,
NULL , OPAL_CMD_LINE_TYPE_NULL ,
" Export an environment variable, optionally specifying a value (e.g., \" -x foo \" exports the environment variable foo and takes its value from the current environment; \" -x foo=bar \" exports the environment variable name foo and sets its value to \" bar \" in the started processes) " } ,
/* Mapping controls */
{ " rmaps_base_display_map " , ' \0 ' , " display-map " , " display-map " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display the process map just before launch " } ,
{ " rmaps_base_display_devel_map " , ' \0 ' , " display-devel-map " , " display-devel-map " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display a detailed process map (mostly intended for developers) just before launch " } ,
{ " rmaps_base_display_topo_with_map " , ' \0 ' , " display-topo " , " display-topo " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display the topology as part of the process map (mostly intended for developers) just before launch " } ,
{ " rmaps_base_display_diffable_map " , ' \0 ' , " display-diffable-map " , " display-diffable-map " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display a diffable process map (mostly intended for developers) just before launch " } ,
{ NULL , ' H ' , " host " , " host " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" List of hosts to invoke processes on " } ,
{ " rmaps_base_no_schedule_local " , ' \0 ' , " nolocal " , " nolocal " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . nolocal , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Do not run any MPI applications on the local node " } ,
{ " rmaps_base_no_oversubscribe " , ' \0 ' , " nooversubscribe " , " nooversubscribe " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . no_oversubscribe , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Nodes are not to be oversubscribed, even if the system supports such operation " } ,
{ " rmaps_base_oversubscribe " , ' \0 ' , " oversubscribe " , " oversubscribe " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . oversubscribe , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements " } ,
{ " rmaps_base_cpus_per_rank " , ' \0 ' , " cpus-per-proc " , " cpus-per-proc " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . cpus_per_proc , OPAL_CMD_LINE_TYPE_INT ,
2016-04-02 18:50:05 +03:00
" Number of cpus to use for each process [default=1] " } ,
{ " rmaps_base_cpus_per_rank " , ' \0 ' , " cpus-per-rank " , " cpus-per-rank " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . cpus_per_proc , OPAL_CMD_LINE_TYPE_INT ,
2016-04-02 18:50:05 +03:00
" Synonym for cpus-per-proc " } ,
/* backward compatiblity */
{ " rmaps_base_bycore " , ' \0 ' , " bycore " , " bycore " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Whether to map and rank processes round-robin by core " } ,
{ " rmaps_base_bynode " , ' \0 ' , " bynode " , " bynode " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Whether to map and rank processes round-robin by node " } ,
{ " rmaps_base_byslot " , ' \0 ' , " byslot " , " byslot " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Whether to map and rank processes round-robin by slot " } ,
/* Nperxxx options that do not require topology and are always
* available - included for backwards compatibility
*/
{ " rmaps_ppr_pernode " , ' \0 ' , " pernode " , " pernode " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Launch one process per available node " } ,
{ " rmaps_ppr_n_pernode " , ' \0 ' , " npernode " , " npernode " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Launch n processes per node on all allocated nodes " } ,
{ " rmaps_ppr_n_pernode " , ' \0 ' , " N " , NULL , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Launch n processes per node on all allocated nodes (synonym for npernode) " } ,
/* declare hardware threads as independent cpus */
{ " hwloc_base_use_hwthreads_as_cpus " , ' \0 ' , " use-hwthread-cpus " , " use-hwthread-cpus " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Use hardware threads as independent cpus " } ,
/* include npersocket for backwards compatibility */
{ " rmaps_ppr_n_persocket " , ' \0 ' , " npersocket " , " npersocket " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Launch n processes per socket on all allocated nodes " } ,
/* Mapping options */
{ " rmaps_base_mapping_policy " , ' \0 ' , NULL , " map-by " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . mapping_policy , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node] " } ,
/* Ranking options */
{ " rmaps_base_ranking_policy " , ' \0 ' , NULL , " rank-by " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . ranking_policy , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node] " } ,
/* Binding options */
{ " hwloc_base_binding_policy " , ' \0 ' , NULL , " bind-to " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . binding_policy , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board ( \" none \" is the default when oversubscribed, \" core \" is the default when np<=2, and \" socket \" is the default when np>2). Allowed qualifiers: overload-allowed, if-supported " } ,
/* backward compatiblity */
{ " hwloc_base_bind_to_core " , ' \0 ' , " bind-to-core " , " bind-to-core " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Bind processes to cores " } ,
{ " hwloc_base_bind_to_socket " , ' \0 ' , " bind-to-socket " , " bind-to-socket " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Bind processes to sockets " } ,
{ " hwloc_base_report_bindings " , ' \0 ' , " report-bindings " , " report-bindings " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . report_bindings , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Whether to report process bindings to stderr " } ,
/* slot list option */
{ " hwloc_base_slot_list " , ' \0 ' , " slot-list " , " slot-list " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . slot_list , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" List of processor IDs to bind processes to [default=NULL] " } ,
/* generalized pattern mapping option */
{ " rmaps_ppr_pattern " , ' \0 ' , NULL , " ppr " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Comma-separated list of number of processes on a given resource type [default: none] " } ,
/* Allocation options */
{ " orte_display_alloc " , ' \0 ' , " display-allocation " , " display-allocation " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display the allocation being used by this job " } ,
{ " orte_display_devel_alloc " , ' \0 ' , " display-devel-allocation " , " display-devel-allocation " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Display a detailed list (mostly intended for developers) of the allocation being used by this job " } ,
{ " hwloc_base_cpu_set " , ' \0 ' , " cpu-set " , " cpu-set " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Comma-separated list of ranges specifying logical cpus allocated to this job [default: none] " } ,
/* mpiexec-like arguments */
{ NULL , ' \0 ' , " wdir " , " wdir " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . wdir , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Set the working directory of the started processes " } ,
{ NULL , ' \0 ' , " wd " , " wd " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . wdir , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Synonym for --wdir " } ,
{ NULL , ' \0 ' , " set-cwd-to-session-dir " , " set-cwd-to-session-dir " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . set_cwd_to_session_dir , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Set the working directory of the started processes to their session directory " } ,
{ NULL , ' \0 ' , " path " , " path " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . path , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" PATH to be used to look for executables to start processes " } ,
/* User-level debugger arguments */
{ NULL , ' \0 ' , " tv " , " tv " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . debugger , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Deprecated backwards compatibility flag; synonym for \" --debug \" " } ,
{ NULL , ' \0 ' , " debug " , " debug " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . debugger , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter " } ,
{ " orte_base_user_debugger " , ' \0 ' , " debugger " , " debugger " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Sequence of debuggers to search for when \" --debug \" is used " } ,
{ " orte_output_debugger_proctable " , ' \0 ' , " output-proctable " , " output-proctable " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Output the debugger proctable after launch " } ,
/* OpenRTE arguments */
{ " orte_debug " , ' d ' , " debug-devel " , " debug-devel " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Enable debugging of OpenRTE " } ,
{ " orte_debug_daemons " , ' \0 ' , " debug-daemons " , " debug-daemons " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Enable debugging of any OpenRTE daemons used by this application " } ,
{ " orte_debug_daemons_file " , ' \0 ' , " debug-daemons-file " , " debug-daemons-file " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Enable debugging of any OpenRTE daemons used by this application, storing output in files " } ,
{ " orte_leave_session_attached " , ' \0 ' , " leave-session-attached " , " leave-session-attached " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Enable debugging of OpenRTE " } ,
{ " orte_do_not_launch " , ' \0 ' , " do-not-launch " , " do-not-launch " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Perform all necessary operations to prepare to launch the application, but do not actually launch it " } ,
{ NULL , ' \0 ' , NULL , " prefix " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Prefix where Open MPI is installed on remote nodes " } ,
{ NULL , ' \0 ' , NULL , " noprefix " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Disable automatic --prefix behavior " } ,
{ " orte_report_launch_progress " , ' \0 ' , " show-progress " , " show-progress " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Output a brief periodic report on launch progress " } ,
{ " orte_use_regexp " , ' \0 ' , " use-regexp " , " use-regexp " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Use regular expressions for launch " } ,
{ " orte_report_events " , ' \0 ' , " report-events " , " report-events " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_STRING ,
" Report events to a tool listening at the specified URI " } ,
{ " orte_enable_recovery " , ' \0 ' , " enable-recovery " , " enable-recovery " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . enable_recovery , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Enable recovery from process failure [Default = disabled] " } ,
{ " orte_max_restarts " , ' \0 ' , " max-restarts " , " max-restarts " , 1 ,
NULL , OPAL_CMD_LINE_TYPE_INT ,
" Max number of times to restart a failed process " } ,
{ " orte_hetero_nodes " , ' \0 ' , NULL , " hetero-nodes " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Nodes in cluster may differ in topology, so send the topology back from each node [Default = false] " } ,
# if OPAL_ENABLE_CRDEBUG == 1
{ " opal_cr_enable_crdebug " , ' \0 ' , " crdebug " , " crdebug " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Enable C/R Debugging " } ,
# endif
{ NULL , ' \0 ' , " disable-recovery " , " disable-recovery " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . disable_recovery , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Disable recovery (resets all recovery options to off) " } ,
{ " state_novm_select " , ' \0 ' , " novm " , " novm " , 0 ,
NULL , OPAL_CMD_LINE_TYPE_BOOL ,
" Execute without creating an allocation-spanning virtual machine (only start daemons on nodes hosting application procs) " } ,
{ NULL , ' \0 ' , " allow-run-as-root " , " allow-run-as-root " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . run_as_root , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Allow execution as root (STRONGLY DISCOURAGED) " } ,
{ NULL , ' \0 ' , " personality " , " personality " , 1 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . personality , OPAL_CMD_LINE_TYPE_STRING ,
2016-04-02 18:50:05 +03:00
" Comma-separated list of programming model, languages, and containers being used (default= \" ompi \" ) " } ,
{ NULL , ' \0 ' , " dvm " , " dvm " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . create_dvm , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Create a persistent distributed virtual machine (DVM) " } ,
/* tell the dvm to terminate */
{ NULL , ' \0 ' , " terminate " , " terminate " , 0 ,
2016-05-06 20:55:21 +03:00
& orte_cmd_options . terminate_dvm , OPAL_CMD_LINE_TYPE_BOOL ,
2016-04-02 18:50:05 +03:00
" Terminate the DVM " } ,
/* End of list */
{ NULL , ' \0 ' , NULL , NULL , 0 ,
NULL , OPAL_CMD_LINE_TYPE_NULL , NULL }
} ;
/*
 * Add the OMPI personality's options (cmd_line_init) to a command line
 * parser.
 *
 * cli  parser to extend
 *
 * Returns ORTE_ERR_BAD_PARAM when cli is NULL, ORTE_ERR_TAKE_NEXT_OPTION
 * when a personality list has been set and "ompi" is not on it, and
 * otherwise the result of opal_cmd_line_add().
 */
static int define_cli(opal_cmd_line_t *cli)
{
    int idx;
    bool selected = false;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: define_cli",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* a parser to add to is mandatory */
    if (NULL == cli) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* without a personality list we always contribute our options;
     * with one, we contribute only when it names us */
    if (NULL != orte_schizo_base.personalities) {
        idx = 0;
        while (NULL != orte_schizo_base.personalities[idx]) {
            if (0 == strcmp(orte_schizo_base.personalities[idx], "ompi")) {
                selected = true;
                break;
            }
            idx++;
        }
        if (!selected) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* append our table to whatever the parser already holds */
    return opal_cmd_line_add(cli, cmd_line_init);
}
static int parse_cli ( int argc , int start , char * * argv )
2015-01-26 07:16:45 +03:00
{
2015-01-27 05:15:57 +03:00
int i , j , k ;
bool ignore ;
char * no_dups [ ] = {
" grpcomm " ,
" odls " ,
" rml " ,
" routed " ,
NULL
} ;
2016-02-17 19:32:17 +03:00
bool takeus = false ;
2015-06-24 06:59:57 +03:00
2016-04-02 18:50:05 +03:00
opal_output_verbose ( 1 , orte_schizo_base_framework . framework_output ,
" %s schizo:ompi: parse_cli " ,
ORTE_NAME_PRINT ( ORTE_PROC_MY_NAME ) ) ;
/* if they gave us a list of personalities,
* see if we are included */
if ( NULL ! = orte_schizo_base . personalities ) {
for ( i = 0 ; NULL ! = orte_schizo_base . personalities [ i ] ; i + + ) {
if ( 0 = = strcmp ( orte_schizo_base . personalities [ i ] , " ompi " ) ) {
takeus = true ;
break ;
}
2016-02-17 19:32:17 +03:00
}
2016-04-02 18:50:05 +03:00
if ( ! takeus ) {
return ORTE_ERR_TAKE_NEXT_OPTION ;
}
} else {
/* attempt to auto-detect CLI options that
* we recognize */
2016-02-11 10:43:13 +03:00
}
2015-01-27 05:15:57 +03:00
for ( i = 0 ; i < ( argc - start ) ; + + i ) {
if ( 0 = = strcmp ( " -mca " , argv [ i ] ) | |
0 = = strcmp ( " --mca " , argv [ i ] ) ) {
/* ignore this one */
if ( 0 = = strcmp ( argv [ i + 1 ] , " mca_base_env_list " ) ) {
i + = 2 ;
continue ;
}
/* It would be nice to avoid increasing the length
* of the orted cmd line by removing any non - ORTE
* params . However , this raises a problem since
* there could be OPAL directives that we really
* - do - want the orted to see - it ' s only the OMPI
* related directives we could ignore . This becomes
* a very complicated procedure , however , since
* the OMPI mca params are not cleanly separated - so
* filtering them out is nearly impossible .
*
* see if this is already present so we at least can
* avoid growing the cmd line with duplicates
*/
ignore = false ;
if ( NULL ! = orted_cmd_line ) {
for ( j = 0 ; NULL ! = orted_cmd_line [ j ] ; j + + ) {
if ( 0 = = strcmp ( argv [ i + 1 ] , orted_cmd_line [ j ] ) ) {
/* already here - if the value is the same,
* we can quitely ignore the fact that they
* provide it more than once . However , some
* frameworks are known to have problems if the
* value is different . We don ' t have a good way
* to know this , but we at least make a crude
* attempt here to protect ourselves .
*/
if ( 0 = = strcmp ( argv [ i + 2 ] , orted_cmd_line [ j + 1 ] ) ) {
/* values are the same */
ignore = true ;
break ;
} else {
/* values are different - see if this is a problem */
for ( k = 0 ; NULL ! = no_dups [ k ] ; k + + ) {
if ( 0 = = strcmp ( no_dups [ k ] , argv [ i + 1 ] ) ) {
/* print help message
* and abort as we cannot know which one is correct
*/
orte_show_help ( " help-orterun.txt " , " orterun:conflicting-params " ,
true , orte_basename , argv [ i + 1 ] ,
argv [ i + 2 ] , orted_cmd_line [ j + 1 ] ) ;
return ORTE_ERR_BAD_PARAM ;
}
}
/* this passed muster - just ignore it */
ignore = true ;
break ;
}
}
}
}
if ( ! ignore ) {
opal_argv_append_nosize ( & orted_cmd_line , argv [ i ] ) ;
opal_argv_append_nosize ( & orted_cmd_line , argv [ i + 1 ] ) ;
opal_argv_append_nosize ( & orted_cmd_line , argv [ i + 2 ] ) ;
}
i + = 2 ;
}
2015-01-26 07:16:45 +03:00
}
2015-01-27 05:15:57 +03:00
return ORTE_SUCCESS ;
2015-01-26 07:16:45 +03:00
}
2016-04-02 18:50:05 +03:00
static int parse_env ( char * path ,
2015-01-27 05:15:57 +03:00
opal_cmd_line_t * cmd_line ,
char * * srcenv ,
char * * * dstenv )
2015-01-26 07:16:45 +03:00
{
2015-01-27 05:15:57 +03:00
int i , j ;
char * param ;
char * value ;
char * env_set_flag ;
char * * vars ;
2016-02-17 19:32:17 +03:00
bool takeus = false ;
2015-06-24 06:59:57 +03:00
2016-04-02 18:50:05 +03:00
opal_output_verbose ( 1 , orte_schizo_base_framework . framework_output ,
" %s schizo:ompi: parse_env " ,
ORTE_NAME_PRINT ( ORTE_PROC_MY_NAME ) ) ;
if ( NULL ! = orte_schizo_base . personalities ) {
/* see if we are included */
for ( i = 0 ; NULL ! = orte_schizo_base . personalities [ i ] ; i + + ) {
if ( 0 = = strcmp ( orte_schizo_base . personalities [ i ] , " ompi " ) ) {
takeus = true ;
break ;
}
}
if ( ! takeus ) {
return ORTE_ERR_TAKE_NEXT_OPTION ;
2016-02-17 19:32:17 +03:00
}
2016-02-11 10:43:13 +03:00
}
2015-01-27 05:15:57 +03:00
for ( i = 0 ; NULL ! = srcenv [ i ] ; + + i ) {
if ( 0 = = strncmp ( " OMPI_ " , srcenv [ i ] , 5 ) ) {
/* check for duplicate in app->env - this
* would have been placed there by the
* cmd line processor . By convention , we
* always let the cmd line override the
* environment
*/
param = strdup ( srcenv [ i ] ) ;
value = strchr ( param , ' = ' ) ;
* value = ' \0 ' ;
value + + ;
opal_setenv ( param , value , false , dstenv ) ;
free ( param ) ;
2015-01-26 07:16:45 +03:00
}
}
2015-06-24 06:59:57 +03:00
2015-03-16 19:03:55 +03:00
/* set necessary env variables for external usage from tune conf file*/
int set_from_file = 0 ;
vars = NULL ;
if ( OPAL_SUCCESS = = mca_base_var_process_env_list_from_file ( & vars ) & &
NULL ! = vars ) {
for ( i = 0 ; NULL ! = vars [ i ] ; i + + ) {
value = strchr ( vars [ i ] , ' = ' ) ;
/* terminate the name of the param */
* value = ' \0 ' ;
/* step over the equals */
value + + ;
/* overwrite any prior entry */
opal_setenv ( vars [ i ] , value , true , dstenv ) ;
/* save it for any comm_spawn'd apps */
opal_setenv ( vars [ i ] , value , true , & orte_forwarded_envars ) ;
}
set_from_file = 1 ;
opal_argv_free ( vars ) ;
}
2015-01-27 05:15:57 +03:00
/* Did the user request to export any environment variables on the cmd line? */
env_set_flag = getenv ( " OMPI_MCA_mca_base_env_list " ) ;
if ( opal_cmd_line_is_taken ( cmd_line , " x " ) ) {
if ( NULL ! = env_set_flag ) {
orte_show_help ( " help-orterun.txt " , " orterun:conflict-env-set " , false ) ;
return ORTE_ERR_FATAL ;
}
j = opal_cmd_line_get_ninsts ( cmd_line , " x " ) ;
for ( i = 0 ; i < j ; + + i ) {
param = opal_cmd_line_get_param ( cmd_line , " x " , i , 0 ) ;
if ( NULL ! = ( value = strchr ( param , ' = ' ) ) ) {
/* terminate the name of the param */
* value = ' \0 ' ;
/* step over the equals */
value + + ;
/* overwrite any prior entry */
opal_setenv ( param , value , true , dstenv ) ;
/* save it for any comm_spawn'd apps */
opal_setenv ( param , value , true , & orte_forwarded_envars ) ;
} else {
value = getenv ( param ) ;
if ( NULL ! = value ) {
/* overwrite any prior entry */
opal_setenv ( param , value , true , dstenv ) ;
/* save it for any comm_spawn'd apps */
opal_setenv ( param , value , true , & orte_forwarded_envars ) ;
} else {
opal_output ( 0 , " Warning: could not find environment variable \" %s \" \n " , param ) ;
}
}
2015-01-26 07:16:45 +03:00
}
2015-01-27 05:15:57 +03:00
} else if ( NULL ! = env_set_flag ) {
2015-03-16 19:03:55 +03:00
/* if mca_base_env_list was set, check if some of env vars were set via -x from a conf file.
* If this is the case , error out .
*/
if ( ! set_from_file ) {
/* set necessary env variables for external usage */
vars = NULL ;
2016-05-07 01:38:39 +03:00
if ( OPAL_SUCCESS = = mca_base_var_process_env_list ( env_set_flag , & vars ) & &
2015-03-16 19:03:55 +03:00
NULL ! = vars ) {
for ( i = 0 ; NULL ! = vars [ i ] ; i + + ) {
value = strchr ( vars [ i ] , ' = ' ) ;
/* terminate the name of the param */
* value = ' \0 ' ;
/* step over the equals */
value + + ;
/* overwrite any prior entry */
opal_setenv ( vars [ i ] , value , true , dstenv ) ;
/* save it for any comm_spawn'd apps */
opal_setenv ( vars [ i ] , value , true , & orte_forwarded_envars ) ;
}
opal_argv_free ( vars ) ;
2015-01-26 07:16:45 +03:00
}
2015-03-16 19:03:55 +03:00
} else {
orte_show_help ( " help-orterun.txt " , " orterun:conflict-env-set " , false ) ;
return ORTE_ERR_FATAL ;
2015-01-26 07:16:45 +03:00
}
}
2015-01-27 05:15:57 +03:00
/* If the user specified --path, store it in the user's app
environment via the OMPI_exec_path variable . */
if ( NULL ! = path ) {
asprintf ( & value , " OMPI_exec_path=%s " , path ) ;
opal_argv_append_nosize ( dstenv , value ) ;
/* save it for any comm_spawn'd apps */
opal_argv_append_nosize ( & orte_forwarded_envars , value ) ;
free ( value ) ;
2015-01-26 07:16:45 +03:00
}
2015-01-27 05:15:57 +03:00
return ORTE_SUCCESS ;
2015-01-26 07:16:45 +03:00
}
2015-01-27 05:15:57 +03:00
/**
 * Build the environment shared by all local processes of an app context.
 *
 * app->env is replaced by a merge of orte_launch_environ with the app's own
 * environment (the previous array is freed), and the OMPI/ORTE variables the
 * launched processes read at startup are then added to it: contact URIs,
 * yield policy, app number, universe size, node count, cpu type/model, the
 * shmem component hint, ess/pmix selection hints, job sizes, session
 * directory locations and the MPI-3 per-app-context lists.
 *
 * @param jdata  job being launched
 * @param app    app context whose env array is rebuilt in place
 *
 * @return ORTE_SUCCESS on success;
 *         ORTE_ERR_TAKE_NEXT_OPTION when personalities are in use and this
 *         job does not list "ompi";
 *         ORTE_ERR_NOT_FOUND when orte_node_pool has no entry at
 *         ORTE_PROC_MY_NAME's vpid.
 */
static int setup_fork(orte_job_t *jdata,
                      orte_app_context_t *app)
{
    int i;
    char *param;
    bool oversubscribed;
    orte_node_t *node;
    char **envcpy, **nps, **firstranks;
    char *npstring, *firstrankstring;
    /* scratch space for integer -> string conversions; 32 bytes holds any
     * 64-bit decimal value plus sign and terminator */
    char numbuf[32];
    bool takeus = false;
    orte_app_context_t *tmp_app;
    hwloc_obj_t obj;
    char *htmp;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: setup_fork",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included - a job that carries no personality list
         * cannot name us, so never walk a NULL array */
        if (NULL != jdata->personality) {
            for (i = 0; NULL != jdata->personality[i]; i++) {
                if (0 == strcmp(jdata->personality[i], "ompi")) {
                    takeus = true;
                    break;
                }
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* see if the mapper thinks we are oversubscribed */
    oversubscribed = false;
    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_OVERSUBSCRIBED)) {
        oversubscribed = true;
    }

    /* setup base environment: copy the current environ and merge
     * in the app context environ */
    if (NULL != app->env) {
        /* manually free the original app->env to avoid a memory leak */
        char **tmp = app->env;
        envcpy = opal_environ_merge(orte_launch_environ, app->env);
        opal_argv_free(tmp);
    } else {
        envcpy = opal_argv_copy(orte_launch_environ);
    }
    app->env = envcpy;

    /* special case handling for --prefix: it is possible that when using
     * --prefix, the user will also "-x PATH" and/or "-x LD_LIBRARY_PATH",
     * which would clobber the work done earlier to put the prefix at the
     * beginning of PATH and LD_LIBRARY_PATH. So examine app->env and, if
     * either variable is present, re-prefix it. */
    param = NULL;
    orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&param, OPAL_STRING);
    for (i = 0; NULL != param && NULL != app->env && NULL != app->env[i]; ++i) {
        char *newenv = NULL;

        if (0 == strncmp("PATH=", app->env[i], 5)) {
            /* Reset PATH. asprintf leaves newenv undefined on failure, so
             * only use it when the call succeeded; otherwise the existing
             * value is left untouched. */
            if (0 <= asprintf(&newenv, "%s/bin:%s", param, app->env[i] + 5)) {
                opal_setenv("PATH", newenv, true, &app->env);
                free(newenv);
            }
        } else if (0 == strncmp("LD_LIBRARY_PATH=", app->env[i], 16)) {
            /* Reset LD_LIBRARY_PATH, same failure handling as above */
            if (0 <= asprintf(&newenv, "%s/lib:%s", param, app->env[i] + 16)) {
                opal_setenv("LD_LIBRARY_PATH", newenv, true, &app->env);
                free(newenv);
            }
        }
    }
    free(param);

    /* pass my contact info to the local proc so we can talk */
    opal_setenv("OMPI_MCA_orte_local_daemon_uri", orte_process_info.my_daemon_uri, true, &app->env);

    /* pass the hnp's contact info to the local proc in case it
     * needs it
     */
    if (NULL != orte_process_info.my_hnp_uri) {
        opal_setenv("OMPI_MCA_orte_hnp_uri", orte_process_info.my_hnp_uri, true, &app->env);
    }

    /* setup yield schedule - do not override any user-supplied directive! */
    opal_setenv("OMPI_MCA_mpi_yield_when_idle", oversubscribed ? "1" : "0", false, &app->env);

    /* set the app_context number into the environment */
    snprintf(numbuf, sizeof(numbuf), "%ld", (long)app->idx);
    opal_setenv("OMPI_MCA_orte_app_num", numbuf, true, &app->env);

    /* although the total_slots_alloc is the universe size, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here. Also required by the ompi_attributes code!
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    snprintf(numbuf, sizeof(numbuf), "%ld", (long)jdata->total_slots_alloc);
    opal_setenv("OMPI_UNIVERSE_SIZE", numbuf, true, &app->env);

    /* pass the number of nodes involved in this job */
    snprintf(numbuf, sizeof(numbuf), "%ld", (long)(jdata->map->num_nodes));
    opal_setenv("OMPI_MCA_orte_num_nodes", numbuf, true, &app->env);

    /* pass a param telling the child what type and model of cpu we are on,
     * if we know it. If hwloc has the value, use what it knows. Otherwise,
     * see if we were explicitly given it and use that value.
     */
    if (NULL != opal_hwloc_topology) {
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUType")) ||
            NULL != (htmp = orte_local_cpu_type)) {
            opal_setenv("OMPI_MCA_orte_cpu_type", htmp, true, &app->env);
        }
        if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUModel")) ||
            NULL != (htmp = orte_local_cpu_model)) {
            opal_setenv("OMPI_MCA_orte_cpu_model", htmp, true, &app->env);
        }
    } else {
        if (NULL != orte_local_cpu_type) {
            opal_setenv("OMPI_MCA_orte_cpu_type", orte_local_cpu_type, true, &app->env);
        }
        if (NULL != orte_local_cpu_model) {
            opal_setenv("OMPI_MCA_orte_cpu_model", orte_local_cpu_model, true, &app->env);
        }
    }

    /* get shmem's best component name so we can provide a hint to the shmem
     * framework. the idea here is to have someone figure out what component to
     * select (via the shmem framework) and then have the rest of the
     * components in shmem obey that decision. for more details take a look at
     * the shmem framework in opal.
     */
    if (NULL != (param = opal_shmem_base_best_runnable_component_name())) {
        opal_setenv("OMPI_MCA_shmem_RUNTIME_QUERY_hint", param, true, &app->env);
        free(param);
    }

    /* Set an info MCA param that tells the launched processes that
     * any binding policy was applied by us (e.g., so that
     * MPI_INIT doesn't try to bind itself)
     */
    opal_setenv("OMPI_MCA_orte_bound_at_launch", "1", true, &app->env);

    /* tell the ESS to avoid the singleton component - but don't override
     * anything that may have been provided elsewhere
     */
    opal_setenv("OMPI_MCA_ess", "^singleton", false, &app->env);

    /* ensure that the spawned process ignores direct launch components,
     * but do not override anything we were given */
    opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray", false, &app->env);

    /* since we want to pass the name as separate components, make sure
     * that the "name" environmental variable is cleared!
     */
    opal_unsetenv("OMPI_MCA_orte_ess_name", &app->env);

    /* the job size is published twice: once as the MCA param and once as
     * a public, MPI-specific variable for users.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    snprintf(numbuf, sizeof(numbuf), "%ld", (long)jdata->num_procs);
    opal_setenv("OMPI_MCA_orte_ess_num_procs", numbuf, true, &app->env);
    opal_setenv("OMPI_COMM_WORLD_SIZE", numbuf, true, &app->env);

    /* likewise give users a public variable for the local size */
    snprintf(numbuf, sizeof(numbuf), "%ld", (long)jdata->num_local_procs);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_SIZE", numbuf, true, &app->env);

    /* forcibly set the local tmpdir base and top session dir to match ours */
    opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env);
    opal_setenv("OMPI_MCA_orte_top_session_dir", orte_process_info.top_session_dir, true, &app->env);

    /* MPI-3 requires we provide some further info to the procs,
     * so we pass them as envars to avoid introducing further
     * ORTE calls in the MPI layer. Collect, for every app context of
     * the job, its number of procs and its first rank.
     */
    nps = NULL;
    firstranks = NULL;
    for (i = 0; i < jdata->apps->size; i++) {
        if (NULL == (tmp_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_argv_append_nosize(&nps, ORTE_VPID_PRINT(tmp_app->num_procs));
        opal_argv_append_nosize(&firstranks, ORTE_VPID_PRINT(tmp_app->first_rank));
    }
    npstring = opal_argv_join(nps, ' ');
    firstrankstring = opal_argv_join(firstranks, ' ');
    opal_argv_free(nps);
    opal_argv_free(firstranks);

    /* add the MPI-3 envars */
    snprintf(numbuf, sizeof(numbuf), "%lu", (unsigned long)jdata->num_apps);
    opal_setenv("OMPI_NUM_APP_CTX", numbuf, true, &app->env);
    opal_setenv("OMPI_FIRST_RANKS", firstrankstring, true, &app->env);
    opal_setenv("OMPI_APP_CTX_NUM_PROCS", npstring, true, &app->env);
    free(firstrankstring);
    free(npstring);

    return ORTE_SUCCESS;
}
2015-01-27 05:15:57 +03:00
/**
 * Add the per-process variables to an app context's environment just
 * before a specific child is launched.
 *
 * Publishes the child's jobid, vpid, local rank, node rank, PMIx identifier
 * and (when the attribute is present) restart count, plus the barrier and
 * staged-execution flags. Marks the child's IOF as complete when the job
 * does not forward output. Finally computes the child's session directory
 * name, exports it as OMPI_FILE_LOCATION and, when ORTE_APP_SSNDIR_CWD is
 * set on the app, creates that directory and chdir()s into it.
 *
 * @param jdata  job the child belongs to
 * @param child  process about to be launched
 * @param app    app context whose env array receives the variables
 *
 * @return ORTE_SUCCESS on success;
 *         ORTE_ERR_TAKE_NEXT_OPTION when personalities are in use and this
 *         job does not list "ompi";
 *         ORTE_ERR_VALUE_OUT_OF_BOUNDS when the local or node rank is
 *         invalid;
 *         ORTE_ERROR when chdir() to the session directory fails;
 *         otherwise the error code returned by the failing name-conversion,
 *         session-dir or directory-creation call.
 */
static int setup_child(orte_job_t *jdata,
                       orte_proc_t *child,
                       orte_app_context_t *app)
{
    char *param, *value;
    /* scratch space for integer -> string conversions; 32 bytes holds any
     * 64-bit decimal value plus sign and terminator */
    char numbuf[32];
    int rc, i;
    int32_t nrestarts = 0, *nrptr;
    bool takeus = false;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: setup_child",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included - a job that carries no personality list
         * cannot name us, so never walk a NULL array */
        if (NULL != jdata->personality) {
            for (i = 0; NULL != jdata->personality[i]; i++) {
                if (0 == strcmp(jdata->personality[i], "ompi")) {
                    takeus = true;
                    break;
                }
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* setup the jobid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_jobid", value, true, &app->env);
    free(value);

    /* setup the vpid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_vpid", value, true, &app->env);
    /* although the vpid IS the process' rank within the job, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_RANK", value, true, &app->env);
    free(value);  /* done with this now */

    /* likewise publish the local rank as a public, MPI-specific variable */
    if (ORTE_LOCAL_RANK_INVALID == child->local_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    snprintf(numbuf, sizeof(numbuf), "%lu", (unsigned long)child->local_rank);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", numbuf, true, &app->env);

    /* and the node rank, both as a public variable and as an MCA param */
    if (ORTE_NODE_RANK_INVALID == child->node_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    snprintf(numbuf, sizeof(numbuf), "%lu", (unsigned long)child->node_rank);
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", numbuf, true, &app->env);
    /* set an mca param for it too */
    opal_setenv("OMPI_MCA_orte_ess_node_rank", numbuf, true, &app->env);

    /* provide the identifier for the PMIx connection - the
     * PMIx connection is made prior to setting the process
     * name itself. Although in most cases the ID and the
     * process name are the same, it isn't necessarily
     * required. The conversion result is checked like the jobid/vpid
     * conversions above so a failed conversion is never published. */
    if (ORTE_SUCCESS != (rc = orte_util_convert_process_name_to_string(&value, &child->name))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("PMIX_ID", value, true, &app->env);
    free(value);

    nrptr = &nrestarts;
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NRESTARTS, (void**)&nrptr, OPAL_INT32)) {
        /* pass the number of restarts for this proc - will be zero for
         * an initial start, but procs would like to know if they are being
         * restarted so they can take appropriate action
         */
        snprintf(numbuf, sizeof(numbuf), "%d", (int)nrestarts);
        opal_setenv("OMPI_MCA_orte_num_restarts", numbuf, true, &app->env);
    }

    /* if the proc should not barrier in orte_init, tell it */
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL)
        || 0 < nrestarts) {
        opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env);
    }

    /* if we are using staged execution, tell it */
    if (orte_staged_execution) {
        opal_setenv("OMPI_MCA_orte_staged_execution", "1", true, &app->env);
    }

    /* if the proc isn't going to forward IO, then we need to flag that
     * it has "completed" iof termination as otherwise it will never fire
     */
    if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
    }

    /* construct the proc's session dir name */
    if (NULL != orte_process_info.tmpdir_base) {
        value = strdup(orte_process_info.tmpdir_base);
    } else {
        value = NULL;
    }
    param = NULL;
    if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&param, &value, NULL,
                                                        orte_process_info.nodename,
                                                        &child->name))) {
        ORTE_ERROR_LOG(rc);
        free(value);
        return rc;
    }
    free(value);

    /* pass an envar so the proc can find any files it had prepositioned */
    opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);

    /* if the user wanted the cwd to be the proc's session dir, then
     * switch to that location now
     */
    if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
        /* create the session dir - may not exist */
        if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(param, S_IRWXU))) {
            ORTE_ERROR_LOG(rc);
            /* doesn't exist with correct permissions, and/or we can't
             * create it - either way, we are done
             */
            free(param);
            return rc;
        }
        /* change to it */
        if (0 != chdir(param)) {
            free(param);
            return ORTE_ERROR;
        }
        /* It seems that chdir doesn't adjust the $PWD enviro variable when
         * it changes the directory. This can cause a user to get a different
         * response when doing getcwd vs looking at the enviro variable. To
         * keep this consistent, we explicitly ensure that the PWD enviro
         * variable matches the CWD we moved to.
         *
         * NOTE: if a user's program does a chdir(), then $PWD will once
         * again not match getcwd! This is beyond our control - we are only
         * ensuring they start out matching.
         */
        opal_setenv("PWD", param, true, &app->env);
        /* update the initial wdir value too */
        opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
    }
    free(param);

    return ORTE_SUCCESS;
}