2015-01-30 11:00:43 -08:00
/* -*- C -*-
*
* Copyright ( c ) 2004 - 2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation . All rights reserved .
* Copyright ( c ) 2004 - 2008 The University of Tennessee and The University
* of Tennessee Research Foundation . All rights
* reserved .
* Copyright ( c ) 2004 - 2005 High Performance Computing Center Stuttgart ,
* University of Stuttgart . All rights reserved .
* Copyright ( c ) 2004 - 2005 The Regents of the University of California .
* All rights reserved .
* Copyright ( c ) 2006 - 2014 Cisco Systems , Inc . All rights reserved .
* Copyright ( c ) 2007 - 2009 Sun Microsystems , Inc . All rights reserved .
* Copyright ( c ) 2007 - 2013 Los Alamos National Security , LLC . All rights
* reserved .
* Copyright ( c ) 2013 - 2015 Intel , Inc . All rights reserved .
* $ COPYRIGHT $
*
* Additional copyrights may follow
*
* $ HEADER $
*/
# include "orte_config.h"
# include "orte/constants.h"
# ifdef HAVE_STRING_H
# include <string.h>
# endif
# include <stdio.h>
# ifdef HAVE_STDLIB_H
# include <stdlib.h>
# endif /* HAVE_STDLIB_H */
# ifdef HAVE_STRINGS_H
# include <strings.h>
# endif /* HAVE_STRINGS_H */
# ifdef HAVE_UNISTD_H
# include <unistd.h>
# endif
# ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
# endif
# include <errno.h>
# include <signal.h>
# include <ctype.h>
# ifdef HAVE_SYS_TYPES_H
# include <sys/types.h>
# endif /* HAVE_SYS_TYPES_H */
# ifdef HAVE_SYS_WAIT_H
# include <sys/wait.h>
# endif /* HAVE_SYS_WAIT_H */
# ifdef HAVE_SYS_TIME_H
# include <sys/time.h>
# endif /* HAVE_SYS_TIME_H */
# include <fcntl.h>
# ifdef HAVE_SYS_STAT_H
# include <sys/stat.h>
# endif
# include "opal/dss/dss.h"
# include "opal/mca/event/event.h"
# include "opal/mca/installdirs/installdirs.h"
# include "opal/mca/hwloc/base/base.h"
# include "opal/mca/base/base.h"
# include "opal/util/argv.h"
# include "opal/util/output.h"
# include "opal/util/basename.h"
# include "opal/util/cmd_line.h"
# include "opal/util/opal_environ.h"
# include "opal/util/opal_getcwd.h"
# include "opal/util/show_help.h"
# include "opal/util/fd.h"
# include "opal/sys/atomic.h"
# if OPAL_ENABLE_FT_CR == 1
# include "opal/runtime/opal_cr.h"
# endif
# include "opal/version.h"
# include "opal/runtime/opal.h"
# include "opal/util/os_path.h"
# include "opal/util/path.h"
# include "opal/class/opal_pointer_array.h"
# include "opal/dss/dss.h"
2015-02-01 12:14:44 -08:00
# include "orte/mca/odls/odls_types.h"
2015-01-30 11:00:43 -08:00
# include "orte/mca/plm/plm.h"
2015-02-10 12:13:21 -08:00
# include "orte/mca/rmaps/rmaps_types.h"
# include "orte/mca/rmaps/base/base.h"
2015-01-30 11:00:43 -08:00
# include "orte/mca/schizo/schizo.h"
# include "orte/mca/errmgr/errmgr.h"
# include "orte/mca/rml/rml.h"
# include "orte/mca/rml/base/rml_contact.h"
# include "orte/mca/routed/routed.h"
# include "orte/runtime/runtime.h"
# include "orte/runtime/orte_globals.h"
# include "orte/runtime/orte_wait.h"
# include "orte/runtime/orte_quit.h"
# include "orte/util/show_help.h"
/*
* Globals
*/
/* MCA-related environment entries that apply to every app; parse_locals()
 * merges these into each app's environment and exports them locally. */
static char **global_mca_env = NULL;
/* NOTE(review): not referenced in the visible part of this file --
 * presumably maintained by create_app(); confirm before removing. */
static orte_std_cntr_t total_num_apps = 0;
/* Build-time default (ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT).
 * NOTE(review): consumer is not visible here -- presumably create_app(). */
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;

/* Flags handed to ORTE_WAIT_FOR_COMPLETION() in main(): "mywait" gates the
 * final wait before shutdown, "myspawn" gates the wait for the spawn
 * confirmation.  NOTE(review): presumably cleared by local_recv() and
 * spawn_recv() respectively -- confirm against their definitions. */
volatile bool mywait = true;
volatile bool myspawn = true;
/*
 * Command-line option values.  Each field is the destination of one (or
 * more, for synonyms) entries in cmd_line_init[] and is zeroed by main()
 * before the command line is parsed.
 */
static struct {
    bool help;                    /* -h / --help */
    bool version;                 /* -V / --version */
    char *report_pid;             /* --report-pid: "-", "+", or a filename */
    char *stdin_target;           /* --stdin: rank, "all", or "none" */
    bool index_argv;              /* --index-argv-by-rank */
    bool preload_binaries;        /* -s / --preload-binary */
    char *preload_files;          /* --preload-files */
    char *appfile;                /* --app */
    int num_procs;                /* -c / -np / -n */
    char *hnp;                    /* --hnp: URI or "file:<filename>" */
    char *wdir;                   /* --wdir / --wd */
    bool set_cwd_to_session_dir;  /* --set-cwd-to-session-dir */
    char *path;                   /* --path */
    bool enable_recovery;         /* --enable-recovery */
    char *personality;            /* --personality (main() defaults it to "ompi") */
    char *basename;               /* basename of argv[0], set in main() */
    char *prefix;                 /* NOTE(review): no option in cmd_line_init[] targets this field */
    bool terminate;               /* --terminate */
    bool nolocal;                 /* --nolocal */
    bool no_oversubscribe;        /* --nooversubscribe */
    bool oversubscribe;           /* --oversubscribe */
    int cpus_per_proc;            /* --cpus-per-proc */
    bool pernode;                 /* --pernode */
    int npernode;                 /* --npernode / -N */
    bool use_hwthreads_as_cpus;   /* --use-hwthread-cpus */
    int npersocket;               /* --npersocket */
    char *mapping_policy;         /* --map-by */
    char *ranking_policy;         /* --rank-by */
    char *binding_policy;         /* --bind-to */
    bool report_bindings;         /* --report-bindings */
    char *slot_list;              /* --slot-list */
    bool debug;                   /* -d / --debug-devel */
} myglobals;
/*
 * Table of command-line options understood by this tool.  Each entry stores
 * its parsed value into the named myglobals field (or nowhere, when the
 * destination is NULL).  The table is terminated by an all-NULL entry.
 */
static opal_cmd_line_init_t cmd_line_init[] = {
    /* Various "obvious" options */
    { NULL, 'h', NULL, "help", 0,
      &myglobals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },
    { NULL, 'V', NULL, "version", 0,
      &myglobals.version, OPAL_CMD_LINE_TYPE_BOOL,
      "Print version and exit" },

    { NULL, '\0', "report-pid", "report-pid", 1,
      &myglobals.report_pid, OPAL_CMD_LINE_TYPE_STRING,
      "Printout pid on stdout [-], stderr [+], or a file [anything else]" },

    /* select stdin option */
    { NULL, '\0', "stdin", "stdin", 1,
      &myglobals.stdin_target, OPAL_CMD_LINE_TYPE_STRING,
      "Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" },

    /* request that argv[0] be indexed */
    { NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0,
      &myglobals.index_argv, OPAL_CMD_LINE_TYPE_BOOL,
      "Uniquely index argv[0] for each process using its rank" },

    /* Preload the binary on the remote machine */
    { NULL, 's', NULL, "preload-binary", 0,
      &myglobals.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL,
      "Preload the binary on the remote machine before starting the remote process." },

    /* Preload files on the remote machine */
    { NULL, '\0', NULL, "preload-files", 1,
      &myglobals.preload_files, OPAL_CMD_LINE_TYPE_STRING,
      "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." },

    /* Use an appfile */
    { NULL, '\0', NULL, "app", 1,
      &myglobals.appfile, OPAL_CMD_LINE_TYPE_STRING,
      "Provide an appfile; ignore all other command line options" },

    /* Number of processes; -c, -n, --n, -np, and --np are all
       synonyms */
    { NULL, 'c', "np", "np", 1,
      &myglobals.num_procs, OPAL_CMD_LINE_TYPE_INT,
      "Number of processes to run" },
    { NULL, '\0', "n", "n", 1,
      &myglobals.num_procs, OPAL_CMD_LINE_TYPE_INT,
      "Number of processes to run" },

    /* uri of Open MPI HNP, or at least where to get it */
    { NULL, '\0', "hnp", "hnp", 1,
      &myglobals.hnp, OPAL_CMD_LINE_TYPE_STRING,
      "Specify the URI of the Open MPI server, or the name of the file (specified as file:filename) that contains that info" },

    /* ask the HNP to halt the DVM instead of submitting a job */
    { NULL, '\0', "terminate", "terminate", 0,
      &myglobals.terminate, OPAL_CMD_LINE_TYPE_BOOL,
      "Terminate the DVM" },

    /* Export environment variables; potentially used multiple times,
       so it does not make sense to set into a variable */
    { NULL, 'x', NULL, NULL, 1,
      NULL, OPAL_CMD_LINE_TYPE_NULL,
      "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" },

    /* Mapping controls */
    { NULL, 'H', "host", "host", 1,
      NULL, OPAL_CMD_LINE_TYPE_STRING,
      "List of hosts to invoke processes on" },
    { NULL, '\0', "nolocal", "nolocal", 0,
      &myglobals.nolocal, OPAL_CMD_LINE_TYPE_BOOL,
      "Do not run any MPI applications on the local node" },
    { NULL, '\0', "nooversubscribe", "nooversubscribe", 0,
      &myglobals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
      "Nodes are not to be oversubscribed, even if the system supports such operation" },
    { NULL, '\0', "oversubscribe", "oversubscribe", 0,
      &myglobals.oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
      "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements" },
    { NULL, '\0', "cpus-per-proc", "cpus-per-proc", 1,
      &myglobals.cpus_per_proc, OPAL_CMD_LINE_TYPE_INT,
      "Number of cpus to use for each process [default=1]" },

    /* Nperxxx options that do not require topology and are always
     * available - included for backwards compatibility
     */
    { NULL, '\0', "pernode", "pernode", 0,
      &myglobals.pernode, OPAL_CMD_LINE_TYPE_BOOL,
      "Launch one process per available node" },
    { NULL, '\0', "npernode", "npernode", 1,
      &myglobals.npernode, OPAL_CMD_LINE_TYPE_INT,
      "Launch n processes per node on all allocated nodes" },
    { NULL, '\0', "N", NULL, 1,
      &myglobals.npernode, OPAL_CMD_LINE_TYPE_INT,
      "Launch n processes per node on all allocated nodes (synonym for npernode)" },

#if OPAL_HAVE_HWLOC
    /* declare hardware threads as independent cpus */
    { NULL, '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0,
      &myglobals.use_hwthreads_as_cpus, OPAL_CMD_LINE_TYPE_BOOL,
      "Use hardware threads as independent cpus" },

    /* include npersocket for backwards compatibility */
    { NULL, '\0', "npersocket", "npersocket", 1,
      &myglobals.npersocket, OPAL_CMD_LINE_TYPE_INT,
      "Launch n processes per socket on all allocated nodes" },

    /* Mapping options */
    { NULL, '\0', NULL, "map-by", 1,
      &myglobals.mapping_policy, OPAL_CMD_LINE_TYPE_STRING,
      "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" },

    /* Ranking options */
    { NULL, '\0', NULL, "rank-by", 1,
      &myglobals.ranking_policy, OPAL_CMD_LINE_TYPE_STRING,
      "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" },

    /* Binding options */
    { NULL, '\0', NULL, "bind-to", 1,
      &myglobals.binding_policy, OPAL_CMD_LINE_TYPE_STRING,
      "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). Allowed qualifiers: overload-allowed, if-supported" },

    { NULL, '\0', "report-bindings", "report-bindings", 0,
      &myglobals.report_bindings, OPAL_CMD_LINE_TYPE_BOOL,
      "Whether to report process bindings to stderr" },

    /* slot list option */
    { NULL, '\0', "slot-list", "slot-list", 1,
      &myglobals.slot_list, OPAL_CMD_LINE_TYPE_STRING,
      "List of processor IDs to bind processes to [default=NULL]" },
#else
    /* Mapping options */
    { NULL, '\0', NULL, "map-by", 1,
      &myglobals.mapping_policy, OPAL_CMD_LINE_TYPE_STRING,
      "Mapping Policy [slot (default) | node]" },

    /* Ranking options */
    { NULL, '\0', NULL, "rank-by", 1,
      &myglobals.ranking_policy, OPAL_CMD_LINE_TYPE_STRING,
      "Ranking Policy [slot (default) | node]" },
#endif

    /* mpiexec-like arguments */
    { NULL, '\0', "wdir", "wdir", 1,
      &myglobals.wdir, OPAL_CMD_LINE_TYPE_STRING,
      "Set the working directory of the started processes" },
    { NULL, '\0', "wd", "wd", 1,
      &myglobals.wdir, OPAL_CMD_LINE_TYPE_STRING,
      "Synonym for --wdir" },
    { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0,
      &myglobals.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL,
      "Set the working directory of the started processes to their session directory" },
    { NULL, '\0', "path", "path", 1,
      &myglobals.path, OPAL_CMD_LINE_TYPE_STRING,
      "PATH to be used to look for executables to start processes" },

    { NULL, '\0', "enable-recovery", "enable-recovery", 0,
      &myglobals.enable_recovery, OPAL_CMD_LINE_TYPE_BOOL,
      "Enable recovery (resets all recovery options to on)" },

    { NULL, '\0', "personality", "personality", 1,
      &myglobals.personality, OPAL_CMD_LINE_TYPE_STRING,
      "Programming model/language being used (default=\"ompi\")" },

    { NULL, 'd', "debug-devel", "debug-devel", 0,
      &myglobals.debug, OPAL_CMD_LINE_TYPE_BOOL,
      "Enable debugging of OpenRTE" },

    /* End of list */
    { NULL, '\0', NULL, NULL, 0,
      NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
};
/*
* Local functions
*/
/* Builds one app context from an argv slice; called by parse_locals(). */
static int create_app(int argc, char *argv[],
                      orte_job_t *jdata,
                      orte_app_context_t **app,
                      bool *made_app, char ***app_env);
/* Resets the per-app fields of myglobals before an app's options are parsed. */
static int init_globals(void);
/* Handles --version, --help and --report-pid. */
static int parse_globals(int argc, char *argv[], opal_cmd_line_t *cmd_line);
/* Splits argv on ":" and adds one app context per segment to jdata. */
static int parse_locals(orte_job_t *jdata, int argc, char *argv[]);
static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile);
static int parse_appfile(orte_job_t *jdata, char *filename, char ***env);
/* Timer callback armed by main() when MPIEXEC_TIMEOUT is set. */
static void orte_timeout_wakeup(int sd, short args, void *cbdata);
/* Receive callback main() posts on ORTE_RML_TAG_TOOL. */
static void local_recv(int status, orte_process_name_t *sender,
                       opal_buffer_t *buffer,
                       orte_rml_tag_t tag, void *cbdata);
/* Receive callback main() posts on ORTE_RML_TAG_CONFIRM_SPAWN (cbdata is the job). */
static void spawn_recv(int status, orte_process_name_t *sender,
                       opal_buffer_t *buffer,
                       orte_rml_tag_t tag, void *cbdata);
int main ( int argc , char * argv [ ] )
{
2015-02-10 10:47:32 -08:00
int rc , i ;
2015-01-30 11:00:43 -08:00
opal_cmd_line_t cmd_line ;
char * param ;
orte_job_t * jdata = NULL ;
char * hnpenv ;
2015-02-04 06:20:11 -08:00
opal_buffer_t * req ;
orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_SPAWN_JOB_CMD ;
2015-01-30 11:00:43 -08:00
/* Setup and parse the command line */
memset ( & myglobals , 0 , sizeof ( myglobals ) ) ;
/* find our basename (the name of the executable) so that we can
use it in pretty - print error messages */
myglobals . basename = opal_basename ( argv [ 0 ] ) ;
opal_cmd_line_create ( & cmd_line , cmd_line_init ) ;
mca_base_cmd_line_setup ( & cmd_line ) ;
if ( OPAL_SUCCESS ! = ( rc = opal_cmd_line_parse ( & cmd_line , true ,
argc , argv ) ) ) {
if ( OPAL_ERR_SILENT ! = rc ) {
fprintf ( stderr , " %s: command line error (%s) \n " , argv [ 0 ] ,
opal_strerror ( rc ) ) ;
}
return rc ;
}
/*
* Since this process can now handle MCA / GMCA parameters , make sure to
* process them .
*/
if ( OPAL_SUCCESS ! = mca_base_cmd_line_process_args ( & cmd_line , & environ , & environ ) ) {
exit ( 1 ) ;
}
/* Ensure that enough of OPAL is setup for us to be able to run */
/*
* NOTE : ( JJH )
* We need to allow ' mca_base_cmd_line_process_args ( ) ' to process command
* line arguments * before * calling opal_init_util ( ) since the command
* line could contain MCA parameters that affect the way opal_init_util ( )
* functions . AMCA parameters are one such option normally received on the
* command line that affect the way opal_init_util ( ) behaves .
* It is " safe " to call mca_base_cmd_line_process_args ( ) before
* opal_init_util ( ) since mca_base_cmd_line_process_args ( ) does * not *
* depend upon opal_init_util ( ) functionality .
*/
/* Need to initialize OPAL so that install_dirs are filled in */
if ( OPAL_SUCCESS ! = opal_init ( & argc , & argv ) ) {
exit ( 1 ) ;
}
/* Check for some "global" command line params */
parse_globals ( argc , argv , & cmd_line ) ;
2015-02-10 12:13:21 -08:00
2015-01-30 11:00:43 -08:00
/* if they didn't point us at an HNP, that's an error */
if ( NULL = = myglobals . hnp ) {
fprintf ( stderr , " orte-submit: required option --hnp not provided \n " ) ;
exit ( 1 ) ;
}
OBJ_DESTRUCT ( & cmd_line ) ;
2015-02-10 07:22:10 -08:00
if ( 0 = = strncasecmp ( myglobals . hnp , " file " , strlen ( " file " ) ) ) {
2015-01-30 11:00:43 -08:00
char input [ 1024 ] , * filename ;
FILE * fp ;
/* it is a file - get the filename */
filename = strchr ( myglobals . hnp , ' : ' ) ;
if ( NULL = = filename ) {
/* filename is not correctly formatted */
orte_show_help ( " help-orte-top.txt " , " orte-top:hnp-filename-bad " , true , " uri " , myglobals . hnp ) ;
exit ( 1 ) ;
}
+ + filename ; /* space past the : */
if ( 0 > = strlen ( filename ) ) {
/* they forgot to give us the name! */
orte_show_help ( " help-orte-top.txt " , " orte-top:hnp-filename-bad " , true , " uri " , myglobals . hnp ) ;
exit ( 1 ) ;
}
/* open the file and extract the uri */
fp = fopen ( filename , " r " ) ;
if ( NULL = = fp ) { /* can't find or read file! */
orte_show_help ( " help-orte-top.txt " , " orte-top:hnp-filename-access " , true , myglobals . hnp ) ;
exit ( 1 ) ;
}
if ( NULL = = fgets ( input , 1024 , fp ) ) {
/* something malformed about file */
fclose ( fp ) ;
orte_show_help ( " help-orte-top.txt " , " orte-top:hnp-file-bad " , true , myglobals . hnp ) ;
exit ( 1 ) ;
}
fclose ( fp ) ;
input [ strlen ( input ) - 1 ] = ' \0 ' ; /* remove newline */
/* construct the target hnp info */
asprintf ( & hnpenv , " OMPI_MCA_orte_hnp_uri=%s " , input ) ;
} else {
/* should just be the uri itself - construct the target hnp info */
asprintf ( & hnpenv , " OMPI_MCA_orte_hnp_uri=%s " , myglobals . hnp ) ;
}
putenv ( hnpenv ) ; // must not free
2015-02-10 10:47:32 -08:00
2015-01-30 11:00:43 -08:00
/* Setup MCA params */
orte_register_params ( ) ;
/* flag that I am a TOOL */
orte_process_info . proc_type = ORTE_PROC_TOOL ;
2015-02-10 12:13:21 -08:00
if ( myglobals . debug ) {
orte_devel_level_output = true ;
}
2015-01-30 11:00:43 -08:00
/* Intialize our Open RTE environment
* Set the flag telling orte_init that I am NOT a
* singleton , but am " infrastructure " - prevents setting
* up incorrect infrastructure that only a singleton would
* require
*/
if ( ORTE_SUCCESS ! = ( rc = orte_init ( & argc , & argv , ORTE_PROC_TOOL ) ) ) {
/* cannot call ORTE_ERROR_LOG as it could be the errmgr
* never got loaded !
*/
return rc ;
}
/* finalize OPAL. As it was opened again from orte_init->opal_init
* we continue to have a reference count on it . So we have to finalize it twice . . .
*/
opal_finalize ( ) ;
2015-02-10 10:47:32 -08:00
for ( i = 0 ; NULL ! = environ [ i ] ; i + + ) {
if ( 0 = = strncmp ( environ [ i ] , " OMPI " , 4 ) ) {
fprintf ( stderr , " %s \n " , environ [ i ] ) ;
}
}
2015-01-30 11:00:43 -08:00
/* set the info in our contact table */
orte_rml . set_contact_info ( orte_process_info . my_hnp_uri ) ;
/* extract the name */
if ( ORTE_SUCCESS ! = orte_rml_base_parse_uris ( orte_process_info . my_hnp_uri , ORTE_PROC_MY_HNP , NULL ) ) {
orte_show_help ( " help-orte-top.txt " , " orte-top:hnp-uri-bad " , true , orte_process_info . my_hnp_uri ) ;
exit ( 1 ) ;
}
/* set the route to be direct */
if ( ORTE_SUCCESS ! = orte_routed . update_route ( ORTE_PROC_MY_HNP , ORTE_PROC_MY_HNP ) ) {
orte_show_help ( " help-orte-top.txt " , " orte-top:hnp-uri-bad " , true , orte_process_info . my_hnp_uri ) ;
orte_finalize ( ) ;
exit ( 1 ) ;
}
/* set the target hnp as our lifeline so we will terminate if it exits */
orte_routed . set_lifeline ( ORTE_PROC_MY_HNP ) ;
2015-02-01 12:14:44 -08:00
/* setup to listen for HNP response to my commands */
orte_rml . recv_buffer_nb ( ORTE_NAME_WILDCARD , ORTE_RML_TAG_TOOL ,
ORTE_RML_PERSISTENT , local_recv , NULL ) ;
/* set a timeout event in case the HNP doesn't answer */
/* if this is the terminate command, just send it */
if ( myglobals . terminate ) {
opal_buffer_t * buf ;
2015-02-10 08:27:13 -08:00
orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_HALT_DVM_CMD ;
2015-02-01 12:14:44 -08:00
buf = OBJ_NEW ( opal_buffer_t ) ;
2015-02-03 07:24:43 -08:00
opal_dss . pack ( buf , & cmd , 1 , ORTE_DAEMON_CMD ) ;
2015-02-01 12:14:44 -08:00
orte_rml . send_buffer_nb ( ORTE_PROC_MY_HNP , buf ,
ORTE_RML_TAG_DAEMON ,
orte_rml_send_callback , NULL ) ;
goto waiting ;
}
2015-01-30 11:00:43 -08:00
/* default our personality to OMPI */
if ( NULL = = myglobals . personality ) {
myglobals . personality = strdup ( " ompi " ) ;
}
/* create a new job object to hold the info for this one - the
* jobid field will be filled in by the PLM when the job is
* launched
*/
jdata = OBJ_NEW ( orte_job_t ) ;
if ( NULL = = jdata ) {
/* cannot call ORTE_ERROR_LOG as the errmgr
* hasn ' t been loaded yet !
*/
return ORTE_ERR_OUT_OF_RESOURCE ;
}
jdata - > personality = strdup ( myglobals . personality ) ;
/* check what user wants us to do with stdin */
if ( NULL ! = myglobals . stdin_target ) {
if ( 0 = = strcmp ( myglobals . stdin_target , " all " ) ) {
jdata - > stdin_target = ORTE_VPID_WILDCARD ;
} else if ( 0 = = strcmp ( myglobals . stdin_target , " none " ) ) {
jdata - > stdin_target = ORTE_VPID_INVALID ;
} else {
jdata - > stdin_target = strtoul ( myglobals . stdin_target , NULL , 10 ) ;
}
}
/* if we want the argv's indexed, indicate that */
if ( myglobals . index_argv ) {
orte_set_attribute ( & jdata - > attributes , ORTE_JOB_INDEX_ARGV , ORTE_ATTR_GLOBAL , NULL , OPAL_BOOL ) ;
}
/* Parse each app, adding it to the job object */
parse_locals ( jdata , argc , argv ) ;
2015-02-10 10:47:32 -08:00
2015-02-10 12:13:21 -08:00
/* create the map object to communicate policies */
jdata - > map = OBJ_NEW ( orte_job_map_t ) ;
if ( NULL ! = myglobals . mapping_policy ) {
if ( ORTE_SUCCESS ! = ( rc = orte_rmaps_base_set_mapping_policy ( & jdata - > map - > mapping , NULL , myglobals . mapping_policy ) ) ) {
ORTE_ERROR_LOG ( rc ) ;
exit ( rc ) ;
}
}
if ( NULL ! = myglobals . ranking_policy ) {
if ( ORTE_SUCCESS ! = ( rc = orte_rmaps_base_set_ranking_policy ( & jdata - > map - > ranking ,
jdata - > map - > mapping ,
myglobals . ranking_policy ) ) ) {
ORTE_ERROR_LOG ( rc ) ;
exit ( rc ) ;
}
}
2015-02-10 13:56:14 -08:00
if ( NULL ! = myglobals . binding_policy ) {
if ( ORTE_SUCCESS ! = ( rc = opal_hwloc_base_set_binding_policy ( & jdata - > map - > binding ,
myglobals . binding_policy ) ) ) {
ORTE_ERROR_LOG ( rc ) ;
exit ( rc ) ;
}
}
/* if they asked for nolocal, mark it so */
2015-02-10 12:13:21 -08:00
if ( myglobals . nolocal ) {
ORTE_SET_MAPPING_DIRECTIVE ( jdata - > map - > mapping , ORTE_MAPPING_NO_USE_LOCAL ) ;
}
if ( myglobals . no_oversubscribe ) {
ORTE_UNSET_MAPPING_DIRECTIVE ( jdata - > map - > mapping , ORTE_MAPPING_NO_OVERSUBSCRIBE ) ;
}
if ( myglobals . oversubscribe ) {
ORTE_UNSET_MAPPING_DIRECTIVE ( jdata - > map - > mapping , ORTE_MAPPING_NO_OVERSUBSCRIBE ) ;
}
2015-02-10 13:56:14 -08:00
if ( myglobals . report_bindings ) {
orte_set_attribute ( & jdata - > attributes , ORTE_JOB_REPORT_BINDINGS , ORTE_ATTR_GLOBAL , NULL , OPAL_BOOL ) ;
}
if ( myglobals . slot_list ) {
orte_set_attribute ( & jdata - > attributes , ORTE_JOB_SLOT_LIST , ORTE_ATTR_GLOBAL , myglobals . slot_list , OPAL_STRING ) ;
}
if ( NULL = = myglobals . personality ) {
/* default to ompi */
jdata - > personality = strdup ( " ompi " ) ;
} else {
jdata - > personality = strdup ( myglobals . personality ) ;
}
2015-01-30 11:00:43 -08:00
if ( 0 = = jdata - > num_apps ) {
/* This should never happen -- this case should be caught in
create_app ( ) , but let ' s just double check . . . */
orte_show_help ( " help-orterun.txt " , " orterun:nothing-to-do " ,
true , myglobals . basename ) ;
exit ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
}
/* check for a job timeout specification, to be provided in seconds
* as that is what MPICH used
*/
if ( NULL ! = ( param = getenv ( " MPIEXEC_TIMEOUT " ) ) ) {
if ( NULL = = ( orte_mpiexec_timeout = OBJ_NEW ( orte_timer_t ) ) ) {
ORTE_ERROR_LOG ( ORTE_ERR_OUT_OF_RESOURCE ) ;
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERR_OUT_OF_RESOURCE ) ;
goto DONE ;
}
orte_mpiexec_timeout - > tv . tv_sec = strtol ( param , NULL , 10 ) ;
orte_mpiexec_timeout - > tv . tv_usec = 0 ;
opal_event_evtimer_set ( orte_event_base , orte_mpiexec_timeout - > ev ,
orte_timeout_wakeup , jdata ) ;
opal_event_set_priority ( orte_mpiexec_timeout - > ev , ORTE_ERROR_PRI ) ;
opal_event_evtimer_add ( orte_mpiexec_timeout - > ev , & orte_mpiexec_timeout - > tv ) ;
}
/* if recovery was disabled on the cmd line, do so */
if ( myglobals . enable_recovery ) {
ORTE_FLAG_SET ( jdata , ORTE_JOB_FLAG_RECOVERABLE ) ;
}
/* ask the HNP to spawn the job for us */
2015-02-04 06:20:11 -08:00
// post recv on tag_confirm_spawn, pass jdata as cbdata
orte_rml . recv_buffer_nb ( ORTE_NAME_WILDCARD , ORTE_RML_TAG_CONFIRM_SPAWN ,
ORTE_RML_PERSISTENT , spawn_recv , jdata ) ;
// pack the ORTE_DAEMON_SPAWN_JOB_CMD command and job object and send to HNP at tag ORTE_RML_TAG_DAEMON
req = OBJ_NEW ( opal_buffer_t ) ;
if ( OPAL_SUCCESS ! = ( rc = opal_dss . pack ( req , & cmd , 1 , ORTE_DAEMON_CMD ) ) ) {
ORTE_ERROR_LOG ( rc ) ;
exit ( rc ) ;
}
if ( OPAL_SUCCESS ! = ( rc = opal_dss . pack ( req , & jdata , 1 , ORTE_JOB ) ) ) {
ORTE_ERROR_LOG ( rc ) ;
exit ( rc ) ;
}
orte_rml . send_buffer_nb ( ORTE_PROC_MY_HNP , req , ORTE_RML_TAG_DAEMON , orte_rml_send_callback , NULL ) ;
// wait for response and unpack the status, jobid
ORTE_WAIT_FOR_COMPLETION ( myspawn ) ;
opal_output ( 0 , " Job %s has launched " , ORTE_JOBID_PRINT ( jdata - > jobid ) ) ;
2015-02-01 12:14:44 -08:00
waiting :
2015-02-03 07:24:43 -08:00
ORTE_WAIT_FOR_COMPLETION ( mywait ) ;
2015-01-30 11:00:43 -08:00
DONE :
/* cleanup and leave */
orte_finalize ( ) ;
if ( orte_debug_flag ) {
fprintf ( stderr , " exiting with status %d \n " , orte_exit_status ) ;
}
exit ( orte_exit_status ) ;
}
/**
 * Reset the per-app fields of myglobals so the next app's command line
 * starts from a clean slate.  Fields not listed here (e.g. hnp,
 * personality, mapping options) are left untouched.
 *
 * @return ORTE_SUCCESS (always)
 */
static int init_globals(void)
{
    /* Reset the other fields every time */
    myglobals.help = false;
    myglobals.version = false;
    myglobals.num_procs = 0;

    /* free(NULL) is a no-op, so no guards are needed */
    free(myglobals.appfile);
    myglobals.appfile = NULL;

    free(myglobals.wdir);
    myglobals.wdir = NULL;
    myglobals.set_cwd_to_session_dir = false;

    free(myglobals.path);
    myglobals.path = NULL;

    myglobals.preload_binaries = false;
    /* Release the previous value instead of just dropping the pointer,
     * matching how the other string options above are handled.
     * NOTE(review): assumes no consumer keeps this pointer without
     * copying it -- confirm in create_app(). */
    free(myglobals.preload_files);
    myglobals.preload_files = NULL;

    /* All done */
    return ORTE_SUCCESS;
}
static int parse_globals ( int argc , char * argv [ ] , opal_cmd_line_t * cmd_line )
{
/* print version if requested. Do this before check for help so
that - - version - - help works as one might expect . */
if ( myglobals . version ) {
char * str , * project_name = NULL ;
if ( 0 = = strcmp ( myglobals . basename , " ompi-submit " ) ) {
project_name = " Open MPI " ;
} else {
project_name = " OpenRTE " ;
}
str = opal_show_help_string ( " help-orterun.txt " , " orterun:version " ,
false ,
myglobals . basename , project_name , OPAL_VERSION ,
PACKAGE_BUGREPORT ) ;
if ( NULL ! = str ) {
printf ( " %s " , str ) ;
free ( str ) ;
}
exit ( 0 ) ;
}
/* Check for help request */
if ( myglobals . help ) {
char * str , * args = NULL ;
char * project_name = NULL ;
if ( 0 = = strcmp ( myglobals . basename , " ompi-submit " ) ) {
project_name = " Open MPI " ;
} else {
project_name = " OpenRTE " ;
}
args = opal_cmd_line_get_usage_msg ( cmd_line ) ;
str = opal_show_help_string ( " help-orterun.txt " , " orterun:usage " , false ,
myglobals . basename , project_name , OPAL_VERSION ,
myglobals . basename , args ,
PACKAGE_BUGREPORT ) ;
if ( NULL ! = str ) {
printf ( " %s " , str ) ;
free ( str ) ;
}
free ( args ) ;
/* If someone asks for help, that should be all we do */
exit ( 0 ) ;
}
/* check for request to report pid */
if ( NULL ! = myglobals . report_pid ) {
FILE * fp ;
if ( 0 = = strcmp ( myglobals . report_pid , " - " ) ) {
/* if '-', then output to stdout */
printf ( " %d \n " , ( int ) getpid ( ) ) ;
} else if ( 0 = = strcmp ( myglobals . report_pid , " + " ) ) {
/* if '+', output to stderr */
fprintf ( stderr , " %d \n " , ( int ) getpid ( ) ) ;
} else {
fp = fopen ( myglobals . report_pid , " w " ) ;
if ( NULL = = fp ) {
orte_show_help ( " help-orterun.txt " , " orterun:write_file " , false ,
myglobals . basename , " pid " , myglobals . report_pid ) ;
exit ( 0 ) ;
}
fprintf ( fp , " %d \n " , ( int ) getpid ( ) ) ;
fclose ( fp ) ;
}
}
return ORTE_SUCCESS ;
}
/* Build one app context from argv via create_app() and, when one was made,
 * give it the next index and append it to jdata->apps.  Exits the process
 * if create_app() fails (it is assumed to have printed the error). */
static void build_and_add_app(int argc, char *argv[], orte_job_t *jdata,
                              int *app_num, char ***env)
{
    orte_app_context_t *app = NULL;
    bool made_app = false;

    if (ORTE_SUCCESS != create_app(argc, argv, jdata, &app, &made_app, env)) {
        /* Assume that the error message has already been printed;
           no need to cleanup -- we can just exit */
        exit(1);
    }
    if (made_app) {
        /* keep track of the number of apps - point this app_context
           to that index */
        app->idx = *app_num;
        ++(*app_num);
        opal_pointer_array_add(jdata->apps, app);
        ++jdata->num_apps;
    }
}

/**
 * Split the command line on ":" separators, create one app context per
 * non-empty segment and add it to the job, then propagate MCA-related
 * environment settings to the apps and to our own environment.
 *
 * @param jdata  job object that receives the app contexts
 * @param argc   number of command-line arguments
 * @param argv   command-line arguments; argv[0] is prepended to every segment
 * @return ORTE_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE if an environment entry
 *         cannot be duplicated.  Exits the process if create_app() fails.
 */
static int parse_locals(orte_job_t *jdata, int argc, char *argv[])
{
    int i, app_num;
    int temp_argc;
    char **temp_argv, **env;
    orte_app_context_t *app;
    orte_std_cntr_t j, size1;

    /* Make the apps */
    temp_argc = 0;
    temp_argv = NULL;
    opal_argv_append(&temp_argc, &temp_argv, argv[0]);

    /* NOTE: This bogus env variable is necessary in the calls to
       create_app(), below.  See comment immediately before the
       create_app() function for an explanation. */
    env = NULL;
    for (app_num = 0, i = 1; i < argc; ++i) {
        if (0 != strcmp(argv[i], ":")) {
            opal_argv_append(&temp_argc, &temp_argv, argv[i]);
            continue;
        }

        /* Hit a separator: make an app with the argv collected so far,
           unless it holds nothing but argv[0] */
        if (opal_argv_count(temp_argv) > 1) {
            if (NULL != env) {
                opal_argv_free(env);
                env = NULL;
            }
            build_and_add_app(temp_argc, temp_argv, jdata, &app_num, &env);

            /* Reset the temps.  The finished segment is released first:
               it is still owned here (the last segment is freed the same
               way after the loop), so dropping the pointer would leak it. */
            opal_argv_free(temp_argv);
            temp_argc = 0;
            temp_argv = NULL;
            opal_argv_append(&temp_argc, &temp_argv, argv[0]);
        }
    }

    /* Last (or only) segment; note that env is passed through as-is here */
    if (opal_argv_count(temp_argv) > 1) {
        build_and_add_app(temp_argc, temp_argv, jdata, &app_num, &env);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    opal_argv_free(temp_argv);

    /* Once we've created all the apps, add the global MCA params to
       each app's environment (checking for duplicates, of
       course -- yay opal_environ_merge()).  */
    if (NULL != global_mca_env) {
        size1 = opal_pointer_array_get_size(jdata->apps);
        /* Iterate through all the apps */
        for (j = 0; j < size1; ++j) {
            app = (orte_app_context_t *)
                opal_pointer_array_get_item(jdata->apps, j);
            if (NULL != app) {
                /* Use handy utility function */
                env = opal_environ_merge(global_mca_env, app->env);
                opal_argv_free(app->env);
                app->env = env;
            }
        }
    }

    /* Now take a subset of the MCA params and set them as MCA
       overrides here in orterun (so that when we orte_init() later,
       all the components see these MCA params).  Here's how we decide
       which subset of the MCA params we set here in orterun:

       1. If any global MCA params were set, use those
       2. If no global MCA params were set and there was only one app,
          then use its app MCA params
       3. Otherwise, don't set any
    */
    env = NULL;
    if (NULL != global_mca_env) {
        env = global_mca_env;
    } else if (opal_pointer_array_get_size(jdata->apps) >= 1) {
        /* Remember that pointer_array's can be padded with NULL
           entries; so only use the app's env if there is exactly
           1 non-NULL entry */
        app = (orte_app_context_t *)
            opal_pointer_array_get_item(jdata->apps, 0);
        if (NULL != app) {
            env = app->env;
            for (j = 1; j < opal_pointer_array_get_size(jdata->apps); ++j) {
                if (NULL != opal_pointer_array_get_item(jdata->apps, j)) {
                    env = NULL;
                    break;
                }
            }
        }
    }

    if (NULL != env) {
        size1 = opal_argv_count(env);
        for (j = 0; j < size1; ++j) {
            /* Use-after-Free error possible here.  putenv does not copy
             * the string passed to it, and instead stores only the pointer.
             * env[j] may be freed later, in which case the pointer
             * in environ will now be left dangling into a deallocated
             * region.
             * So we make a copy of the variable.
             */
            char *s = strdup(env[j]);

            if (NULL == s) {
                return OPAL_ERR_OUT_OF_RESOURCE;
            }
            putenv(s);
        }
    }

    /* All done */
    return ORTE_SUCCESS;
}
/*
* This function takes a " char ***app_env " parameter to handle the
* specific case :
*
* orterun - - mca foo bar - app appfile
*
* That is , we ' ll need to keep foo = bar , but the presence of the app
* file will cause an invocation of parse_appfile ( ) , which will cause
* one or more recursive calls back to create_app ( ) . Since the
* foo = bar value applies globally to all apps in the appfile , we need
* to pass in the " base " environment ( that contains the foo = bar value )
* when we parse each line in the appfile .
*
* This is really just a special case - - when we have a simple case like :
*
* orterun - - mca foo bar - np 4 hostname
*
* Then the upper-level function (parse_locals()) calls create_app()
* with *app_env set to NULL (app_env itself must be a valid pointer,
* since create_app() dereferences it), meaning that there is no "base"
* environment that the app needs to be created from.
*/
/*
 * Build one application context from an (argc, argv) command line.
 *
 * argc, argv  command line to parse (argv[0] is ignored by the parser)
 * jdata       job the app belongs to (only forwarded to parse_appfile())
 * app_ptr     receives the new app context when *made_app is true
 * made_app    set to true only when an app context was created here
 * app_env     in/out "base" environment; must be a valid pointer, but
 *             *app_env may be NULL
 *
 * If the command line names an appfile, the work is delegated to
 * parse_appfile() and its return code is returned with *made_app left
 * false.  Otherwise ORTE_SUCCESS is returned on success, or an ORTE
 * error code after releasing everything allocated here.
 */
static int create_app(int argc, char* argv[],
                      orte_job_t *jdata,
                      orte_app_context_t **app_ptr,
                      bool *made_app, char ***app_env)
{
    opal_cmd_line_t cmd_line;
    char cwd[OPAL_PATH_MAX];
    int i, j, count, rc;
    char *param, *value;
    orte_app_context_t *app = NULL;
    bool cmd_line_made = false;
    bool found = false;
    char *appname = NULL;

    *made_app = false;

    /* Pre-process the command line if we are going to parse an appfile later.
     * save any mca command line args so they can be passed
     * separately to the daemons.
     * Use Case:
     *  $ cat launch.appfile
     *  -np 1 -mca aaa bbb ./my-app -mca ccc ddd
     *  -np 1 -mca aaa bbb ./my-app -mca eee fff
     *  $ mpirun -np 2 -mca foo bar --app launch.appfile
     * Only pick up '-mca foo bar' on this pass.
     */
    if (NULL != myglobals.appfile) {
        if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(myglobals.personality, argc, 0, argv))) {
            goto cleanup;
        }
    }

    /* Parse application command line options. */
    init_globals();
    opal_cmd_line_create(&cmd_line, cmd_line_init);
    mca_base_cmd_line_setup(&cmd_line);
    cmd_line_made = true;
    rc = opal_cmd_line_parse(&cmd_line, true, argc, argv);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }
    mca_base_cmd_line_process_args(&cmd_line, app_env, &global_mca_env);

    /* Is there an appfile in here? */
    if (NULL != myglobals.appfile) {
        /* parse_appfile() frees myglobals.appfile and takes ownership of
         * the name it is given, so hand it a private copy */
        char *appfile = strdup(myglobals.appfile);

        OBJ_DESTRUCT(&cmd_line);
        if (NULL == appfile) {
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        return parse_appfile(jdata, appfile, app_env);
    }

    /* Setup application context */
    app = OBJ_NEW(orte_app_context_t);
    opal_cmd_line_get_tail(&cmd_line, &count, &app->argv);

    /* See if we have anything left */
    if (0 == count) {
        orte_show_help("help-orterun.txt", "orterun:executable-not-specified",
                       true, myglobals.basename, myglobals.basename);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    /*
     * Get mca parameters so we can pass them to the daemons.
     * Use the count determined above to make sure we do not go past
     * the executable name. Example:
     *   mpirun -np 2 -mca foo bar ./my-app -mca bip bop
     * We want to pick up '-mca foo bar' but not '-mca bip bop'
     */
    if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(myglobals.personality,
                                                    argc, count, argv))) {
        goto cleanup;
    }

    /* Grab all OMPI_* environment variables */
    app->env = opal_argv_copy(*app_env);
    if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(myglobals.personality,
                                                    myglobals.path,
                                                    &cmd_line, NULL,
                                                    environ, &app->env))) {
        goto cleanup;
    }

    /* Did the user request a specific wdir? */
    if (NULL != myglobals.wdir) {
        /* if this is a relative path, convert it to an absolute path */
        if (opal_path_is_absolute(myglobals.wdir)) {
            app->cwd = strdup(myglobals.wdir);
        } else {
            /* get the cwd */
            if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
                orte_show_help("help-orterun.txt", "orterun:init-failure",
                               true, "get the cwd", rc);
                goto cleanup;
            }
            /* construct the absolute path */
            app->cwd = opal_os_path(false, cwd, myglobals.wdir, NULL);
        }
        orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
    } else if (myglobals.set_cwd_to_session_dir) {
        orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
        orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
    } else {
        if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
            orte_show_help("help-orterun.txt", "orterun:init-failure",
                           true, "get the cwd", rc);
            goto cleanup;
        }
        app->cwd = strdup(cwd);
    }

    /* if this is the first app_context, check for prefix directions.
     * We only do this for the first app_context because the launchers
     * only look at the first one when setting the prefix - we do NOT
     * support per-app_context prefix settings!
     */
    if (0 == total_num_apps) {
        /* Check to see if the user explicitly wanted to disable automatic
           --prefix behavior */
        if (opal_cmd_line_is_taken(&cmd_line, "noprefix")) {
            want_prefix_by_default = false;
        }

        /* Did the user specify a prefix, or want prefix by default? */
        if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) {
            size_t param_len;

            /* Every branch below leaves param pointing at heap memory that
             * this function owns (or NULL), so it can be edited and freed
             * without touching myglobals.prefix. */
            if (opal_cmd_line_is_taken(&cmd_line, "prefix") &&
                NULL != myglobals.prefix) {
                /* both an app-level and a global prefix were given:
                 * if they don't match, then that merits a warning */
                param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
                value = strdup(myglobals.prefix);
                if (NULL == param || NULL == value) {
                    free(param);
                    free(value);
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    goto cleanup;
                }
                /* ensure we strip any trailing '/' (guarding empty strings) */
                param_len = strlen(param);
                if (0 < param_len &&
                    0 == strcmp(OPAL_PATH_SEP, &(param[param_len - 1]))) {
                    param[param_len - 1] = '\0';
                }
                param_len = strlen(value);
                if (0 < param_len &&
                    0 == strcmp(OPAL_PATH_SEP, &(value[param_len - 1]))) {
                    value[param_len - 1] = '\0';
                }
                if (0 != strcmp(param, value)) {
                    orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
                                   true, myglobals.basename, value, param);
                    /* let the global-level prefix take precedence since we
                     * know that one is being used
                     */
                    free(param);
                    param = strdup(myglobals.prefix);
                }
                free(value);
            } else if (NULL != myglobals.prefix) {
                /* copy: param is modified and freed below */
                param = strdup(myglobals.prefix);
            } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")) {
                /* must be --prefix alone */
                param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
            } else {
                /* --enable-orterun-prefix-default was given to orterun */
                param = strdup(opal_install_dirs.prefix);
            }

            if (NULL != param) {
                /* "Parse" the param, aka remove superfluous path_sep. */
                param_len = strlen(param);
                while (0 < param_len &&
                       0 == strcmp(OPAL_PATH_SEP, &(param[param_len - 1]))) {
                    param[param_len - 1] = '\0';
                    param_len--;
                }
                if (0 == param_len) {
                    orte_show_help("help-orterun.txt", "orterun:empty-prefix",
                                   true, myglobals.basename, myglobals.basename);
                    free(param);
                    rc = ORTE_ERR_FATAL;
                    goto cleanup;
                }
                orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING);
                free(param);
            }
        }
    }

    /* Did the user specify a hostfile. Need to check for both
     * hostfile and machine file.
     * We can only deal with one hostfile per app context, otherwise give an error.
     */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
        if (1 < j) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, myglobals.basename, NULL);
            rc = ORTE_ERR_FATAL;
            goto cleanup;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_GLOBAL, value, OPAL_STRING);
        }
    }
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
        if (1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, myglobals.basename, NULL);
            rc = ORTE_ERR_FATAL;
            goto cleanup;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_GLOBAL, value, OPAL_STRING);
        }
    }

    /* Did the user specify any hosts? */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) {
        char **targ = NULL, *tval;

        for (i = 0; i < j; ++i) {
            value = opal_cmd_line_get_param(&cmd_line, "host", i, 0);
            opal_argv_append_nosize(&targ, value);
        }
        tval = opal_argv_join(targ, ',');
        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_GLOBAL, tval, OPAL_STRING);
        opal_argv_free(targ);
        free(tval);
    }

    /* check for bozo error */
    if (0 > myglobals.num_procs) {
        orte_show_help("help-orterun.txt", "orterun:negative-nprocs",
                       true, myglobals.basename, app->argv[0],
                       myglobals.num_procs, NULL);
        rc = ORTE_ERR_FATAL;
        goto cleanup;
    }

    app->num_procs = (orte_std_cntr_t) myglobals.num_procs;
    total_num_apps++;

    /* Capture any preload flags */
    if (myglobals.preload_binaries) {
        orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
    }
    /* if we were told to cwd to the session dir and the app was given in
     * relative syntax, then we need to preload the binary to
     * find the app - don't do this for java apps, however, as we
     * can't easily find the class on the cmd line. Java apps have to
     * preload their binary via the preload_files option
     */
    if (!opal_path_is_absolute(app->argv[0]) &&
        NULL == strstr(app->argv[0], "java")) {
        if (myglobals.preload_binaries) {
            orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
        } else if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
            orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
        }
    }
    if (NULL != myglobals.preload_files) {
        orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_FILES, ORTE_ATTR_GLOBAL,
                           myglobals.preload_files, OPAL_STRING);
    }

    /* Do not try to find argv[0] here -- the starter is responsible
       for that because it may not be relevant to try to find it on
       the node where orterun is executing.  So just strdup() argv[0]
       into app. */
    app->app = strdup(app->argv[0]);
    if (NULL == app->app) {
        orte_show_help("help-orterun.txt", "orterun:call-failed",
                       true, myglobals.basename, "library", "strdup returned NULL", errno);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    /* if this is a Java application, we have a bit more work to do. Such
     * applications actually need to be run under the Java virtual machine
     * and the "java" command will start the "executable". So we need to ensure
     * that all the proper java-specific paths are provided
     */
    appname = opal_basename(app->app);
    if (NULL == appname) {
        /* defensive: appname is handed to strcmp() just below */
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    if (0 == strcmp(appname, "java")) {
        /* see if we were given a library path */
        found = false;
        for (i = 1; NULL != app->argv[i]; i++) {
            if (NULL != strstr(app->argv[i], "java.library.path")) {
                /* yep - but does it include the path to the mpi libs? */
                found = true;
                if (NULL == strstr(app->argv[i], opal_install_dirs.libdir)) {
                    /* doesn't appear to - add it to be safe.  The argument
                     * already carries its own option prefix, so only the
                     * libdir (and a ':' if needed) is appended. */
                    size_t arg_len = strlen(app->argv[i]);
                    int nwritten;

                    if (0 < arg_len && ':' == app->argv[i][arg_len - 1]) {
                        nwritten = asprintf(&value, "%s%s", app->argv[i], opal_install_dirs.libdir);
                    } else {
                        nwritten = asprintf(&value, "%s:%s", app->argv[i], opal_install_dirs.libdir);
                    }
                    if (0 > nwritten) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto cleanup;
                    }
                    free(app->argv[i]);
                    app->argv[i] = value;
                }
                break;
            }
        }
        if (!found) {
            /* need to add it right after the java command */
            if (0 > asprintf(&value, "-Djava.library.path=%s", opal_install_dirs.libdir)) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            opal_argv_insert_element(&app->argv, 1, value);
            free(value);
        }

        /* see if we were given a class path */
        /* NOTE(review): the substring test below also matches any argument
         * that merely contains "cp"; confirm whether an exact option match
         * was intended. */
        /* NOTE(review): app->cwd is not assigned in the
         * set_cwd_to_session_dir branch above; confirm it cannot be NULL
         * where it is formatted with "%s" below. */
        found = false;
        for (i = 1; NULL != app->argv[i]; i++) {
            if (NULL != strstr(app->argv[i], "cp") ||
                NULL != strstr(app->argv[i], "classpath")) {
                /* yep - but does it include the path to the mpi libs? */
                found = true;
                if (NULL == app->argv[i + 1]) {
                    /* the option has no value following it - nothing we
                     * can extend, so leave the command line untouched */
                    break;
                }
                /* check if mpi.jar exists - if so, add it */
                value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL);
                if (access(value, F_OK) != -1) {
                    set_classpath_jar_file(app, i + 1, "mpi.jar");
                }
                free(value);
                /* check for oshmem support */
                value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL);
                if (access(value, F_OK) != -1) {
                    set_classpath_jar_file(app, i + 1, "shmem.jar");
                }
                free(value);
                /* always add the local directory */
                if (0 > asprintf(&value, "%s:%s", app->cwd, app->argv[i + 1])) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    goto cleanup;
                }
                free(app->argv[i + 1]);
                app->argv[i + 1] = value;
                break;
            }
        }
        if (!found) {
            /* check to see if CLASSPATH is in the environment; match the
             * '=' as well so that e.g. CLASSPATH_FOO is not picked up */
            const size_t cp_len = strlen("CLASSPATH=");

            found = false;  // just to be pedantic
            for (i = 0; NULL != environ[i]; i++) {
                if (0 == strncmp(environ[i], "CLASSPATH=", cp_len)) {
                    /* step over the "CLASSPATH=" */
                    value = environ[i] + cp_len;
                    opal_argv_insert_element(&app->argv, 1, value);
                    /* check for mpi.jar */
                    value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL);
                    if (access(value, F_OK) != -1) {
                        set_classpath_jar_file(app, 1, "mpi.jar");
                    }
                    free(value);
                    /* check for shmem.jar */
                    value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL);
                    if (access(value, F_OK) != -1) {
                        set_classpath_jar_file(app, 1, "shmem.jar");
                    }
                    free(value);
                    /* always add the local directory */
                    if (0 > asprintf(&value, "%s:%s", app->cwd, app->argv[1])) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto cleanup;
                    }
                    free(app->argv[1]);
                    app->argv[1] = value;
                    opal_argv_insert_element(&app->argv, 1, "-cp");
                    found = true;
                    break;
                }
            }
            if (!found) {
                /* need to add it right after the java command - have
                 * to include the working directory and trust that
                 * the user set cwd if necessary
                 */
                char *str, *str2;

                /* always start with the working directory */
                str = strdup(app->cwd);
                if (NULL == str) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    goto cleanup;
                }
                /* check for mpi.jar */
                value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL);
                if (access(value, F_OK) != -1) {
                    if (0 > asprintf(&str2, "%s:%s", str, value)) {
                        free(value);
                        free(str);
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto cleanup;
                    }
                    free(str);
                    str = str2;
                }
                free(value);
                /* check for shmem.jar */
                value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL);
                if (access(value, F_OK) != -1) {
                    if (0 > asprintf(&str2, "%s:%s", str, value)) {
                        free(value);
                        free(str);
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto cleanup;
                    }
                    free(str);
                    str = str2;
                }
                free(value);
                opal_argv_insert_element(&app->argv, 1, str);
                free(str);
                opal_argv_insert_element(&app->argv, 1, "-cp");
            }
        }

        /* try to find the actual command - may not be perfect */
        for (i = 1; i < opal_argv_count(app->argv); i++) {
            if (NULL != strstr(app->argv[i], "java.library.path")) {
                continue;
            } else if (NULL != strstr(app->argv[i], "cp") ||
                       NULL != strstr(app->argv[i], "classpath")) {
                /* skip the next field */
                i++;
                continue;
            }
            /* declare this the winner */
            opal_setenv("OMPI_COMMAND", app->argv[i], true, &app->env);
            /* collect everything else as the cmd line */
            if ((i + 1) < opal_argv_count(app->argv)) {
                value = opal_argv_join(&app->argv[i + 1], ' ');
                opal_setenv("OMPI_ARGV", value, true, &app->env);
                free(value);
            }
            break;
        }
    } else {
        /* add the cmd to the environment for MPI_Info to pickup */
        opal_setenv("OMPI_COMMAND", appname, true, &app->env);
        if (1 < opal_argv_count(app->argv)) {
            value = opal_argv_join(&app->argv[1], ' ');
            opal_setenv("OMPI_ARGV", value, true, &app->env);
            free(value);
        }
    }

    /* hand the finished context to the caller; clearing app keeps the
     * cleanup code below from releasing it */
    *app_ptr = app;
    app = NULL;
    *made_app = true;
    rc = ORTE_SUCCESS;

    /* All done */

 cleanup:
    free(appname);
    if (NULL != app) {
        OBJ_RELEASE(app);
    }
    if (cmd_line_made) {
        OBJ_DESTRUCT(&cmd_line);
    }
    return rc;
}
/*
 * Append "<libdir>/<jarfile>" to the classpath held in app->argv[index],
 * unless that argument already mentions jarfile.  A ':' separator is
 * inserted only when the existing value does not already end in one.
 * If the new string cannot be allocated the argument is left unchanged.
 */
static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile)
{
    char *current = app->argv[index];
    char *str;
    size_t current_len;
    int nwritten;

    if (NULL != strstr(current, jarfile)) {
        /* already present - nothing to do */
        return;
    }

    /* look at the last character of the string; taking strlen() of
     * (pointer - 1) would read from before the start of the buffer */
    current_len = strlen(current);
    if (0 < current_len && ':' == current[current_len - 1]) {
        nwritten = asprintf(&str, "%s%s/%s", current, opal_install_dirs.libdir, jarfile);
    } else {
        nwritten = asprintf(&str, "%s:%s/%s", current, opal_install_dirs.libdir, jarfile);
    }
    if (0 > nwritten) {
        /* str is undefined on failure - keep the original argument */
        return;
    }

    free(app->argv[index]);
    app->argv[index] = str;
}
/*
 * Read an appfile line by line and create one app context per non-blank,
 * non-comment line via create_app(), adding each to jdata->apps.
 *
 * jdata     job that receives the app contexts
 * filename  heap-allocated path of the appfile; this function takes
 *           ownership and frees it on every return path
 * env       base environment; each line is parsed against a private copy
 *           so that *env itself is never modified here
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if the file cannot be opened,
 * or ORTE_ERR_OUT_OF_RESOURCE if the environment cannot be copied.  A
 * failure inside create_app() terminates the process with exit(1).
 */
static int parse_appfile(orte_job_t *jdata, char *filename, char ***env)
{
    size_t i, len;
    FILE *fp;
    char line[BUFSIZ];
    int rc, argc, app_num;
    char **argv;
    orte_app_context_t *app;
    bool blank, made_app;
    /* the trailing space separates the fake argv[0] from the first
       token read from the file */
    char bogus[] = "bogus ";
    const size_t bogus_len = sizeof(bogus) - 1;
    char **tmp_env;

    /*
     * Make sure to clear out this variable so we don't do anything odd in
     * create_app()
     */
    if (NULL != myglobals.appfile) {
        free(myglobals.appfile);
        myglobals.appfile = NULL;
    }

    /* Try to open the file */
    fp = fopen(filename, "r");
    if (NULL == fp) {
        orte_show_help("help-orterun.txt", "orterun:appfile-not-found", true,
                       filename);
        free(filename);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Read in line by line */
    line[sizeof(line) - 1] = '\0';
    app_num = 0;
    do {
        /* We need a bogus argv[0] (because when argv comes in from
           the command line, argv[0] is "orterun", so the parsing
           logic ignores it).  So create one here rather than making
           an argv and then pre-pending a new argv[0] (which would be
           rather inefficient). */
        strcpy(line, bogus);

        if (NULL == fgets(line + bogus_len,
                          (int) (sizeof(line) - bogus_len), fp)) {
            break;
        }

        /* Remove a trailing newline */
        len = strlen(line);
        if (len > 0 && '\n' == line[len - 1]) {
            line[--len] = '\0';
        }

        /* Remove comments */
        for (i = 0; i < len; ++i) {
            if ('#' == line[i]) {
                line[i] = '\0';
                break;
            } else if (i + 1 < len && '/' == line[i] && '/' == line[i + 1]) {
                line[i] = '\0';
                break;
            }
        }

        /* Is this a blank line?  Start right after the bogus prefix so
           the first character read from the file is examined too. */
        len = strlen(line);
        for (blank = true, i = bogus_len; i < len; ++i) {
            if (!isspace((unsigned char) line[i])) {
                blank = false;
                break;
            }
        }
        if (blank) {
            continue;
        }

        /* We got a line with *something* on it.  So process it */
        argv = opal_argv_split(line, ' ');
        argc = opal_argv_count(argv);
        if (argc > 0) {

            /* Create a temporary env to use in the recursive call --
               that is: don't disturb the original env so that we can
               have a consistent global env.  This allows for the
               case:

                   orterun --mca foo bar --appfile file

               where the "file" contains multiple apps.  In this case,
               each app in "file" will get *only* foo=bar as the base
               environment from which its specific environment is
               constructed. */
            if (NULL != *env) {
                tmp_env = opal_argv_copy(*env);
                if (NULL == tmp_env) {
                    opal_argv_free(argv);
                    fclose(fp);
                    free(filename);
                    return ORTE_ERR_OUT_OF_RESOURCE;
                }
            } else {
                tmp_env = NULL;
            }

            rc = create_app(argc, argv, jdata, &app, &made_app, &tmp_env);
            if (ORTE_SUCCESS != rc) {
                /* Assume that the error message has already been
                   printed; no need to cleanup -- we can just exit */
                exit(1);
            }
            if (NULL != tmp_env) {
                opal_argv_free(tmp_env);
            }
            if (made_app) {
                app->idx = app_num;
                ++app_num;
                opal_pointer_array_add(jdata->apps, app);
                ++jdata->num_apps;
            }
        }

        /* the split argv was only needed for the create_app() call */
        if (NULL != argv) {
            opal_argv_free(argv);
        }
    } while (!feof(fp));
    fclose(fp);

    /* All done */
    free(filename);
    return ORTE_SUCCESS;
}
void orte_timeout_wakeup ( int sd , short args , void * cbdata )
{
char * tm ;
/* this function gets called when the job execution time
* has hit a prescribed limit - so just abort
*/
tm = getenv ( " MPIEXEC_TIMEOUT " ) ;
orte_show_help ( " help-orterun.txt " , " orterun:timeout " ,
true , ( NULL = = tm ) ? " NULL " : tm ) ;
ORTE_UPDATE_EXIT_STATUS ( ORTE_ERROR_DEFAULT_EXIT_CODE ) ;
2015-02-03 07:24:43 -08:00
exit ( orte_exit_status ) ;
2015-01-30 11:00:43 -08:00
}
/*
 * Receive callback: unpack the completion status of the job from buffer,
 * fold it into our exit status and terminate the process.
 *
 * If the status cannot be unpacked, the unpack error code is recorded
 * instead; the (never written) status value is not consulted.
 */
static void local_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t *buffer,
                       orte_rml_tag_t tag, void *cbdata)
{
    int rc;
    int ret = 0;
    int32_t cnt = 1;

    /* unpack the completion status of the job */
    rc = opal_dss.unpack(buffer, &ret, &cnt, OPAL_INT);
    if (OPAL_SUCCESS != rc) {
        ORTE_UPDATE_EXIT_STATUS(rc);
    } else {
        /* update our exit status to match */
        ORTE_UPDATE_EXIT_STATUS(ret);
    }

    exit(orte_exit_status);
}
2015-02-04 06:20:11 -08:00
/*
 * Receive callback for a spawn response: cbdata is the orte_job_t whose
 * jobid field is filled in from the buffer, after which the myspawn flag
 * is cleared to release whoever is waiting on it.
 */
static void spawn_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t *buffer,
                       orte_rml_tag_t tag, void *cbdata)
{
    int32_t nvals = 1;
    orte_job_t *job = (orte_job_t *) cbdata;

    /* pull the returned jobid out of the message */
    opal_dss.unpack(buffer, &job->jobid, &nvals, ORTE_JOBID);

    /* let the waiter proceed */
    myspawn = false;
}