/* * Copyright (c) 2004-2008 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ * * These symbols are in a file by themselves to provide nice linker * semantics. Since linkers generally pull in symbols by object * files, keeping these symbols as the only symbols in this file * prevents utility programs such as "ompi_info" from having to import * entire components just to query their version and parameters. */ #include "orte_config.h" #include "orte/constants.h" #include #ifdef HAVE_UNISTD_H #include #endif #include #include #ifdef HAVE_STRINGS_H #include #endif #ifdef HAVE_SYS_SELECT_H #include #endif #ifdef HAVE_SYS_TIME_H #include #endif #ifdef HAVE_SYS_TYPES_H #include #endif #ifdef HAVE_SYS_STAT_H #include #endif #ifdef HAVE_SYS_WAIT_H #include #endif #include #include #ifdef HAVE_PWD_H #include #endif #include "opal/mca/installdirs/installdirs.h" #include "opal/mca/base/mca_base_param.h" #include "opal/util/if.h" #include "opal/util/output.h" #include "opal/util/os_path.h" #include "opal/util/path.h" #include "opal/event/event.h" #include "opal/util/argv.h" #include "opal/util/opal_environ.h" #include "orte/util/show_help.h" #include "opal/util/basename.h" #include "opal/util/opal_environ.h" #include "orte/util/proc_info.h" #include "orte/util/univ_info.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/params.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ras/ras_types.h" #include "orte/mca/rmaps/rmaps.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/submit/plm_submit.h" static int orte_plm_submit_init(void); #if OPAL_HAVE_POSIX_THREADS && OPAL_THREADS_HAVE_DIFFERENT_PIDS && OPAL_ENABLE_PROGRESS_THREADS static int orte_plm_submit_launch_threaded(orte_jobid_t jobid); #endif orte_plm_base_module_t orte_plm_submit_module = { orte_plm_submit_init, orte_plm_base_set_hnp_name, #if OPAL_HAVE_POSIX_THREADS && OPAL_THREADS_HAVE_DIFFERENT_PIDS && OPAL_ENABLE_PROGRESS_THREADS orte_plm_submit_launch_threaded, #else orte_plm_submit_launch, #endif NULL, orte_plm_submit_terminate_job, orte_plm_submit_terminate_orteds, orte_plm_submit_signal_job, orte_plm_submit_finalize }; static void set_handler_default(int sig); enum { ORTE_PLM_submit_SHELL_BASH = 0, ORTE_PLM_submit_SHELL_ZSH, ORTE_PLM_submit_SHELL_TCSH, ORTE_PLM_submit_SHELL_CSH, ORTE_PLM_submit_SHELL_KSH, ORTE_PLM_submit_SHELL_SH, ORTE_PLM_submit_SHELL_UNKNOWN }; typedef int orte_plm_submit_shell; static const char * orte_plm_submit_shell_name[] = { "bash", "zsh", "tcsh", /* tcsh has to be first otherwise strstr finds csh */ "csh", "ksh", "sh", "unknown" }; /* local global storage of timing variables */ static struct timeval joblaunchstart, joblaunchstop; /* global storage of active jobid being launched */ static orte_jobid_t active_job=ORTE_JOBID_INVALID; /* * Init module */ static int orte_plm_submit_init(void) { int rc; if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); } return rc; } /** * Check the Shell variable on the specified node */ static int orte_plm_submit_probe(orte_node_t *node, orte_plm_submit_shell * shell) { char ** argv; int argc, rc = ORTE_SUCCESS, i; int fd[2]; pid_t pid; char outbuf[4096]; if (mca_plm_submit_component.debug) { opal_output(0, "plm:submit: going to check SHELL variable on node %s\n", node->name); } *shell = ORTE_PLM_submit_SHELL_UNKNOWN; if (pipe(fd)) { opal_output(0, "plm:submit: pipe failed with errno=%d\n", errno); return ORTE_ERR_IN_ERRNO; } if ((pid = fork()) < 0) { opal_output(0, "plm:submit: fork failed with errno=%d\n", errno); return ORTE_ERR_IN_ERRNO; } else if (pid == 0) { /* child */ if (dup2(fd[1], 1) < 0) { opal_output(0, "plm:submit: dup2 failed with errno=%d\n", errno); exit(01); } /* Build argv array */ argv = opal_argv_copy(mca_plm_submit_component.agent_argv); argc = mca_plm_submit_component.agent_argc; opal_argv_append(&argc, &argv, node->name); opal_argv_append(&argc, &argv, "echo $SHELL"); execvp(argv[0], argv); exit(errno); } if (close(fd[1])) { opal_output(0, "plm:submit: close failed with errno=%d\n", errno); return ORTE_ERR_IN_ERRNO; } { ssize_t ret = 1; char* ptr = outbuf; size_t outbufsize = sizeof(outbuf); do { ret = read (fd[0], ptr, outbufsize-1); if (ret < 0) { if (errno == EINTR) continue; opal_output( 0, "Unable to detect the remote shell (error %s)\n", strerror(errno) ); rc = ORTE_ERR_IN_ERRNO; break; } if( outbufsize > 1 ) { outbufsize -= ret; ptr += ret; } } while( 0 != ret ); *ptr = '\0'; } close(fd[0]); if( outbuf[0] != '\0' ) { char *sh_name = rindex(outbuf, '/'); if( NULL != sh_name ) { sh_name++; /* skip '/' */ /* We cannot use "echo -n $SHELL" because -n is not portable. Therefore * we have to remove the "\n" */ if ( sh_name[strlen(sh_name)-1] == '\n' ) { sh_name[strlen(sh_name)-1] = '\0'; } /* Search for the substring of known shell-names */ for (i = 0; i < (int)(sizeof (orte_plm_submit_shell_name)/ sizeof(orte_plm_submit_shell_name[0])); i++) { if ( 0 == strcmp(sh_name, orte_plm_submit_shell_name[i]) ) { *shell = i; break; } } } } if (mca_plm_submit_component.debug) { if( ORTE_PLM_submit_SHELL_UNKNOWN == *shell ) { opal_output(0, "plm:submit: node:%s has unhandled SHELL\n", node->name); } else { opal_output(0, "plm:submit: node:%s has SHELL: %s\n", node->name, orte_plm_submit_shell_name[*shell]); } } return rc; } /** * Fill the exec_path variable with the directory to the orted */ static int orte_plm_submit_fill_exec_path ( char ** exec_path) { struct stat buf; asprintf(exec_path, "%s/orted", opal_install_dirs.bindir); if (0 != stat(*exec_path, &buf)) { char *path = getenv("PATH"); if (NULL == path) { path = ("PATH is empty!"); } orte_show_help("help-plm-submit.txt", "no-local-orted", true, path, opal_install_dirs.bindir); return ORTE_ERR_NOT_FOUND; } return ORTE_SUCCESS; } /** * Callback on daemon exit. */ static void orte_plm_submit_wait_daemon(pid_t pid, int status, void* cbdata) { unsigned long deltat; if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* tell the user something went wrong */ opal_output(0, "ERROR: A daemon failed to start as expected."); opal_output(0, "ERROR: There may be more information available from"); opal_output(0, "ERROR: the remote shell (see above)."); if (WIFEXITED(status)) { opal_output(0, "ERROR: The daemon exited unexpectedly with status %d.", WEXITSTATUS(status)); } else if (WIFSIGNALED(status)) { #ifdef WCOREDUMP if (WCOREDUMP(status)) { opal_output(0, "The daemon received a signal %d (with core).", WTERMSIG(status)); } else { opal_output(0, "The daemon received a signal %d.", WTERMSIG(status)); } #else opal_output(0, "The daemon received a signal %d.", WTERMSIG(status)); #endif /* WCOREDUMP */ } else { opal_output(0, "No extra status information is available: %d.", status); } /* The usual reasons for ssh to exit abnormally all are a pretty good indication that the child processes aren't going to start up properly. Set the job state to indicate we failed to launch so orterun's exit status will be non-zero and forcibly terminate the job so orterun can exit */ orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START); } /* if abnormal exit */ /* release any waiting threads */ OPAL_THREAD_LOCK(&mca_plm_submit_component.lock); if (mca_plm_submit_component.num_children-- >= mca_plm_submit_component.num_concurrent || mca_plm_submit_component.num_children == 0) { opal_condition_signal(&mca_plm_submit_component.cond); } if (mca_plm_submit_component.timing && mca_plm_submit_component.num_children == 0) { if (0 != gettimeofday(&joblaunchstop, NULL)) { opal_output(0, "plm_submit: could not obtain job launch stop time"); } else { deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 + (joblaunchstop.tv_usec - joblaunchstart.tv_usec); opal_output(0, "plm_submit: total time to launch job is %lu usec", deltat); } } OPAL_THREAD_UNLOCK(&mca_plm_submit_component.lock); } /** * Launch a daemon (bootproxy) on each node. The daemon will be responsible * for launching the application. */ /* When working in this function, ALWAYS jump to "cleanup" if * you encounter an error so that orterun will be woken up and * the job can cleanly terminate */ int orte_plm_submit_launch(orte_job_t *jdata) { orte_job_map_t *map; orte_std_cntr_t num_nodes; int node_name_index1; int proc_vpid_index; int local_exec_index, local_exec_index_end; char *vpid_string = NULL; char *param; char **argv = NULL; char *prefix_dir; int argc; int rc; sigset_t sigs; struct passwd *p; bool remote_sh = false, remote_csh = false; bool local_sh = false, local_csh = false; char *lib_base = NULL, *bin_base = NULL; bool failed_launch = true; orte_app_context_t **apps; orte_node_t **nodes; orte_std_cntr_t nnode; if (mca_plm_submit_component.timing) { if (0 != gettimeofday(&joblaunchstart, NULL)) { opal_output(0, "plm_submit: could not obtain start time"); joblaunchstart.tv_sec = 0; joblaunchstart.tv_usec = 0; } } /* create a jobid for this job */ if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) { ORTE_ERROR_LOG(rc); goto cleanup; } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:submit: launching job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* setup the job */ if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* set the active jobid */ active_job = jobid; /* Get the map for this job */ if (NULL == (map = orte_rmaps.get_job_map(active_job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; goto cleanup; } apps = (orte_app_context_t**)jdata->apps->addr; nodes = (orte_node_t**)map->nodes->addr; /* account for any reuse of daemons */ if (ORTE_SUCCESS != (rc = orte_plm_base_launch_on_existing_daemons(map))) { ORTE_ERROR_LOG(rc); goto cleanup; } num_nodes = map->num_new_daemons; if (0 == num_nodes) { /* have all the daemons we need - launch app */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:submit: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto launch_apps; } if (mca_plm_submit_component.debug_daemons && mca_plm_submit_component.num_concurrent < num_nodes) { /** * If we are in '--debug-daemons' we keep the ssh connection * alive for the span of the run. If we use this option * AND we launch on more than "num_concurrent" machines * then we will deadlock. No connections are terminated * until the job is complete, no job is started * since all the orteds are waiting for all the others * to come online, and the others ore not launched because * we are waiting on those that have started to terminate * their ssh tunnels. :( * As we cannot run in this situation, pretty print the error * and return an error code. */ orte_show_help("help-plm-submit.txt", "deadlock-params", true, mca_plm_submit_component.num_concurrent, num_nodes); rc = ORTE_ERR_FATAL; goto cleanup; } /* * After a discussion between Ralph & Jeff, we concluded that we * really are handling the prefix dir option incorrectly. It currently * is associated with an app_context, yet it really refers to the * location where OpenRTE/Open MPI is installed on a NODE. Fixing * this right now would involve significant change to orterun as well * as elsewhere, so we will intentionally leave this incorrect at this * point. The error, however, is identical to that seen in all prior * releases of OpenRTE/Open MPI, so our behavior is no worse than before. * * A note to fix this, along with ideas on how to do so, has been filed * on the project's Trac system under "feature enhancement". * * For now, default to the prefix_dir provided in the first app_context. * Since there always MUST be at least one app_context, we are safe in * doing this. */ prefix_dir = apps[0]->prefix_dir; /* What is our local shell? */ p = getpwuid(getuid()); if( NULL == p ) { /* This user is unknown to the system. Therefore, there is no reason we * spawn whatsoever in his name. Give up with a HUGE error message. */ orte_show_help( "help-plm-submit.txt", "unknown-user", true, (int)getuid() ); rc = ORTE_ERR_FATAL; goto cleanup; } else { int i; char *sh_name = NULL; sh_name = rindex(p->pw_shell, '/'); sh_name++; /* skip the '\' */ for (i = 0; i < (int)(sizeof (orte_plm_submit_shell_name)/ sizeof(orte_plm_submit_shell_name[0])); i++) { if ( 0 == strcmp(sh_name, orte_plm_submit_shell_name[i]) ) { switch (i) { case ORTE_PLM_submit_SHELL_SH: /* fall through */ case ORTE_PLM_submit_SHELL_KSH: /* fall through */ case ORTE_PLM_submit_SHELL_ZSH: /* fall through */ case ORTE_PLM_submit_SHELL_BASH: local_sh = true; break; case ORTE_PLM_submit_SHELL_TCSH: /* fall through */ case ORTE_PLM_submit_SHELL_CSH: local_csh = true; break; /* The match has been done, there is no need for a default case here */ } /* I did match one of the known shells, so now we're done with the shell detection */ break; } } if ( i == ORTE_PLM_submit_SHELL_UNKNOWN ) { opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n", sh_name); local_sh = true; } if (mca_plm_submit_component.debug) { opal_output(0, "plm:submit: local csh: %d, local sh: %d\n", local_csh, local_sh); } } /* What is our remote shell? */ if (mca_plm_submit_component.assume_same_shell) { remote_sh = local_sh; remote_csh = local_csh; if (mca_plm_submit_component.debug) { opal_output(0, "plm:submit: assuming same remote shell as local shell"); } } else { orte_plm_submit_shell shell; rc = orte_plm_submit_probe(nodes[0], &shell); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } switch (shell) { case ORTE_PLM_submit_SHELL_SH: /* fall through */ case ORTE_PLM_submit_SHELL_KSH: /* fall through */ case ORTE_PLM_submit_SHELL_BASH: remote_sh = true; break; case ORTE_PLM_submit_SHELL_TCSH: /* fall through */ case ORTE_PLM_submit_SHELL_CSH: remote_csh = true; break; default: opal_output(0, "WARNING: submit probe returned unhandled shell:%s assuming bash\n", orte_plm_submit_shell_name[shell]); remote_sh = true; } } if (mca_plm_submit_component.debug) { opal_output(0, "plm:submit: remote csh: %d, remote sh: %d\n", remote_csh, remote_sh); } /* * Build argv array */ argv = opal_argv_copy(mca_plm_submit_component.agent_argv); argc = mca_plm_submit_component.agent_argc; node_name_index1 = argc; opal_argv_append(&argc, &argv, "