/* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2007 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ * * These symbols are in a file by themselves to provide nice linker * semantics. Since linkers generally pull in symbols by object * files, keeping these symbols as the only symbols in this file * prevents utility programs such as "ompi_info" from having to import * entire components just to query their version and parameters. */ #include "orte_config.h" #include "orte/constants.h" #include #ifdef HAVE_UNISTD_H #include #endif #include #include #ifdef HAVE_STRINGS_H #include #endif #ifdef HAVE_SYS_SELECT_H #include #endif #ifdef HAVE_SYS_TIME_H #include #endif #ifdef HAVE_SYS_TYPES_H #include #endif #ifdef HAVE_SYS_STAT_H #include #endif #ifdef HAVE_SYS_WAIT_H #include #endif #include #include #ifdef HAVE_PWD_H #include #endif #include "opal/mca/installdirs/installdirs.h" #include "opal/mca/base/mca_base_param.h" #include "opal/util/if.h" #include "opal/util/os_path.h" #include "opal/util/path.h" #include "opal/event/event.h" #include "opal/util/show_help.h" #include "opal/util/argv.h" #include "opal/util/opal_environ.h" #include "opal/util/output.h" #include "opal/util/trace.h" #include "opal/util/basename.h" #include "opal/util/bit_ops.h" #include "orte/util/session_dir.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wakeup.h" #include "orte/runtime/orte_globals.h" #include "orte/util/name_fns.h" #include "orte/util/nidmap.h" #include "orte/mca/rml/rml.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ras/ras_types.h" #include "orte/mca/rmaps/rmaps.h" #include "orte/mca/routed/routed.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/odls/odls.h" #include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/rsh/plm_rsh.h" #if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS static int orte_plm_rsh_launch_threaded(orte_job_t *jdata); #endif static int remote_spawn(opal_buffer_t *launch); orte_plm_base_module_t orte_plm_rsh_module = { orte_plm_rsh_init, orte_plm_base_set_hnp_name, #if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS orte_plm_rsh_launch_threaded, #else orte_plm_rsh_launch, #endif remote_spawn, orte_plm_rsh_terminate_job, orte_plm_rsh_terminate_orteds, orte_plm_rsh_signal_job, orte_plm_rsh_finalize }; typedef enum { ORTE_PLM_RSH_SHELL_BASH = 0, ORTE_PLM_RSH_SHELL_ZSH, ORTE_PLM_RSH_SHELL_TCSH, ORTE_PLM_RSH_SHELL_CSH, ORTE_PLM_RSH_SHELL_KSH, ORTE_PLM_RSH_SHELL_SH, ORTE_PLM_RSH_SHELL_UNKNOWN } orte_plm_rsh_shell_t; /* These strings *must* follow the same order as the enum ORTE_PLM_RSH_SHELL_* */ static const char * orte_plm_rsh_shell_name[] = { "bash", "zsh", "tcsh", /* tcsh has to be first otherwise strstr finds csh */ "csh", "ksh", "sh", "unknown" }; /* * Local functions */ static void set_handler_default(int sig); static orte_plm_rsh_shell_t find_shell(char *shell); static int find_children(int rank, int parent, int me, int num_procs); static int daemon_callback(orte_std_cntr_t num_children); /* local global storage of timing variables */ static struct timeval joblaunchstart, joblaunchstop; /* local global storage */ static orte_jobid_t active_job=ORTE_JOBID_INVALID; static orte_job_t *jdatorted; static orte_proc_t **pdatorted; static opal_buffer_t *launch_cmd; /** * Init the module */ int orte_plm_rsh_init(void) { int rc; if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); } return rc; } /** * Check the Shell variable on the specified node */ static int orte_plm_rsh_probe(char *nodename, orte_plm_rsh_shell_t *shell) { char ** argv; int argc, rc = ORTE_SUCCESS, i; int fd[2]; pid_t pid; char outbuf[4096]; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: going to check SHELL variable on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); *shell = ORTE_PLM_RSH_SHELL_UNKNOWN; if (pipe(fd)) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: pipe failed with errno=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno)); return ORTE_ERR_IN_ERRNO; } if ((pid = fork()) < 0) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: fork failed with errno=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno)); return ORTE_ERR_IN_ERRNO; } else if (pid == 0) { /* child */ if (dup2(fd[1], 1) < 0) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: dup2 failed with errno=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno)); exit(01); } /* Build argv array */ argv = opal_argv_copy(mca_plm_rsh_component.agent_argv); argc = mca_plm_rsh_component.agent_argc; opal_argv_append(&argc, &argv, nodename); opal_argv_append(&argc, &argv, "echo $SHELL"); execvp(argv[0], argv); exit(errno); } if (close(fd[1])) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: close failed with errno=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno)); return ORTE_ERR_IN_ERRNO; } { ssize_t ret = 1; char* ptr = outbuf; size_t outbufsize = sizeof(outbuf); do { ret = read (fd[0], ptr, outbufsize-1); if (ret < 0) { if (errno == EINTR) continue; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: Unable to detect the remote shell (error %s)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(errno))); rc = ORTE_ERR_IN_ERRNO; break; } if( outbufsize > 1 ) { outbufsize -= ret; ptr += ret; } } while( 0 != ret ); *ptr = '\0'; } close(fd[0]); if( outbuf[0] != '\0' ) { char *sh_name = rindex(outbuf, '/'); if( NULL != sh_name ) { sh_name++; /* skip '/' */ /* We cannot use "echo -n $SHELL" because -n is not portable. Therefore * we have to remove the "\n" */ if ( sh_name[strlen(sh_name)-1] == '\n' ) { sh_name[strlen(sh_name)-1] = '\0'; } /* Search for the substring of known shell-names */ for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name)/ sizeof(orte_plm_rsh_shell_name[0])); i++) { if ( 0 == strcmp(sh_name, orte_plm_rsh_shell_name[i]) ) { *shell = i; break; } } } } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: node %s has SHELL: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename, (ORTE_PLM_RSH_SHELL_UNKNOWN == *shell) ? "UNHANDLED" : orte_plm_rsh_shell_name[*shell])); return rc; } static int total_num_daemons_calledback; static bool total_callback_failed; static void process_remote_launch_report(int fd, short event, void *data) { orte_message_event_t *mev = (orte_message_event_t*)data; opal_buffer_t *buffer = mev->buffer; orte_vpid_t vpid=ORTE_VPID_INVALID; orte_std_cntr_t cnt, numd, i; int rc; uint8_t flag; char *rml_uri; orte_process_name_t daemon; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:ssh:report_remote_launch from daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mev->sender))); /* unpack number of daemons being reported */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &numd, &cnt, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); total_callback_failed = true; return; } /* unpack flag that indicates if any failed */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &cnt, OPAL_UINT8))) { ORTE_ERROR_LOG(rc); total_callback_failed = true; return; } /* did any fail? */ if (0 != flag) { /* unpack the failed vpid */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); total_callback_failed = true; } if (ORTE_VPID_INVALID != vpid) { /* note that this daemon failed */ pdatorted[vpid]->state = ORTE_PROC_STATE_FAILED_TO_START; } /* report that the daemon has failed so we can exit */ orte_plm_base_launch_failed(active_job, true, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START); return; } /* get their uri info */ for (i=0; i < numd; i++) { cnt=1; opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING); orte_rml.set_contact_info(rml_uri); orte_rml_base_parse_uris(rml_uri, &daemon, NULL); pdatorted[daemon.vpid]->rml_uri = strdup(rml_uri); orte_routed.update_route(&daemon, &daemon); } /* update num recvd */ total_num_daemons_calledback += numd; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:ssh:report_remote_launch reported %d for total of %d daemons reported", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)numd, total_num_daemons_calledback)); } /* * Need a callback function to report failure of a remote daemon's launch */ static void report_remote_launch(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) { int rc; /* don't process this right away - we need to get out of the recv before * we process the message as it may ask us to do something that involves * more messaging! Instead, setup an event so that the message gets processed * as soon as we leave the recv. * * The macro makes a copy of the buffer, which we release when processed - the incoming * buffer, however, is NOT released here, although its payload IS transferred * to the message buffer for later processing */ ORTE_MESSAGE_EVENT(sender, buffer, tag, process_remote_launch_report); /* reissue the recv */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, ORTE_RML_NON_PERSISTENT, report_remote_launch, NULL); if (rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); total_callback_failed = true; } } /** * Callback on daemon exit. */ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata) { unsigned long deltat; orte_std_cntr_t cnt=1; uint8_t flag; if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */ /* if we are not the HNP, send a message to the HNP alerting it * to the failure */ if (!orte_process_info.hnp) { opal_buffer_t buf; orte_vpid_t *vpid=(orte_vpid_t*)cbdata; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s daemon %d failed with status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)*vpid, WEXITSTATUS(status))); OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.pack(&buf, &cnt, 1, ORTE_STD_CNTR); flag = 1; opal_dss.pack(&buf, &flag, 1, OPAL_UINT8); opal_dss.pack(&buf, vpid, 1, ORTE_VPID); orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0); OBJ_DESTRUCT(&buf); } else { orte_proc_t *daemon=(orte_proc_t*)cbdata; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s daemon %d failed with status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)daemon->name.vpid, WEXITSTATUS(status))); /* note that this daemon failed */ daemon->state = ORTE_PROC_STATE_FAILED_TO_START; /* report that the daemon has failed so we can exit */ orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START); } } /* release any waiting threads */ OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock); if (mca_plm_rsh_component.num_children-- >= mca_plm_rsh_component.num_concurrent || mca_plm_rsh_component.num_children == 0) { opal_condition_signal(&mca_plm_rsh_component.cond); } if (orte_timing && mca_plm_rsh_component.num_children == 0) { if (0 != gettimeofday(&joblaunchstop, NULL)) { opal_output(0, "plm_rsh: could not obtain job launch stop time"); } else { deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 + (joblaunchstop.tv_usec - joblaunchstart.tv_usec); opal_output(0, "plm_rsh: total time to launch job is %lu usec", deltat); } } OPAL_THREAD_UNLOCK(&mca_plm_rsh_component.lock); } static int setup_launch(int *argcptr, char ***argvptr, char *nodename, int *node_name_index1, int *node_name_index2, int *local_exec_index, int *proc_vpid_index, char **lib_base, char **bin_base, bool *remote_sh, bool *remote_csh) { struct passwd *p; int argc; char **argv; char *param; orte_plm_rsh_shell_t shell; bool local_sh = false, local_csh = false; int rc; /* What is our local shell? */ p = getpwuid(getuid()); if( NULL == p ) { /* This user is unknown to the system. Therefore, there is no reason we * spawn whatsoever in his name. Give up with a HUGE error message. */ opal_show_help( "help-plm-rsh.txt", "unknown-user", true, (int)getuid() ); return ORTE_ERR_FATAL; } else { param = p->pw_shell; shell = find_shell(p->pw_shell); } /* If we didn't find it in getpwuid(), try looking at the $SHELL environment variable (see https://svn.open-mpi.org/trac/ompi/ticket/1060) */ if (ORTE_PLM_RSH_SHELL_UNKNOWN == shell && NULL != (param = getenv("SHELL"))) { shell = find_shell(param); } switch (shell) { case ORTE_PLM_RSH_SHELL_SH: /* fall through */ case ORTE_PLM_RSH_SHELL_KSH: /* fall through */ case ORTE_PLM_RSH_SHELL_ZSH: /* fall through */ case ORTE_PLM_RSH_SHELL_BASH: local_sh = true; break; case ORTE_PLM_RSH_SHELL_TCSH: /* fall through */ case ORTE_PLM_RSH_SHELL_CSH: local_csh = true; break; default: opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n", (NULL != param) ? param : "unknown"); *remote_sh = true; break; } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: local csh: %d, local sh: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_csh, local_sh)); /* What is our remote shell? */ if (mca_plm_rsh_component.assume_same_shell) { *remote_sh = local_sh; *remote_csh = local_csh; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: assuming same remote shell as local shell", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { orte_plm_rsh_shell_t shell; rc = orte_plm_rsh_probe(nodename, &shell); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } switch (shell) { case ORTE_PLM_RSH_SHELL_SH: /* fall through */ case ORTE_PLM_RSH_SHELL_KSH: /* fall through */ case ORTE_PLM_RSH_SHELL_ZSH: /* fall through */ case ORTE_PLM_RSH_SHELL_BASH: *remote_sh = true; break; case ORTE_PLM_RSH_SHELL_TCSH: /* fall through */ case ORTE_PLM_RSH_SHELL_CSH: *remote_csh = true; break; default: opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n"); *remote_sh = true; } } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: remote csh: %d, remote sh: %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *remote_csh, *remote_sh)); /* * Build argv array */ argv = opal_argv_copy(mca_plm_rsh_component.agent_argv); argc = mca_plm_rsh_component.agent_argc; *node_name_index1 = argc; opal_argv_append(&argc, &argv, "