/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2008-2009 Sun Microsystems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 */

#include "orte_config.h"
#include "orte/constants.h"

#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <errno.h>
#include <string.h>
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include <fcntl.h>
#include <signal.h>
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif

#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/util/opal_sos.h"
#include "opal/mca/event/event.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/bit_ops.h"
#include "opal/class/opal_pointer_array.h"

#include "orte/util/show_help.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/ess/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/plm_base_rsh_support.h"
#include "orte/mca/plm/rshbase/plm_rshbase.h"

static int init(void);
static int spawn(orte_job_t *jdata);
static int terminate_orteds(void);
static int signal_job(orte_jobid_t jobid, int32_t signal);
static int finalize(void);

orte_plm_base_module_t orte_plm_rshbase_module = {
    init,
    orte_plm_base_set_hnp_name,
    spawn,
    NULL,
    orte_plm_base_orted_terminate_job,
    terminate_orteds,
    orte_plm_base_orted_kill_local_procs,
    signal_job,
    finalize
};

/* local global storage of timing variables */
static struct timeval joblaunchstart, joblaunchstop;

/* local global storage */
static int num_in_progress = 0;

/**
 * Init the module
 */
static int init(void)
{
    int rc;

    /* we were selected, so setup the launch agent */
    if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_launch_agent_setup(NULL, NULL))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
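
/*
 * When a forked rsh/ssh session exits abnormally and this process is not
 * the HNP, the callback below reports the failure back to the HNP on
 * ORTE_RML_TAG_REPORT_REMOTE_LAUNCH.  The packed buffer layout mirrors what
 * the callback constructs: a count (ORTE_STD_CNTR), a failure flag of 1
 * (OPAL_UINT8), and the vpid of the daemon that failed (ORTE_VPID).  On the
 * HNP itself, the daemon is simply marked ORTE_PROC_STATE_FAILED_TO_START.
 */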
/**
 * Callback on daemon exit.
 */
static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
    orte_std_cntr_t cnt = 1;
    uint8_t flag;
    orte_job_t *jdata;

    if (!WIFEXITED(status) || 0 != WEXITSTATUS(status)) {
        /* abnormal exit */
        if (!ORTE_PROC_IS_HNP) {
            /* we are not the HNP, so send a message to the HNP alerting it
             * to the failure
             */
            opal_buffer_t buf;
            orte_vpid_t *vpid = (orte_vpid_t*)cbdata;

            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s daemon %d failed with status %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (int)*vpid, WEXITSTATUS(status)));
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            opal_dss.pack(&buf, &cnt, 1, ORTE_STD_CNTR);
            flag = 1;
            opal_dss.pack(&buf, &flag, 1, OPAL_UINT8);
            opal_dss.pack(&buf, vpid, 1, ORTE_VPID);
            orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf,
                                 ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0);
            OBJ_DESTRUCT(&buf);
        } else {
            orte_proc_t *daemon = (orte_proc_t*)cbdata;

            jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);

            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s daemon %d failed with status %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (int)daemon->name.vpid, WEXITSTATUS(status)));

            /* note that this daemon failed */
            daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
            /* increment the #daemons terminated so we will exit properly */
            jdata->num_terminated++;
#if 0
            /* report that the daemon has failed so we can exit */
            orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START,
                                     NULL, ORTE_PROC_STATE_UNDEF, status);
#else
            /* JJH: Look into a better way of doing this. If we let the daemon
             * know, then it kills the job when we are trying to restart.
             */
            opal_output(0, "%s daemon %s failed. SKIPPING orte_plm_base_launch_failed()",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&daemon->name));
#endif
        }
    }

    /* release any waiting threads */
    num_in_progress--;
    OPAL_THREAD_LOCK(&mca_plm_rshbase_component.lock);
    if (num_in_progress <= mca_plm_rshbase_component.num_concurrent) {
        opal_condition_signal(&mca_plm_rshbase_component.cond);
    }
    OPAL_THREAD_UNLOCK(&mca_plm_rshbase_component.lock);
}
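
/*
 * Note on flow control: spawn() below counts each forked ssh session in
 * num_in_progress and blocks on the component condition variable once
 * num_concurrent sessions are outstanding.  The wait_daemon callback above
 * decrements the count and signals the condition variable, allowing the
 * next launch to proceed.
 */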
/**
 * Launch a daemon (bootproxy) on each node. The daemon will be responsible
 * for launching the application.
 */

/* When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static int spawn(orte_job_t *jdata)
{
    int rc;
    orte_job_map_t *map;
    orte_app_context_t *app;
    orte_node_t *node;
    int nnode;
    int argc;
    char **argv = NULL;
    char *prefix_dir;
    int node_name_index1;
    int proc_vpid_index;
    pid_t pid;
    bool failed_launch = true;
    orte_jobid_t active_job, failed_job;

    /* wait for any prior launch to complete */
    OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
    while (orte_plm_globals.spawn_in_progress) {
        opal_condition_wait(&orte_plm_globals.spawn_in_progress_cond,
                            &orte_plm_globals.spawn_lock);
    }
    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "released to spawn"));
    orte_plm_globals.spawn_in_progress = true;
    orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
    OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);

    if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
        /* if this is a request to launch a local slave,
         * then we will not be launching an orted - we will
         * directly ssh the slave process itself. No mapping
         * is performed to support this - the caller must
         * provide all the info required to launch the job,
         * including the target hosts
         */
        rc = orte_plm_base_local_slave_launch(jdata);
        OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
        orte_plm_globals.spawn_in_progress = false;
        OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
        return rc;
    }

    /* default to declaring the daemon launch as having failed */
    failed_job = ORTE_PROC_MY_NAME->jobid;

    /* if we are timing, record the start time */
    if (orte_timing) {
        gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
        joblaunchstart = orte_plm_globals.daemonlaunchstart;
    }

    /* setup the job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:rsh: launching job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* set the active jobid */
    active_job = jdata->jobid;

    /* Get the map for this job */
    if (NULL == (map = orte_rmaps.get_job_map(jdata->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* have all the daemons we need - launch app */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:rsh: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto launch_apps;
    }

    if ((0 < opal_output_get_verbosity(orte_plm_globals.output) ||
         orte_leave_session_attached) &&
        mca_plm_rshbase_component.num_concurrent < map->num_new_daemons) {
        /**
         * If we are in '--debug-daemons' we keep the ssh connection
         * alive for the span of the run. If we use this option
         * AND we launch on more than "num_concurrent" machines
         * then we will deadlock. No connections are terminated
         * until the job is complete, no job is started
         * since all the orteds are waiting for all the others
         * to come online, and the others are not launched because
         * we are waiting on those that have started to terminate
         * their ssh tunnels. :(
         * As we cannot run in this situation, pretty-print the error
         * and return an error code.
         */
        orte_show_help("help-plm-rsh.txt", "deadlock-params",
                       true, mca_plm_rshbase_component.num_concurrent,
                       map->num_new_daemons);
        rc = ORTE_ERR_FATAL;
        goto cleanup;
    }
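
    /*
     * Illustration (hypothetical numbers): with num_concurrent = 128 and
     * 200 new daemons under --debug-daemons, the 129th ssh session cannot
     * start until one of the first 128 exits - but none will exit, since
     * every connection is held open until the job completes, and the job
     * cannot start until all 200 daemons have launched.
     */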

    /*
     * After a discussion between Ralph & Jeff, we concluded that we
     * really are handling the prefix dir option incorrectly. It currently
     * is associated with an app_context, yet it really refers to the
     * location where OpenRTE/Open MPI is installed on a NODE. Fixing
     * this right now would involve significant change to orterun as well
     * as elsewhere, so we will intentionally leave this incorrect at this
     * point. The error, however, is identical to that seen in all prior
     * releases of OpenRTE/Open MPI, so our behavior is no worse than before.
     *
     * A note to fix this, along with ideas on how to do so, has been filed
     * on the project's Trac system under "feature enhancement".
     *
     * For now, default to the prefix_dir provided in the first app_context.
     * Since there always MUST be at least one app_context, we are safe in
     * doing this.
     */
    app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);

    /* we also need at least one node name so we can check what shell is
     * being used, if we have to
     */
    node = NULL;
    for (nnode = 0; nnode < map->nodes->size; nnode++) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
            break;
        }
    }
    if (NULL == node) {
        /* well, if there isn't even one node in the map, then we are hammered */
        rc = ORTE_ERR_FATAL;
        goto cleanup;
    }
    prefix_dir = app->prefix_dir;

    /* setup the launch */
    if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_setup_launch(&argc, &argv, node->name,
                                                             &node_name_index1,
                                                             &proc_vpid_index, prefix_dir))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* set the active jobid */
    active_job = jdata->jobid;

    /*
     * Iterate through each of the nodes
     */
    for (nnode = 0; nnode < map->nodes->size; nnode++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
            continue;
        }

        /* if this daemon already exists, don't launch it! */
        if (node->daemon_launched) {
            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s plm:rsh:launch daemon already exists on node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name));
            continue;
        }

        /* if the node's daemon has not been defined, then we
         * have an error!
         */
        if (NULL == node->daemon) {
            ORTE_ERROR_LOG(ORTE_ERR_FATAL);
            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s plm:rsh:launch daemon failed to be defined on node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name));
            rc = ORTE_ERR_FATAL;
            goto cleanup;
        }

        /* setup node name */
        free(argv[node_name_index1]);
        if (NULL != node->username && 0 != strlen(node->username)) {
            asprintf(&argv[node_name_index1], "%s@%s",
                     node->username, node->name);
        } else {
            argv[node_name_index1] = strdup(node->name);
        }

        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:rsh: launching on node %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name));

        /* fork a child to exec the rsh/ssh session */
        pid = fork();
        if (pid < 0) {
            ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
            rc = ORTE_ERR_SYS_LIMITS_CHILDREN;
            goto cleanup;
        }

        /* child */
        if (pid == 0) {
            /* do the ssh launch - this will exit if it fails */
            orte_plm_base_ssh_child(argc, argv, node->daemon->name.vpid, proc_vpid_index);
        } else { /* parent */
            /* indicate this daemon has been launched */
            node->daemon->state = ORTE_PROC_STATE_LAUNCHED;
            /* record the pid */
            node->daemon->pid = pid;

            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s plm:rsh: recording launch of daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&node->daemon->name)));

            /* setup callback on sigchild - wait until setup above is complete
             * as the callback can occur in the call to orte_wait_cb
             */
            orte_wait_cb(pid, orte_plm_rsh_wait_daemon, (void*)node->daemon);

            OPAL_THREAD_LOCK(&mca_plm_rshbase_component.lock);
            /* This situation can lead to a deadlock if '--debug-daemons' is set.
             * However, the deadlock condition is tested at the beginning of this
             * function, so we're quite confident it should not happen here.
             */
            if (num_in_progress++ >= mca_plm_rshbase_component.num_concurrent) {
                opal_condition_wait(&mca_plm_rshbase_component.cond,
                                    &mca_plm_rshbase_component.lock);
            }
            OPAL_THREAD_UNLOCK(&mca_plm_rshbase_component.lock);
        }
    }
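
    /* At this point an ssh session has been forked for every new daemon
     * (subject to the num_concurrent throttle above); spawn() now blocks
     * until each one phones home, or bails out if the callback reports
     * a failure.
     */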
    /* wait for daemons to callback */
    if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:rshbase: daemon launch failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }

launch_apps:
    /* if we get here, then the daemons succeeded, so any failure would now be
     * for the application job
     */
    failed_job = active_job;
    if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:rsh: launch of apps failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }

    /* wait for the launch to complete */
    OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
    while (!orte_plm_globals.spawn_complete) {
        opal_condition_wait(&orte_plm_globals.spawn_cond,
                            &orte_plm_globals.spawn_lock);
    }
    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "completed spawn for job %s",
                         ORTE_JOBID_PRINT(jdata->jobid)));
    orte_plm_globals.spawn_in_progress = false;
    opal_condition_broadcast(&orte_plm_globals.spawn_in_progress_cond);
    OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);

    /* get here if launch went okay */
    failed_launch = false;

    if (orte_timing) {
        if (0 != gettimeofday(&joblaunchstop, NULL)) {
            opal_output(0, "plm_rsh: could not obtain job launch stop time");
        } else {
            opal_output(0, "plm_rsh: total job launch time is %ld usec",
                        (joblaunchstop.tv_sec - joblaunchstart.tv_sec) * 1000000 +
                        (joblaunchstop.tv_usec - joblaunchstart.tv_usec));
        }
    }

cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
                                 NULL, ORTE_PROC_STATE_UNDEF,
                                 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    return rc;
}

/**
 * Terminate the orteds for a given job
 */
static int terminate_orteds(void)
{
    int rc;

    /* now tell them to die - we need them to "phone home", though,
     * so we can know that they have exited
     */
    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}

static int signal_job(orte_jobid_t jobid, int32_t signal)
{
    int rc;

    /* order them to pass this signal to their local procs */
    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}

static int finalize(void)
{
    int rc;

    /* cleanup any pending recvs */
    if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}