2006-08-11 01:46:52 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
2007-06-13 19:30:18 +04:00
|
|
|
* Copyright (c) 2006-2007 Sun Microsystems, Inc. All rights reserved.
|
2006-08-11 01:46:52 +04:00
|
|
|
* Use is subject to license terms.
|
2007-06-05 07:03:59 +04:00
|
|
|
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
|
|
|
* reserved.
|
2006-08-11 01:46:52 +04:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*
|
|
|
|
* These symbols are in a file by themselves to provide nice linker
|
|
|
|
* semantics. Since linkers generally pull in symbols by object
|
|
|
|
* files, keeping these symbols as the only symbols in this file
|
|
|
|
* prevents utility programs such as "ompi_info" from having to import
|
|
|
|
* entire components just to query their version and parameters.
|
|
|
|
*/
|
|
|
|
/**
|
|
|
|
* @file:
|
|
|
|
* Part of the gridengine launcher.
|
|
|
|
* See pls_gridengine.h for an overview of how it works.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
#include "orte/orte_constants.h"
|
2006-08-11 01:46:52 +04:00
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
#include <errno.h>
|
|
|
|
#include <string.h>
|
|
|
|
#ifdef HAVE_SYS_SELECT_H
|
|
|
|
#include <sys/select.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_TIME_H
|
|
|
|
#include <sys/time.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
|
|
#include <sys/types.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_STAT_H
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_WAIT_H
|
|
|
|
#include <sys/wait.h>
|
|
|
|
#endif
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include <signal.h>
|
|
|
|
#ifdef HAVE_PWD_H
|
|
|
|
#include <pwd.h>
|
|
|
|
#endif
|
|
|
|
|
2007-04-21 04:15:05 +04:00
|
|
|
#include "opal/mca/installdirs/installdirs.h"
|
2006-08-11 01:46:52 +04:00
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "opal/util/if.h"
|
2006-08-24 00:40:01 +04:00
|
|
|
#include "opal/util/os_path.h"
|
2006-08-11 01:46:52 +04:00
|
|
|
#include "opal/util/path.h"
|
|
|
|
#include "opal/event/event.h"
|
|
|
|
#include "opal/util/show_help.h"
|
|
|
|
#include "opal/util/argv.h"
|
|
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
#include "opal/util/basename.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
|
Bring in the generalized xcast communication system along with the correspondingly revised orted launch. I will send a message out to developers explaining the basic changes. In brief:
1. generalize orte_rml.xcast to become a general broadcast-like messaging system. Messages can now be sent to any tag on the daemons or processes. Note that any message sent via xcast will be delivered to ALL processes in the specified job - you don't get to pick and choose. At a later date, we will introduce an augmented capability that will use the daemons as relays, but will allow you to send to a specified array of process names.
2. extended orte_rml.xcast so it supports more scalable message routing methodologies. At the moment, we support three: (a) direct, which sends the message directly to all recipients; (b) linear, which sends the message to the local daemon on each node, which then relays it to its own local procs; and (c) binomial, which sends the message via a binomial algo across all the daemons, each of which then relays to its own local procs. The crossover points between the algos are adjustable via MCA param, or you can simply demand that a specific algo be used.
3. orteds no longer exhibit two types of behavior: bootproxy or VM. Orteds now always behave like they are part of a virtual machine - they simply launch a job if mpirun tells them to do so. This is another step towards creating an "orteboot" functionality, but also provided a clean system for supporting message relaying.
Note one major impact of this commit: multiple daemons on a node cannot be supported any longer! Only a single daemon/node is now allowed.
This commit is known to break support for the following environments: POE, Xgrid, Xcpu, Windows. It has been tested on rsh, SLURM, and Bproc. Modifications for TM support have been made but could not be verified due to machine problems at LANL. Modifications for SGE have been made but could not be verified. The developers for the non-verified environments will be separately notified along with suggestions on how to fix the problems.
This commit was SVN r15007.
2007-06-12 17:28:54 +04:00
|
|
|
#include "orte/runtime/params.h"
|
2006-08-11 01:46:52 +04:00
|
|
|
#include "orte/util/univ_info.h"
|
|
|
|
#include "orte/util/session_dir.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
#include "orte/util/sys_info.h"
|
2006-08-11 01:46:52 +04:00
|
|
|
#include "orte/runtime/orte_wait.h"
|
2007-04-26 19:08:37 +04:00
|
|
|
#include "orte/runtime/orte_wakeup.h"
|
2006-08-11 01:46:52 +04:00
|
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
#include "orte/mca/gpr/gpr.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
#include "orte/mca/ras/ras_types.h"
|
2006-10-07 19:45:24 +04:00
|
|
|
#include "orte/mca/rmaps/rmaps.h"
|
2006-08-16 20:35:09 +04:00
|
|
|
#include "orte/mca/smr/smr.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
#include "orte/mca/pls/pls.h"
|
2006-11-16 18:11:45 +03:00
|
|
|
#include "orte/mca/pls/base/base.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
#include "orte/mca/pls/base/pls_private.h"
|
2006-08-11 01:46:52 +04:00
|
|
|
#include "orte/mca/pls/gridengine/pls_gridengine.h"
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
orte_pls_base_module_t orte_pls_gridengine_module = {
|
|
|
|
orte_pls_gridengine_launch_job,
|
2006-08-11 01:46:52 +04:00
|
|
|
orte_pls_gridengine_terminate_job,
|
2006-09-15 01:29:51 +04:00
|
|
|
orte_pls_gridengine_terminate_orteds,
|
2006-08-11 01:46:52 +04:00
|
|
|
orte_pls_gridengine_terminate_proc,
|
|
|
|
orte_pls_gridengine_signal_job,
|
|
|
|
orte_pls_gridengine_signal_proc,
|
|
|
|
orte_pls_gridengine_finalize
|
|
|
|
};
|
|
|
|
|
|
|
|
static void set_handler_default(int sig);
|
|
|
|
|
2007-04-26 19:08:37 +04:00
|
|
|
/* global storage of active jobid being launched */
|
|
|
|
static orte_jobid_t active_job=ORTE_JOBID_INVALID;
|
|
|
|
|
2006-08-11 01:46:52 +04:00
|
|
|
/**
|
|
|
|
* Fill the orted_path variable with the directory to the orted
|
|
|
|
*/
|
|
|
|
static int orte_pls_gridengine_fill_orted_path(char** orted_path)
|
|
|
|
{
|
|
|
|
struct stat buf;
|
|
|
|
|
2007-04-21 04:15:05 +04:00
|
|
|
asprintf(orted_path, "%s/orted", opal_install_dirs.bindir);
|
2006-08-11 01:46:52 +04:00
|
|
|
if (0 != stat(*orted_path, &buf)) {
|
|
|
|
char *path = getenv("PATH");
|
|
|
|
if (NULL == path) {
|
|
|
|
path = ("PATH is empty!");
|
|
|
|
}
|
|
|
|
opal_show_help("help-pls-gridengine.txt", "no-local-orted",
|
2007-04-21 04:15:05 +04:00
|
|
|
true, path, opal_install_dirs.bindir);
|
2006-08-11 01:46:52 +04:00
|
|
|
return ORTE_ERR_NOT_FOUND;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Callback on daemon exit.
|
|
|
|
*/
|
|
|
|
static void orte_pls_gridengine_wait_daemon(pid_t pid, int status, void* cbdata)
|
|
|
|
{
|
|
|
|
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
|
2006-09-15 01:29:51 +04:00
|
|
|
/* tell the user something went wrong. We need to do this BEFORE we
|
|
|
|
* set the state to ABORTED as that action will cause a trigger to
|
|
|
|
* fire that will kill the job before any output would get printed!
|
|
|
|
*/
|
2007-04-26 19:08:37 +04:00
|
|
|
opal_output(0, "ERROR: A daemon failed to start as expected.");
|
2006-08-11 01:46:52 +04:00
|
|
|
opal_output(0, "ERROR: There may be more information available from");
|
|
|
|
opal_output(0, "ERROR: the 'qstat -t' command on the Grid Engine tasks.");
|
|
|
|
opal_output(0, "ERROR: If the problem persists, please restart the");
|
|
|
|
opal_output(0, "ERROR: Grid Engine PE job");
|
|
|
|
if (WIFEXITED(status)) {
|
|
|
|
opal_output(0, "ERROR: The daemon exited unexpectedly with status %d.",
|
2006-09-15 01:29:51 +04:00
|
|
|
WEXITSTATUS(status));
|
2006-08-11 01:46:52 +04:00
|
|
|
} else if (WIFSIGNALED(status)) {
|
|
|
|
#ifdef WCOREDUMP
|
|
|
|
if (WCOREDUMP(status)) {
|
|
|
|
opal_output(0, "The daemon received a signal %d (with core).",
|
2006-09-15 01:29:51 +04:00
|
|
|
WTERMSIG(status));
|
2006-08-11 01:46:52 +04:00
|
|
|
} else {
|
|
|
|
opal_output(0, "The daemon received a signal %d.", WTERMSIG(status));
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
opal_output(0, "The daemon received a signal %d.", WTERMSIG(status));
|
|
|
|
#endif /* WCOREDUMP */
|
|
|
|
} else {
|
|
|
|
opal_output(0, "No extra status information is available: %d.", status);
|
|
|
|
}
|
2006-09-15 01:29:51 +04:00
|
|
|
|
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is know to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
|
|
|
/* report that the daemon has failed so we break out of the daemon
|
|
|
|
* callback receive and can exit
|
2007-07-12 23:53:18 +04:00
|
|
|
*/
|
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is know to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
|
|
|
orte_pls_base_daemon_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Launch a daemon (bootproxy) on each node. The daemon will be responsible
|
|
|
|
* for launching the application.
|
|
|
|
*/
|
2007-04-26 19:08:37 +04:00
|
|
|
/* When working in this function, ALWAYS jump to "cleanup" if
|
|
|
|
* you encounter an error so that orterun will be woken up and
|
|
|
|
* the job can cleanly terminate
|
|
|
|
*/
|
2006-09-15 01:29:51 +04:00
|
|
|
int orte_pls_gridengine_launch_job(orte_jobid_t jobid)
|
2006-08-11 01:46:52 +04:00
|
|
|
{
|
2007-04-26 19:08:37 +04:00
|
|
|
orte_job_map_t *map=NULL;
|
2006-10-07 19:45:24 +04:00
|
|
|
opal_list_item_t *n_item;
|
2006-08-15 23:54:10 +04:00
|
|
|
orte_std_cntr_t num_nodes;
|
2006-08-11 01:46:52 +04:00
|
|
|
int node_name_index1;
|
|
|
|
int node_name_index2;
|
|
|
|
int proc_name_index;
|
|
|
|
int orted_index;
|
2006-10-07 19:45:24 +04:00
|
|
|
char *prefix_dir;
|
2007-04-10 18:23:32 +04:00
|
|
|
char *param;
|
2007-04-26 19:08:37 +04:00
|
|
|
char **argv=NULL;
|
|
|
|
char **env=NULL;
|
2006-08-11 01:46:52 +04:00
|
|
|
int argc;
|
|
|
|
int rc;
|
|
|
|
sigset_t sigs;
|
|
|
|
char *lib_base = NULL, *bin_base = NULL;
|
|
|
|
char *sge_root, *sge_arch;
|
2007-04-26 19:08:37 +04:00
|
|
|
bool failed_launch = true;
|
2006-09-15 01:29:51 +04:00
|
|
|
|
2007-04-26 19:08:37 +04:00
|
|
|
/* set the active jobid */
|
|
|
|
active_job = jobid;
|
2006-09-15 01:29:51 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
/* Get the map for this job.
|
2006-08-11 01:46:52 +04:00
|
|
|
* We need the entire mapping for a couple of reasons:
|
|
|
|
* - need the prefix to start with.
|
|
|
|
* - need to know if we are launching on a subset of the allocated nodes
|
|
|
|
* All other mapping responsibilities fall to orted in the fork PLS
|
|
|
|
*/
|
2006-10-07 19:45:24 +04:00
|
|
|
rc = orte_rmaps.get_job_map(&map, jobid);
|
2006-08-11 01:46:52 +04:00
|
|
|
if (ORTE_SUCCESS != rc) {
|
2006-09-15 01:29:51 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2007-04-26 19:08:37 +04:00
|
|
|
goto cleanup;
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
|
|
|
|
2007-06-13 19:02:47 +04:00
|
|
|
num_nodes = map->num_new_daemons;
|
2006-11-16 18:11:45 +03:00
|
|
|
if (num_nodes == 0) {
|
2007-07-12 23:53:18 +04:00
|
|
|
/* have all the daemons we need - launch app */
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
opal_output(0, "pls:rsh: no new daemons to launch");
|
|
|
|
}
|
|
|
|
goto launch_apps;
|
2006-11-16 18:11:45 +03:00
|
|
|
}
|
|
|
|
|
2006-08-11 01:46:52 +04:00
|
|
|
/*
|
|
|
|
* Build argv array
|
|
|
|
*/
|
|
|
|
argv = opal_argv_split("qrsh", ' ');
|
|
|
|
argc = opal_argv_count(argv);
|
|
|
|
/* gridengine specific flags */
|
|
|
|
opal_argv_append(&argc, &argv, "-inherit");/*run tasks within curr job*/
|
|
|
|
opal_argv_append(&argc, &argv, "-noshell");/*execute w/o wrapping shell*/
|
|
|
|
opal_argv_append(&argc, &argv, "-nostdin");/*suppress input stream stdin*/
|
|
|
|
opal_argv_append(&argc, &argv, "-V"); /*task to have the env as job*/
|
|
|
|
if (mca_pls_gridengine_component.verbose) {
|
|
|
|
opal_argv_append(&argc, &argv, "-verbose");
|
|
|
|
}
|
|
|
|
|
|
|
|
node_name_index1 = argc;
|
|
|
|
opal_argv_append(&argc, &argv, "<template>");
|
2007-07-12 23:53:18 +04:00
|
|
|
|
|
|
|
/* add the orted daemon in command */
|
2006-08-11 01:46:52 +04:00
|
|
|
orted_index = argc;
|
|
|
|
opal_argv_append(&argc, &argv, mca_pls_gridengine_component.orted);
|
|
|
|
|
2007-06-13 21:11:37 +04:00
|
|
|
/* By default, --no-daemonize will be used and orted will be forced to
|
|
|
|
* stay in the same ptree as sge_shepherd. The problem with
|
|
|
|
* --no-daemonize is that the qrsh -inherit connections will stay
|
|
|
|
* persistent for the whole duration of the task to the remote nodes,
|
|
|
|
* which may not be ideal for large number of nodes */
|
|
|
|
if (! mca_pls_gridengine_component.daemonize_orted) {
|
2007-07-12 23:53:18 +04:00
|
|
|
/* the actual orted option will be added when we
|
|
|
|
* append_basic_args
|
|
|
|
*/
|
|
|
|
orte_no_daemonize_flag = true;
|
2007-06-13 21:11:37 +04:00
|
|
|
}
|
|
|
|
|
2007-07-12 23:53:18 +04:00
|
|
|
/* Add basic orted command line options, including
|
|
|
|
* all debug options
|
|
|
|
*/
|
2007-04-10 18:23:32 +04:00
|
|
|
orte_pls_base_orted_append_basic_args(&argc, &argv,
|
|
|
|
&proc_name_index,
|
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is known to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages is set, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
|
|
|
&node_name_index2);
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-20 20:50:13 +04:00
|
|
|
/* setup environment. The environment is common to all the daemons
|
|
|
|
* so we only need to do this once
|
|
|
|
*/
|
|
|
|
env = opal_argv_copy(environ);
|
|
|
|
|
|
|
|
/* clean out any MCA component selection directives that
|
|
|
|
* won't work on remote nodes
|
|
|
|
*/
|
|
|
|
orte_pls_base_purge_mca_params(&env);
|
|
|
|
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
2006-08-11 01:46:52 +04:00
|
|
|
param = opal_argv_join(argv, ' ');
|
|
|
|
if (NULL != param) {
|
|
|
|
opal_output(0, "pls:gridengine: final template argv:");
|
|
|
|
opal_output(0, "pls:gridengine: %s", param);
|
|
|
|
free(param);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Figure out the basenames for the libdir and bindir. There is a
|
|
|
|
lengthy comment about this in pls_rsh_module.c explaining all
|
2006-10-07 19:45:24 +04:00
|
|
|
the rationale for how / why we're doing this.
|
|
|
|
*/
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2007-04-21 04:15:05 +04:00
|
|
|
lib_base = opal_basename(opal_install_dirs.libdir);
|
|
|
|
bin_base = opal_basename(opal_install_dirs.bindir);
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
/* See the note about prefix_dir in the orte/mca/pls/slurm/pls_slurm.c
|
|
|
|
* module. Fo here, just note that we must have at least one app_context,
|
|
|
|
* and we take the prefix_dir from that first one.
|
|
|
|
*/
|
|
|
|
prefix_dir = map->apps[0]->prefix_dir;
|
|
|
|
|
2006-10-20 20:50:13 +04:00
|
|
|
/* If we have a prefix, then modify the PATH and
|
|
|
|
LD_LIBRARY_PATH environment variables.
|
|
|
|
*/
|
|
|
|
if (NULL != prefix_dir) {
|
|
|
|
char *oldenv, *newenv;
|
|
|
|
|
|
|
|
/* Reset PATH */
|
|
|
|
newenv = opal_os_path( false, prefix_dir, bin_base, NULL );
|
|
|
|
oldenv = getenv("PATH");
|
|
|
|
if (NULL != oldenv) {
|
|
|
|
char *temp;
|
|
|
|
asprintf(&temp, "%s:%s", newenv, oldenv);
|
|
|
|
free( newenv );
|
|
|
|
newenv = temp;
|
|
|
|
}
|
|
|
|
opal_setenv("PATH", newenv, true, &env);
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
opal_output(0, "pls:gridengine: reset PATH: %s", newenv);
|
|
|
|
}
|
|
|
|
free(newenv);
|
|
|
|
|
|
|
|
/* Reset LD_LIBRARY_PATH */
|
|
|
|
newenv = opal_os_path( false, prefix_dir, lib_base, NULL );
|
|
|
|
oldenv = getenv("LD_LIBRARY_PATH");
|
|
|
|
if (NULL != oldenv) {
|
|
|
|
char* temp;
|
|
|
|
asprintf(&temp, "%s:%s", newenv, oldenv);
|
|
|
|
free(newenv);
|
|
|
|
newenv = temp;
|
|
|
|
}
|
|
|
|
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
opal_output(0, "pls:gridengine: reset LD_LIBRARY_PATH: %s",
|
|
|
|
newenv);
|
|
|
|
}
|
|
|
|
free(newenv);
|
|
|
|
}
|
|
|
|
|
2007-06-13 19:30:18 +04:00
|
|
|
/*
|
2006-10-07 19:45:24 +04:00
|
|
|
* Iterate through the nodes.
|
2006-08-11 01:46:52 +04:00
|
|
|
*/
|
2006-10-07 19:45:24 +04:00
|
|
|
for(n_item = opal_list_get_first(&map->nodes);
|
|
|
|
n_item != opal_list_get_end(&map->nodes);
|
|
|
|
n_item = opal_list_get_next(n_item)) {
|
|
|
|
orte_mapped_node_t* rmaps_node = (orte_mapped_node_t*)n_item;
|
|
|
|
pid_t pid;
|
|
|
|
char *exec_path, *orted_path;
|
|
|
|
char **exec_argv;
|
2007-06-13 19:30:18 +04:00
|
|
|
|
|
|
|
/* if this daemon already exists, don't launch it! */
|
|
|
|
if (rmaps_node->daemon_preexists) {
|
|
|
|
continue;
|
|
|
|
}
|
2006-10-07 19:45:24 +04:00
|
|
|
|
|
|
|
/* setup node name */
|
|
|
|
free(argv[node_name_index1]);
|
|
|
|
if (NULL != rmaps_node->username &&
|
|
|
|
0 != strlen (rmaps_node->username)) {
|
|
|
|
asprintf(&argv[node_name_index1], "%s@%s",
|
|
|
|
rmaps_node->username, rmaps_node->nodename);
|
|
|
|
} else {
|
|
|
|
argv[node_name_index1] = strdup(rmaps_node->nodename);
|
|
|
|
}
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
free(argv[node_name_index2]);
|
|
|
|
argv[node_name_index2] = strdup(rmaps_node->nodename);
|
2006-08-11 01:46:52 +04:00
|
|
|
|
|
|
|
#ifdef __WINDOWS__
|
2006-10-07 19:45:24 +04:00
|
|
|
printf("Unimplemented feature for windows\n");
|
2007-04-26 19:08:37 +04:00
|
|
|
rc = ORTE_ERR_NOT_IMPLEMENTED;
|
|
|
|
goto cleanup;
|
2006-08-11 01:46:52 +04:00
|
|
|
#else
|
2006-10-07 19:45:24 +04:00
|
|
|
/* fork a child to do qrsh */
|
|
|
|
pid = fork();
|
2006-08-11 01:46:52 +04:00
|
|
|
#endif
|
2006-10-07 19:45:24 +04:00
|
|
|
if (pid < 0) {
|
2007-04-26 19:08:37 +04:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
|
|
|
rc = ORTE_ERR_SYS_LIMITS_CHILDREN;
|
2006-10-07 19:45:24 +04:00
|
|
|
goto cleanup;
|
|
|
|
}
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
/* child */
|
|
|
|
if (pid == 0) {
|
|
|
|
char* name_string;
|
|
|
|
char* var;
|
|
|
|
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
opal_output(0, "pls:gridengine: launching on node %s",
|
|
|
|
rmaps_node->nodename);
|
|
|
|
}
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
/* setting exec_argv and exec_path for qrsh */
|
|
|
|
exec_argv = &argv[0];
|
|
|
|
|
|
|
|
sge_root = getenv("SGE_ROOT");
|
|
|
|
sge_arch = getenv("ARC");
|
|
|
|
asprintf(&exec_path, "%s/bin/%s/qrsh", sge_root, sge_arch);
|
|
|
|
exec_path = opal_path_findv(exec_path, X_OK, environ, NULL);
|
|
|
|
if (NULL == exec_path) {
|
|
|
|
opal_show_help("help-pls-gridengine.txt", "bad-qrsh-path",
|
|
|
|
true, exec_path, sge_root, sge_arch);
|
2007-04-26 19:08:37 +04:00
|
|
|
exit(-1); /* forked child must ALWAYS exit, not return */
|
2006-10-07 19:45:24 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
opal_output(0, "pls:gridengine: exec_argv[0]=%s, exec_path=%s",
|
|
|
|
exec_argv[0], exec_path);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* setting orted_path for orted */
|
|
|
|
orted_path = opal_path_findv(exec_argv[orted_index], 0, environ, NULL);
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
if (NULL == orted_path && NULL == prefix_dir) {
|
|
|
|
rc = orte_pls_gridengine_fill_orted_path(&orted_path);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2007-04-26 19:08:37 +04:00
|
|
|
exit(-1); /* forked child must ALWAYS exit, not return */
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
2006-10-07 19:45:24 +04:00
|
|
|
} else {
|
|
|
|
if (NULL != prefix_dir) {
|
|
|
|
orted_path = opal_os_path( false, prefix_dir, bin_base, "orted", NULL );
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
2006-10-07 19:45:24 +04:00
|
|
|
/* If we yet did not fill up the orted_path, do so now */
|
|
|
|
if (NULL == orted_path) {
|
2006-08-11 01:46:52 +04:00
|
|
|
rc = orte_pls_gridengine_fill_orted_path(&orted_path);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2007-04-26 19:08:37 +04:00
|
|
|
exit(-1); /* forked child must ALWAYS exit, not return */
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
|
|
|
}
|
2006-10-07 19:45:24 +04:00
|
|
|
}
|
|
|
|
asprintf(&argv[orted_index], orted_path);
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
opal_output(0, "pls:gridengine: orted_path=%s", orted_path);
|
|
|
|
}
|
|
|
|
|
|
|
|
var = getenv("HOME");
|
|
|
|
if (NULL != var) {
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
opal_output(0, "pls:gridengine: changing to directory %s",
|
|
|
|
var);
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
2006-10-07 19:45:24 +04:00
|
|
|
/* Ignore errors -- what are we going to do?
|
|
|
|
(and we ignore errors on the remote nodes
|
|
|
|
in the fork pls, so this is consistent) */
|
|
|
|
chdir(var);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* setup process name */
|
2007-06-13 19:02:47 +04:00
|
|
|
rc = orte_ns.get_proc_name_string(&name_string, rmaps_node->daemon);
|
2006-10-07 19:45:24 +04:00
|
|
|
if (ORTE_SUCCESS != rc) {
|
2007-06-13 19:30:18 +04:00
|
|
|
opal_output(0, "pls:gridengine: unable to get daemon name as string");
|
2006-10-07 19:45:24 +04:00
|
|
|
exit(-1);
|
|
|
|
}
|
|
|
|
free(argv[proc_name_index]);
|
|
|
|
argv[proc_name_index] = strdup(name_string);
|
|
|
|
|
|
|
|
if (!mca_pls_gridengine_component.debug) {
|
|
|
|
/* setup stdin */
|
|
|
|
int fd = open("/dev/null", O_RDWR, 0);
|
|
|
|
dup2(fd, 0);
|
|
|
|
close(fd);
|
|
|
|
}
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
/* close all file descriptors w/ exception of stdin/stdout/stderr */
|
|
|
|
for(fd=3; fd<fdmax; fd++)
|
|
|
|
close(fd);
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
/* Set signal handlers back to the default. Do this close
|
|
|
|
to the execve() because the event library may (and likely
|
|
|
|
will) reset them. If we don't do this, the event
|
|
|
|
library may have left some set that, at least on some
|
|
|
|
OS's, don't get reset via fork() or exec(). Hence, the
|
|
|
|
orted could be unkillable (for example). */
|
|
|
|
|
|
|
|
set_handler_default(SIGTERM);
|
|
|
|
set_handler_default(SIGINT);
|
2006-08-11 01:46:52 +04:00
|
|
|
#ifndef __WINDOWS__
|
2006-10-07 19:45:24 +04:00
|
|
|
set_handler_default(SIGHUP);
|
|
|
|
set_handler_default(SIGPIPE);
|
2006-08-11 01:46:52 +04:00
|
|
|
#endif
|
2006-10-07 19:45:24 +04:00
|
|
|
set_handler_default(SIGCHLD);
|
|
|
|
|
|
|
|
/* Unblock all signals, for many of the same reasons that
|
|
|
|
we set the default handlers, above. This is noticable
|
|
|
|
on Linux where the event library blocks SIGTERM, but we
|
|
|
|
don't want that blocked by the orted (or, more
|
|
|
|
specifically, we don't want it to be blocked by the
|
|
|
|
orted and then inherited by the ORTE processes that it
|
|
|
|
forks, making them unkillable by SIGTERM). */
|
2006-08-11 01:46:52 +04:00
|
|
|
#ifndef __WINDOWS__
|
2006-10-07 19:45:24 +04:00
|
|
|
sigprocmask(0, 0, &sigs);
|
|
|
|
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
2006-08-11 01:46:52 +04:00
|
|
|
#endif
|
2006-10-07 19:45:24 +04:00
|
|
|
|
|
|
|
/* exec the daemon */
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
param = opal_argv_join(exec_argv, ' ');
|
|
|
|
if (NULL != param) {
|
|
|
|
opal_output(0, "pls:gridengine: executing: %s", param);
|
|
|
|
free(param);
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
2006-10-07 19:45:24 +04:00
|
|
|
execve(exec_path, exec_argv, env);
|
|
|
|
opal_output(0, "pls:gridengine: execve failed with errno=%d\n", errno);
|
|
|
|
exit(-1);
|
|
|
|
} else { /* parent */
|
|
|
|
if (mca_pls_gridengine_component.debug) {
|
|
|
|
opal_output(0, "pls:gridengine: parent");
|
2007-06-13 19:30:18 +04:00
|
|
|
}
|
|
|
|
|
2006-10-07 19:45:24 +04:00
|
|
|
/* setup callback on sigchild - wait until setup above is complete
|
|
|
|
* as the callback can occur in the call to orte_wait_cb
|
|
|
|
*/
|
2007-07-12 23:53:18 +04:00
|
|
|
orte_wait_cb(pid, orte_pls_gridengine_wait_daemon, NULL);
|
2006-10-07 19:45:24 +04:00
|
|
|
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
|
|
|
}
|
2007-07-12 23:53:18 +04:00
|
|
|
|
|
|
|
/* wait for daemons to callback */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_daemon_callback(map->num_new_daemons))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
launch_apps:
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_apps(map))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* get here if launch went okay */
|
2007-04-26 19:08:37 +04:00
|
|
|
failed_launch = false;
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2007-07-12 23:53:18 +04:00
|
|
|
cleanup:
|
2007-04-26 19:08:37 +04:00
|
|
|
if (NULL != map) {
|
|
|
|
OBJ_RELEASE(map);
|
|
|
|
}
|
2006-10-08 02:44:00 +04:00
|
|
|
|
2006-08-11 01:46:52 +04:00
|
|
|
if (NULL != lib_base) {
|
|
|
|
free(lib_base);
|
|
|
|
}
|
|
|
|
if (NULL != bin_base) {
|
|
|
|
free(bin_base);
|
|
|
|
}
|
2007-06-13 19:30:18 +04:00
|
|
|
if (NULL != argv) {
|
|
|
|
opal_argv_free(argv);
|
|
|
|
}
|
2007-04-26 19:08:37 +04:00
|
|
|
if (NULL != env) {
|
|
|
|
opal_argv_free(env);
|
|
|
|
}
|
2006-08-11 01:46:52 +04:00
|
|
|
|
2007-04-26 19:08:37 +04:00
|
|
|
/* check for failed launch - if so, force terminate */
|
|
|
|
if (failed_launch) {
|
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is known to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages is set, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
|
|
|
orte_pls_base_daemon_failed(jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
|
2007-04-26 19:08:37 +04:00
|
|
|
}
|
|
|
|
|
2006-08-11 01:46:52 +04:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Query the registry for all nodes participating in the job
|
|
|
|
*/
|
2007-01-25 17:17:44 +03:00
|
|
|
int orte_pls_gridengine_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
2006-08-11 01:46:52 +04:00
|
|
|
{
|
2006-09-15 01:29:51 +04:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* order them to kill their local procs for this job */
|
2007-04-24 05:58:40 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs))) {
|
2006-09-15 01:29:51 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
return rc;
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
int orte_pls_gridengine_terminate_proc(const orte_process_name_t* proc)
|
|
|
|
{
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_ERR_NOT_IMPLEMENTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Terminate the orteds for a given job
|
|
|
|
*/
|
2007-04-25 15:51:18 +04:00
|
|
|
int orte_pls_gridengine_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
|
2006-09-15 01:29:51 +04:00
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* now tell them to die! */
|
2007-04-24 05:58:40 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
|
2006-09-15 01:29:51 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
return rc;
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Signal all processes associated with this job
|
|
|
|
*/
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
/*
 * Orders the daemons to pass a signal to their local procs by calling
 * orte_pls_base_orted_signal_local_procs().
 *
 * jobid   job whose processes are to be signalled
 * signal  signal number, forwarded unchanged
 * attrs   forwarded unchanged to the base function
 *
 * Returns the status produced by the base function; any status other
 * than ORTE_SUCCESS is additionally reported through ORTE_ERROR_LOG.
 */
int orte_pls_gridengine_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs)
{
    int status = orte_pls_base_orted_signal_local_procs(jobid, signal, attrs);

    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Signal a specific process.
|
|
|
|
*/
|
|
|
|
int orte_pls_gridengine_signal_proc(const orte_process_name_t* proc, int32_t signal)
|
|
|
|
{
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_ERR_NOT_IMPLEMENTED;
|
2006-08-11 01:46:52 +04:00
|
|
|
}
|
|
|
|
|
2007-01-25 17:17:44 +03:00
|
|
|
|
2006-08-11 01:46:52 +04:00
|
|
|
/**
|
|
|
|
* Finalize
|
|
|
|
*/
|
|
|
|
int orte_pls_gridengine_finalize(void)
|
|
|
|
{
|
2006-09-15 01:29:51 +04:00
|
|
|
int rc;
|
|
|
|
|
2006-08-11 01:46:52 +04:00
|
|
|
/* cleanup any pending recvs */
|
2006-09-15 01:29:51 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
2006-08-11 01:46:52 +04:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Restore the default disposition (SIG_DFL) for a signal.
 *
 * Installs SIG_DFL for @p sig with an empty handler mask and no flags.
 * When __WINDOWS__ is defined the function does nothing.
 *
 * @param sig  number of the signal whose handler is reset
 *
 * The result of sigaction() is not reported: the function returns void,
 * so the reset is best-effort.
 */
static void set_handler_default(int sig)
{
#ifndef __WINDOWS__
    struct sigaction act;

    /* Zero the whole structure first: struct sigaction may carry
       implementation-specific members beyond the three set below, and
       they must not be handed to sigaction() with indeterminate values. */
    memset(&act, 0, sizeof(act));
    act.sa_handler = SIG_DFL;
    act.sa_flags = 0;
    sigemptyset(&act.sa_mask);

    /* the previous disposition is of no interest, hence NULL */
    sigaction(sig, &act, NULL);
#else
    (void)sig; /* nothing to reset on this platform */
#endif
}
|