2006-09-15 01:29:51 +04:00
|
|
|
/*
|
2007-03-17 02:11:45 +03:00
|
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
2006-09-15 01:29:51 +04:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2007-02-13 02:59:27 +03:00
|
|
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
2006-09-15 01:29:51 +04:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/orte_constants.h"
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
#include <errno.h>
|
2007-06-13 02:43:18 +04:00
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
2006-09-15 01:29:51 +04:00
|
|
|
#include <sys/types.h>
|
2006-11-13 21:51:18 +03:00
|
|
|
#endif
|
2006-09-15 01:29:51 +04:00
|
|
|
#ifdef HAVE_SYS_WAIT_H
|
|
|
|
#include <sys/wait.h>
|
|
|
|
#endif
|
|
|
|
#include <signal.h>
|
|
|
|
#ifdef HAVE_FCNTL_H
|
|
|
|
#include <fcntl.h>
|
|
|
|
#endif
|
2006-11-13 21:51:18 +03:00
|
|
|
#ifdef HAVE_SYS_TIME_H
|
|
|
|
#include <sys/time.h>
|
|
|
|
#endif
|
2006-09-15 01:29:51 +04:00
|
|
|
#ifdef HAVE_SYS_PARAM_H
|
|
|
|
#include <sys/param.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_NETDB_H
|
|
|
|
#include <netdb.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_STAT_H
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#endif /* HAVE_SYS_STAT_H */
|
|
|
|
|
2006-11-13 21:51:18 +03:00
|
|
|
#if defined(HAVE_SCHED_YIELD)
|
|
|
|
/* Only if we have sched_yield() */
|
|
|
|
#ifdef HAVE_SCHED_H
|
|
|
|
#include <sched.h>
|
|
|
|
#endif
|
|
|
|
#else
|
|
|
|
/* Only do these if we don't have <sched.h> */
|
|
|
|
#ifdef HAVE_SYS_SELECT_H
|
|
|
|
#include <sys/select.h>
|
|
|
|
#endif
|
|
|
|
#endif /* HAVE_SCHED_YIELD */
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
#include "opal/event/event.h"
|
|
|
|
#include "opal/util/argv.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
#include "opal/util/os_path.h"
|
|
|
|
#include "opal/util/show_help.h"
|
|
|
|
#include "opal/util/path.h"
|
|
|
|
#include "opal/util/basename.h"
|
|
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
2007-02-06 22:51:05 +03:00
|
|
|
#include "opal/util/num_procs.h"
|
2007-04-24 22:54:45 +04:00
|
|
|
#include "opal/util/sys_limits.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
#include "orte/dss/dss.h"
|
|
|
|
#include "orte/util/sys_info.h"
|
|
|
|
#include "orte/util/univ_info.h"
|
|
|
|
#include "orte/util/session_dir.h"
|
|
|
|
#include "orte/runtime/orte_wait.h"
|
Bring in the code for routing xcast stage gate messages via the local orteds. This code is inactive unless you specifically request it via an mca param oob_xcast_mode (can be set to "linear" or "direct"). Direct mode is the old standard method where we send messages directly to each MPI process. Linear mode sends the xcast message via the orteds, with the HNP sending the message to each orted directly.
There is a binomial algorithm in the code (i.e., the HNP would send to a subset of the orteds, which then relay it on according to the typical log-2 algo), but that has a bug in it so the code won't let you select it even if you tried (and the mca param doesn't show, so you'd *really* have to try).
This also involved a slight change to the oob.xcast API, so propagated that as required.
Note: this has *only* been tested on rsh, SLURM, and Bproc environments (now that it has been transferred to the OMPI trunk, I'll need to re-test it [only done rsh so far]). It should work fine on any environment that uses the ORTE daemons - anywhere else, you are on your own... :-)
Also, correct a mistake where the orte_debug_flag was declared an int, but the mca param was set as a bool. Move the storage for that flag to the orte/runtime/params.c and orte/runtime/params.h files appropriately.
This commit was SVN r14475.
2007-04-23 22:41:04 +04:00
|
|
|
#include "orte/runtime/params.h"
|
2006-09-15 01:29:51 +04:00
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/mca/errmgr/base/base.h"
|
|
|
|
#include "orte/mca/iof/iof.h"
|
|
|
|
#include "orte/mca/iof/base/iof_base_setup.h"
|
|
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/sds/base/base.h"
|
|
|
|
#include "orte/mca/rmgr/rmgr.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
#include "orte/mca/gpr/gpr.h"
|
|
|
|
#include "orte/mca/rmaps/base/base.h"
|
|
|
|
#include "orte/mca/smr/smr.h"
|
2007-03-17 02:11:45 +03:00
|
|
|
#include "orte/mca/filem/filem.h"
|
|
|
|
#include "orte/mca/filem/base/base.h"
|
2007-06-18 19:39:04 +04:00
|
|
|
#if OPAL_ENABLE_FT == 1
|
|
|
|
#include "orte/mca/snapc/snapc.h"
|
|
|
|
#endif
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
#include "orte/mca/odls/base/odls_private.h"
|
|
|
|
#include "orte/mca/odls/default/odls_default.h"
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
|
|
|
|
orte_filem_base_request_t *filem_request);
|
|
|
|
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
|
|
|
|
orte_filem_base_request_t *filem_request);
|
|
|
|
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request);
|
Bring in the generalized xcast communication system along with the correspondingly revised orted launch. I will send a message out to developers explaining the basic changes. In brief:
1. generalize orte_rml.xcast to become a general broadcast-like messaging system. Messages can now be sent to any tag on the daemons or processes. Note that any message sent via xcast will be delivered to ALL processes in the specified job - you don't get to pick and choose. At a later date, we will introduce an augmented capability that will use the daemons as relays, but will allow you to send to a specified array of process names.
2. extended orte_rml.xcast so it supports more scalable message routing methodologies. At the moment, we support three: (a) direct, which sends the message directly to all recipients; (b) linear, which sends the message to the local daemon on each node, which then relays it to its own local procs; and (b) binomial, which sends the message via a binomial algo across all the daemons, each of which then relays to its own local procs. The crossover points between the algos are adjustable via MCA param, or you can simply demand that a specific algo be used.
3. orteds no longer exhibit two types of behavior: bootproxy or VM. Orteds now always behave like they are part of a virtual machine - they simply launch a job if mpirun tells them to do so. This is another step towards creating an "orteboot" functionality, but also provided a clean system for supporting message relaying.
Note one major impact of this commit: multiple daemons on a node cannot be supported any longer! Only a single daemon/node is now allowed.
This commit is known to break support for the following environments: POE, Xgrid, Xcpu, Windows. It has been tested on rsh, SLURM, and Bproc. Modifications for TM support have been made but could not be verified due to machine problems at LANL. Modifications for SGE have been made but could not be verified. The developers for the non-verified environments will be separately notified along with suggestions on how to fix the problems.
This commit was SVN r15007.
2007-06-12 17:28:54 +04:00
|
|
|
|
2007-06-07 00:18:37 +04:00
|
|
|
/*
|
|
|
|
* External Interface
|
|
|
|
*/
|
|
|
|
static int orte_odls_default_get_add_procs_data(orte_gpr_notify_data_t **data, orte_job_map_t *map);
|
|
|
|
static int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ);
|
|
|
|
static int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state);
|
|
|
|
static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc,
|
|
|
|
int32_t signal);
|
|
|
|
static int orte_odls_default_deliver_message(orte_jobid_t job, orte_buffer_t *buffer, orte_rml_tag_t tag);
|
2007-03-17 02:11:45 +03:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
static void set_handler_default(int sig);
|
|
|
|
|
|
|
|
/*
 * Public module structure for the "default" ODLS (ORTE Daemon Launch
 * Subsystem) component.  Maps the generic odls API slots onto this
 * component's implementations (see the static prototypes above).
 */
orte_odls_base_module_t orte_odls_default_module = {
    orte_odls_default_get_add_procs_data,   /* package launch data for the daemons */
    orte_odls_default_launch_local_procs,   /* fork/exec the local procs */
    orte_odls_default_kill_local_procs,     /* SIGTERM/SIGKILL local procs */
    orte_odls_default_signal_local_procs,   /* forward a signal to a local proc */
    orte_odls_default_deliver_message       /* relay an xcast message to local procs */
};
|
|
|
|
|
2006-11-16 00:12:27 +03:00
|
|
|
int orte_odls_default_get_add_procs_data(orte_gpr_notify_data_t **data,
|
2006-11-17 22:06:10 +03:00
|
|
|
orte_job_map_t *map)
|
2006-11-16 00:12:27 +03:00
|
|
|
{
|
|
|
|
orte_gpr_notify_data_t *ndat;
|
|
|
|
orte_gpr_value_t **values, *value;
|
|
|
|
orte_std_cntr_t cnt;
|
|
|
|
char *glob_tokens[] = {
|
|
|
|
ORTE_JOB_GLOBALS,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
char *glob_keys[] = {
|
|
|
|
ORTE_JOB_APP_CONTEXT_KEY,
|
|
|
|
ORTE_JOB_VPID_START_KEY,
|
|
|
|
ORTE_JOB_VPID_RANGE_KEY,
|
2007-07-02 05:33:35 +04:00
|
|
|
ORTE_JOB_TOTAL_SLOTS_ALLOC_KEY,
|
2006-11-16 00:12:27 +03:00
|
|
|
NULL
|
|
|
|
};
|
2006-11-17 22:06:10 +03:00
|
|
|
opal_list_item_t *item, *m_item;
|
|
|
|
orte_mapped_node_t *node;
|
2006-11-16 00:12:27 +03:00
|
|
|
orte_mapped_proc_t *proc;
|
|
|
|
int rc;
|
|
|
|
char *segment;
|
|
|
|
|
|
|
|
/* set default answer */
|
|
|
|
*data = NULL;
|
|
|
|
|
|
|
|
ndat = OBJ_NEW(orte_gpr_notify_data_t);
|
|
|
|
if (NULL == ndat) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* construct a fake trigger name so that the we can extract the jobid from it later */
|
2006-11-17 22:06:10 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_schema.get_std_trigger_name(&(ndat->target), "bogus", map->job))) {
|
2006-11-16 00:12:27 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get the segment name */
|
2006-11-17 22:06:10 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, map->job))) {
|
2006-11-16 00:12:27 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get the info from the job globals container first */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR,
|
|
|
|
segment, glob_tokens, glob_keys, &cnt, &values))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* there can only be one value here since we only specified a single container.
|
|
|
|
* Just transfer the returned value to the ndat structure
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&cnt, ndat->values, values[0]))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
OBJ_RELEASE(values[0]);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
ndat->cnt = 1;
|
|
|
|
|
2006-11-17 22:06:10 +03:00
|
|
|
/* the remainder of our required info is in the mapped_node objects, so all we
|
2006-11-16 00:12:27 +03:00
|
|
|
* have to do is transfer it over
|
|
|
|
*/
|
2006-11-17 22:06:10 +03:00
|
|
|
for (m_item = opal_list_get_first(&map->nodes);
|
|
|
|
m_item != opal_list_get_end(&map->nodes);
|
|
|
|
m_item = opal_list_get_next(m_item)) {
|
|
|
|
node = (orte_mapped_node_t*)m_item;
|
2006-11-16 00:12:27 +03:00
|
|
|
|
2006-11-17 22:06:10 +03:00
|
|
|
for (item = opal_list_get_first(&node->procs);
|
|
|
|
item != opal_list_get_end(&node->procs);
|
|
|
|
item = opal_list_get_next(item)) {
|
|
|
|
proc = (orte_mapped_proc_t*)item;
|
|
|
|
|
2007-02-06 22:51:05 +03:00
|
|
|
/* must not have any tokens so that launch_procs can process it correctly */
|
Compute and pass the local_rank and local number of procs (in that proc's job) on the node.
To be precise, given this hypothetical launching pattern:
host1: vpids 0, 2, 4, 6
host2: vpids 1, 3, 5, 7
The local_rank for these procs would be:
host1: vpids 0->local_rank 0, v2->lr1, v4->lr2, v6->lr3
host2: vpids 1->local_rank 0, v3->lr1, v5->lr2, v7->lr3
and the number of local procs on each node would be four. If vpid=0 then does a comm_spawn of one process on host1, the values of the parent job would remain unchanged. The local_rank of the child process would be 0 and its num_local_procs would be 1 since it is in a separate jobid.
I have verified this functionality for the rsh case - need to verify that slurm and other cases also get the right values. Some consolidation of common code is probably going to occur in the SDS components to make this simpler and more maintainable in the future.
This commit was SVN r14706.
2007-05-21 18:30:10 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, 0, segment, 5, 0))) {
|
2006-11-17 22:06:10 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]),
|
|
|
|
ORTE_PROC_NAME_KEY,
|
|
|
|
ORTE_NAME, &proc->name))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]),
|
|
|
|
ORTE_PROC_APP_CONTEXT_KEY,
|
|
|
|
ORTE_STD_CNTR, &proc->app_idx))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]),
|
|
|
|
ORTE_NODE_NAME_KEY,
|
|
|
|
ORTE_STRING, node->nodename))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
Compute and pass the local_rank and local number of procs (in that proc's job) on the node.
To be precise, given this hypothetical launching pattern:
host1: vpids 0, 2, 4, 6
host2: vpids 1, 3, 5, 7
The local_rank for these procs would be:
host1: vpids 0->local_rank 0, v2->lr1, v4->lr2, v6->lr3
host2: vpids 1->local_rank 0, v3->lr1, v5->lr2, v7->lr3
and the number of local procs on each node would be four. If vpid=0 then does a comm_spawn of one process on host1, the values of the parent job would remain unchanged. The local_rank of the child process would be 0 and its num_local_procs would be 1 since it is in a separate jobid.
I have verified this functionality for the rsh case - need to verify that slurm and other cases also get the right values. Some consolidation of common code is probably going to occur in the SDS components to make this simpler and more maintainable in the future.
This commit was SVN r14706.
2007-05-21 18:30:10 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[3]),
|
|
|
|
ORTE_PROC_LOCAL_RANK_KEY,
|
|
|
|
ORTE_VPID, &proc->local_rank))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[4]),
|
|
|
|
ORTE_NODE_NUM_PROCS_KEY,
|
|
|
|
ORTE_STD_CNTR, &node->num_procs))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2006-11-17 22:06:10 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&cnt, ndat->values, value))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(ndat);
|
|
|
|
OBJ_RELEASE(values[0]);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
ndat->cnt += 1;
|
2006-11-16 00:12:27 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*data = ndat;
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2007-06-13 02:43:18 +04:00
|
|
|
/*
 * Poll, for up to "timeout" seconds, waiting for the given child pid to
 * terminate.  Returns true as soon as the child is reaped (or no longer
 * exists at all - ECHILD), false if it is still alive when the timeout
 * expires.  *exit_status receives the waitpid(2) status when the child
 * is successfully reaped.
 *
 * Between polls we either yield the CPU (if sched_yield is available)
 * or burn ~1 usec in select() as a portable fallback.
 */
static bool odls_default_child_died(pid_t pid, unsigned int timeout,
                                    int *exit_status)
{
    time_t deadline;
    pid_t rc;
#if !defined(HAVE_SCHED_YIELD)
    struct timeval tv;
    fd_set dummy;
#endif

    deadline = time(NULL) + timeout;
    do {
        rc = waitpid(pid, exit_status, WNOHANG);
        if (rc == pid) {
            /* It died -- return success */
            return true;
        }
        if (-1 == rc && ECHILD == errno) {
            /* The pid no longer exists, so we'll call this "good
               enough for government work" */
            return true;
        }

#if defined(HAVE_SCHED_YIELD)
        sched_yield();
#else
        /* Bogus delay for 1 usec */
        tv.tv_sec = 0;
        tv.tv_usec = 1;
        FD_ZERO(&dummy);
        FD_SET(0, &dummy);
        select(1, &dummy, NULL, NULL, &tv);
#endif
    } while (time(NULL) < deadline);

    /* The child didn't die, so return false */
    return false;
}
|
|
|
|
|
|
|
|
/**
 * Kill all local child processes belonging to the given job (which may be
 * a WILDCARD jobid).  Each matching child is removed from the global
 * children list, SIGTERM'd, given timeout_before_sigkill seconds to die,
 * then SIGKILL'd if still alive.  If @set_state is true, each killed
 * proc's registry state is set to ORTE_PROC_STATE_TERMINATED.
 *
 * @param job        jobid whose local procs should be killed (WILDCARD ok).
 * @param set_state  whether to update proc states on the registry afterward.
 * @return ORTE_SUCCESS, or the error from orte_dss.copy on failure
 *         (other per-child errors are tolerated and processing continues).
 */
int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
{
    orte_odls_child_t *child;
    opal_list_item_t *item, *next;
    int rc;
    /* BUG FIX: initialize exit_status.  It is only written by
     * odls_default_child_died(), which is skipped for children that were
     * never alive (goto MOVEON), yet it is read below when set_state is
     * true - reading it uninitialized is undefined behavior.
     */
    int exit_status = 0;
    opal_list_t procs_killed;
    orte_namelist_t *proc;

    OBJ_CONSTRUCT(&procs_killed, opal_list_t);

    opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: working on job %ld",
                ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)job);

    /* since we are going to be working with the global list of
     * children, we need to protect that list from modification
     * by other threads
     */
    OPAL_THREAD_LOCK(&orte_odls_default.mutex);

    for (item = opal_list_get_first(&orte_odls_default.children);
         item != opal_list_get_end(&orte_odls_default.children);
         item = next) {
        child = (orte_odls_child_t*)item;

        /* preserve the pointer to the next item in list in case we release it */
        next = opal_list_get_next(item);

        opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: checking child process [%ld,%ld,%ld]",
                    ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));

        /* do we have a child from the specified job? Because the
         * job could be given as a WILDCARD value, we must use
         * the dss.compare function to check for equality.
         */
        if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
            continue;
        }

        /* remove the child from the list since it is either already dead or soon going to be dead */
        opal_list_remove_item(&orte_odls_default.children, item);

        /* is this process alive? if not, then nothing for us
         * to do to it
         */
        if (!child->alive) {
            opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: child [%ld,%ld,%ld] is not alive",
                        ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
            /* ensure, though, that the state is terminated so we don't lockup if
             * the proc never started
             */
            goto MOVEON;
        }

        /* de-register the SIGCHILD callback for this pid */
        if (ORTE_SUCCESS != (rc = orte_wait_cb_cancel(child->pid))) {
            /* no need to error_log this - it just means that the pid is already gone */
            goto MOVEON;
        }

        /* Send a sigterm to the process. If we get ESRCH back, that
           means the process is already dead, so just move on. */
        if (0 != kill(child->pid, SIGTERM) && ESRCH != errno) {
            int err = errno;
            opal_show_help("help-odls-default.txt",
                           "odls-default:could-not-send-kill",
                           true, orte_system_info.nodename, child->pid, err);
            goto MOVEON;
        }

        /* The kill succeeded. Wait up to timeout_before_sigkill
           seconds to see if it died. */
        if (!odls_default_child_died(child->pid, orte_odls_globals.timeout_before_sigkill, &exit_status)) {
            /* try killing it again */
            kill(child->pid, SIGKILL);
            /* Double check that it actually died this time */
            if (!odls_default_child_died(child->pid, orte_odls_globals.timeout_before_sigkill, &exit_status)) {
                opal_show_help("help-odls-default.txt",
                               "odls-default:could-not-kill",
                               true, orte_system_info.nodename, child->pid);
            }
        }

MOVEON:
        /* set the process to "not alive" */
        child->alive = false;

        /* add this proc to the local list */
        proc = OBJ_NEW(orte_namelist_t);
        if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(proc->name), child->name, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            opal_condition_signal(&orte_odls_default.cond);
            OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
            return rc;
        }
        opal_list_append(&procs_killed, &proc->item);

        /* release the object since we killed it */
        OBJ_RELEASE(child);
    }

    /* we are done with the global list, so we can now release
     * any waiting threads - this also allows any callbacks to work
     */
    opal_condition_signal(&orte_odls_default.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);

    /* deconstruct the local list and update the process states on the registry, if indicated */
    while (NULL != (item = opal_list_remove_first(&procs_killed))) {
        proc = (orte_namelist_t*)item;
        if (set_state) {
            if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(proc->name, ORTE_PROC_STATE_TERMINATED, exit_status))) {
                ORTE_ERROR_LOG(rc);
                /* don't exit out even if this didn't work - we still might need to kill more
                 * processes, so just keep trucking
                 */
            }
        }
        OBJ_RELEASE(proc);
    }

    OBJ_DESTRUCT(&procs_killed);

    return ORTE_SUCCESS;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait for a callback indicating the child has completed.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
|
|
|
{
|
2006-11-11 07:03:45 +03:00
|
|
|
orte_odls_child_t *child;
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_list_item_t *item;
|
|
|
|
bool aborted;
|
|
|
|
char *job, *vpid, *abort_file;
|
|
|
|
struct stat buf;
|
|
|
|
int rc;
|
|
|
|
|
2007-04-24 22:54:45 +04:00
|
|
|
opal_output(orte_odls_globals.output, "odls: child process %ld terminated", (long)pid);
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* since we are going to be working with the global list of
|
|
|
|
* children, we need to protect that list from modification
|
|
|
|
* by other threads. This will also be used to protect us
|
|
|
|
* from race conditions on any abort situation
|
|
|
|
*/
|
|
|
|
OPAL_THREAD_LOCK(&orte_odls_default.mutex);
|
|
|
|
|
|
|
|
/* find this child */
|
|
|
|
for (item = opal_list_get_first(&orte_odls_default.children);
|
|
|
|
item != opal_list_get_end(&orte_odls_default.children);
|
|
|
|
item = opal_list_get_next(item)) {
|
2006-11-11 07:03:45 +03:00
|
|
|
child = (orte_odls_child_t*)item;
|
2006-09-15 01:29:51 +04:00
|
|
|
if (child->alive && pid == child->pid) { /* found it */
|
|
|
|
goto GOTCHILD;
|
|
|
|
}
|
|
|
|
}
|
2007-06-09 02:59:31 +04:00
|
|
|
/* get here if we didn't find the child, or if the specified child
|
|
|
|
* is already dead. If the latter, then we have a problem as it
|
|
|
|
* means we are detecting it exiting multiple times
|
2006-09-15 01:29:51 +04:00
|
|
|
*/
|
2007-06-09 02:59:31 +04:00
|
|
|
opal_output(orte_odls_globals.output, "odls: did not find pid %ld in table!", (long) pid);
|
2006-09-15 01:29:51 +04:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
|
|
|
return;
|
|
|
|
|
|
|
|
GOTCHILD:
|
2007-06-09 02:59:31 +04:00
|
|
|
/* If this child was the (vpid==0), we hooked it up to orterun's
|
|
|
|
STDIN SOURCE earlier (do not change this without also changing
|
|
|
|
odsl_default_fork_local_proc()). So we have to tell the SOURCE
|
|
|
|
a) that we don't want any more data and b) that it should not
|
|
|
|
expect any more ACKs from this endpoint (so that the svc
|
|
|
|
component can still flush/shut down cleanly).
|
|
|
|
|
|
|
|
Note that the source may have already detected that this
|
|
|
|
process died as part of an OOB/RML exception, but that's ok --
|
|
|
|
its "exception" detection capabilities are not reliable, so we
|
|
|
|
*have* to do this unpublish here, even if it arrives after an
|
|
|
|
exception is detected and handled (in which case this unpublish
|
|
|
|
request will be ignored/discarded. */
|
|
|
|
opal_output(orte_odls_globals.output,
|
|
|
|
"odls: pid %ld corresponds to [%lu,%lu,%lu]\n",
|
|
|
|
(long) pid, ORTE_NAME_ARGS(child->name));
|
|
|
|
if (0 == child->name->vpid) {
|
|
|
|
rc = orte_iof.iof_unpublish(child->name, ORTE_NS_CMP_ALL,
|
|
|
|
ORTE_IOF_STDIN);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
/* We can't really abort, so keep going... */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
opal_output(orte_odls_globals.output, "orted sent IOF unpub message!\n");
|
|
|
|
|
2007-07-13 17:05:46 +04:00
|
|
|
#if 0
|
2007-06-09 02:59:31 +04:00
|
|
|
/* Note that the svc IOF component will detect an exception on the
|
|
|
|
oob because we're shutting it down, so it will take care of
|
|
|
|
closing down any streams that it has open to us. */
|
2006-09-15 01:29:51 +04:00
|
|
|
orte_iof.iof_flush();
|
2007-07-13 17:05:46 +04:00
|
|
|
#endif
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
/* determine the state of this process */
|
|
|
|
aborted = false;
|
|
|
|
if(WIFEXITED(status)) {
|
|
|
|
/* even though the process exited "normally", it is quite
|
|
|
|
* possible that this happened via an orte_abort call - in
|
|
|
|
* which case, we need to indicate this was an "abnormal"
|
|
|
|
* termination. See the note in "orte_abort.c" for
|
|
|
|
* an explanation of this process.
|
|
|
|
*
|
|
|
|
* For our purposes here, we need to check for the existence
|
|
|
|
* of an "abort" file in this process' session directory. If
|
|
|
|
* we find it, then we know that this was an abnormal termination.
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&job, child->name->jobid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto MOVEON;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.convert_vpid_to_string(&vpid, child->name->vpid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
free(job);
|
|
|
|
goto MOVEON;
|
|
|
|
}
|
|
|
|
abort_file = opal_os_path(false, orte_process_info.universe_session_dir,
|
|
|
|
job, vpid, "abort", NULL );
|
|
|
|
free(job);
|
2006-11-16 00:12:27 +03:00
|
|
|
free(vpid);
|
2006-09-15 01:29:51 +04:00
|
|
|
if (0 == stat(abort_file, &buf)) {
|
|
|
|
/* the abort file must exist - there is nothing in it we need. It's
|
|
|
|
* meer existence indicates that an abnormal termination occurred
|
|
|
|
*/
|
2006-11-14 00:51:34 +03:00
|
|
|
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
|
|
|
|
ORTE_NAME_ARGS(child->name));
|
2006-09-15 01:29:51 +04:00
|
|
|
aborted = true;
|
|
|
|
free(abort_file);
|
2006-11-14 00:51:34 +03:00
|
|
|
} else {
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally",
|
2006-11-14 00:51:34 +03:00
|
|
|
ORTE_NAME_ARGS(child->name));
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* the process was terminated with a signal! That's definitely
|
|
|
|
* abnormal, so indicate that condition
|
|
|
|
*/
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal",
|
2006-11-14 00:51:34 +03:00
|
|
|
ORTE_NAME_ARGS(child->name));
|
2006-09-15 01:29:51 +04:00
|
|
|
aborted = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
MOVEON:
|
|
|
|
/* set this proc to "not alive" */
|
|
|
|
child->alive = false;
|
|
|
|
|
|
|
|
/* Clean up the session directory as if we were the process
|
|
|
|
* itself. This covers the case where the process died abnormally
|
|
|
|
* and didn't cleanup its own session directory.
|
|
|
|
*/
|
|
|
|
orte_session_dir_finalize(child->name);
|
|
|
|
|
2006-11-11 07:03:45 +03:00
|
|
|
/* set the proc state in the child structure */
|
|
|
|
if (aborted) {
|
|
|
|
child->state = ORTE_PROC_STATE_ABORTED;
|
|
|
|
} else {
|
|
|
|
child->state = ORTE_PROC_STATE_TERMINATED;
|
|
|
|
}
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* Need to unlock before we call set_proc_state as this is going to generate
|
|
|
|
* a trigger that will eventually callback to us
|
|
|
|
*/
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
|
|
|
|
2007-04-24 22:54:45 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(child->name, child->state, status))) {
|
2006-09-15 01:29:51 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Fork/exec the specified processes
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int odls_default_fork_local_proc(
|
|
|
|
orte_app_context_t* context,
|
2006-11-11 07:03:45 +03:00
|
|
|
orte_odls_child_t *child,
|
2006-09-15 01:29:51 +04:00
|
|
|
orte_vpid_t vpid_start,
|
|
|
|
orte_vpid_t vpid_range,
|
2007-07-02 05:33:35 +04:00
|
|
|
orte_std_cntr_t total_slots_alloc,
|
2007-02-06 22:51:05 +03:00
|
|
|
bool want_processor,
|
|
|
|
size_t processor,
|
|
|
|
bool oversubscribed,
|
2006-10-11 19:18:57 +04:00
|
|
|
char **base_environ)
|
2006-09-15 01:29:51 +04:00
|
|
|
{
|
|
|
|
pid_t pid;
|
|
|
|
orte_iof_base_io_conf_t opts;
|
|
|
|
int rc;
|
|
|
|
sigset_t sigs;
|
|
|
|
int i = 0, p[2];
|
|
|
|
|
2007-04-24 22:54:45 +04:00
|
|
|
/* check the system limits - if we are at our max allowed children, then
|
|
|
|
* we won't be allowed to do this anyway, so we may as well abort now.
|
|
|
|
* According to the documentation, num_procs = 0 is equivalent to
|
|
|
|
* to no limit, so treat it as unlimited here.
|
|
|
|
*/
|
|
|
|
if (opal_sys_limits.initialized) {
|
|
|
|
if (0 < opal_sys_limits.num_procs &&
|
|
|
|
opal_sys_limits.num_procs <= (int)opal_list_get_size(&orte_odls_default.children)) {
|
|
|
|
/* at the system limit - abort */
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
|
|
|
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
|
|
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
|
|
|
|
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* should pull this information from MPIRUN instead of going with
|
|
|
|
default */
|
|
|
|
opts.usepty = OMPI_ENABLE_PTY_SUPPORT;
|
|
|
|
|
2007-06-09 02:59:31 +04:00
|
|
|
/* BWB - Fix post beta. Should setup stdin in orterun and make
|
|
|
|
part of the app_context. Do not change this without also
|
|
|
|
changing the reverse of this in
|
|
|
|
odls_default_wait_local_proc(). */
|
2006-09-15 01:29:51 +04:00
|
|
|
if (child->name->vpid == 0) {
|
|
|
|
opts.connect_stdin = true;
|
|
|
|
} else {
|
|
|
|
opts.connect_stdin = false;
|
|
|
|
}
|
|
|
|
|
2007-04-24 22:54:45 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
|
|
child->exit_code = rc;
|
|
|
|
return rc;
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
2007-04-24 22:54:45 +04:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* A pipe is used to communicate between the parent and child to
|
|
|
|
indicate whether the exec ultiimately succeeded or failed. The
|
|
|
|
child sets the pipe to be close-on-exec; the child only ever
|
|
|
|
writes anything to the pipe if there is an error (e.g.,
|
|
|
|
executable not found, exec() fails, etc.). The parent does a
|
|
|
|
blocking read on the pipe; if the pipe closed with no data,
|
|
|
|
then the exec() succeeded. If the parent reads something from
|
|
|
|
the pipe, then the child was letting us know that it failed. */
|
|
|
|
if (pipe(p) < 0) {
|
2007-04-24 22:54:45 +04:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
|
|
|
|
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
|
|
child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
|
|
|
|
return ORTE_ERR_SYS_LIMITS_PIPES;
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Fork off the child */
|
|
|
|
pid = fork();
|
|
|
|
if(pid < 0) {
|
2007-04-24 22:54:45 +04:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
|
|
|
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
|
|
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
|
|
|
|
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (pid == 0) {
|
|
|
|
char *param, *param2;
|
|
|
|
char *uri;
|
|
|
|
char **environ_copy;
|
|
|
|
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
|
|
|
|
|
|
|
/* Setup the pipe to be close-on-exec */
|
|
|
|
close(p[0]);
|
|
|
|
fcntl(p[1], F_SETFD, FD_CLOEXEC);
|
|
|
|
|
|
|
|
/* Try to change to the context cwd and check that the app
|
2006-10-11 22:43:13 +04:00
|
|
|
exists and is executable The resource manager functions will
|
|
|
|
take care of outputting a pretty error message, if required
|
|
|
|
*/
|
2006-09-15 01:29:51 +04:00
|
|
|
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_cwd(context, true))) {
|
2006-10-11 22:43:13 +04:00
|
|
|
/* Tell the parent that Badness happened */
|
2006-09-15 01:29:51 +04:00
|
|
|
write(p[1], &i, sizeof(int));
|
2006-12-18 05:30:05 +03:00
|
|
|
exit(1);
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (i = orte_rmgr.check_context_app(context))) {
|
|
|
|
/* Tell the parent that Badness happened */
|
|
|
|
write(p[1], &i, sizeof(int));
|
2006-12-18 05:30:05 +03:00
|
|
|
exit(1);
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
|
|
|
|
2007-04-24 22:54:45 +04:00
|
|
|
/* setup stdout/stderr so that any error messages that we may
|
|
|
|
print out will get displayed back at orterun.
|
|
|
|
|
|
|
|
NOTE: Definitely do this AFTER we check contexts so that any
|
|
|
|
error message from those two functions doesn't come out to the
|
|
|
|
user. IF we didn't do it in this order, THEN a user who gives
|
|
|
|
us a bad executable name or working directory would get N
|
|
|
|
error messages, where N=num_procs. This would be very annoying
|
|
|
|
for large jobs, so instead we set things up so that orterun
|
|
|
|
always outputs a nice, single message indicating what happened
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts))) {
|
|
|
|
write(p[1], &i, sizeof(int));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* setup base environment: copy the current environ and merge
|
|
|
|
in the app context environ */
|
|
|
|
if (NULL != context->env) {
|
2006-10-11 19:18:57 +04:00
|
|
|
environ_copy = opal_environ_merge(base_environ, context->env);
|
2006-09-15 01:29:51 +04:00
|
|
|
} else {
|
2006-10-11 19:18:57 +04:00
|
|
|
environ_copy = opal_argv_copy(base_environ);
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
2006-11-13 21:51:18 +03:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* special case handling for --prefix: this is somewhat icky,
|
|
|
|
but at least some users do this. :-\ It is possible that
|
|
|
|
when using --prefix, the user will also "-x PATH" and/or
|
|
|
|
"-x LD_LIBRARY_PATH", which would therefore clobber the
|
|
|
|
work that was done in the prior pls to ensure that we have
|
|
|
|
the prefix at the beginning of the PATH and
|
|
|
|
LD_LIBRARY_PATH. So examine the context->env and see if we
|
|
|
|
find PATH or LD_LIBRARY_PATH. If found, that means the
|
|
|
|
prior work was clobbered, and we need to re-prefix those
|
|
|
|
variables. */
|
|
|
|
for (i = 0; NULL != context->env && NULL != context->env[i]; ++i) {
|
|
|
|
char *newenv;
|
|
|
|
|
|
|
|
/* Reset PATH */
|
|
|
|
if (0 == strncmp("PATH=", context->env[i], 5)) {
|
|
|
|
asprintf(&newenv, "%s/bin:%s",
|
|
|
|
context->prefix_dir, context->env[i] + 5);
|
|
|
|
opal_setenv("PATH", newenv, true, &environ_copy);
|
|
|
|
free(newenv);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Reset LD_LIBRARY_PATH */
|
|
|
|
else if (0 == strncmp("LD_LIBRARY_PATH=", context->env[i], 16)) {
|
|
|
|
asprintf(&newenv, "%s/lib:%s",
|
|
|
|
context->prefix_dir, context->env[i] + 16);
|
|
|
|
opal_setenv("LD_LIBRARY_PATH", newenv, true, &environ_copy);
|
|
|
|
free(newenv);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Bring in the code for routing xcast stage gate messages via the local orteds. This code is inactive unless you specifically request it via an mca param oob_xcast_mode (can be set to "linear" or "direct"). Direct mode is the old standard method where we send messages directly to each MPI process. Linear mode sends the xcast message via the orteds, with the HNP sending the message to each orted directly.
There is a binomial algorithm in the code (i.e., the HNP would send to a subset of the orteds, which then relay it on according to the typical log-2 algo), but that has a bug in it so the code won't let you select it even if you tried (and the mca param doesn't show, so you'd *really* have to try).
This also involved a slight change to the oob.xcast API, so propagated that as required.
Note: this has *only* been tested on rsh, SLURM, and Bproc environments (now that it has been transferred to the OMPI trunk, I'll need to re-test it [only done rsh so far]). It should work fine on any environment that uses the ORTE daemons - anywhere else, you are on your own... :-)
Also, correct a mistake where the orte_debug_flag was declared an int, but the mca param was set as a bool. Move the storage for that flag to the orte/runtime/params.c and orte/runtime/params.h files appropriately.
This commit was SVN r14475.
2007-04-23 22:41:04 +04:00
|
|
|
/* pass my contact info to the local proc so we can talk */
|
|
|
|
uri = orte_rml.get_uri();
|
|
|
|
param = mca_base_param_environ_variable("orte","local_daemon","uri");
|
|
|
|
opal_setenv(param, uri, true, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
free(uri);
|
|
|
|
|
2007-02-06 22:51:05 +03:00
|
|
|
/* setup yield schedule and processor affinity
|
|
|
|
* We default here to always setting the affinity processor if we want
|
|
|
|
* it. The processor affinity system then determines
|
|
|
|
* if processor affinity is enabled/requested - if so, it then uses
|
|
|
|
* this value to select the process to which the proc is "assigned".
|
|
|
|
* Otherwise, the paffinity subsystem just ignores this value anyway
|
|
|
|
*/
|
|
|
|
if (oversubscribed) {
|
|
|
|
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
|
|
|
|
opal_setenv(param, "1", false, &environ_copy);
|
|
|
|
} else {
|
|
|
|
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
|
|
|
|
opal_setenv(param, "0", false, &environ_copy);
|
|
|
|
}
|
|
|
|
free(param);
|
|
|
|
|
|
|
|
if (want_processor) {
|
|
|
|
param = mca_base_param_environ_variable("mpi", NULL,
|
|
|
|
"paffinity_processor");
|
|
|
|
asprintf(¶m2, "%lu", (unsigned long) processor);
|
|
|
|
opal_setenv(param, param2, false, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
free(param2);
|
|
|
|
} else {
|
|
|
|
param = mca_base_param_environ_variable("mpi", NULL,
|
|
|
|
"paffinity_processor");
|
|
|
|
opal_unsetenv(param, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
}
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* setup universe info */
|
|
|
|
if (NULL != orte_universe_info.name) {
|
|
|
|
param = mca_base_param_environ_variable("universe", NULL, NULL);
|
|
|
|
asprintf(&uri, "%s@%s:%s", orte_universe_info.uid,
|
|
|
|
orte_universe_info.host,
|
|
|
|
orte_universe_info.name);
|
|
|
|
opal_setenv(param, uri, true, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
free(uri);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* setup ns contact info */
|
|
|
|
if(NULL != orte_process_info.ns_replica_uri) {
|
|
|
|
uri = strdup(orte_process_info.ns_replica_uri);
|
|
|
|
} else {
|
|
|
|
uri = orte_rml.get_uri();
|
|
|
|
}
|
|
|
|
param = mca_base_param_environ_variable("ns","replica","uri");
|
|
|
|
opal_setenv(param, uri, true, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
free(uri);
|
|
|
|
|
|
|
|
/* setup gpr contact info */
|
|
|
|
if(NULL != orte_process_info.gpr_replica_uri) {
|
|
|
|
uri = strdup(orte_process_info.gpr_replica_uri);
|
|
|
|
} else {
|
|
|
|
uri = orte_rml.get_uri();
|
|
|
|
}
|
|
|
|
param = mca_base_param_environ_variable("gpr","replica","uri");
|
|
|
|
opal_setenv(param, uri, true, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
free(uri);
|
2006-11-13 21:51:18 +03:00
|
|
|
|
2006-12-07 06:11:20 +03:00
|
|
|
/* set the app_context number into the environment */
|
|
|
|
param = mca_base_param_environ_variable("orte","app","num");
|
|
|
|
asprintf(¶m2, "%ld", (long)child->app_idx);
|
|
|
|
opal_setenv(param, param2, true, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
free(param2);
|
|
|
|
|
2007-07-02 05:33:35 +04:00
|
|
|
/* set the universe size in the environment */
|
|
|
|
param = mca_base_param_environ_variable("orte","universe","size");
|
|
|
|
asprintf(¶m2, "%ld", (long)total_slots_alloc);
|
|
|
|
opal_setenv(param, param2, true, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
free(param2);
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* use same nodename as the starting daemon (us) */
|
|
|
|
param = mca_base_param_environ_variable("orte", "base", "nodename");
|
|
|
|
opal_setenv(param, orte_system_info.nodename, true, &environ_copy);
|
|
|
|
free(param);
|
|
|
|
|
Compute and pass the local_rank and local number of procs (in that proc's job) on the node.
To be precise, given this hypothetical launching pattern:
host1: vpids 0, 2, 4, 6
host2: vpids 1, 3, 5, 7
The local_rank for these procs would be:
host1: vpids 0->local_rank 0, v2->lr1, v4->lr2, v6->lr3
host2: vpids 1->local_rank 0, v3->lr1, v5->lr2, v7->lr3
and the number of local procs on each node would be four. If vpid=0 then does a comm_spawn of one process on host1, the values of the parent job would remain unchanged. The local_rank of the child process would be 0 and its num_local_procs would be 1 since it is in a separate jobid.
I have verified this functionality for the rsh case - need to verify that slurm and other cases also get the right values. Some consolidation of common code is probably going to occur in the SDS components to make this simpler and more maintainable in the future.
This commit was SVN r14706.
2007-05-21 18:30:10 +04:00
|
|
|
/* push data into environment */
|
2006-09-15 01:29:51 +04:00
|
|
|
orte_ns_nds_env_put(child->name, vpid_start, vpid_range,
|
Compute and pass the local_rank and local number of procs (in that proc's job) on the node.
To be precise, given this hypothetical launching pattern:
host1: vpids 0, 2, 4, 6
host2: vpids 1, 3, 5, 7
The local_rank for these procs would be:
host1: vpids 0->local_rank 0, v2->lr1, v4->lr2, v6->lr3
host2: vpids 1->local_rank 0, v3->lr1, v5->lr2, v7->lr3
and the number of local procs on each node would be four. If vpid=0 then does a comm_spawn of one process on host1, the values of the parent job would remain unchanged. The local_rank of the child process would be 0 and its num_local_procs would be 1 since it is in a separate jobid.
I have verified this functionality for the rsh case - need to verify that slurm and other cases also get the right values. Some consolidation of common code is probably going to occur in the SDS components to make this simpler and more maintainable in the future.
This commit was SVN r14706.
2007-05-21 18:30:10 +04:00
|
|
|
child->local_rank, child->num_procs,
|
2006-09-15 01:29:51 +04:00
|
|
|
&environ_copy);
|
|
|
|
|
|
|
|
/* close all file descriptors w/ exception of stdin/stdout/stderr */
|
|
|
|
for(fd=3; fd<fdmax; fd++)
|
|
|
|
close(fd);
|
|
|
|
|
|
|
|
if (context->argv == NULL) {
|
|
|
|
context->argv = malloc(sizeof(char*)*2);
|
|
|
|
context->argv[0] = strdup(context->app);
|
|
|
|
context->argv[1] = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Set signal handlers back to the default. Do this close to
|
|
|
|
the exev() because the event library may (and likely will)
|
|
|
|
reset them. If we don't do this, the event library may
|
|
|
|
have left some set that, at least on some OS's, don't get
|
|
|
|
reset via fork() or exec(). Hence, the launched process
|
|
|
|
could be unkillable (for example). */
|
|
|
|
|
|
|
|
set_handler_default(SIGTERM);
|
|
|
|
set_handler_default(SIGINT);
|
|
|
|
set_handler_default(SIGHUP);
|
|
|
|
set_handler_default(SIGPIPE);
|
|
|
|
set_handler_default(SIGCHLD);
|
|
|
|
|
|
|
|
/* Unblock all signals, for many of the same reasons that we
|
|
|
|
set the default handlers, above. This is noticable on
|
|
|
|
Linux where the event library blocks SIGTERM, but we don't
|
|
|
|
want that blocked by the launched process. */
|
|
|
|
sigprocmask(0, 0, &sigs);
|
|
|
|
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
|
|
|
|
|
|
|
/* Exec the new executable */
|
|
|
|
|
|
|
|
execve(context->app, context->argv, environ_copy);
|
2006-10-11 22:43:13 +04:00
|
|
|
opal_show_help("help-odls-default.txt", "orte-odls-default:execv-error",
|
2006-09-15 01:29:51 +04:00
|
|
|
true, context->app, strerror(errno));
|
2006-12-18 05:30:05 +03:00
|
|
|
exit(1);
|
2006-09-15 01:29:51 +04:00
|
|
|
} else {
|
|
|
|
|
|
|
|
/* connect endpoints IOF */
|
|
|
|
rc = orte_iof_base_setup_parent(child->name, &opts);
|
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Wait to read something from the pipe or close */
|
|
|
|
close(p[1]);
|
|
|
|
while (1) {
|
|
|
|
rc = read(p[0], &i, sizeof(int));
|
|
|
|
if (rc < 0) {
|
|
|
|
/* Signal interrupts are ok */
|
|
|
|
if (errno == EINTR) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Other errno's are bad */
|
2007-04-24 22:54:45 +04:00
|
|
|
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
|
|
child->exit_code = ORTE_ERR_PIPE_READ_FAILURE;
|
|
|
|
opal_output(orte_odls_globals.output, "odls: got code %d back from child", i);
|
|
|
|
return ORTE_ERR_PIPE_READ_FAILURE;
|
2006-09-15 01:29:51 +04:00
|
|
|
break;
|
|
|
|
} else if (0 == rc) {
|
|
|
|
/* Child was successful in exec'ing! */
|
|
|
|
break;
|
|
|
|
} else {
|
2007-04-24 22:54:45 +04:00
|
|
|
/* Doh -- child failed.
|
|
|
|
Let the calling function
|
|
|
|
know about the failure. The actual exit status of child proc
|
|
|
|
cannot be found here - all we can do is report the ORTE error
|
|
|
|
code that was reported back to us. The calling func needs to report the
|
|
|
|
failure to launch this process through the SMR or else
|
|
|
|
everyone else will hang.
|
2006-09-15 01:29:51 +04:00
|
|
|
*/
|
2007-04-24 22:54:45 +04:00
|
|
|
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
|
|
|
child->exit_code = i;
|
|
|
|
opal_output(orte_odls_globals.output, "odls: got code %d back from child", i);
|
2007-02-13 02:59:27 +03:00
|
|
|
return i;
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-11-11 07:03:45 +03:00
|
|
|
/* set the proc state to LAUNCHED and save the pid */
|
|
|
|
child->state = ORTE_PROC_STATE_LAUNCHED;
|
2006-09-15 01:29:51 +04:00
|
|
|
child->pid = pid;
|
|
|
|
child->alive = true;
|
|
|
|
}
|
2006-11-11 07:03:45 +03:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Launch all processes allocated to the current node.
|
|
|
|
*/
|
|
|
|
|
2006-10-11 19:18:57 +04:00
|
|
|
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ)
|
2006-09-15 01:29:51 +04:00
|
|
|
{
|
|
|
|
int rc;
|
2007-07-02 05:33:35 +04:00
|
|
|
orte_std_cntr_t i, j, kv, kv2, *sptr, total_slots_alloc;
|
2006-09-15 01:29:51 +04:00
|
|
|
orte_gpr_value_t *value, **values;
|
|
|
|
orte_gpr_keyval_t *kval;
|
|
|
|
orte_app_context_t *app;
|
|
|
|
orte_jobid_t job;
|
|
|
|
orte_vpid_t *vptr, start, range;
|
|
|
|
char *node_name;
|
|
|
|
opal_list_t app_context_list;
|
2006-11-11 07:03:45 +03:00
|
|
|
orte_odls_child_t *child;
|
2006-09-15 01:29:51 +04:00
|
|
|
odls_default_app_context_t *app_item;
|
2007-02-06 22:51:05 +03:00
|
|
|
int num_processors;
|
|
|
|
bool oversubscribed=false, want_processor, *bptr, override_oversubscribed=false;
|
2006-09-15 01:29:51 +04:00
|
|
|
opal_list_item_t *item, *item2;
|
2007-04-24 22:54:45 +04:00
|
|
|
bool quit_flag;
|
|
|
|
bool node_included;
|
2007-03-17 02:11:45 +03:00
|
|
|
orte_filem_base_request_t *filem_request;
|
Bring in the code for routing xcast stage gate messages via the local orteds. This code is inactive unless you specifically request it via an mca param oob_xcast_mode (can be set to "linear" or "direct"). Direct mode is the old standard method where we send messages directly to each MPI process. Linear mode sends the xcast message via the orteds, with the HNP sending the message to each orted directly.
There is a binomial algorithm in the code (i.e., the HNP would send to a subset of the orteds, which then relay it on according to the typical log-2 algo), but that has a bug in it so the code won't let you select it even if you tried (and the mca param doesn't show, so you'd *really* have to try).
This also involved a slight change to the oob.xcast API, so propagated that as required.
Note: this has *only* been tested on rsh, SLURM, and Bproc environments (now that it has been transferred to the OMPI trunk, I'll need to re-test it [only done rsh so far]). It should work fine on any environment that uses the ORTE daemons - anywhere else, you are on your own... :-)
Also, correct a mistake where the orte_debug_flag was declared an int, but the mca param was set as a bool. Move the storage for that flag to the orte/runtime/params.c and orte/runtime/params.h files appropriately.
This commit was SVN r14475.
2007-04-23 22:41:04 +04:00
|
|
|
char *job_str, *uri_file, *my_uri, *session_dir=NULL;
|
|
|
|
FILE *fp;
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
/* parse the returned data to create the required structures
|
|
|
|
* for a fork launch. Since the data will contain information
|
|
|
|
* on procs for ALL nodes, we first have to find the value
|
|
|
|
* struct that contains info for our node.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* first, retrieve the job number we are to launch from the
|
|
|
|
* returned data - we can extract the jobid directly from the
|
|
|
|
* subscription name we created
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&job, data->target))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2006-11-13 21:51:18 +03:00
|
|
|
|
|
|
|
opal_output(orte_odls_globals.output, "odls: setting up launch for job %ld", (long)job);
|
2007-06-18 19:39:04 +04:00
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* We need to create a list of the app_contexts
|
|
|
|
* so we can know what to launch - the process info only gives
|
|
|
|
* us an index into the app_context array, not the app_context
|
|
|
|
* info itself.
|
|
|
|
*/
|
|
|
|
|
|
|
|
OBJ_CONSTRUCT(&app_context_list, opal_list_t);
|
|
|
|
|
2006-09-19 17:05:40 +04:00
|
|
|
/* set the default values to INVALID */
|
|
|
|
start = ORTE_VPID_INVALID;
|
|
|
|
range = ORTE_VPID_INVALID;
|
|
|
|
|
2007-04-24 22:54:45 +04:00
|
|
|
/* set the flag indicating this node is not included in the launch data */
|
|
|
|
node_included = false;
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
values = (orte_gpr_value_t**)(data->values)->addr;
|
|
|
|
for (j=0, i=0; i < data->cnt && j < (data->values)->size; j++) { /* loop through all returned values */
|
|
|
|
if (NULL != values[j]) {
|
|
|
|
i++;
|
|
|
|
value = values[j];
|
|
|
|
|
2006-12-10 02:10:25 +03:00
|
|
|
if (NULL != value->tokens) {
|
2006-11-16 00:12:27 +03:00
|
|
|
/* this came from the globals container, so it must contain
|
2006-09-15 01:29:51 +04:00
|
|
|
* the app_context(s), vpid_start, and vpid_range entries. Only one
|
|
|
|
* value object should ever come from that container
|
|
|
|
*/
|
|
|
|
for (kv=0; kv < value->cnt; kv++) {
|
|
|
|
kval = value->keyvals[kv];
|
|
|
|
if (strcmp(kval->key, ORTE_JOB_VPID_START_KEY) == 0) {
|
|
|
|
/* this can only occur once, so just store it */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, kval->value, ORTE_VPID))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
start = *vptr;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (strcmp(kval->key, ORTE_JOB_VPID_RANGE_KEY) == 0) {
|
|
|
|
/* this can only occur once, so just store it */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, kval->value, ORTE_VPID))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
range = *vptr;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (strcmp(kval->key, ORTE_JOB_APP_CONTEXT_KEY) == 0) {
|
|
|
|
/* this can occur multiple times since we allow multiple
|
|
|
|
* app_contexts on the orterun command line. Add them
|
|
|
|
* to the list
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&app, kval->value, ORTE_APP_CONTEXT))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
app_item = OBJ_NEW(odls_default_app_context_t);
|
|
|
|
if (NULL == app_item) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
app_item->app_context = app;
|
|
|
|
opal_list_append(&app_context_list, &app_item->super);
|
|
|
|
kval->value->data = NULL; /* protect the data storage from later release */
|
2007-02-06 22:51:05 +03:00
|
|
|
}
|
|
|
|
if (strcmp(kval->key, ORTE_JOB_OVERSUBSCRIBE_OVERRIDE_KEY) == 0) {
|
|
|
|
/* this can only occur once, so just store it */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
override_oversubscribed = *bptr;
|
|
|
|
continue;
|
|
|
|
}
|
2007-07-02 05:33:35 +04:00
|
|
|
if (strcmp(kval->key, ORTE_JOB_TOTAL_SLOTS_ALLOC_KEY) == 0) {
|
|
|
|
/* this can only occur once, so just store it */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
total_slots_alloc = *sptr;
|
|
|
|
continue;
|
|
|
|
}
|
2006-09-15 01:29:51 +04:00
|
|
|
} /* end for loop to process global data */
|
|
|
|
} else {
|
|
|
|
/* this must have come from one of the process containers, so it must
|
|
|
|
* contain data for a proc structure - see if it
|
|
|
|
* belongs to this node
|
|
|
|
*/
|
|
|
|
for (kv=0; kv < value->cnt; kv++) {
|
|
|
|
kval = value->keyvals[kv];
|
|
|
|
if (strcmp(kval->key, ORTE_NODE_NAME_KEY) == 0) {
|
|
|
|
/* Most C-compilers will bark if we try to directly compare the string in the
|
|
|
|
* kval data area against a regular string, so we need to "get" the data
|
|
|
|
* so we can access it */
|
2006-11-16 00:12:27 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&node_name, kval->value, ORTE_STRING))) {
|
2006-09-15 01:29:51 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
/* if this is our node...must also protect against a zero-length string */
|
|
|
|
if (NULL != node_name && 0 == strcmp(node_name, orte_system_info.nodename)) {
|
2007-04-24 22:54:45 +04:00
|
|
|
/* indicate that there is something for us to do */
|
|
|
|
node_included = true;
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* ...harvest the info into a new child structure */
|
2006-11-11 07:03:45 +03:00
|
|
|
child = OBJ_NEW(orte_odls_child_t);
|
2006-09-15 01:29:51 +04:00
|
|
|
for (kv2 = 0; kv2 < value->cnt; kv2++) {
|
|
|
|
kval = value->keyvals[kv2];
|
|
|
|
if(strcmp(kval->key, ORTE_PROC_NAME_KEY) == 0) {
|
|
|
|
/* copy the name into the child object */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(child->name), kval->value->data, ORTE_NAME))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if(strcmp(kval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) {
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
child->app_idx = *sptr; /* save the index into the app_context objects */
|
|
|
|
continue;
|
|
|
|
}
|
Compute and pass the local_rank and local number of procs (in that proc's job) on the node.
To be precise, given this hypothetical launching pattern:
host1: vpids 0, 2, 4, 6
host2: vpids 1, 3, 5, 7
The local_rank for these procs would be:
host1: vpids 0->local_rank 0, v2->lr1, v4->lr2, v6->lr3
host2: vpids 1->local_rank 0, v3->lr1, v5->lr2, v7->lr3
and the number of local procs on each node would be four. If vpid=0 then does a comm_spawn of one process on host1, the values of the parent job would remain unchanged. The local_rank of the child process would be 0 and its num_local_procs would be 1 since it is in a separate jobid.
I have verified this functionality for the rsh case - need to verify that slurm and other cases also get the right values. Some consolidation of common code is probably going to occur in the SDS components to make this simpler and more maintainable in the future.
This commit was SVN r14706.
2007-05-21 18:30:10 +04:00
|
|
|
if(strcmp(kval->key, ORTE_PROC_LOCAL_RANK_KEY) == 0) {
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, kval->value, ORTE_VPID))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
child->local_rank = *vptr; /* save the local_rank */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if(strcmp(kval->key, ORTE_NODE_NUM_PROCS_KEY) == 0) {
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
child->num_procs = *sptr; /* save the number of procs from this job on this node */
|
|
|
|
continue;
|
|
|
|
}
|
2007-02-06 22:51:05 +03:00
|
|
|
if(strcmp(kval->key, ORTE_NODE_OVERSUBSCRIBED_KEY) == 0) {
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
oversubscribed = *bptr;
|
|
|
|
continue;
|
|
|
|
}
|
2006-09-15 01:29:51 +04:00
|
|
|
} /* kv2 */
|
|
|
|
/* protect operation on the global list of children */
|
|
|
|
OPAL_THREAD_LOCK(&orte_odls_default.mutex);
|
|
|
|
opal_list_append(&orte_odls_default.children, &child->super);
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} /* for kv */
|
|
|
|
}
|
|
|
|
} /* for j */
|
|
|
|
}
|
|
|
|
|
2007-04-24 22:54:45 +04:00
|
|
|
/* if there is nothing for us to do, just return */
|
|
|
|
if (!node_included) {
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
Bring in the code for routing xcast stage gate messages via the local orteds. This code is inactive unless you specifically request it via an mca param oob_xcast_mode (can be set to "linear" or "direct"). Direct mode is the old standard method where we send messages directly to each MPI process. Linear mode sends the xcast message via the orteds, with the HNP sending the message to each orted directly.
There is a binomial algorithm in the code (i.e., the HNP would send to a subset of the orteds, which then relay it on according to the typical log-2 algo), but that has a bug in it so the code won't let you select it even if you tried (and the mca param doesn't show, so you'd *really* have to try).
This also involved a slight change to the oob.xcast API, so propagated that as required.
Note: this has *only* been tested on rsh, SLURM, and Bproc environments (now that it has been transferred to the OMPI trunk, I'll need to re-test it [only done rsh so far]). It should work fine on any environment that uses the ORTE daemons - anywhere else, you are on your own... :-)
Also, correct a mistake where the orte_debug_flag was declared an int, but the mca param was set as a bool. Move the storage for that flag to the orte/runtime/params.c and orte/runtime/params.h files appropriately.
This commit was SVN r14475.
2007-04-23 22:41:04 +04:00
|
|
|
/* record my uri in a file within the session directory so the local proc
|
|
|
|
* can contact me
|
|
|
|
*/
|
|
|
|
opal_output(orte_odls_globals.output, "odls: dropping local uri file");
|
|
|
|
|
|
|
|
/* put the file in the job session dir for the job being launched */
|
|
|
|
orte_ns.convert_jobid_to_string(&job_str, job);
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_session_dir(true, NULL, NULL, NULL,
|
|
|
|
NULL, NULL, job_str, NULL))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get the session dir name so we can put the file there */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&session_dir, NULL, NULL, NULL,
|
|
|
|
NULL, NULL, NULL, job_str, NULL))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
free(job_str);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
free(job_str);
|
|
|
|
|
|
|
|
/* create the file and put my uri into it */
|
|
|
|
uri_file = opal_os_path(false, session_dir, "orted-uri.txt", NULL);
|
|
|
|
fp = fopen(uri_file, "w");
|
|
|
|
if (NULL == fp) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
|
|
|
return ORTE_ERR_FILE_OPEN_FAILURE;
|
|
|
|
}
|
|
|
|
my_uri = orte_rml.get_uri();
|
|
|
|
fprintf(fp, "%s\n", my_uri);
|
|
|
|
fclose(fp);
|
|
|
|
free(uri_file);
|
|
|
|
free(my_uri);
|
|
|
|
|
2007-06-18 19:39:04 +04:00
|
|
|
#if OPAL_ENABLE_FT == 1
|
|
|
|
/*
|
|
|
|
* Notify the local SnapC component regarding new job
|
|
|
|
*/
|
|
|
|
if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(job) ) ) {
|
|
|
|
/* Silent Failure :/ JJH */
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2007-03-17 02:11:45 +03:00
|
|
|
/* Now we preload any files that are needed. This is done on a per
|
|
|
|
* app context basis */
|
|
|
|
for (item = opal_list_get_first(&app_context_list);
|
|
|
|
item != opal_list_get_end(&app_context_list);
|
|
|
|
item = opal_list_get_next(item)) {
|
|
|
|
app_item = (odls_default_app_context_t*)item;
|
|
|
|
if(app_item->app_context->preload_binary || NULL != app_item->app_context->preload_files) {
|
|
|
|
filem_request = OBJ_NEW(orte_filem_base_request_t);
|
|
|
|
filem_request->num_procs = 1;
|
|
|
|
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
|
|
|
|
filem_request->proc_name[0].cellid = orte_process_info.gpr_replica->cellid;
|
|
|
|
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
|
|
|
|
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
|
|
|
|
if(app_item->app_context->preload_binary) {
|
|
|
|
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_binary(app_item->app_context,
|
|
|
|
filem_request) ) ){
|
|
|
|
opal_show_help("help-orte-odls-default.txt",
|
|
|
|
"orte-odls-default:could-not-preload-binary",
|
|
|
|
true, app_item->app_context->app);
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
/* Keep accumulating files anyway */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if( NULL != app_item->app_context->preload_files) {
|
|
|
|
if( ORTE_SUCCESS != (rc = orte_pls_fork_preload_append_files(app_item->app_context,
|
|
|
|
filem_request) ) ){
|
|
|
|
opal_show_help("help-orte-odls-default.txt",
|
|
|
|
"orte-odls-default:could-not-preload-files",
|
|
|
|
true, app_item->app_context->preload_files);
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
/* Keep accumulating files anyway */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Actually bring over the files */
|
|
|
|
if( ORTE_SUCCESS != (rc = orte_filem.get(filem_request)) ) {
|
|
|
|
opal_show_help("help-orte-odls-default.txt",
|
|
|
|
"orte-odls-default:could-not-preload",
|
|
|
|
true, opal_argv_join(filem_request->local_targets, ' '));
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(filem_request);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-02-06 22:51:05 +03:00
|
|
|
/* setup for processor affinity. If there are enough physical processors on this node, then
|
|
|
|
* we indicate which processor each process should be assigned to, IFF the user has requested
|
|
|
|
* processor affinity be used - the paffinity subsystem will make that final determination. All
|
|
|
|
* we do here is indicate that we should do the definitions just in case paffinity is active
|
|
|
|
*/
|
|
|
|
if (OPAL_SUCCESS != opal_get_num_processors(&num_processors)) {
|
|
|
|
/* if we cannot find the number of local processors, then default to conservative
|
|
|
|
* settings
|
|
|
|
*/
|
|
|
|
want_processor = false; /* default to not being a hog */
|
|
|
|
opal_output(orte_odls_globals.output,
|
|
|
|
"odls: could not get number of processors - using conservative settings");
|
|
|
|
} else {
|
|
|
|
opal_output(orte_odls_globals.output,
|
|
|
|
"odls: got %ld processors", (long)num_processors);
|
|
|
|
|
|
|
|
/* only do this if we can actually get info on the number of processors */
|
|
|
|
if (opal_list_get_size(&orte_odls_default.children) > (size_t)num_processors) {
|
|
|
|
want_processor = false;
|
|
|
|
} else {
|
|
|
|
want_processor = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* now let's deal with the oversubscribed flag - and the use-case where a hostfile or some
|
|
|
|
* other non-guaranteed-accurate method was used to inform us about our allocation. Since
|
|
|
|
* the information on the number of slots on this node could have been incorrect, we need
|
|
|
|
* to check it against the local number of processors to ensure we don't overload them
|
|
|
|
*/
|
|
|
|
if (override_oversubscribed) {
|
|
|
|
opal_output(orte_odls_globals.output, "odls: overriding oversubscription");
|
|
|
|
if (opal_list_get_size(&orte_odls_default.children) > (size_t)num_processors) {
|
|
|
|
/* if the #procs > #processors, declare us oversubscribed regardless
|
|
|
|
* of what the mapper claimed - the user may have told us something
|
|
|
|
* incorrect
|
|
|
|
*/
|
|
|
|
oversubscribed = true;
|
|
|
|
} else {
|
|
|
|
/* likewise, if there are more processors here than we were told,
|
|
|
|
* declare us to not be oversubscribed so we can be aggressive. This
|
|
|
|
* covers the case where the user didn't tell us anything about the
|
|
|
|
* number of available slots, so we defaulted to a value of 1
|
|
|
|
*/
|
|
|
|
oversubscribed = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
opal_output(orte_odls_globals.output, "odls: oversubscribed set to %s want_processor set to %s",
|
|
|
|
oversubscribed ? "true" : "false", want_processor ? "true" : "false");
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* okay, now let's launch our local procs using a fork/exec */
|
|
|
|
i = 0;
|
|
|
|
/* protect operations involving the global list of children */
|
|
|
|
OPAL_THREAD_LOCK(&orte_odls_default.mutex);
|
|
|
|
|
2007-04-24 22:54:45 +04:00
|
|
|
quit_flag = false;
|
2006-09-15 01:29:51 +04:00
|
|
|
for (item = opal_list_get_first(&orte_odls_default.children);
|
2007-04-24 22:54:45 +04:00
|
|
|
!quit_flag && item != opal_list_get_end(&orte_odls_default.children);
|
2006-09-15 01:29:51 +04:00
|
|
|
item = opal_list_get_next(item)) {
|
2006-11-11 07:03:45 +03:00
|
|
|
child = (orte_odls_child_t*)item;
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
/* is this child already alive? This can happen if
|
|
|
|
* we are asked to launch additional processes.
|
|
|
|
* If it has been launched, then do nothing
|
|
|
|
*/
|
|
|
|
if (child->alive) {
|
2007-04-24 22:54:45 +04:00
|
|
|
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is already alive",
|
|
|
|
ORTE_NAME_ARGS(child->name));
|
2006-09-15 01:29:51 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* do we have a child from the specified job. Because the
|
|
|
|
* job could be given as a WILDCARD value, we must use
|
|
|
|
* the dss.compare function to check for equality.
|
|
|
|
*/
|
|
|
|
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
|
2007-04-24 22:54:45 +04:00
|
|
|
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is not in job %ld being launched",
|
|
|
|
ORTE_NAME_ARGS(child->name), (long)job);
|
2006-09-15 01:29:51 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2006-11-13 21:51:18 +03:00
|
|
|
opal_output(orte_odls_globals.output, "odls: preparing to launch child [%ld, %ld, %ld]",
|
|
|
|
ORTE_NAME_ARGS(child->name));
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* find the indicated app_context in the list */
|
|
|
|
for (item2 = opal_list_get_first(&app_context_list);
|
|
|
|
item2 != opal_list_get_end(&app_context_list);
|
|
|
|
item2 = opal_list_get_next(item2)) {
|
|
|
|
app_item = (odls_default_app_context_t*)item2;
|
|
|
|
if (child->app_idx == app_item->app_context->idx) {
|
|
|
|
app = app_item->app_context;
|
|
|
|
goto DOFORK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* get here if we couldn't find the app_context */
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
|
|
|
return ORTE_ERR_NOT_FOUND;
|
|
|
|
|
|
|
|
DOFORK:
|
|
|
|
/* must unlock prior to fork to keep things clean in the
|
|
|
|
* event library
|
|
|
|
*/
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = odls_default_fork_local_proc(app, child, start,
|
2007-07-02 05:33:35 +04:00
|
|
|
range, total_slots_alloc,
|
|
|
|
want_processor,
|
2007-02-06 22:51:05 +03:00
|
|
|
i, oversubscribed,
|
|
|
|
base_environ))) {
|
2007-04-24 22:54:45 +04:00
|
|
|
/* do NOT ERROR_LOG this error - it generates
|
|
|
|
* a message/node as most errors will be common
|
|
|
|
* across the entire cluster. Instead, we let orterun
|
|
|
|
* output a consolidated error message for us
|
|
|
|
*/
|
|
|
|
quit_flag = true;
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
2006-09-22 23:24:42 +04:00
|
|
|
/* reaquire lock so we don't double unlock... */
|
|
|
|
OPAL_THREAD_LOCK(&orte_odls_default.mutex);
|
2006-09-15 01:29:51 +04:00
|
|
|
i++;
|
|
|
|
}
|
|
|
|
|
2006-11-11 07:03:45 +03:00
|
|
|
/* report the proc info and state in the registry */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_odls_base_report_spawn(&orte_odls_default.children))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* setup the waitpids on the children */
|
|
|
|
for (item = opal_list_get_first(&orte_odls_default.children);
|
|
|
|
item != opal_list_get_end(&orte_odls_default.children);
|
|
|
|
item = opal_list_get_next(item)) {
|
|
|
|
child = (orte_odls_child_t*)item;
|
|
|
|
|
|
|
|
if (ORTE_PROC_STATE_LAUNCHED == child->state) {
|
2007-05-23 20:11:50 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
2006-11-11 07:03:45 +03:00
|
|
|
orte_wait_cb(child->pid, odls_default_wait_local_proc, NULL);
|
2007-05-23 20:11:50 +04:00
|
|
|
OPAL_THREAD_LOCK(&orte_odls_default.mutex);
|
2006-11-11 07:03:45 +03:00
|
|
|
child->state = ORTE_PROC_STATE_RUNNING;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
/* cleanup */
|
|
|
|
while (NULL != (item = opal_list_remove_first(&app_context_list))) {
|
|
|
|
OBJ_RELEASE(item);
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&app_context_list);
|
|
|
|
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
2006-11-11 07:03:45 +03:00
|
|
|
return rc;
|
2006-09-15 01:29:51 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Pass a signal to my local procs
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int send_signal(pid_t pid, int signal)
|
|
|
|
{
|
|
|
|
int rc = ORTE_SUCCESS;
|
|
|
|
|
|
|
|
if (kill(pid, signal) != 0) {
|
|
|
|
switch(errno) {
|
|
|
|
case EINVAL:
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
|
|
rc = ORTE_ERR_BAD_PARAM;
|
|
|
|
break;
|
|
|
|
case ESRCH:
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
|
|
rc = ORTE_ERR_NOT_FOUND;
|
|
|
|
break;
|
|
|
|
case EPERM:
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_PERM);
|
|
|
|
rc = ORTE_ERR_PERM;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
|
|
|
rc = ORTE_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return rc;
|
|
|
|
}
|
2006-10-07 05:42:23 +04:00
|
|
|
int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal)
|
2006-09-15 01:29:51 +04:00
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
opal_list_item_t *item;
|
2006-11-11 07:03:45 +03:00
|
|
|
orte_odls_child_t *child;
|
2006-09-15 01:29:51 +04:00
|
|
|
|
|
|
|
/* protect operations involving the global list of children */
|
|
|
|
OPAL_THREAD_LOCK(&orte_odls_default.mutex);
|
|
|
|
|
|
|
|
/* if procs is NULL, then we want to signal all
|
|
|
|
* of the local procs, so just do that case
|
|
|
|
*/
|
|
|
|
if (NULL == proc) {
|
2006-09-18 16:40:42 +04:00
|
|
|
rc = ORTE_SUCCESS; /* pre-set this as an empty list causes us to drop to bottom */
|
2006-09-15 01:29:51 +04:00
|
|
|
for (item = opal_list_get_first(&orte_odls_default.children);
|
|
|
|
item != opal_list_get_end(&orte_odls_default.children);
|
|
|
|
item = opal_list_get_next(item)) {
|
2006-11-11 07:03:45 +03:00
|
|
|
child = (orte_odls_child_t*)item;
|
2006-09-15 01:29:51 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = send_signal(child->pid, (int)signal))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* we want it sent to some specified process, so find it */
|
|
|
|
for (item = opal_list_get_first(&orte_odls_default.children);
|
|
|
|
item != opal_list_get_end(&orte_odls_default.children);
|
|
|
|
item = opal_list_get_next(item)) {
|
2006-11-11 07:03:45 +03:00
|
|
|
child = (orte_odls_child_t*)item;
|
2006-10-07 05:42:23 +04:00
|
|
|
if (ORTE_EQUAL == orte_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) {
|
2006-09-15 01:29:51 +04:00
|
|
|
/* unlock before signaling as this may generate a callback */
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
|
|
|
if (ORTE_SUCCESS != (rc = send_signal(child->pid, (int)signal))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* only way to get here is if we couldn't find the specified proc.
|
|
|
|
* report that as an error and return it
|
|
|
|
*/
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
|
|
opal_condition_signal(&orte_odls_default.cond);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
|
|
|
|
return ORTE_ERR_NOT_FOUND;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
Bring in the code for routing xcast stage gate messages via the local orteds. This code is inactive unless you specifically request it via an mca param oob_xcast_mode (can be set to "linear" or "direct"). Direct mode is the old standard method where we send messages directly to each MPI process. Linear mode sends the xcast message via the orteds, with the HNP sending the message to each orted directly.
There is a binomial algorithm in the code (i.e., the HNP would send to a subset of the orteds, which then relay it on according to the typical log-2 algo), but that has a bug in it so the code won't let you select it even if you tried (and the mca param doesn't show, so you'd *really* have to try).
This also involved a slight change to the oob.xcast API, so propagated that as required.
Note: this has *only* been tested on rsh, SLURM, and Bproc environments (now that it has been transferred to the OMPI trunk, I'll need to re-test it [only done rsh so far]). It should work fine on any environment that uses the ORTE daemons - anywhere else, you are on your own... :-)
Also, correct a mistake where the orte_debug_flag was declared an int, but the mca param was set as a bool. Move the storage for that flag to the orte/runtime/params.c and orte/runtime/params.h files appropriately.
This commit was SVN r14475.
2007-04-23 22:41:04 +04:00
|
|
|
/*
 * Relay the contents of a buffer to every local child belonging to the
 * specified job.  Send failures are logged per-child but do not abort
 * delivery to the remaining children; the function always reports
 * ORTE_SUCCESS.
 */
int orte_odls_default_deliver_message(orte_jobid_t job, orte_buffer_t *buffer, orte_rml_tag_t tag)
{
    int rc;
    opal_list_item_t *elem;
    orte_odls_child_t *kid;

    /* protect operations involving the global list of children */
    OPAL_THREAD_LOCK(&orte_odls_default.mutex);

    for (elem = opal_list_get_first(&orte_odls_default.children);
         elem != opal_list_get_end(&orte_odls_default.children);
         elem = opal_list_get_next(elem)) {
        kid = (orte_odls_child_t*)elem;

        /* the job could be given as a WILDCARD value, so equality must be
         * checked through the dss.compare function rather than with ==
         */
        if (ORTE_EQUAL == orte_dss.compare(&job, &(kid->name->jobid), ORTE_JOBID)) {
            opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child [%ld, %ld, %ld]",
                        (unsigned long)tag, ORTE_NAME_ARGS(kid->name));

            /* matching child - relay the message, logging (but tolerating) failures */
            rc = orte_rml.send_buffer(kid->name, buffer, tag, 0);
            if (rc < 0) {
                ORTE_ERROR_LOG(rc);
            }
        }
    }

    opal_condition_signal(&orte_odls_default.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
    return ORTE_SUCCESS;
}
|
|
|
|
|
|
|
|
|
2006-09-15 01:29:51 +04:00
|
|
|
static void set_handler_default(int sig)
|
|
|
|
{
|
|
|
|
struct sigaction act;
|
|
|
|
|
|
|
|
act.sa_handler = SIG_DFL;
|
|
|
|
act.sa_flags = 0;
|
|
|
|
sigemptyset(&act.sa_mask);
|
|
|
|
|
|
|
|
sigaction(sig, &act, (struct sigaction *)0);
|
|
|
|
}
|
2007-03-17 02:11:45 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The difference between preloading a file, and a binary file is that
|
|
|
|
* we may need to update the app_context to reflect the placement of the binary file
|
|
|
|
* on the local machine.
|
|
|
|
*/
|
|
|
|
static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
|
|
|
|
orte_filem_base_request_t *filem_request) {
|
|
|
|
char * local_bin = NULL;
|
|
|
|
int tmp_argc = 0;
|
|
|
|
/*
|
|
|
|
* Append the local placement
|
|
|
|
*/
|
|
|
|
asprintf(&local_bin, "%s/%s", orte_process_info.job_session_dir, opal_basename(context->app));
|
|
|
|
if(is_preload_local_dup(local_bin, filem_request) ) {
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
opal_argv_append(&filem_request->num_targets, &(filem_request->local_targets), local_bin);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the remote file
|
|
|
|
*/
|
|
|
|
tmp_argc = 0;
|
|
|
|
opal_argv_append(&tmp_argc, &filem_request->remote_targets, context->app);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the flag
|
|
|
|
*/
|
|
|
|
filem_request->target_flags = (int *)realloc(filem_request->target_flags,
|
|
|
|
sizeof(int) * (filem_request->num_targets + 1));
|
|
|
|
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_FILE;
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
/*
|
|
|
|
* Adjust the process name
|
|
|
|
*/
|
|
|
|
if(NULL != context->app)
|
|
|
|
free(context->app);
|
|
|
|
context->app = local_bin;
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
|
|
|
|
orte_filem_base_request_t *filem_request) {
|
|
|
|
char * local_ref = NULL;
|
|
|
|
int i, tmp_argc = 0, remote_argc = 0;
|
|
|
|
char **remote_targets = NULL;
|
|
|
|
char * temp = NULL;
|
|
|
|
|
|
|
|
remote_targets = opal_argv_split(context->preload_files, ',');
|
|
|
|
remote_argc = opal_argv_count(remote_targets);
|
|
|
|
|
|
|
|
for(i = 0; i < remote_argc; ++i) {
|
|
|
|
if(NULL != context->preload_files_dest_dir) {
|
|
|
|
if(context->preload_files_dest_dir[0] == '.') {
|
|
|
|
asprintf(&local_ref, "%s/%s/%s", context->cwd, context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
asprintf(&local_ref, "%s/%s", context->preload_files_dest_dir, opal_basename(remote_targets[i]) );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/*
|
|
|
|
* If the preload_files_dest_dir is not specified
|
|
|
|
* If this is an absolute path, copy it to that path. Otherwise copy it to the cwd.
|
|
|
|
*/
|
|
|
|
if('/' == remote_targets[i][0]) {
|
|
|
|
asprintf(&local_ref, "%s", remote_targets[i]);
|
|
|
|
} else {
|
|
|
|
asprintf(&local_ref, "%s/%s", context->cwd, opal_basename(remote_targets[i]) );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
asprintf(&temp, "test -e %s", local_ref);
|
|
|
|
if(0 == system(temp)) {
|
|
|
|
char hostname[MAXHOSTNAMELEN];
|
|
|
|
gethostname(hostname, sizeof(hostname));
|
|
|
|
opal_show_help("help-orte-pls-fork.txt",
|
|
|
|
"orte-pls-fork:preload-file-exists",
|
|
|
|
true, local_ref, hostname);
|
|
|
|
free(temp);
|
|
|
|
temp = NULL;
|
|
|
|
free(local_ref);
|
|
|
|
local_ref = NULL;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
free(temp);
|
|
|
|
temp = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Is this a duplicate
|
|
|
|
*/
|
|
|
|
if(is_preload_local_dup(local_ref, filem_request) ) {
|
|
|
|
free(local_ref);
|
|
|
|
local_ref = NULL;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the local files we want
|
|
|
|
*/
|
|
|
|
opal_argv_append(&filem_request->num_targets, &filem_request->local_targets, local_ref);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the remote files we want
|
|
|
|
*/
|
|
|
|
tmp_argc = filem_request->num_targets - 1;
|
|
|
|
opal_argv_append(&tmp_argc, &filem_request->remote_targets, remote_targets[i]);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set the flags
|
|
|
|
*/
|
|
|
|
filem_request->target_flags = (int *)realloc(filem_request->target_flags, sizeof(int) * 1);
|
|
|
|
filem_request->target_flags[filem_request->num_targets-1] = ORTE_FILEM_TYPE_UNKNOWN;
|
|
|
|
|
|
|
|
free(local_ref);
|
|
|
|
local_ref = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(NULL != local_ref)
|
|
|
|
free(local_ref);
|
|
|
|
if(NULL != remote_targets)
|
|
|
|
opal_argv_free(remote_targets);
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Keeps us from transfering the same file more than once.
|
|
|
|
*/
|
|
|
|
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request) {
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for(i = 0; i < filem_request->num_targets; ++i) {
|
|
|
|
if(0 == strncmp(local_ref, filem_request->local_targets[i], strlen(local_ref)+1) ) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|