2004-07-02 01:24:53 +00:00
|
|
|
/*
|
2005-11-05 19:57:48 +00:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2006-02-07 03:32:36 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2004-11-28 20:09:25 +00:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 12:43:37 +00:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2004-11-22 01:38:40 +00:00
|
|
|
* $COPYRIGHT$
|
2006-02-07 03:32:36 +00:00
|
|
|
*
|
2004-11-22 01:38:40 +00:00
|
|
|
* Additional copyrights may follow
|
2006-02-07 03:32:36 +00:00
|
|
|
*
|
2004-07-02 01:24:53 +00:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
#include "orte_config.h"
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/orte_constants.h"
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "opal/mca/mca.h"
|
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
2005-07-03 23:31:27 +00:00
|
|
|
#include "opal/util/output.h"
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/dss/dss.h"
|
2005-05-01 00:58:06 +00:00
|
|
|
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/mca/ns/base/base.h"
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
#include "orte/mca/ns/base/ns_private.h"
|
2004-07-02 01:24:53 +00:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The following file was created by configure. It contains extern
|
|
|
|
* statements and the definition of an array of pointers to each
|
2004-08-02 00:24:22 +00:00
|
|
|
* component's public mca_base_component_t struct.
|
2004-07-02 01:24:53 +00:00
|
|
|
*/
|
|
|
|
|
2005-07-04 18:24:58 +00:00
|
|
|
#include "orte/mca/ns/base/static-components.h"
|
2004-07-02 01:24:53 +00:00
|
|
|
|
2004-07-11 02:31:30 +00:00
|
|
|
/*
|
|
|
|
* globals
|
|
|
|
*/
|
|
|
|
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
/* Process name whose cell, job and vpid fields are all set to their
 * respective WILDCARD constants. */
orte_process_name_t orte_ns_name_wildcard = {ORTE_CELLID_WILDCARD, ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD};
|
|
|
|
/* Process name whose cell, job and vpid fields are all set to their
 * respective INVALID constants. */
orte_process_name_t orte_ns_name_invalid = {ORTE_CELLID_INVALID, ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
|
|
|
|
/* Statically initialised to {0, 0, 0}.
 * NOTE(review): judging by the identifier this holds the name of this
 * process's HNP and is presumably overwritten elsewhere during startup --
 * confirm against the code that sets it. */
orte_process_name_t orte_ns_name_my_hnp = {0, 0, 0};
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2004-07-02 01:24:53 +00:00
|
|
|
/*
|
|
|
|
* Global variables
|
|
|
|
*/
|
|
|
|
/* Handle of the ns framework's output stream.  Stays -1 until
 * orte_ns_base_open() stores the result of opal_output_open() here. */
int mca_ns_base_output = -1;
|
2006-08-20 15:54:04 +00:00
|
|
|
mca_ns_base_module_t orte_ns = {
|
2005-08-07 13:21:52 +00:00
|
|
|
/* init */
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_ns_base_module_init_not_available,
|
2005-08-07 13:21:52 +00:00
|
|
|
/* cell functions */
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_ns_base_create_cellid_not_available,
|
2005-05-16 21:01:09 +00:00
|
|
|
orte_ns_base_get_cell_info_not_available,
|
2005-08-07 13:21:52 +00:00
|
|
|
orte_ns_base_get_cellid_string,
|
|
|
|
orte_ns_base_convert_cellid_to_string,
|
|
|
|
orte_ns_base_convert_string_to_cellid,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
/* node functions */
|
|
|
|
orte_ns_base_create_nodeids_not_available,
|
|
|
|
orte_ns_base_get_node_info_not_available,
|
|
|
|
orte_ns_base_convert_nodeid_to_string,
|
|
|
|
orte_ns_base_convert_string_to_nodeid,
|
2005-08-07 13:21:52 +00:00
|
|
|
/* jobid functions */
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_ns_base_create_jobid_not_available,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
orte_ns_base_get_job_descendants_not_available,
|
|
|
|
orte_ns_base_get_job_children_not_available,
|
|
|
|
orte_ns_base_get_root_job_not_available,
|
|
|
|
orte_ns_base_get_parent_job_not_available,
|
2005-08-07 13:21:52 +00:00
|
|
|
orte_ns_base_get_jobid_string,
|
|
|
|
orte_ns_base_convert_jobid_to_string,
|
|
|
|
orte_ns_base_convert_string_to_jobid,
|
|
|
|
orte_ns_base_get_vpid_range_not_available,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
/* vpid functions */
|
2005-08-07 13:21:52 +00:00
|
|
|
orte_ns_base_get_vpid_string,
|
|
|
|
orte_ns_base_convert_vpid_to_string,
|
|
|
|
orte_ns_base_convert_string_to_vpid,
|
|
|
|
/* name functions */
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_ns_base_create_process_name,
|
2005-05-24 13:39:15 +00:00
|
|
|
orte_ns_base_create_my_name_not_available,
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_ns_base_convert_string_to_process_name,
|
|
|
|
orte_ns_base_get_proc_name_string,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
orte_ns_base_compare_fields,
|
2005-08-07 13:21:52 +00:00
|
|
|
/* peer functions */
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
orte_ns_base_get_peers_not_available,
|
2005-08-07 13:21:52 +00:00
|
|
|
/* tag server functions */
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_ns_base_assign_rml_tag_not_available,
|
2005-08-07 13:21:52 +00:00
|
|
|
/* data type functions */
|
2005-05-01 00:54:12 +00:00
|
|
|
orte_ns_base_define_data_type_not_available,
|
2005-08-07 13:21:52 +00:00
|
|
|
/* diagnostic functions */
|
|
|
|
orte_ns_base_dump_cells_not_available,
|
|
|
|
orte_ns_base_dump_jobs_not_available,
|
|
|
|
orte_ns_base_dump_tags_not_available,
|
|
|
|
orte_ns_base_dump_datatypes_not_available
|
2004-12-03 21:05:22 +00:00
|
|
|
};
|
2005-08-07 13:21:52 +00:00
|
|
|
|
2004-07-11 04:34:47 +00:00
|
|
|
/* Starts out false.
 * NOTE(review): presumably set to true once an ns component has been
 * selected -- confirm against the selection code. */
bool mca_ns_base_selected = false;
|
2005-07-03 16:22:16 +00:00
|
|
|
/* List passed by address to mca_base_components_open() from
 * orte_ns_base_open().
 * NOTE(review): presumably filled with the ns components that were
 * opened -- confirm against mca_base_components_open(). */
opal_list_t mca_ns_base_components_available;
|
2004-07-11 04:34:47 +00:00
|
|
|
/* NOTE(review): not referenced in this part of the file; judging by the
 * identifier it holds a copy of the chosen ns component -- confirm
 * against the selection code. */
mca_ns_base_component_t mca_ns_base_selected_component;
|
2004-07-02 01:24:53 +00:00
|
|
|
|
|
|
|
|
2004-10-20 18:18:07 +00:00
|
|
|
/* constructor - used to initialize namelist instance */
|
2006-09-14 21:29:51 +00:00
|
|
|
static void orte_namelist_construct(orte_namelist_t* list)
|
2004-10-20 18:18:07 +00:00
|
|
|
{
|
|
|
|
list->name = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* destructor - used to free any resources held by instance */
|
2006-09-14 21:29:51 +00:00
|
|
|
static void orte_namelist_destructor(orte_namelist_t* list)
|
2004-10-20 18:18:07 +00:00
|
|
|
{
|
|
|
|
if (NULL != list->name) {
|
2006-02-07 03:32:36 +00:00
|
|
|
free(list->name);
|
2004-10-20 18:18:07 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-07-03 16:06:07 +00:00
|
|
|
/*
 * Define the opal_class_t instance for orte_namelist_t.
 * Arguments, in order: type name, parent "class" name, constructor,
 * destructor.
 */
OBJ_CLASS_INSTANCE(orte_namelist_t,
                   opal_list_item_t,
                   orte_namelist_construct,
                   orte_namelist_destructor);
|
2004-10-20 18:18:07 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
2004-07-02 01:24:53 +00:00
|
|
|
/**
|
2004-07-11 04:34:47 +00:00
|
|
|
* Function for finding and opening either all MCA components, or the one
|
2004-07-02 01:24:53 +00:00
|
|
|
* that was specifically requested via a MCA parameter.
|
|
|
|
*/
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_ns_base_open(void)
|
2004-07-02 01:24:53 +00:00
|
|
|
{
|
2005-05-01 00:58:06 +00:00
|
|
|
int param, value, rc;
|
|
|
|
orte_data_type_t tmp;
|
2006-04-04 11:05:52 +00:00
|
|
|
opal_output_stream_t kill_prefix;
|
2005-03-24 20:23:15 +00:00
|
|
|
|
|
|
|
/* Debugging / verbose output */
|
2006-04-04 11:05:52 +00:00
|
|
|
/** setup the structure to kill the blasted prefix that opal_output
|
|
|
|
* now defaults to including so the output can be legible again!
|
|
|
|
*/
|
|
|
|
OBJ_CONSTRUCT(&kill_prefix, opal_output_stream_t);
|
|
|
|
kill_prefix.lds_want_stderr = true;
|
|
|
|
kill_prefix.lds_prefix = NULL;
|
|
|
|
|
2005-08-15 18:25:35 +00:00
|
|
|
param = mca_base_param_reg_int_name("ns_base", "verbose",
|
|
|
|
"Verbosity level for the ns framework",
|
|
|
|
false, false, 0, &value);
|
2005-03-24 20:23:15 +00:00
|
|
|
if (value != 0) {
|
2006-04-04 11:05:52 +00:00
|
|
|
kill_prefix.lds_verbose_level = value;
|
2005-03-24 20:23:15 +00:00
|
|
|
}
|
2006-04-04 11:05:52 +00:00
|
|
|
mca_ns_base_output = opal_output_open(&kill_prefix);
|
2004-07-12 02:44:25 +00:00
|
|
|
|
2005-05-01 00:58:06 +00:00
|
|
|
/* register the base system types with the DPS */
|
|
|
|
tmp = ORTE_NAME;
|
2006-02-07 03:32:36 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ns_base_pack_name,
|
|
|
|
orte_ns_base_unpack_name,
|
|
|
|
(orte_dss_copy_fn_t)orte_ns_base_copy_name,
|
|
|
|
(orte_dss_compare_fn_t)orte_ns_base_compare_name,
|
|
|
|
(orte_dss_size_fn_t)orte_ns_base_std_size,
|
|
|
|
(orte_dss_print_fn_t)orte_ns_base_print_name,
|
|
|
|
(orte_dss_release_fn_t)orte_ns_base_std_release,
|
|
|
|
ORTE_DSS_UNSTRUCTURED,
|
|
|
|
"ORTE_NAME", &tmp))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-05-01 00:58:06 +00:00
|
|
|
tmp = ORTE_VPID;
|
2006-02-07 03:32:36 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ns_base_pack_vpid,
|
|
|
|
orte_ns_base_unpack_vpid,
|
|
|
|
(orte_dss_copy_fn_t)orte_ns_base_copy_vpid,
|
|
|
|
(orte_dss_compare_fn_t)orte_ns_base_compare_vpid,
|
|
|
|
(orte_dss_size_fn_t)orte_ns_base_std_size,
|
|
|
|
(orte_dss_print_fn_t)orte_ns_base_std_print,
|
|
|
|
(orte_dss_release_fn_t)orte_ns_base_std_release,
|
|
|
|
ORTE_DSS_UNSTRUCTURED,
|
|
|
|
"ORTE_VPID", &tmp))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-05-01 00:58:06 +00:00
|
|
|
tmp = ORTE_JOBID;
|
2006-02-07 03:32:36 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ns_base_pack_jobid,
|
|
|
|
orte_ns_base_unpack_jobid,
|
|
|
|
(orte_dss_copy_fn_t)orte_ns_base_copy_jobid,
|
|
|
|
(orte_dss_compare_fn_t)orte_ns_base_compare_jobid,
|
|
|
|
(orte_dss_size_fn_t)orte_ns_base_std_size,
|
|
|
|
(orte_dss_print_fn_t)orte_ns_base_std_print,
|
|
|
|
(orte_dss_release_fn_t)orte_ns_base_std_release,
|
|
|
|
ORTE_DSS_UNSTRUCTURED,
|
|
|
|
"ORTE_JOBID", &tmp))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-04-13 03:19:48 +00:00
|
|
|
|
2006-02-07 03:32:36 +00:00
|
|
|
tmp = ORTE_CELLID;
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ns_base_pack_cellid,
|
|
|
|
orte_ns_base_unpack_cellid,
|
|
|
|
(orte_dss_copy_fn_t)orte_ns_base_copy_cellid,
|
|
|
|
(orte_dss_compare_fn_t)orte_ns_base_compare_cellid,
|
|
|
|
(orte_dss_size_fn_t)orte_ns_base_std_size,
|
|
|
|
(orte_dss_print_fn_t)orte_ns_base_std_print,
|
|
|
|
(orte_dss_release_fn_t)orte_ns_base_std_release,
|
|
|
|
ORTE_DSS_UNSTRUCTURED,
|
|
|
|
"ORTE_CELLID", &tmp))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Open up all available components */
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS !=
|
2005-08-15 18:25:35 +00:00
|
|
|
mca_base_components_open("ns", mca_ns_base_output,
|
2006-02-07 03:32:36 +00:00
|
|
|
mca_ns_base_static_components,
|
2005-04-13 03:19:48 +00:00
|
|
|
&mca_ns_base_components_available, true)) {
|
2005-03-24 20:23:15 +00:00
|
|
|
return ORTE_ERROR;
|
|
|
|
}
|
2004-07-12 02:44:25 +00:00
|
|
|
|
2005-03-24 20:23:15 +00:00
|
|
|
/* All done */
|
2004-07-02 01:24:53 +00:00
|
|
|
|
2005-03-24 20:23:15 +00:00
|
|
|
return ORTE_SUCCESS;
|
2004-07-02 01:24:53 +00:00
|
|
|
}
|