/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2007      Cisco, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"

#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif  /* HAVE_STRING_H */

#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/iof/base/iof_base_endpoint.h"
#include "iof_svc.h"
#include "iof_svc_pub.h"
#include "iof_svc_sub.h"

orte_iof_base_module_t orte_iof_svc_module = {
|
|
|
|
orte_iof_svc_publish,
|
|
|
|
orte_iof_svc_unpublish,
|
|
|
|
orte_iof_svc_subscribe,
|
|
|
|
orte_iof_svc_unsubscribe,
|
2007-06-08 22:59:31 +00:00
|
|
|
orte_iof_svc_push,
|
|
|
|
orte_iof_svc_pull,
|
2006-02-03 21:01:11 +00:00
|
|
|
orte_iof_base_flush,
|
2007-03-16 23:11:45 +00:00
|
|
|
orte_iof_svc_finalize,
|
|
|
|
orte_iof_svc_ft_event
|
2005-01-12 20:51:34 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2007-06-08 22:59:31 +00:00
|
|
|
/*
|
|
|
|
* Create an endpoint for a local file descriptor and "publish" it
|
|
|
|
* under the name of the origin process. If the publish mode is a
|
|
|
|
* SINK, then create a publication entry for it so that incoming
|
|
|
|
* messages can be forwarded to it.
|
2005-01-12 20:51:34 +00:00
|
|
|
*
|
2007-06-08 22:59:31 +00:00
|
|
|
* SOURCEs do not need to create publication records because a) the
|
|
|
|
* endpoint will automatically wake up the event engine and read off
|
|
|
|
* the fd whenever there is data available, and b) this data is then
|
|
|
|
* automatically sent to the iof svc component for possible
|
|
|
|
* forwarding.
|
2005-01-12 20:51:34 +00:00
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_svc_publish(
|
2007-06-08 22:59:31 +00:00
|
|
|
const orte_process_name_t* origin,
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_iof_base_mode_t mode,
|
|
|
|
orte_iof_base_tag_t tag,
|
2005-01-12 20:51:34 +00:00
|
|
|
int fd)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* setup a local endpoint to reflect registration */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_base_endpoint_create(
|
2007-06-08 22:59:31 +00:00
|
|
|
origin,
|
2005-01-12 20:51:34 +00:00
|
|
|
mode,
|
|
|
|
tag,
|
|
|
|
fd);
|
2007-06-08 22:59:31 +00:00
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* publish endpoint */
|
2007-06-08 22:59:31 +00:00
|
|
|
if (ORTE_IOF_SINK == mode) {
|
2005-03-29 19:40:38 +00:00
|
|
|
rc = orte_iof_svc_pub_create(
|
2007-06-08 22:59:31 +00:00
|
|
|
origin,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2005-01-12 20:51:34 +00:00
|
|
|
tag);
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-06-08 22:59:31 +00:00
|
|
|
/*
|
|
|
|
* Remove all registrations matching the specified origin process
|
|
|
|
* name, mask and tag values (where, here in the svc component, origin
|
|
|
|
* should usually be just this process -- ths svc component is
|
|
|
|
* unlikely to act as an IOF proxy for any other processes like the
|
|
|
|
* orted does).
|
2005-01-12 20:51:34 +00:00
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_svc_unpublish(
|
2007-06-08 22:59:31 +00:00
|
|
|
const orte_process_name_t* origin,
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_ns_cmp_bitmask_t mask,
|
|
|
|
orte_iof_base_tag_t tag)
|
2005-01-12 20:51:34 +00:00
|
|
|
{
|
|
|
|
int rc;
|
2007-07-12 19:53:18 +00:00
|
|
|
|
|
|
|
/* Delete the corresponding publish. Note that it may have
|
|
|
|
already been deleted by some other entity (e.g., message
|
|
|
|
arriving saying to unpublish), so we may get a NOT_FOUND.
|
|
|
|
That's ok/not an error -- the only end result that we want is
|
|
|
|
that there is no corresponding publish. */
|
2005-03-29 19:40:38 +00:00
|
|
|
rc = orte_iof_svc_pub_delete(
|
2007-06-08 22:59:31 +00:00
|
|
|
origin,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-01-12 20:51:34 +00:00
|
|
|
mask,
|
|
|
|
tag);
|
2007-07-12 19:53:18 +00:00
|
|
|
if (ORTE_SUCCESS != rc && ORTE_ERR_NOT_FOUND != rc) {
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
2007-06-08 22:59:31 +00:00
|
|
|
}
|
2005-01-12 20:51:34 +00:00
|
|
|
|
2007-07-12 19:53:18 +00:00
|
|
|
/* delete local endpoint. Note that the endpoint may have already
|
|
|
|
been deleted (e.g., if some entity noticed that the fd closed
|
|
|
|
and called orte_iof_base_endpoint_delete on the corresopnding
|
|
|
|
endpoint already). So if we get NOT_FOUND, ignore that error
|
|
|
|
-- the end result is what we want: the endpoint is deleted when
|
|
|
|
we return. */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_base_endpoint_delete(
|
2007-06-08 22:59:31 +00:00
|
|
|
origin,
|
2005-01-12 20:51:34 +00:00
|
|
|
mask,
|
|
|
|
tag);
|
2007-07-12 19:53:18 +00:00
|
|
|
if (ORTE_ERR_NOT_FOUND == rc || ORTE_SUCCESS == rc) {
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
} else {
|
|
|
|
return rc;
|
|
|
|
}
|
2005-01-12 20:51:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Explicitly push data from the specified file descriptor
|
2007-06-08 22:59:31 +00:00
|
|
|
* to the indicated set of SINK peers.
|
2005-01-12 20:51:34 +00:00
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_svc_push(
|
2007-06-08 22:59:31 +00:00
|
|
|
const orte_process_name_t* sink_name,
|
|
|
|
orte_ns_cmp_bitmask_t sink_mask,
|
|
|
|
orte_iof_base_tag_t sink_tag,
|
2005-01-12 20:51:34 +00:00
|
|
|
int fd)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
2007-06-08 22:59:31 +00:00
|
|
|
/* Setup a subscription. This will be matched against a publish
|
|
|
|
of a SINK from a remote process. */
|
2005-03-29 19:40:38 +00:00
|
|
|
rc = orte_iof_svc_sub_create(
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2007-06-08 22:59:31 +00:00
|
|
|
sink_tag,
|
|
|
|
sink_name,
|
|
|
|
sink_mask,
|
|
|
|
sink_tag);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
2007-06-08 22:59:31 +00:00
|
|
|
}
|
2005-01-12 20:51:34 +00:00
|
|
|
|
2007-06-08 22:59:31 +00:00
|
|
|
/* Setup a local endpoint to reflect registration. This will
|
|
|
|
enter the fd into the event engine and wakeup when there is
|
|
|
|
data to read. The data will be put in an IOF fragment and RML
|
|
|
|
send to iof_svc_proxy_recv() (i.e., in this module!) for
|
|
|
|
handling (i.e., matching and forwarding to the publish(es) that
|
|
|
|
was(were) matched to the above subscription).
|
|
|
|
|
|
|
|
Create this endpoint *after* we make the above subscription so
|
|
|
|
that it is not found and attached to the subscription.
|
|
|
|
Instead, data that is consumed by the event engine callbacks
|
|
|
|
will be RML-sent to iof_svc_proxy_recv(), as described
|
|
|
|
above. */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_base_endpoint_create(
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_IOF_SOURCE,
|
2007-06-08 22:59:31 +00:00
|
|
|
sink_tag,
|
2005-01-12 20:51:34 +00:00
|
|
|
fd);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-06-08 22:59:31 +00:00
|
|
|
/*
|
|
|
|
* Explicitly pull data from the specified set of SOURCE peers
|
2005-01-12 20:51:34 +00:00
|
|
|
* and dump to the indicated file descriptor.
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_svc_pull(
|
2007-06-08 22:59:31 +00:00
|
|
|
const orte_process_name_t* source_name,
|
|
|
|
orte_ns_cmp_bitmask_t source_mask,
|
|
|
|
orte_iof_base_tag_t source_tag,
|
2005-01-12 20:51:34 +00:00
|
|
|
int fd)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
2007-06-08 22:59:31 +00:00
|
|
|
/* setup a local endpoint -- *before* we create the subscription
|
|
|
|
so that the subscription will find the endpoint and attach it
|
|
|
|
to the subscription */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_base_endpoint_create(
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_IOF_SINK,
|
2007-06-08 22:59:31 +00:00
|
|
|
source_tag,
|
2005-01-12 20:51:34 +00:00
|
|
|
fd);
|
2007-06-08 22:59:31 +00:00
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
2007-06-08 22:59:31 +00:00
|
|
|
}
|
2005-01-12 20:51:34 +00:00
|
|
|
|
|
|
|
/* create a subscription */
|
2005-03-29 19:40:38 +00:00
|
|
|
rc = orte_iof_svc_sub_create(
|
2007-06-08 22:59:31 +00:00
|
|
|
source_name,
|
|
|
|
source_mask,
|
|
|
|
source_tag,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2007-06-08 22:59:31 +00:00
|
|
|
source_tag);
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Subscribe to receive a callback on receipt of data
|
2007-06-08 22:59:31 +00:00
|
|
|
* from a specified set of origin peers.
|
2005-01-12 20:51:34 +00:00
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_svc_subscribe(
|
2007-06-08 22:59:31 +00:00
|
|
|
const orte_process_name_t* origin_name,
|
|
|
|
orte_ns_cmp_bitmask_t origin_mask,
|
|
|
|
orte_iof_base_tag_t origin_tag,
|
2005-11-10 04:49:51 +00:00
|
|
|
orte_iof_base_callback_fn_t cbfunc,
|
2005-01-12 20:51:34 +00:00
|
|
|
void* cbdata)
|
|
|
|
{
|
2005-11-10 04:49:51 +00:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* create a local registration to reflect the callback */
|
2007-06-08 22:59:31 +00:00
|
|
|
rc = orte_iof_base_callback_create(ORTE_PROC_MY_NAME, origin_tag,
|
|
|
|
cbfunc, cbdata);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-11-10 04:49:51 +00:00
|
|
|
return rc;
|
2007-06-08 22:59:31 +00:00
|
|
|
}
|
2005-11-10 04:49:51 +00:00
|
|
|
|
2005-01-12 20:51:34 +00:00
|
|
|
/* setup local subscription */
|
2005-11-10 04:49:51 +00:00
|
|
|
rc = orte_iof_svc_sub_create(
|
2007-06-08 22:59:31 +00:00
|
|
|
origin_name,
|
|
|
|
origin_mask,
|
|
|
|
origin_tag,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-11-10 04:49:51 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2007-06-08 22:59:31 +00:00
|
|
|
origin_tag);
|
2005-11-10 04:49:51 +00:00
|
|
|
return rc;
|
2005-01-12 20:51:34 +00:00
|
|
|
}
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_svc_unsubscribe(
|
2007-06-08 22:59:31 +00:00
|
|
|
const orte_process_name_t* origin_name,
|
|
|
|
orte_ns_cmp_bitmask_t origin_mask,
|
|
|
|
orte_iof_base_tag_t origin_tag)
|
2005-01-12 20:51:34 +00:00
|
|
|
{
|
2005-11-21 19:46:47 +00:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* delete local subscription */
|
|
|
|
rc = orte_iof_svc_sub_delete(
|
2007-06-08 22:59:31 +00:00
|
|
|
origin_name,
|
|
|
|
origin_mask,
|
|
|
|
origin_tag,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-11-21 19:46:47 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2007-06-08 22:59:31 +00:00
|
|
|
origin_tag);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-11-21 19:46:47 +00:00
|
|
|
return rc;
|
2007-06-08 22:59:31 +00:00
|
|
|
}
|
2005-11-21 19:46:47 +00:00
|
|
|
|
|
|
|
/* cleanup any locally registered callback */
|
2007-06-08 22:59:31 +00:00
|
|
|
return orte_iof_base_callback_delete(ORTE_PROC_MY_NAME, origin_tag);
|
2005-01-12 20:51:34 +00:00
|
|
|
}
|