2004-12-21 22:16:09 +00:00
|
|
|
/*
|
2007-03-16 23:11:45 +00:00
|
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
2005-11-05 19:57:48 +00:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-12-21 22:16:09 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 12:43:37 +00:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2004-12-21 22:16:09 +00:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte_config.h"
|
2004-12-21 22:16:09 +00:00
|
|
|
#include <errno.h>
|
2005-12-12 20:04:00 +00:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2004-12-21 22:16:09 +00:00
|
|
|
#include <unistd.h>
|
2005-12-12 20:04:00 +00:00
|
|
|
#endif /* HAVE_UNISTD_H */
|
|
|
|
#ifdef HAVE_STRING_H
|
2004-12-21 22:16:09 +00:00
|
|
|
#include <string.h>
|
2005-12-12 20:04:00 +00:00
|
|
|
#endif /* HAVE_STRING_H */
|
2004-12-21 22:16:09 +00:00
|
|
|
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/orte_constants.h"
|
2005-07-03 23:31:27 +00:00
|
|
|
#include "opal/util/output.h"
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/mca/iof/iof.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
#include "orte/mca/rml/rml_types.h"
|
|
|
|
#include "orte/mca/iof/iof.h"
|
|
|
|
#include "orte/mca/iof/base/base.h"
|
|
|
|
#include "orte/mca/iof/base/iof_base_endpoint.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
2004-12-21 22:16:09 +00:00
|
|
|
#include "iof_proxy.h"
|
2005-01-12 20:51:34 +00:00
|
|
|
#include "iof_proxy_svc.h"
|
2004-12-21 22:16:09 +00:00
|
|
|
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_iof_base_module_t orte_iof_proxy_module = {
|
|
|
|
orte_iof_proxy_publish,
|
|
|
|
orte_iof_proxy_unpublish,
|
|
|
|
orte_iof_proxy_push,
|
|
|
|
orte_iof_proxy_pull,
|
|
|
|
orte_iof_proxy_subscribe,
|
|
|
|
orte_iof_proxy_unsubscribe,
|
2007-01-30 06:34:38 +00:00
|
|
|
orte_iof_base_flush,
|
2007-03-16 23:11:45 +00:00
|
|
|
orte_iof_proxy_finalize,
|
|
|
|
orte_iof_proxy_ft_event
|
2004-12-21 22:16:09 +00:00
|
|
|
};
|
|
|
|
|
2007-03-16 23:11:45 +00:00
|
|
|
int orte_iof_proxy_finalize(void ) {
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
2004-12-21 22:16:09 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Publish a local file descriptor as an endpoint that is logically
|
|
|
|
* associated with the specified process name (e.g. master side of a
|
|
|
|
* pipe/pty connected to a child process)
|
|
|
|
*
|
|
|
|
* @param name
|
|
|
|
* @param mode
|
|
|
|
* @param tag
|
|
|
|
* @param fd
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_proxy_publish(
|
|
|
|
const orte_process_name_t* name,
|
|
|
|
orte_iof_base_mode_t mode,
|
|
|
|
orte_iof_base_tag_t tag,
|
2004-12-21 22:16:09 +00:00
|
|
|
int fd)
|
|
|
|
{
|
2005-01-12 20:51:34 +00:00
|
|
|
int rc;
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
if(mca_iof_proxy_component.proxy_debug > 1) {
|
|
|
|
char* name_str;
|
|
|
|
orte_ns.get_proc_name_string(&name_str, name);
|
2005-07-03 23:31:27 +00:00
|
|
|
opal_output(0, "orte_iof_proxy_publish(%s,%d,%d,%d)\n", name_str, mode, tag, fd);
|
2005-03-14 20:57:21 +00:00
|
|
|
free(name_str);
|
|
|
|
}
|
|
|
|
|
2005-01-12 20:51:34 +00:00
|
|
|
/* publish to server */
|
2005-03-14 20:57:21 +00:00
|
|
|
if(mode == ORTE_IOF_SINK) {
|
|
|
|
rc = orte_iof_proxy_svc_publish(name,tag);
|
2005-03-29 19:40:38 +00:00
|
|
|
if(rc != ORTE_SUCCESS)
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* setup a local endpoint to reflect registration */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_base_endpoint_create(
|
2005-01-12 20:51:34 +00:00
|
|
|
name,
|
|
|
|
mode,
|
|
|
|
tag,
|
|
|
|
fd);
|
|
|
|
return rc;
|
2004-12-21 22:16:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Remove all registrations matching the specified process
|
|
|
|
* name, mask and tag values.
|
|
|
|
*
|
|
|
|
* @param name
|
|
|
|
* @param mask
|
|
|
|
* @param tag
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_proxy_unpublish(
|
|
|
|
const orte_process_name_t* name,
|
|
|
|
orte_ns_cmp_bitmask_t mask,
|
|
|
|
orte_iof_base_tag_t tag)
|
2004-12-21 22:16:09 +00:00
|
|
|
{
|
2005-01-12 20:51:34 +00:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* cleanup server */
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_iof_proxy_svc_unpublish(
|
2005-01-12 20:51:34 +00:00
|
|
|
name,
|
|
|
|
mask,
|
|
|
|
tag);
|
|
|
|
|
|
|
|
/* setup a local endpoint to reflect registration */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_base_endpoint_delete(
|
2005-01-12 20:51:34 +00:00
|
|
|
name,
|
|
|
|
mask,
|
|
|
|
tag);
|
|
|
|
return rc;
|
2004-12-21 22:16:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Explicitly push data from the specified file descriptor
|
|
|
|
* to the indicated set of peers.
|
|
|
|
*
|
|
|
|
* @param dst_name Name used to qualify set of peers.
|
|
|
|
* @param dst_mask Mask that specified how name is interpreted.
|
|
|
|
* @param dst_tag Match a specific peer endpoint.
|
|
|
|
* @param fd Local file descriptor.
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_proxy_push(
|
|
|
|
const orte_process_name_t* dst_name,
|
|
|
|
orte_ns_cmp_bitmask_t dst_mask,
|
|
|
|
orte_iof_base_tag_t dst_tag,
|
2004-12-21 22:16:09 +00:00
|
|
|
int fd)
|
|
|
|
{
|
2005-01-12 20:51:34 +00:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* send a subscription to server on behalf of the destination */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_proxy_svc_subscribe(
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2005-01-12 20:51:34 +00:00
|
|
|
dst_tag,
|
|
|
|
dst_name,
|
|
|
|
dst_mask,
|
|
|
|
dst_tag
|
|
|
|
);
|
2005-03-29 19:40:38 +00:00
|
|
|
if(rc != ORTE_SUCCESS)
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
|
|
|
|
|
|
|
/* setup a local endpoint to reflect registration */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_base_endpoint_create(
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_IOF_SOURCE,
|
2005-01-12 20:51:34 +00:00
|
|
|
dst_tag,
|
|
|
|
fd);
|
|
|
|
|
|
|
|
return rc;
|
2004-12-21 22:16:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Explicitly pull data from the specified set of peers
|
|
|
|
* and dump to the indicated file descriptor.
|
|
|
|
*
|
|
|
|
* @param dst_name Name used to qualify set of peers.
|
|
|
|
* @param dst_mask Mask that specified how name is interpreted.
|
|
|
|
* @param dst_tag Match a specific peer endpoint.
|
|
|
|
* @param fd Local file descriptor.
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_proxy_pull(
|
|
|
|
const orte_process_name_t* src_name,
|
|
|
|
orte_ns_cmp_bitmask_t src_mask,
|
|
|
|
orte_iof_base_tag_t src_tag,
|
2004-12-21 22:16:09 +00:00
|
|
|
int fd)
|
|
|
|
{
|
2005-01-12 20:51:34 +00:00
|
|
|
/* setup a local endpoint */
|
|
|
|
int rc;
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_base_endpoint_create(
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-29 19:40:38 +00:00
|
|
|
ORTE_IOF_SINK,
|
2005-01-12 20:51:34 +00:00
|
|
|
src_tag,
|
|
|
|
fd);
|
2005-03-29 19:40:38 +00:00
|
|
|
if(rc != ORTE_SUCCESS) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
2005-03-29 19:40:38 +00:00
|
|
|
}
|
2005-01-12 20:51:34 +00:00
|
|
|
|
2005-03-29 19:40:38 +00:00
|
|
|
/* publish this endpoint */
|
|
|
|
rc = orte_iof_proxy_svc_publish(
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-29 19:40:38 +00:00
|
|
|
src_tag);
|
|
|
|
if(rc != ORTE_SUCCESS) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* subscribe to peer */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_proxy_svc_subscribe(
|
2005-01-12 20:51:34 +00:00
|
|
|
src_name,
|
|
|
|
src_mask,
|
|
|
|
src_tag,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2005-01-12 20:51:34 +00:00
|
|
|
src_tag);
|
2005-03-29 19:40:38 +00:00
|
|
|
if(rc != ORTE_SUCCESS) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-01-12 20:51:34 +00:00
|
|
|
return rc;
|
2004-12-21 22:16:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Setup buffering for a specified set of endpoints.
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_proxy_buffer(
|
|
|
|
const orte_process_name_t* src_name,
|
|
|
|
orte_ns_cmp_bitmask_t src_mask,
|
|
|
|
orte_iof_base_tag_t src_tag,
|
2004-12-21 22:16:09 +00:00
|
|
|
size_t buffer_size)
|
|
|
|
{
|
2005-03-29 19:40:38 +00:00
|
|
|
return ORTE_ERROR;
|
2004-12-21 22:16:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Subscribe to receive a callback on receipt of data
|
|
|
|
* from a specified set of peers.
|
|
|
|
*/
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_proxy_subscribe(
|
|
|
|
const orte_process_name_t* src_name,
|
|
|
|
orte_ns_cmp_bitmask_t src_mask,
|
|
|
|
orte_iof_base_tag_t src_tag,
|
2005-11-10 04:49:51 +00:00
|
|
|
orte_iof_base_callback_fn_t cbfunc,
|
2004-12-21 22:16:09 +00:00
|
|
|
void* cbdata)
|
|
|
|
{
|
2005-01-12 20:51:34 +00:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* create a local registration to reflect the callback */
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
rc = orte_iof_base_callback_create(ORTE_PROC_MY_NAME,src_tag,cbfunc,cbdata);
|
2006-02-12 01:33:29 +00:00
|
|
|
if(rc != ORTE_SUCCESS)
|
2005-11-10 04:49:51 +00:00
|
|
|
return rc;
|
2005-01-12 20:51:34 +00:00
|
|
|
|
|
|
|
/* send a subscription message to the service */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_proxy_svc_subscribe(
|
2005-01-12 20:51:34 +00:00
|
|
|
src_name,
|
|
|
|
src_mask,
|
|
|
|
src_tag,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2005-01-12 20:51:34 +00:00
|
|
|
src_tag);
|
|
|
|
return rc;
|
2004-12-21 22:16:09 +00:00
|
|
|
}
|
|
|
|
|
2005-03-14 20:57:21 +00:00
|
|
|
int orte_iof_proxy_unsubscribe(
|
|
|
|
const orte_process_name_t* src_name,
|
|
|
|
orte_ns_cmp_bitmask_t src_mask,
|
|
|
|
orte_iof_base_tag_t src_tag)
|
2004-12-21 22:16:09 +00:00
|
|
|
{
|
2005-01-12 20:51:34 +00:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* send an unsubscribe message to the service */
|
2005-03-14 20:57:21 +00:00
|
|
|
rc = orte_iof_proxy_svc_unsubscribe(
|
2005-01-12 20:51:34 +00:00
|
|
|
src_name,
|
|
|
|
src_mask,
|
|
|
|
src_tag,
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
ORTE_PROC_MY_NAME,
|
2005-03-14 20:57:21 +00:00
|
|
|
ORTE_NS_CMP_ALL,
|
2005-01-12 20:51:34 +00:00
|
|
|
src_tag);
|
2006-02-12 01:33:29 +00:00
|
|
|
if(rc != ORTE_SUCCESS)
|
2005-11-10 04:49:51 +00:00
|
|
|
return rc;
|
|
|
|
|
2005-01-12 20:51:34 +00:00
|
|
|
/* remove local callback */
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
return orte_iof_base_callback_delete(ORTE_PROC_MY_NAME,src_tag);
|
2004-12-21 22:16:09 +00:00
|
|
|
}
|
|
|
|
|
2007-03-16 23:11:45 +00:00
|
|
|
int orte_iof_proxy_ft_event(int state) {
|
|
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
|
|
|
|
|
|
if(OPAL_CRS_CHECKPOINT == state) {
|
|
|
|
/*
|
|
|
|
* Flush
|
|
|
|
*/
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_iof_base_flush() ) ) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Stop receiving events
|
|
|
|
*/
|
|
|
|
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_SVC);
|
|
|
|
}
|
|
|
|
else if(OPAL_CRS_CONTINUE == state) {
|
|
|
|
/*
|
|
|
|
* Restart Receiving events
|
|
|
|
*/
|
|
|
|
if(ORTE_SUCCESS != (ret = orte_rml.recv_nb(
|
|
|
|
ORTE_NAME_WILDCARD,
|
|
|
|
mca_iof_proxy_component.proxy_iov,
|
|
|
|
1,
|
|
|
|
ORTE_RML_TAG_IOF_SVC,
|
|
|
|
ORTE_RML_ALLOC|ORTE_RML_PERSISTENT,
|
|
|
|
orte_iof_proxy_svc_recv,
|
|
|
|
NULL
|
|
|
|
) ) ) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
else if(OPAL_CRS_RESTART == state) {
|
|
|
|
/*
|
|
|
|
* Restart Receiving events
|
|
|
|
*/
|
|
|
|
if(ORTE_SUCCESS != (ret = orte_rml.recv_nb(
|
|
|
|
ORTE_NAME_WILDCARD,
|
|
|
|
mca_iof_proxy_component.proxy_iov,
|
|
|
|
1,
|
|
|
|
ORTE_RML_TAG_IOF_SVC,
|
|
|
|
ORTE_RML_ALLOC|ORTE_RML_PERSISTENT,
|
|
|
|
orte_iof_proxy_svc_recv,
|
|
|
|
NULL
|
|
|
|
) ) ) {
|
|
|
|
exit_status = ret;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if(OPAL_CRS_TERM == state ) {
|
|
|
|
;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
return exit_status;
|
|
|
|
}
|