/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Cisco, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
2007-06-08 22:59:31 +00:00
2006-02-12 01:33:29 +00:00
# include "orte_config.h"
2007-06-08 22:59:31 +00:00
2004-12-21 22:16:09 +00:00
# include <errno.h>
2005-12-12 20:04:00 +00:00
# ifdef HAVE_UNISTD_H
2004-12-21 22:16:09 +00:00
# include <unistd.h>
2005-12-12 20:04:00 +00:00
# endif /* HAVE_UNISTD_H */
# ifdef HAVE_STRING_H
2004-12-21 22:16:09 +00:00
# include <string.h>
2005-12-12 20:04:00 +00:00
# endif /* HAVE_STRING_H */
2004-12-21 22:16:09 +00:00
2006-02-12 01:33:29 +00:00
# include "orte/orte_constants.h"
2005-07-03 23:31:27 +00:00
# include "opal/util/output.h"
2006-02-12 01:33:29 +00:00
# include "orte/mca/iof/iof.h"
# include "orte/mca/rml/rml.h"
# include "orte/mca/rml/rml_types.h"
# include "orte/mca/iof/iof.h"
# include "orte/mca/iof/base/base.h"
# include "orte/mca/iof/base/iof_base_endpoint.h"
# include "orte/mca/errmgr/errmgr.h"
2004-12-21 22:16:09 +00:00
# include "iof_proxy.h"
2005-01-12 20:51:34 +00:00
# include "iof_proxy_svc.h"
2004-12-21 22:16:09 +00:00
2005-03-14 20:57:21 +00:00
orte_iof_base_module_t orte_iof_proxy_module = {
orte_iof_proxy_publish ,
orte_iof_proxy_unpublish ,
orte_iof_proxy_subscribe ,
orte_iof_proxy_unsubscribe ,
2007-06-08 22:59:31 +00:00
orte_iof_proxy_push ,
orte_iof_proxy_pull ,
2007-01-30 06:34:38 +00:00
orte_iof_base_flush ,
2007-03-16 23:11:45 +00:00
orte_iof_proxy_finalize ,
orte_iof_proxy_ft_event
2004-12-21 22:16:09 +00:00
} ;
2007-06-08 22:59:31 +00:00
/*
* Finalize module ; nothing to do
*/
int orte_iof_proxy_finalize ( void )
{
2007-03-16 23:11:45 +00:00
return ORTE_SUCCESS ;
}
2004-12-21 22:16:09 +00:00
/**
2007-06-08 22:59:31 +00:00
* Create an endpoint for a local file descriptor and " publish " it
* under the name of the origin process . If the publish mode is a
* SINK , then create a publication entry for it so that incoming
* messages can be forwarded to it .
2004-12-21 22:16:09 +00:00
*
2007-06-08 22:59:31 +00:00
* SOURCEs do not need to create publication records because a ) the
* endpoint will automatically wake up the event engine and read off
* the fd whenever there is data available , and b ) this data is then
* automatically sent to the iof svc component for possible
* forwarding .
2004-12-21 22:16:09 +00:00
*/
2005-03-14 20:57:21 +00:00
int orte_iof_proxy_publish (
2007-06-08 22:59:31 +00:00
const orte_process_name_t * origin ,
2005-03-14 20:57:21 +00:00
orte_iof_base_mode_t mode ,
orte_iof_base_tag_t tag ,
2004-12-21 22:16:09 +00:00
int fd )
{
2005-01-12 20:51:34 +00:00
int rc ;
2007-06-08 22:59:31 +00:00
if ( orte_iof_base . iof_output > = 0 ) {
2005-03-14 20:57:21 +00:00
char * name_str ;
2007-06-08 22:59:31 +00:00
orte_ns . get_proc_name_string ( & name_str , origin ) ;
opal_output ( orte_iof_base . iof_output ,
" orte_iof_proxy_publish(%s,%d,%d,%d) \n " ,
name_str , mode , tag , fd ) ;
2005-03-14 20:57:21 +00:00
free ( name_str ) ;
}
rc = orte_iof_base_endpoint_create (
2007-06-08 22:59:31 +00:00
origin ,
2005-01-12 20:51:34 +00:00
mode ,
tag ,
fd ) ;
2007-06-08 22:59:31 +00:00
if ( ORTE_SUCCESS ! = rc ) {
return rc ;
}
/* publish to server */
if ( ORTE_IOF_SINK = = mode ) {
rc = orte_iof_proxy_svc_publish ( origin , tag ) ;
if ( rc ! = ORTE_SUCCESS ) {
return rc ;
}
}
return ORTE_SUCCESS ;
2004-12-21 22:16:09 +00:00
}
/**
2007-06-08 22:59:31 +00:00
* Remove all registrations matching the specified origin process
2004-12-21 22:16:09 +00:00
* name , mask and tag values .
*/
2005-03-14 20:57:21 +00:00
int orte_iof_proxy_unpublish (
2007-06-08 22:59:31 +00:00
const orte_process_name_t * origin ,
2005-03-14 20:57:21 +00:00
orte_ns_cmp_bitmask_t mask ,
orte_iof_base_tag_t tag )
2004-12-21 22:16:09 +00:00
{
2005-01-12 20:51:34 +00:00
int rc ;
2007-06-08 22:59:31 +00:00
#if 0
{
int i = 0 ;
opal_output ( orte_iof_base . iof_output , " [%lu,%lu,%lu] orted: ******** ABOUT TO IOF PROXY UNPUBLISH, %d " , ORTE_NAME_ARGS ( orte_process_info . my_name ) , getpid ( ) ) ;
fflush ( stderr ) ;
while ( 0 = = i ) sleep ( 5 ) ;
}
# endif
2005-01-12 20:51:34 +00:00
/* cleanup server */
2005-03-14 20:57:21 +00:00
orte_iof_proxy_svc_unpublish (
2007-06-08 22:59:31 +00:00
origin ,
2005-01-12 20:51:34 +00:00
mask ,
tag ) ;
2007-06-08 22:59:31 +00:00
/* delete local endpoint */
2005-03-14 20:57:21 +00:00
rc = orte_iof_base_endpoint_delete (
2007-06-08 22:59:31 +00:00
origin ,
2005-01-12 20:51:34 +00:00
mask ,
tag ) ;
return rc ;
2004-12-21 22:16:09 +00:00
}
/**
* Explicitly push data from the specified file descriptor
2007-06-08 22:59:31 +00:00
* to the indicated SINK set of peers .
2004-12-21 22:16:09 +00:00
*/
2005-03-14 20:57:21 +00:00
int orte_iof_proxy_push (
2007-06-08 22:59:31 +00:00
const orte_process_name_t * sink_name ,
orte_ns_cmp_bitmask_t sink_mask ,
orte_iof_base_tag_t sink_tag ,
2004-12-21 22:16:09 +00:00
int fd )
{
2005-01-12 20:51:34 +00:00
int rc ;
2007-06-08 22:59:31 +00:00
/* setup a local endpoint to reflect registration. Do this before
we send the subscription to the server in case a callback
occurs * while * we are sending the subscription request . */
2005-03-14 20:57:21 +00:00
rc = orte_iof_base_endpoint_create (
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
ORTE_PROC_MY_NAME ,
2005-03-14 20:57:21 +00:00
ORTE_IOF_SOURCE ,
2007-06-08 22:59:31 +00:00
sink_tag ,
2005-01-12 20:51:34 +00:00
fd ) ;
2007-06-08 22:59:31 +00:00
if ( ORTE_SUCCESS ! = rc ) {
return rc ;
}
/* send a subscription to server on behalf of the destination */
rc = orte_iof_proxy_svc_subscribe (
ORTE_PROC_MY_NAME ,
ORTE_NS_CMP_ALL ,
sink_tag ,
sink_name ,
sink_mask ,
sink_tag
) ;
return rc ;
2004-12-21 22:16:09 +00:00
}
/**
2007-06-08 22:59:31 +00:00
* Explicitly pull data from the specified set of SOURCE peers and
* dump to the indicated file descriptor .
2004-12-21 22:16:09 +00:00
*/
2005-03-14 20:57:21 +00:00
int orte_iof_proxy_pull (
2007-06-08 22:59:31 +00:00
const orte_process_name_t * source_name ,
orte_ns_cmp_bitmask_t source_mask ,
orte_iof_base_tag_t source_tag ,
2004-12-21 22:16:09 +00:00
int fd )
{
2005-01-12 20:51:34 +00:00
/* setup a local endpoint */
int rc ;
2005-03-14 20:57:21 +00:00
rc = orte_iof_base_endpoint_create (
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
ORTE_PROC_MY_NAME ,
2005-03-29 19:40:38 +00:00
ORTE_IOF_SINK ,
2007-06-08 22:59:31 +00:00
source_tag ,
2005-01-12 20:51:34 +00:00
fd ) ;
2007-06-08 22:59:31 +00:00
if ( ORTE_SUCCESS ! = rc ) {
2005-03-29 19:40:38 +00:00
ORTE_ERROR_LOG ( rc ) ;
2005-01-12 20:51:34 +00:00
return rc ;
2005-03-29 19:40:38 +00:00
}
2005-01-12 20:51:34 +00:00
2005-03-29 19:40:38 +00:00
/* publish this endpoint */
rc = orte_iof_proxy_svc_publish (
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
ORTE_PROC_MY_NAME ,
2007-06-08 22:59:31 +00:00
source_tag ) ;
if ( ORTE_SUCCESS ! = rc ) {
2005-03-29 19:40:38 +00:00
ORTE_ERROR_LOG ( rc ) ;
return rc ;
}
/* subscribe to peer */
2005-03-14 20:57:21 +00:00
rc = orte_iof_proxy_svc_subscribe (
2007-06-08 22:59:31 +00:00
source_name ,
source_mask ,
source_tag ,
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
ORTE_PROC_MY_NAME ,
2005-03-14 20:57:21 +00:00
ORTE_NS_CMP_ALL ,
2007-06-08 22:59:31 +00:00
source_tag ) ;
if ( ORTE_SUCCESS ! = rc ) {
2005-03-29 19:40:38 +00:00
ORTE_ERROR_LOG ( rc ) ;
return rc ;
}
2005-01-12 20:51:34 +00:00
return rc ;
2004-12-21 22:16:09 +00:00
}
/*
* Subscribe to receive a callback on receipt of data
2007-06-08 22:59:31 +00:00
* from a specified set of origin peers .
2004-12-21 22:16:09 +00:00
*/
2005-03-14 20:57:21 +00:00
int orte_iof_proxy_subscribe (
2007-06-08 22:59:31 +00:00
const orte_process_name_t * origin_name ,
orte_ns_cmp_bitmask_t origin_mask ,
orte_iof_base_tag_t origin_tag ,
2005-11-10 04:49:51 +00:00
orte_iof_base_callback_fn_t cbfunc ,
2004-12-21 22:16:09 +00:00
void * cbdata )
{
2005-01-12 20:51:34 +00:00
int rc ;
/* create a local registration to reflect the callback */
2007-06-08 22:59:31 +00:00
rc = orte_iof_base_callback_create ( ORTE_PROC_MY_NAME , origin_tag , cbfunc , cbdata ) ;
if ( ORTE_SUCCESS ! = rc ) {
2005-11-10 04:49:51 +00:00
return rc ;
2007-06-08 22:59:31 +00:00
}
2005-01-12 20:51:34 +00:00
/* send a subscription message to the service */
2005-03-14 20:57:21 +00:00
rc = orte_iof_proxy_svc_subscribe (
2007-06-08 22:59:31 +00:00
origin_name ,
origin_mask ,
origin_tag ,
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
ORTE_PROC_MY_NAME ,
2005-03-14 20:57:21 +00:00
ORTE_NS_CMP_ALL ,
2007-06-08 22:59:31 +00:00
origin_tag ) ;
2005-01-12 20:51:34 +00:00
return rc ;
2004-12-21 22:16:09 +00:00
}
2007-06-08 22:59:31 +00:00
/*
* Remove a subscription
*/
2005-03-14 20:57:21 +00:00
int orte_iof_proxy_unsubscribe (
2007-06-08 22:59:31 +00:00
const orte_process_name_t * origin_name ,
orte_ns_cmp_bitmask_t origin_mask ,
orte_iof_base_tag_t origin_tag )
2004-12-21 22:16:09 +00:00
{
2005-01-12 20:51:34 +00:00
int rc ;
/* send an unsubscribe message to the service */
2005-03-14 20:57:21 +00:00
rc = orte_iof_proxy_svc_unsubscribe (
2007-06-08 22:59:31 +00:00
origin_name ,
origin_mask ,
origin_tag ,
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
ORTE_PROC_MY_NAME ,
2005-03-14 20:57:21 +00:00
ORTE_NS_CMP_ALL ,
2007-06-08 22:59:31 +00:00
origin_tag ) ;
if ( ORTE_SUCCESS ! = rc ) {
2005-11-10 04:49:51 +00:00
return rc ;
2007-06-08 22:59:31 +00:00
}
2005-11-10 04:49:51 +00:00
2005-01-12 20:51:34 +00:00
/* remove local callback */
2007-06-08 22:59:31 +00:00
return orte_iof_base_callback_delete ( ORTE_PROC_MY_NAME , origin_tag ) ;
2004-12-21 22:16:09 +00:00
}
2007-06-08 22:59:31 +00:00
/*
* FT event
*/
2007-03-16 23:11:45 +00:00
int orte_iof_proxy_ft_event ( int state ) {
int ret , exit_status = ORTE_SUCCESS ;
if ( OPAL_CRS_CHECKPOINT = = state ) {
/*
* Flush
*/
if ( ORTE_SUCCESS ! = ( ret = orte_iof_base_flush ( ) ) ) {
return ret ;
}
/*
* Stop receiving events
*/
orte_rml . recv_cancel ( ORTE_NAME_WILDCARD , ORTE_RML_TAG_IOF_SVC ) ;
}
else if ( OPAL_CRS_CONTINUE = = state ) {
/*
* Restart Receiving events
*/
if ( ORTE_SUCCESS ! = ( ret = orte_rml . recv_nb (
ORTE_NAME_WILDCARD ,
mca_iof_proxy_component . proxy_iov ,
1 ,
ORTE_RML_TAG_IOF_SVC ,
ORTE_RML_ALLOC | ORTE_RML_PERSISTENT ,
orte_iof_proxy_svc_recv ,
NULL
) ) ) {
exit_status = ret ;
goto cleanup ;
}
}
else if ( OPAL_CRS_RESTART = = state ) {
/*
* Restart Receiving events
*/
if ( ORTE_SUCCESS ! = ( ret = orte_rml . recv_nb (
ORTE_NAME_WILDCARD ,
mca_iof_proxy_component . proxy_iov ,
1 ,
ORTE_RML_TAG_IOF_SVC ,
ORTE_RML_ALLOC | ORTE_RML_PERSISTENT ,
orte_iof_proxy_svc_recv ,
NULL
) ) ) {
exit_status = ret ;
goto cleanup ;
}
}
else if ( OPAL_CRS_TERM = = state ) {
;
}
else {
;
}
cleanup :
return exit_status ;
}