2006-02-12 04:33:29 +03:00
|
|
|
#include "orte_config.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/mca/iof/base/iof_base_header.h"
|
2005-03-29 23:40:38 +04:00
|
|
|
#include "iof_svc.h"
|
|
|
|
#include "iof_svc_proxy.h"
|
|
|
|
#include "iof_svc_pub.h"
|
|
|
|
#include "iof_svc_sub.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Class instance for a published endpoint entry.  The entry derives
 * from opal_list_item_t so it can be linked on the component's
 * svc_published list; the two trailing NULL arguments mean no
 * constructor and no destructor are registered for it.
 */
OBJ_CLASS_INSTANCE(orte_iof_svc_pub_t, opal_list_item_t, NULL, NULL);
|
2005-03-29 23:40:38 +04:00
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* (1) Create an entry to represent the published endpoint
|
|
|
|
* (2) Lookup any subscriptions that match and install on the
|
|
|
|
* subscription as a destination endpoint.
|
|
|
|
*/
|
2007-06-09 02:59:31 +04:00
|
|
|
|
2005-03-29 23:40:38 +04:00
|
|
|
int orte_iof_svc_pub_create(
|
|
|
|
const orte_process_name_t *pub_name,
|
|
|
|
const orte_process_name_t *pub_proxy,
|
|
|
|
orte_ns_cmp_bitmask_t pub_mask,
|
|
|
|
orte_iof_base_tag_t pub_tag)
|
|
|
|
{
|
2005-06-23 20:10:46 +04:00
|
|
|
orte_iof_svc_pub_t* pub;
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t* item;
|
2005-03-29 23:40:38 +04:00
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
|
2005-06-23 20:10:46 +04:00
|
|
|
|
2007-06-09 02:59:31 +04:00
|
|
|
/* has matching publish already been created? */
|
2005-07-03 20:22:16 +04:00
|
|
|
for(item = opal_list_get_first(&mca_iof_svc_component.svc_published);
|
|
|
|
item != opal_list_get_end(&mca_iof_svc_component.svc_published);
|
|
|
|
item = opal_list_get_next(item)) {
|
2005-06-23 20:10:46 +04:00
|
|
|
pub = (orte_iof_svc_pub_t*)item;
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
if(orte_ns.compare_fields(pub_mask,pub_name,&pub->pub_name) == 0 &&
|
|
|
|
orte_ns.compare_fields(ORTE_NS_CMP_ALL,pub_proxy,&pub->pub_proxy) == 0 &&
|
2005-06-23 20:10:46 +04:00
|
|
|
pub_tag == pub->pub_tag) {
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
2006-02-12 04:33:29 +03:00
|
|
|
return ORTE_SUCCESS;
|
2005-06-23 20:10:46 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-06-09 02:59:31 +04:00
|
|
|
/* create a new publish entry; associate it with the corresponding
|
|
|
|
endpoint */
|
2005-06-23 20:10:46 +04:00
|
|
|
pub = OBJ_NEW(orte_iof_svc_pub_t);
|
2005-03-29 23:40:38 +04:00
|
|
|
pub->pub_name = *pub_name;
|
|
|
|
pub->pub_proxy = *pub_proxy;
|
|
|
|
pub->pub_mask = pub_mask;
|
|
|
|
pub->pub_tag = pub_tag;
|
2007-06-09 02:59:31 +04:00
|
|
|
pub->pub_endpoint =
|
|
|
|
orte_iof_base_endpoint_match(pub_name,pub_mask,pub_tag);
|
2007-07-20 06:34:29 +04:00
|
|
|
opal_output(orte_iof_base.iof_output, "created svc pub, name %s, proxy %s, tag %d / mask %x, endpoint %p\n",
|
|
|
|
ORTE_NAME_PRINT((orte_process_name_t*)pub_name), ORTE_NAME_PRINT((orte_process_name_t*)pub_proxy),
|
2007-06-09 02:59:31 +04:00
|
|
|
pub_tag, pub_mask, (char*) pub->pub_endpoint);
|
2005-03-29 23:40:38 +04:00
|
|
|
|
|
|
|
/* append this published endpoint to any matching subscription */
|
2005-07-03 20:22:16 +04:00
|
|
|
for(item = opal_list_get_first(&mca_iof_svc_component.svc_subscribed);
|
|
|
|
item != opal_list_get_end(&mca_iof_svc_component.svc_subscribed);
|
|
|
|
item = opal_list_get_next(item)) {
|
2005-03-29 23:40:38 +04:00
|
|
|
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)item;
|
|
|
|
if(orte_iof_svc_fwd_match(sub,pub)) {
|
|
|
|
orte_iof_svc_fwd_create(sub,pub);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* append this published endpoint to the global list */
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_append(&mca_iof_svc_component.svc_published, &pub->super);
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
2005-03-29 23:40:38 +04:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2007-06-09 02:59:31 +04:00
|
|
|
|
2005-03-29 23:40:38 +04:00
|
|
|
/**
|
2007-06-09 02:59:31 +04:00
|
|
|
* Look for a matching publish
|
2005-03-29 23:40:38 +04:00
|
|
|
*/
|
|
|
|
|
|
|
|
orte_iof_svc_pub_t* orte_iof_svc_pub_lookup(
|
|
|
|
const orte_process_name_t *pub_name,
|
|
|
|
const orte_process_name_t *pub_proxy,
|
|
|
|
orte_ns_cmp_bitmask_t pub_mask,
|
|
|
|
orte_iof_base_tag_t pub_tag)
|
|
|
|
{
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t* item;
|
|
|
|
for(item = opal_list_get_first(&mca_iof_svc_component.svc_published);
|
|
|
|
item != opal_list_get_end(&mca_iof_svc_component.svc_published);
|
|
|
|
item = opal_list_get_next(item)) {
|
2005-03-29 23:40:38 +04:00
|
|
|
orte_iof_svc_pub_t* pub = (orte_iof_svc_pub_t*)item;
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_name,pub_name) == 0 &&
|
|
|
|
orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_proxy,pub_proxy) == 0 &&
|
2005-03-29 23:40:38 +04:00
|
|
|
pub->pub_mask == pub_mask &&
|
|
|
|
pub->pub_tag == pub_tag) {
|
|
|
|
return pub;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Remove the published endpoint and cleanup any associated
|
|
|
|
* forwarding entries.
|
|
|
|
*/
|
2007-06-09 02:59:31 +04:00
|
|
|
|
2005-03-29 23:40:38 +04:00
|
|
|
int orte_iof_svc_pub_delete(
|
|
|
|
const orte_process_name_t *pub_name,
|
|
|
|
const orte_process_name_t *pub_proxy,
|
|
|
|
orte_ns_cmp_bitmask_t pub_mask,
|
|
|
|
orte_iof_base_tag_t pub_tag)
|
|
|
|
{
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t* item;
|
2005-03-29 23:40:38 +04:00
|
|
|
orte_iof_svc_pub_t* pub;
|
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
|
2005-03-29 23:40:38 +04:00
|
|
|
pub = orte_iof_svc_pub_lookup(pub_name,pub_proxy,pub_mask,pub_tag);
|
|
|
|
if(NULL == pub) {
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
2005-03-29 23:40:38 +04:00
|
|
|
return ORTE_ERR_NOT_FOUND;
|
|
|
|
}
|
|
|
|
|
2005-07-03 20:22:16 +04:00
|
|
|
for(item = opal_list_get_first(&mca_iof_svc_component.svc_subscribed);
|
|
|
|
item != opal_list_get_end(&mca_iof_svc_component.svc_subscribed);
|
|
|
|
item = opal_list_get_next(item)) {
|
2005-03-29 23:40:38 +04:00
|
|
|
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)item;
|
|
|
|
if(orte_iof_svc_fwd_match(sub,pub)) {
|
|
|
|
orte_iof_svc_fwd_delete(sub,pub);
|
|
|
|
}
|
|
|
|
}
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_remove_item(&mca_iof_svc_component.svc_published, &pub->super);
|
2005-03-29 23:40:38 +04:00
|
|
|
OBJ_RELEASE(pub);
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
2005-03-29 23:40:38 +04:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
2007-06-09 02:59:31 +04:00
|
|
|
|
2005-03-29 23:40:38 +04:00
|
|
|
|
2005-10-07 01:21:26 +04:00
|
|
|
/*
|
|
|
|
* Remove all publications associated w/ the given process name.
|
|
|
|
*/
|
|
|
|
|
|
|
|
void orte_iof_svc_pub_delete_all(
|
|
|
|
const orte_process_name_t* name)
|
|
|
|
{
|
|
|
|
opal_list_item_t* p_item;
|
|
|
|
|
|
|
|
OPAL_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
|
|
|
|
p_item = opal_list_get_first(&mca_iof_svc_component.svc_published);
|
|
|
|
while(p_item != opal_list_get_end(&mca_iof_svc_component.svc_published)) {
|
|
|
|
opal_list_item_t* p_next = opal_list_get_next(p_item);
|
|
|
|
orte_iof_svc_pub_t* pub = (orte_iof_svc_pub_t*)p_item;
|
|
|
|
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_name,name) == 0 ||
|
|
|
|
orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_proxy,name) == 0) {
|
2005-10-07 01:21:26 +04:00
|
|
|
|
|
|
|
opal_list_item_t* s_item;
|
|
|
|
for(s_item = opal_list_get_first(&mca_iof_svc_component.svc_subscribed);
|
|
|
|
s_item != opal_list_get_end(&mca_iof_svc_component.svc_subscribed);
|
|
|
|
s_item = opal_list_get_next(s_item)) {
|
|
|
|
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)s_item;
|
|
|
|
if(orte_iof_svc_fwd_match(sub,pub)) {
|
|
|
|
orte_iof_svc_fwd_delete(sub,pub);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
opal_list_remove_item(&mca_iof_svc_component.svc_published, p_item);
|
|
|
|
OBJ_RELEASE(pub);
|
|
|
|
}
|
|
|
|
p_item = p_next;
|
|
|
|
}
|
|
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
|
|
|
}
|
2005-03-29 23:40:38 +04:00
|
|
|
|