4f3a11b4db
A bunch of fixes from the /tmp/iof-fixes branch that fix up ''some'' (but not ''all'') of the problems that we have seen with iof: * Reading very large files via stdin redirected to orteun (Sun saw this) * Reading a little bit of a large file redirected to orterun's stdin and then either closing stdin or exiting the process The Big Change was to make the proxy iof (the one running in non-HNP orteds) send back a "I'm closing the stream" ACK back to the service iof. This tells the HNP that there will be nothing more coming from that peer, and therefore the iof forward should be removed. Many other minor cleanups/fixes, terminology changes, and documentation additions are included in this commit as well. However, there are still some pretty big outstanding issues with IOF that are not addressed either by #967 or this commit. A few examples: * IOF was designed to allow multiple subscribers to a single stream. We're not entirely sure that this works (for one thing, there is nothing in the ORTE/OMPI code base that uses this functionality). * There are also resources leaked when processes/jobs exit (per Ralph's first comment on this ticket). * There is no feedback to close orterun's stdin when all subscribers to the corresponding stream have closed stdin. This commit was SVN r14967. The following Trac tickets were found above: Ticket 967 --> https://svn.open-mpi.org/trac/ompi/ticket/967
174 строки
5.8 KiB
C
174 строки
5.8 KiB
C
#include "orte_config.h"
|
|
#include "opal/util/output.h"
|
|
#include "orte/mca/oob/oob.h"
|
|
#include "orte/mca/iof/base/iof_base_header.h"
|
|
#include "iof_svc.h"
|
|
#include "iof_svc_proxy.h"
|
|
#include "iof_svc_pub.h"
|
|
#include "iof_svc_sub.h"
|
|
|
|
|
|
|
|
OBJ_CLASS_INSTANCE(
|
|
orte_iof_svc_pub_t,
|
|
opal_list_item_t,
|
|
NULL, NULL);
|
|
|
|
|
|
/**
|
|
* (1) Create an entry to represent the published endpoint
|
|
* (2) Lookup any subscriptions that match and install on the
|
|
* subscription as a destination endpoint.
|
|
*/
|
|
|
|
int orte_iof_svc_pub_create(
|
|
const orte_process_name_t *pub_name,
|
|
const orte_process_name_t *pub_proxy,
|
|
orte_ns_cmp_bitmask_t pub_mask,
|
|
orte_iof_base_tag_t pub_tag)
|
|
{
|
|
orte_iof_svc_pub_t* pub;
|
|
opal_list_item_t* item;
|
|
|
|
OPAL_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
|
|
|
|
/* has matching publish already been created? */
|
|
for(item = opal_list_get_first(&mca_iof_svc_component.svc_published);
|
|
item != opal_list_get_end(&mca_iof_svc_component.svc_published);
|
|
item = opal_list_get_next(item)) {
|
|
pub = (orte_iof_svc_pub_t*)item;
|
|
if(orte_ns.compare_fields(pub_mask,pub_name,&pub->pub_name) == 0 &&
|
|
orte_ns.compare_fields(ORTE_NS_CMP_ALL,pub_proxy,&pub->pub_proxy) == 0 &&
|
|
pub_tag == pub->pub_tag) {
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
|
return ORTE_SUCCESS;
|
|
}
|
|
}
|
|
|
|
/* create a new publish entry; associate it with the corresponding
|
|
endpoint */
|
|
pub = OBJ_NEW(orte_iof_svc_pub_t);
|
|
pub->pub_name = *pub_name;
|
|
pub->pub_proxy = *pub_proxy;
|
|
pub->pub_mask = pub_mask;
|
|
pub->pub_tag = pub_tag;
|
|
pub->pub_endpoint =
|
|
orte_iof_base_endpoint_match(pub_name,pub_mask,pub_tag);
|
|
opal_output(orte_iof_base.iof_output, "created svc pub, name [%lu,%lu,%lu], proxy [%lu,%lu,%lu], tag %d / mask %x, endpoint %p\n",
|
|
ORTE_NAME_ARGS(pub_name), ORTE_NAME_ARGS(pub_proxy),
|
|
pub_tag, pub_mask, (char*) pub->pub_endpoint);
|
|
|
|
/* append this published endpoint to any matching subscription */
|
|
for(item = opal_list_get_first(&mca_iof_svc_component.svc_subscribed);
|
|
item != opal_list_get_end(&mca_iof_svc_component.svc_subscribed);
|
|
item = opal_list_get_next(item)) {
|
|
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)item;
|
|
if(orte_iof_svc_fwd_match(sub,pub)) {
|
|
orte_iof_svc_fwd_create(sub,pub);
|
|
}
|
|
}
|
|
|
|
/* append this published endpoint to the global list */
|
|
opal_list_append(&mca_iof_svc_component.svc_published, &pub->super);
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
|
|
/**
|
|
* Look for a matching publish
|
|
*/
|
|
|
|
orte_iof_svc_pub_t* orte_iof_svc_pub_lookup(
|
|
const orte_process_name_t *pub_name,
|
|
const orte_process_name_t *pub_proxy,
|
|
orte_ns_cmp_bitmask_t pub_mask,
|
|
orte_iof_base_tag_t pub_tag)
|
|
{
|
|
opal_list_item_t* item;
|
|
for(item = opal_list_get_first(&mca_iof_svc_component.svc_published);
|
|
item != opal_list_get_end(&mca_iof_svc_component.svc_published);
|
|
item = opal_list_get_next(item)) {
|
|
orte_iof_svc_pub_t* pub = (orte_iof_svc_pub_t*)item;
|
|
if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_name,pub_name) == 0 &&
|
|
orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_proxy,pub_proxy) == 0 &&
|
|
pub->pub_mask == pub_mask &&
|
|
pub->pub_tag == pub_tag) {
|
|
return pub;
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* Remove the published endpoint and cleanup any associated
|
|
* forwarding entries.
|
|
*/
|
|
|
|
int orte_iof_svc_pub_delete(
|
|
const orte_process_name_t *pub_name,
|
|
const orte_process_name_t *pub_proxy,
|
|
orte_ns_cmp_bitmask_t pub_mask,
|
|
orte_iof_base_tag_t pub_tag)
|
|
{
|
|
opal_list_item_t* item;
|
|
orte_iof_svc_pub_t* pub;
|
|
|
|
OPAL_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
|
|
pub = orte_iof_svc_pub_lookup(pub_name,pub_proxy,pub_mask,pub_tag);
|
|
if(NULL == pub) {
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
|
return ORTE_ERR_NOT_FOUND;
|
|
}
|
|
|
|
for(item = opal_list_get_first(&mca_iof_svc_component.svc_subscribed);
|
|
item != opal_list_get_end(&mca_iof_svc_component.svc_subscribed);
|
|
item = opal_list_get_next(item)) {
|
|
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)item;
|
|
if(orte_iof_svc_fwd_match(sub,pub)) {
|
|
orte_iof_svc_fwd_delete(sub,pub);
|
|
}
|
|
}
|
|
opal_list_remove_item(&mca_iof_svc_component.svc_published, &pub->super);
|
|
OBJ_RELEASE(pub);
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* Remove all publications associated w/ the given process name.
|
|
*/
|
|
|
|
void orte_iof_svc_pub_delete_all(
|
|
const orte_process_name_t* name)
|
|
{
|
|
opal_list_item_t* p_item;
|
|
|
|
OPAL_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
|
|
p_item = opal_list_get_first(&mca_iof_svc_component.svc_published);
|
|
while(p_item != opal_list_get_end(&mca_iof_svc_component.svc_published)) {
|
|
opal_list_item_t* p_next = opal_list_get_next(p_item);
|
|
orte_iof_svc_pub_t* pub = (orte_iof_svc_pub_t*)p_item;
|
|
|
|
if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_name,name) == 0 ||
|
|
orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_proxy,name) == 0) {
|
|
|
|
opal_list_item_t* s_item;
|
|
for(s_item = opal_list_get_first(&mca_iof_svc_component.svc_subscribed);
|
|
s_item != opal_list_get_end(&mca_iof_svc_component.svc_subscribed);
|
|
s_item = opal_list_get_next(s_item)) {
|
|
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)s_item;
|
|
if(orte_iof_svc_fwd_match(sub,pub)) {
|
|
orte_iof_svc_fwd_delete(sub,pub);
|
|
}
|
|
}
|
|
opal_list_remove_item(&mca_iof_svc_component.svc_published, p_item);
|
|
OBJ_RELEASE(pub);
|
|
}
|
|
p_item = p_next;
|
|
}
|
|
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
|
}
|
|
|