/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 * All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 * All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
/** @file:
 *
 * The Open MPI general purpose registry - implementation.
 *
 */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* includes
|
|
|
|
*/
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#include "orte_config.h"
|
|
|
|
|
|
|
|
#include "include/orte_constants.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
#include "util/proc_info.h"
|
|
|
|
|
|
|
|
#include "mca/errmgr/errmgr.h"
|
|
|
|
#include "mca/ns/ns_types.h"
|
|
|
|
#include "mca/oob/oob_types.h"
|
|
|
|
#include "mca/rml/rml.h"
|
2004-11-20 22:12:43 +03:00
|
|
|
|
|
|
|
#include "gpr_proxy.h"
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
int
|
2005-06-24 20:59:37 +04:00
|
|
|
orte_gpr_proxy_enter_subscription(size_t cnt, orte_gpr_subscription_t **subscriptions)
|
2004-11-20 22:12:43 +03:00
|
|
|
{
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_gpr_proxy_subscriber_t *sub;
|
2005-06-24 20:59:37 +04:00
|
|
|
size_t i, id;
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
for (i=0; i < cnt; i++) {
|
|
|
|
sub = OBJ_NEW(orte_gpr_proxy_subscriber_t);
|
|
|
|
if (NULL == sub) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
sub->callback = subscriptions[i]->cbfunc;
|
|
|
|
sub->user_tag = subscriptions[i]->user_tag;
|
2005-06-24 20:59:37 +04:00
|
|
|
if (0 > orte_pointer_array_add(&id, orte_gpr_proxy_globals.subscriptions, sub)) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
2005-06-24 20:59:37 +04:00
|
|
|
sub->id = (orte_gpr_subscription_id_t)id;
|
|
|
|
subscriptions[i]->id = sub->id;
|
|
|
|
(orte_gpr_proxy_globals.num_subs)++;
|
Well, we are getting closer to resolving the comm_spawn problem. For the benefit of those that haven't been in the midst of this discussion, the problem is that this is the first case where the process starting a set of processes has not been mpirun and is not guaranteed to be alive throughout the lifetime of the spawned processes. This sounds simple, but actually has some profound impacts.
Most of this checkin consists of more debugging stuff. Hopefully, you won't see any printf's that aren't protected by debug flags - if you do, let me know and I'll take them out with my apologies.
Outside of debugging, the biggest change was a revamp of the shutdown process. For several reasons, we had chosen to have all processes "wait" for a shutdown message before exiting. This message is typically generated by mpirun, but in the case of comm_spawn we needed to do something else. We have decided that the best way to solve this problem is to:
(a) replace the shutdown message (which needed to be generated by somebody - usually mpirun) with an oob_barrier call. This still requires that the rank 0 process be alive. However, we terminate all processes if one abnormally terminates anyway, so this isn't a problem (with the standard or our implementation); and
(b) have the state-of-health monitoring subsystem issue the call to cleanup the job from the registry. Since the state-of-health subsystem isn't available yet, we have temporarily assigned that responsibility to the rank 0 process. Once the state-of-health subsystem is available, we will have it monitor the job for all-processes-complete and then it can tell the registry to cleanup the job (i.e., remove all data relating to this job).
Hope that helps a little. I'll put all this into the design docs soon.
This commit was SVN r3754.
2004-12-09 00:44:41 +03:00
|
|
|
}
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-11-20 22:12:43 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
int
|
2005-06-24 20:59:37 +04:00
|
|
|
orte_gpr_proxy_enter_trigger(size_t cnt, orte_gpr_trigger_t **trigs)
|
2004-11-20 22:12:43 +03:00
|
|
|
{
|
2005-06-24 20:59:37 +04:00
|
|
|
size_t i;
|
|
|
|
|
|
|
|
for (i=0; i < cnt; i++) {
|
|
|
|
if (ORTE_GPR_TRIGGER_ID_MAX-1 > orte_gpr_proxy_globals.trig_cntr) {
|
|
|
|
trigs[i]->id = orte_gpr_proxy_globals.trig_cntr;
|
|
|
|
(orte_gpr_proxy_globals.trig_cntr)++;
|
|
|
|
} else {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
Well, we are getting closer to resolving the comm_spawn problem. For the benefit of those that haven't been in the midst of this discussion, the problem is that this is the first case where the process starting a set of processes has not been mpirun and is not guaranteed to be alive throughout the lifetime of the spawned processes. This sounds simple, but actually has some profound impacts.
Most of this checkin consists of more debugging stuff. Hopefully, you won't see any printf's that aren't protected by debug flags - if you do, let me know and I'll take them out with my apologies.
Outside of debugging, the biggest change was a revamp of the shutdown process. For several reasons, we had chosen to have all processes "wait" for a shutdown message before exiting. This message is typically generated by mpirun, but in the case of comm_spawn we needed to do something else. We have decided that the best way to solve this problem is to:
(a) replace the shutdown message (which needed to be generated by somebody - usually mpirun) with an oob_barrier call. This still requires that the rank 0 process be alive. However, we terminate all processes if one abnormally terminates anyway, so this isn't a problem (with the standard or our implementation); and
(b) have the state-of-health monitoring subsystem issue the call to cleanup the job from the registry. Since the state-of-health subsystem isn't available yet, we have temporarily assigned that responsibility to the rank 0 process. Once the state-of-health subsystem is available, we will have it monitor the job for all-processes-complete and then it can tell the registry to cleanup the job (i.e., remove all data relating to this job).
Hope that helps a little. I'll put all this into the design docs soon.
This commit was SVN r3754.
2004-12-09 00:44:41 +03:00
|
|
|
}
|
2005-06-24 20:59:37 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-11-20 22:12:43 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-06-24 20:59:37 +04:00
|
|
|
int
|
|
|
|
orte_gpr_proxy_remove_subscription(orte_gpr_subscription_id_t id)
|
2004-11-20 22:12:43 +03:00
|
|
|
{
|
2005-06-24 20:59:37 +04:00
|
|
|
if (NULL != (orte_gpr_proxy_globals.subscriptions)->addr[id]) {
|
|
|
|
OBJ_RELEASE((orte_gpr_proxy_globals.subscriptions)->addr[id]);
|
|
|
|
orte_pointer_array_set_item(orte_gpr_proxy_globals.subscriptions, (size_t)id, NULL);
|
2004-11-20 22:12:43 +03:00
|
|
|
}
|
2005-06-24 20:59:37 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-11-20 22:12:43 +03:00
|
|
|
}
|
|
|
|
|