1
1
openmpi/orte/mca/iof/svc/iof_svc_component.c
Ralph Castain 6d6cebb4a7 Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.

I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).

This commit was SVN r12597.
2006-11-14 19:34:59 +00:00

193 строки
5.0 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "iof_svc.h"
#include "iof_svc_proxy.h"
#include "iof_svc_pub.h"
#include "iof_svc_sub.h"
/*
* Local functions
*/
static int orte_iof_svc_open(void);
static int orte_iof_svc_close(void);
static orte_iof_base_module_t* orte_iof_svc_init(
int* priority,
bool *allow_multi_user_threads,
bool *have_hidden_threads);
/*
* Local variables
*/
static bool initialized = false;
orte_iof_svc_component_t mca_iof_svc_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
/* Indicate that we are a iof v1.0.0 component (which also
implies a specific MCA version) */
ORTE_IOF_BASE_VERSION_1_0_0,
"svc", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_iof_svc_open, /* component open */
orte_iof_svc_close /* component close */
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
},
orte_iof_svc_init
}
};
#if 0
static char* orte_iof_svc_param_register_string(
const char* param_name,
const char* default_value)
{
char *param_value;
int id = mca_base_param_register_string("iof","svc",param_name,NULL,default_value);
mca_base_param_lookup_string(id, &param_value);
return param_value;
}
#endif
static int orte_iof_svc_param_register_int(
const char* param_name,
int default_value)
{
int id = mca_base_param_register_int("iof","svc",param_name,NULL,default_value);
int param_value = default_value;
mca_base_param_lookup_int(id,&param_value);
return param_value;
}
/**
* component open/close/init function
*/
static int orte_iof_svc_open(void)
{
mca_iof_svc_component.svc_debug = orte_iof_svc_param_register_int("debug", 1);
return ORTE_SUCCESS;
}
static int orte_iof_svc_close(void)
{
opal_list_item_t* item;
if (initialized) {
OPAL_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
while((item = opal_list_remove_first(&mca_iof_svc_component.svc_subscribed)) != NULL) {
OBJ_RELEASE(item);
}
while((item = opal_list_remove_first(&mca_iof_svc_component.svc_published)) != NULL) {
OBJ_RELEASE(item);
}
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_SVC);
}
return ORTE_SUCCESS;
}
/**
* Callback when peer is disconnected
*/
static void
orte_iof_svc_exception_handler(const orte_process_name_t* peer, orte_rml_exception_t reason)
{
orte_iof_svc_sub_delete_all(peer);
orte_iof_svc_pub_delete_all(peer);
}
/**
* Module Initialization
*/
static orte_iof_base_module_t*
orte_iof_svc_init(int* priority, bool *allow_multi_user_threads, bool *have_hidden_threads)
{
int rc;
if (false == orte_process_info.seed) {
return NULL;
}
*priority = 1;
*allow_multi_user_threads = true;
*have_hidden_threads = false;
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_subscribed, opal_list_t);
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_published, opal_list_t);
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_lock, opal_mutex_t);
/* post non-blocking recv */
mca_iof_svc_component.svc_iov[0].iov_base = NULL;
mca_iof_svc_component.svc_iov[0].iov_len = 0;
rc = orte_rml.recv_nb(
ORTE_NAME_WILDCARD,
mca_iof_svc_component.svc_iov,
1,
ORTE_RML_TAG_IOF_SVC,
ORTE_RML_ALLOC|ORTE_RML_PERSISTENT,
orte_iof_svc_proxy_recv,
NULL
);
if(rc != ORTE_SUCCESS) {
opal_output(0, "orte_iof_svc_init: unable to post non-blocking recv");
return NULL;
}
rc = orte_rml.add_exception_handler(orte_iof_svc_exception_handler);
initialized = true;
return &orte_iof_svc_module;
}
int
orte_iof_svc_finalize(void)
{
orte_rml.del_exception_handler(orte_iof_svc_exception_handler);
return ORTE_SUCCESS;
}