diff --git a/orte/mca/rmgr/cnos/configure.m4 b/orte/mca/rmgr/cnos/configure.m4
index 5466d4415a..10c83365e0 100644
--- a/orte/mca/rmgr/cnos/configure.m4
+++ b/orte/mca/rmgr/cnos/configure.m4
@@ -31,5 +31,7 @@ AC_DEFUN([MCA_rmgr_cnos_CONFIG],[
                        [rmgr_cnos_happy="no"])
     fi
 
+    AC_CHECK_FUNCS([killrank cnos_pm_barrier])
+
     AS_IF([test "$rmgr_cnos_happy" = "yes"], [$1], [$2])
 ])dnl
diff --git a/orte/mca/rmgr/cnos/rmgr_cnos.c b/orte/mca/rmgr/cnos/rmgr_cnos.c
index 943cf2829c..d5053ef6fc 100644
--- a/orte/mca/rmgr/cnos/rmgr_cnos.c
+++ b/orte/mca/rmgr/cnos/rmgr_cnos.c
@@ -23,9 +23,14 @@
 #ifdef HAVE_STRING_H
 #include <string.h>
 #endif /* HAVE_STRING_H */
+#ifdef HAVE_CNOS_PM_BARRIER
+#include <catamount/cnos_mpi_os.h>
+#endif
 
 #include "orte/orte_constants.h"
 #include "orte/mca/rmgr/base/base.h"
+#include "orte/mca/ns/ns.h"
+#include "orte/util/proc_info.h"
 
 #include "rmgr_cnos.h"
 
@@ -61,6 +66,8 @@ static int orte_rmgr_cnos_spawn(
     orte_rmgr_cb_fn_t cbfn,
     orte_proc_state_t cb_conditions);
 
+static int orte_rmgr_cnos_finalize(void);
+
 orte_rmgr_base_module_t orte_rmgr_cnos_module = {
     orte_rmgr_cnos_query,
     orte_rmgr_cnos_create,
@@ -73,7 +80,7 @@ orte_rmgr_base_module_t orte_rmgr_cnos_module = {
     orte_rmgr_cnos_spawn,
     orte_rmgr_base_proc_stage_gate_init,
     orte_rmgr_base_proc_stage_gate_mgr,
-    NULL, /* finalize */
+    orte_rmgr_cnos_finalize
 };
 
 
@@ -114,15 +121,67 @@ static int orte_rmgr_cnos_launch(orte_jobid_t jobid)
     return ORTE_ERR_NOT_SUPPORTED;
 }
 
+#ifdef HAVE_KILLRANK
+#include "catamount/types.h"
+/* secret sauce on the Cray machine */
+extern int killrank(rank_t RANK, int SIG);
+#endif
+
+/*
+ * Terminate a whole job.  With killrank() available, only the caller's
+ * own job can be killed (rank -1 is passed to killrank()); any other
+ * jobid yields ORTE_ERR_NOT_SUPPORTED.  Without killrank(), the calling
+ * process simply exits.
+ */
 static int orte_rmgr_cnos_terminate_job(orte_jobid_t jobid)
 {
-    abort();
+#ifdef HAVE_KILLRANK
+    orte_jobid_t my_jobid;
+
+    orte_ns.get_jobid(&my_jobid, orte_process_info.my_name);
+
+    /* make sure it's my job */
+    if (jobid == my_jobid) {
+        killrank(-1, SIGKILL);
+    } else {
+        return ORTE_ERR_NOT_SUPPORTED;
+    }
+#else
+    exit(0);
+#endif
+
     return ORTE_SUCCESS;
 }
 
+/*
+ * Terminate a single process.  With killrank() available, the target
+ * must belong to the caller's own job; its vpid is used as the rank
+ * passed to killrank().  Any other job yields ORTE_ERR_NOT_SUPPORTED.
+ * Without killrank(), the calling process simply exits.
+ */
 static int orte_rmgr_cnos_terminate_proc(const orte_process_name_t* proc_name)
 {
-    abort();
+#ifdef HAVE_KILLRANK
+    orte_jobid_t my_jobid;
+    orte_jobid_t his_jobid;
+    orte_vpid_t his_vpid;
+
+    orte_ns.get_jobid(&my_jobid, orte_process_info.my_name);
+    orte_ns.get_jobid(&his_jobid, proc_name);
+
+    orte_ns.get_vpid(&his_vpid, proc_name);
+
+    /* make sure the target is in my job.  This may end up killing me,
+       but what the heck. */
+    if (his_jobid == my_jobid) {
+        killrank((int) his_vpid, SIGKILL);
+    } else {
+        return ORTE_ERR_NOT_SUPPORTED;
+    }
+#else
+    exit(0);
+#endif
+
     return ORTE_SUCCESS;
 }
 
@@ -138,4 +197,18 @@ static int orte_rmgr_cnos_spawn(
 }
 
 
+/*
+ * Module finalize hook: calls cnos_pm_barrier(1) when available and
+ * always returns ORTE_SUCCESS.
+ * NOTE(review): the comment inside the body is identical to the one in
+ * orte_rmgr_cnos_init(), which calls cnos_pm_barrier(0) -- confirm it
+ * describes the finalize-time call accurately.
+ */
+static int orte_rmgr_cnos_finalize(void)
+{
+#ifdef HAVE_CNOS_PM_BARRIER
+    /* register with the process manager so that everyone aborts if
+       any one process aborts.  This is a bit slower than it needs to
+       be, but useful. */
+    cnos_pm_barrier(1);
+#endif
+    return ORTE_SUCCESS;
+}
 
diff --git a/orte/mca/rmgr/cnos/rmgr_cnos_component.c b/orte/mca/rmgr/cnos/rmgr_cnos_component.c
index 9f09ad7a03..c7d3fccafb 100644
--- a/orte/mca/rmgr/cnos/rmgr_cnos_component.c
+++ b/orte/mca/rmgr/cnos/rmgr_cnos_component.c
@@ -15,6 +15,11 @@
  */
 
 #include "orte_config.h"
+
+#ifdef HAVE_CNOS_PM_BARRIER
+#include <catamount/cnos_mpi_os.h>
+#endif
+
 #include "orte/orte_constants.h"
 #include "orte/util/proc_info.h"
 #include "opal/util/output.h"
@@ -75,6 +80,14 @@ static int orte_rmgr_cnos_open(void)
 static orte_rmgr_base_module_t *orte_rmgr_cnos_init(int* priority)
 {
     *priority = 1;
+
+#ifdef HAVE_CNOS_PM_BARRIER
+    /* register with the process manager so that everyone aborts if
+       any one process aborts.  This is a bit slower than it needs to
+       be, but useful. */
+    cnos_pm_barrier(0);
+#endif
+
     return &orte_rmgr_cnos_module;
 }
 