2005-03-14 23:57:21 +03:00
|
|
|
/*
|
2006-03-29 05:26:16 +04:00
|
|
|
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
2005-11-05 22:57:48 +03:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2006-02-08 20:40:11 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-03-14 23:57:21 +03:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2005-03-14 23:57:21 +03:00
|
|
|
* $COPYRIGHT$
|
2006-02-08 20:40:11 +03:00
|
|
|
*
|
2005-03-14 23:57:21 +03:00
|
|
|
* Additional copyrights may follow
|
2006-02-08 20:40:11 +03:00
|
|
|
*
|
2005-03-14 23:57:21 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
2005-09-19 19:29:14 +04:00
|
|
|
#include "orte_config.h"
|
2005-12-12 23:04:00 +03:00
|
|
|
#ifdef HAVE_SYS_TIME_H
|
2005-10-27 21:04:10 +04:00
|
|
|
#include <sys/time.h>
|
2005-12-12 23:04:00 +03:00
|
|
|
#endif /* HAVE_SYS_TIME_H */
|
2005-03-14 23:57:21 +03:00
|
|
|
#include <errno.h>
|
2005-12-12 23:04:00 +03:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2005-03-14 23:57:21 +03:00
|
|
|
#include <unistd.h>
|
2005-12-12 23:04:00 +03:00
|
|
|
#endif /* HAVE_UNISTD_H */
|
|
|
|
#ifdef HAVE_STRING_H
|
2005-03-14 23:57:21 +03:00
|
|
|
#include <string.h>
|
2005-12-12 23:04:00 +03:00
|
|
|
#endif /* HAVE_STRING_H */
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2005-09-19 19:29:14 +04:00
|
|
|
#include "opal/util/trace.h"
|
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/orte_constants.h"
|
2005-09-19 19:29:14 +04:00
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/mca/rds/base/base.h"
|
|
|
|
#include "orte/mca/ras/base/base.h"
|
|
|
|
#include "orte/mca/rmaps/base/base.h"
|
|
|
|
#include "orte/mca/rmgr/base/base.h"
|
|
|
|
#include "orte/mca/pls/base/base.h"
|
|
|
|
#include "orte/mca/gpr/gpr.h"
|
|
|
|
#include "orte/mca/iof/iof.h"
|
|
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
2005-10-04 21:19:23 +04:00
|
|
|
#include "orte/mca/soh/soh.h"
|
2005-09-19 19:29:14 +04:00
|
|
|
|
|
|
|
#include "orte/mca/rmgr/urm/rmgr_urm.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
|
2005-03-31 19:47:37 +04:00
|
|
|
static int orte_rmgr_urm_query(void);
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
static int orte_rmgr_urm_create(
|
|
|
|
orte_app_context_t** app_context,
|
|
|
|
size_t num_context,
|
|
|
|
orte_jobid_t* jobid);
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_allocate(
|
|
|
|
orte_jobid_t jobid);
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_deallocate(
|
|
|
|
orte_jobid_t jobid);
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_map(
|
|
|
|
orte_jobid_t jobid);
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_launch(
|
|
|
|
orte_jobid_t jobid);
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_terminate_job(
|
|
|
|
orte_jobid_t jobid);
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_terminate_proc(
|
|
|
|
const orte_process_name_t* proc_name);
|
|
|
|
|
2006-06-08 22:27:17 +04:00
|
|
|
static int orte_rmgr_urm_signal_job(
|
|
|
|
orte_jobid_t jobid, int32_t signal);
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_signal_proc(
|
|
|
|
const orte_process_name_t* proc_name,
|
|
|
|
int32_t signal);
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
static int orte_rmgr_urm_spawn(
|
|
|
|
orte_app_context_t** app_context,
|
|
|
|
size_t num_context,
|
|
|
|
orte_jobid_t* jobid,
|
2006-02-08 20:40:11 +03:00
|
|
|
orte_rmgr_cb_fn_t cbfn,
|
|
|
|
orte_proc_state_t cb_conditions);
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2005-04-01 03:22:44 +04:00
|
|
|
static int orte_rmgr_urm_finalize(void);
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_rmgr_base_module_t orte_rmgr_urm_module = {
|
2005-03-31 19:47:37 +04:00
|
|
|
orte_rmgr_urm_query,
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_rmgr_urm_create,
|
|
|
|
orte_rmgr_urm_allocate,
|
|
|
|
orte_rmgr_urm_deallocate,
|
|
|
|
orte_rmgr_urm_map,
|
|
|
|
orte_rmgr_urm_launch,
|
|
|
|
orte_rmgr_urm_terminate_job,
|
|
|
|
orte_rmgr_urm_terminate_proc,
|
2006-06-08 22:27:17 +04:00
|
|
|
orte_rmgr_urm_signal_job,
|
|
|
|
orte_rmgr_urm_signal_proc,
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_rmgr_urm_spawn,
|
|
|
|
orte_rmgr_base_proc_stage_gate_init,
|
|
|
|
orte_rmgr_base_proc_stage_gate_mgr,
|
2005-04-01 03:22:44 +04:00
|
|
|
orte_rmgr_urm_finalize
|
2005-03-14 23:57:21 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2005-03-31 19:47:37 +04:00
|
|
|
/*
|
|
|
|
* Resource discovery
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_query(void)
|
|
|
|
{
|
|
|
|
int rc;
|
2005-09-19 19:29:14 +04:00
|
|
|
|
|
|
|
OPAL_TRACE(1);
|
2006-02-08 20:40:11 +03:00
|
|
|
|
2005-03-31 19:47:37 +04:00
|
|
|
if(ORTE_SUCCESS != (rc = orte_rds_base_query())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
|
|
|
|
* Create the job segment and initialize the application context.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_create(
|
|
|
|
orte_app_context_t** app_context,
|
|
|
|
size_t num_context,
|
|
|
|
orte_jobid_t* jobid)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
2005-09-19 19:29:14 +04:00
|
|
|
OPAL_TRACE(1);
|
2006-02-08 20:40:11 +03:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/* allocate a jobid */
|
2005-04-14 19:32:21 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(jobid))) {
|
2005-03-31 19:47:37 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2005-03-14 23:57:21 +03:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-10-08 02:24:52 +04:00
|
|
|
/* create and initialize job segment */ /* JJH C/N mapping before this */
|
2006-02-08 20:40:11 +03:00
|
|
|
if (ORTE_SUCCESS !=
|
|
|
|
(rc = orte_rmgr_base_put_app_context(*jobid, app_context,
|
2005-03-14 23:57:21 +03:00
|
|
|
num_context))) {
|
2005-03-31 19:47:37 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2005-03-14 23:57:21 +03:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2006-07-28 01:21:10 +04:00
|
|
|
/* setup the launch system's stage gate counters and subscriptions */
|
|
|
|
if (ORTE_SUCCESS !=
|
|
|
|
(rc = orte_rmgr_base_proc_stage_gate_init(*jobid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Allocate resources for a job.
 *
 * Delegates to orte_ras_base_allocate(), passing the address of the
 * component's urm_ras slot, and returns that call's status unchanged.
 */
static int orte_rmgr_urm_allocate(orte_jobid_t jobid)
{
    int rc;

    OPAL_TRACE(1);

    rc = orte_ras_base_allocate(jobid, &mca_rmgr_urm_component.urm_ras);
    return rc;
}
|
|
|
|
|
|
|
|
/*
 * Release the resources of a job.
 *
 * Forwards to the deallocate entry of the RAS module stored in the
 * component (urm_ras) and returns its status unchanged.
 */
static int orte_rmgr_urm_deallocate(orte_jobid_t jobid)
{
    int rc;

    OPAL_TRACE(1);

    rc = mca_rmgr_urm_component.urm_ras->deallocate(jobid);
    return rc;
}
|
|
|
|
|
|
|
|
/*
 * Map the processes of a job.
 *
 * Forwards to the map entry of the RMAPS module stored in the component
 * (urm_rmaps) and returns its status unchanged.
 */
static int orte_rmgr_urm_map(orte_jobid_t jobid)
{
    int rc;

    OPAL_TRACE(1);

    rc = mca_rmgr_urm_component.urm_rmaps->map(jobid);
    return rc;
}
|
|
|
|
|
|
|
|
/*
 * Launch a job through the component's PLS module.
 *
 * If the launch fails, the failure is logged and the job's state-of-health
 * is set to ORTE_JOB_STATE_ABORTED.
 *
 * @return ORTE_SUCCESS when the launch succeeded; otherwise the launch
 *         error code, unless updating the job state also failed, in which
 *         case that second error code is returned instead.
 */
static int orte_rmgr_urm_launch(orte_jobid_t jobid)
{
    int launch_rc, soh_rc;

    OPAL_TRACE(1);

    launch_rc = mca_rmgr_urm_component.urm_pls->launch(jobid);
    if (ORTE_SUCCESS == launch_rc) {
        return launch_rc;
    }

    /* the launch failed: report it and mark the job as aborted */
    ORTE_ERROR_LOG(launch_rc);

    soh_rc = orte_soh.set_job_soh(jobid, ORTE_JOB_STATE_ABORTED);
    if (ORTE_SUCCESS != soh_rc) {
        ORTE_ERROR_LOG(soh_rc);
        return soh_rc;
    }

    return launch_rc;
}
|
|
|
|
|
|
|
|
/*
 * Terminate a job.
 *
 * Special case: when this process is a singleton and the job to kill is
 * its own job, the PLS cannot do anything for us, so the process simply
 * calls exit(1).  If our own jobid cannot be determined, the special case
 * is skipped.  In every other situation the request is forwarded to the
 * component's PLS module and its status is returned.
 */
static int orte_rmgr_urm_terminate_job(orte_jobid_t jobid)
{
    orte_jobid_t my_jobid;
    int ret;

    OPAL_TRACE(1);

    ret = orte_ns.get_jobid(&my_jobid, orte_process_info.my_name);

    /* my_jobid is only examined when the lookup above succeeded */
    if (ORTE_SUCCESS == ret &&
        orte_process_info.singleton &&
        jobid == my_jobid) {
        exit(1);
    }

    return mca_rmgr_urm_component.urm_pls->terminate_job(jobid);
}
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_terminate_proc(const orte_process_name_t* proc_name)
|
|
|
|
{
|
2005-09-19 19:29:14 +04:00
|
|
|
OPAL_TRACE(1);
|
2006-02-08 20:40:11 +03:00
|
|
|
|
2006-06-08 22:27:17 +04:00
|
|
|
if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
|
2006-05-09 17:40:41 +04:00
|
|
|
orte_process_info.my_name)) &&
|
|
|
|
(orte_process_info.singleton)) {
|
|
|
|
/* if we're trying to get ourselves killed and we're a
|
|
|
|
singleton, calling terminate_proc isn't going to work
|
|
|
|
properly -- there's no pls setup properly for us. Just
|
|
|
|
call exit and be done. */
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
return mca_rmgr_urm_component.urm_pls->terminate_proc(proc_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-06-08 22:27:17 +04:00
|
|
|
/**
 * Deliver a signal to a job.
 *
 * Special case: when this process is a singleton and the target job is
 * its own job, the PLS cannot do anything -- we already have the signal --
 * so ORTE_SUCCESS is returned right away.  If our own jobid cannot be
 * determined, the special case is skipped.  In every other situation the
 * request is forwarded to the component's PLS module and its status is
 * returned.
 */
static int orte_rmgr_urm_signal_job(orte_jobid_t jobid, int32_t signal)
{
    orte_jobid_t my_jobid;
    int ret;

    OPAL_TRACE(1);

    ret = orte_ns.get_jobid(&my_jobid, orte_process_info.my_name);

    /* my_jobid is only examined when the lookup above succeeded */
    if (ORTE_SUCCESS == ret &&
        orte_process_info.singleton &&
        jobid == my_jobid) {
        return ORTE_SUCCESS;
    }

    return mca_rmgr_urm_component.urm_pls->signal_job(jobid, signal);
}
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_signal_proc(const orte_process_name_t* proc_name, int32_t signal)
|
|
|
|
{
|
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
|
|
|
if ((0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc_name,
|
|
|
|
orte_process_info.my_name)) &&
|
|
|
|
(orte_process_info.singleton)) {
|
|
|
|
/** if we're trying to signal ourselves and we're a
|
|
|
|
* singleton, calling signal_proc isn't going to work
|
|
|
|
* properly -- there's no pls setup properly for us. Besides, we
|
|
|
|
* already have the signal!
|
|
|
|
*/
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
return mca_rmgr_urm_component.urm_pls->signal_proc(proc_name, signal);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-06-01 23:23:23 +04:00
|
|
|
/*
 * Connect stdin forwarding for a job.
 *
 * Builds a process name for the given jobid (the other name fields are
 * passed as 0) and pushes ORTE_IOF_STDIN for it with ORTE_NS_CMP_JOBID as
 * the comparison mask and 0 as the descriptor argument.  Errors are
 * logged; nothing is returned to the caller.
 *
 * @param jobid  job whose stdin forwarding is to be wired up
 */
static void orte_rmgr_urm_wireup_stdin(orte_jobid_t jobid)
{
    int rc;
    orte_process_name_t* name;

    OPAL_TRACE(1);

    if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&name, 0, jobid, 0))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    if (ORTE_SUCCESS != (rc = orte_iof.iof_push(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDIN, 0))) {
        ORTE_ERROR_LOG(rc);
    }

    /* Release the name created above on success and failure alike;
     * previously it was leaked on every call.  orte_rmgr_urm_spawn()
     * frees its name the same way once its iof calls are done.
     * NOTE(review): assumes iof_push does not keep a reference to the
     * name after it returns -- confirm against the IOF components. */
    orte_ns.free_name(&name);
}
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/*
 * Registry notification handler that translates stage-gate counter
 * triggers into calls of the user's orte_rmgr_cb_fn_t.
 *
 * @param data    notification data; the jobid is derived from the segment
 *                name of its first value, then every non-NULL value is
 *                scanned
 * @param cbdata  the user callback, smuggled through a void* (it is
 *                stored that way by orte_rmgr_urm_spawn)
 *
 * For each keyval whose key equals one of the ORTE_PROC_NUM_* counters,
 * the user callback is invoked with the jobid and the corresponding
 * ORTE_PROC_STATE_* value.  Keys that match none of them are ignored.
 */
static void orte_rmgr_urm_callback(orte_gpr_notify_data_t *data, void *cbdata)
{
    union {
        orte_rmgr_cb_fn_t func;
        void * ptr;
    } cbfunc_union;
    orte_rmgr_cb_fn_t cbfunc;
    orte_gpr_value_t **values;
    orte_jobid_t jobid;
    size_t idx, kv, seen;
    int rc;

    OPAL_TRACE(1);

    /* ISO C forbids converting an object pointer to a function pointer,
     * so go through a union: same effect, no warning from GCC */
    cbfunc_union.ptr = cbdata;
    cbfunc = cbfunc_union.func;

    /* the subscriptions guarantee that at least one value is returned;
     * the jobid comes from the segment name of the first one */
    values = (orte_gpr_value_t**)(data->values)->addr;
    rc = orte_schema.extract_jobid_from_segment_name(&jobid, values[0]->segment);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* walk the value array; "seen" counts the non-NULL entries visited so
     * the scan stops once data->cnt of them have been handled */
    for (idx = 0, seen = 0;
         seen < data->cnt && idx < (data->values)->size;
         idx++) {
        orte_gpr_value_t *value = values[idx];

        if (NULL == value) {
            continue;
        }
        seen++;

        /* determine the state change reported by each keyval */
        for (kv = 0; kv < value->cnt; kv++) {
            const orte_gpr_keyval_t *keyval = value->keyvals[kv];

            if (0 == strcmp(keyval->key, ORTE_PROC_NUM_AT_INIT)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_INIT);
            } else if (0 == strcmp(keyval->key, ORTE_PROC_NUM_LAUNCHED)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_LAUNCHED);
            } else if (0 == strcmp(keyval->key, ORTE_PROC_NUM_RUNNING)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_RUNNING);
            } else if (0 == strcmp(keyval->key, ORTE_PROC_NUM_AT_STG1)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_AT_STG1);
            } else if (0 == strcmp(keyval->key, ORTE_PROC_NUM_AT_STG2)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_AT_STG2);
            } else if (0 == strcmp(keyval->key, ORTE_PROC_NUM_AT_STG3)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_AT_STG3);
            } else if (0 == strcmp(keyval->key, ORTE_PROC_NUM_FINALIZED)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_FINALIZED);
            } else if (0 == strcmp(keyval->key, ORTE_PROC_NUM_TERMINATED)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_TERMINATED);
            } else if (0 == strcmp(keyval->key, ORTE_PROC_NUM_ABORTED)) {
                (*cbfunc)(jobid, ORTE_PROC_STATE_ABORTED);
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
2006-02-08 20:40:11 +03:00
|
|
|
/**
|
|
|
|
* define a callback point for completing the wireup of the stdin for io forwarding
|
|
|
|
*/
|
|
|
|
static void orte_rmgr_urm_wireup_callback(orte_gpr_notify_data_t *data, void *cbdata)
|
|
|
|
{
|
|
|
|
orte_gpr_value_t **values;
|
|
|
|
orte_jobid_t jobid;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
OPAL_TRACE(1);
|
|
|
|
|
|
|
|
/* we made sure in the subscriptions that at least one
|
|
|
|
* value is always returned
|
|
|
|
* get the jobid from the segment name in the first value
|
|
|
|
*/
|
|
|
|
values = (orte_gpr_value_t**)(data->values)->addr;
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_segment_name(&jobid, values[0]->segment))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
orte_rmgr_urm_wireup_stdin(jobid);
|
|
|
|
}
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
|
|
|
|
* Shortcut for the multiple steps involved in spawning a new job.
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
static int orte_rmgr_urm_spawn(
|
|
|
|
orte_app_context_t** app_context,
|
|
|
|
size_t num_context,
|
|
|
|
orte_jobid_t* jobid,
|
2006-02-08 20:40:11 +03:00
|
|
|
orte_rmgr_cb_fn_t cbfunc,
|
|
|
|
orte_proc_state_t cb_conditions)
|
2005-03-14 23:57:21 +03:00
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
orte_process_name_t* name;
|
2006-02-08 20:40:11 +03:00
|
|
|
|
2005-09-19 19:29:14 +04:00
|
|
|
OPAL_TRACE(1);
|
2006-02-08 20:40:11 +03:00
|
|
|
|
|
|
|
/*
|
2005-03-14 23:57:21 +03:00
|
|
|
* Perform resource discovery.
|
|
|
|
*/
|
|
|
|
if (mca_rmgr_urm_component.urm_rds == false &&
|
|
|
|
ORTE_SUCCESS != (rc = orte_rds_base_query())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
} else {
|
|
|
|
mca_rmgr_urm_component.urm_rds = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize job segment and allocate resources
|
2005-10-08 02:24:52 +04:00
|
|
|
*/ /* JJH Insert C/N mapping stuff here */
|
2006-02-08 20:40:11 +03:00
|
|
|
if (ORTE_SUCCESS !=
|
2005-03-14 23:57:21 +03:00
|
|
|
(rc = orte_rmgr_urm_create(app_context,num_context,jobid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-10-27 21:04:10 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_rmgr_urm_allocate(*jobid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-10-27 21:04:10 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_rmgr_urm_map(*jobid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-02-08 20:40:11 +03:00
|
|
|
* setup I/O forwarding
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&name, 0, *jobid, 0))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_iof.iof_pull(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDOUT, 1))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_iof.iof_pull(name, ORTE_NS_CMP_JOBID, ORTE_IOF_STDERR, 2))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2006-07-11 01:25:33 +04:00
|
|
|
|
2006-02-08 20:40:11 +03:00
|
|
|
/** setup the subscription so we can complete the wireup when all processes reach LAUNCHED */
|
|
|
|
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_urm_wireup_callback, NULL, ORTE_PROC_STATE_LAUNCHED);
|
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* setup callback
|
|
|
|
*/
|
|
|
|
|
|
|
|
if(NULL != cbfunc) {
|
2006-03-29 05:26:16 +04:00
|
|
|
union {
|
|
|
|
orte_rmgr_cb_fn_t func;
|
|
|
|
void * ptr;
|
|
|
|
} cbfunc_union;
|
|
|
|
void *cbdata;
|
|
|
|
|
|
|
|
/* stupid ISO C forbids conversion of object pointer to function
|
|
|
|
pointer. So we do this, which is the same thing, but without
|
|
|
|
the warning from GCC */
|
|
|
|
cbfunc_union.func = cbfunc;
|
|
|
|
cbdata = cbfunc_union.ptr;
|
|
|
|
|
|
|
|
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_urm_callback, cbdata, cb_conditions);
|
2005-03-14 23:57:21 +03:00
|
|
|
if(ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* launch the job
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rmgr_urm_launch(*jobid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
orte_ns.free_name(&name);
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-04-01 03:22:44 +04:00
|
|
|
static int orte_rmgr_urm_finalize(void)
|
|
|
|
{
|
2005-04-15 21:04:57 +04:00
|
|
|
int rc;
|
|
|
|
|
2005-09-19 19:29:14 +04:00
|
|
|
OPAL_TRACE(1);
|
2006-02-08 20:40:11 +03:00
|
|
|
|
2005-04-15 21:04:57 +04:00
|
|
|
/**
|
|
|
|
* Finalize Process Launch Subsystem (PLS)
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_pls_base_finalize())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Finalize Resource Mapping Subsystem (RMAPS)
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_finalize())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Finalize Resource Allocation Subsystem (RAS)
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ras_base_finalize())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Finalize Resource Discovery Subsystem (RDS)
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rds_base_finalize())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-04-01 03:22:44 +04:00
|
|
|
/* Cancel pending receive. */
|
|
|
|
|
|
|
|
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_RMGR_SVC);
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|