/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
|
|
|
|
/** @file **/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
2005-04-11 22:43:57 +04:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2005-03-14 23:57:21 +03:00
|
|
|
#include <unistd.h>
|
2005-04-11 08:47:58 +04:00
|
|
|
#endif
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
#include "include/constants.h"
|
2005-07-04 03:09:55 +04:00
|
|
|
#include "opal/event/event.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2005-07-04 03:09:55 +04:00
|
|
|
#include "opal/event/event.h"
|
2005-07-04 02:45:48 +04:00
|
|
|
#include "opal/threads/mutex.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
#include "mca/mca.h"
|
|
|
|
#include "mca/base/base.h"
|
|
|
|
#include "mca/base/mca_base_param.h"
|
|
|
|
#include "mca/iof/base/base.h"
|
|
|
|
#include "mca/rml/base/base.h"
|
|
|
|
#include "mca/errmgr/base/base.h"
|
|
|
|
#include "mca/ns/base/base.h"
|
|
|
|
#include "mca/gpr/base/base.h"
|
|
|
|
#include "mca/rmgr/base/base.h"
|
2005-09-01 01:51:10 +04:00
|
|
|
#include "mca/soh/base/base.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
#include "util/proc_info.h"
|
2005-03-19 02:58:36 +03:00
|
|
|
#include "util/sys_info.h"
|
|
|
|
#include "util/univ_info.h"
|
|
|
|
#include "util/session_dir.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
#include "runtime/runtime.h"
|
|
|
|
#include "runtime/runtime_internal.h"
|
|
|
|
#include "runtime/orte_wait.h"
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Cleanup and restart a selected set of services.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int orte_restart(orte_process_name_t *name, const char* uri)
|
|
|
|
{
|
2005-03-22 03:31:17 +03:00
|
|
|
int rc;
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_process_name_t* old_name;
|
|
|
|
orte_process_name_t* new_name;
|
2005-03-19 02:40:08 +03:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&old_name, orte_process_info.my_name))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&new_name, name))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Restart event library
|
|
|
|
*/
|
|
|
|
|
2005-07-04 03:09:55 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = opal_event_restart())) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Close selected components.
|
|
|
|
*/
|
|
|
|
|
2005-03-19 02:40:08 +03:00
|
|
|
orte_iof_base.iof_flush = false;
|
2005-03-14 23:57:21 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_iof_base_close())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-09-01 01:51:10 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_soh_base_close())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-03-14 23:57:21 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr_base_close())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-03-18 06:43:59 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns_base_close())) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-03-18 06:43:59 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml_base_close())) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_wait_finalize())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* setup new global state
|
|
|
|
*/
|
2005-03-19 03:01:13 +03:00
|
|
|
orte_process_info.seed = false;
|
|
|
|
|
|
|
|
/* if NULL, set ns_replica to old_name and set the corresponding uri parameter */
|
|
|
|
if (NULL == orte_process_info.ns_replica) {
|
|
|
|
orte_process_info.ns_replica = old_name;
|
|
|
|
orte_process_info.ns_replica_uri = strdup(uri);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if NULL, set gpr_replica to old_name and set the corresponding uri parameter */
|
|
|
|
if (NULL == orte_process_info.gpr_replica) {
|
|
|
|
orte_process_info.gpr_replica = old_name;
|
|
|
|
orte_process_info.gpr_replica_uri = strdup(uri);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ensure my_name is set to the new_name */
|
|
|
|
if (NULL != orte_process_info.my_name) {
|
|
|
|
free(orte_process_info.my_name);
|
|
|
|
}
|
|
|
|
orte_process_info.my_name = new_name;
|
|
|
|
|
|
|
|
#if 0
|
2005-03-19 02:58:36 +03:00
|
|
|
/* close the proc_info structure so it can be reinitialized */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_proc_info_finalize())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* set seed flag to false */
|
|
|
|
id = mca_base_param_register_int("seed", NULL, NULL, NULL, (int)false);
|
|
|
|
if (ORTE_SUCCESS != (rc = mca_base_param_set_int(id, (int)false))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* call proc_info to reset the structure */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_proc_info())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* finalize the sys_info structure so it can be reinitialized */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_sys_info_finalize())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* call the sys_info function to load structure with any new info */
|
|
|
|
orte_system_info.init = false;
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_sys_info())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* establish the session directory structure for this process */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_string(&procid_str, orte_process_info.my_name))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (orte_debug_flag) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "[%lu,%lu,%lu] setting up session dir with",
|
2005-03-19 02:58:36 +03:00
|
|
|
ORTE_NAME_ARGS(orte_process_info.my_name));
|
|
|
|
if (NULL != orte_process_info.tmpdir_base) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "\ttmpdir %s", orte_process_info.tmpdir_base);
|
2005-03-19 02:58:36 +03:00
|
|
|
}
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "\tuniverse %s", orte_universe_info.name);
|
|
|
|
opal_output(0, "\tuser %s", orte_system_info.user);
|
|
|
|
opal_output(0, "\thost %s", orte_system_info.nodename);
|
|
|
|
opal_output(0, "\tjobid %s", jobid_str);
|
|
|
|
opal_output(0, "\tprocid %s", procid_str);
|
2005-03-19 02:58:36 +03:00
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_session_dir(true,
|
|
|
|
orte_process_info.tmpdir_base,
|
|
|
|
orte_system_info.user,
|
|
|
|
orte_system_info.nodename, NULL,
|
|
|
|
orte_universe_info.name,
|
|
|
|
jobid_str, procid_str))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
if (jobid_str != NULL) free(jobid_str);
|
|
|
|
if (procid_str != NULL) free(procid_str);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (NULL != jobid_str) {
|
|
|
|
free(jobid_str);
|
|
|
|
}
|
|
|
|
if (NULL != procid_str) {
|
|
|
|
free(procid_str);
|
|
|
|
}
|
2005-03-19 03:01:13 +03:00
|
|
|
#endif
|
2005-03-19 02:58:36 +03:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
|
|
|
|
* Re-open components.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_wait_init())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns_base_open())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml_base_open())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr_base_open())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-09-01 01:51:10 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_soh_base_open())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Select new modules.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml_base_select())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns_base_select())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr_base_select())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-09-01 01:51:10 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_soh_base_select())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
|
|
|
|
/*
|
2005-03-19 03:01:13 +03:00
|
|
|
* Set contact info for the replicas
|
2005-03-14 23:57:21 +03:00
|
|
|
*/
|
|
|
|
|
2005-03-19 03:01:13 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml.set_uri(orte_process_info.ns_replica_uri))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml.set_uri(orte_process_info.gpr_replica_uri))) {
|
2005-03-14 23:57:21 +03:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Re-init selected modules.
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml.init())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.init())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.init())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Complete restart
|
|
|
|
*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_iof_base_open())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_iof_base_select())) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|