2005-05-05 23:20:47 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*
|
|
|
|
* Establish a Head Node Process on a cluster's front end
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <stdlib.h>
|
2005-05-13 19:05:07 +04:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <unistd.h>
|
2005-05-13 19:05:07 +04:00
|
|
|
#endif
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <errno.h>
|
2005-05-05 23:20:47 +04:00
|
|
|
#include <string.h>
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
2005-05-13 19:05:07 +04:00
|
|
|
#ifdef HAVE_SYS_WAIT_H
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <sys/wait.h>
|
2005-05-13 19:05:07 +04:00
|
|
|
#endif
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <fcntl.h>
|
|
|
|
|
2005-05-05 23:20:47 +04:00
|
|
|
|
|
|
|
#include "include/orte_constants.h"
|
2005-05-18 22:24:14 +04:00
|
|
|
#include "dps/dps.h"
|
2005-07-04 03:09:55 +04:00
|
|
|
#include "opal/event/event.h"
|
2005-07-04 02:45:48 +04:00
|
|
|
#include "opal/threads/mutex.h"
|
|
|
|
#include "opal/threads/condition.h"
|
2005-05-13 01:44:23 +04:00
|
|
|
#include "runtime/orte_wait.h"
|
2005-07-04 04:13:44 +04:00
|
|
|
#include "opal/util/argv.h"
|
2005-07-04 05:36:20 +04:00
|
|
|
#include "opal/util/opal_environ.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2005-07-04 05:59:52 +04:00
|
|
|
#include "opal/util/path.h"
|
2005-05-05 23:20:47 +04:00
|
|
|
#include "util/univ_info.h"
|
|
|
|
#include "util/sys_info.h"
|
|
|
|
#include "util/proc_info.h"
|
2005-07-04 05:59:52 +04:00
|
|
|
#include "opal/util/os_path.h"
|
2005-05-05 23:20:47 +04:00
|
|
|
#include "util/session_dir.h"
|
|
|
|
#include "util/universe_setup_file_io.h"
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
#include "mca/base/mca_base_param.h"
|
|
|
|
#include "mca/soh/soh.h"
|
2005-05-05 23:20:47 +04:00
|
|
|
#include "mca/rml/rml.h"
|
2005-05-18 00:21:59 +04:00
|
|
|
#include "mca/rds/rds_types.h"
|
2005-05-05 23:20:47 +04:00
|
|
|
#include "mca/ns/ns.h"
|
2005-05-18 00:21:59 +04:00
|
|
|
#include "mca/gpr/gpr.h"
|
2005-05-05 23:20:47 +04:00
|
|
|
#include "mca/errmgr/errmgr.h"
|
|
|
|
|
|
|
|
#include "runtime/runtime.h"
|
2005-08-11 00:01:25 +04:00
|
|
|
#include "runtime/orte_setup_hnp.h"
|
2005-05-05 23:20:47 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
extern char **environ;
|
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* Local condition variables and mutex
|
|
|
|
*/
|
2005-07-04 02:45:48 +04:00
|
|
|
static opal_mutex_t orte_setup_hnp_mutex;
|
|
|
|
static opal_condition_t orte_setup_hnp_condition;
|
2005-05-23 18:22:35 +04:00
|
|
|
/* Local return code */
|
|
|
|
static int orte_setup_hnp_rc;
|
|
|
|
/* Local uri storage */
|
|
|
|
static char *orte_setup_hnp_orted_uri;
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-08-11 00:01:25 +04:00
|
|
|
static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata;
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* NON-BLOCKING RECEIVER
|
|
|
|
*/
|
|
|
|
static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
|
|
|
|
orte_buffer_t* buffer, orte_rml_tag_t tag,
|
|
|
|
void* cbdata);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PID WAIT CALLBACK
|
|
|
|
*/
|
|
|
|
static void orte_setup_hnp_wait(pid_t wpid, int status, void *data);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ORTE_SETUP_HNP
|
|
|
|
*/
|
|
|
|
int orte_setup_hnp(char *target_cluster, char *headnode, char *username)
|
|
|
|
{
|
2005-05-13 19:05:07 +04:00
|
|
|
#ifndef WIN32
|
2005-05-17 01:01:09 +04:00
|
|
|
char **argv, *param, *uri, *uid, *hn=NULL;
|
2005-05-13 01:44:23 +04:00
|
|
|
char *path, *name_string, *orteprobe;
|
2005-05-18 21:56:51 +04:00
|
|
|
int argc, rc=ORTE_SUCCESS, id, intparam;
|
2005-05-13 01:44:23 +04:00
|
|
|
pid_t pid;
|
2005-05-18 00:21:59 +04:00
|
|
|
bool can_launch=false, on_gpr=false;
|
2005-05-17 01:01:09 +04:00
|
|
|
orte_cellid_t cellid=ORTE_CELLID_MAX;
|
2005-05-13 01:44:23 +04:00
|
|
|
orte_jobid_t jobid;
|
|
|
|
orte_vpid_t vpid;
|
2005-05-18 00:21:59 +04:00
|
|
|
size_t i, j, k, cnt=0;
|
|
|
|
orte_gpr_value_t **values=NULL, *value;
|
|
|
|
orte_gpr_keyval_t **keyvals;
|
|
|
|
char *keys[4], *tokens[3], *cellname;
|
|
|
|
struct timeval tv;
|
|
|
|
struct timespec ts;
|
2005-08-27 00:13:35 +04:00
|
|
|
bool infrastructure = true;
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* get the nodename for the headnode of the target cluster */
|
|
|
|
if (NULL == headnode) { /* not provided, so try to look it up */
|
2005-05-18 00:21:59 +04:00
|
|
|
tokens[0] = target_cluster;
|
|
|
|
tokens[1] = NULL;
|
|
|
|
keys[0] = ORTE_RDS_FE_NAME;
|
|
|
|
keys[1] = ORTE_RDS_FE_SSH;
|
|
|
|
keys[2] = ORTE_CELLID_KEY;
|
|
|
|
keys[3] = NULL;
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR,
|
|
|
|
ORTE_RESOURCE_SEGMENT,
|
|
|
|
tokens, keys, &cnt, &values))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (0 == cnt || 0 == values[0]->cnt) { /* nothing found */
|
|
|
|
goto MOVEON;
|
|
|
|
}
|
|
|
|
on_gpr = true;
|
|
|
|
/* need to decide what to do if more than value found. Some
|
|
|
|
* clusters have more than one head node, so which one do
|
|
|
|
* we choose? For now, just take the first one returned.
|
|
|
|
*/
|
|
|
|
keyvals = values[0]->keyvals;
|
|
|
|
for (i=0; i < values[0]->cnt; i++) {
|
|
|
|
if (0 == strcmp(keyvals[i]->key, ORTE_RDS_FE_NAME)) {
|
|
|
|
hn = strdup(keyvals[i]->value.strptr);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (0 == strcmp(keyvals[i]->key, ORTE_RDS_FE_SSH)) {
|
|
|
|
can_launch = keyvals[i]->value.tf_flag;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (0 == strcmp(keyvals[i]->key, ORTE_CELLID_KEY)) {
|
|
|
|
cellid = keyvals[i]->value.cellid;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
goto MOVEON;
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
} else { /* lookup the headnode's cellid */
|
2005-08-11 00:01:25 +04:00
|
|
|
hn = strdup(headnode);
|
2005-05-18 00:21:59 +04:00
|
|
|
keys[0] = ORTE_RDS_FE_NAME;
|
|
|
|
keys[1] = ORTE_RDS_FE_SSH;
|
|
|
|
keys[2] = ORTE_CELLID_KEY;
|
|
|
|
keys[3] = NULL;
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR,
|
|
|
|
ORTE_RESOURCE_SEGMENT,
|
|
|
|
NULL, keys, &cnt, &values);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
2005-05-18 00:21:59 +04:00
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
/* Nothing found */
|
|
|
|
if (0 == cnt || 0 == values[0]->cnt) {
|
2005-05-18 00:21:59 +04:00
|
|
|
goto MOVEON;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
on_gpr = true;
|
|
|
|
for (i=0; i < cnt; i++) {
|
|
|
|
keyvals = values[i]->keyvals;
|
|
|
|
for (j=0; j < values[i]->cnt; j++) {
|
|
|
|
if ((0 == strcmp(keyvals[j]->key, ORTE_RDS_FE_NAME)) &&
|
|
|
|
0 == strcmp(keyvals[j]->value.strptr, headnode)) {
|
|
|
|
/* okay, this is the right cell - now need to find
|
|
|
|
* the ssh flag (if provided) and cellid
|
|
|
|
*/
|
|
|
|
for (k=0; k < values[i]->cnt; k++) {
|
|
|
|
if (0 == strcmp(keyvals[k]->key, ORTE_RDS_FE_SSH)) {
|
|
|
|
can_launch = keyvals[k]->value.tf_flag;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (0 == strcmp(keyvals[k]->key, ORTE_CELLID_KEY)) {
|
|
|
|
cellid = keyvals[k]->value.cellid;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
goto MOVEON;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
MOVEON:
|
|
|
|
if (NULL != values) {
|
2005-08-11 00:01:25 +04:00
|
|
|
for (i=0; i < cnt; i++)
|
|
|
|
OBJ_RELEASE(values[i]);
|
2005-05-18 00:21:59 +04:00
|
|
|
free(values);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!on_gpr && (NULL != target_cluster || NULL != headnode)) {
|
|
|
|
/* if we couldn't find anything about this cell on the gpr, then
|
|
|
|
* we need to put the required headnode data on the registry. We need
|
|
|
|
* it to be there so other functions/processes can find it, if needed.
|
|
|
|
* User must provide either a target_cluster name (which then must be
|
|
|
|
* synonymous with the headnode name), a headnode name (on a named or
|
|
|
|
* unnamed target_cluster), or both.
|
|
|
|
*/
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* get new cellid for this site/resource */
|
|
|
|
if (NULL != target_cluster) {
|
|
|
|
cellname = strdup(target_cluster);
|
|
|
|
} else {
|
|
|
|
/* if the target_cluster was NULL, then headnode CAN'T be NULL
|
|
|
|
* or else we wouldn't get here
|
|
|
|
*/
|
|
|
|
cellname = strdup(headnode);
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* can't know the site name, so it becomes "unknown" */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_ns.create_cellid(&cellid, "unknown", cellname);
|
|
|
|
if (ORTE_SUCCESS != rc ) {
|
2005-05-18 00:21:59 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
free(cellname);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Store the cell info on the resource segment of the registry
|
|
|
|
*/
|
2005-05-18 00:21:59 +04:00
|
|
|
value = OBJ_NEW(orte_gpr_value_t);
|
|
|
|
if (NULL == value) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
value->addr_mode = ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR;
|
2005-08-11 00:01:25 +04:00
|
|
|
value->segment = strdup(ORTE_RESOURCE_SEGMENT);
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-08-11 00:01:25 +04:00
|
|
|
value->cnt = 4;
|
2005-05-18 00:21:59 +04:00
|
|
|
value->keyvals = (orte_gpr_keyval_t**)malloc(value->cnt * sizeof(orte_gpr_keyval_t*));
|
|
|
|
if (NULL == value->keyvals) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
for (i=0; i < value->cnt; i++) {
|
|
|
|
value->keyvals[i] = OBJ_NEW(orte_gpr_keyval_t);
|
|
|
|
if (NULL == value->keyvals[i]) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
/* Set Cell Name */
|
|
|
|
value->keyvals[0]->key = strdup(ORTE_RDS_NAME);
|
|
|
|
value->keyvals[0]->type = ORTE_STRING;
|
2005-05-18 00:21:59 +04:00
|
|
|
value->keyvals[0]->value.strptr = strdup(cellname);
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
/* Set Cell ID */
|
|
|
|
value->keyvals[1]->key = strdup(ORTE_CELLID_KEY);
|
|
|
|
value->keyvals[1]->type = ORTE_CELLID;
|
2005-05-18 00:21:59 +04:00
|
|
|
value->keyvals[1]->value.cellid = cellid;
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
/* Set Front End Name */
|
|
|
|
value->keyvals[2]->key = strdup(ORTE_RDS_FE_NAME);
|
2005-05-18 00:21:59 +04:00
|
|
|
value->keyvals[2]->type = ORTE_STRING;
|
|
|
|
if (NULL == headnode) {
|
|
|
|
value->keyvals[2]->value.strptr = strdup(cellname);
|
|
|
|
} else {
|
|
|
|
value->keyvals[2]->value.strptr = strdup(headnode);
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
/* Asssume ability to ssh to front end node*/
|
|
|
|
value->keyvals[3]->key = strdup(ORTE_RDS_FE_SSH);
|
|
|
|
value->keyvals[3]->type = ORTE_BOOL;
|
2005-05-18 00:21:59 +04:00
|
|
|
value->keyvals[3]->value.tf_flag = true;
|
|
|
|
|
|
|
|
value->num_tokens = 3;
|
|
|
|
value->tokens = (char**)malloc(3 * sizeof(char*));
|
|
|
|
if (NULL == value->tokens) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
rc = orte_schema.get_node_tokens(&value->tokens, &value->num_tokens, cellid, cellname);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-18 00:21:59 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-08-11 00:01:25 +04:00
|
|
|
/* Place tokens in GPR */
|
|
|
|
rc = orte_gpr.put(1, &value);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-18 00:21:59 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
OBJ_RELEASE(value);
|
|
|
|
free(cellname);
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
can_launch = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!can_launch || ORTE_CELLID_MAX == cellid) {
|
|
|
|
return ORTE_ERR_UNREACH;
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* get the user's name on the headnode */
|
|
|
|
if (NULL == username) {
|
|
|
|
uid = strdup(orte_system_info.user);
|
|
|
|
} else {
|
|
|
|
uid = strdup(username);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* SETUP TO LAUNCH PROBE */
|
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* setup the conditioned wait and mutex variables */
|
2005-07-04 02:45:48 +04:00
|
|
|
OBJ_CONSTRUCT(&orte_setup_hnp_mutex, opal_mutex_t);
|
|
|
|
OBJ_CONSTRUCT(&orte_setup_hnp_condition, opal_condition_t);
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* get a jobid for the probe */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_ns.create_jobid(&jobid);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-13 01:44:23 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* get a vpid for the probe */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_ns.reserve_range(jobid, 1, &vpid);
|
|
|
|
if (ORTE_SUCCESS != rc ) {
|
2005-05-13 01:44:23 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* initialize probe's process name... */
|
|
|
|
rc = orte_ns.create_process_name(&(orte_setup_hnp_cbdata.name), cellid, jobid, vpid);
|
2005-08-11 00:01:25 +04:00
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-13 01:44:23 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* ...and get string representation */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_ns.get_proc_name_string(&name_string, orte_setup_hnp_cbdata.name);
|
|
|
|
if (ORTE_SUCCESS != rc ) {
|
2005-05-13 01:44:23 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* setup callback data on sigchild */
|
2005-08-11 00:01:25 +04:00
|
|
|
if (NULL != target_cluster) {
|
|
|
|
orte_setup_hnp_cbdata.target_cluster = strdup(target_cluster);
|
|
|
|
} else {
|
|
|
|
orte_setup_hnp_cbdata.target_cluster = NULL;
|
|
|
|
}
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
orte_setup_hnp_cbdata.headnode = strdup(headnode);
|
|
|
|
orte_setup_hnp_cbdata.jobid = jobid;
|
|
|
|
|
2005-05-17 01:01:09 +04:00
|
|
|
/* get name of probe application - just in case user specified something different */
|
2005-05-13 01:44:23 +04:00
|
|
|
id = mca_base_param_register_string("orteprobe",NULL,NULL,NULL,"orteprobe");
|
|
|
|
mca_base_param_lookup_string(id, &orteprobe);
|
|
|
|
|
2005-05-17 01:01:09 +04:00
|
|
|
/* get rsh/ssh launch mechanism parameters */
|
|
|
|
id = mca_base_param_register_string("pls","rsh","agent",NULL,"ssh");
|
|
|
|
mca_base_param_lookup_string(id, ¶m);
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* Initialize the argv array */
|
2005-07-04 04:13:44 +04:00
|
|
|
argv = opal_argv_split(param, ' ');
|
|
|
|
argc = opal_argv_count(argv);
|
2005-05-13 01:44:23 +04:00
|
|
|
if (argc <= 0) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
|
|
rc = ORTE_ERR_BAD_PARAM;
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
free(param);
|
|
|
|
|
|
|
|
/* setup the path */
|
2005-07-04 05:59:52 +04:00
|
|
|
path = opal_path_findv(argv[0], 0, environ, NULL);
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/* add the username and nodename */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "-l");
|
|
|
|
opal_argv_append(&argc, &argv, uid);
|
|
|
|
opal_argv_append(&argc, &argv, hn);
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/* add the probe application */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, orteprobe);
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/* tell the probe it's name */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--name");
|
|
|
|
opal_argv_append(&argc, &argv, name_string);
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/* setup probe's ns contact info */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--nsreplica");
|
2005-05-13 01:44:23 +04:00
|
|
|
if(NULL != orte_process_info.ns_replica_uri) {
|
|
|
|
uri = strdup(orte_process_info.ns_replica_uri);
|
|
|
|
} else {
|
|
|
|
uri = orte_rml.get_uri();
|
|
|
|
}
|
|
|
|
asprintf(¶m, "\"%s\"", uri);
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, param);
|
2005-05-18 00:21:59 +04:00
|
|
|
free(param);
|
2005-05-13 01:44:23 +04:00
|
|
|
free(uri);
|
|
|
|
|
|
|
|
/* setup probe's gpr contact info */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--gprreplica");
|
2005-05-13 01:44:23 +04:00
|
|
|
if(NULL != orte_process_info.gpr_replica_uri) {
|
|
|
|
uri = strdup(orte_process_info.gpr_replica_uri);
|
|
|
|
} else {
|
|
|
|
uri = orte_rml.get_uri();
|
|
|
|
}
|
|
|
|
asprintf(¶m, "\"%s\"", uri);
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, param);
|
2005-05-18 00:21:59 +04:00
|
|
|
free(param);
|
2005-05-13 01:44:23 +04:00
|
|
|
free(uri);
|
|
|
|
|
|
|
|
/* tell the probe who to report to */
|
|
|
|
uri = orte_rml.get_uri();
|
2005-05-18 00:21:59 +04:00
|
|
|
asprintf(¶m, "\"%s\"", uri);
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--requestor");
|
|
|
|
opal_argv_append(&argc, &argv, param);
|
2005-05-18 00:21:59 +04:00
|
|
|
free(param);
|
2005-05-13 01:44:23 +04:00
|
|
|
free(uri);
|
|
|
|
|
2005-05-18 21:56:51 +04:00
|
|
|
/* pass along any parameters for the head node process
|
|
|
|
* in case one needs to be created
|
|
|
|
*/
|
2005-08-11 00:01:25 +04:00
|
|
|
id = mca_base_param_register_string("scope",NULL,NULL,NULL,"public");
|
2005-05-18 21:56:51 +04:00
|
|
|
mca_base_param_lookup_string(id, ¶m);
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--scope");
|
|
|
|
opal_argv_append(&argc, &argv, param);
|
2005-05-18 21:56:51 +04:00
|
|
|
free(param);
|
|
|
|
|
|
|
|
id = mca_base_param_register_int("persistent",NULL,NULL,NULL,(int)false);
|
|
|
|
mca_base_param_lookup_int(id, &intparam);
|
|
|
|
if (intparam) {
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--persistent");
|
2005-05-18 21:56:51 +04:00
|
|
|
}
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* issue the non-blocking recv to get the probe's findings */
|
|
|
|
rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PROBE,
|
|
|
|
0, orte_setup_hnp_recv, NULL);
|
|
|
|
if(rc < 0) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* fork a child to exec the rsh/ssh session */
|
2005-05-23 18:22:35 +04:00
|
|
|
orte_setup_hnp_rc = ORTE_SUCCESS;
|
2005-05-13 01:44:23 +04:00
|
|
|
pid = fork();
|
|
|
|
if (pid < 0) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (pid == 0) { /* child */
|
|
|
|
/* exec the probe launch */
|
|
|
|
execv(path, argv);
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "orte_setup_hnp: execv failed with errno=%d\n", errno);
|
2005-05-13 01:44:23 +04:00
|
|
|
return ORTE_ERROR;
|
2005-05-05 23:20:47 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
} else { /* parent */
|
|
|
|
orte_wait_cb(pid, orte_setup_hnp_wait, &orte_setup_hnp_cbdata);
|
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* block until a timeout occurs or probe dies/calls back */
|
|
|
|
gettimeofday(&tv, NULL);
|
|
|
|
ts.tv_sec = tv.tv_sec + 1000000;
|
|
|
|
ts.tv_nsec = 0;
|
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&orte_setup_hnp_mutex);
|
|
|
|
opal_condition_timedwait(&orte_setup_hnp_condition, &orte_setup_hnp_mutex, &ts);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_setup_hnp_mutex);
|
2005-05-23 18:22:35 +04:00
|
|
|
|
|
|
|
if (ORTE_SUCCESS == orte_setup_hnp_rc) {
|
2005-08-27 00:13:35 +04:00
|
|
|
/* Remember if we were infrastructre or not */
|
|
|
|
id = mca_base_param_find("orte", NULL, "infrastructure");
|
|
|
|
mca_base_param_lookup_int(id, &intparam);
|
|
|
|
if ( ((int)true) != intparam) {
|
|
|
|
infrastructure = false;
|
|
|
|
}
|
|
|
|
|
2005-05-23 18:22:35 +04:00
|
|
|
/* need to restart the local system so it can connect to the remote daemon.
|
|
|
|
* we only want to clear the run-time itself - we cannot close the OPAL
|
|
|
|
* utilities, though, or we will lose all of our MCA parameters
|
|
|
|
*/
|
|
|
|
orte_system_finalize();
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-23 18:22:35 +04:00
|
|
|
/*
|
|
|
|
* now set the relevant MCA parameters to point us at the remote daemon...
|
|
|
|
*/
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = opal_setenv("OMPI_MCA_gpr_replica_uri",
|
|
|
|
orte_setup_hnp_orted_uri, true, &environ);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-23 18:22:35 +04:00
|
|
|
fprintf(stderr, "orte_setup_hnp: could not set gpr_replica_uri in environ\n");
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = opal_setenv("OMPI_MCA_ns_replica_uri",
|
|
|
|
orte_setup_hnp_orted_uri, true, &environ);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-23 18:22:35 +04:00
|
|
|
fprintf(stderr, "orte_setup_hnp: could not set ns_replica_uri in environ\n");
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-07-04 05:36:20 +04:00
|
|
|
opal_unsetenv("OMPI_MCA_seed", &environ);
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
rc = opal_setenv("OMPI_MCA_universe_uri",
|
|
|
|
orte_setup_hnp_orted_uri, true, &environ);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-25 20:29:01 +04:00
|
|
|
fprintf(stderr, "orte_setup_hnp: could not set universe_uri in environ\n");
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-05-23 18:22:35 +04:00
|
|
|
/*
|
|
|
|
* ...re-init ourselves...
|
|
|
|
*/
|
2005-08-27 00:13:35 +04:00
|
|
|
rc = orte_system_init(infrastructure);
|
2005-08-11 00:01:25 +04:00
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-23 18:22:35 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-23 18:22:35 +04:00
|
|
|
/*
|
|
|
|
* ...and we are now ready to go!
|
|
|
|
*/
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
return orte_setup_hnp_rc;
|
2005-05-18 00:21:59 +04:00
|
|
|
}
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
CLEANUP:
|
|
|
|
return rc;
|
2005-05-13 19:05:07 +04:00
|
|
|
|
|
|
|
#else
|
|
|
|
printf ("This function has not been implemented in windows yet, file %s line %d\n", __FILE__, __LINE__);
|
|
|
|
abort();
|
|
|
|
#endif
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
|
|
|
|
orte_buffer_t* buffer, orte_rml_tag_t tag,
|
|
|
|
void* cbdata)
|
|
|
|
{
|
2005-05-18 22:24:14 +04:00
|
|
|
size_t n=1;
|
|
|
|
int rc;
|
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&orte_setup_hnp_mutex);
|
2005-05-23 18:22:35 +04:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, &orte_setup_hnp_orted_uri, &n, ORTE_STRING))) {
|
2005-05-18 22:24:14 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2005-05-23 18:22:35 +04:00
|
|
|
orte_setup_hnp_rc = rc;
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_condition_signal(&orte_setup_hnp_condition);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_setup_hnp_mutex);
|
2005-05-23 18:22:35 +04:00
|
|
|
return;
|
2005-05-18 22:24:14 +04:00
|
|
|
}
|
2005-05-23 18:22:35 +04:00
|
|
|
orte_setup_hnp_rc = ORTE_SUCCESS;
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_condition_signal(&orte_setup_hnp_condition);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_setup_hnp_mutex);
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata)
|
2005-05-05 23:20:47 +04:00
|
|
|
{
|
2005-05-13 01:44:23 +04:00
|
|
|
orte_setup_hnp_cb_data_t *data;
|
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&orte_setup_hnp_mutex);
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
data = (orte_setup_hnp_cb_data_t*)cbdata;
|
|
|
|
|
|
|
|
/* if ssh exited abnormally, print something useful to the user and cleanup
|
|
|
|
* the registry entries for the HNP jobid.
|
|
|
|
This should somehow be pushed up to the calling level, but we
|
|
|
|
don't really have a way to do that just yet.
|
|
|
|
*/
|
|
|
|
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
|
|
|
|
/* tell the user something went wrong */
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "ERROR: The probe on head node %s of the %s cluster failed to start as expected.",
|
2005-05-13 01:44:23 +04:00
|
|
|
data->headnode, data->target_cluster);
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "ERROR: There may be more information available from");
|
|
|
|
opal_output(0, "ERROR: the remote shell (see above).");
|
2005-05-13 01:44:23 +04:00
|
|
|
if (WIFEXITED(status)) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "ERROR: The probe exited unexpectedly with status %d.",
|
2005-05-13 01:44:23 +04:00
|
|
|
WEXITSTATUS(status));
|
|
|
|
} else if (WIFSIGNALED(status)) {
|
|
|
|
#ifdef WCOREDUMP
|
|
|
|
if (WCOREDUMP(status)) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "The probe received a signal %d (with core).",
|
2005-05-13 01:44:23 +04:00
|
|
|
WTERMSIG(status));
|
|
|
|
} else {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "The probe received a signal %d.", WTERMSIG(status));
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
#else
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "The probe received a signal %d.", WTERMSIG(status));
|
2005-05-13 01:44:23 +04:00
|
|
|
#endif /* WCOREDUMP */
|
|
|
|
} else {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "No extra status information is available: %d.", status);
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
}
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_condition_signal(&orte_setup_hnp_condition);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_setup_hnp_mutex);
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-05-05 23:20:47 +04:00
|
|
|
}
|
2005-05-13 01:44:23 +04:00
|
|
|
|