2005-05-05 23:20:47 +04:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2006-08-23 07:32:36 +04:00
|
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
2005-11-05 22:57:48 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2006-02-07 06:32:36 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-05-05 23:20:47 +04:00
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
2006-02-07 06:32:36 +03:00
|
|
|
*
|
2005-05-05 23:20:47 +04:00
|
|
|
* Additional copyrights may follow
|
2006-02-07 06:32:36 +03:00
|
|
|
*
|
2005-05-05 23:20:47 +04:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*
|
|
|
|
* Establish a Head Node Process on a cluster's front end
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <stdlib.h>
|
2005-05-13 19:05:07 +04:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <unistd.h>
|
2005-05-13 19:05:07 +04:00
|
|
|
#endif
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <errno.h>
|
2005-05-05 23:20:47 +04:00
|
|
|
#include <string.h>
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
2005-05-13 19:05:07 +04:00
|
|
|
#ifdef HAVE_SYS_WAIT_H
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <sys/wait.h>
|
2005-05-13 19:05:07 +04:00
|
|
|
#endif
|
2005-05-13 01:44:23 +04:00
|
|
|
#include <fcntl.h>
|
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/orte_constants.h"
|
2005-05-05 23:20:47 +04:00
|
|
|
|
2005-07-04 03:09:55 +04:00
|
|
|
#include "opal/event/event.h"
|
2005-07-04 02:45:48 +04:00
|
|
|
#include "opal/threads/mutex.h"
|
|
|
|
#include "opal/threads/condition.h"
|
2005-07-04 04:13:44 +04:00
|
|
|
#include "opal/util/argv.h"
|
2005-07-04 05:36:20 +04:00
|
|
|
#include "opal/util/opal_environ.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2005-07-04 05:59:52 +04:00
|
|
|
#include "opal/util/path.h"
|
|
|
|
#include "opal/util/os_path.h"
|
2006-02-07 06:32:36 +03:00
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
|
|
|
|
#include "orte/dss/dss.h"
|
|
|
|
#include "orte/runtime/orte_wait.h"
|
|
|
|
#include "orte/util/univ_info.h"
|
|
|
|
#include "orte/util/sys_info.h"
|
|
|
|
#include "orte/util/proc_info.h"
|
|
|
|
#include "orte/util/session_dir.h"
|
|
|
|
#include "orte/util/universe_setup_file_io.h"
|
2006-08-16 20:35:09 +04:00
|
|
|
#include "orte/mca/smr/smr.h"
|
2006-02-07 06:32:36 +03:00
|
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
#include "orte/mca/rds/rds_types.h"
|
|
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/gpr/gpr.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/runtime/runtime.h"
|
|
|
|
#include "orte/runtime/orte_setup_hnp.h"
|
2005-05-05 23:20:47 +04:00
|
|
|
|
2006-08-23 07:32:36 +04:00
|
|
|
#if !defined(__WINDOWS__)
|
2005-05-13 01:44:23 +04:00
|
|
|
extern char **environ;
|
2006-08-23 07:32:36 +04:00
|
|
|
#endif /* !defined(__WINDOWS__) */
|
2005-05-13 01:44:23 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* Local condition variables and mutex
|
|
|
|
*/
|
2005-07-04 02:45:48 +04:00
|
|
|
static opal_mutex_t orte_setup_hnp_mutex;
|
|
|
|
static opal_condition_t orte_setup_hnp_condition;
|
2005-05-23 18:22:35 +04:00
|
|
|
/* Local return code */
|
|
|
|
static int orte_setup_hnp_rc;
|
|
|
|
/* Local uri storage */
|
|
|
|
static char *orte_setup_hnp_orted_uri;
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-08-11 00:01:25 +04:00
|
|
|
static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata;
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* NON-BLOCKING RECEIVER
|
|
|
|
*/
|
|
|
|
static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
|
|
|
|
orte_buffer_t* buffer, orte_rml_tag_t tag,
|
|
|
|
void* cbdata);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PID WAIT CALLBACK
|
|
|
|
*/
|
|
|
|
static void orte_setup_hnp_wait(pid_t wpid, int status, void *data);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ORTE_SETUP_HNP
|
|
|
|
*/
|
|
|
|
int orte_setup_hnp(char *target_cluster, char *headnode, char *username)
|
|
|
|
{
|
2005-05-17 01:01:09 +04:00
|
|
|
char **argv, *param, *uri, *uid, *hn=NULL;
|
2005-05-13 01:44:23 +04:00
|
|
|
char *path, *name_string, *orteprobe;
|
2005-05-18 21:56:51 +04:00
|
|
|
int argc, rc=ORTE_SUCCESS, id, intparam;
|
2005-05-13 01:44:23 +04:00
|
|
|
pid_t pid;
|
2005-05-18 00:21:59 +04:00
|
|
|
bool can_launch=false, on_gpr=false;
|
2006-02-07 06:32:36 +03:00
|
|
|
orte_cellid_t cellid=ORTE_CELLID_MAX, *cptr;
|
2005-05-13 01:44:23 +04:00
|
|
|
orte_jobid_t jobid;
|
|
|
|
orte_vpid_t vpid;
|
2006-08-15 23:54:10 +04:00
|
|
|
orte_std_cntr_t i, j, k, cnt=0;
|
2005-05-18 00:21:59 +04:00
|
|
|
orte_gpr_value_t **values=NULL, *value;
|
|
|
|
orte_gpr_keyval_t **keyvals;
|
|
|
|
char *keys[4], *tokens[3], *cellname;
|
|
|
|
struct timeval tv;
|
|
|
|
struct timespec ts;
|
2006-02-07 06:32:36 +03:00
|
|
|
bool infrastructure = true, *bptr, tf_flag;
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* get the nodename for the headnode of the target cluster */
|
|
|
|
if (NULL == headnode) { /* not provided, so try to look it up */
|
2005-05-18 00:21:59 +04:00
|
|
|
tokens[0] = target_cluster;
|
|
|
|
tokens[1] = NULL;
|
|
|
|
keys[0] = ORTE_RDS_FE_NAME;
|
|
|
|
keys[1] = ORTE_RDS_FE_SSH;
|
|
|
|
keys[2] = ORTE_CELLID_KEY;
|
|
|
|
keys[3] = NULL;
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR,
|
|
|
|
ORTE_RESOURCE_SEGMENT,
|
|
|
|
tokens, keys, &cnt, &values))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (0 == cnt || 0 == values[0]->cnt) { /* nothing found */
|
|
|
|
goto MOVEON;
|
|
|
|
}
|
|
|
|
on_gpr = true;
|
|
|
|
/* need to decide what to do if more than value found. Some
|
|
|
|
* clusters have more than one head node, so which one do
|
|
|
|
* we choose? For now, just take the first one returned.
|
|
|
|
*/
|
|
|
|
keyvals = values[0]->keyvals;
|
|
|
|
for (i=0; i < values[0]->cnt; i++) {
|
|
|
|
if (0 == strcmp(keyvals[i]->key, ORTE_RDS_FE_NAME)) {
|
2006-08-23 07:32:36 +04:00
|
|
|
hn = strdup((const char*)keyvals[i]->value->data);
|
2005-05-18 00:21:59 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (0 == strcmp(keyvals[i]->key, ORTE_RDS_FE_SSH)) {
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, keyvals[i]->value, ORTE_BOOL))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
can_launch = *bptr;
|
2005-05-18 00:21:59 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (0 == strcmp(keyvals[i]->key, ORTE_CELLID_KEY)) {
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyvals[i]->value, ORTE_CELLID))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
cellid = *cptr;
|
2005-05-18 00:21:59 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
goto MOVEON;
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
} else { /* lookup the headnode's cellid */
|
2005-08-11 00:01:25 +04:00
|
|
|
hn = strdup(headnode);
|
2005-05-18 00:21:59 +04:00
|
|
|
keys[0] = ORTE_RDS_FE_NAME;
|
|
|
|
keys[1] = ORTE_RDS_FE_SSH;
|
|
|
|
keys[2] = ORTE_CELLID_KEY;
|
|
|
|
keys[3] = NULL;
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR,
|
|
|
|
ORTE_RESOURCE_SEGMENT,
|
|
|
|
NULL, keys, &cnt, &values);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
2005-05-18 00:21:59 +04:00
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
/* Nothing found */
|
|
|
|
if (0 == cnt || 0 == values[0]->cnt) {
|
2005-05-18 00:21:59 +04:00
|
|
|
goto MOVEON;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
on_gpr = true;
|
|
|
|
for (i=0; i < cnt; i++) {
|
|
|
|
keyvals = values[i]->keyvals;
|
|
|
|
for (j=0; j < values[i]->cnt; j++) {
|
|
|
|
if ((0 == strcmp(keyvals[j]->key, ORTE_RDS_FE_NAME)) &&
|
2006-08-23 07:32:36 +04:00
|
|
|
0 == strcmp((const char*)keyvals[j]->value->data, headnode)) {
|
2005-05-18 00:21:59 +04:00
|
|
|
/* okay, this is the right cell - now need to find
|
|
|
|
* the ssh flag (if provided) and cellid
|
|
|
|
*/
|
|
|
|
for (k=0; k < values[i]->cnt; k++) {
|
|
|
|
if (0 == strcmp(keyvals[k]->key, ORTE_RDS_FE_SSH)) {
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, keyvals[i]->value, ORTE_BOOL))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
can_launch = *bptr;
|
2005-05-18 00:21:59 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (0 == strcmp(keyvals[k]->key, ORTE_CELLID_KEY)) {
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyvals[i]->value, ORTE_CELLID))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
cellid = *cptr;
|
2005-05-18 00:21:59 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
goto MOVEON;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
MOVEON:
|
|
|
|
if (NULL != values) {
|
2006-02-07 06:32:36 +03:00
|
|
|
for (i=0; i < cnt; i++)
|
2005-08-11 00:01:25 +04:00
|
|
|
OBJ_RELEASE(values[i]);
|
2005-05-18 00:21:59 +04:00
|
|
|
free(values);
|
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
if (!on_gpr && (NULL != target_cluster || NULL != headnode)) {
|
|
|
|
/* if we couldn't find anything about this cell on the gpr, then
|
|
|
|
* we need to put the required headnode data on the registry. We need
|
|
|
|
* it to be there so other functions/processes can find it, if needed.
|
|
|
|
* User must provide either a target_cluster name (which then must be
|
|
|
|
* synonymous with the headnode name), a headnode name (on a named or
|
|
|
|
* unnamed target_cluster), or both.
|
|
|
|
*/
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* get new cellid for this site/resource */
|
|
|
|
if (NULL != target_cluster) {
|
|
|
|
cellname = strdup(target_cluster);
|
|
|
|
} else {
|
|
|
|
/* if the target_cluster was NULL, then headnode CAN'T be NULL
|
|
|
|
* or else we wouldn't get here
|
|
|
|
*/
|
|
|
|
cellname = strdup(headnode);
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* can't know the site name, so it becomes "unknown" */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_ns.create_cellid(&cellid, "unknown", cellname);
|
|
|
|
if (ORTE_SUCCESS != rc ) {
|
2005-05-18 00:21:59 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
free(cellname);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2006-02-07 06:32:36 +03:00
|
|
|
/*
|
|
|
|
* Store the cell info on the resource segment of the registry
|
2005-08-11 00:01:25 +04:00
|
|
|
*/
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
|
|
|
|
ORTE_RESOURCE_SEGMENT, 4, 0))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
2005-05-18 00:21:59 +04:00
|
|
|
}
|
|
|
|
|
2006-02-07 06:32:36 +03:00
|
|
|
rc = orte_schema.get_node_tokens(&(value->tokens), &(value->num_tokens), cellid, cellname);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
2005-05-18 00:21:59 +04:00
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
|
|
|
/* Set Cell Name */
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_RDS_NAME, ORTE_STRING, cellname))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-08-11 00:01:25 +04:00
|
|
|
/* Set Cell ID */
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_CELLID_KEY, ORTE_CELLID, &cellid))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-08-11 00:01:25 +04:00
|
|
|
/* Set Front End Name */
|
2005-05-18 00:21:59 +04:00
|
|
|
if (NULL == headnode) {
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]), ORTE_RDS_FE_NAME, ORTE_STRING, cellname))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-05-18 00:21:59 +04:00
|
|
|
} else {
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]), ORTE_RDS_FE_NAME, ORTE_STRING, headnode))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-05-18 00:21:59 +04:00
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2006-02-07 06:32:36 +03:00
|
|
|
/* Asssume ability to ssh to front end node*/
|
|
|
|
tf_flag = true;
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[3]), ORTE_RDS_FE_SSH, ORTE_BOOL, &tf_flag))) {
|
2005-05-18 00:21:59 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2006-02-07 06:32:36 +03:00
|
|
|
OBJ_RELEASE(value);
|
2005-05-18 00:21:59 +04:00
|
|
|
return rc;
|
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
|
|
|
|
/* Place value in GPR */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_gpr.put(1, &value);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-18 00:21:59 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_RELEASE(value);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
OBJ_RELEASE(value);
|
|
|
|
free(cellname);
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
can_launch = true;
|
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
if (!can_launch || ORTE_CELLID_MAX == cellid) {
|
|
|
|
return ORTE_ERR_UNREACH;
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* get the user's name on the headnode */
|
|
|
|
if (NULL == username) {
|
|
|
|
uid = strdup(orte_system_info.user);
|
|
|
|
} else {
|
|
|
|
uid = strdup(username);
|
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* SETUP TO LAUNCH PROBE */
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* setup the conditioned wait and mutex variables */
|
2005-07-04 02:45:48 +04:00
|
|
|
OBJ_CONSTRUCT(&orte_setup_hnp_mutex, opal_mutex_t);
|
|
|
|
OBJ_CONSTRUCT(&orte_setup_hnp_condition, opal_condition_t);
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* get a jobid for the probe */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_ns.create_jobid(&jobid);
|
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-13 01:44:23 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* get a vpid for the probe */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_ns.reserve_range(jobid, 1, &vpid);
|
|
|
|
if (ORTE_SUCCESS != rc ) {
|
2005-05-13 01:44:23 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* initialize probe's process name... */
|
|
|
|
rc = orte_ns.create_process_name(&(orte_setup_hnp_cbdata.name), cellid, jobid, vpid);
|
2005-08-11 00:01:25 +04:00
|
|
|
if (ORTE_SUCCESS != rc) {
|
2005-05-13 01:44:23 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* ...and get string representation */
|
2005-08-11 00:01:25 +04:00
|
|
|
rc = orte_ns.get_proc_name_string(&name_string, orte_setup_hnp_cbdata.name);
|
|
|
|
if (ORTE_SUCCESS != rc ) {
|
2005-05-13 01:44:23 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* setup callback data on sigchild */
|
2005-08-11 00:01:25 +04:00
|
|
|
if (NULL != target_cluster) {
|
|
|
|
orte_setup_hnp_cbdata.target_cluster = strdup(target_cluster);
|
|
|
|
} else {
|
|
|
|
orte_setup_hnp_cbdata.target_cluster = NULL;
|
|
|
|
}
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
orte_setup_hnp_cbdata.headnode = strdup(headnode);
|
|
|
|
orte_setup_hnp_cbdata.jobid = jobid;
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-17 01:01:09 +04:00
|
|
|
/* get name of probe application - just in case user specified something different */
|
2005-05-13 01:44:23 +04:00
|
|
|
id = mca_base_param_register_string("orteprobe",NULL,NULL,NULL,"orteprobe");
|
|
|
|
mca_base_param_lookup_string(id, &orteprobe);
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-17 01:01:09 +04:00
|
|
|
/* get rsh/ssh launch mechanism parameters */
|
|
|
|
id = mca_base_param_register_string("pls","rsh","agent",NULL,"ssh");
|
|
|
|
mca_base_param_lookup_string(id, ¶m);
|
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* Initialize the argv array */
|
2005-07-04 04:13:44 +04:00
|
|
|
argv = opal_argv_split(param, ' ');
|
|
|
|
argc = opal_argv_count(argv);
|
2005-05-13 01:44:23 +04:00
|
|
|
if (argc <= 0) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
|
|
rc = ORTE_ERR_BAD_PARAM;
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
free(param);
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* setup the path */
|
2005-07-04 05:59:52 +04:00
|
|
|
path = opal_path_findv(argv[0], 0, environ, NULL);
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/* add the username and nodename */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "-l");
|
|
|
|
opal_argv_append(&argc, &argv, uid);
|
|
|
|
opal_argv_append(&argc, &argv, hn);
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/* add the probe application */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, orteprobe);
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* tell the probe it's name */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--name");
|
|
|
|
opal_argv_append(&argc, &argv, name_string);
|
2005-05-13 01:44:23 +04:00
|
|
|
|
|
|
|
/* setup probe's ns contact info */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--nsreplica");
|
2005-05-13 01:44:23 +04:00
|
|
|
if(NULL != orte_process_info.ns_replica_uri) {
|
|
|
|
uri = strdup(orte_process_info.ns_replica_uri);
|
|
|
|
} else {
|
|
|
|
uri = orte_rml.get_uri();
|
|
|
|
}
|
|
|
|
asprintf(¶m, "\"%s\"", uri);
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, param);
|
2005-05-18 00:21:59 +04:00
|
|
|
free(param);
|
2005-05-13 01:44:23 +04:00
|
|
|
free(uri);
|
|
|
|
|
|
|
|
/* setup probe's gpr contact info */
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--gprreplica");
|
2005-05-13 01:44:23 +04:00
|
|
|
if(NULL != orte_process_info.gpr_replica_uri) {
|
|
|
|
uri = strdup(orte_process_info.gpr_replica_uri);
|
|
|
|
} else {
|
|
|
|
uri = orte_rml.get_uri();
|
|
|
|
}
|
|
|
|
asprintf(¶m, "\"%s\"", uri);
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, param);
|
2005-05-18 00:21:59 +04:00
|
|
|
free(param);
|
2005-05-13 01:44:23 +04:00
|
|
|
free(uri);
|
|
|
|
|
|
|
|
/* tell the probe who to report to */
|
|
|
|
uri = orte_rml.get_uri();
|
2005-05-18 00:21:59 +04:00
|
|
|
asprintf(¶m, "\"%s\"", uri);
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--requestor");
|
|
|
|
opal_argv_append(&argc, &argv, param);
|
2005-05-18 00:21:59 +04:00
|
|
|
free(param);
|
2005-05-13 01:44:23 +04:00
|
|
|
free(uri);
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-18 21:56:51 +04:00
|
|
|
/* pass along any parameters for the head node process
|
|
|
|
* in case one needs to be created
|
|
|
|
*/
|
2005-08-11 00:01:25 +04:00
|
|
|
id = mca_base_param_register_string("scope",NULL,NULL,NULL,"public");
|
2005-05-18 21:56:51 +04:00
|
|
|
mca_base_param_lookup_string(id, ¶m);
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--scope");
|
|
|
|
opal_argv_append(&argc, &argv, param);
|
2005-05-18 21:56:51 +04:00
|
|
|
free(param);
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-18 21:56:51 +04:00
|
|
|
id = mca_base_param_register_int("persistent",NULL,NULL,NULL,(int)false);
|
|
|
|
mca_base_param_lookup_int(id, &intparam);
|
|
|
|
if (intparam) {
|
2005-07-04 04:13:44 +04:00
|
|
|
opal_argv_append(&argc, &argv, "--persistent");
|
2005-05-18 21:56:51 +04:00
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
/* issue the non-blocking recv to get the probe's findings */
|
|
|
|
rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PROBE,
|
|
|
|
0, orte_setup_hnp_recv, NULL);
|
|
|
|
if(rc < 0) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2006-08-23 07:32:36 +04:00
|
|
|
#ifndef __WINDOWS__
|
2005-05-13 01:44:23 +04:00
|
|
|
/* fork a child to exec the rsh/ssh session */
|
2005-05-23 18:22:35 +04:00
|
|
|
orte_setup_hnp_rc = ORTE_SUCCESS;
|
2005-05-13 01:44:23 +04:00
|
|
|
pid = fork();
|
|
|
|
if (pid < 0) {
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (pid == 0) { /* child */
|
|
|
|
/* exec the probe launch */
|
|
|
|
execv(path, argv);
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "orte_setup_hnp: execv failed with errno=%d\n", errno);
|
2005-05-13 01:44:23 +04:00
|
|
|
return ORTE_ERROR;
|
2005-05-05 23:20:47 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
} else { /* parent */
|
|
|
|
orte_wait_cb(pid, orte_setup_hnp_wait, &orte_setup_hnp_cbdata);
|
|
|
|
|
2005-05-18 00:21:59 +04:00
|
|
|
/* block until a timeout occurs or probe dies/calls back */
|
|
|
|
gettimeofday(&tv, NULL);
|
|
|
|
ts.tv_sec = tv.tv_sec + 1000000;
|
|
|
|
ts.tv_nsec = 0;
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&orte_setup_hnp_mutex);
|
|
|
|
opal_condition_timedwait(&orte_setup_hnp_condition, &orte_setup_hnp_mutex, &ts);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_setup_hnp_mutex);
|
2005-05-23 18:22:35 +04:00
|
|
|
|
|
|
|
if (ORTE_SUCCESS == orte_setup_hnp_rc) {
|
2005-08-27 00:13:35 +04:00
|
|
|
/* Remember if we were infrastructre or not */
|
|
|
|
id = mca_base_param_find("orte", NULL, "infrastructure");
|
|
|
|
mca_base_param_lookup_int(id, &intparam);
|
|
|
|
if ( ((int)true) != intparam) {
|
|
|
|
infrastructure = false;
|
|
|
|
}
|
|
|
|
|
2006-06-29 23:05:41 +04:00
|
|
|
/* need to restart the local system so it can connect to the remote daemon. */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_restart(orte_setup_hnp_cbdata.name, orte_setup_hnp_orted_uri))) {
|
|
|
|
/** can't use ORTE_ERROR_LOG here as it may no longer be valid. Since we may
|
|
|
|
* have gotten part way through the shutdown/restart process, we can't have
|
|
|
|
* any idea of our current state - all we can really do at this point is
|
|
|
|
* abort
|
|
|
|
*/
|
|
|
|
fprintf(stderr, "orte_setup_hnp: aborted during restart of local process\n");
|
2005-05-23 18:22:35 +04:00
|
|
|
}
|
2005-08-11 00:01:25 +04:00
|
|
|
|
2005-05-23 18:22:35 +04:00
|
|
|
/*
|
|
|
|
* ...and we are now ready to go!
|
|
|
|
*/
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-05-23 18:22:35 +04:00
|
|
|
return orte_setup_hnp_rc;
|
2005-05-18 00:21:59 +04:00
|
|
|
}
|
2005-05-13 19:05:07 +04:00
|
|
|
#else
|
2006-08-23 07:32:36 +04:00
|
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
|
|
|
opal_output(0, "This function has not been implemented in windows yet, file %s line %d\n", __FILE__, __LINE__);
|
2005-05-13 19:05:07 +04:00
|
|
|
abort();
|
|
|
|
#endif
|
|
|
|
|
2006-08-23 07:32:36 +04:00
|
|
|
CLEANUP:
|
|
|
|
return rc;
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
|
|
|
|
orte_buffer_t* buffer, orte_rml_tag_t tag,
|
|
|
|
void* cbdata)
|
|
|
|
{
|
2006-08-15 23:54:10 +04:00
|
|
|
orte_std_cntr_t n=1;
|
2005-05-18 22:24:14 +04:00
|
|
|
int rc;
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&orte_setup_hnp_mutex);
|
2006-02-07 06:32:36 +03:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &orte_setup_hnp_orted_uri, &n, ORTE_STRING))) {
|
2005-05-18 22:24:14 +04:00
|
|
|
ORTE_ERROR_LOG(rc);
|
2005-05-23 18:22:35 +04:00
|
|
|
orte_setup_hnp_rc = rc;
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_condition_signal(&orte_setup_hnp_condition);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_setup_hnp_mutex);
|
2005-05-23 18:22:35 +04:00
|
|
|
return;
|
2005-05-18 22:24:14 +04:00
|
|
|
}
|
2005-05-23 18:22:35 +04:00
|
|
|
orte_setup_hnp_rc = ORTE_SUCCESS;
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_condition_signal(&orte_setup_hnp_condition);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_setup_hnp_mutex);
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata)
|
2005-05-05 23:20:47 +04:00
|
|
|
{
|
2005-05-13 01:44:23 +04:00
|
|
|
orte_setup_hnp_cb_data_t *data;
|
2006-02-07 06:32:36 +03:00
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&orte_setup_hnp_mutex);
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-05-13 01:44:23 +04:00
|
|
|
data = (orte_setup_hnp_cb_data_t*)cbdata;
|
|
|
|
|
|
|
|
/* if ssh exited abnormally, print something useful to the user and cleanup
|
|
|
|
* the registry entries for the HNP jobid.
|
|
|
|
This should somehow be pushed up to the calling level, but we
|
|
|
|
don't really have a way to do that just yet.
|
|
|
|
*/
|
|
|
|
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
|
|
|
|
/* tell the user something went wrong */
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "ERROR: The probe on head node %s of the %s cluster failed to start as expected.",
|
2005-05-13 01:44:23 +04:00
|
|
|
data->headnode, data->target_cluster);
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "ERROR: There may be more information available from");
|
|
|
|
opal_output(0, "ERROR: the remote shell (see above).");
|
2005-05-13 01:44:23 +04:00
|
|
|
if (WIFEXITED(status)) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "ERROR: The probe exited unexpectedly with status %d.",
|
2005-05-13 01:44:23 +04:00
|
|
|
WEXITSTATUS(status));
|
|
|
|
} else if (WIFSIGNALED(status)) {
|
|
|
|
#ifdef WCOREDUMP
|
|
|
|
if (WCOREDUMP(status)) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "The probe received a signal %d (with core).",
|
2005-05-13 01:44:23 +04:00
|
|
|
WTERMSIG(status));
|
|
|
|
} else {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "The probe received a signal %d.", WTERMSIG(status));
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
#else
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "The probe received a signal %d.", WTERMSIG(status));
|
2005-05-13 01:44:23 +04:00
|
|
|
#endif /* WCOREDUMP */
|
|
|
|
} else {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "No extra status information is available: %d.", status);
|
2005-05-13 01:44:23 +04:00
|
|
|
}
|
|
|
|
}
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_condition_signal(&orte_setup_hnp_condition);
|
|
|
|
OPAL_THREAD_UNLOCK(&orte_setup_hnp_mutex);
|
2005-05-18 00:21:59 +04:00
|
|
|
|
2005-05-05 23:20:47 +04:00
|
|
|
}
|
2005-05-13 01:44:23 +04:00
|
|
|
|