1
1

A bit of cleanup and a couple of bug fixes for remote orted launching

using orteprobe.

Created a header file for orte_setup_hnp. [HNP = Head Node Process]

General cleanup and added a bit of documentation in orte_setup_hnp.c
Also fixed a cellid tokens issue (circa line 285)
Changed the launched scope from private to public

In orteprobe:
- added reference to orted.h to avoid duplicate header contents in orteprobe.h
- removed the version tag, and put in a verbose argument
- Fixed a buffer packing problem that was preventing the parent from receiving
  the proper contact information for the new daemon.

This commit was SVN r6802.
Этот коммит содержится в:
Josh Hursey 2005-08-10 20:01:25 +00:00
родитель b405316075
Коммит afe7e687cb
4 изменённых файлов: 166 добавлений и 93 удалений

Просмотреть файл

@ -63,19 +63,10 @@
#include "mca/errmgr/errmgr.h"
#include "runtime/runtime.h"
#include "runtime/orte_setup_hnp.h"
extern char **environ;
/*
* Local data structure
*/
typedef struct {
char *target_cluster;
char *headnode;
orte_process_name_t *name;
orte_jobid_t jobid;
} orte_setup_hnp_cb_data_t;
/* Local condition variables and mutex
*/
static opal_mutex_t orte_setup_hnp_mutex;
@ -85,7 +76,7 @@ static int orte_setup_hnp_rc;
/* Local uri storage */
static char *orte_setup_hnp_orted_uri;
static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata = {NULL, NULL, NULL, 0};
static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata;
/*
* NON-BLOCKING RECEIVER
@ -161,20 +152,24 @@ int orte_setup_hnp(char *target_cluster, char *headnode, char *username)
goto MOVEON;
} else { /* lookup the headnode's cellid */
hn = strdup(headnode);
hn = strdup(headnode);
keys[0] = ORTE_RDS_FE_NAME;
keys[1] = ORTE_RDS_FE_SSH;
keys[2] = ORTE_CELLID_KEY;
keys[3] = NULL;
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR,
ORTE_RESOURCE_SEGMENT,
NULL, keys, &cnt, &values))) {
ORTE_ERROR_LOG(rc);
return rc;
rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR,
ORTE_RESOURCE_SEGMENT,
NULL, keys, &cnt, &values);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 == cnt || 0 == values[0]->cnt) { /* nothing found */
/* Nothing found */
if (0 == cnt || 0 == values[0]->cnt) {
goto MOVEON;
}
on_gpr = true;
for (i=0; i < cnt; i++) {
keyvals = values[i]->keyvals;
@ -202,7 +197,8 @@ int orte_setup_hnp(char *target_cluster, char *headnode, char *username)
MOVEON:
if (NULL != values) {
for (i=0; i < cnt; i++) OBJ_RELEASE(values[i]);
for (i=0; i < cnt; i++)
OBJ_RELEASE(values[i]);
free(values);
}
@ -214,6 +210,7 @@ MOVEON:
* synonymous with the headnode name), a headnode name (on a named or
* unnamed target_cluster), or both.
*/
/* get new cellid for this site/resource */
if (NULL != target_cluster) {
cellname = strdup(target_cluster);
@ -223,28 +220,33 @@ MOVEON:
*/
cellname = strdup(headnode);
}
/* can't know the site name, so it becomes "unknown" */
if (ORTE_SUCCESS != (rc = orte_ns.create_cellid(&cellid, "UNKNOWN",
cellname))) {
rc = orte_ns.create_cellid(&cellid, "unknown", cellname);
if (ORTE_SUCCESS != rc ) {
ORTE_ERROR_LOG(rc);
free(cellname);
return rc;
}
/* now store the cell info on the resource segment of the registry */
/*
* Store the cell info on the resource segment of the registry
*/
value = OBJ_NEW(orte_gpr_value_t);
if (NULL == value) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
value->addr_mode = ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR;
value->segment = strdup(ORTE_RESOURCE_SEGMENT);
value->segment = strdup(ORTE_RESOURCE_SEGMENT);
value->cnt = 4;
value->cnt = 4;
value->keyvals = (orte_gpr_keyval_t**)malloc(value->cnt * sizeof(orte_gpr_keyval_t*));
if (NULL == value->keyvals) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
for (i=0; i < value->cnt; i++) {
value->keyvals[i] = OBJ_NEW(orte_gpr_keyval_t);
if (NULL == value->keyvals[i]) {
@ -252,21 +254,29 @@ MOVEON:
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
value->keyvals[0]->key = strdup(ORTE_RDS_NAME);
value->keyvals[0]->type = ORTE_STRING;
/* Set Cell Name */
value->keyvals[0]->key = strdup(ORTE_RDS_NAME);
value->keyvals[0]->type = ORTE_STRING;
value->keyvals[0]->value.strptr = strdup(cellname);
value->keyvals[1]->key = strdup(ORTE_CELLID_KEY);
value->keyvals[1]->type = ORTE_CELLID;
/* Set Cell ID */
value->keyvals[1]->key = strdup(ORTE_CELLID_KEY);
value->keyvals[1]->type = ORTE_CELLID;
value->keyvals[1]->value.cellid = cellid;
value->keyvals[2]->key = strdup(ORTE_RDS_FE_NAME);
/* Set Front End Name */
value->keyvals[2]->key = strdup(ORTE_RDS_FE_NAME);
value->keyvals[2]->type = ORTE_STRING;
if (NULL == headnode) {
value->keyvals[2]->value.strptr = strdup(cellname);
} else {
value->keyvals[2]->value.strptr = strdup(headnode);
}
value->keyvals[3]->key = strdup(ORTE_RDS_FE_SSH);
value->keyvals[3]->type = ORTE_BOOL;
/* Assume ability to ssh to front end node */
value->keyvals[3]->key = strdup(ORTE_RDS_FE_SSH);
value->keyvals[3]->type = ORTE_BOOL;
value->keyvals[3]->value.tf_flag = true;
value->num_tokens = 3;
@ -275,25 +285,27 @@ MOVEON:
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_ns.convert_cellid_to_string(&(value->tokens[0]), cellid))) {
rc = orte_schema.get_node_tokens(&value->tokens, &value->num_tokens, cellid, cellname);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
value->tokens[1] = strdup("UNKNOWN"); /* site name is unknown */
value->tokens[2] = strdup(cellname);
if (ORTE_SUCCESS != orte_gpr.put(1, &value)) {
/* Place tokens in GPR */
rc = orte_gpr.put(1, &value);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(value);
return rc;
}
OBJ_RELEASE(value);
free(cellname);
can_launch = true;
}
orte_gpr.dump_segment(NULL, 0);
if (!can_launch || ORTE_CELLID_MAX == cellid) {
return ORTE_ERR_UNREACH;
}
@ -312,28 +324,40 @@ MOVEON:
OBJ_CONSTRUCT(&orte_setup_hnp_condition, opal_condition_t);
/* get a jobid for the probe */
if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&jobid))) {
rc = orte_ns.create_jobid(&jobid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* get a vpid for the probe */
if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, 1, &vpid))) {
rc = orte_ns.reserve_range(jobid, 1, &vpid);
if (ORTE_SUCCESS != rc ) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* initialize probe's process name... */
rc = orte_ns.create_process_name(&(orte_setup_hnp_cbdata.name), cellid, jobid, vpid);
if(ORTE_SUCCESS != rc) {
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* ...and get string representation */
if(ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&name_string, orte_setup_hnp_cbdata.name))) {
rc = orte_ns.get_proc_name_string(&name_string, orte_setup_hnp_cbdata.name);
if (ORTE_SUCCESS != rc ) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* setup callback data on sigchild */
orte_setup_hnp_cbdata.target_cluster = strdup(target_cluster);
if (NULL != target_cluster) {
orte_setup_hnp_cbdata.target_cluster = strdup(target_cluster);
} else {
orte_setup_hnp_cbdata.target_cluster = NULL;
}
orte_setup_hnp_cbdata.headnode = strdup(headnode);
orte_setup_hnp_cbdata.jobid = jobid;
@ -405,7 +429,7 @@ MOVEON:
/* pass along any parameters for the head node process
* in case one needs to be created
*/
id = mca_base_param_register_string("scope",NULL,NULL,NULL,"private");
id = mca_base_param_register_string("scope",NULL,NULL,NULL,"public");
mca_base_param_lookup_string(id, &param);
opal_argv_append(&argc, &argv, "--scope");
opal_argv_append(&argc, &argv, param);
@ -459,25 +483,29 @@ MOVEON:
* utilities, though, or we will lose all of our MCA parameters
*/
orte_system_finalize();
/*
* now set the relevant MCA parameters to point us at the remote daemon...
*/
if (ORTE_SUCCESS != (rc = opal_setenv("OMPI_MCA_gpr_replica_uri",
orte_setup_hnp_orted_uri, true, &environ))) {
rc = opal_setenv("OMPI_MCA_gpr_replica_uri",
orte_setup_hnp_orted_uri, true, &environ);
if (ORTE_SUCCESS != rc) {
fprintf(stderr, "orte_setup_hnp: could not set gpr_replica_uri in environ\n");
return rc;
}
if (ORTE_SUCCESS != (rc = opal_setenv("OMPI_MCA_ns_replica_uri",
orte_setup_hnp_orted_uri, true, &environ))) {
rc = opal_setenv("OMPI_MCA_ns_replica_uri",
orte_setup_hnp_orted_uri, true, &environ);
if (ORTE_SUCCESS != rc) {
fprintf(stderr, "orte_setup_hnp: could not set ns_replica_uri in environ\n");
return rc;
}
opal_unsetenv("OMPI_MCA_seed", &environ);
if (ORTE_SUCCESS != (rc = opal_setenv("OMPI_MCA_universe_uri",
orte_setup_hnp_orted_uri, true, &environ))) {
rc = opal_setenv("OMPI_MCA_universe_uri",
orte_setup_hnp_orted_uri, true, &environ);
if (ORTE_SUCCESS != rc) {
fprintf(stderr, "orte_setup_hnp: could not set universe_uri in environ\n");
return rc;
}
@ -485,10 +513,12 @@ MOVEON:
/*
* ...re-init ourselves...
*/
if (ORTE_SUCCESS != (rc = orte_system_init())) {
rc = orte_system_init();
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/*
* ...and we are now ready to go!
*/

39
orte/runtime/orte_setup_hnp.h Обычный файл
Просмотреть файл

@ -0,0 +1,39 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Establish a Head Node Process on a cluster's front end
*/
#ifndef ORTE_SETUP_HNP_H
#define ORTE_SETUP_HNP_H
/*
* Callback data describing the probe that orte_setup_hnp() launches.
*
* orte_setup_hnp.c keeps a single file-scope instance of this structure
* and fills it in just before the probe is started.
* NOTE(review): the strings appear to be strdup()'d copies owned by that
* instance -- confirm who frees them.
*/
typedef struct {
char *target_cluster; /* copy of the caller's target_cluster, or NULL if none was given */
char *headnode; /* copy of the caller's headnode name */
orte_process_name_t *name; /* process name created for the probe (cellid, jobid, vpid) */
orte_jobid_t jobid; /* jobid obtained from the name service for the probe */
} orte_setup_hnp_cb_data_t;
/**
* Establish a Head Node Process (HNP) on a cluster's front end.
*
* The caller may supply a target cluster name (which may be synonymous
* with the headnode name), a headnode name, or both.
*
* @param target_cluster Name of the target cluster; may be NULL.
* @param headnode Name of the cluster's head (front end) node.
* @param username NOTE(review): presumably the login to use on the
* remote front end -- not shown in this view, confirm.
*
* @return An ORTE status code. Failure values visible in the
* implementation include ORTE_ERR_UNREACH and
* ORTE_ERR_OUT_OF_RESOURCE, plus codes propagated from the
* name service and registry calls. Presumably ORTE_SUCCESS
* on success -- TODO confirm.
*/
int orte_setup_hnp(char *target_cluster, char *headnode, char *username);
#endif

Просмотреть файл

@ -78,9 +78,9 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
&orteprobe_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL, NULL, NULL, '\0', NULL, "version", 0,
&orteprobe_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
"Show the orteprobe version" },
{ NULL, NULL, NULL, NULL, NULL, "verbose", 0,
&orteprobe_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Toggle Verbosity" },
{ NULL, NULL, NULL, 'd', NULL, "debug", 0,
&orteprobe_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
@ -144,12 +144,13 @@ int main(int argc, char *argv[])
memset(&orteprobe_globals, 0, sizeof(orteprobe_globals));
cmd_line = OBJ_NEW(opal_cmd_line_t);
opal_cmd_line_create(cmd_line, orte_cmd_line_opts);
if (OMPI_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, true,
argc, argv))) {
ret = opal_cmd_line_parse(cmd_line, true, argc, argv);
if (OMPI_SUCCESS != ret) {
return ret;
}
/* check for help and version requests */
/* check for help request */
if (orteprobe_globals.help) {
char *args = NULL;
args = opal_cmd_line_get_usage_msg(cmd_line);
@ -159,12 +160,6 @@ int main(int argc, char *argv[])
return 1;
}
if (orteprobe_globals.version) {
/* show version message */
printf("...showing off my version!\n");
exit(1);
}
/*
* Attempt to parse the probe's name and save in proc_info
*/
@ -172,7 +167,7 @@ int main(int argc, char *argv[])
ret = orte_ns_base_convert_string_to_process_name(
&orte_process_info.my_name, orteprobe_globals.name_string);
if(ORTE_SUCCESS != ret) {
fprintf(stderr, "Couldn't convert environmental string to probe's process name\n");
fprintf(stderr, "orteprobe: Couldn't convert environmental string to probe's process name\n");
return 1;
}
}
@ -297,16 +292,16 @@ int main(int argc, char *argv[])
if (orteprobe_globals.requestor_string) {
if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(
orteprobe_globals.requestor_string, &requestor, NULL))) {
fprintf(stderr, "Couldn't parse environmental string for requestor's contact info\n");
fprintf(stderr, "orteprobe: Couldn't parse environmental string for requestor's contact info\n");
return 1;
}
/* set the contact info */
if (ORTE_SUCCESS != (ret = orte_rml.set_uri(orteprobe_globals.requestor_string))) {
fprintf(stderr, "Couldn't set contact info for requestor\n");
fprintf(stderr, "orteprobe: Couldn't set contact info for requestor\n");
return ret;
}
} else {
fprintf(stderr, "No contact info received for requestor\n");
fprintf(stderr, "orteprobe: No contact info received for requestor\n");
return 1;
}
@ -315,29 +310,37 @@ int main(int argc, char *argv[])
*/
if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) {
/* universe is here! send info back and die */
fprintf(stderr, "contacted existing universe - sending contact info back\n");
if(orteprobe_globals.verbose)
fprintf(stderr, "orteprobe: Contacted existing universe - sending contact info back\n");
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
orted_uri_ptr = &(univ.seed_uri);
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) {
fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n");
exit(1);
}
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n");
OBJ_DESTRUCT(&buffer);
exit(1);
}
OBJ_DESTRUCT(&buffer);
OBJ_DESTRUCT(&buffer);
} else {
/* existing universe is not here or does not allow contact.
* ensure we have a unique universe name, fork/exec an appropriate
* daemon, and then tell whomever spawned us how to talk to the new
* daemon
*/
fprintf(stderr, "could not connect to existing universe\n");
if(orteprobe_globals.verbose)
fprintf(stderr, "orteprobe: Could not connect to existing universe\n");
if (ORTE_ERR_NOT_FOUND != ret) {
fprintf(stderr, "existing universe did not respond\n");
if(orteprobe_globals.verbose)
fprintf(stderr, "orteprobe: Existing universe did not respond\n");
/* if it exists but no contact could be established,
* define unique name based on current one.
*/
@ -345,11 +348,13 @@ fprintf(stderr, "existing universe did not respond\n");
free(orte_universe_info.name);
orte_universe_info.name = NULL;
pid = getpid();
if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) {
fprintf(stderr, "orteprobe: failed to create unique universe name");
exit(1);
}
}
/* setup to fork/exec the new universe */
/* setup the pipe to get the contact info back */
if (pipe(orted_pipe)) {
@ -361,7 +366,8 @@ fprintf(stderr, "existing universe did not respond\n");
id = mca_base_param_register_string("orted",NULL,NULL,NULL,"orted");
mca_base_param_lookup_string(id, &orted);
fprintf(stderr, "using %s for orted command\n", orted);
if(orteprobe_globals.verbose)
fprintf(stderr, "orteprobe: Using \"%s\" for orted command\n", orted);
/* Initialize the argv array */
ortedargv = opal_argv_split(orted, ' ');
@ -374,7 +380,8 @@ fprintf(stderr, "using %s for orted command\n", orted);
/* setup the path */
path = opal_path_findv(ortedargv[0], 0, environ, NULL);
fprintf(stderr, "path setup as %s\n", path);
if(orteprobe_globals.verbose)
fprintf(stderr, "orteprobe: Path setup as \"%s\"\n", path);
/* tell the daemon it's the seed */
opal_argv_append(&ortedargc, &ortedargv, "--seed");
@ -394,7 +401,8 @@ fprintf(stderr, "path setup as %s\n", path);
opal_argv_append(&ortedargc, &ortedargv, param);
free(param);
fprintf(stderr, "forking now\n");
if(orteprobe_globals.verbose)
fprintf(stderr, "orteprobe: Forking now\n");
/* Create the child process. */
pid = fork ();
@ -412,7 +420,8 @@ fprintf(stderr, "forking now\n");
/* This is the parent process.
Close write end first. */
fprintf(stderr, "attempting to read from daemon\n");
if(orteprobe_globals.verbose)
fprintf(stderr, "orteprobe: Attempting to read from daemon\n");
read(orted_pipe[0], orted_uri, 255);
close(orted_pipe[0]);
@ -421,26 +430,34 @@ fprintf(stderr, "attempting to read from daemon\n");
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
param = orted_uri;
orted_uri_ptr = &param;
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) {
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) {
fprintf(stderr, "orteprobe: failed to pack daemon uri\n");
exit(1);
}
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n");
OBJ_DESTRUCT(&buffer);
exit(1);
}
OBJ_DESTRUCT(&buffer);
}
}
if(orteprobe_globals.verbose)
fprintf(stderr, "orteprobe: All finished!\n");
/* cleanup */
if (NULL != contact_path) {
unlink(contact_path);
}
if (NULL != log_path) {
unlink(log_path);
}
/* finalize the system */
orte_finalize();

Просмотреть файл

@ -27,28 +27,15 @@
#include "opal/util/cmd_line.h"
#include "mca/mca.h"
/*
* Definitions needed for communication
*/
#define ORTE_DAEMON_CMD ORTE_INT16
#define ORTE_DAEMON_HOSTFILE_CMD 0x01
#define ORTE_DAEMON_SCRIPTFILE_CMD 0x02
#define ORTE_DAEMON_CONTACT_QUERY_CMD 0x03
#define ORTE_DAEMON_HEARTBEAT_CMD 0xfe
#define ORTE_DAEMON_EXIT_CMD 0xff
#include "tools/orted/orted.h"
/*
* Globals
*/
typedef uint16_t orte_daemon_cmd_flag_t;
typedef struct {
bool help;
bool version;
bool verbose;
bool debug;
char* name_string;
char* requestor_string;