diff --git a/orte/runtime/orte_setup_hnp.c b/orte/runtime/orte_setup_hnp.c index ac5a3113c7..9bbfa8f2d6 100644 --- a/orte/runtime/orte_setup_hnp.c +++ b/orte/runtime/orte_setup_hnp.c @@ -63,19 +63,10 @@ #include "mca/errmgr/errmgr.h" #include "runtime/runtime.h" +#include "runtime/orte_setup_hnp.h" extern char **environ; -/* - * Local data structure - */ -typedef struct { - char *target_cluster; - char *headnode; - orte_process_name_t *name; - orte_jobid_t jobid; -} orte_setup_hnp_cb_data_t; - /* Local condition variables and mutex */ static opal_mutex_t orte_setup_hnp_mutex; @@ -85,7 +76,7 @@ static int orte_setup_hnp_rc; /* Local uri storage */ static char *orte_setup_hnp_orted_uri; -static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata = {NULL, NULL, NULL, 0}; +static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata; /* * NON-BLOCKING RECEIVER @@ -161,20 +152,24 @@ int orte_setup_hnp(char *target_cluster, char *headnode, char *username) goto MOVEON; } else { /* lookup the headnode's cellid */ - hn = strdup(headnode); + hn = strdup(headnode); keys[0] = ORTE_RDS_FE_NAME; keys[1] = ORTE_RDS_FE_SSH; keys[2] = ORTE_CELLID_KEY; keys[3] = NULL; - if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR, - ORTE_RESOURCE_SEGMENT, - NULL, keys, &cnt, &values))) { - ORTE_ERROR_LOG(rc); - return rc; + + rc = orte_gpr.get(ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR, + ORTE_RESOURCE_SEGMENT, + NULL, keys, &cnt, &values); + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; } - if (0 == cnt || 0 == values[0]->cnt) { /* nothing found */ + /* Nothing found */ + if (0 == cnt || 0 == values[0]->cnt) { goto MOVEON; } + on_gpr = true; for (i=0; i < cnt; i++) { keyvals = values[i]->keyvals; @@ -202,7 +197,8 @@ int orte_setup_hnp(char *target_cluster, char *headnode, char *username) MOVEON: if (NULL != values) { - for (i=0; i < cnt; i++) OBJ_RELEASE(values[i]); + for (i=0; i < cnt; i++) + OBJ_RELEASE(values[i]); free(values); } @@ -214,6 +210,7 @@ MOVEON: * 
synonymous with the headnode name), a headnode name (on a named or * unnamed target_cluster), or both. */ + /* get new cellid for this site/resource */ if (NULL != target_cluster) { cellname = strdup(target_cluster); @@ -223,28 +220,33 @@ MOVEON: */ cellname = strdup(headnode); } + /* can't know the site name, so it becomes "unknown" */ - if (ORTE_SUCCESS != (rc = orte_ns.create_cellid(&cellid, "UNKNOWN", - cellname))) { + rc = orte_ns.create_cellid(&cellid, "unknown", cellname); + if (ORTE_SUCCESS != rc ) { ORTE_ERROR_LOG(rc); free(cellname); return rc; } - /* now store the cell info on the resource segment of the registry */ + + /* + * Store the cell info on the resource segment of the registry + */ value = OBJ_NEW(orte_gpr_value_t); if (NULL == value) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } value->addr_mode = ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR; - value->segment = strdup(ORTE_RESOURCE_SEGMENT); + value->segment = strdup(ORTE_RESOURCE_SEGMENT); - value->cnt = 4; + value->cnt = 4; value->keyvals = (orte_gpr_keyval_t**)malloc(value->cnt * sizeof(orte_gpr_keyval_t*)); if (NULL == value->keyvals) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } + for (i=0; i < value->cnt; i++) { value->keyvals[i] = OBJ_NEW(orte_gpr_keyval_t); if (NULL == value->keyvals[i]) { @@ -252,21 +254,29 @@ MOVEON: return ORTE_ERR_OUT_OF_RESOURCE; } } - value->keyvals[0]->key = strdup(ORTE_RDS_NAME); - value->keyvals[0]->type = ORTE_STRING; + + /* Set Cell Name */ + value->keyvals[0]->key = strdup(ORTE_RDS_NAME); + value->keyvals[0]->type = ORTE_STRING; value->keyvals[0]->value.strptr = strdup(cellname); - value->keyvals[1]->key = strdup(ORTE_CELLID_KEY); - value->keyvals[1]->type = ORTE_CELLID; + + /* Set Cell ID */ + value->keyvals[1]->key = strdup(ORTE_CELLID_KEY); + value->keyvals[1]->type = ORTE_CELLID; value->keyvals[1]->value.cellid = cellid; - value->keyvals[2]->key = strdup(ORTE_RDS_FE_NAME); + + /* Set Front End 
Name */ + value->keyvals[2]->key = strdup(ORTE_RDS_FE_NAME); value->keyvals[2]->type = ORTE_STRING; if (NULL == headnode) { value->keyvals[2]->value.strptr = strdup(cellname); } else { value->keyvals[2]->value.strptr = strdup(headnode); } - value->keyvals[3]->key = strdup(ORTE_RDS_FE_SSH); - value->keyvals[3]->type = ORTE_BOOL; + + /* Assume ability to ssh to front end node */ + value->keyvals[3]->key = strdup(ORTE_RDS_FE_SSH); + value->keyvals[3]->type = ORTE_BOOL; value->keyvals[3]->value.tf_flag = true; value->num_tokens = 3; @@ -275,25 +285,27 @@ MOVEON: ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } - if (ORTE_SUCCESS != (rc = orte_ns.convert_cellid_to_string(&(value->tokens[0]), cellid))) { + + rc = orte_schema.get_node_tokens(&value->tokens, &value->num_tokens, cellid, cellname); + if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } - value->tokens[1] = strdup("UNKNOWN"); /* site name is unknown */ - value->tokens[2] = strdup(cellname); - if (ORTE_SUCCESS != orte_gpr.put(1, &value)) { + /* Place tokens in GPR */ + rc = orte_gpr.put(1, &value); + if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(value); return rc; } + OBJ_RELEASE(value); free(cellname); + can_launch = true; } - orte_gpr.dump_segment(NULL, 0); - if (!can_launch || ORTE_CELLID_MAX == cellid) { return ORTE_ERR_UNREACH; } @@ -312,28 +324,40 @@ MOVEON: OBJ_CONSTRUCT(&orte_setup_hnp_condition, opal_condition_t); /* get a jobid for the probe */ - if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&jobid))) { + rc = orte_ns.create_jobid(&jobid); + if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } + /* get a vpid for the probe */ - if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, 1, &vpid))) { + rc = orte_ns.reserve_range(jobid, 1, &vpid); + if (ORTE_SUCCESS != rc ) { ORTE_ERROR_LOG(rc); return rc; } + /* initialize probe's process name... 
*/ rc = orte_ns.create_process_name(&(orte_setup_hnp_cbdata.name), cellid, jobid, vpid); - if(ORTE_SUCCESS != rc) { + if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } + /* ...and get string representation */ - if(ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&name_string, orte_setup_hnp_cbdata.name))) { + rc = orte_ns.get_proc_name_string(&name_string, orte_setup_hnp_cbdata.name); + if (ORTE_SUCCESS != rc ) { ORTE_ERROR_LOG(rc); goto CLEANUP; } + /* setup callback data on sigchild */ - orte_setup_hnp_cbdata.target_cluster = strdup(target_cluster); + if (NULL != target_cluster) { + orte_setup_hnp_cbdata.target_cluster = strdup(target_cluster); + } else { + orte_setup_hnp_cbdata.target_cluster = NULL; + } + orte_setup_hnp_cbdata.headnode = strdup(headnode); orte_setup_hnp_cbdata.jobid = jobid; @@ -405,7 +429,7 @@ MOVEON: /* pass along any parameters for the head node process * in case one needs to be created */ - id = mca_base_param_register_string("scope",NULL,NULL,NULL,"private"); + id = mca_base_param_register_string("scope",NULL,NULL,NULL,"public"); mca_base_param_lookup_string(id, &param); opal_argv_append(&argc, &argv, "--scope"); opal_argv_append(&argc, &argv, param); @@ -459,25 +483,29 @@ MOVEON: * utilities, though, or we will lose all of our MCA parameters */ orte_system_finalize(); + /* * now set the relevant MCA parameters to point us at the remote daemon... 
*/ - if (ORTE_SUCCESS != (rc = opal_setenv("OMPI_MCA_gpr_replica_uri", - orte_setup_hnp_orted_uri, true, &environ))) { + rc = opal_setenv("OMPI_MCA_gpr_replica_uri", + orte_setup_hnp_orted_uri, true, &environ); + if (ORTE_SUCCESS != rc) { fprintf(stderr, "orte_setup_hnp: could not set gpr_replica_uri in environ\n"); return rc; } - if (ORTE_SUCCESS != (rc = opal_setenv("OMPI_MCA_ns_replica_uri", - orte_setup_hnp_orted_uri, true, &environ))) { + rc = opal_setenv("OMPI_MCA_ns_replica_uri", + orte_setup_hnp_orted_uri, true, &environ); + if (ORTE_SUCCESS != rc) { fprintf(stderr, "orte_setup_hnp: could not set ns_replica_uri in environ\n"); return rc; } opal_unsetenv("OMPI_MCA_seed", &environ); - - if (ORTE_SUCCESS != (rc = opal_setenv("OMPI_MCA_universe_uri", - orte_setup_hnp_orted_uri, true, &environ))) { + + rc = opal_setenv("OMPI_MCA_universe_uri", + orte_setup_hnp_orted_uri, true, &environ); + if (ORTE_SUCCESS != rc) { fprintf(stderr, "orte_setup_hnp: could not set universe_uri in environ\n"); return rc; } @@ -485,10 +513,12 @@ MOVEON: /* * ...re-init ourselves... */ - if (ORTE_SUCCESS != (rc = orte_system_init())) { + rc = orte_system_init(); + if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } + /* * ...and we are now ready to go! */ diff --git a/orte/runtime/orte_setup_hnp.h b/orte/runtime/orte_setup_hnp.h new file mode 100644 index 0000000000..38bba966f9 --- /dev/null +++ b/orte/runtime/orte_setup_hnp.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Establish a Head Node Process on a cluster's front end + */ + +#ifndef ORTE_SETUP_HNP_H +#define ORTE_SETUP_HNP_H + +/* + * Local data structure + */ +typedef struct { + char *target_cluster; + char *headnode; + orte_process_name_t *name; + orte_jobid_t jobid; +} orte_setup_hnp_cb_data_t; + + +int orte_setup_hnp(char *target_cluster, char *headnode, char *username); + +#endif diff --git a/orte/tools/orteprobe/orteprobe.c b/orte/tools/orteprobe/orteprobe.c index 0d7e99dae0..8362fe7fbf 100644 --- a/orte/tools/orteprobe/orteprobe.c +++ b/orte/tools/orteprobe/orteprobe.c @@ -78,9 +78,9 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { &orteprobe_globals.help, OPAL_CMD_LINE_TYPE_BOOL, "This help message" }, - { NULL, NULL, NULL, '\0', NULL, "version", 0, - &orteprobe_globals.version, OPAL_CMD_LINE_TYPE_BOOL, - "Show the orteprobe version" }, + { NULL, NULL, NULL, NULL, NULL, "verbose", 0, + &orteprobe_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL, + "Toggle Verbosity" }, { NULL, NULL, NULL, 'd', NULL, "debug", 0, &orteprobe_globals.debug, OPAL_CMD_LINE_TYPE_BOOL, @@ -139,17 +139,18 @@ int main(int argc, char *argv[]) pid_t pid; #if defined(HAVE_FORK) && defined(HAVE_PIPE) - + /* setup to check common command line options that just report and die */ memset(&orteprobe_globals, 0, sizeof(orteprobe_globals)); cmd_line = OBJ_NEW(opal_cmd_line_t); opal_cmd_line_create(cmd_line, orte_cmd_line_opts); - if (OMPI_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, true, - argc, argv))) { + + ret = opal_cmd_line_parse(cmd_line, true, argc, argv); + if (OMPI_SUCCESS != ret) { return ret; } - /* check for help and version requests */ + /* check for help request */ if (orteprobe_globals.help) { char *args = NULL; args = opal_cmd_line_get_usage_msg(cmd_line); @@ -159,12 +160,6 @@ int main(int argc, char *argv[]) return 1; } - if (orteprobe_globals.version) { - /* show version message */ - 
printf("...showing off my version!\n"); - exit(1); - } - /* * Attempt to parse the probe's name and save in proc_info */ @@ -172,7 +167,7 @@ int main(int argc, char *argv[]) ret = orte_ns_base_convert_string_to_process_name( &orte_process_info.my_name, orteprobe_globals.name_string); if(ORTE_SUCCESS != ret) { - fprintf(stderr, "Couldn't convert environmental string to probe's process name\n"); + fprintf(stderr, "orteprobe: Couldn't convert environmental string to probe's process name\n"); return 1; } } @@ -297,16 +292,16 @@ int main(int argc, char *argv[]) if (orteprobe_globals.requestor_string) { if(ORTE_SUCCESS != (ret = orte_rml.parse_uris( orteprobe_globals.requestor_string, &requestor, NULL))) { - fprintf(stderr, "Couldn't parse environmental string for requestor's contact info\n"); + fprintf(stderr, "orteprobe: Couldn't parse environmental string for requestor's contact info\n"); return 1; } /* set the contact info */ if (ORTE_SUCCESS != (ret = orte_rml.set_uri(orteprobe_globals.requestor_string))) { - fprintf(stderr, "Couldn't set contact info for requestor\n"); + fprintf(stderr, "orteprobe: Couldn't set contact info for requestor\n"); return ret; } } else { - fprintf(stderr, "No contact info received for requestor\n"); + fprintf(stderr, "orteprobe: No contact info received for requestor\n"); return 1; } @@ -315,29 +310,37 @@ int main(int argc, char *argv[]) */ if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) { /* universe is here! 
send info back and die */ -fprintf(stderr, "contacted existing universe - sending contact info back\n"); + if(orteprobe_globals.verbose) + fprintf(stderr, "orteprobe: Contacted existing universe - sending contact info back\n"); + OBJ_CONSTRUCT(&buffer, orte_buffer_t); orted_uri_ptr = &(univ.seed_uri); + if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) { fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n"); exit(1); } + if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) { fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n"); OBJ_DESTRUCT(&buffer); exit(1); } - OBJ_DESTRUCT(&buffer); + OBJ_DESTRUCT(&buffer); } else { /* existing universe is not here or does not allow contact. * ensure we have a unique universe name, fork/exec an appropriate * daemon, and then tell whomever spawned us how to talk to the new * daemon */ -fprintf(stderr, "could not connect to existing universe\n"); + if(orteprobe_globals.verbose) + fprintf(stderr, "orteprobe: Could not connect to existing universe\n"); + if (ORTE_ERR_NOT_FOUND != ret) { -fprintf(stderr, "existing universe did not respond\n"); + if(orteprobe_globals.verbose) + fprintf(stderr, "orteprobe: Existing universe did not respond\n"); + /* if it exists but no contact could be established, * define unique name based on current one. 
*/ @@ -345,11 +348,13 @@ fprintf(stderr, "existing universe did not respond\n"); free(orte_universe_info.name); orte_universe_info.name = NULL; pid = getpid(); + if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) { fprintf(stderr, "orteprobe: failed to create unique universe name"); exit(1); } } + /* setup to fork/exec the new universe */ /* setup the pipe to get the contact info back */ if (pipe(orted_pipe)) { @@ -361,7 +366,8 @@ fprintf(stderr, "existing universe did not respond\n"); id = mca_base_param_register_string("orted",NULL,NULL,NULL,"orted"); mca_base_param_lookup_string(id, &orted); -fprintf(stderr, "using %s for orted command\n", orted); + if(orteprobe_globals.verbose) + fprintf(stderr, "orteprobe: Using \"%s\" for orted command\n", orted); /* Initialize the argv array */ ortedargv = opal_argv_split(orted, ' '); @@ -374,7 +380,8 @@ fprintf(stderr, "using %s for orted command\n", orted); /* setup the path */ path = opal_path_findv(ortedargv[0], 0, environ, NULL); -fprintf(stderr, "path setup as %s\n", path); + if(orteprobe_globals.verbose) + fprintf(stderr, "orteprobe: Path setup as \"%s\"\n", path); /* tell the daemon it's the seed */ opal_argv_append(&ortedargc, &ortedargv, "--seed"); @@ -394,7 +401,8 @@ fprintf(stderr, "path setup as %s\n", path); opal_argv_append(&ortedargc, &ortedargv, param); free(param); -fprintf(stderr, "forking now\n"); + if(orteprobe_globals.verbose) + fprintf(stderr, "orteprobe: Forking now\n"); /* Create the child process. */ pid = fork (); @@ -412,7 +420,8 @@ fprintf(stderr, "forking now\n"); /* This is the parent process. Close write end first. 
*/ -fprintf(stderr, "attempting to read from daemon\n"); + if(orteprobe_globals.verbose) + fprintf(stderr, "orteprobe: Attempting to read from daemon\n"); read(orted_pipe[0], orted_uri, 255); close(orted_pipe[0]); @@ -421,26 +430,34 @@ fprintf(stderr, "attempting to read from daemon\n"); OBJ_CONSTRUCT(&buffer, orte_buffer_t); param = orted_uri; orted_uri_ptr = &param; - if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) { + + if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) { fprintf(stderr, "orteprobe: failed to pack daemon uri\n"); exit(1); } + if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) { fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n"); OBJ_DESTRUCT(&buffer); exit(1); } + OBJ_DESTRUCT(&buffer); } } - + + if(orteprobe_globals.verbose) + fprintf(stderr, "orteprobe: All finished!\n"); + /* cleanup */ if (NULL != contact_path) { unlink(contact_path); } + if (NULL != log_path) { unlink(log_path); } + /* finalize the system */ orte_finalize(); diff --git a/orte/tools/orteprobe/orteprobe.h b/orte/tools/orteprobe/orteprobe.h index 0d238b3da8..ebd6d50189 100644 --- a/orte/tools/orteprobe/orteprobe.h +++ b/orte/tools/orteprobe/orteprobe.h @@ -27,28 +27,15 @@ #include "opal/util/cmd_line.h" #include "mca/mca.h" - -/* - * Definitions needed for communication - */ -#define ORTE_DAEMON_CMD ORTE_INT16 - -#define ORTE_DAEMON_HOSTFILE_CMD 0x01 -#define ORTE_DAEMON_SCRIPTFILE_CMD 0x02 -#define ORTE_DAEMON_CONTACT_QUERY_CMD 0x03 -#define ORTE_DAEMON_HEARTBEAT_CMD 0xfe -#define ORTE_DAEMON_EXIT_CMD 0xff - +#include "tools/orted/orted.h" /* * Globals */ -typedef uint16_t orte_daemon_cmd_flag_t; - typedef struct { bool help; - bool version; + bool verbose; bool debug; char* name_string; char* requestor_string;