1
1

Bring in the remote launch changes. Remote launch still isn't fully functional, but the work touched a few other places that were worth fixing.

1. Added a new function to launch head node processes on remote nodes.

2. Added a new tool, "orteprobe", that checks whether a daemon is running on a node. If so, it reports the daemon's contact info back to the requestor. If not, it will (eventually - but not now) fork/exec a daemon on the node, report that daemon's contact info back to the requestor, and then exit.

3. Modified orted to handle universe name parameters, and added separate command line flags for debugging the daemon and saving daemon debugging output in a file. The "debug" flag now turns on the runtime debug info instead of the daemon debug - thus, you can now just get daemon debug info if you like.

4. Fixed the dps to handle zero-length strings correctly.

5. Modified the fork and rsh launchers to pass the required environment variables to the daemons and processes.

6. Pulled the redirection of stdin/stdout/stderr for the daemon out of orted and put it into the daemon_init function to simplify orted logic.

7. Modified sys_info to deal correctly with a passed MCA param.

8. Modified univ_info to parse incoming universe location information.

This commit was SVN r5705.
Этот коммит содержится в:
Ralph Castain 2005-05-12 21:44:23 +00:00
родитель 0c6eaaebe3
Коммит fdfe457578
22 изменённых файлов: 947 добавлений и 105 удалений

Просмотреть файл

@ -1798,6 +1798,7 @@ AC_CONFIG_FILES([
src/tools/console/Makefile src/tools/console/Makefile
src/tools/ompi_info/Makefile src/tools/ompi_info/Makefile
src/tools/orted/Makefile src/tools/orted/Makefile
src/tools/orteprobe/Makefile
src/tools/orterun/Makefile src/tools/orterun/Makefile
src/tools/openmpi/Makefile src/tools/openmpi/Makefile
src/tools/wrappers/Makefile src/tools/wrappers/Makefile

Просмотреть файл

@ -300,15 +300,23 @@ int orte_dps_pack_string(orte_buffer_t *buffer, void *src,
char **ssrc = (char**) src; char **ssrc = (char**) src;
for (i = 0; i < num_vals; ++i) { for (i = 0; i < num_vals; ++i) {
len = strlen(ssrc[i]) + 1; if (NULL == ssrc[i]) { /* got zero-length string/NULL pointer - store NULL */
if (ORTE_SUCCESS != (ret = orte_dps_pack_sizet(buffer, &len, 1, ORTE_SIZE))) { len = 0;
ORTE_ERROR_LOG(ret); if (ORTE_SUCCESS != (ret = orte_dps_pack_sizet(buffer, &len, 1, ORTE_SIZE))) {
return ret; ORTE_ERROR_LOG(ret);
} return ret;
if (ORTE_SUCCESS != (ret = }
orte_dps_pack_byte(buffer, ssrc[i], len, ORTE_BYTE))) { } else {
ORTE_ERROR_LOG(ret); len = strlen(ssrc[i]) + 1;
return ret; if (ORTE_SUCCESS != (ret = orte_dps_pack_sizet(buffer, &len, 1, ORTE_SIZE))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret =
orte_dps_pack_byte(buffer, ssrc[i], len, ORTE_BYTE))) {
ORTE_ERROR_LOG(ret);
return ret;
}
} }
} }

Просмотреть файл

@ -330,14 +330,18 @@ int orte_dps_unpack_string(orte_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
return ret; return ret;
} }
if (0 == len) { /* zero-length string - unpack the NULL */
sdest[i] = NULL;
} else {
sdest[i] = malloc(len); sdest[i] = malloc(len);
if (NULL == sdest[i]) { if (NULL == sdest[i]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (ORTE_SUCCESS != (ret = orte_dps_unpack_byte(buffer, sdest[i], &len, ORTE_BYTE))) { if (ORTE_SUCCESS != (ret = orte_dps_unpack_byte(buffer, sdest[i], &len, ORTE_BYTE))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
return ret; return ret;
}
} }
} }

Просмотреть файл

@ -97,9 +97,6 @@ int orte_gpr_base_unpack_get(orte_buffer_t *buffer, int *ret, size_t *cnt, orte_
free(*values); free(*values);
return rc; return rc;
} }
for (n=0; n < num; n++) {
orte_gpr.dump_value((*values)[n], 0);
}
} }
/* unpack the response code */ /* unpack the response code */

Просмотреть файл

@ -34,6 +34,7 @@
#include "util/argv.h" #include "util/argv.h"
#include "util/output.h" #include "util/output.h"
#include "util/sys_info.h" #include "util/sys_info.h"
#include "util/univ_info.h"
#include "util/ompi_environ.h" #include "util/ompi_environ.h"
#include "util/session_dir.h" #include "util/session_dir.h"
#include "runtime/orte_wait.h" #include "runtime/orte_wait.h"
@ -152,6 +153,17 @@ static int orte_pls_fork_proc(
param = mca_base_param_environ_variable("rmgr","bootproxy","jobid"); param = mca_base_param_environ_variable("rmgr","bootproxy","jobid");
ompi_unsetenv(param, &environ_copy); ompi_unsetenv(param, &environ_copy);
/* setup universe info */
if (NULL != orte_universe_info.name) {
param = mca_base_param_environ_variable("universe", NULL, NULL);
asprintf(&uri, "%s@%s:%s", orte_universe_info.uid,
orte_universe_info.host,
orte_universe_info.name);
ompi_setenv(param, uri, true, &environ_copy);
free(param);
free(uri);
}
/* setup ns contact info */ /* setup ns contact info */
if(NULL != orte_process_info.ns_replica_uri) { if(NULL != orte_process_info.ns_replica_uri) {
uri = strdup(orte_process_info.ns_replica_uri); uri = strdup(orte_process_info.ns_replica_uri);
@ -185,7 +197,7 @@ static int orte_pls_fork_proc(
new_env = ompi_environ_merge(context->env, environ_copy); new_env = ompi_environ_merge(context->env, environ_copy);
ompi_argv_free(environ_copy); ompi_argv_free(environ_copy);
if(context->argv == NULL) { if (context->argv == NULL) {
context->argv = malloc(sizeof(char*)*2); context->argv = malloc(sizeof(char*)*2);
context->argv[0] = strdup(context->app); context->argv[0] = strdup(context->app);
context->argv[1] = NULL; context->argv[1] = NULL;

Просмотреть файл

@ -33,11 +33,15 @@
#include "include/orte_constants.h" #include "include/orte_constants.h"
#include "util/argv.h" #include "util/argv.h"
#include "util/output.h" #include "util/output.h"
#include "util/univ_info.h"
#include "util/session_dir.h" #include "util/session_dir.h"
#include "util/if.h" #include "util/if.h"
#include "util/path.h" #include "util/path.h"
#include "event/event.h" #include "event/event.h"
#include "runtime/orte_wait.h" #include "runtime/orte_wait.h"
#include "mca/base/mca_base_param.h"
#include "mca/ns/ns.h" #include "mca/ns/ns.h"
#include "mca/pls/pls.h" #include "mca/pls/pls.h"
#include "mca/rml/rml.h" #include "mca/rml/rml.h"
@ -52,6 +56,8 @@
#define NUM_CONCURRENT 128 #define NUM_CONCURRENT 128
extern char **environ;
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS #if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS
static int orte_pls_rsh_launch_threaded(orte_jobid_t jobid); static int orte_pls_rsh_launch_threaded(orte_jobid_t jobid);
@ -244,7 +250,8 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
char** argv; char** argv;
int argc; int argc;
int rc; int rc;
int id;
/* query the list of nodes allocated to the job - don't need the entire /* query the list of nodes allocated to the job - don't need the entire
* mapping - as the daemon/proxy is responsibe for determining the apps * mapping - as the daemon/proxy is responsibe for determining the apps
* to launch on each node. * to launch on each node.
@ -272,19 +279,34 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
asprintf(&jobid_string, "%lu", (unsigned long) jobid); asprintf(&jobid_string, "%lu", (unsigned long) jobid);
/* /*
* Build argv/env arrays. * Build argv array
*/ */
argv = ompi_argv_copy(mca_pls_rsh_component.argv); argv = ompi_argv_copy(mca_pls_rsh_component.argv);
argc = mca_pls_rsh_component.argc; argc = mca_pls_rsh_component.argc;
node_name_index1 = argc; node_name_index1 = argc;
ompi_argv_append(&argc, &argv, ""); /* placeholder for node name */ ompi_argv_append(&argc, &argv, ""); /* placeholder for node name */
/* application */ /* add the daemon command (as specified by user) */
local_exec_index = argc; local_exec_index = argc;
ompi_argv_append(&argc, &argv, mca_pls_rsh_component.orted); ompi_argv_append(&argc, &argv, mca_pls_rsh_component.orted);
if (mca_pls_rsh_component.debug) {
/* check for debug flags */
id = mca_base_param_register_int("orte","debug",NULL,NULL,0);
mca_base_param_lookup_int(id,&rc);
if (rc) {
ompi_argv_append(&argc, &argv, "--debug"); ompi_argv_append(&argc, &argv, "--debug");
} }
id = mca_base_param_register_int("orte","debug","daemons",NULL,0);
mca_base_param_lookup_int(id,&rc);
if (rc) {
ompi_argv_append(&argc, &argv, "--debug-daemons");
}
id = mca_base_param_register_int("orte","debug","daemons_file",NULL,0);
mca_base_param_lookup_int(id,&rc);
if (rc) {
ompi_argv_append(&argc, &argv, "--debug-daemons-file");
}
ompi_argv_append(&argc, &argv, "--bootproxy"); ompi_argv_append(&argc, &argv, "--bootproxy");
ompi_argv_append(&argc, &argv, jobid_string); ompi_argv_append(&argc, &argv, jobid_string);
ompi_argv_append(&argc, &argv, "--name"); ompi_argv_append(&argc, &argv, "--name");
@ -294,6 +316,13 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
node_name_index2 = argc; node_name_index2 = argc;
ompi_argv_append(&argc, &argv, ""); ompi_argv_append(&argc, &argv, "");
/* pass along the universe name and location info */
ompi_argv_append(&argc, &argv, "--universe");
asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
orte_universe_info.host, orte_universe_info.name);
ompi_argv_append(&argc, &argv, param);
free(param);
/* setup ns contact info */ /* setup ns contact info */
ompi_argv_append(&argc, &argv, "--nsreplica"); ompi_argv_append(&argc, &argv, "--nsreplica");
if(NULL != orte_process_info.ns_replica_uri) { if(NULL != orte_process_info.ns_replica_uri) {
@ -304,6 +333,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
asprintf(&param, "\"%s\"", uri); asprintf(&param, "\"%s\"", uri);
ompi_argv_append(&argc, &argv, param); ompi_argv_append(&argc, &argv, param);
free(uri); free(uri);
free(param);
/* setup gpr contact info */ /* setup gpr contact info */
ompi_argv_append(&argc, &argv, "--gprreplica"); ompi_argv_append(&argc, &argv, "--gprreplica");
@ -315,6 +345,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
asprintf(&param, "\"%s\"", uri); asprintf(&param, "\"%s\"", uri);
ompi_argv_append(&argc, &argv, param); ompi_argv_append(&argc, &argv, param);
free(uri); free(uri);
free(param);
/* /*
* Iterate through each of the nodes and spin * Iterate through each of the nodes and spin

Просмотреть файл

@ -41,6 +41,7 @@ typedef uint32_t orte_rml_tag_t;
#define ORTE_RML_TAG_XCAST 7 #define ORTE_RML_TAG_XCAST 7
#define ORTE_RML_TAG_RMGR_SVC 8 #define ORTE_RML_TAG_RMGR_SVC 8
#define ORTE_RML_TAG_RMGR_CLNT 9 #define ORTE_RML_TAG_RMGR_CLNT 9
#define ORTE_RML_TAG_PROBE 10
#define ORTE_RML_TAG_DYNAMIC 2000 #define ORTE_RML_TAG_DYNAMIC 2000
#define ORTE_RML_TAG_MAX UINT32_MAX #define ORTE_RML_TAG_MAX UINT32_MAX

Просмотреть файл

@ -34,8 +34,6 @@
#include "mca/ns/ns_types.h" #include "mca/ns/ns_types.h"
#include "mca/soh/soh_types.h" #include "mca/soh/soh_types.h"
#include "soh_types.h" /* gpr keys and external datatypes needed for prototyping */
/* /*
* Component functions - all MUST be provided! * Component functions - all MUST be provided!
*/ */
@ -137,4 +135,6 @@ typedef orte_soh_base_component_1_0_0_t orte_soh_base_component_t;
/* soh v1.0 */ \ /* soh v1.0 */ \
"soh", 1, 0, 0 "soh", 1, 0, 0
OMPI_DECLSPEC extern orte_soh_base_module_t orte_soh; /* holds selected module's function pointers */
#endif /* ORTE_SOH_H */ #endif /* ORTE_SOH_H */

Просмотреть файл

@ -73,11 +73,6 @@ int orte_init_stage1(void)
/* For malloc debugging */ /* For malloc debugging */
ompi_malloc_init(); ompi_malloc_init();
/* Ensure the universe_info structure is instantiated and initialized */
if (ORTE_SUCCESS != (ret = orte_univ_info())) {
return ret;
}
/* Ensure the system_info structure is instantiated and initialized */ /* Ensure the system_info structure is instantiated and initialized */
if (ORTE_SUCCESS != (ret = orte_sys_info())) { if (ORTE_SUCCESS != (ret = orte_sys_info())) {
return ret; return ret;
@ -88,6 +83,11 @@ int orte_init_stage1(void)
return ret; return ret;
} }
/* Ensure the universe_info structure is instantiated and initialized */
if (ORTE_SUCCESS != (ret = orte_univ_info())) {
return ret;
}
/* /*
* Initialize the MCA framework * Initialize the MCA framework
*/ */

Просмотреть файл

@ -23,13 +23,21 @@
#include "orte_config.h" #include "orte_config.h"
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h> #include <string.h>
#ifdef HAVE_SYS_TIME_H #include <sys/types.h>
#include <sys/time.h> #include <sys/stat.h>
#endif #include <sys/wait.h>
#include <fcntl.h>
#include "include/orte_constants.h" #include "include/orte_constants.h"
#include "runtime/orte_wait.h"
#include "util/argv.h"
#include "util/output.h" #include "util/output.h"
#include "util/path.h"
#include "util/univ_info.h" #include "util/univ_info.h"
#include "util/sys_info.h" #include "util/sys_info.h"
#include "util/proc_info.h" #include "util/proc_info.h"
@ -37,14 +45,238 @@
#include "util/session_dir.h" #include "util/session_dir.h"
#include "util/universe_setup_file_io.h" #include "util/universe_setup_file_io.h"
#include "mca/base/mca_base_param.h"
#include "mca/soh/soh.h"
#include "mca/rml/rml.h" #include "mca/rml/rml.h"
#include "mca/ns/ns.h" #include "mca/ns/ns.h"
#include "mca/errmgr/errmgr.h" #include "mca/errmgr/errmgr.h"
#include "runtime/runtime.h" #include "runtime/runtime.h"
extern char **environ;
int orte_setup_hnp(char *target_cluster) /*
* Local data structure
*/
/* State recorded by orte_setup_hnp() for the probe it launches and handed
 * to the pid-wait callback, which uses it to report a failed launch. */
typedef struct {
char *target_cluster; /* copy of the target cluster name (used in the error report) */
char *headnode; /* copy of the head node's host name (used in the error report) */
orte_process_name_t *name; /* process name created for the probe */
orte_jobid_t jobid; /* jobid created for the probe */
} orte_setup_hnp_cb_data_t;
/* Single file-scope record; its address is passed to orte_wait_cb(). */
static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata = {NULL, NULL, NULL, 0};
/*
* NON-BLOCKING RECEIVER
*/
static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
orte_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
/*
* PID WAIT CALLBACK
*/
static void orte_setup_hnp_wait(pid_t wpid, int status, void *data);
/*
* ORTE_SETUP_HNP
*/
/*
 * Append "<flag> "<uri>"" to an argv array.
 *
 * The uri is replica_uri when that is non-NULL; otherwise this process's
 * own contact string from orte_rml.get_uri() is used.  The value is wrapped
 * in double quotes, exactly as the rsh launcher does for the same options.
 * ompi_argv_append() keeps its own copy of each argument, so every
 * temporary is released here before returning.
 */
static int orte_setup_hnp_append_contact(int *argc, char ***argv,
                                         const char *flag,
                                         const char *replica_uri)
{
    char *own_uri = NULL;
    char *quoted = NULL;
    const char *uri = replica_uri;

    if (NULL == uri) {
        own_uri = orte_rml.get_uri();
        uri = own_uri;
    }
    if (NULL == uri) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    if (0 > asprintf(&quoted, "\"%s\"", uri)) {
        free(own_uri);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    ompi_argv_append(argc, argv, flag);
    ompi_argv_append(argc, argv, quoted);

    free(quoted);
    free(own_uri);
    return ORTE_SUCCESS;
}

/**
 * Launch an "orteprobe" process on the head node of a remote cluster.
 *
 * A jobid, vpid and process name are obtained for the probe, an rsh/ssh
 * command line is assembled (agent from the ("pls","rsh","agent") MCA
 * string parameter, default "/usr/bin/ssh"; probe executable from the
 * ("orteprobe") parameter, default "orteprobe"), a non-blocking receive is
 * posted on ORTE_RML_TAG_PROBE, and a child is forked to exec the agent.
 * The parent registers orte_setup_hnp_wait() on the child's pid and
 * returns immediately; it does not wait for the probe to report.
 *
 * @param target_cluster  Name of the target cluster; recorded for the
 *                        failure report.  Must not be NULL.
 * @param headnode        Host to rsh/ssh to.  Must not be NULL: looking the
 *                        head node up from the cluster name is not
 *                        implemented yet.
 * @param username        Login name on the head node, or NULL to use
 *                        orte_system_info.user.
 *
 * @return The (non-negative) result of posting the receive once the child
 *         has been forked; ORTE_ERR_BAD_PARAM, ORTE_ERR_NOT_IMPLEMENTED,
 *         ORTE_ERR_OUT_OF_RESOURCE or the failing service's error code
 *         otherwise.
 *
 * NOTE: the callback record is a single file-scope object, so only one
 * outstanding probe is tracked at a time.
 */
int orte_setup_hnp(char *target_cluster, char *headnode, char *username)
{
    char **argv = NULL;
    char *param = NULL, *uri = NULL, *uid = NULL, *hn = NULL;
    char *path = NULL, *name_string = NULL, *orteprobe = NULL;
    int argc = 0, rc = ORTE_SUCCESS, id, exec_errno;
    pid_t pid;
    orte_cellid_t cellid;
    orte_jobid_t jobid;
    orte_vpid_t vpid;

    /* both names are copied with strdup() and printed later, so a NULL
     * here must be rejected up front */
    if (NULL == target_cluster) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* get the nodename for the headnode of the target cluster */
    if (NULL == headnode) {
        /* not provided, and the lookup is not implemented yet */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
        return ORTE_ERR_NOT_IMPLEMENTED;
    }
    hn = strdup(headnode);
    cellid = 0;   /* cellid lookup for the headnode is not implemented either */

    /* get the user's name on the headnode */
    uid = strdup((NULL == username) ? orte_system_info.user : username);
    if (NULL == hn || NULL == uid) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }

    /* SETUP TO LAUNCH PROBE */

    /* get a jobid for the probe */
    if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&jobid))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* get a vpid for the probe */
    if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, 1, &vpid))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* initialize probe's process name... */
    rc = orte_ns.create_process_name(&(orte_setup_hnp_cbdata.name), cellid, jobid, vpid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* ...and get string representation */
    if (ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&name_string, orte_setup_hnp_cbdata.name))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* setup callback data on sigchild */
    orte_setup_hnp_cbdata.target_cluster = strdup(target_cluster);
    orte_setup_hnp_cbdata.headnode = strdup(headnode);
    orte_setup_hnp_cbdata.jobid = jobid;
    if (NULL == orte_setup_hnp_cbdata.target_cluster ||
        NULL == orte_setup_hnp_cbdata.headnode) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }

    /* get rsh/ssh launch mechanism parameters */
    id = mca_base_param_register_string("pls","rsh","agent",NULL,"/usr/bin/ssh");
    mca_base_param_lookup_string(id, &param);

    id = mca_base_param_register_string("orteprobe",NULL,NULL,NULL,"orteprobe");
    mca_base_param_lookup_string(id, &orteprobe);

    if (NULL == param || NULL == orteprobe) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        rc = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* Initialize the argv array */
    argv = ompi_argv_split(param, ' ');
    argc = ompi_argv_count(argv);
    if (argc <= 0) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        rc = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* setup the path; without it there is nothing to exec */
    path = ompi_path_findv(argv[0], 0, environ, NULL);
    if (NULL == path) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        rc = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* add the username and nodename */
    ompi_argv_append(&argc, &argv, "-l");
    ompi_argv_append(&argc, &argv, uid);
    ompi_argv_append(&argc, &argv, hn);

    /* add the probe application */
    ompi_argv_append(&argc, &argv, orteprobe);

    /* tell the probe its name */
    ompi_argv_append(&argc, &argv, "--name");
    ompi_argv_append(&argc, &argv, name_string);

    /* setup probe's ns contact info */
    rc = orte_setup_hnp_append_contact(&argc, &argv, "--nsreplica",
                                       orte_process_info.ns_replica_uri);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* setup probe's gpr contact info */
    rc = orte_setup_hnp_append_contact(&argc, &argv, "--gprreplica",
                                       orte_process_info.gpr_replica_uri);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* tell the probe who to report to */
    uri = orte_rml.get_uri();
    if (NULL == uri) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }
    ompi_argv_append(&argc, &argv, "--requestor");
    ompi_argv_append(&argc, &argv, uri);

    /* issue the non-blocking recv to get the probe's findings */
    rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PROBE,
                                 0, orte_setup_hnp_recv, NULL);
    if (rc < 0) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* fork a child to exec the rsh/ssh session */
    pid = fork();
    if (pid < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }

    if (pid == 0) {     /* child */
        /* exec the probe launch */
        ompi_output(0, "exec'ing %s", path);
        execv(path, argv);

        /* Only reached when execv failed.  Capture errno before anything
         * else can overwrite it, report, and terminate the child here:
         * returning would let it continue executing the caller's code as
         * a duplicate of the parent. */
        exec_errno = errno;
        ORTE_ERROR_LOG(ORTE_ERROR);
        ompi_output(0, "orte_setup_hnp: execv failed with errno=%d\n", exec_errno);
        _exit(1);
    }

    /* parent */
    orte_wait_cb(pid, orte_setup_hnp_wait, &orte_setup_hnp_cbdata);

CLEANUP:
    /* everything below is a private temporary; the child owns its own copy
     * of the address space, and ompi_argv_append() copied each argument */
    if (NULL != argv) {
        ompi_argv_free(argv);
    }
    free(path);
    free(uri);
    free(param);
    free(orteprobe);
    free(name_string);
    free(uid);
    free(hn);

    return rc;
}
/*
 * Receive callback that orte_setup_hnp() posts on ORTE_RML_TAG_PROBE.
 *
 * The message itself is not examined: the arrival of anything on the tag
 * is announced, the runtime is finalized, and the process exits
 * successfully.
 */
static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
                                orte_buffer_t* buffer, orte_rml_tag_t tag,
                                void* cbdata)
{
    /* none of the callback arguments are used */
    (void) status;
    (void) sender;
    (void) buffer;
    (void) tag;
    (void) cbdata;

    ompi_output(0, "HE CALLED HOME!!");

    orte_finalize();
    exit(EXIT_SUCCESS);
}
/*
 * Pid-wait callback for the rsh/ssh child forked by orte_setup_hnp().
 *
 * wpid   - pid of the child that changed state (not used here)
 * status - wait status of that child
 * cbdata - the orte_setup_hnp_cb_data_t record registered with orte_wait_cb()
 *
 * Nothing happens when the child exited normally with status 0.  Otherwise
 * the probe's state-of-health is set to aborted and a description of how
 * the child ended is printed.  The failure is not propagated to the caller
 * of orte_setup_hnp(); there is no mechanism for that yet.
 */
static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata)
{
    orte_setup_hnp_cb_data_t *probe = (orte_setup_hnp_cb_data_t*) cbdata;
    int soh_rc;

    /* clean exit: nothing to report */
    if (WIFEXITED(status) && 0 == WEXITSTATUS(status)) {
        return;
    }

    /* set the probe's state-of-health to aborted */
    soh_rc = orte_soh.set_proc_soh(probe->name, ORTE_PROC_STATE_ABORTED, status);
    if (ORTE_SUCCESS != soh_rc) {
        ORTE_ERROR_LOG(soh_rc);
    }

    /* tell the user something went wrong */
    ompi_output(0, "ERROR: The probe on head node %s of the %s cluster failed to start as expected.",
                probe->headnode, probe->target_cluster);
    ompi_output(0, "ERROR: There may be more information available from");
    ompi_output(0, "ERROR: the remote shell (see above).");

    /* describe how the child ended */
    if (WIFEXITED(status)) {
        ompi_output(0, "ERROR: The probe exited unexpectedly with status %d.",
                    WEXITSTATUS(status));
    } else if (WIFSIGNALED(status)) {
        int signum = WTERMSIG(status);
#ifdef WCOREDUMP
        if (WCOREDUMP(status)) {
            ompi_output(0, "The probe received a signal %d (with core).",
                        signum);
            return;
        }
#endif /* WCOREDUMP */
        ompi_output(0, "The probe received a signal %d.", signum);
    } else {
        ompi_output(0, "No extra status information is available: %d.", status);
    }
}

Просмотреть файл

@ -156,7 +156,7 @@ OMPI_DECLSPEC int ompi_rte_init_io(void);
/** /**
* Establish a Head Node Process on a cluster's front end * Establish a Head Node Process on a cluster's front end
*/ */
OMPI_DECLSPEC int orte_setup_hnp(char *target_cluster); OMPI_DECLSPEC int orte_setup_hnp(char *target_cluster, char *headnode, char *username);
#if defined(c_plusplus) || defined(__cplusplus) #if defined(c_plusplus) || defined(__cplusplus)
} }

Просмотреть файл

@ -19,5 +19,5 @@ include $(top_srcdir)/config/Makefile.options
EXTRA_DIST = win_makefile EXTRA_DIST = win_makefile
SUBDIRS = ompi_info wrappers orted orterun openmpi console SUBDIRS = ompi_info wrappers orted orteprobe orterun openmpi console

Просмотреть файл

@ -76,40 +76,67 @@ ompi_cmd_line_init_t orte_cmd_line_opts[] = {
{ NULL, NULL, NULL, 'h', NULL, "help", 0, { NULL, NULL, NULL, 'h', NULL, "help", 0,
&orted_globals.help, OMPI_CMD_LINE_TYPE_BOOL, &orted_globals.help, OMPI_CMD_LINE_TYPE_BOOL,
"This help message" }, "This help message" },
{ NULL, NULL, NULL, '\0', NULL, "version", 0, { NULL, NULL, NULL, '\0', NULL, "version", 0,
&orted_globals.version, OMPI_CMD_LINE_TYPE_BOOL, &orted_globals.version, OMPI_CMD_LINE_TYPE_BOOL,
"Show the orted version" }, "Show the orted version" },
{ NULL, NULL, NULL, 'd', NULL, "debug", 0, { "orte", "debug", NULL, 'd', NULL, "debug", 0,
&orted_globals.debug, OMPI_CMD_LINE_TYPE_BOOL, NULL, OMPI_CMD_LINE_TYPE_BOOL,
"Run in debug mode (not generally intended for users)" }, "Debug the OpenRTE" },
{ NULL, NULL, NULL, '\0', NULL, "no-daemonize", 0, { NULL, NULL, NULL, '\0', NULL, "no-daemonize", 0,
&orted_globals.no_daemonize, OMPI_CMD_LINE_TYPE_BOOL, &orted_globals.no_daemonize, OMPI_CMD_LINE_TYPE_BOOL,
"Don't daemonize into the background" }, "Don't daemonize into the background" },
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
&orted_globals.debug_daemons, OMPI_CMD_LINE_TYPE_BOOL,
"Enable debugging of OpenRTE daemons" },
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
&orted_globals.debug_daemons_file, OMPI_CMD_LINE_TYPE_BOOL,
"Enable debugging of OpenRTE daemons, storing output in files" },
{ "rmgr", "bootproxy", "jobid", '\0', NULL, "bootproxy", 1, { "rmgr", "bootproxy", "jobid", '\0', NULL, "bootproxy", 1,
&orted_globals.bootproxy, OMPI_CMD_LINE_TYPE_INT, &orted_globals.bootproxy, OMPI_CMD_LINE_TYPE_INT,
"Run as boot proxy for <job-id>" }, "Run as boot proxy for <job-id>" },
{ NULL, NULL, NULL, '\0', NULL, "name", 1, { NULL, NULL, NULL, '\0', NULL, "name", 1,
&orted_globals.name, OMPI_CMD_LINE_TYPE_STRING, &orted_globals.name, OMPI_CMD_LINE_TYPE_STRING,
"Set the orte process name"}, "Set the orte process name"},
{ NULL, NULL, NULL, '\0', NULL, "nsreplica", 1, { NULL, NULL, NULL, '\0', NULL, "nsreplica", 1,
&orte_process_info.ns_replica_uri, OMPI_CMD_LINE_TYPE_STRING, &orte_process_info.ns_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
"Name service contact information."}, "Name service contact information."},
{ NULL, NULL, NULL, '\0', NULL, "gprreplica", 1, { NULL, NULL, NULL, '\0', NULL, "gprreplica", 1,
&orte_process_info.gpr_replica_uri, OMPI_CMD_LINE_TYPE_STRING, &orte_process_info.gpr_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
"Registry contact information."}, "Registry contact information."},
{ NULL, NULL, NULL, '\0', NULL, "nodename", 1, { NULL, NULL, NULL, '\0', NULL, "nodename", 1,
&orte_system_info.nodename, OMPI_CMD_LINE_TYPE_STRING, &orte_system_info.nodename, OMPI_CMD_LINE_TYPE_STRING,
"Node name as specified by host/resource description." }, "Node name as specified by host/resource description." },
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
&orted_globals.universe, OMPI_CMD_LINE_TYPE_STRING,
"Set the universe name as username@hostname:universe_name for this application" },
{ "tmpdir", "base", NULL, '\0', NULL, "tmpdir", 1,
NULL, OMPI_CMD_LINE_TYPE_STRING,
"Set the root for the session directory tree" },
{ "seed", NULL, NULL, '\0', NULL, "seed", 0, { "seed", NULL, NULL, '\0', NULL, "seed", 0,
&orte_process_info.seed, OMPI_CMD_LINE_TYPE_BOOL, NULL, OMPI_CMD_LINE_TYPE_BOOL,
"seed"}, "Host replicas for the core universe services"},
{ "universe", "persistence", NULL, '\0', NULL, "persistent", 0, { "universe", "persistence", NULL, '\0', NULL, "persistent", 0,
&orte_universe_info.persistence, OMPI_CMD_LINE_TYPE_BOOL, NULL, OMPI_CMD_LINE_TYPE_BOOL,
"persistent"}, "Remain alive after the application process completes"},
{ "universe", "scope", NULL, '\0', NULL, "scope", 1, { "universe", "scope", NULL, '\0', NULL, "scope", 1,
&orte_universe_info.scope, OMPI_CMD_LINE_TYPE_STRING, NULL, OMPI_CMD_LINE_TYPE_STRING,
"scope"}, "Set restrictions on who can connect to this universe"},
/* End of list */ /* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0, { NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OMPI_CMD_LINE_TYPE_NULL, NULL } NULL, OMPI_CMD_LINE_TYPE_NULL, NULL }
@ -119,12 +146,15 @@ ompi_cmd_line_init_t orte_cmd_line_opts[] = {
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
int ret = 0; int ret = 0;
int fd;
ompi_cmd_line_t *cmd_line = NULL; ompi_cmd_line_t *cmd_line = NULL;
char *contact_path = NULL; char *contact_path = NULL;
char *log_path = NULL; char *log_path = NULL;
char log_file[PATH_MAX];
char *jobidstring;
/* setup to check common command line options that just report and die */ /* setup to check common command line options that just report and die */
memset(&orted_globals, 0, sizeof(orted_globals)); memset(&orted_globals, 0, sizeof(orted_globals_t));
cmd_line = OBJ_NEW(ompi_cmd_line_t); cmd_line = OBJ_NEW(ompi_cmd_line_t);
ompi_cmd_line_create(cmd_line, orte_cmd_line_opts); ompi_cmd_line_create(cmd_line, orte_cmd_line_opts);
if (OMPI_SUCCESS != (ret = ompi_cmd_line_parse(cmd_line, true, if (OMPI_SUCCESS != (ret = ompi_cmd_line_parse(cmd_line, true,
@ -161,14 +191,20 @@ int main(int argc, char *argv[])
ret = orte_ns_base_convert_string_to_process_name( ret = orte_ns_base_convert_string_to_process_name(
&orte_process_info.my_name, orted_globals.name); &orte_process_info.my_name, orted_globals.name);
if(ORTE_SUCCESS != ret) { if(ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret); fprintf(stderr, "Couldn't convert environmental string to process name\n");
return 1; return 1;
} }
} }
/* turn on debug if debug_file is requested so output will be generated */
if (orted_globals.debug_daemons_file) {
orted_globals.debug_daemons = true;
}
/* detach from controlling terminal */ /* detach from controlling terminal
if(orted_globals.debug == false && orted_globals.no_daemonize == false) { * otherwise, remain attached so output can get to us
*/
if(orted_globals.debug_daemons == false && orted_globals.no_daemonize == false) {
orte_daemon_init(NULL); orte_daemon_init(NULL);
} }
@ -179,20 +215,21 @@ int main(int argc, char *argv[])
} }
/* setup stdin/stdout/stderr */ /* setup stdin/stdout/stderr */
if (orted_globals.debug == false) { if (orted_globals.debug_daemons_file) {
int fd; /* if we are debugging to a file, then send stdin/stdout/stderr
char log_file[PATH_MAX]; * to the orted log file
*/
/* connect input to /dev/null */
fd = open("/dev/null", O_RDONLY); /* get my jobid */
if(fd > STDIN_FILENO) { if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobidstring,
dup2(fd, STDIN_FILENO); orte_process_info.my_name))) {
close(fd); ORTE_ERROR_LOG(ret);
return ret;
} }
/* connect output to a log file in the session directory */ /* define a log file name in the session directory */
sprintf(log_file, "output-orted-%d-%s.log", sprintf(log_file, "output-orted-%s-%s.log",
(int)orte_process_info.my_name->jobid, orte_system_info.nodename); jobidstring, orte_system_info.nodename);
log_path = orte_os_path(false, log_path = orte_os_path(false,
orte_process_info.tmpdir_base, orte_process_info.tmpdir_base,
orte_process_info.top_session_dir, orte_process_info.top_session_dir,
@ -200,10 +237,12 @@ int main(int argc, char *argv[])
NULL); NULL);
fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0666); fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0666);
if(fd < 0) { if (fd < 0) {
/* couldn't open the file for some reason, so
* just connect everything to /dev/null
*/
fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666); fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
} } else {
if(fd >= 0) {
dup2(fd, STDOUT_FILENO); dup2(fd, STDOUT_FILENO);
dup2(fd, STDERR_FILENO); dup2(fd, STDERR_FILENO);
if(fd != STDOUT_FILENO && fd != STDERR_FILENO) { if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
@ -246,19 +285,21 @@ int main(int argc, char *argv[])
orte_universe_info.seed_uri = orte_rml.get_uri(); orte_universe_info.seed_uri = orte_rml.get_uri();
contact_path = orte_os_path(false, orte_process_info.universe_session_dir, contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
"universe-setup.txt", NULL); "universe-setup.txt", NULL);
ompi_output(0, "ompid: contact_file %s", contact_path); if (orted_globals.debug_daemons) {
ompi_output(0, "ompid: contact_file %s", contact_path);
}
if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) { if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
if (orted_globals.debug) { if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name)); ompi_output(0, "[%lu,%lu,%lu] ompid: couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
} }
} else if (orted_globals.debug) { } else if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name)); ompi_output(0, "[%lu,%lu,%lu] ompid: wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
} }
} }
if (orted_globals.debug) { if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name)); ompi_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name));
} }
@ -273,7 +314,7 @@ int main(int argc, char *argv[])
* - could be setup a virtual machine, spawn a console, etc. * - could be setup a virtual machine, spawn a console, etc.
*/ */
if (orted_globals.debug) { if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: setting up event monitor", ORTE_NAME_ARGS(orte_process_info.my_name)); ompi_output(0, "[%lu,%lu,%lu] ompid: setting up event monitor", ORTE_NAME_ARGS(orte_process_info.my_name));
} }
@ -286,7 +327,7 @@ int main(int argc, char *argv[])
OMPI_THREAD_UNLOCK(&orted_globals.mutex); OMPI_THREAD_UNLOCK(&orted_globals.mutex);
if (orted_globals.debug) { if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: mutex cleared - finalizing", ORTE_NAME_ARGS(orte_process_info.my_name)); ompi_output(0, "[%lu,%lu,%lu] ompid: mutex cleared - finalizing", ORTE_NAME_ARGS(orte_process_info.my_name));
} }
@ -301,7 +342,7 @@ int main(int argc, char *argv[])
/* finalize the system */ /* finalize the system */
orte_finalize(); orte_finalize();
if (orted_globals.debug) { if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: done - exiting", ORTE_NAME_ARGS(orte_process_info.my_name)); ompi_output(0, "[%lu,%lu,%lu] ompid: done - exiting", ORTE_NAME_ARGS(orte_process_info.my_name));
} }
@ -320,7 +361,7 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
OMPI_THREAD_LOCK(&orted_globals.mutex); OMPI_THREAD_LOCK(&orted_globals.mutex);
if (orted_globals.debug) { if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: received message", ORTE_NAME_ARGS(orte_process_info.my_name)); ompi_output(0, "[%lu,%lu,%lu] ompid: received message", ORTE_NAME_ARGS(orte_process_info.my_name));
} }

Просмотреть файл

@ -49,10 +49,11 @@ typedef uint16_t orte_daemon_cmd_flag_t;
typedef struct { typedef struct {
bool help; bool help;
bool version; bool version;
bool debug;
bool no_daemonize; bool no_daemonize;
bool probe; bool debug_daemons;
bool debug_daemons_file;
char* name; char* name;
char* universe;
int bootproxy; int bootproxy;
ompi_mutex_t mutex; ompi_mutex_t mutex;
ompi_condition_t condition; ompi_condition_t condition;
@ -61,11 +62,6 @@ typedef struct {
extern orted_globals_t orted_globals; extern orted_globals_t orted_globals;
/*
* Internal functions
*/
int orte_daemon_bootproxy(void);
/* /*
* Version-related strings and functions * Version-related strings and functions
*/ */

50
src/tools/orteprobe/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,50 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Automake input for the orteprobe tool: builds and installs one
# executable, orteprobe, linked against libmpi.la from the build tree.

# Pull in config/Makefile.options from the top of the source tree.
include $(top_srcdir)/config/Makefile.options
# Preprocessor defines that expose, as C string literals, the installation
# directories, the configure-time user/host/date, and the compiler and
# linker flags the tree was built with.
# NOTE(review): orteprobe.c as committed does not appear to reference any
# of these OMPI_* macros - confirm whether the whole list is needed here.
AM_CPPFLAGS = \
-DOMPI_PREFIX="\"$(prefix)\"" \
-DOMPI_BINDIR="\"$(bindir)\"" \
-DOMPI_LIBDIR="\"$(libdir)\"" \
-DOMPI_INCDIR="\"$(includedir)\"" \
-DOMPI_PKGLIBDIR="\"$(pkglibdir)\"" \
-DOMPI_SYSCONFDIR="\"$(sysconfdir)\"" \
-DOMPI_CONFIGURE_USER="\"@OMPI_CONFIGURE_USER@\"" \
-DOMPI_CONFIGURE_HOST="\"@OMPI_CONFIGURE_HOST@\"" \
-DOMPI_CONFIGURE_DATE="\"@OMPI_CONFIGURE_DATE@\"" \
-DOMPI_BUILD_CFLAGS="\"@CFLAGS@\"" \
-DOMPI_BUILD_CPPFLAGS="\"@CPPFLAGS@\"" \
-DOMPI_BUILD_CXXFLAGS="\"@CXXFLAGS@\"" \
-DOMPI_BUILD_CXXCPPFLAGS="\"@CXXCPPFLAGS@\"" \
-DOMPI_BUILD_FFLAGS="\"@FFLAGS@\"" \
-DOMPI_BUILD_FCFLAGS="\"@FCFLAGS@\"" \
-DOMPI_BUILD_LDFLAGS="\"@LDFLAGS@\"" \
-DOMPI_BUILD_LIBS="\"@LIBS@\""
# Library the tool is linked against (also listed as a dependency below).
libs = $(top_builddir)/src/libmpi.la
# The single program built from this directory; installed into $(bindir).
bin_PROGRAMS = orteprobe
# Sources compiled into orteprobe.
orteprobe_SOURCES = \
orteprobe.h \
orteprobe.c
# Link line: libmpi.la plus the extra libraries/flags the project defines.
orteprobe_LDADD = $(libs) $(LIBMPI_EXTRA_LIBS)
orteprobe_LDFLAGS = $(LIBMPI_EXTRA_LDFLAGS)
# Relink orteprobe whenever libmpi.la changes.
orteprobe_DEPENDENCIES = $(libs)
# Extra "make clean" step: when OMPI_CXX_TEMPLATE_REPOSITORY is non-empty,
# remove that directory recursively; when it is empty, do nothing.
clean-local:
test -z "$(OMPI_CXX_TEMPLATE_REPOSITORY)" || $(RM) -rf $(OMPI_CXX_TEMPLATE_REPOSITORY)

335
src/tools/orteprobe/orteprobe.c Обычный файл
Просмотреть файл

@ -0,0 +1,335 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <stdio.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include "include/orte_constants.h"
#include "threads/mutex.h"
#include "threads/condition.h"
#include "dps/dps.h"
#include "event/event.h"
#include "util/output.h"
#include "util/show_help.h"
#include "util/sys_info.h"
#include "util/os_path.h"
#include "util/cmd_line.h"
#include "util/proc_info.h"
#include "util/univ_info.h"
#include "util/session_dir.h"
#include "util/printf.h"
#include "util/daemon_init.h"
#include "util/universe_setup_file_io.h"
#include "mca/base/base.h"
#include "mca/base/mca_base_param.h"
#include "mca/rml/base/base.h"
#include "mca/rml/rml.h"
#include "mca/errmgr/base/base.h"
#include "mca/ns/base/base.h"
#include "mca/gpr/base/base.h"
#include "mca/schema/base/base.h"
#include "mca/soh/base/base.h"
#include "runtime/runtime.h"
#include "runtime/orte_wait.h"
#include "tools/orteprobe/orteprobe.h"
/* Storage for the values filled in by the command line parser below. */
orteprobe_globals_t orteprobe_globals;

/*
 * Command line option table for orteprobe.
 *
 * Each entry binds one long (and optionally one single-character) option
 * to the variable that receives its value.  The first three strings of an
 * entry name the MCA parameter associated with the option (all NULL when
 * there is none); the integer after the long name is the number of
 * arguments the option takes.  The table is terminated by an entry of
 * type OMPI_CMD_LINE_TYPE_NULL.
 */
ompi_cmd_line_init_t orte_cmd_line_opts[] = {
    /* Various "obvious" options */
    { NULL, NULL, NULL, 'h', NULL, "help", 0,
      &orteprobe_globals.help, OMPI_CMD_LINE_TYPE_BOOL,
      "This help message" },

    { NULL, NULL, NULL, '\0', NULL, "version", 0,
      &orteprobe_globals.version, OMPI_CMD_LINE_TYPE_BOOL,
      "Show the orteprobe version" },

    { NULL, NULL, NULL, 'd', NULL, "debug", 0,
      &orteprobe_globals.debug, OMPI_CMD_LINE_TYPE_BOOL,
      "Run in debug mode (not generally intended for users)" },

    /* Process name of the probe itself, in string form */
    { NULL, NULL, NULL, '\0', NULL, "name", 1,
      &orteprobe_globals.name_string, OMPI_CMD_LINE_TYPE_STRING,
      "Set the orte process name"},

    { NULL, NULL, NULL, '\0', NULL, "nsreplica", 1,
      &orte_process_info.ns_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
      "Name service contact information."},

    { NULL, NULL, NULL, '\0', NULL, "gprreplica", 1,
      &orte_process_info.gpr_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
      "Registry contact information."},

    { NULL, NULL, NULL, '\0', NULL, "nodename", 1,
      &orte_system_info.nodename, OMPI_CMD_LINE_TYPE_STRING,
      "Node name as specified by host/resource description." },

    /* Contact string of the process that asked for the probe; main()
     * hands it to orte_rml.parse_uris().  The help text previously
     * duplicated the one for --name. */
    { NULL, NULL, NULL, '\0', NULL, "requestor", 1,
      &orteprobe_globals.requestor_string, OMPI_CMD_LINE_TYPE_STRING,
      "Set the contact info of the process requesting the probe"},

    { "seed", NULL, NULL, '\0', NULL, "seed", 0,
      &orte_process_info.seed, OMPI_CMD_LINE_TYPE_BOOL,
      "seed"},

    { "universe", "persistence", NULL, '\0', NULL, "persistent", 0,
      &orte_universe_info.persistence, OMPI_CMD_LINE_TYPE_BOOL,
      "persistent"},

    { "universe", "scope", NULL, '\0', NULL, "scope", 1,
      &orte_universe_info.scope, OMPI_CMD_LINE_TYPE_STRING,
      "scope"},

    /* End of list */
    { NULL, NULL, NULL, '\0', NULL, NULL, 0,
      NULL, OMPI_CMD_LINE_TYPE_NULL, NULL }
};
/*
 * orteprobe entry point.
 *
 * Parses the command line, opens the subsystems the probe needs (output
 * streams, MCA base, data packing service, name service, error manager,
 * event library, progress engine, wait support, RML, registry, schema),
 * asks orte_universe_exists() whether a universe is already present, and
 * sends the integer result of that check to the requestor on
 * ORTE_RML_TAG_PROBE.
 *
 * Forking a daemon when no universe is found is not implemented yet.
 *
 * Exit status: 0 after the reply was sent; 1 for --help/--version, for a
 * bad or missing --name/--requestor value, or when packing the reply
 * fails; otherwise the error code of the step that failed.
 */
int main(int argc, char *argv[])
{
    int ret = 0;
    int probe_status;
    ompi_cmd_line_t *cmd_line = NULL;
    orte_universe_t univ;
    orte_buffer_t buffer;
    orte_process_name_t requestor;

    /* setup to check common command line options that just report and die */
    memset(&orteprobe_globals, 0, sizeof(orteprobe_globals));
    cmd_line = OBJ_NEW(ompi_cmd_line_t);
    ompi_cmd_line_create(cmd_line, orte_cmd_line_opts);
    if (OMPI_SUCCESS != (ret = ompi_cmd_line_parse(cmd_line, true,
                                                   argc, argv))) {
        return ret;
    }

    /* check for help and version requests */
    if (orteprobe_globals.help) {
        char *args = NULL;
        args = ompi_cmd_line_get_usage_msg(cmd_line);
        ompi_show_help("help-orteprobe.txt", "orteprobe:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }

    if (orteprobe_globals.version) {
        /* show version message */
        printf("...showing off my version!\n");
        exit(1);
    }

    /*
     * Attempt to parse the probe's name and save in proc_info
     */
    if (orteprobe_globals.name_string) {
        ret = orte_ns_base_convert_string_to_process_name(
            &orte_process_info.my_name, orteprobe_globals.name_string);
        if (ORTE_SUCCESS != ret) {
            fprintf(stderr, "Couldn't convert environmental string to probe's process name\n");
            return 1;
        }
    }

    /*
     * Without a requestor there is nobody to report to, so bail out before
     * starting anything.  The contact string itself is parsed further down,
     * once the RML has been opened and selected.
     */
    if (NULL == orteprobe_globals.requestor_string) {
        fprintf(stderr, "No contact info received for requestor\n");
        return 1;
    }

    /* Open up the output streams */
    if (!ompi_output_init()) {
        return OMPI_ERROR;
    }

    /*
     * If threads are supported - assume that we are using threads - and reset otherwise.
     */
    ompi_set_using_threads(OMPI_HAVE_THREAD_SUPPORT);

    /* For malloc debugging */
    ompi_malloc_init();

    /* Ensure the universe_info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_univ_info())) {
        return ret;
    }

    /* Ensure the system_info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_sys_info())) {
        return ret;
    }

    /* Ensure the process info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        return ret;
    }

    /*
     * Initialize the MCA framework
     */
    if (OMPI_SUCCESS != (ret = mca_base_open())) {
        return ret;
    }

    /*
     * Initialize the data packing service.
     * NOTE(review): ORTE_ERROR_LOG is used here although the error manager
     * is only opened further down - confirm this is safe.
     */
    if (ORTE_SUCCESS != (ret = orte_dps_open())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Open the name services to ensure access to local functions
     */
    if (OMPI_SUCCESS != (ret = orte_ns_base_open())) {
        return ret;
    }

    /* Open the error manager to activate error logging - needs local name services */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
        return ret;
    }

    /***** ERROR LOGGING NOW AVAILABLE *****/

    /*
     * Initialize the event library
     */
    if (OMPI_SUCCESS != (ret = ompi_event_init())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Initialize the general progress engine
     */
    if (OMPI_SUCCESS != (ret = ompi_progress_init())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Internal startup
     */
    if (OMPI_SUCCESS != (ret = orte_wait_init())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Runtime Messaging Layer - open the framework
     */
    if (OMPI_SUCCESS != (ret = orte_rml_base_open())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Runtime Messaging Layer - select a module
     */
    if (OMPI_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Attempt to parse the requestor's name and contact info.  This goes
     * through the orte_rml module table, so it is done only after the RML
     * has been opened and selected above rather than straight after
     * command line parsing.
     * NOTE(review): presumes orte_rml is populated by orte_rml_base_select()
     * - confirm against the rml base.
     */
    if (ORTE_SUCCESS != (ret = orte_rml.parse_uris(
                             orteprobe_globals.requestor_string, &requestor, NULL))) {
        fprintf(stderr, "Couldn't parse environmental string for requestor's contact info\n");
        return 1;
    }

    /*
     * Registry
     */
    if (ORTE_SUCCESS != (ret = orte_gpr_base_open())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /*
     * Initialize schema utilities
     */
    if (ORTE_SUCCESS != (ret = orte_schema_base_open())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /* see if a universe already exists on this machine; the raw result is
     * what gets reported back to the requestor
     */
    probe_status = orte_universe_exists(&univ);

    /* TODO: when probe_status is ORTE_SUCCESS the universe is here and its
     * contact info should be sent back.  Otherwise the existing universe is
     * not here or does not allow contact: ensure we have a unique universe
     * name, fork/exec an appropriate daemon, and then tell whomever spawned
     * us how to talk to the new daemon.  Neither path is implemented yet.
     */

    /* report the status of the check to the requestor */
    OBJ_CONSTRUCT(&buffer, orte_buffer_t);
    if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &probe_status, 1, ORTE_INT))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buffer);
        exit(1);
    }

    if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_DESTRUCT(&buffer);
        return ORTE_ERR_COMM_FAILURE;
    }

    OBJ_DESTRUCT(&buffer);

    /* finalize the system */
    orte_finalize();

    exit(0);
}

62
src/tools/orteprobe/orteprobe.h Обычный файл
Просмотреть файл

@ -0,0 +1,62 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTEPROBE_H
#define ORTEPROBE_H
#include "orte_config.h"
#include <string.h>
#include "class/ompi_list.h"
#include "threads/mutex.h"
#include "threads/condition.h"
#include "util/cmd_line.h"
#include "mca/mca.h"
/*
 * Definitions needed for communication
 *
 * ORTE_DAEMON_CMD aliases ORTE_INT16; the remaining macros are the
 * numeric daemon command codes.
 * NOTE(review): orted.h declares the same orte_daemon_cmd_flag_t typedef,
 * so these presumably duplicate the daemon's own definitions - confirm,
 * and consider sharing a single header.
 */
#define ORTE_DAEMON_CMD ORTE_INT16
#define ORTE_DAEMON_HOSTFILE_CMD 0x01
#define ORTE_DAEMON_SCRIPTFILE_CMD 0x02
#define ORTE_DAEMON_CONTACT_QUERY_CMD 0x03
#define ORTE_DAEMON_HEARTBEAT_CMD 0xfe
#define ORTE_DAEMON_EXIT_CMD 0xff
/*
 * Globals
 */
/* C type for a daemon command flag */
typedef uint16_t orte_daemon_cmd_flag_t;
/*
 * State shared across orteprobe.  The first five fields are written by
 * the command line parser through the option table in orteprobe.c.
 */
typedef struct {
/* --help / -h was given */
bool help;
/* --version was given */
bool version;
/* --debug / -d was given */
bool debug;
/* value of --name: the probe's process name in string form */
char* name_string;
/* value of --requestor: contact string of the requesting process */
char* requestor_string;
/* mutex, condition and exit_condition are not referenced by orteprobe.c
 * as committed; presumably reserved for waiting on an exit event -
 * TODO confirm */
ompi_mutex_t mutex;
ompi_condition_t condition;
bool exit_condition;
} orteprobe_globals_t;
/* The single instance is defined in orteprobe.c */
extern orteprobe_globals_t orteprobe_globals;
#endif /* ORTEPROBE_H */

Просмотреть файл

@ -85,7 +85,6 @@ struct globals_t {
bool verbose; bool verbose;
bool exit; bool exit;
bool no_wait_for_job_completion; bool no_wait_for_job_completion;
bool debug;
size_t num_procs; size_t num_procs;
int exit_status; int exit_status;
char *hostfile; char *hostfile;
@ -113,9 +112,6 @@ ompi_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, '\0', NULL, "version", 0, { NULL, NULL, NULL, '\0', NULL, "version", 0,
&orterun_globals.version, OMPI_CMD_LINE_TYPE_BOOL, &orterun_globals.version, OMPI_CMD_LINE_TYPE_BOOL,
"Show the orterun version" }, "Show the orterun version" },
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
&orterun_globals.debug, OMPI_CMD_LINE_TYPE_BOOL,
"Enable debugging" },
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0, { NULL, NULL, NULL, 'v', NULL, "verbose", 0,
&orterun_globals.verbose, OMPI_CMD_LINE_TYPE_BOOL, &orterun_globals.verbose, OMPI_CMD_LINE_TYPE_BOOL,
"Be verbose" }, "Be verbose" },
@ -178,6 +174,23 @@ ompi_cmd_line_init_t cmd_line_init[] = {
NULL, OMPI_CMD_LINE_TYPE_STRING, NULL, OMPI_CMD_LINE_TYPE_STRING,
"List of hosts to invoke processes on" }, "List of hosts to invoke processes on" },
/* OpenRTE arguments */
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
NULL, OMPI_CMD_LINE_TYPE_BOOL,
"Enable debugging of OpenRTE" },
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
NULL, OMPI_CMD_LINE_TYPE_INT,
"Enable debugging of any OpenRTE daemons used by this application" },
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
NULL, OMPI_CMD_LINE_TYPE_BOOL,
"Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
NULL, OMPI_CMD_LINE_TYPE_STRING,
"Set the universe name as username@hostname:universe_name for this application" },
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
&orte_process_info.tmpdir_base, OMPI_CMD_LINE_TYPE_STRING,
"Set the root for the session directory tree for orterun ONLY" },
/* End of list */ /* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0, { NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OMPI_CMD_LINE_TYPE_NULL, NULL } NULL, OMPI_CMD_LINE_TYPE_NULL, NULL }
@ -517,7 +530,6 @@ static int init_globals(void)
false, false,
false, false,
false, false,
false,
0, 0,
0, 0,
NULL, NULL,
@ -581,11 +593,6 @@ static int parse_globals(int argc, char* argv[])
wait_for_job_completion = false; wait_for_job_completion = false;
} }
/* debug */
if (orterun_globals.debug) {
int id = mca_base_param_register_int("debug",NULL,NULL,NULL,0);
mca_base_param_set_int(id,orterun_globals.debug);
}
OBJ_DESTRUCT(&cmd_line); OBJ_DESTRUCT(&cmd_line);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -815,6 +822,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
} else { } else {
asprintf(&value2, "%s=%s", param, value); asprintf(&value2, "%s=%s", param, value);
ompi_argv_append_nosize(&app->env, value2); ompi_argv_append_nosize(&app->env, value2);
free(value2);
} }
} else { } else {
ompi_output(0, "Warning: could not find environment variable \"%s\"\n", param); ompi_output(0, "Warning: could not find environment variable \"%s\"\n", param);

Просмотреть файл

@ -38,20 +38,48 @@ int orte_daemon_init(char *working_dir)
So, I am guessing that this piece of code is called only by UNIX versions */ So, I am guessing that this piece of code is called only by UNIX versions */
pid_t pid; pid_t pid;
int fd;
if ((pid = fork()) < 0) { if ((pid = fork()) < 0) {
return ORTE_ERROR; return ORTE_ERROR;
} else if (pid != 0) { } else if (pid != 0) {
exit(0); /* parent goes bye-bye */ exit(0); /* parent goes bye-bye */
} }
/* child continues */ /* child continues */
setsid(); /* become session leader */ setsid(); /* become session leader */
if (NULL != working_dir) { if (NULL != working_dir) {
chdir(working_dir); /* change working directory */ chdir(working_dir); /* change working directory */
} }
umask(0); /* clear file mode creation mask */ umask(0); /* clear file mode creation mask */
/* connect input to /dev/null */
fd = open("/dev/null", O_RDONLY);
if(fd > STDIN_FILENO) {
dup2(fd, STDIN_FILENO);
close(fd);
}
/* connect outputs to /dev/null */
fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
if (fd >= 0) {
dup2(fd, STDOUT_FILENO);
dup2(fd, STDERR_FILENO);
/* just to be safe, make sure we aren't trying
* to close stdout or stderr! since we dup'd both
* of them to the same fd, we can't just close it
* since one of the two would still be open and
* someone could attempt to use it.
*/
if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
close(fd);
}
} else {
return ORTE_ERR_FATAL;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
#else #else
printf ("This function has not been implemented in windows yet, file %s line %d\n", __FILE__, __LINE__); printf ("This function has not been implemented in windows yet, file %s line %d\n", __FILE__, __LINE__);

Просмотреть файл

@ -356,7 +356,7 @@ orte_session_dir_finalize(orte_process_name_t *proc)
int rc; int rc;
char *tmp; char *tmp;
char *job, *job_session_dir, *vpid, *proc_session_dir; char *job, *job_session_dir, *vpid, *proc_session_dir;
/* need to setup the top_session_dir with the prefix */ /* need to setup the top_session_dir with the prefix */
tmp = strdup(orte_os_path(false, tmp = strdup(orte_os_path(false,
orte_process_info.tmpdir_base, orte_process_info.tmpdir_base,

Просмотреть файл

@ -91,7 +91,7 @@ int orte_sys_info(void)
return ORTE_ERROR; return ORTE_ERROR;
} else { } else {
orte_system_info.sysname = strdup(sys_info.sysname); orte_system_info.sysname = strdup(sys_info.sysname);
if(NULL == orte_system_info.nodename) { if (NULL == orte_system_info.nodename) {
orte_system_info.nodename = strdup(sys_info.nodename); orte_system_info.nodename = strdup(sys_info.nodename);
} }
orte_system_info.release = strdup(sys_info.release); orte_system_info.release = strdup(sys_info.release);

Просмотреть файл

@ -32,6 +32,10 @@
#include "include/orte_constants.h" #include "include/orte_constants.h"
#include "mca/base/base.h" #include "mca/base/base.h"
#include "mca/base/mca_base_param.h" #include "mca/base/mca_base_param.h"
#include "mca/ns/ns_types.h"
#include "util/output.h"
#include "util/proc_info.h"
#include "util/sys_info.h"
#include "util/univ_info.h" #include "util/univ_info.h"
@ -53,19 +57,51 @@ orte_universe_t orte_universe_info = {
int orte_univ_info(void) int orte_univ_info(void)
{ {
int id, tmp; int id, tmp;
char *tmpname=NULL, *tptr, *ptr;
if (!orte_universe_info.init) { if (!orte_universe_info.init) {
id = mca_base_param_register_string("universe", "path", NULL, NULL, orte_universe_info.path); id = mca_base_param_register_string("universe", "path", NULL, NULL, orte_universe_info.path);
mca_base_param_lookup_string(id, &(orte_universe_info.path)); mca_base_param_lookup_string(id, &(orte_universe_info.path));
id = mca_base_param_register_string("universe", "name", NULL, NULL, orte_universe_info.name); id = mca_base_param_register_string("universe", NULL, NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.name)); mca_base_param_lookup_string(id, &tmpname);
id = mca_base_param_register_string("universe", "host", NULL, NULL, orte_universe_info.host);
mca_base_param_lookup_string(id, &(orte_universe_info.host));
/* uid is not set via parameter, but is determined elsewhere */
if (NULL != tmpname) {
/* Universe name info is passed as userid@hostname:univ_name */
/* extract the userid from the universe option, if provided */
tptr = tmpname;
if (NULL != (ptr = strchr(tptr, '@'))) {
*ptr = '\0';
orte_universe_info.uid = strdup(tptr);
ptr++;
tptr = ptr;
} else {
if (NULL == orte_system_info.user) {
orte_sys_info();
}
orte_universe_info.uid = strdup(orte_system_info.user);
}
/* extract the hostname, if provided */
if (NULL != (ptr = strchr(tptr, ':'))) {
*ptr = '\0';
orte_universe_info.host = strdup(tptr);
ptr++;
tptr = ptr;
} else {
orte_universe_info.host = strdup(orte_system_info.nodename);
}
/* now copy the universe name into the universe_info structure */
orte_universe_info.name = strdup(tptr);
} else {
/* if nothing was provided, then initialize the user and nodename
* to the local values
*/
orte_universe_info.uid = strdup(orte_system_info.user);
orte_universe_info.host = strdup(orte_system_info.nodename);
}
id = mca_base_param_register_int("universe", "persistence", NULL, NULL, orte_universe_info.persistence); id = mca_base_param_register_int("universe", "persistence", NULL, NULL, orte_universe_info.persistence);
mca_base_param_lookup_int(id, &tmp); mca_base_param_lookup_int(id, &tmp);
orte_universe_info.persistence = (tmp ? true : false); orte_universe_info.persistence = (tmp ? true : false);