Bring in the remote launch changes. This still isn't fully functional, but it impacted a few other places that were worth fixing.
1. Added a new function to launch head node processes on remote nodes. 2. Added a new tool, "orteprobe", that checks to see if a daemon is running on a node. If so, it reports the contact info back to the requestor. If not, it will (eventually - but not now) fork/exec a daemon on the node, report the contact info back to the requestor, and then die. 3. Modified orted to handle universe name parameters, and added separate command line flags for debugging the daemon and for saving daemon debugging output in a file. The "debug" flag now turns on the runtime debug info instead of the daemon debug info - thus, you can now get just the daemon debug info if you like. 4. Fixed the dps to handle zero-length strings correctly. 5. Modified the fork and rsh launchers to pass required environment variables to the daemons and processes. 6. Pulled the redirection of stdin/stdout/stderr for the daemon out of orted and put it into the daemon_init function to simplify the orted logic. 7. Modified sys_info to correctly deal with a passed mca param. 8. Modified univ_info to parse incoming universe location information. This commit was SVN r5705.
Этот коммит содержится в:
родитель
0c6eaaebe3
Коммит
fdfe457578
@ -1798,6 +1798,7 @@ AC_CONFIG_FILES([
|
||||
src/tools/console/Makefile
|
||||
src/tools/ompi_info/Makefile
|
||||
src/tools/orted/Makefile
|
||||
src/tools/orteprobe/Makefile
|
||||
src/tools/orterun/Makefile
|
||||
src/tools/openmpi/Makefile
|
||||
src/tools/wrappers/Makefile
|
||||
|
@ -300,15 +300,23 @@ int orte_dps_pack_string(orte_buffer_t *buffer, void *src,
|
||||
char **ssrc = (char**) src;
|
||||
|
||||
for (i = 0; i < num_vals; ++i) {
|
||||
len = strlen(ssrc[i]) + 1;
|
||||
if (ORTE_SUCCESS != (ret = orte_dps_pack_sizet(buffer, &len, 1, ORTE_SIZE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret =
|
||||
orte_dps_pack_byte(buffer, ssrc[i], len, ORTE_BYTE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
if (NULL == ssrc[i]) { /* got zero-length string/NULL pointer - store NULL */
|
||||
len = 0;
|
||||
if (ORTE_SUCCESS != (ret = orte_dps_pack_sizet(buffer, &len, 1, ORTE_SIZE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
len = strlen(ssrc[i]) + 1;
|
||||
if (ORTE_SUCCESS != (ret = orte_dps_pack_sizet(buffer, &len, 1, ORTE_SIZE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret =
|
||||
orte_dps_pack_byte(buffer, ssrc[i], len, ORTE_BYTE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -330,14 +330,18 @@ int orte_dps_unpack_string(orte_buffer_t *buffer, void *dest,
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (0 == len) { /* zero-length string - unpack the NULL */
|
||||
sdest[i] = NULL;
|
||||
} else {
|
||||
sdest[i] = malloc(len);
|
||||
if (NULL == sdest[i]) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_dps_unpack_byte(buffer, sdest[i], &len, ORTE_BYTE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
if (NULL == sdest[i]) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_dps_unpack_byte(buffer, sdest[i], &len, ORTE_BYTE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -97,9 +97,6 @@ int orte_gpr_base_unpack_get(orte_buffer_t *buffer, int *ret, size_t *cnt, orte_
|
||||
free(*values);
|
||||
return rc;
|
||||
}
|
||||
for (n=0; n < num; n++) {
|
||||
orte_gpr.dump_value((*values)[n], 0);
|
||||
}
|
||||
}
|
||||
|
||||
/* unpack the response code */
|
||||
|
@ -34,6 +34,7 @@
|
||||
#include "util/argv.h"
|
||||
#include "util/output.h"
|
||||
#include "util/sys_info.h"
|
||||
#include "util/univ_info.h"
|
||||
#include "util/ompi_environ.h"
|
||||
#include "util/session_dir.h"
|
||||
#include "runtime/orte_wait.h"
|
||||
@ -152,6 +153,17 @@ static int orte_pls_fork_proc(
|
||||
param = mca_base_param_environ_variable("rmgr","bootproxy","jobid");
|
||||
ompi_unsetenv(param, &environ_copy);
|
||||
|
||||
/* setup universe info */
|
||||
if (NULL != orte_universe_info.name) {
|
||||
param = mca_base_param_environ_variable("universe", NULL, NULL);
|
||||
asprintf(&uri, "%s@%s:%s", orte_universe_info.uid,
|
||||
orte_universe_info.host,
|
||||
orte_universe_info.name);
|
||||
ompi_setenv(param, uri, true, &environ_copy);
|
||||
free(param);
|
||||
free(uri);
|
||||
}
|
||||
|
||||
/* setup ns contact info */
|
||||
if(NULL != orte_process_info.ns_replica_uri) {
|
||||
uri = strdup(orte_process_info.ns_replica_uri);
|
||||
@ -185,7 +197,7 @@ static int orte_pls_fork_proc(
|
||||
new_env = ompi_environ_merge(context->env, environ_copy);
|
||||
ompi_argv_free(environ_copy);
|
||||
|
||||
if(context->argv == NULL) {
|
||||
if (context->argv == NULL) {
|
||||
context->argv = malloc(sizeof(char*)*2);
|
||||
context->argv[0] = strdup(context->app);
|
||||
context->argv[1] = NULL;
|
||||
|
@ -33,11 +33,15 @@
|
||||
#include "include/orte_constants.h"
|
||||
#include "util/argv.h"
|
||||
#include "util/output.h"
|
||||
#include "util/univ_info.h"
|
||||
#include "util/session_dir.h"
|
||||
#include "util/if.h"
|
||||
#include "util/path.h"
|
||||
#include "event/event.h"
|
||||
#include "runtime/orte_wait.h"
|
||||
|
||||
#include "mca/base/mca_base_param.h"
|
||||
|
||||
#include "mca/ns/ns.h"
|
||||
#include "mca/pls/pls.h"
|
||||
#include "mca/rml/rml.h"
|
||||
@ -52,6 +56,8 @@
|
||||
|
||||
#define NUM_CONCURRENT 128
|
||||
|
||||
extern char **environ;
|
||||
|
||||
|
||||
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS && OMPI_ENABLE_PROGRESS_THREADS
|
||||
static int orte_pls_rsh_launch_threaded(orte_jobid_t jobid);
|
||||
@ -244,7 +250,8 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
char** argv;
|
||||
int argc;
|
||||
int rc;
|
||||
|
||||
int id;
|
||||
|
||||
/* query the list of nodes allocated to the job - don't need the entire
|
||||
* mapping - as the daemon/proxy is responsible for determining the apps
|
||||
* to launch on each node.
|
||||
@ -272,19 +279,34 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
asprintf(&jobid_string, "%lu", (unsigned long) jobid);
|
||||
|
||||
/*
|
||||
* Build argv/env arrays.
|
||||
* Build argv array
|
||||
*/
|
||||
argv = ompi_argv_copy(mca_pls_rsh_component.argv);
|
||||
argc = mca_pls_rsh_component.argc;
|
||||
node_name_index1 = argc;
|
||||
ompi_argv_append(&argc, &argv, ""); /* placeholder for node name */
|
||||
|
||||
/* application */
|
||||
/* add the daemon command (as specified by user) */
|
||||
local_exec_index = argc;
|
||||
ompi_argv_append(&argc, &argv, mca_pls_rsh_component.orted);
|
||||
if (mca_pls_rsh_component.debug) {
|
||||
|
||||
/* check for debug flags */
|
||||
id = mca_base_param_register_int("orte","debug",NULL,NULL,0);
|
||||
mca_base_param_lookup_int(id,&rc);
|
||||
if (rc) {
|
||||
ompi_argv_append(&argc, &argv, "--debug");
|
||||
}
|
||||
id = mca_base_param_register_int("orte","debug","daemons",NULL,0);
|
||||
mca_base_param_lookup_int(id,&rc);
|
||||
if (rc) {
|
||||
ompi_argv_append(&argc, &argv, "--debug-daemons");
|
||||
}
|
||||
id = mca_base_param_register_int("orte","debug","daemons_file",NULL,0);
|
||||
mca_base_param_lookup_int(id,&rc);
|
||||
if (rc) {
|
||||
ompi_argv_append(&argc, &argv, "--debug-daemons-file");
|
||||
}
|
||||
|
||||
ompi_argv_append(&argc, &argv, "--bootproxy");
|
||||
ompi_argv_append(&argc, &argv, jobid_string);
|
||||
ompi_argv_append(&argc, &argv, "--name");
|
||||
@ -294,6 +316,13 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
node_name_index2 = argc;
|
||||
ompi_argv_append(&argc, &argv, "");
|
||||
|
||||
/* pass along the universe name and location info */
|
||||
ompi_argv_append(&argc, &argv, "--universe");
|
||||
asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
|
||||
orte_universe_info.host, orte_universe_info.name);
|
||||
ompi_argv_append(&argc, &argv, param);
|
||||
free(param);
|
||||
|
||||
/* setup ns contact info */
|
||||
ompi_argv_append(&argc, &argv, "--nsreplica");
|
||||
if(NULL != orte_process_info.ns_replica_uri) {
|
||||
@ -304,6 +333,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
asprintf(&param, "\"%s\"", uri);
|
||||
ompi_argv_append(&argc, &argv, param);
|
||||
free(uri);
|
||||
free(param);
|
||||
|
||||
/* setup gpr contact info */
|
||||
ompi_argv_append(&argc, &argv, "--gprreplica");
|
||||
@ -315,6 +345,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
asprintf(&param, "\"%s\"", uri);
|
||||
ompi_argv_append(&argc, &argv, param);
|
||||
free(uri);
|
||||
free(param);
|
||||
|
||||
/*
|
||||
* Iterate through each of the nodes and spin
|
||||
|
@ -41,6 +41,7 @@ typedef uint32_t orte_rml_tag_t;
|
||||
#define ORTE_RML_TAG_XCAST 7
|
||||
#define ORTE_RML_TAG_RMGR_SVC 8
|
||||
#define ORTE_RML_TAG_RMGR_CLNT 9
|
||||
#define ORTE_RML_TAG_PROBE 10
|
||||
#define ORTE_RML_TAG_DYNAMIC 2000
|
||||
#define ORTE_RML_TAG_MAX UINT32_MAX
|
||||
|
||||
|
@ -34,8 +34,6 @@
|
||||
#include "mca/ns/ns_types.h"
|
||||
#include "mca/soh/soh_types.h"
|
||||
|
||||
#include "soh_types.h" /* gpr keys and external datatypes needed for prototyping */
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
*/
|
||||
@ -137,4 +135,6 @@ typedef orte_soh_base_component_1_0_0_t orte_soh_base_component_t;
|
||||
/* soh v1.0 */ \
|
||||
"soh", 1, 0, 0
|
||||
|
||||
OMPI_DECLSPEC extern orte_soh_base_module_t orte_soh; /* holds selected module's function pointers */
|
||||
|
||||
#endif /* ORTE_SOH_H */
|
||||
|
@ -73,11 +73,6 @@ int orte_init_stage1(void)
|
||||
/* For malloc debugging */
|
||||
ompi_malloc_init();
|
||||
|
||||
/* Ensure the universe_info structure is instantiated and initialized */
|
||||
if (ORTE_SUCCESS != (ret = orte_univ_info())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Ensure the system_info structure is instantiated and initialized */
|
||||
if (ORTE_SUCCESS != (ret = orte_sys_info())) {
|
||||
return ret;
|
||||
@ -88,6 +83,11 @@ int orte_init_stage1(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Ensure the universe_info structure is instantiated and initialized */
|
||||
if (ORTE_SUCCESS != (ret = orte_univ_info())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the MCA framework
|
||||
*/
|
||||
|
@ -23,13 +23,21 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/wait.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
|
||||
#include "include/orte_constants.h"
|
||||
#include "runtime/orte_wait.h"
|
||||
#include "util/argv.h"
|
||||
#include "util/output.h"
|
||||
#include "util/path.h"
|
||||
#include "util/univ_info.h"
|
||||
#include "util/sys_info.h"
|
||||
#include "util/proc_info.h"
|
||||
@ -37,14 +45,238 @@
|
||||
#include "util/session_dir.h"
|
||||
#include "util/universe_setup_file_io.h"
|
||||
|
||||
#include "mca/base/mca_base_param.h"
|
||||
#include "mca/soh/soh.h"
|
||||
#include "mca/rml/rml.h"
|
||||
#include "mca/ns/ns.h"
|
||||
#include "mca/errmgr/errmgr.h"
|
||||
|
||||
#include "runtime/runtime.h"
|
||||
|
||||
extern char **environ;
|
||||
|
||||
int orte_setup_hnp(char *target_cluster)
|
||||
/*
|
||||
* Local data structure
|
||||
*/
|
||||
/* Context describing the probe launch; its address is handed to
 * orte_wait_cb() so the child-exit callback can report on the probe. */
typedef struct {
    char *target_cluster;       /* strdup'd copy of the target cluster name */
    char *headnode;             /* strdup'd copy of the head node name */
    orte_process_name_t *name;  /* process name created for the probe */
    orte_jobid_t jobid;         /* jobid obtained for the probe */
} orte_setup_hnp_cb_data_t;

/* Single file-scope instance, filled in by orte_setup_hnp(). */
static orte_setup_hnp_cb_data_t orte_setup_hnp_cbdata = {NULL, NULL, NULL, 0};
|
||||
|
||||
/*
|
||||
* NON-BLOCKING RECEIVER
|
||||
*/
|
||||
static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
|
||||
orte_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
|
||||
/*
|
||||
* PID WAIT CALLBACK
|
||||
*/
|
||||
static void orte_setup_hnp_wait(pid_t wpid, int status, void *data);
|
||||
|
||||
|
||||
/*
|
||||
* ORTE_SETUP_HNP
|
||||
*/
|
||||
int orte_setup_hnp(char *target_cluster, char *headnode, char *username)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
char **argv, *param, *uri, *uid, *hn;
|
||||
char *path, *name_string, *orteprobe;
|
||||
int argc, rc=ORTE_SUCCESS, id;
|
||||
pid_t pid;
|
||||
orte_cellid_t cellid;
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
|
||||
/* get the nodename for the headnode of the target cluster */
|
||||
if (NULL == headnode) { /* not provided, so try to look it up */
|
||||
} else { /* lookup the headnode's cellid */
|
||||
hn = strdup(headnode);
|
||||
cellid = 0;
|
||||
}
|
||||
|
||||
/* get the user's name on the headnode */
|
||||
if (NULL == username) {
|
||||
uid = strdup(orte_system_info.user);
|
||||
} else {
|
||||
uid = strdup(username);
|
||||
}
|
||||
|
||||
/* SETUP TO LAUNCH PROBE */
|
||||
|
||||
/* get a jobid for the probe */
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* get a vpid for the probe */
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, 1, &vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* initialize probe's process name... */
|
||||
rc = orte_ns.create_process_name(&(orte_setup_hnp_cbdata.name), cellid, jobid, vpid);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* ...and get string representation */
|
||||
if(ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&name_string, orte_setup_hnp_cbdata.name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* setup callback data on sigchild */
|
||||
orte_setup_hnp_cbdata.target_cluster = strdup(target_cluster);
|
||||
orte_setup_hnp_cbdata.headnode = strdup(headnode);
|
||||
orte_setup_hnp_cbdata.jobid = jobid;
|
||||
|
||||
/* get rsh/ssh launch mechanism parameters */
|
||||
id = mca_base_param_register_string("pls","rsh","agent",NULL,"/usr/bin/ssh");
|
||||
mca_base_param_lookup_string(id, ¶m);
|
||||
|
||||
id = mca_base_param_register_string("orteprobe",NULL,NULL,NULL,"orteprobe");
|
||||
mca_base_param_lookup_string(id, &orteprobe);
|
||||
|
||||
/* Initialize the argv array */
|
||||
argv = ompi_argv_split(param, ' ');
|
||||
argc = ompi_argv_count(argv);
|
||||
if (argc <= 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto CLEANUP;
|
||||
}
|
||||
free(param);
|
||||
|
||||
/* setup the path */
|
||||
path = ompi_path_findv(argv[0], 0, environ, NULL);
|
||||
|
||||
/* add the username and nodename */
|
||||
ompi_argv_append(&argc, &argv, "-l");
|
||||
ompi_argv_append(&argc, &argv, uid);
|
||||
ompi_argv_append(&argc, &argv, hn);
|
||||
|
||||
/* add the probe application */
|
||||
ompi_argv_append(&argc, &argv, orteprobe);
|
||||
|
||||
/* tell the probe it's name */
|
||||
ompi_argv_append(&argc, &argv, "--name");
|
||||
ompi_argv_append(&argc, &argv, name_string);
|
||||
|
||||
/* setup probe's ns contact info */
|
||||
ompi_argv_append(&argc, &argv, "--nsreplica");
|
||||
if(NULL != orte_process_info.ns_replica_uri) {
|
||||
uri = strdup(orte_process_info.ns_replica_uri);
|
||||
} else {
|
||||
uri = orte_rml.get_uri();
|
||||
}
|
||||
asprintf(¶m, "\"%s\"", uri);
|
||||
ompi_argv_append(&argc, &argv, param);
|
||||
free(uri);
|
||||
|
||||
/* setup probe's gpr contact info */
|
||||
ompi_argv_append(&argc, &argv, "--gprreplica");
|
||||
if(NULL != orte_process_info.gpr_replica_uri) {
|
||||
uri = strdup(orte_process_info.gpr_replica_uri);
|
||||
} else {
|
||||
uri = orte_rml.get_uri();
|
||||
}
|
||||
asprintf(¶m, "\"%s\"", uri);
|
||||
ompi_argv_append(&argc, &argv, param);
|
||||
free(uri);
|
||||
|
||||
/* tell the probe who to report to */
|
||||
uri = orte_rml.get_uri();
|
||||
ompi_argv_append(&argc, &argv, "--requestor");
|
||||
ompi_argv_append(&argc, &argv, uri);
|
||||
free(uri);
|
||||
|
||||
/* issue the non-blocking recv to get the probe's findings */
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PROBE,
|
||||
0, orte_setup_hnp_recv, NULL);
|
||||
if(rc < 0) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* fork a child to exec the rsh/ssh session */
|
||||
pid = fork();
|
||||
if (pid < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
if (pid == 0) { /* child */
|
||||
/* exec the probe launch */
|
||||
ompi_output(0, "exec'ing %s", path);
|
||||
execv(path, argv);
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
ompi_output(0, "orte_setup_hnp: execv failed with errno=%d\n", errno);
|
||||
return ORTE_ERROR;
|
||||
|
||||
} else { /* parent */
|
||||
|
||||
orte_wait_cb(pid, orte_setup_hnp_wait, &orte_setup_hnp_cbdata);
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
 * Receive callback registered by orte_setup_hnp for ORTE_RML_TAG_PROBE.
 * None of the arguments are examined: the handler prints a notice,
 * finalizes the runtime, and terminates the process with exit status 0.
 */
static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
                                orte_buffer_t* buffer, orte_rml_tag_t tag,
                                void* cbdata)
{
    ompi_output(0, "HE CALLED HOME!!");
    orte_finalize();
    exit(0);
}
|
||||
|
||||
static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata)
|
||||
{
|
||||
int rc;
|
||||
orte_setup_hnp_cb_data_t *data;
|
||||
|
||||
data = (orte_setup_hnp_cb_data_t*)cbdata;
|
||||
|
||||
/* if ssh exited abnormally, print something useful to the user and cleanup
|
||||
* the registry entries for the HNP jobid.
|
||||
This should somehow be pushed up to the calling level, but we
|
||||
don't really have a way to do that just yet.
|
||||
*/
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
|
||||
/* set the probe's state-of-health to aborted */
|
||||
if (ORTE_SUCCESS != (rc =
|
||||
orte_soh.set_proc_soh(data->name, ORTE_PROC_STATE_ABORTED, status))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* tell the user something went wrong */
|
||||
ompi_output(0, "ERROR: The probe on head node %s of the %s cluster failed to start as expected.",
|
||||
data->headnode, data->target_cluster);
|
||||
ompi_output(0, "ERROR: There may be more information available from");
|
||||
ompi_output(0, "ERROR: the remote shell (see above).");
|
||||
if (WIFEXITED(status)) {
|
||||
ompi_output(0, "ERROR: The probe exited unexpectedly with status %d.",
|
||||
WEXITSTATUS(status));
|
||||
} else if (WIFSIGNALED(status)) {
|
||||
#ifdef WCOREDUMP
|
||||
if (WCOREDUMP(status)) {
|
||||
ompi_output(0, "The probe received a signal %d (with core).",
|
||||
WTERMSIG(status));
|
||||
} else {
|
||||
ompi_output(0, "The probe received a signal %d.", WTERMSIG(status));
|
||||
}
|
||||
#else
|
||||
ompi_output(0, "The probe received a signal %d.", WTERMSIG(status));
|
||||
#endif /* WCOREDUMP */
|
||||
} else {
|
||||
ompi_output(0, "No extra status information is available: %d.", status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -156,7 +156,7 @@ OMPI_DECLSPEC int ompi_rte_init_io(void);
|
||||
/**
|
||||
* Establish a Head Node Process on a cluster's front end
|
||||
*/
|
||||
OMPI_DECLSPEC int orte_setup_hnp(char *target_cluster);
|
||||
OMPI_DECLSPEC int orte_setup_hnp(char *target_cluster, char *headnode, char *username);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
|
@ -19,5 +19,5 @@ include $(top_srcdir)/config/Makefile.options
|
||||
|
||||
EXTRA_DIST = win_makefile
|
||||
|
||||
SUBDIRS = ompi_info wrappers orted orterun openmpi console
|
||||
SUBDIRS = ompi_info wrappers orted orteprobe orterun openmpi console
|
||||
|
||||
|
@ -76,40 +76,67 @@ ompi_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
{ NULL, NULL, NULL, 'h', NULL, "help", 0,
|
||||
&orted_globals.help, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "version", 0,
|
||||
&orted_globals.version, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Show the orted version" },
|
||||
|
||||
{ NULL, NULL, NULL, 'd', NULL, "debug", 0,
|
||||
&orted_globals.debug, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Run in debug mode (not generally intended for users)" },
|
||||
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
|
||||
NULL, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Debug the OpenRTE" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "no-daemonize", 0,
|
||||
&orted_globals.no_daemonize, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Don't daemonize into the background" },
|
||||
|
||||
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
|
||||
&orted_globals.debug_daemons, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of OpenRTE daemons" },
|
||||
|
||||
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
|
||||
&orted_globals.debug_daemons_file, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of OpenRTE daemons, storing output in files" },
|
||||
|
||||
{ "rmgr", "bootproxy", "jobid", '\0', NULL, "bootproxy", 1,
|
||||
&orted_globals.bootproxy, OMPI_CMD_LINE_TYPE_INT,
|
||||
"Run as boot proxy for <job-id>" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "name", 1,
|
||||
&orted_globals.name, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Set the orte process name"},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "nsreplica", 1,
|
||||
&orte_process_info.ns_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Name service contact information."},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "gprreplica", 1,
|
||||
&orte_process_info.gpr_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Registry contact information."},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "nodename", 1,
|
||||
&orte_system_info.nodename, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Node name as specified by host/resource description." },
|
||||
|
||||
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
|
||||
&orted_globals.universe, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Set the universe name as username@hostname:universe_name for this application" },
|
||||
|
||||
{ "tmpdir", "base", NULL, '\0', NULL, "tmpdir", 1,
|
||||
NULL, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Set the root for the session directory tree" },
|
||||
|
||||
{ "seed", NULL, NULL, '\0', NULL, "seed", 0,
|
||||
&orte_process_info.seed, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"seed"},
|
||||
NULL, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Host replicas for the core universe services"},
|
||||
|
||||
{ "universe", "persistence", NULL, '\0', NULL, "persistent", 0,
|
||||
&orte_universe_info.persistence, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"persistent"},
|
||||
NULL, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Remain alive after the application process completes"},
|
||||
|
||||
{ "universe", "scope", NULL, '\0', NULL, "scope", 1,
|
||||
&orte_universe_info.scope, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"scope"},
|
||||
NULL, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Set restrictions on who can connect to this universe"},
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OMPI_CMD_LINE_TYPE_NULL, NULL }
|
||||
@ -119,12 +146,15 @@ ompi_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int ret = 0;
|
||||
int fd;
|
||||
ompi_cmd_line_t *cmd_line = NULL;
|
||||
char *contact_path = NULL;
|
||||
char *log_path = NULL;
|
||||
|
||||
char log_file[PATH_MAX];
|
||||
char *jobidstring;
|
||||
|
||||
/* setup to check common command line options that just report and die */
|
||||
memset(&orted_globals, 0, sizeof(orted_globals));
|
||||
memset(&orted_globals, 0, sizeof(orted_globals_t));
|
||||
cmd_line = OBJ_NEW(ompi_cmd_line_t);
|
||||
ompi_cmd_line_create(cmd_line, orte_cmd_line_opts);
|
||||
if (OMPI_SUCCESS != (ret = ompi_cmd_line_parse(cmd_line, true,
|
||||
@ -161,14 +191,20 @@ int main(int argc, char *argv[])
|
||||
ret = orte_ns_base_convert_string_to_process_name(
|
||||
&orte_process_info.my_name, orted_globals.name);
|
||||
if(ORTE_SUCCESS != ret) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
fprintf(stderr, "Couldn't convert environmental string to process name\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* turn on debug if debug_file is requested so output will be generated */
|
||||
if (orted_globals.debug_daemons_file) {
|
||||
orted_globals.debug_daemons = true;
|
||||
}
|
||||
|
||||
/* detach from controlling terminal */
|
||||
if(orted_globals.debug == false && orted_globals.no_daemonize == false) {
|
||||
/* detach from controlling terminal
|
||||
* otherwise, remain attached so output can get to us
|
||||
*/
|
||||
if(orted_globals.debug_daemons == false && orted_globals.no_daemonize == false) {
|
||||
orte_daemon_init(NULL);
|
||||
}
|
||||
|
||||
@ -179,20 +215,21 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* setup stdin/stdout/stderr */
|
||||
if (orted_globals.debug == false) {
|
||||
int fd;
|
||||
char log_file[PATH_MAX];
|
||||
|
||||
/* connect input to /dev/null */
|
||||
fd = open("/dev/null", O_RDONLY);
|
||||
if(fd > STDIN_FILENO) {
|
||||
dup2(fd, STDIN_FILENO);
|
||||
close(fd);
|
||||
if (orted_globals.debug_daemons_file) {
|
||||
/* if we are debugging to a file, then send stdin/stdout/stderr
|
||||
* to the orted log file
|
||||
*/
|
||||
|
||||
/* get my jobid */
|
||||
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobidstring,
|
||||
orte_process_info.my_name))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* connect output to a log file in the session directory */
|
||||
sprintf(log_file, "output-orted-%d-%s.log",
|
||||
(int)orte_process_info.my_name->jobid, orte_system_info.nodename);
|
||||
|
||||
/* define a log file name in the session directory */
|
||||
sprintf(log_file, "output-orted-%s-%s.log",
|
||||
jobidstring, orte_system_info.nodename);
|
||||
log_path = orte_os_path(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
@ -200,10 +237,12 @@ int main(int argc, char *argv[])
|
||||
NULL);
|
||||
|
||||
fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0666);
|
||||
if(fd < 0) {
|
||||
if (fd < 0) {
|
||||
/* couldn't open the file for some reason, so
|
||||
* just connect everything to /dev/null
|
||||
*/
|
||||
fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
|
||||
}
|
||||
if(fd >= 0) {
|
||||
} else {
|
||||
dup2(fd, STDOUT_FILENO);
|
||||
dup2(fd, STDERR_FILENO);
|
||||
if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
|
||||
@ -246,19 +285,21 @@ int main(int argc, char *argv[])
|
||||
orte_universe_info.seed_uri = orte_rml.get_uri();
|
||||
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
|
||||
"universe-setup.txt", NULL);
|
||||
ompi_output(0, "ompid: contact_file %s", contact_path);
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "ompid: contact_file %s", contact_path);
|
||||
}
|
||||
|
||||
if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
|
||||
if (orted_globals.debug) {
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
} else if (orted_globals.debug) {
|
||||
} else if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (orted_globals.debug) {
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
@ -273,7 +314,7 @@ int main(int argc, char *argv[])
|
||||
* - could be setup a virtual machine, spawn a console, etc.
|
||||
*/
|
||||
|
||||
if (orted_globals.debug) {
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: setting up event monitor", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
@ -286,7 +327,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
OMPI_THREAD_UNLOCK(&orted_globals.mutex);
|
||||
|
||||
if (orted_globals.debug) {
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: mutex cleared - finalizing", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
@ -301,7 +342,7 @@ int main(int argc, char *argv[])
|
||||
/* finalize the system */
|
||||
orte_finalize();
|
||||
|
||||
if (orted_globals.debug) {
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: done - exiting", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
@ -320,7 +361,7 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
|
||||
OMPI_THREAD_LOCK(&orted_globals.mutex);
|
||||
|
||||
if (orted_globals.debug) {
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: received message", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
|
@ -49,10 +49,11 @@ typedef uint16_t orte_daemon_cmd_flag_t;
|
||||
typedef struct {
|
||||
bool help;
|
||||
bool version;
|
||||
bool debug;
|
||||
bool no_daemonize;
|
||||
bool probe;
|
||||
bool debug_daemons;
|
||||
bool debug_daemons_file;
|
||||
char* name;
|
||||
char* universe;
|
||||
int bootproxy;
|
||||
ompi_mutex_t mutex;
|
||||
ompi_condition_t condition;
|
||||
@ -61,11 +62,6 @@ typedef struct {
|
||||
|
||||
extern orted_globals_t orted_globals;
|
||||
|
||||
/*
|
||||
* Internal functions
|
||||
*/
|
||||
int orte_daemon_bootproxy(void);
|
||||
|
||||
/*
|
||||
* Version-related strings and functions
|
||||
*/
|
||||
|
50
src/tools/orteprobe/Makefile.am
Обычный файл
50
src/tools/orteprobe/Makefile.am
Обычный файл
@ -0,0 +1,50 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
include $(top_srcdir)/config/Makefile.options
|
||||
|
||||
AM_CPPFLAGS = \
|
||||
-DOMPI_PREFIX="\"$(prefix)\"" \
|
||||
-DOMPI_BINDIR="\"$(bindir)\"" \
|
||||
-DOMPI_LIBDIR="\"$(libdir)\"" \
|
||||
-DOMPI_INCDIR="\"$(includedir)\"" \
|
||||
-DOMPI_PKGLIBDIR="\"$(pkglibdir)\"" \
|
||||
-DOMPI_SYSCONFDIR="\"$(sysconfdir)\"" \
|
||||
-DOMPI_CONFIGURE_USER="\"@OMPI_CONFIGURE_USER@\"" \
|
||||
-DOMPI_CONFIGURE_HOST="\"@OMPI_CONFIGURE_HOST@\"" \
|
||||
-DOMPI_CONFIGURE_DATE="\"@OMPI_CONFIGURE_DATE@\"" \
|
||||
-DOMPI_BUILD_CFLAGS="\"@CFLAGS@\"" \
|
||||
-DOMPI_BUILD_CPPFLAGS="\"@CPPFLAGS@\"" \
|
||||
-DOMPI_BUILD_CXXFLAGS="\"@CXXFLAGS@\"" \
|
||||
-DOMPI_BUILD_CXXCPPFLAGS="\"@CXXCPPFLAGS@\"" \
|
||||
-DOMPI_BUILD_FFLAGS="\"@FFLAGS@\"" \
|
||||
-DOMPI_BUILD_FCFLAGS="\"@FCFLAGS@\"" \
|
||||
-DOMPI_BUILD_LDFLAGS="\"@LDFLAGS@\"" \
|
||||
-DOMPI_BUILD_LIBS="\"@LIBS@\""
|
||||
|
||||
libs = $(top_builddir)/src/libmpi.la
|
||||
|
||||
bin_PROGRAMS = orteprobe
|
||||
orteprobe_SOURCES = \
|
||||
orteprobe.h \
|
||||
orteprobe.c
|
||||
|
||||
orteprobe_LDADD = $(libs) $(LIBMPI_EXTRA_LIBS)
|
||||
orteprobe_LDFLAGS = $(LIBMPI_EXTRA_LDFLAGS)
|
||||
orteprobe_DEPENDENCIES = $(libs)
|
||||
|
||||
clean-local:
|
||||
test -z "$(OMPI_CXX_TEMPLATE_REPOSITORY)" || $(RM) -rf $(OMPI_CXX_TEMPLATE_REPOSITORY)
|
335
src/tools/orteprobe/orteprobe.c
Обычный файл
335
src/tools/orteprobe/orteprobe.c
Обычный файл
@ -0,0 +1,335 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "include/orte_constants.h"
|
||||
|
||||
#include "threads/mutex.h"
|
||||
#include "threads/condition.h"
|
||||
|
||||
#include "dps/dps.h"
|
||||
#include "event/event.h"
|
||||
#include "util/output.h"
|
||||
#include "util/show_help.h"
|
||||
#include "util/sys_info.h"
|
||||
#include "util/os_path.h"
|
||||
#include "util/cmd_line.h"
|
||||
#include "util/proc_info.h"
|
||||
#include "util/univ_info.h"
|
||||
#include "util/session_dir.h"
|
||||
#include "util/printf.h"
|
||||
#include "util/daemon_init.h"
|
||||
#include "util/universe_setup_file_io.h"
|
||||
|
||||
#include "mca/base/base.h"
|
||||
#include "mca/base/mca_base_param.h"
|
||||
#include "mca/rml/base/base.h"
|
||||
#include "mca/rml/rml.h"
|
||||
#include "mca/errmgr/base/base.h"
|
||||
#include "mca/ns/base/base.h"
|
||||
#include "mca/gpr/base/base.h"
|
||||
#include "mca/schema/base/base.h"
|
||||
#include "mca/soh/base/base.h"
|
||||
|
||||
#include "runtime/runtime.h"
|
||||
#include "runtime/orte_wait.h"
|
||||
|
||||
#include "tools/orteprobe/orteprobe.h"
|
||||
|
||||
orteprobe_globals_t orteprobe_globals;
|
||||
|
||||
/*
|
||||
* define the orteprobe context table for obtaining parameters
|
||||
*/
|
||||
ompi_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
/* Various "obvious" options */
|
||||
{ NULL, NULL, NULL, 'h', NULL, "help", 0,
|
||||
&orteprobe_globals.help, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "version", 0,
|
||||
&orteprobe_globals.version, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Show the orteprobe version" },
|
||||
|
||||
{ NULL, NULL, NULL, 'd', NULL, "debug", 0,
|
||||
&orteprobe_globals.debug, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Run in debug mode (not generally intended for users)" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "name", 1,
|
||||
&orteprobe_globals.name_string, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Set the orte process name"},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "nsreplica", 1,
|
||||
&orte_process_info.ns_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Name service contact information."},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "gprreplica", 1,
|
||||
&orte_process_info.gpr_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Registry contact information."},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "nodename", 1,
|
||||
&orte_system_info.nodename, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Node name as specified by host/resource description." },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', NULL, "requestor", 1,
|
||||
&orteprobe_globals.requestor_string, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Set the orte process name"},
|
||||
|
||||
{ "seed", NULL, NULL, '\0', NULL, "seed", 0,
|
||||
&orte_process_info.seed, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"seed"},
|
||||
|
||||
{ "universe", "persistence", NULL, '\0', NULL, "persistent", 0,
|
||||
&orte_universe_info.persistence, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"persistent"},
|
||||
|
||||
{ "universe", "scope", NULL, '\0', NULL, "scope", 1,
|
||||
&orte_universe_info.scope, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"scope"},
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OMPI_CMD_LINE_TYPE_NULL, NULL }
|
||||
};
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int ret = 0;
|
||||
ompi_cmd_line_t *cmd_line = NULL;
|
||||
char *contact_path = NULL;
|
||||
char *log_path = NULL;
|
||||
orte_universe_t univ;
|
||||
orte_buffer_t buffer;
|
||||
orte_process_name_t requestor;
|
||||
|
||||
/* setup to check common command line options that just report and die */
|
||||
memset(&orteprobe_globals, 0, sizeof(orteprobe_globals));
|
||||
cmd_line = OBJ_NEW(ompi_cmd_line_t);
|
||||
ompi_cmd_line_create(cmd_line, orte_cmd_line_opts);
|
||||
if (OMPI_SUCCESS != (ret = ompi_cmd_line_parse(cmd_line, true,
|
||||
argc, argv))) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* check for help and version requests */
|
||||
if (orteprobe_globals.help) {
|
||||
char *args = NULL;
|
||||
args = ompi_cmd_line_get_usage_msg(cmd_line);
|
||||
ompi_show_help("help-orteprobe.txt", "orteprobe:usage", false,
|
||||
argv[0], args);
|
||||
free(args);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (orteprobe_globals.version) {
|
||||
/* show version message */
|
||||
printf("...showing off my version!\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt to parse the probe's name and save in proc_info
|
||||
*/
|
||||
if (orteprobe_globals.name_string) {
|
||||
ret = orte_ns_base_convert_string_to_process_name(
|
||||
&orte_process_info.my_name, orteprobe_globals.name_string);
|
||||
if(ORTE_SUCCESS != ret) {
|
||||
fprintf(stderr, "Couldn't convert environmental string to probe's process name\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt to parse the requestor's name and contact info
|
||||
*/
|
||||
if (orteprobe_globals.requestor_string) {
|
||||
if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(
|
||||
orteprobe_globals.requestor_string, &requestor, NULL))) {
|
||||
fprintf(stderr, "Couldn't parse environmental string for requestor's contact info\n");
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "No contact info received for requestor\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Open up the output streams */
|
||||
if (!ompi_output_init()) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
* If threads are supported - assume that we are using threads - and reset otherwise.
|
||||
*/
|
||||
ompi_set_using_threads(OMPI_HAVE_THREAD_SUPPORT);
|
||||
|
||||
/* For malloc debugging */
|
||||
ompi_malloc_init();
|
||||
|
||||
/* Ensure the universe_info structure is instantiated and initialized */
|
||||
if (ORTE_SUCCESS != (ret = orte_univ_info())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Ensure the system_info structure is instantiated and initialized */
|
||||
if (ORTE_SUCCESS != (ret = orte_sys_info())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Ensure the process info structure is instantiated and initialized */
|
||||
if (ORTE_SUCCESS != (ret = orte_proc_info())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the MCA framework
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = mca_base_open())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the data packing service.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_dps_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Open the name services to ensure access to local functions
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = orte_ns_base_open())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Open the error manager to activate error logging - needs local name services */
|
||||
if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/***** ERROR LOGGING NOW AVAILABLE *****/
|
||||
|
||||
/*
|
||||
* Initialize the event library
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = ompi_event_init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Intialize the general progress engine
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = ompi_progress_init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Internal startup
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = orte_wait_init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Runtime Messaging Layer
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = orte_rml_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Runtime Messaging Layer
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = orte_rml_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Registry
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_gpr_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize schema utilities
|
||||
*/
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_schema_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* see if a universe already exists on this machine */
|
||||
if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) {
|
||||
/* universe is here! send info back and die */
|
||||
}
|
||||
|
||||
/* existing universe is not here or does not allow contact.
|
||||
* ensure we have a unique universe name, fork/exec an appropriate
|
||||
* daemon, and then tell whomever spawned us how to talk to the new
|
||||
* daemon
|
||||
*/
|
||||
|
||||
|
||||
/* cleanup */
|
||||
if (NULL != contact_path) {
|
||||
unlink(contact_path);
|
||||
}
|
||||
if (NULL != log_path) {
|
||||
unlink(log_path);
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &ret, 1, ORTE_INT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit(1);
|
||||
}
|
||||
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
|
||||
/* finalize the system */
|
||||
orte_finalize();
|
||||
|
||||
exit(0);
|
||||
}
|
62
src/tools/orteprobe/orteprobe.h
Обычный файл
62
src/tools/orteprobe/orteprobe.h
Обычный файл
@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTEPROBE_H
|
||||
#define ORTEPROBE_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "class/ompi_list.h"
|
||||
#include "threads/mutex.h"
|
||||
#include "threads/condition.h"
|
||||
|
||||
#include "util/cmd_line.h"
|
||||
#include "mca/mca.h"
|
||||
|
||||
/*
 * Definitions needed for communication
 *
 * ORTE_DAEMON_CMD aliases ORTE_INT16; presumably this is the data type
 * tag used when a command flag is packed into a buffer - TODO confirm
 * against the daemon code.  The remaining macros are the command flag
 * values themselves.
 */
#define ORTE_DAEMON_CMD                 ORTE_INT16

#define ORTE_DAEMON_HOSTFILE_CMD        0x01
#define ORTE_DAEMON_SCRIPTFILE_CMD      0x02
#define ORTE_DAEMON_CONTACT_QUERY_CMD   0x03
#define ORTE_DAEMON_HEARTBEAT_CMD       0xfe
#define ORTE_DAEMON_EXIT_CMD            0xff
|
||||
|
||||
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
|
||||
typedef uint16_t orte_daemon_cmd_flag_t;
|
||||
|
||||
typedef struct {
|
||||
bool help;
|
||||
bool version;
|
||||
bool debug;
|
||||
char* name_string;
|
||||
char* requestor_string;
|
||||
ompi_mutex_t mutex;
|
||||
ompi_condition_t condition;
|
||||
bool exit_condition;
|
||||
} orteprobe_globals_t;
|
||||
|
||||
extern orteprobe_globals_t orteprobe_globals;
|
||||
|
||||
#endif /* ORTEPROBE_H */
|
@ -85,7 +85,6 @@ struct globals_t {
|
||||
bool verbose;
|
||||
bool exit;
|
||||
bool no_wait_for_job_completion;
|
||||
bool debug;
|
||||
size_t num_procs;
|
||||
int exit_status;
|
||||
char *hostfile;
|
||||
@ -113,9 +112,6 @@ ompi_cmd_line_init_t cmd_line_init[] = {
|
||||
{ NULL, NULL, NULL, '\0', NULL, "version", 0,
|
||||
&orterun_globals.version, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Show the orterun version" },
|
||||
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
|
||||
&orterun_globals.debug, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging" },
|
||||
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
|
||||
&orterun_globals.verbose, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Be verbose" },
|
||||
@ -178,6 +174,23 @@ ompi_cmd_line_init_t cmd_line_init[] = {
|
||||
NULL, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"List of hosts to invoke processes on" },
|
||||
|
||||
/* OpenRTE arguments */
|
||||
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
|
||||
NULL, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of OpenRTE" },
|
||||
{ "orte", "debug", "daemons", '\0', NULL, "debug-daemons", 0,
|
||||
NULL, OMPI_CMD_LINE_TYPE_INT,
|
||||
"Enable debugging of any OpenRTE daemons used by this application" },
|
||||
{ "orte", "debug", "daemons_file", '\0', NULL, "debug-daemons-file", 0,
|
||||
NULL, OMPI_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
|
||||
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
|
||||
NULL, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Set the universe name as username@hostname:universe_name for this application" },
|
||||
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
|
||||
&orte_process_info.tmpdir_base, OMPI_CMD_LINE_TYPE_STRING,
|
||||
"Set the root for the session directory tree for orterun ONLY" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OMPI_CMD_LINE_TYPE_NULL, NULL }
|
||||
@ -517,7 +530,6 @@ static int init_globals(void)
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
0,
|
||||
0,
|
||||
NULL,
|
||||
@ -581,11 +593,6 @@ static int parse_globals(int argc, char* argv[])
|
||||
wait_for_job_completion = false;
|
||||
}
|
||||
|
||||
/* debug */
|
||||
if (orterun_globals.debug) {
|
||||
int id = mca_base_param_register_int("debug",NULL,NULL,NULL,0);
|
||||
mca_base_param_set_int(id,orterun_globals.debug);
|
||||
}
|
||||
OBJ_DESTRUCT(&cmd_line);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -815,6 +822,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
} else {
|
||||
asprintf(&value2, "%s=%s", param, value);
|
||||
ompi_argv_append_nosize(&app->env, value2);
|
||||
free(value2);
|
||||
}
|
||||
} else {
|
||||
ompi_output(0, "Warning: could not find environment variable \"%s\"\n", param);
|
||||
|
@ -38,20 +38,48 @@ int orte_daemon_init(char *working_dir)
|
||||
So, I am guessing that this piece of code is called only by UNIX versions */
|
||||
|
||||
pid_t pid;
|
||||
int fd;
|
||||
|
||||
if ((pid = fork()) < 0) {
|
||||
return ORTE_ERROR;
|
||||
return ORTE_ERROR;
|
||||
} else if (pid != 0) {
|
||||
exit(0); /* parent goes bye-bye */
|
||||
exit(0); /* parent goes bye-bye */
|
||||
}
|
||||
|
||||
/* child continues */
|
||||
setsid(); /* become session leader */
|
||||
|
||||
if (NULL != working_dir) {
|
||||
chdir(working_dir); /* change working directory */
|
||||
chdir(working_dir); /* change working directory */
|
||||
}
|
||||
|
||||
umask(0); /* clear file mode creation mask */
|
||||
|
||||
/* connect input to /dev/null */
|
||||
fd = open("/dev/null", O_RDONLY);
|
||||
if(fd > STDIN_FILENO) {
|
||||
dup2(fd, STDIN_FILENO);
|
||||
close(fd);
|
||||
}
|
||||
|
||||
/* connect outputs to /dev/null */
|
||||
fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
|
||||
if (fd >= 0) {
|
||||
dup2(fd, STDOUT_FILENO);
|
||||
dup2(fd, STDERR_FILENO);
|
||||
/* just to be safe, make sure we aren't trying
|
||||
* to close stdout or stderr! since we dup'd both
|
||||
* of them to the same fd, we can't just close it
|
||||
* since one of the two would still be open and
|
||||
* someone could attempt to use it.
|
||||
*/
|
||||
if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
|
||||
close(fd);
|
||||
}
|
||||
} else {
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
#else
|
||||
printf ("This function has not been implemented in windows yet, file %s line %d\n", __FILE__, __LINE__);
|
||||
|
@ -356,7 +356,7 @@ orte_session_dir_finalize(orte_process_name_t *proc)
|
||||
int rc;
|
||||
char *tmp;
|
||||
char *job, *job_session_dir, *vpid, *proc_session_dir;
|
||||
|
||||
|
||||
/* need to setup the top_session_dir with the prefix */
|
||||
tmp = strdup(orte_os_path(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
|
@ -91,7 +91,7 @@ int orte_sys_info(void)
|
||||
return ORTE_ERROR;
|
||||
} else {
|
||||
orte_system_info.sysname = strdup(sys_info.sysname);
|
||||
if(NULL == orte_system_info.nodename) {
|
||||
if (NULL == orte_system_info.nodename) {
|
||||
orte_system_info.nodename = strdup(sys_info.nodename);
|
||||
}
|
||||
orte_system_info.release = strdup(sys_info.release);
|
||||
|
@ -32,6 +32,10 @@
|
||||
#include "include/orte_constants.h"
|
||||
#include "mca/base/base.h"
|
||||
#include "mca/base/mca_base_param.h"
|
||||
#include "mca/ns/ns_types.h"
|
||||
#include "util/output.h"
|
||||
#include "util/proc_info.h"
|
||||
#include "util/sys_info.h"
|
||||
|
||||
#include "util/univ_info.h"
|
||||
|
||||
@ -53,19 +57,51 @@ orte_universe_t orte_universe_info = {
|
||||
int orte_univ_info(void)
|
||||
{
|
||||
int id, tmp;
|
||||
char *tmpname=NULL, *tptr, *ptr;
|
||||
|
||||
if (!orte_universe_info.init) {
|
||||
id = mca_base_param_register_string("universe", "path", NULL, NULL, orte_universe_info.path);
|
||||
mca_base_param_lookup_string(id, &(orte_universe_info.path));
|
||||
|
||||
id = mca_base_param_register_string("universe", "name", NULL, NULL, orte_universe_info.name);
|
||||
mca_base_param_lookup_string(id, &(orte_universe_info.name));
|
||||
|
||||
id = mca_base_param_register_string("universe", "host", NULL, NULL, orte_universe_info.host);
|
||||
mca_base_param_lookup_string(id, &(orte_universe_info.host));
|
||||
|
||||
/* uid is not set via parameter, but is determined elsewhere */
|
||||
id = mca_base_param_register_string("universe", NULL, NULL, NULL, NULL);
|
||||
mca_base_param_lookup_string(id, &tmpname);
|
||||
|
||||
if (NULL != tmpname) {
|
||||
/* Universe name info is passed as userid@hostname:univ_name */
|
||||
/* extract the userid from the universe option, if provided */
|
||||
tptr = tmpname;
|
||||
if (NULL != (ptr = strchr(tptr, '@'))) {
|
||||
*ptr = '\0';
|
||||
orte_universe_info.uid = strdup(tptr);
|
||||
ptr++;
|
||||
tptr = ptr;
|
||||
} else {
|
||||
if (NULL == orte_system_info.user) {
|
||||
orte_sys_info();
|
||||
}
|
||||
orte_universe_info.uid = strdup(orte_system_info.user);
|
||||
}
|
||||
|
||||
/* extract the hostname, if provided */
|
||||
if (NULL != (ptr = strchr(tptr, ':'))) {
|
||||
*ptr = '\0';
|
||||
orte_universe_info.host = strdup(tptr);
|
||||
ptr++;
|
||||
tptr = ptr;
|
||||
} else {
|
||||
orte_universe_info.host = strdup(orte_system_info.nodename);
|
||||
}
|
||||
|
||||
/* now copy the universe name into the universe_info structure */
|
||||
orte_universe_info.name = strdup(tptr);
|
||||
} else {
|
||||
/* if nothing was provided, then initialize the user and nodename
|
||||
* to the local values
|
||||
*/
|
||||
orte_universe_info.uid = strdup(orte_system_info.user);
|
||||
orte_universe_info.host = strdup(orte_system_info.nodename);
|
||||
}
|
||||
|
||||
id = mca_base_param_register_int("universe", "persistence", NULL, NULL, orte_universe_info.persistence);
|
||||
mca_base_param_lookup_int(id, &tmp);
|
||||
orte_universe_info.persistence = (tmp ? true : false);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user