1
1

Per request from Aurelien, make orterun's report-pid and report-uri options work the same way as ompi-server's. Since these are used for ompi-server-like functionality, it makes sense that the report options work the same. Make orte-top take the corresponding input the same way too, for consistency.

The modified cmd line options are:

--report-uri x where x is either '-' for stdout, '+' for stderr, or a filename
--report-pid x where x is the same as above

For orte-top, you can now provide either a pid or a uri (which allows connection to remote mpiruns), specified either directly or with a "file:x" option as per mpirun's ompi-server option.

Note: I did not add a report-pid option to ompi-server as it probably wouldn't be useful - the report-uri option works as well, and allows remote access (which is likely the normal way it would be used).

This commit was SVN r20168.
Этот коммит содержится в:
Ralph Castain 2008-12-24 15:27:46 +00:00
родитель 213daa58da
Коммит bb96474d6e
7 изменённых файлов: 288 добавлений и 81 удалений

Просмотреть файл

@ -32,6 +32,44 @@ Pid provided: %d
#
[orte-top:pid-required]
This tool requires that you specify the pid of the mpirun executing
This tool requires that you specify contact info for the mpirun executing
the specified rank(s). Please use the --help option for more information.
#
[orte-top:hnp-filename-bad]
We are unable to parse the filename where contact info for the
mpirun to be contacted was to be found. The option we were given was:
--%s %s
This appears to be missing the required ':' following the
keyword "file". Please use the --help option for more information on
the correct format for this command line option.
#
[orte-top:hnp-filename-access]
We are unable to access the filename where contact info for the
mpirun to be contacted was to be found. The filename we were given was:
File: %s
Please use the --help option for more information on
the correct format for this command line option.
#
[orte-top:hnp-file-bad]
We are unable to read the mpirun's contact info from the
given filename. The filename we were given was:
FILE: %s
Please use the --help option for more information on
the correct format for this command line option.
#
[orte-top:hnp-uri-bad]
We are unable to correctly parse the mpirun's contact info. The uri we were given was:
URI: %s
Please remember that this is *not* a standard uri, but
a special format used internally by Open MPI for communications. It can
best be generated by simply directing mpirun to put its
uri in a file, and then giving us that filename.

Просмотреть файл

@ -38,8 +38,16 @@ Display help for this command
.
.TP
.B -pid | --pid \fR<value>\fP
The pid of the mpirun whose processes you want information about. Note that
the ompi-top command must be executed on the same node as mpirun.
The pid of the mpirun whose processes you want information about, or the name
of the file (specified as file:filename) that contains that info. Note that
the ompi-top command must be executed on the same node as mpirun to use this option.
.
.
.TP
.B -uri | --uri \fR<value>\fP
Specify the URI of the mpirun whose processes you want information about, or the name
of the file (specified as file:filename) that contains that info. Note that
the ompi-top command does not have to be executed on the same node as mpirun to use this option.
.
.
.TP

Просмотреть файл

@ -53,6 +53,7 @@
#include "orte/util/hnp_contact.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rml/base/rml_contact.h"
/*
* Local variables & functions
@ -69,6 +70,7 @@ static opal_event_t *my_exit_event;
static FILE *fp = NULL;
static bool help;
static char *hnppidstr;
static char *hnpuristr;
static char *ranks;
static orte_hnp_contact_t *target_hnp;
static int update_rate;
@ -117,6 +119,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
&hnppidstr, OPAL_CMD_LINE_TYPE_STRING,
"The pid of the mpirun that you wish to query/monitor" },
{ NULL, NULL, NULL,
'\0', "uri", "uri",
1,
&hnpuristr, OPAL_CMD_LINE_TYPE_STRING,
"The uri of the mpirun that you wish to query/monitor" },
{ NULL, NULL, NULL,
'\0', "rank", "rank",
1,
@ -245,28 +253,6 @@ main(int argc, char *argv[])
return ORTE_ERROR;
}
/*
* Must specify the mpirun pid
*/
if (NULL == hnppidstr) {
orte_show_help("help-orte-top.txt", "orte-top:pid-required", true);
return ORTE_ERROR;
}
/* convert the pid */
hnppid = strtoul(hnppidstr, NULL, 10);
/* if an output file was specified, open it */
if (NULL != logfile) {
fp = fopen(logfile, "w");
if (NULL == fp) {
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
return ORTE_ERROR;
}
} else {
fp = stdout;
}
/***************************
* We need all of OPAL and the TOOL portion of ORTE
***************************/
@ -279,9 +265,12 @@ main(int argc, char *argv[])
if (ORTE_SUCCESS != orte_wait_event(&my_exit_event, &orte_exit, "job_complete", abort_exit_callback)) {
orte_finalize();
return 1;
exit(1);
}
/* setup the list for recvd stats */
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
/** setup callbacks for abort signals - from this point
* forward, we need to abort in a manner that allows us
* to cleanup
@ -293,16 +282,63 @@ main(int argc, char *argv[])
abort_exit_callback, &int_handler);
opal_signal_add(&int_handler, NULL);
/* setup the list for recvd stats */
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
/*
* Must specify the mpirun pid
*/
if (NULL != hnppidstr) {
if (0 == strncmp(hnppidstr, "file", strlen("file")) ||
0 == strncmp(hnppidstr, "FILE", strlen("FILE"))) {
char input[1024], *filename;
FILE *fp;
/* it is a file - get the filename */
filename = strchr(hnppidstr, ':');
if (NULL == filename) {
/* filename is not correctly formatted */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
orte_finalize();
exit(1);
}
++filename; /* space past the : */
if (0 >= strlen(filename)) {
/* they forgot to give us the name! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
orte_finalize();
exit(1);
}
/* open the file and extract the pid */
fp = fopen(filename, "r");
if (NULL == fp) { /* can't find or read file! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
orte_finalize();
exit(1);
}
if (NULL == fgets(input, 1024, fp)) {
/* something malformed about file */
fclose(fp);
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
orte_finalize();
exit(1);
}
fclose(fp);
input[strlen(input)-1] = '\0'; /* remove newline */
/* convert the pid */
hnppid = strtoul(input, NULL, 10);
} else {
/* should just be the pid itself */
hnppid = strtoul(hnppidstr, NULL, 10);
}
/*
* Get the list of available hnp's and setup contact info
* to them in the RML
*/
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
goto cleanup;
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
orte_finalize();
exit(1);
}
/*
@ -313,20 +349,105 @@ main(int argc, char *argv[])
if (hnppid == hnp->pid) {
/* this is the one we want */
target_hnp = hnp;
break;
/* let it continue to run so we deconstruct the list */
continue;
}
OBJ_RELEASE(hnp);
}
OBJ_DESTRUCT(&hnp_list);
/* if we get here without finding the one we wanted, then abort */
if (NULL == target_hnp) {
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
goto cleanup;
orte_finalize();
exit(1);
}
} else if (NULL != hnpuristr) {
if (0 == strncmp(hnpuristr, "file", strlen("file")) ||
0 == strncmp(hnpuristr, "FILE", strlen("FILE"))) {
char input[1024], *filename;
FILE *fp;
/* it is a file - get the filename */
filename = strchr(hnpuristr, ':');
if (NULL == filename) {
/* filename is not correctly formatted */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
orte_finalize();
exit(1);
}
++filename; /* space past the : */
if (0 >= strlen(filename)) {
/* they forgot to give us the name! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
orte_finalize();
exit(1);
}
/* open the file and extract the uri */
fp = fopen(filename, "r");
if (NULL == fp) { /* can't find or read file! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
orte_finalize();
exit(1);
}
if (NULL == fgets(input, 1024, fp)) {
/* something malformed about file */
fclose(fp);
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
orte_finalize();
exit(1);
}
fclose(fp);
input[strlen(input)-1] = '\0'; /* remove newline */
/* construct the target hnp info */
target_hnp = OBJ_NEW(orte_hnp_contact_t);
target_hnp->rml_uri = strdup(input);
} else {
/* should just be the uri itself - construct the target hnp info */
target_hnp = OBJ_NEW(orte_hnp_contact_t);
target_hnp->rml_uri = strdup(hnpuristr);
}
/* set the info in our contact table */
if (ORTE_SUCCESS != orte_rml.set_contact_info(target_hnp->rml_uri)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
}
/* extract the name */
if (ORTE_SUCCESS != orte_rml_base_parse_uris(target_hnp->rml_uri, &target_hnp->name, NULL)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
}
/* set the route to be direct */
if (ORTE_SUCCESS != orte_routed.update_route(&target_hnp->name, &target_hnp->name)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
}
} else {
orte_show_help("help-orte-top.txt", "orte-top:no-contact-given", true);
orte_finalize();
exit(1);
}
/* set the target hnp as our lifeline so we will terminate if it exits */
orte_routed.set_lifeline(&target_hnp->name);
/* if an output file was specified, open it */
if (NULL != logfile) {
fp = fopen(logfile, "w");
if (NULL == fp) {
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
orte_finalize();
exit(1);
}
} else {
fp = stdout;
}
/* setup a non-blocking recv to get answers - we don't know how
* many daemons are going to send replies, so we just have to
* accept whatever comes back
@ -415,10 +536,6 @@ cleanup:
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&hnp_list);
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
OBJ_RELEASE(item);
}
@ -440,10 +557,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&hnp_list);
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
OBJ_RELEASE(item);
}

Просмотреть файл

@ -420,4 +420,9 @@ Please remember that the correct format for this command line option is:
--ompi-server PID:pid-of-%s
where PID can be either "PID" or "pid".
#
[orterun:write_file]
%s was unable to open a file to print out %s as requested. The file
name given was:
File: %s

Просмотреть файл

@ -306,11 +306,21 @@ Print out mpirun's URI during startup.
.
.
.TP
.B -report-uri-file\fR,\fP --report-uri-file <filename>
Print out mpirun's URI to the specified file during startup.
.
.
.TP
.B -report-pid\fR,\fP --report-pid
Print out mpirun's PID during startup.
.
.
.TP
.B -report-pid-file\fR,\fP --report-pid-file <filename>
Print out mpirun's PID to the specified file during startup.
.
.
.TP
.B -rf \f |--rankfile <arg0>\fP.
Provide a rankfile file.
.

Просмотреть файл

@ -137,12 +137,13 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
&orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
"Suppress helpful messages" },
{ NULL, NULL, NULL, '\0', "report-pid", "report-pid", 0,
&orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_BOOL,
"Printout pid" },
{ NULL, NULL, NULL, '\0', "report-uri", "report-uri", 0,
&orterun_globals.report_uri, OPAL_CMD_LINE_TYPE_BOOL,
"Printout URI" },
{ NULL, NULL, NULL, '\0', "report-pid", "report-pid", 1,
&orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_STRING,
"Printout pid on stdout [-], stderr [+], or a file [anything else]" },
{ NULL, NULL, NULL, '\0', "report-uri", "report-uri", 1,
&orterun_globals.report_uri, OPAL_CMD_LINE_TYPE_STRING,
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
/* hetero apps */
{ "orte", "hetero", "apps", '\0', NULL, "hetero", 0,
@ -495,12 +496,28 @@ int orterun(int argc, char *argv[])
}
/* check for request to report uri */
if (orterun_globals.report_uri) {
char *uri;
uri = orte_rml.get_contact_info();
printf("%s uri: %s\n", orterun_basename, (NULL == uri) ? "NULL" : uri);
if (NULL != uri) {
free(uri);
if (NULL != orterun_globals.report_uri) {
FILE *fp;
char *rml_uri;
rml_uri = orte_rml.get_contact_info();
if (0 == strcmp(orterun_globals.report_uri, "-")) {
/* if '-', then output to stdout */
printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
} else if (0 == strcmp(orterun_globals.report_uri, "+")) {
/* if '+', output to stderr */
fprintf(stderr, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
} else {
fp = fopen(orterun_globals.report_uri, "w");
if (NULL == fp) {
orte_show_help("help-orterun.txt", "orterun:write_file", false,
orterun_basename, "uri", orterun_globals.report_uri);
exit(0);
}
fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
fclose(fp);
}
if (NULL != rml_uri) {
free(rml_uri);
}
}
@ -1179,6 +1196,8 @@ static int init_globals(void)
orterun_globals.wait_for_server = false;
orterun_globals.server_wait_timeout = 10;
orterun_globals.stdin_target = "0";
orterun_globals.report_pid = NULL;
orterun_globals.report_uri = NULL;
}
/* Reset the other fields every time */
@ -1187,8 +1206,6 @@ static int init_globals(void)
orterun_globals.version = false;
orterun_globals.verbose = false;
orterun_globals.quiet = false;
orterun_globals.report_pid = false;
orterun_globals.report_uri = false;
orterun_globals.by_node = false;
orterun_globals.by_slot = false;
orterun_globals.debugger = false;
@ -1258,8 +1275,24 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
}
/* check for request to report pid */
if (orterun_globals.report_pid) {
printf("%s pid: %d\n", orterun_basename, (int)getpid());
if (NULL != orterun_globals.report_pid) {
FILE *fp;
if (0 == strcmp(orterun_globals.report_pid, "-")) {
/* if '-', then output to stdout */
printf("%d\n", (int)getpid());
} else if (0 == strcmp(orterun_globals.report_pid, "+")) {
/* if '+', output to stderr */
fprintf(stderr, "%d\n", (int)getpid());
} else {
fp = fopen(orterun_globals.report_pid, "w");
if (NULL == fp) {
orte_show_help("help-orterun.txt", "orterun:write_file", false,
orterun_basename, "pid", orterun_globals.report_pid);
exit(0);
}
fprintf(fp, "%d\n", (int)getpid());
fclose(fp);
}
}
/* Do we want a user-level debugger? */

Просмотреть файл

@ -42,8 +42,8 @@ struct orterun_globals_t {
bool version;
bool verbose;
bool quiet;
bool report_pid;
bool report_uri;
char *report_pid;
char *report_uri;
bool exit;
bool by_node;
bool by_slot;