1
1

Per request from Aurelien, make orterun's report-pid and report-uri functions work the same as those of ompi-server. Since these are used for ompi-server-like functionality, it makes sense that the report options work the same. Also make orte-top take the corresponding input the same way, for consistency.

The modified cmd line options are:

--report-uri x where x is either '-' for stdout, '+' for stderr, or a filename
--report-pid x where x is the same as above

For orte-top, you can now provide either a pid or a uri (which allows connection to remote mpiruns), specified either directly or with a "file:x" option as per mpirun's ompi-server option.

Note: I did not add a report-pid option to ompi-server as it probably wouldn't be useful - the report-uri option works as well, and allows remote access (which is likely the normal way it would be used).

This commit was SVN r20168.
Этот коммит содержится в:
Ralph Castain 2008-12-24 15:27:46 +00:00
родитель 213daa58da
Коммит bb96474d6e
7 изменённых файлов: 288 добавлений и 81 удалений

Просмотреть файл

@ -32,6 +32,44 @@ Pid provided: %d
# #
[orte-top:pid-required] [orte-top:pid-required]
This tool requires that you specify the pid of the mpirun executing This tool requires that you specify contact info for the mpirun executing
the specified rank(s). Please use the --help option for more information. the specified rank(s). Please use the --help option for more information.
#
[orte-top:hnp-filename-bad]
We are unable to parse the filename where contact info for the
mpirun to be contacted was to be found. The option we were given was:
--%s %s
This appears to be missing the required ':' following the
keyword "file". Please use the --help option for more information on
the correct format for this command line option.
#
[orte-top:hnp-filename-access]
We are unable to access the filename where contact info for the
mpirun to be contacted was to be found. The filename we were given was:
File: %s
Please use the --help option for more information on
the correct format for this command line option.
#
[orte-top:hnp-file-bad]
We are unable to read the mpirun's contact info from the
given filename. The filename we were given was:
FILE: %s
Please use the --help option for more information on
the correct format for this command line option.
#
[orte-top:hnp-uri-bad]
We are unable to correctly parse the mpirun's contact info. The uri we were given was:
URI: %s
Please remember that this is *not* a standard uri, but
a special format used internally by Open MPI for communications. It can
best be generated by simply directing mpirun to put its
uri in a file, and then giving us that filename.

Просмотреть файл

@ -38,8 +38,16 @@ Display help for this command
. .
.TP .TP
.B -pid | --pid \fR<value>\fP .B -pid | --pid \fR<value>\fP
The pid of the mpirun whose processes you want information about. Note that The pid of the mpirun whose processes you want information about, or the name
the ompi-top command must be executed on the same node as mpirun. of the file (specified as file:filename) that contains that info. Note that
the ompi-top command must be executed on the same node as mpirun to use this option.
.
.
.TP
.B -uri | --uri \fR<value>\fP
Specify the URI of the mpirun whose processes you want information about, or the name
of the file (specified as file:filename) that contains that info. Note that
the ompi-top command does not have to be executed on the same node as mpirun to use this option.
. .
. .
.TP .TP

Просмотреть файл

@ -53,6 +53,7 @@
#include "orte/util/hnp_contact.h" #include "orte/util/hnp_contact.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/mca/rml/base/rml_contact.h"
/* /*
* Local variables & functions * Local variables & functions
@ -69,6 +70,7 @@ static opal_event_t *my_exit_event;
static FILE *fp = NULL; static FILE *fp = NULL;
static bool help; static bool help;
static char *hnppidstr; static char *hnppidstr;
static char *hnpuristr;
static char *ranks; static char *ranks;
static orte_hnp_contact_t *target_hnp; static orte_hnp_contact_t *target_hnp;
static int update_rate; static int update_rate;
@ -117,6 +119,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
&hnppidstr, OPAL_CMD_LINE_TYPE_STRING, &hnppidstr, OPAL_CMD_LINE_TYPE_STRING,
"The pid of the mpirun that you wish to query/monitor" }, "The pid of the mpirun that you wish to query/monitor" },
{ NULL, NULL, NULL,
'\0', "uri", "uri",
1,
&hnpuristr, OPAL_CMD_LINE_TYPE_STRING,
"The uri of the mpirun that you wish to query/monitor" },
{ NULL, NULL, NULL, { NULL, NULL, NULL,
'\0', "rank", "rank", '\0', "rank", "rank",
1, 1,
@ -245,28 +253,6 @@ main(int argc, char *argv[])
return ORTE_ERROR; return ORTE_ERROR;
} }
/*
* Must specify the mpirun pid
*/
if (NULL == hnppidstr) {
orte_show_help("help-orte-top.txt", "orte-top:pid-required", true);
return ORTE_ERROR;
}
/* convert the pid */
hnppid = strtoul(hnppidstr, NULL, 10);
/* if an output file was specified, open it */
if (NULL != logfile) {
fp = fopen(logfile, "w");
if (NULL == fp) {
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
return ORTE_ERROR;
}
} else {
fp = stdout;
}
/*************************** /***************************
* We need all of OPAL and the TOOL portion of ORTE * We need all of OPAL and the TOOL portion of ORTE
***************************/ ***************************/
@ -276,12 +262,15 @@ main(int argc, char *argv[])
} }
OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t); OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t);
if (ORTE_SUCCESS != orte_wait_event(&my_exit_event, &orte_exit, "job_complete", abort_exit_callback)) { if (ORTE_SUCCESS != orte_wait_event(&my_exit_event, &orte_exit, "job_complete", abort_exit_callback)) {
orte_finalize(); orte_finalize();
return 1; exit(1);
} }
/* setup the list for recvd stats */
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
/** setup callbacks for abort signals - from this point /** setup callbacks for abort signals - from this point
* forward, we need to abort in a manner that allows us * forward, we need to abort in a manner that allows us
* to cleanup * to cleanup
@ -293,40 +282,172 @@ main(int argc, char *argv[])
abort_exit_callback, &int_handler); abort_exit_callback, &int_handler);
opal_signal_add(&int_handler, NULL); opal_signal_add(&int_handler, NULL);
/* setup the list for recvd stats */
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
/* /*
* Get the list of available hnp's and setup contact info * Must specify the mpirun pid
* to them in the RML
*/ */
OBJ_CONSTRUCT(&hnp_list, opal_list_t); if (NULL != hnppidstr) {
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) { if (0 == strncmp(hnppidstr, "file", strlen("file")) ||
goto cleanup; 0 == strncmp(hnppidstr, "FILE", strlen("FILE"))) {
} char input[1024], *filename;
FILE *fp;
/*
* For each hnp in the listing /* it is a file - get the filename */
*/ filename = strchr(hnppidstr, ':');
while (NULL != (item = opal_list_remove_first(&hnp_list))) { if (NULL == filename) {
orte_hnp_contact_t *hnp = (orte_hnp_contact_t*)item; /* filename is not correctly formatted */
if (hnppid == hnp->pid) { orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
/* this is the one we want */ orte_finalize();
target_hnp = hnp; exit(1);
break; }
++filename; /* space past the : */
if (0 >= strlen(filename)) {
/* they forgot to give us the name! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
orte_finalize();
exit(1);
}
/* open the file and extract the pid */
fp = fopen(filename, "r");
if (NULL == fp) { /* can't find or read file! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
orte_finalize();
exit(1);
}
if (NULL == fgets(input, 1024, fp)) {
/* something malformed about file */
fclose(fp);
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
orte_finalize();
exit(1);
}
fclose(fp);
input[strlen(input)-1] = '\0'; /* remove newline */
/* convert the pid */
hnppid = strtoul(input, NULL, 10);
} else {
/* should just be the pid itself */
hnppid = strtoul(hnppidstr, NULL, 10);
} }
OBJ_RELEASE(hnp); /*
* Get the list of available hnp's and setup contact info
* to them in the RML
*/
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
orte_finalize();
exit(1);
}
/*
* For each hnp in the listing
*/
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
orte_hnp_contact_t *hnp = (orte_hnp_contact_t*)item;
if (hnppid == hnp->pid) {
/* this is the one we want */
target_hnp = hnp;
/* let it continue to run so we deconstruct the list */
continue;
}
OBJ_RELEASE(hnp);
}
OBJ_DESTRUCT(&hnp_list);
/* if we get here without finding the one we wanted, then abort */
if (NULL == target_hnp) {
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
orte_finalize();
exit(1);
}
} else if (NULL != hnpuristr) {
if (0 == strncmp(hnpuristr, "file", strlen("file")) ||
0 == strncmp(hnpuristr, "FILE", strlen("FILE"))) {
char input[1024], *filename;
FILE *fp;
/* it is a file - get the filename */
filename = strchr(hnpuristr, ':');
if (NULL == filename) {
/* filename is not correctly formatted */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
orte_finalize();
exit(1);
}
++filename; /* space past the : */
if (0 >= strlen(filename)) {
/* they forgot to give us the name! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
orte_finalize();
exit(1);
}
/* open the file and extract the uri */
fp = fopen(filename, "r");
if (NULL == fp) { /* can't find or read file! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
orte_finalize();
exit(1);
}
if (NULL == fgets(input, 1024, fp)) {
/* something malformed about file */
fclose(fp);
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
orte_finalize();
exit(1);
}
fclose(fp);
input[strlen(input)-1] = '\0'; /* remove newline */
/* construct the target hnp info */
target_hnp = OBJ_NEW(orte_hnp_contact_t);
target_hnp->rml_uri = strdup(input);
} else {
/* should just be the uri itself - construct the target hnp info */
target_hnp = OBJ_NEW(orte_hnp_contact_t);
target_hnp->rml_uri = strdup(hnpuristr);
}
/* set the info in our contact table */
if (ORTE_SUCCESS != orte_rml.set_contact_info(target_hnp->rml_uri)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
}
/* extract the name */
if (ORTE_SUCCESS != orte_rml_base_parse_uris(target_hnp->rml_uri, &target_hnp->name, NULL)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
}
/* set the route to be direct */
if (ORTE_SUCCESS != orte_routed.update_route(&target_hnp->name, &target_hnp->name)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
}
} else {
orte_show_help("help-orte-top.txt", "orte-top:no-contact-given", true);
orte_finalize();
exit(1);
} }
/* if we get here without finding the one we wanted, then abort */
if (NULL == target_hnp) {
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
goto cleanup;
}
/* set the target hnp as our lifeline so we will terminate if it exits */ /* set the target hnp as our lifeline so we will terminate if it exits */
orte_routed.set_lifeline(&target_hnp->name); orte_routed.set_lifeline(&target_hnp->name);
/* if an output file was specified, open it */
if (NULL != logfile) {
fp = fopen(logfile, "w");
if (NULL == fp) {
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
orte_finalize();
exit(1);
}
} else {
fp = stdout;
}
/* setup a non-blocking recv to get answers - we don't know how /* setup a non-blocking recv to get answers - we don't know how
* many daemons are going to send replies, so we just have to * many daemons are going to send replies, so we just have to
* accept whatever comes back * accept whatever comes back
@ -415,10 +536,6 @@ cleanup:
opal_signal_del(&term_handler); opal_signal_del(&term_handler);
opal_signal_del(&int_handler); opal_signal_del(&int_handler);
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&hnp_list);
while (NULL != (item = opal_list_remove_first(&recvd_stats))) { while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }
@ -440,10 +557,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
opal_signal_del(&term_handler); opal_signal_del(&term_handler);
opal_signal_del(&int_handler); opal_signal_del(&int_handler);
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&hnp_list);
while (NULL != (item = opal_list_remove_first(&recvd_stats))) { while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
OBJ_RELEASE(item); OBJ_RELEASE(item);
} }

Просмотреть файл

@ -420,4 +420,9 @@ Please remember that the correct format for this command line option is:
--ompi-server PID:pid-of-%s --ompi-server PID:pid-of-%s
where PID can be either "PID" or "pid". where PID can be either "PID" or "pid".
#
[orterun:write_file]
%s was unable to open a file to print out %s as requested. The file
name given was:
File: %s

Просмотреть файл

@ -306,11 +306,21 @@ Print out mpirun's URI during startup.
. .
. .
.TP .TP
.B -report-uri-file\fR,\fP --report-uri-file <filename>
Print out mpirun's URI to the specified file during startup.
.
.
.TP
.B -report-pid\fR,\fP --report-pid .B -report-pid\fR,\fP --report-pid
Print out mpirun's PID during startup. Print out mpirun's PID during startup.
. .
. .
.TP .TP
.B -report-pid-file\fR,\fP --report-pid-file <filename>
Print out mpirun's PID to the specified file during startup.
.
.
.TP
.B -rf \f |--rankfile <arg0>\fP. .B -rf \f |--rankfile <arg0>\fP.
Provide a rankfile file. Provide a rankfile file.
. .

Просмотреть файл

@ -137,13 +137,14 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0, { NULL, NULL, NULL, 'q', NULL, "quiet", 0,
&orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL, &orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
"Suppress helpful messages" }, "Suppress helpful messages" },
{ NULL, NULL, NULL, '\0', "report-pid", "report-pid", 0, { NULL, NULL, NULL, '\0', "report-pid", "report-pid", 1,
&orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_BOOL, &orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_STRING,
"Printout pid" }, "Printout pid on stdout [-], stderr [+], or a file [anything else]" },
{ NULL, NULL, NULL, '\0', "report-uri", "report-uri", 0, { NULL, NULL, NULL, '\0', "report-uri", "report-uri", 1,
&orterun_globals.report_uri, OPAL_CMD_LINE_TYPE_BOOL, &orterun_globals.report_uri, OPAL_CMD_LINE_TYPE_STRING,
"Printout URI" }, "Printout URI on stdout [-], stderr [+], or a file [anything else]" },
/* hetero apps */ /* hetero apps */
{ "orte", "hetero", "apps", '\0', NULL, "hetero", 0, { "orte", "hetero", "apps", '\0', NULL, "hetero", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
@ -495,13 +496,29 @@ int orterun(int argc, char *argv[])
} }
/* check for request to report uri */ /* check for request to report uri */
if (orterun_globals.report_uri) { if (NULL != orterun_globals.report_uri) {
char *uri; FILE *fp;
uri = orte_rml.get_contact_info(); char *rml_uri;
printf("%s uri: %s\n", orterun_basename, (NULL == uri) ? "NULL" : uri); rml_uri = orte_rml.get_contact_info();
if (NULL != uri) { if (0 == strcmp(orterun_globals.report_uri, "-")) {
free(uri); /* if '-', then output to stdout */
printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
} else if (0 == strcmp(orterun_globals.report_uri, "+")) {
/* if '+', output to stderr */
fprintf(stderr, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
} else {
fp = fopen(orterun_globals.report_uri, "w");
if (NULL == fp) {
orte_show_help("help-orterun.txt", "orterun:write_file", false,
orterun_basename, "uri", orterun_globals.report_uri);
exit(0);
}
fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
fclose(fp);
} }
if (NULL != rml_uri) {
free(rml_uri);
}
} }
/* Change the default behavior of libevent such that we want to /* Change the default behavior of libevent such that we want to
@ -1179,6 +1196,8 @@ static int init_globals(void)
orterun_globals.wait_for_server = false; orterun_globals.wait_for_server = false;
orterun_globals.server_wait_timeout = 10; orterun_globals.server_wait_timeout = 10;
orterun_globals.stdin_target = "0"; orterun_globals.stdin_target = "0";
orterun_globals.report_pid = NULL;
orterun_globals.report_uri = NULL;
} }
/* Reset the other fields every time */ /* Reset the other fields every time */
@ -1187,8 +1206,6 @@ static int init_globals(void)
orterun_globals.version = false; orterun_globals.version = false;
orterun_globals.verbose = false; orterun_globals.verbose = false;
orterun_globals.quiet = false; orterun_globals.quiet = false;
orterun_globals.report_pid = false;
orterun_globals.report_uri = false;
orterun_globals.by_node = false; orterun_globals.by_node = false;
orterun_globals.by_slot = false; orterun_globals.by_slot = false;
orterun_globals.debugger = false; orterun_globals.debugger = false;
@ -1258,8 +1275,24 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
} }
/* check for request to report pid */ /* check for request to report pid */
if (orterun_globals.report_pid) { if (NULL != orterun_globals.report_pid) {
printf("%s pid: %d\n", orterun_basename, (int)getpid()); FILE *fp;
if (0 == strcmp(orterun_globals.report_pid, "-")) {
/* if '-', then output to stdout */
printf("%d\n", (int)getpid());
} else if (0 == strcmp(orterun_globals.report_pid, "+")) {
/* if '+', output to stderr */
fprintf(stderr, "%d\n", (int)getpid());
} else {
fp = fopen(orterun_globals.report_pid, "w");
if (NULL == fp) {
orte_show_help("help-orterun.txt", "orterun:write_file", false,
orterun_basename, "pid", orterun_globals.report_pid);
exit(0);
}
fprintf(fp, "%d\n", (int)getpid());
fclose(fp);
}
} }
/* Do we want a user-level debugger? */ /* Do we want a user-level debugger? */

Просмотреть файл

@ -42,8 +42,8 @@ struct orterun_globals_t {
bool version; bool version;
bool verbose; bool verbose;
bool quiet; bool quiet;
bool report_pid; char *report_pid;
bool report_uri; char *report_uri;
bool exit; bool exit;
bool by_node; bool by_node;
bool by_slot; bool by_slot;