Per request from Aurelien, make orterun's report-pid and report-uri functions work the same way as those of ompi-server. Since these are used for ompi-server-like functionality, it makes sense that the report options work the same. Make orte-top take the corresponding input the same way too, for consistency.
The modified cmd line options are: --report-uri x, where x is either '-' for stdout, '+' for stderr, or a filename; and --report-pid x, where x is the same as above. For orte-top, you can now provide either a pid or a uri (which allows connection to remote mpiruns), specified either directly or with a "file:x" option as per mpirun's ompi-server option. Note: I did not add a report-pid option to ompi-server as it probably wouldn't be useful - the report-uri option works as well, and allows remote access (which is likely the normal way it would be used). This commit was SVN r20168.
Этот коммит содержится в:
родитель
213daa58da
Коммит
bb96474d6e
@ -32,6 +32,44 @@ Pid provided: %d
|
||||
|
||||
#
|
||||
[orte-top:pid-required]
|
||||
This tool requires that you specify the pid of the mpirun executing
|
||||
This tool requires that you specify contact info for the mpirun executing
|
||||
the specified rank(s). Please use the --help option for more information.
|
||||
|
||||
#
|
||||
[orte-top:hnp-filename-bad]
|
||||
We are unable to parse the filename where contact info for the
|
||||
mpirun to be contacted was to be found. The option we were given was:
|
||||
|
||||
--%s %s
|
||||
|
||||
This appears to be missing the required ':' following the
|
||||
keyword "file". Please use the --help option for more information on
|
||||
the correct format for this command line option.
|
||||
#
|
||||
[orte-top:hnp-filename-access]
|
||||
We are unable to access the filename where contact info for the
|
||||
mpirun to be contacted was to be found. The filename we were given was:
|
||||
|
||||
File: %s
|
||||
|
||||
Please use the --help option for more information on
|
||||
the correct format for this command line option.
|
||||
#
|
||||
[orte-top:hnp-file-bad]
|
||||
We are unable to read the mpirun's contact info from the
|
||||
given filename. The filename we were given was:
|
||||
|
||||
FILE: %s
|
||||
|
||||
Please use the --help option for more information on
|
||||
the correct format for this command line option.
|
||||
#
|
||||
[orte-top:hnp-uri-bad]
|
||||
We are unable to correctly parse the mpirun's contact info. The uri we were given was:
|
||||
|
||||
URI: %s
|
||||
|
||||
Please remember that this is *not* a standard uri, but
|
||||
a special format used internally by Open MPI for communications. It can
|
||||
best be generated by simply directing mpirun to put its
|
||||
uri in a file, and then giving us that filename.
|
||||
|
@ -38,8 +38,16 @@ Display help for this command
|
||||
.
|
||||
.TP
|
||||
.B -pid | --pid \fR<value>\fP
|
||||
The pid of the mpirun whose processes you want information about. Note that
|
||||
the ompi-top command must be executed on the same node as mpirun.
|
||||
The pid of the mpirun whose processes you want information about, or the name
|
||||
of the file (specified as file:filename) that contains that info. Note that
|
||||
the ompi-top command must be executed on the same node as mpirun to use this option.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -uri | --uri \fR<value>\fP
|
||||
Specify the URI of the mpirun whose processes you want information about, or the name
|
||||
of the file (specified as file:filename) that contains that info. Note that
|
||||
the ompi-top command does not have to be executed on the same node as mpirun to use this option.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
|
@ -53,6 +53,7 @@
|
||||
#include "orte/util/hnp_contact.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
|
||||
/*
|
||||
* Local variables & functions
|
||||
@ -69,6 +70,7 @@ static opal_event_t *my_exit_event;
|
||||
static FILE *fp = NULL;
|
||||
static bool help;
|
||||
static char *hnppidstr;
|
||||
static char *hnpuristr;
|
||||
static char *ranks;
|
||||
static orte_hnp_contact_t *target_hnp;
|
||||
static int update_rate;
|
||||
@ -117,6 +119,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
|
||||
&hnppidstr, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"The pid of the mpirun that you wish to query/monitor" },
|
||||
|
||||
{ NULL, NULL, NULL,
|
||||
'\0', "uri", "uri",
|
||||
1,
|
||||
&hnpuristr, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"The uri of the mpirun that you wish to query/monitor" },
|
||||
|
||||
{ NULL, NULL, NULL,
|
||||
'\0', "rank", "rank",
|
||||
1,
|
||||
@ -245,28 +253,6 @@ main(int argc, char *argv[])
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
* Must specify the mpirun pid
|
||||
*/
|
||||
if (NULL == hnppidstr) {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:pid-required", true);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* convert the pid */
|
||||
hnppid = strtoul(hnppidstr, NULL, 10);
|
||||
|
||||
/* if an output file was specified, open it */
|
||||
if (NULL != logfile) {
|
||||
fp = fopen(logfile, "w");
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
} else {
|
||||
fp = stdout;
|
||||
}
|
||||
|
||||
/***************************
|
||||
* We need all of OPAL and the TOOL portion of ORTE
|
||||
***************************/
|
||||
@ -279,9 +265,12 @@ main(int argc, char *argv[])
|
||||
|
||||
if (ORTE_SUCCESS != orte_wait_event(&my_exit_event, &orte_exit, "job_complete", abort_exit_callback)) {
|
||||
orte_finalize();
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* setup the list for recvd stats */
|
||||
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
|
||||
|
||||
/** setup callbacks for abort signals - from this point
|
||||
* forward, we need to abort in a manner that allows us
|
||||
* to cleanup
|
||||
@ -293,16 +282,63 @@ main(int argc, char *argv[])
|
||||
abort_exit_callback, &int_handler);
|
||||
opal_signal_add(&int_handler, NULL);
|
||||
|
||||
/* setup the list for recvd stats */
|
||||
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
|
||||
/*
|
||||
* Must specify the mpirun pid
|
||||
*/
|
||||
if (NULL != hnppidstr) {
|
||||
if (0 == strncmp(hnppidstr, "file", strlen("file")) ||
|
||||
0 == strncmp(hnppidstr, "FILE", strlen("FILE"))) {
|
||||
char input[1024], *filename;
|
||||
FILE *fp;
|
||||
|
||||
/* it is a file - get the filename */
|
||||
filename = strchr(hnppidstr, ':');
|
||||
if (NULL == filename) {
|
||||
/* filename is not correctly formatted */
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
++filename; /* space past the : */
|
||||
|
||||
if (0 >= strlen(filename)) {
|
||||
/* they forgot to give us the name! */
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* open the file and extract the pid */
|
||||
fp = fopen(filename, "r");
|
||||
if (NULL == fp) { /* can't find or read file! */
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
if (NULL == fgets(input, 1024, fp)) {
|
||||
/* something malformed about file */
|
||||
fclose(fp);
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
fclose(fp);
|
||||
input[strlen(input)-1] = '\0'; /* remove newline */
|
||||
/* convert the pid */
|
||||
hnppid = strtoul(input, NULL, 10);
|
||||
} else {
|
||||
/* should just be the pid itself */
|
||||
hnppid = strtoul(hnppidstr, NULL, 10);
|
||||
}
|
||||
/*
|
||||
* Get the list of available hnp's and setup contact info
|
||||
* to them in the RML
|
||||
*/
|
||||
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
|
||||
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
|
||||
goto cleanup;
|
||||
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -313,20 +349,105 @@ main(int argc, char *argv[])
|
||||
if (hnppid == hnp->pid) {
|
||||
/* this is the one we want */
|
||||
target_hnp = hnp;
|
||||
break;
|
||||
/* let it continue to run so we deconstruct the list */
|
||||
continue;
|
||||
}
|
||||
OBJ_RELEASE(hnp);
|
||||
}
|
||||
OBJ_DESTRUCT(&hnp_list);
|
||||
|
||||
/* if we get here without finding the one we wanted, then abort */
|
||||
if (NULL == target_hnp) {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
|
||||
goto cleanup;
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
} else if (NULL != hnpuristr) {
|
||||
if (0 == strncmp(hnpuristr, "file", strlen("file")) ||
|
||||
0 == strncmp(hnpuristr, "FILE", strlen("FILE"))) {
|
||||
char input[1024], *filename;
|
||||
FILE *fp;
|
||||
|
||||
/* it is a file - get the filename */
|
||||
filename = strchr(hnpuristr, ':');
|
||||
if (NULL == filename) {
|
||||
/* filename is not correctly formatted */
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
++filename; /* space past the : */
|
||||
|
||||
if (0 >= strlen(filename)) {
|
||||
/* they forgot to give us the name! */
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* open the file and extract the uri */
|
||||
fp = fopen(filename, "r");
|
||||
if (NULL == fp) { /* can't find or read file! */
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
if (NULL == fgets(input, 1024, fp)) {
|
||||
/* something malformed about file */
|
||||
fclose(fp);
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
fclose(fp);
|
||||
input[strlen(input)-1] = '\0'; /* remove newline */
|
||||
/* construct the target hnp info */
|
||||
target_hnp = OBJ_NEW(orte_hnp_contact_t);
|
||||
target_hnp->rml_uri = strdup(input);
|
||||
} else {
|
||||
/* should just be the uri itself - construct the target hnp info */
|
||||
target_hnp = OBJ_NEW(orte_hnp_contact_t);
|
||||
target_hnp->rml_uri = strdup(hnpuristr);
|
||||
}
|
||||
/* set the info in our contact table */
|
||||
if (ORTE_SUCCESS != orte_rml.set_contact_info(target_hnp->rml_uri)) {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
/* extract the name */
|
||||
if (ORTE_SUCCESS != orte_rml_base_parse_uris(target_hnp->rml_uri, &target_hnp->name, NULL)) {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
/* set the route to be direct */
|
||||
if (ORTE_SUCCESS != orte_routed.update_route(&target_hnp->name, &target_hnp->name)) {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:no-contact-given", true);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* set the target hnp as our lifeline so we will terminate if it exits */
|
||||
orte_routed.set_lifeline(&target_hnp->name);
|
||||
|
||||
/* if an output file was specified, open it */
|
||||
if (NULL != logfile) {
|
||||
fp = fopen(logfile, "w");
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
|
||||
orte_finalize();
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
fp = stdout;
|
||||
}
|
||||
|
||||
/* setup a non-blocking recv to get answers - we don't know how
|
||||
* many daemons are going to send replies, so we just have to
|
||||
* accept whatever comes back
|
||||
@ -415,10 +536,6 @@ cleanup:
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&hnp_list);
|
||||
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
@ -440,10 +557,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
opal_signal_del(&term_handler);
|
||||
opal_signal_del(&int_handler);
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&hnp_list);
|
||||
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
@ -420,4 +420,9 @@ Please remember that the correct format for this command line option is:
|
||||
--ompi-server PID:pid-of-%s
|
||||
|
||||
where PID can be either "PID" or "pid".
|
||||
#
|
||||
[orterun:write_file]
|
||||
%s was unable to open a file to printout %s as requested. The file
|
||||
name given was:
|
||||
|
||||
File: %s
|
||||
|
@ -306,11 +306,21 @@ Print out mpirun's URI during startup.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -report-uri-file\fR,\fP --report-uri-file <filename>
|
||||
Print out mpirun's URI to the specified file during startup.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -report-pid\fR,\fP --report-pid
|
||||
Print out mpirun's PID during startup.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -report-pid-file\fR,\fP --report-pid-file <filename>
|
||||
Print out mpirun's PID to the specified file during startup.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -rf \f |--rankfile <arg0>\fP.
|
||||
Provide a rankfile file.
|
||||
.
|
||||
|
@ -137,12 +137,13 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
{ NULL, NULL, NULL, 'q', NULL, "quiet", 0,
|
||||
&orterun_globals.quiet, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Suppress helpful messages" },
|
||||
{ NULL, NULL, NULL, '\0', "report-pid", "report-pid", 0,
|
||||
&orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Printout pid" },
|
||||
{ NULL, NULL, NULL, '\0', "report-uri", "report-uri", 0,
|
||||
&orterun_globals.report_uri, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Printout URI" },
|
||||
{ NULL, NULL, NULL, '\0', "report-pid", "report-pid", 1,
|
||||
&orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Printout pid on stdout [-], stderr [+], or a file [anything else]" },
|
||||
{ NULL, NULL, NULL, '\0', "report-uri", "report-uri", 1,
|
||||
&orterun_globals.report_uri, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
|
||||
|
||||
|
||||
/* hetero apps */
|
||||
{ "orte", "hetero", "apps", '\0', NULL, "hetero", 0,
|
||||
@ -495,12 +496,28 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* check for request to report uri */
|
||||
if (orterun_globals.report_uri) {
|
||||
char *uri;
|
||||
uri = orte_rml.get_contact_info();
|
||||
printf("%s uri: %s\n", orterun_basename, (NULL == uri) ? "NULL" : uri);
|
||||
if (NULL != uri) {
|
||||
free(uri);
|
||||
if (NULL != orterun_globals.report_uri) {
|
||||
FILE *fp;
|
||||
char *rml_uri;
|
||||
rml_uri = orte_rml.get_contact_info();
|
||||
if (0 == strcmp(orterun_globals.report_uri, "-")) {
|
||||
/* if '-', then output to stdout */
|
||||
printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
|
||||
} else if (0 == strcmp(orterun_globals.report_uri, "+")) {
|
||||
/* if '+', output to stderr */
|
||||
fprintf(stderr, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
|
||||
} else {
|
||||
fp = fopen(orterun_globals.report_uri, "w");
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orterun.txt", "orterun:write_file", false,
|
||||
orterun_basename, "uri", orterun_globals.report_uri);
|
||||
exit(0);
|
||||
}
|
||||
fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
|
||||
fclose(fp);
|
||||
}
|
||||
if (NULL != rml_uri) {
|
||||
free(rml_uri);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1179,6 +1196,8 @@ static int init_globals(void)
|
||||
orterun_globals.wait_for_server = false;
|
||||
orterun_globals.server_wait_timeout = 10;
|
||||
orterun_globals.stdin_target = "0";
|
||||
orterun_globals.report_pid = NULL;
|
||||
orterun_globals.report_uri = NULL;
|
||||
}
|
||||
|
||||
/* Reset the other fields every time */
|
||||
@ -1187,8 +1206,6 @@ static int init_globals(void)
|
||||
orterun_globals.version = false;
|
||||
orterun_globals.verbose = false;
|
||||
orterun_globals.quiet = false;
|
||||
orterun_globals.report_pid = false;
|
||||
orterun_globals.report_uri = false;
|
||||
orterun_globals.by_node = false;
|
||||
orterun_globals.by_slot = false;
|
||||
orterun_globals.debugger = false;
|
||||
@ -1258,8 +1275,24 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
}
|
||||
|
||||
/* check for request to report pid */
|
||||
if (orterun_globals.report_pid) {
|
||||
printf("%s pid: %d\n", orterun_basename, (int)getpid());
|
||||
if (NULL != orterun_globals.report_pid) {
|
||||
FILE *fp;
|
||||
if (0 == strcmp(orterun_globals.report_pid, "-")) {
|
||||
/* if '-', then output to stdout */
|
||||
printf("%d\n", (int)getpid());
|
||||
} else if (0 == strcmp(orterun_globals.report_pid, "+")) {
|
||||
/* if '+', output to stderr */
|
||||
fprintf(stderr, "%d\n", (int)getpid());
|
||||
} else {
|
||||
fp = fopen(orterun_globals.report_pid, "w");
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orterun.txt", "orterun:write_file", false,
|
||||
orterun_basename, "pid", orterun_globals.report_pid);
|
||||
exit(0);
|
||||
}
|
||||
fprintf(fp, "%d\n", (int)getpid());
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
|
||||
/* Do we want a user-level debugger? */
|
||||
|
@ -42,8 +42,8 @@ struct orterun_globals_t {
|
||||
bool version;
|
||||
bool verbose;
|
||||
bool quiet;
|
||||
bool report_pid;
|
||||
bool report_uri;
|
||||
char *report_pid;
|
||||
char *report_uri;
|
||||
bool exit;
|
||||
bool by_node;
|
||||
bool by_slot;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user