diff --git a/configure.ac b/configure.ac index 25b7bd013e..4e8f1c315b 100644 --- a/configure.ac +++ b/configure.ac @@ -1373,6 +1373,8 @@ AC_CONFIG_FILES([ ompi/tools/wrappers/mpif90-wrapper-data.txt ompi/tools/ortetools/Makefile ompi/tools/ompi-server/Makefile + ompi/tools/ompi-probe/Makefile + ompi/tools/ompi-profiler/Makefile test/Makefile test/event/Makefile diff --git a/ompi/mca/bml/base/bml_base_init.c b/ompi/mca/bml/base/bml_base_init.c index 636653b864..f0106e3c46 100644 --- a/ompi/mca/bml/base/bml_base_init.c +++ b/ompi/mca/bml/base/bml_base_init.c @@ -21,6 +21,8 @@ #include "opal/mca/base/base.h" #include "opal/mca/mca.h" +#include "opal/runtime/opal.h" + int mca_bml_base_output = -1; mca_bml_base_module_t mca_bml = { @@ -86,6 +88,9 @@ int mca_bml_base_init( bool enable_progress_threads, else { mca_bml_component = *best_component; mca_bml = *best_module; + if (opal_profile) { + opal_output(0, "bml:%s", mca_bml_component.bml_version.mca_component_name); + } return mca_base_components_close(mca_bml_base_output, &mca_bml_base_components_available, (mca_base_component_t*) best_component); diff --git a/ompi/mca/btl/base/btl_base_select.c b/ompi/mca/btl/base/btl_base_select.c index 363982dbde..bcf2b88d93 100644 --- a/ompi/mca/btl/base/btl_base_select.c +++ b/ompi/mca/btl/base/btl_base_select.c @@ -28,6 +28,7 @@ #include "ompi/mca/btl/btl.h" #include "ompi/mca/btl/base/base.h" #include "orte/mca/errmgr/errmgr.h" +#include "opal/runtime/opal.h" OBJ_CLASS_INSTANCE( mca_btl_base_selected_module_t, opal_list_item_t, @@ -132,6 +133,10 @@ int mca_btl_base_select(bool enable_progress_threads, "select: init of component %s returned success", component->btl_version.mca_component_name); + if (opal_profile) { + opal_output(0, "btl:%s", component->btl_version.mca_component_name); + } + for (i = 0; i < num_btls; ++i) { sm = OBJ_NEW(mca_btl_base_selected_module_t); if (NULL == sm) { diff --git a/ompi/mca/pml/base/pml_base_select.c b/ompi/mca/pml/base/pml_base_select.c index 
e94a7b089f..e352fc4893 100644 --- a/ompi/mca/pml/base/pml_base_select.c +++ b/ompi/mca/pml/base/pml_base_select.c @@ -24,6 +24,7 @@ #include "opal/runtime/opal_progress.h" #include "opal/mca/mca.h" #include "opal/mca/base/base.h" +#include "opal/runtime/opal.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" @@ -248,6 +249,10 @@ int mca_pml_base_select(bool enable_progress_threads, "select: component %s selected", mca_pml_base_selected_component.pmlm_version.mca_component_name ); + if (opal_profile) { + opal_output(0, "pml:%s", mca_pml_base_selected_component.pmlm_version.mca_component_name ); + } + /* This base function closes, unloads, and removes from the available list all unselected components. The available list will contain only the selected component. */ diff --git a/ompi/tools/Makefile.am b/ompi/tools/Makefile.am index 8a8ab41bdd..0f6a6c1445 100644 --- a/ompi/tools/Makefile.am +++ b/ompi/tools/Makefile.am @@ -25,11 +25,14 @@ SUBDIRS += \ tools/ompi_info \ tools/wrappers \ tools/ortetools \ - tools/ompi-server + tools/ompi-server \ + tools/ompi-probe \ + tools/ompi-profiler DIST_SUBDIRS += \ tools/ompi_info \ tools/wrappers \ tools/ortetools \ - tools/ompi-server - + tools/ompi-server \ + tools/ompi-probe \ + tools/ompi-profiler diff --git a/ompi/tools/ompi-probe/Makefile.am b/ompi/tools/ompi-probe/Makefile.am new file mode 100644 index 0000000000..7510034dad --- /dev/null +++ b/ompi/tools/ompi-probe/Makefile.am @@ -0,0 +1,47 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. 
+# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +include $(top_srcdir)/Makefile.man-page-rules + +man_pages = ompi-probe.1 +EXTRA_DIST = $(man_pages:.1=.1in) + +if OMPI_INSTALL_BINARIES +if !ORTE_DISABLE_FULL_SUPPORT + +bin_PROGRAMS = ompi-probe + +dist_pkgdata_DATA = help-ompi-probe.txt + +nodist_man_MANS = $(man_pages) + +# Ensure that the man pages are rebuilt if the opal_config.h file +# changes; a "good enough" way to know if configure was run again (and +# therefore the release date or version may have changed) +$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h + +endif # !ORTE_DISABLE_FULL_SUPPORT +endif # OMPI_INSTALL_BINARIES + +ompi_probe_SOURCES = ompi-probe.c +ompi_probe_LDADD = $(top_builddir)/ompi/libmpi.la + +distclean-local: + rm -f $(man_pages) diff --git a/ompi/tools/ompi-probe/help-ompi-probe.txt b/ompi/tools/ompi-probe/help-ompi-probe.txt new file mode 100644 index 0000000000..6e02630e54 --- /dev/null +++ b/ompi/tools/ompi-probe/help-ompi-probe.txt @@ -0,0 +1,27 @@ +# -*- text -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for Open MPI's ompi-probe tool. 
+# +[ompiprobe:usage] +Probe a set of nodes to determine their configuration + +Usage: %s [OPTIONS] + +%s + diff --git a/ompi/tools/ompi-probe/ompi-probe.1in b/ompi/tools/ompi-probe/ompi-probe.1in new file mode 100644 index 0000000000..6ccc2136f3 --- /dev/null +++ b/ompi/tools/ompi-probe/ompi-probe.1in @@ -0,0 +1,72 @@ +.\" +.\" Copyright (c) 2007 Los Alamos National Security, LLC +.\" All rights reserved. +.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. +.\" +.\" Man page for OMPI's ompi-server command +.\" +.\" .TH name section center-footer left-footer center-header +.TH OMPI-SERVER 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.\" ************************** +.\" Name Section +.\" ************************** +.SH NAME +. +ompi-server \- Server for supporting name publish/lookup operations. +. +.PP +. +.\" ************************** +.\" Synopsis Section +.\" ************************** +.SH SYNOPSIS +. +.BR ompi-server " [ options ]" +. +.\" ************************** +.\" Options Section +.\" ************************** +.SH Options +. +\fIompi-server\fR acts as a data server for Open MPI jobs to exchange +contact information in support of MPI-2's Publish_name and Lookup_name +functions. +. +.TP 10 +.B -h | --help +Display help for this command +. +. +.TP +.B -d | --debug +Enable verbose output for debugging +. +. +.TP +.B -r | --report-uri \fR\fP +Report the Open MPI contact information for the server. This information is +required for MPI jobs to use the data server. Three parameter values are supported: +(a) '-', indicating that the uri is to be printed to stdout; (b) '+', indicating that +the uri is to be printed to stderr; and (c) "file:path-to-file", indicating that +the uri is to be printed to the specified file. The "path-to-file" can be either +absolute or relative, but must be in a location where the user has write +permissions. Please note that the resulting file must be read-accessible to +expected users of the server. +. +. 
+.\" ************************** +.\" Description Section +.\" ************************** +.SH DESCRIPTION +. +.PP +\fIompi-server\fR acts as a data server for Open MPI jobs to exchange +contact information in support of MPI-2's Publish_name and Lookup_name +functions. +. +.\" ************************** +.\" See Also Section +.\" ************************** +. +.SH SEE ALSO +. diff --git a/ompi/tools/ompi-probe/ompi-probe.c b/ompi/tools/ompi-probe/ompi-probe.c new file mode 100644 index 0000000000..ac18287b7f --- /dev/null +++ b/ompi/tools/ompi-probe/ompi-probe.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_NETDB_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#include +#include +#include +#include + +#include "opal/dss/dss.h" +#include "opal/mca/base/base.h" +#include "opal/util/opal_environ.h" +#include "opal/runtime/opal.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rml/rml.h" +#include "orte/util/proc_info.h" +#include "orte/runtime/orte_globals.h" + +/* + * Globals + */ + +int main(int argc, char *argv[]) +{ + char * tmp_env_var = NULL; + char *rml_uri; + opal_buffer_t buffer; + char *attr = "oob.tcp"; + int32_t len; + int rc; + + /* init enough of opal to use a few utilities */ + if (OPAL_SUCCESS != opal_init_util()) { + fprintf(stderr, "OPAL failed to initialize -- ompi-probe aborting\n"); + exit(1); + } + +#if OPAL_ENABLE_FT == 1 + /* Disable the checkpoint notification routine for this + * tool. As we will never need to checkpoint this tool. + * Note: This must happen before opal_init(). 
+ */ + opal_cr_set_enabled(false); + + /* Select the none component, since we don't actually use a checkpointer */ + tmp_env_var = mca_base_param_env_var("crs"); + opal_setenv(tmp_env_var, + "none", + true, &environ); + free(tmp_env_var); + tmp_env_var = NULL; + + /* Mark as a tool program */ + tmp_env_var = mca_base_param_env_var("opal_cr_is_tool"); + opal_setenv(tmp_env_var, + "1", + true, &environ); + free(tmp_env_var); +#endif + tmp_env_var = NULL; /* Silence compiler warning */ + + /* open up and select all the frameworks - this will generate the + * profiled output + */ + MPI_Init(NULL, NULL); + + /* get our RML uri */ + rml_uri = orte_rml.get_contact_info(); + + if (NULL != rml_uri) { + char *ptr, *endip, *ipout=NULL, *tmp; + endip = rml_uri; + /* remove the non-IP info */ + while (NULL != (ptr = strchr(endip, '/'))) { + /* next position is the second '/' */ + ptr += 2; + /* now look for ':' */ + endip = strchr(ptr, ':'); + if (NULL == endip) { + /* got an error - just dump this */ + free(rml_uri); + goto CLEANUP; + } + *endip = '\0'; + if (NULL == ipout) { + ipout = strdup(ptr); + } else { + asprintf(&tmp, "%s:%s", ipout, ptr); + free(ipout); + ipout = tmp; + } + ptr = endip + 1; + } + /* send the result to the HNP*/ + OBJ_CONSTRUCT(&buffer, opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &orte_process_info.nodename, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto skip; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &attr, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto skip; + } + len = strlen(ipout); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &len, 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + goto skip; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &ipout, len, OPAL_BYTE))) { + ORTE_ERROR_LOG(rc); + goto skip; + } + orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_GRPCOMM_PROFILE, 0); + skip: + OBJ_DESTRUCT(&buffer); + /* cleanup */ + free(rml_uri); + } + +CLEANUP: + /* Finalize and clean up 
ourselves */ + MPI_Finalize(); + + return 0; +} diff --git a/ompi/tools/ompi-profiler/Makefile.am b/ompi/tools/ompi-profiler/Makefile.am new file mode 100644 index 0000000000..17acc15931 --- /dev/null +++ b/ompi/tools/ompi-profiler/Makefile.am @@ -0,0 +1,47 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +include $(top_srcdir)/Makefile.man-page-rules + +man_pages = ompi-profiler.1 +EXTRA_DIST = $(man_pages:.1=.1in) + +if OMPI_INSTALL_BINARIES +if !ORTE_DISABLE_FULL_SUPPORT + +bin_PROGRAMS = ompi-profiler + +dist_pkgdata_DATA = help-ompi-profiler.txt + +nodist_man_MANS = $(man_pages) + +# Ensure that the man pages are rebuilt if the opal_config.h file +# changes; a "good enough" way to know if configure was run again (and +# therefore the release date or version may have changed) +$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h + +endif # !ORTE_DISABLE_FULL_SUPPORT +endif # OMPI_INSTALL_BINARIES + +ompi_profiler_SOURCES = ompi-profiler.c +ompi_profiler_LDADD = $(top_builddir)/ompi/libmpi.la + +distclean-local: + rm -f $(man_pages) diff --git a/ompi/tools/ompi-profiler/help-ompi-profiler.txt b/ompi/tools/ompi-profiler/help-ompi-profiler.txt new file mode 100644 index 0000000000..4e4a94940f --- /dev/null +++ b/ompi/tools/ompi-profiler/help-ompi-profiler.txt @@ -0,0 +1,37 @@ +# -*- text -*- +# +# 
Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for Open MPI's ompi-profiler tool. +# +[ompi-profiler:usage] + +ompi-profiler (Open MPI) version %s + +Profile Open MPI to determine what frameworks are being used, and to +report out information that can be used to tailor Open MPI's behavior. + +Usage: ompi-profiler [OPTIONS] +%s +Report bugs to %s +# +[ompi-profiler:version] + +ompi-profiler (Open MPI) version %s + +Report bugs to %s + diff --git a/ompi/tools/ompi-profiler/ompi-profiler.1in b/ompi/tools/ompi-profiler/ompi-profiler.1in new file mode 100644 index 0000000000..6ccc2136f3 --- /dev/null +++ b/ompi/tools/ompi-profiler/ompi-profiler.1in @@ -0,0 +1,72 @@ +.\" +.\" Copyright (c) 2007 Los Alamos National Security, LLC +.\" All rights reserved. +.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. +.\" +.\" Man page for OMPI's ompi-server command +.\" +.\" .TH name section center-footer left-footer center-header +.TH OMPI-SERVER 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.\" ************************** +.\" Name Section +.\" ************************** +.SH NAME +. +ompi-server \- Server for supporting name publish/lookup operations. +. +.PP +. +.\" ************************** +.\" Synopsis Section +.\" ************************** +.SH SYNOPSIS +. +.BR ompi-server " [ options ]" +. +.\" ************************** +.\" Options Section +.\" ************************** +.SH Options +. 
+\fIompi-server\fR acts as a data server for Open MPI jobs to exchange +contact information in support of MPI-2's Publish_name and Lookup_name +functions. +. +.TP 10 +.B -h | --help +Display help for this command +. +. +.TP +.B -d | --debug +Enable verbose output for debugging +. +. +.TP +.B -r | --report-uri \fR\fP +Report the Open MPI contact information for the server. This information is +required for MPI jobs to use the data server. Three parameter values are supported: +(a) '-', indicating that the uri is to be printed to stdout; (b) '+', indicating that +the uri is to be printed to stderr; and (c) "file:path-to-file", indicating that +the uri is to be printed to the specified file. The "path-to-file" can be either +absolute or relative, but must be in a location where the user has write +permissions. Please note that the resulting file must be read-accessible to +expected users of the server. +. +. +.\" ************************** +.\" Description Section +.\" ************************** +.SH DESCRIPTION +. +.PP +\fIompi-server\fR acts as a data server for Open MPI jobs to exchange +contact information in support of MPI-2's Publish_name and Lookup_name +functions. +. +.\" ************************** +.\" See Also Section +.\" ************************** +. +.SH SEE ALSO +. diff --git a/ompi/tools/ompi-profiler/ompi-profiler.c b/ompi/tools/ompi-profiler/ompi-profiler.c new file mode 100644 index 0000000000..624b56f2dc --- /dev/null +++ b/ompi/tools/ompi-profiler/ompi-profiler.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
+ * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" +#include "opal/constants.h" +#include "opal/version.h" + +#include +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_SYS_STAT_H +#include +#endif +#include +#include +#include + + +#include "opal/class/opal_list.h" +#include "opal/util/argv.h" +#include "opal/runtime/opal.h" +#include "opal/util/cmd_line.h" +#include "opal/mca/base/base.h" +#include "opal/util/os_path.h" +#include "opal/util/path.h" +#include "opal/util/opal_environ.h" +#include "opal/util/show_help.h" + +/* + * Globals + */ +typedef struct { + opal_list_item_t item; + char *name; + opal_list_t frameworks; +} orte_profile_node_t; +static void node_constructor(orte_profile_node_t *ptr) +{ + ptr->name = NULL; + OBJ_CONSTRUCT(&ptr->frameworks, opal_list_t); +} +static void node_destructor(orte_profile_node_t *ptr) +{ + if (NULL != ptr->name) { + free(ptr->name); + } + OBJ_DESTRUCT(&ptr->frameworks); +} +OBJ_CLASS_INSTANCE(orte_profile_node_t, + opal_list_item_t, + node_constructor, + node_destructor); + +typedef struct { + opal_list_item_t item; + int num_nodes; + char *framework; + char *component; + char *params; +} orte_profile_t; +static void profile_constructor(orte_profile_t *ptr) +{ + ptr->num_nodes = 0; + ptr->framework = NULL; + ptr->component = NULL; + ptr->params = NULL; +} +static void profile_destructor(orte_profile_t *ptr) +{ + if (NULL != ptr->framework) { + free(ptr->framework); + } + if (NULL != ptr->component) { + free(ptr->component); + } + if (NULL != ptr->params) { + free(ptr->params); + } +} +OBJ_CLASS_INSTANCE(orte_profile_t, + opal_list_item_t, + profile_constructor, + profile_destructor); + +static void 
read_file(opal_list_t *nodes, FILE *fp); + +/* global variables */ +static bool help = false; +static bool version = false; +static bool verbose = false; +static bool configout = false; +static char *profilefile = NULL; +static bool report = false; + +static opal_cmd_line_init_t cmd_line_init[] = { + /* Various "obvious" options */ + { NULL, NULL, NULL, 'h', "help", "help", 0, + &help, OPAL_CMD_LINE_TYPE_BOOL, + "This help message" }, + + { NULL, NULL, NULL, 'V', "version", "version", 0, + &version, OPAL_CMD_LINE_TYPE_BOOL, + "Print version and exit" }, + + { NULL, NULL, NULL, '\0', "verbose", "verbose", 0, + &verbose, OPAL_CMD_LINE_TYPE_BOOL, + "Print version and exit" }, + + { NULL, NULL, NULL, '\0', "config", "config", 0, + &configout, OPAL_CMD_LINE_TYPE_BOOL, + "Print framework/component usage" }, + + { NULL, NULL, NULL, '\0', "profile", "profile", 1, + &profilefile, OPAL_CMD_LINE_TYPE_STRING, + "File to update with system profile parameters" }, + + { NULL, NULL, NULL, '\0', "report", "report", 0, + &report, OPAL_CMD_LINE_TYPE_BOOL, + "Print out a report of the data in the given profile file" }, + + /* End of list */ + { NULL, NULL, NULL, '\0', NULL, NULL, 0, + NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } +}; + +int main(int argc, char *argv[]) +{ + opal_list_t nodes, frames; + opal_list_item_t *item, *item2, *itemold; + orte_profile_node_t *node; + orte_profile_t *profile, *frame; + int ret; + int len; + opal_cmd_line_t cmd_line; + char *args = NULL; + char *configfile=NULL; + char *cmd; + + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + if (OPAL_SUCCESS != (ret = opal_cmd_line_parse(&cmd_line, true, + argc, argv)) ) { + return ret; + } + + /* init enough of opal to use a few utilities */ + if (OPAL_SUCCESS != opal_init_util()) { + fprintf(stderr, "OPAL failed to initialize -- ompi-profiler aborting\n"); + exit(1); + } + + /* check for some simple options */ + if (help) { + args = opal_cmd_line_get_usage_msg(&cmd_line); + 
opal_show_help("help-ompi-profiler.txt", "ompi-profiler:usage", false, + OPAL_VERSION, args, PACKAGE_BUGREPORT); + free(args); + + /* If someone asks for help, that should be all we do */ + exit(0); + + } + + if (version) { + opal_show_help("help-ompi-profiler.txt", "ompi-profiler:version", false, + OPAL_VERSION, PACKAGE_BUGREPORT); + exit(0); + } + + if (report) { + int fd; + int32_t num_bytes; + char *nodename, *attr; + char data[8192]; + + /* just read the given file and print out a report */ + if (NULL == profilefile) { + opal_show_help("help-ompi-profiler.txt", "ompi-profiler:report-wo-file", false); + exit(1); + } + + fd = open(profilefile, O_RDONLY); + if (fd < 0) { + opal_show_help("help-ompi-profiler.txt", "ompi-profiler:report-file-not-found", false); + exit(1); + } + + /* loop through file until end */ + while (0 < read(fd, &num_bytes, sizeof(num_bytes))) { + /* this is the number of bytes in the nodename */ + memset(data, 0, sizeof(data)); + if (0 > read(fd, data, num_bytes)) { + fprintf(stderr, "ompi-profiler: node name not found\n"); + close(fd); + exit(0); + } + /* this is the nodename - save it */ + nodename = strdup(data); + /* get the number of bytes in the attribute name */ + if (0 > read(fd, &num_bytes, sizeof(num_bytes))) { + fprintf(stderr, "ompi-profiler: attribute size not found\n"); + close(fd); + exit(0); + } + /* get the attribute name */ + memset(data, 0, sizeof(data)); + if (0 > read(fd, data, num_bytes)) { + fprintf(stderr, "ompi-profiler: attribute name not found\n"); + close(fd); + exit(0); + } + /* remove the newline and save it */ + attr = strdup(data); + /* read the number of bytes in the blob */ + if (0 > read(fd, &num_bytes, sizeof(num_bytes))) { + fprintf(stderr, "ompi-profiler: data size not found\n"); + close(fd); + exit(0); + } + /* read the bytes so we position ourselves */ + if (0 > read(fd, data, num_bytes)) { + fprintf(stderr, "ompi-profiler: data not found\n"); + close(fd); + exit(0); + } + /* report the results */ + 
fprintf(stdout, "Node %s reported %d bytes for attribute %s\n", + nodename, num_bytes, attr); + free(nodename); + free(attr); + } + exit(0); + } + + /* do a quick sanity check - since they didn't want a report, see if they don't + * want -anything- + */ + if (!configout && NULL == profilefile) { + /* save us the annoyance - you have to want -something-! */ + fprintf(stderr, "ompi-profiler: no options specified - aborting\n"); + exit(1); + } + + /* setup the cmd to execute */ + if (configout) { + asprintf(&configfile, "profiler.%d", getpid()); + } else { + configfile = strdup("/dev/null"); + } + if (NULL != profilefile) { + asprintf(&cmd, "mpirun -pernode -mca opal_profile 1 -mca opal_profile_file %s -mca grpcomm basic ompi-probe >& %s", + profilefile, configfile); + } else { + asprintf(&cmd, "mpirun -pernode -mca opal_profile 1 -mca grpcomm basic ompi-probe >& %s", + configfile); + } + + if (verbose) { + fprintf(stderr, "ompi-profiler: executing %s\n", cmd); + } + + /* execute it */ + if (0 > system(cmd)) { + fprintf(stderr, "ompi-profiler: could not execute cmd %s\n", cmd); + free(cmd); + goto CLEANUP; + } + free(cmd); + + /* did they want the configuration output? */ + if (configout) { + FILE *fp; + struct stat buf; + + /* does the file already exist? 
*/ + if (0 != stat(configfile, &buf)) { + /* file must not have been created */ + fprintf(stderr, "Temporary output file %s could not be found - config report cannot be generated\n", configfile); + goto CLEANUP; + } + + /* yes - read the info so we can output it */ + fp = fopen(configfile, "r"); + if (NULL == fp) { + fprintf(stderr, "Impossible to open the file %s in read mode\n", configfile ); + goto CLEANUP; + } + OBJ_CONSTRUCT(&nodes, opal_list_t); + read_file(&nodes, fp); + fclose(fp); + + /* setup a list of framework info */ + OBJ_CONSTRUCT(&frames, opal_list_t); + len = opal_list_get_size(&nodes); + /* convert the results over to the new list */ + while (NULL != (item = opal_list_remove_first(&nodes))) { + node = (orte_profile_node_t*)item; + /* loop through this node's frameworks */ + item2 = opal_list_get_first(&node->frameworks); + while (item2 != opal_list_get_end(&node->frameworks)) { + profile = (orte_profile_t*)item2; + /* is this framework already in our list? */ + for (itemold = opal_list_get_first(&frames); + itemold != opal_list_get_end(&frames); + itemold = opal_list_get_next(itemold)) { + frame = (orte_profile_t*)itemold; + if (0 == strcmp(profile->framework, frame->framework) && + 0 == strcmp(profile->component, frame->component)) { + /* all matches - increment # matches */ + frame->num_nodes++; + goto COMPLETE; + } + } + /* get here if the framework/component is new */ + frame = OBJ_NEW(orte_profile_t); + frame->num_nodes++; + frame->framework = strdup(profile->framework); + frame->component = strdup(profile->component); + opal_list_append(&frames, &frame->item); + COMPLETE: + item2 = opal_list_get_next(item2); + } + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&nodes); + /* output the list */ + while (NULL != (itemold = opal_list_remove_first(&frames))) { + frame = (orte_profile_t*)itemold; + if (len == frame->num_nodes) { + fprintf(stderr, "All nodes use framework %s component %s\n", frame->framework, frame->component); + } else { + fprintf(stderr, 
"%d nodes use framework %s component %s\n", frame->num_nodes, frame->framework, frame->component); + } + OBJ_RELEASE(frame); + } + OBJ_DESTRUCT(&frames); + } + +CLEANUP: + /* remove the file */ + if (NULL != configfile) { + if (0 != strcmp("/dev/null", configfile)) { + unlink(configfile); + } + free(configfile); + } + return 0; +} + +static void read_file(opal_list_t *nodes, FILE *fp) +{ + char line[1024]; + char *endprefix, *endnodename, *data, *nodename; + char **inputs; + opal_list_item_t *item; + orte_profile_node_t *node; + orte_profile_t *profile; + int len; + + memset(line, 0, sizeof(line)); + while (NULL != fgets(line, sizeof(line), fp)) { + /* get the length of the line */ + len = strlen(line); + /* remove any trailing newline */ + if (line[len-1] == '\n') { + line[len-1] = '\0'; + } + if ('[' != line[0]) { + /* indicates empty line - ignore it */ + continue; + } + if (NULL == (endprefix = strchr(line, ']'))) { + fprintf(stderr, "ompi-profiler: read bad input for ] %s\n", line); + continue; + } + *endprefix = '\0'; + /* break the prefix at the colon - we don't need the pid */ + if (NULL == (endnodename = strchr(line, ':'))) { + fprintf(stderr, "ompi-profiler: read bad input for : %s\n", line); + continue; + } + *endnodename = '\0'; + nodename = &line[1]; + /* is this node already in our list */ + for (item = opal_list_get_first(nodes); + item != opal_list_get_end(nodes); + item = opal_list_get_next(item)) { + node = (orte_profile_node_t*)item; + if (0 == strcmp(node->name, nodename)) { + /* already present - just add to it */ + goto PROCESS; + } + } + /* if we got here, then it wasn't found */ + node = OBJ_NEW(orte_profile_node_t); + node->name = strdup(nodename); + opal_list_append(nodes, &node->item); + PROCESS: + /* point to the rest of the data */ + data = endprefix; + data += 2; /* get past space */ + /* use an opal utility to parse it */ + if ((NULL == (inputs = opal_argv_split(data, ':'))) || + opal_argv_count(inputs) < 2) { + fprintf(stderr, 
"ompi-profiler: read bad input for second : %s\n", data); + opal_argv_free(inputs); + continue; + } + /* first entry must be the framework - see if we already have it */ + for (item = opal_list_get_first(&node->frameworks); + item != opal_list_get_end(&node->frameworks); + item = opal_list_get_next(item)) { + profile = (orte_profile_t*)item; + if (0 == strcmp(inputs[0], profile->framework)) { + /* this will happen if mpirun outputs some of the + * frameworks AND a proc is local to mpirun + */ + goto SKIP; + } + } + profile = OBJ_NEW(orte_profile_t); + profile->framework = strdup(inputs[0]); + /* second entry is component */ + profile->component = strdup(inputs[1]); + /* if there is anything more, just save it */ + if (NULL != inputs[2]) { + profile->params = opal_argv_join(&inputs[2], ':'); + } + opal_list_append(&node->frameworks, &profile->item); + SKIP: + opal_argv_free(inputs); + + memset(line, 0, sizeof(line)); + } +} + diff --git a/opal/mca/base/mca_base_components_select.c b/opal/mca/base/mca_base_components_select.c index 3d1330161b..7d458acd8b 100644 --- a/opal/mca/base/mca_base_components_select.c +++ b/opal/mca/base/mca_base_components_select.c @@ -18,6 +18,7 @@ #include #endif +#include "opal/runtime/opal.h" #include "opal/class/opal_list.h" #include "opal/util/strncpy.h" #include "opal/util/argv.h" @@ -120,7 +121,10 @@ int mca_base_select(const char *type_name, int output_id, opal_output_verbose(5, output_id, "mca:base:select:(%5s) Selected component [%s]", type_name, (*best_component)->mca_component_name); - + if (opal_profile) { + opal_output(0, "%s:%s", type_name, (*best_component)->mca_component_name); + } + /* * Close the non-selected components */ diff --git a/opal/runtime/opal.h b/opal/runtime/opal.h index 7b45432f13..cd042d5d11 100644 --- a/opal/runtime/opal.h +++ b/opal/runtime/opal.h @@ -24,13 +24,15 @@ #include "opal_config.h" -#if defined(c_plusplus) || defined(__cplusplus) -extern "C" { -#endif +BEGIN_C_DECLS /** version string of opal */ 
OPAL_DECLSPEC extern const char opal_version_string[]; +/* profile flag */ +OPAL_DECLSPEC extern bool opal_profile; +OPAL_DECLSPEC extern char *opal_profile_file; + /** * Initialize the OPAL layer, including the MCA system. * @@ -80,8 +82,6 @@ OPAL_DECLSPEC int opal_finalize_util(void); */ OPAL_DECLSPEC int opal_register_params(void); -#if defined(c_plusplus) || defined(__cplusplus) -} -#endif +END_C_DECLS #endif diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index 5d39af461d..ea9f3e7a00 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -61,6 +61,8 @@ const char opal_version_string[] = OPAL_IDENT_STRING; int opal_initialized = 0; int opal_util_initialized = 0; +bool opal_profile = false; +char *opal_profile_file = NULL; static const char * opal_err2str(int errnum) diff --git a/opal/runtime/opal_params.c b/opal/runtime/opal_params.c index 13f9ea1a43..e20e5b9955 100644 --- a/opal/runtime/opal_params.c +++ b/opal/runtime/opal_params.c @@ -73,6 +73,19 @@ int opal_register_params(void) free(string); } + { + int j; + + mca_base_param_reg_int_name("opal", "profile", + "Set to non-zero to profile component selections", + false, false, (int)false, &j); + opal_profile = OPAL_INT_TO_BOOL(j); + + mca_base_param_reg_string_name("opal", "profile_file", + "Name of the file containing the cluster configuration information", + false, false, NULL, &opal_profile_file); + } + #if OMPI_ENABLE_DEBUG diff --git a/orte/mca/grpcomm/base/Makefile.am b/orte/mca/grpcomm/base/Makefile.am index 710ff94d0e..a6531d5380 100644 --- a/orte/mca/grpcomm/base/Makefile.am +++ b/orte/mca/grpcomm/base/Makefile.am @@ -28,5 +28,7 @@ if !ORTE_DISABLE_FULL_SUPPORT libmca_grpcomm_la_SOURCES += \ base/grpcomm_base_allgather.c \ - base/grpcomm_base_modex.c + base/grpcomm_base_modex.c \ + base/grpcomm_base_receive.c + endif diff --git a/orte/mca/grpcomm/base/base.h b/orte/mca/grpcomm/base/base.h index 5b1e3524a7..24ffb25ccc 100644 --- a/orte/mca/grpcomm/base/base.h +++ 
b/orte/mca/grpcomm/base/base.h @@ -76,6 +76,11 @@ ORTE_DECLSPEC void orte_grpcomm_base_modex_finalize(void); ORTE_DECLSPEC int orte_grpcomm_base_pack_modex_entries(opal_buffer_t *buf, bool *modex_reqd); ORTE_DECLSPEC int orte_grpcomm_base_update_modex_entries(orte_process_name_t *proc_name, opal_buffer_t *rbuf); +ORTE_DECLSPEC int orte_grpcomm_base_load_modex_data(orte_process_name_t *proc, char *attribute_name, + void *data, int num_bytes); + +ORTE_DECLSPEC int orte_grpcomm_base_comm_start(void); +ORTE_DECLSPEC int orte_grpcomm_base_comm_stop(void); #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/mca/grpcomm/base/grpcomm_base_modex.c b/orte/mca/grpcomm/base/grpcomm_base_modex.c index 2e1bf9a05e..740dfc19cc 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_modex.c +++ b/orte/mca/grpcomm/base/grpcomm_base_modex.c @@ -330,6 +330,11 @@ int orte_grpcomm_base_get_proc_attr(const orte_process_name_t proc, modex_proc_data_t *proc_data; modex_attr_data_t *attr_data; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, + "%s grpcomm:get_proc_attr: searching for attr %s on proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), attribute_name, + ORTE_NAME_PRINT(&proc))); + proc_data = modex_lookup_orte_proc(&proc); if (NULL == proc_data) { OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, @@ -363,6 +368,11 @@ int orte_grpcomm_base_get_proc_attr(const orte_process_name_t proc, memcpy(copy, attr_data->attr_data, attr_data->attr_data_size); *val = copy; *size = attr_data->attr_data_size; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, + "%s grpcomm:get_proc_attr: found %d bytes for attr %s on proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)attr_data->attr_data_size, + attribute_name, ORTE_NAME_PRINT(&proc))); + } OPAL_THREAD_UNLOCK(&proc_data->modex_lock); @@ -451,6 +461,11 @@ int orte_grpcomm_base_update_modex_entries(orte_process_name_t *proc_name, goto cleanup; } + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, + "%s grpcomm:base:update_modex_entries: adding %d 
entries for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_recvd_entries, + ORTE_NAME_PRINT(proc_name))); + /* * Extract the attribute names and values */
@@ -507,3 +522,77 @@ cleanup: OPAL_THREAD_UNLOCK(&proc_data->modex_lock); return rc; }
+
+/* Store a private copy of num_bytes bytes of data as the value of
+ * attribute attr_name for proc proc_name in the local modex
+ * database, replacing any value already held for that attribute.
+ * The caller keeps ownership of data.
+ *
+ * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM for a negative size or
+ * a NULL data pointer with a non-zero size, ORTE_ERR_NOT_FOUND if
+ * the proc or attribute entry cannot be found, or
+ * ORTE_ERR_OUT_OF_RESOURCE if the copy cannot be allocated.
+ */
+int orte_grpcomm_base_load_modex_data(orte_process_name_t *proc_name, char *attr_name,
+                                      void *data, int num_bytes)
+{
+    modex_proc_data_t *proc_data;
+    modex_attr_data_t *attr_data;
+    int rc = ORTE_SUCCESS;
+    void *bytes = NULL;
+
+    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
+                         "%s grpcomm:base:load_modex_data: loading %ld bytes for attr %s on proc %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         (long)num_bytes, attr_name, ORTE_NAME_PRINT(proc_name)));
+
+    /* a negative size would become a huge size_t in malloc/memcpy */
+    if (0 > num_bytes || (0 < num_bytes && NULL == data)) {
+        opal_output(0, "grpcomm:base:load_modex_data: invalid data for attr %s\n", attr_name);
+        return ORTE_ERR_BAD_PARAM;
+    }
+
+    /* look up the modex data structure */
+    proc_data = modex_lookup_orte_proc(proc_name);
+    if (proc_data == NULL) {
+        /* report the error */
+        opal_output(0, "grpcomm:base:load_modex_data: received modex info for unknown proc %s\n",
+                    ORTE_NAME_PRINT(proc_name));
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    /* create space for the data - this is necessary since the data being
+     * passed to us may be static or released on the other end. Do it
+     * before touching the entry so a failed allocation leaves any
+     * existing value in place
+     */
+    if (0 < num_bytes) {
+        if (NULL == (bytes = malloc(num_bytes))) {
+            opal_output(0, "grpcomm:base:load_modex_data: out of memory\n");
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+        memcpy(bytes, data, num_bytes);
+    }
+
+    OPAL_THREAD_LOCK(&proc_data->modex_lock);
+
+    /*
+     * Lookup the corresponding modex structure
+     */
+    if (NULL == (attr_data = modex_lookup_attr_data(proc_data,
+                                                    attr_name, true))) {
+        opal_output(0, "grpcomm:base:load_modex_data: modex_lookup_attr_data failed\n");
+        free(bytes);
+        rc = ORTE_ERR_NOT_FOUND;
+        goto cleanup;
+    }
+    /* release any pre-existing value - free(NULL) is a no-op */
+    free(attr_data->attr_data);
+    attr_data->attr_data = bytes;
+    attr_data->attr_data_size = num_bytes;
+    proc_data->modex_received_data = true;
+
+cleanup:
+    OPAL_THREAD_UNLOCK(&proc_data->modex_lock);
+    return rc;
+}
diff --git a/orte/mca/grpcomm/base/grpcomm_base_receive.c b/orte/mca/grpcomm/base/grpcomm_base_receive.c new file mode 100644 index 0000000000..4ad07f271f --- 
/dev/null
+++ b/orte/mca/grpcomm/base/grpcomm_base_receive.c
@@ -0,0 +1,281 @@
+/* -*- C -*-
+ *
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ * Receive side of the grpcomm profiling support: collects the modex
+ * attributes sent on ORTE_RML_TAG_GRPCOMM_PROFILE and appends them to
+ * the file named by opal_profile_file.
+ */
+
+/*
+ * includes
+ */
+#include "orte_config.h"
+#include "orte/constants.h"
+#include "orte/types.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#include "opal/class/opal_list.h"
+#include "opal/mca/mca.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "opal/runtime/opal.h"
+#include "opal/dss/dss.h"
+
+#include "orte/util/proc_info.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/rml/rml.h"
+#include "orte/mca/rml/base/rml_contact.h"
+#include "orte/mca/routed/routed.h"
+#include "orte/util/name_fns.h"
+#include "orte/util/show_help.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/runtime/orte_wait.h"
+
+#include "orte/mca/grpcomm/base/base.h"
+
+static bool recv_issued=false;
+static int profile_fd = -1;
+
+static void orte_grpcomm_base_recv(int status, orte_process_name_t* sender,
+                                   opal_buffer_t* buffer, orte_rml_tag_t tag,
+                                   void* cbdata);
+
+/*
+ * Open the profile file (if one was named) and post the non-blocking
+ * receive for profile data. Calling it again once the receive is
+ * posted is a no-op.
+ */
+int orte_grpcomm_base_comm_start(void)
+{
+    int rc;
+
+    if (recv_issued) {
+        return ORTE_SUCCESS;
+    }
+
+    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
+                         "%s grpcomm:base:receive start comm",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    /* open the profile file for writing */
+    if (NULL == opal_profile_file) {
+        /* no file specified - we will just ignore any incoming data */
+        profile_fd = -1;
+    } else {
+        profile_fd = open(opal_profile_file, O_CREAT|O_RDWR|O_TRUNC, 0644);
+        if (profile_fd < 0) {
+            /* couldn't be opened */
+            ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
+            return ORTE_ERR_FILE_OPEN_FAILURE;
+        }
+    }
+
+    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+                                                      ORTE_RML_TAG_GRPCOMM_PROFILE,
+                                                      ORTE_RML_NON_PERSISTENT,
+                                                      orte_grpcomm_base_recv,
+                                                      NULL))) {
+        ORTE_ERROR_LOG(rc);
+        /* nothing was posted: do not mark the receive as issued, and
+         * do not leave the profile file open behind a failed start
+         */
+        if (0 <= profile_fd) {
+            close(profile_fd);
+            profile_fd = -1;
+        }
+        return rc;
+    }
+    recv_issued = true;
+
+    return rc;
+}
+
+
+/* Cancel the profile receive and close the profile file, if open. */
+int orte_grpcomm_base_comm_stop(void)
+{
+    if (!recv_issued) {
+        return ORTE_SUCCESS;
+    }
+
+    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
+                         "%s grpcomm:base:receive stop comm",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_GRPCOMM_PROFILE);
+    recv_issued = false;
+
+    if (0 <= profile_fd) {
+        close(profile_fd);
+        profile_fd = -1;
+    }
+
+    return ORTE_SUCCESS;
+}
+
+/* Write all len bytes of buf to fd, retrying after short writes.
+ * Returns 0 on success and -1 if write() fails or makes no progress.
+ */
+static int write_fully(int fd, const void *buf, size_t len)
+{
+    const char *src = (const char*)buf;
+    ssize_t n;
+
+    while (0 < len) {
+        n = write(fd, src, len);
+        if (0 >= n) {
+            return -1;
+        }
+        src += n;
+        len -= (size_t)n;
+    }
+    return 0;
+}
+
+/* process incoming messages in order of receipt
+ *
+ * Each message becomes one record in the profile file:
+ *   int32 length + node name, int32 length + attribute name,
+ *   int32 length + attribute data (strings are stored without a NUL).
+ */
+static void process_msg(int fd, short event, void *data)
+{
+    orte_message_event_t *mev = (orte_message_event_t*)data;
+    char *attr = NULL, *nodename = NULL;
+    void *blob = NULL;
+    int32_t isize, count, nlen, alen;
+    int rc;
+
+    /* with no profile file open there is nothing to save */
+    if (0 > profile_fd) {
+        goto CLEANUP;
+    }
+
+    /* unpack the node name */
+    count = 1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &nodename, &count, OPAL_STRING))) {
+        ORTE_ERROR_LOG(rc);
+        goto CLEANUP;
+    }
+
+    /* unpack the attribute name */
+    count = 1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &attr, &count, OPAL_STRING))) {
+        ORTE_ERROR_LOG(rc);
+        goto CLEANUP;
+    }
+
+    /* unpack the data size */
+    count = 1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &isize, &count, OPAL_INT32))) {
+        ORTE_ERROR_LOG(rc);
+        goto CLEANUP;
+    }
+    if (0 > isize) {
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        goto CLEANUP;
+    }
+
+    /* allocate space and unpack the data itself */
+    if (0 < isize) {
+        blob = malloc(isize);
+        if (NULL == blob) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            goto CLEANUP;
+        }
+        count = isize;
+        if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, blob, &count, OPAL_BYTE))) {
+            ORTE_ERROR_LOG(rc);
+            goto CLEANUP;
+        }
+    }
+
+    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
+                         "%s grpcomm:base:receive writing %d bytes of data for node %s, attribute %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         isize, nodename, attr));
+
+    nlen = (int32_t)strlen(nodename);
+    alen = (int32_t)strlen(attr);
+    if (0 != write_fully(profile_fd, &nlen, sizeof(nlen)) ||
+        0 != write_fully(profile_fd, nodename, nlen) ||
+        0 != write_fully(profile_fd, &alen, sizeof(alen)) ||
+        0 != write_fully(profile_fd, attr, alen) ||
+        0 != write_fully(profile_fd, &isize, sizeof(isize)) ||
+        0 != write_fully(profile_fd, blob, isize)) {
+        ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
+    }
+
+CLEANUP:
+    /* the unpacked strings and the blob belong to us - free(NULL) is a no-op */
+    free(nodename);
+    free(attr);
+    free(blob);
+
+    /* release the message */
+    OBJ_RELEASE(mev);
+}
+
+/*
+ * NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program.
+ * DO NOT RELEASE THIS BUFFER IN THIS CODE
+ */
+
+static void orte_grpcomm_base_recv(int status, orte_process_name_t* sender,
+                                   opal_buffer_t* buffer, orte_rml_tag_t tag,
+                                   void* cbdata)
+{
+    int rc;
+
+    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
+                         "%s grpcomm:base:receive got message from %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(sender)));
+
+    /* don't process this right away - we need to get out of the recv before
+     * we process the message as it may ask us to do something that involves
+     * more messaging! Instead, setup an event so that the message gets processed
+     * as soon as we leave the recv.
+     *
+     * The macro makes a copy of the buffer, which process_msg releases - the incoming
+     * buffer, however, is NOT released here, although its payload IS transferred
+     * to the message buffer for later processing
+     */
+    ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg);
+
+    /* reissue the recv */
+    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+                                                      ORTE_RML_TAG_GRPCOMM_PROFILE,
+                                                      ORTE_RML_NON_PERSISTENT,
+                                                      orte_grpcomm_base_recv,
+                                                      NULL))) {
+        ORTE_ERROR_LOG(rc);
+    }
+    return;
+}
+
diff --git a/orte/mca/grpcomm/basic/grpcomm_basic_module.c b/orte/mca/grpcomm/basic/grpcomm_basic_module.c index 8e46c403a1..441595aaf7 100644 --- a/orte/mca/grpcomm/basic/grpcomm_basic_module.c +++ b/orte/mca/grpcomm/basic/grpcomm_basic_module.c @@ -25,12 +25,16 @@ #ifdef HAVE_SYS_TIME_H #include #endif /* HAVE_SYS_TIME_H */ +#ifdef HAVE_SYS_STAT_H +#include +#endif +#include #include "opal/threads/condition.h" #include "opal/util/bit_ops.h" #include "opal/class/opal_hash_table.h" #include "opal/dss/dss.h" - +#include "opal/runtime/opal.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/ess.h" @@ -56,6 +60,7 @@ static int xcast(orte_jobid_t job, static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); static int barrier(void); static int modex(opal_list_t *procs); +static int set_proc_attr(const char *attr_name, const void *data, size_t size); /* Module def */ orte_grpcomm_base_module_t orte_grpcomm_basic_module = { @@ -65,23 +70,41 @@ orte_grpcomm_base_module_t orte_grpcomm_basic_module = { allgather, orte_grpcomm_base_allgather_list, barrier, - orte_grpcomm_base_set_proc_attr, + set_proc_attr, orte_grpcomm_base_get_proc_attr, modex, orte_grpcomm_base_purge_proc_attrs }; +static bool profile; + /** * Initialize the module */ static int init(void) { int rc; + int value; if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) { ORTE_ERROR_LOG(rc); } + + /* if we are profiling and I am the HNP, then start the + * profiling receive + */ + 
mca_base_param_reg_int_name("orte", "grpcomm_recv_on", + "Whether to turn on grpcomm recv", + false, false, (int)false, &value); + profile = OPAL_INT_TO_BOOL(value); + + if (profile && orte_process_info.hnp) { + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_comm_start())) { + ORTE_ERROR_LOG(rc); + } + } + return rc; } @@ -91,6 +114,13 @@ static int init(void) static void finalize(void) { orte_grpcomm_base_modex_finalize(); + + /* if we are profiling and I am the HNP, then stop the + * profiling receive + */ + if (profile && orte_process_info.hnp) { + orte_grpcomm_base_comm_stop(); + } } /** @@ -458,7 +488,7 @@ static int modex(opal_list_t *procs) orte_std_cntr_t i, num_procs; orte_std_cntr_t cnt; orte_process_name_t proc_name; - int rc; + int rc=ORTE_SUCCESS; int32_t arch; bool modex_reqd = false; @@ -466,21 +496,26 @@ static int modex(opal_list_t *procs) "%s grpcomm:basic: modex entered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* setup the buffer that will actually be sent */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - OBJ_CONSTRUCT(&rbuf, opal_buffer_t); - - /* put our process name in the buffer so it can be unpacked later */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* decide if we need to add the architecture to the modex. Check - * first to see if hetero is enabled - if not, then we clearly - * don't need to exchange arch's as they are all identical + /* if we were given a list of procs to modex with, then this is happening + * as part of a connect/accept operation. In this case, we -must- do the + * modex for two reasons: + * + * (a) the modex could involve procs from different mpiruns. In this case, + * there is no way for the two sets of procs to know which node the + * other procs are on, so we cannot use the profile_file to determine + * their contact info + * + * (b) in a comm_spawn, the parent job does not have a pidmap for the + * child job. 
Thus, it cannot know where the child procs are located, + * and cannot use the profile_file to determine their contact info */ - if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) { + if (NULL != procs || NULL == opal_profile_file || opal_profile) { + modex_reqd = true; + } else if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) { + /* decide if we need to add the architecture to the modex. Check + * first to see if hetero is enabled - if not, then we clearly + * don't need to exchange arch's as they are all identical + */ /* Case 1: If different apps in this job were built differently - e.g., some * are built 32-bit while others are built 64-bit - then we need to modex * regardless of any other consideration. The user is reqd to tell us via a @@ -502,19 +537,27 @@ static int modex(opal_list_t *procs) } if (modex_reqd) { + /* setup the buffer that will actually be sent */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + OBJ_CONSTRUCT(&rbuf, opal_buffer_t); + + /* put our process name in the buffer so it can be unpacked later */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.arch, 1, OPAL_UINT32))) { ORTE_ERROR_LOG(rc); goto cleanup; } - } - - /* pack the entries we have received */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf, &modex_reqd))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - if (modex_reqd) { + + /* pack the entries we have received */ + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf, &modex_reqd))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output, "%s grpcomm:basic:modex: executing allgather", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -563,36 +606,47 @@ static int modex(opal_list_t *procs) goto cleanup; } - if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) { - /* are the nodes hetero? 
*/ - if (orte_homogeneous_nodes) { - goto unpack_entries; - } - /* unpack its architecture */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &arch, &cnt, OPAL_UINT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* update the arch in the ESS */ + /* unpack its architecture */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &arch, &cnt, OPAL_UINT32))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + /* update the arch in the ESS + * RHC: DO NOT UPDATE ARCH IF THE PROC IS NOT IN OUR JOB. THIS IS A TEMPORARY + * FIX TO COMPENSATE FOR A PROBLEM IN THE CONNECT/ACCEPT CODE WHERE WE EXCHANGE + * INFO INCLUDING THE ARCH, BUT THEN DO A MODEX THAT ALSO INCLUDES THE ARCH. WE + * CANNOT UPDATE THE ARCH FOR JOBS OUTSIDE OUR OWN AS THE ESS HAS NO INFO ON + * THOSE PROCS/NODES - AND DOESN'T NEED IT AS THE MPI LAYER HAS ALREADY SET + * ITSELF UP AND DOES NOT NEED ESS SUPPORT FOR PROCS IN THE OTHER JOB + * + * EVENTUALLY, WE WILL SUPPORT THE ESS HAVING INFO ON OTHER JOBS FOR + * FAULT TOLERANCE PURPOSES - BUT NOT RIGHT NOW + */ + if (proc_name.jobid == ORTE_PROC_MY_NAME->jobid) { if (ORTE_SUCCESS != (rc = orte_ess.update_arch(&proc_name, arch))) { ORTE_ERROR_LOG(rc); goto cleanup; } } - unpack_entries: + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, + "%s grpcomm:basic:modex: adding modex entry for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc_name))); + /* update the modex database */ if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, &rbuf))) { ORTE_ERROR_LOG(rc); goto cleanup; } } + cleanup: + OBJ_DESTRUCT(&buf); + OBJ_DESTRUCT(&rbuf); } -cleanup: - OBJ_DESTRUCT(&buf); - OBJ_DESTRUCT(&rbuf); OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output, "%s grpcomm:basic: modex completed",
@@ -601,3 +655,222 @@ cleanup: return rc; }
+/* Read exactly len bytes from fd into buf, retrying after short reads.
+ * Returns 0 on success, 1 if end-of-file was reached before any byte
+ * was read, and -1 on a read error or a partial read.
+ */
+static int read_fully(int fd, void *buf, size_t len)
+{
+    char *dst = (char*)buf;
+    size_t got = 0;
+    ssize_t n;
+
+    while (got < len) {
+        n = read(fd, dst + got, len - got);
+        if (0 > n) {
+            return -1;
+        }
+        if (0 == n) {
+            return (0 == got) ? 1 : -1;
+        }
+        got += (size_t)n;
+    }
+    return 0;
+}
+
+/* Read one length-prefixed field of the profile file: an int32 byte
+ * count, stored in *len, followed by that many bytes, stored in buf.
+ * A count that is negative or larger than max_len is rejected so that
+ * a damaged file cannot overrun buf. Returns 0 on success, 1 on a clean
+ * end-of-file before the count, and -1 on any other failure.
+ */
+static int read_profile_field(int fd, char *buf, size_t max_len, int32_t *len)
+{
+    int ret;
+
+    if (0 != (ret = read_fully(fd, len, sizeof(*len)))) {
+        return ret;
+    }
+    if (0 > *len || (size_t)*len > max_len) {
+        return -1;
+    }
+    return (0 == read_fully(fd, buf, (size_t)*len)) ? 0 : -1;
+}
+
+/* the HNP will -never- execute the following as it is NOT an MPI process
+ *
+ * Record one of our own modex attributes. When profiling, the attribute
+ * is also sent to the HNP. When a profile file was supplied instead,
+ * the file is scanned for records carrying the same attribute name and
+ * the recorded data is loaded for every other proc in our job whose
+ * host matches the record's node name.
+ */
+static int set_proc_attr(const char *attr_name, const void *data, size_t size)
+{
+    struct stat buf;
+    int rc, ret;
+    int fd;
+    int32_t num_bytes;
+    char *nodename = NULL, *attr = NULL, *prochost;
+    char modex_data[8192];
+    orte_process_name_t name;
+    orte_vpid_t i;
+
+    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
+                         "%s grpcomm:basic:set_proc_attr for attribute %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), attr_name));
+
+    /* if we are doing a profile, pack this up and send it to the HNP */
+    if (opal_profile) {
+        opal_buffer_t buffer;
+        int32_t isize;
+
+        OBJ_CONSTRUCT(&buffer, opal_buffer_t);
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &orte_process_info.nodename, 1, OPAL_STRING))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &attr_name, 1, OPAL_STRING))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+        isize = size;
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &isize, 1, OPAL_INT32))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, data, isize, OPAL_BYTE))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+        if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_GRPCOMM_PROFILE, 0))) {
+            ORTE_ERROR_LOG(rc);
+        }
+    cleanup:
+        OBJ_DESTRUCT(&buffer);
+        /* let it fall through so that the job doesn't hang! */
+        return orte_grpcomm_base_set_proc_attr(attr_name, data, size);
+    }
+
+    /* we always have to set our own attributes in case they are needed for
+     * a connect/accept at some later time
+     */
+    rc = orte_grpcomm_base_set_proc_attr(attr_name, data, size);
+
+    /* if that failed, or if no profile file was provided, then we are done */
+    if (ORTE_SUCCESS != rc || NULL == opal_profile_file) {
+        return rc;
+    }
+
+    /* if the file was provided, then we need to check the file to see if
+     * info for this particular attribute is available there. But first,
+     * the file must be available
+     */
+    if (0 != stat(opal_profile_file, &buf)) {
+        orte_show_help("help-grpcomm-basic.txt", "grpcomm-basic:file-not-found", true, opal_profile_file);
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    fd = open(opal_profile_file, O_RDONLY);
+    if (fd < 0) {
+        orte_show_help("help-grpcomm-basic.txt", "grpcomm-basic:file-cant-open", true, opal_profile_file);
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    OPAL_OUTPUT_VERBOSE((10, orte_grpcomm_base_output,
+                         "%s grpcomm:basic:set_proc_attr reading %s file for attr %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), opal_profile_file, attr_name));
+
+    /* loop through file until end */
+    for (;;) {
+        /* node name - the last byte of the zeroed buffer is kept as the NUL */
+        memset(modex_data, 0, sizeof(modex_data));
+        ret = read_profile_field(fd, modex_data, sizeof(modex_data) - 1, &num_bytes);
+        if (1 == ret) {
+            /* no more records */
+            break;
+        }
+        if (0 != ret) {
+            opal_output(0, "%s: orte:grpcomm:basic: node name not found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+            rc = ORTE_ERR_NOT_FOUND;
+            goto done;
+        }
+        if (NULL == (nodename = strdup(modex_data))) {
+            rc = ORTE_ERR_OUT_OF_RESOURCE;
+            ORTE_ERROR_LOG(rc);
+            goto done;
+        }
+        OPAL_OUTPUT_VERBOSE((20, orte_grpcomm_base_output,
+                             "%s grpcomm:basic:set_proc_attr got nodename %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
+
+        /* attribute name */
+        memset(modex_data, 0, sizeof(modex_data));
+        if (0 != read_profile_field(fd, modex_data, sizeof(modex_data) - 1, &num_bytes)) {
+            opal_output(0, "%s: orte:grpcomm:basic: attribute name not found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+            rc = ORTE_ERR_NOT_FOUND;
+            goto done;
+        }
+        if (NULL == (attr = strdup(modex_data))) {
+            rc = ORTE_ERR_OUT_OF_RESOURCE;
+            ORTE_ERROR_LOG(rc);
+            goto done;
+        }
+        OPAL_OUTPUT_VERBOSE((20, orte_grpcomm_base_output,
+                             "%s grpcomm:basic:set_proc_attr got attribute %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), attr));
+
+        /* read the blob, wanted or not, so we are positioned at the next record */
+        if (0 != read_profile_field(fd, modex_data, sizeof(modex_data), &num_bytes)) {
+            opal_output(0, "%s: orte:grpcomm:basic: data not found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
+            rc = ORTE_ERR_NOT_FOUND;
+            goto done;
+        }
+
+        /* is this from the calling component? */
+        if (0 == strcmp(attr, attr_name)) {
+            /* lookup all procs on the given node */
+            name.jobid = ORTE_PROC_MY_NAME->jobid;
+            for (i=0; i < orte_process_info.num_procs; i++) {
+                name.vpid = i;
+                /* if this is me, just skip it - I loaded my info above */
+                if (ORTE_PROC_MY_NAME->vpid == name.vpid) {
+                    continue;
+                }
+                prochost = orte_ess.proc_get_hostname(&name);
+                if (NULL == prochost) {
+                    /* report error - unknown host */
+                    opal_output(0, "%s: orte:grpcomm:basic: host for proc %s not found",
+                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&name));
+                    rc = ORTE_ERR_NOT_FOUND;
+                    goto done;
+                }
+                OPAL_OUTPUT_VERBOSE((20, orte_grpcomm_base_output,
+                                     "%s grpcomm:basic:set_proc_attr checking node %s against %s",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename, prochost));
+                /* NOTE(review): this is a prefix match, so "node10" in the file
+                 * also matches a proc on "node1" - confirm that is intended
+                 */
+                if (0 == strncmp(nodename, prochost, strlen(prochost))) {
+                    /* on this host - load the data into the modex db */
+                    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_load_modex_data(&name, (char*)attr_name, modex_data, num_bytes))) {
+                        ORTE_ERROR_LOG(rc);
+                        goto done;
+                    }
+                }
+            }
+        }
+
+        free(nodename);
+        nodename = NULL;
+        free(attr);
+        attr = NULL;
+    }
+
+done:
+    /* everything past the open() leaves through here so nothing leaks */
+    free(nodename);
+    free(attr);
+    close(fd);
+    return rc;
+}
diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index 0aee35cb6e..1cf6929adc 100644 --- 
a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -106,6 +106,9 @@ BEGIN_C_DECLS /* debugger release */ #define ORTE_RML_TAG_DEBUGGER_RELEASE 32 +/* profile data */ +#define ORTE_RML_TAG_GRPCOMM_PROFILE 33 + #define ORTE_RML_TAG_MAX 100 diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index c5bf8e9c85..771a41e5f3 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -224,6 +224,13 @@ int orte_daemon(int argc, char *argv[]) */ mca_base_cmd_line_process_args(cmd_line, &environ, &environ); + /* make sure that opal_profile is -not- set as we do not care + * what frameworks are opened by the daemons + */ + if (NULL != getenv("OMPI_MCA_opal_profile")) { + putenv("OMPI_MCA_opal_profile=0"); + } + /* Ensure that enough of OPAL is setup for us to be able to run */ /* * NOTE: (JJH) diff --git a/orte/test/system/Makefile b/orte/test/system/Makefile index d1fcb3aef4..7f46ac7677 100644 --- a/orte/test/system/Makefile +++ b/orte/test/system/Makefile @@ -1,4 +1,4 @@ -PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix +PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier all: $(PROGS) diff --git a/orte/test/system/orte_barrier.c b/orte/test/system/orte_barrier.c new file mode 100644 index 0000000000..87887f7b18 --- /dev/null +++ b/orte/test/system/orte_barrier.c @@ -0,0 +1,28 @@ +/* -*- C -*- + * + * $HEADER$ + * + * The most basic of applications + */ + +#include + +#include "orte/runtime/runtime.h" +#include "orte/mca/grpcomm/grpcomm.h" + +int main(int argc, char* argv[]) +{ + if (ORTE_SUCCESS != orte_init(ORTE_NON_TOOL)) { + fprintf(stderr, "Failed orte_init\n"); + exit(1); + } + +orte_grpcomm.barrier(); +orte_grpcomm.barrier(); + + if 
(ORTE_SUCCESS != orte_finalize()) { + fprintf(stderr, "Failed orte_finalize\n"); + exit(1); + } + return 0; +} diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index f4843859e9..df45d2118c 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -114,6 +114,7 @@ static char *ompi_server=NULL; static opal_event_t *abort_exit_event=NULL; static bool forcibly_die = false; static opal_event_t *timeout_ev=NULL; +static bool profile_is_set = false; /* * Globals @@ -371,6 +372,35 @@ int orterun(int argc, char *argv[]) return rc; } + /* + * Since this process can now handle MCA/GMCA parameters, make sure to + * process them. + */ + mca_base_cmd_line_process_args(&cmd_line, &environ, &environ); + + /* make sure that opal_profile is -not- set for us locally as + * we really only want to profile MPI apps. However, if it is + * set, remember it so we can add it to the apps environment later + */ + if (NULL != getenv("OMPI_MCA_opal_profile")) { + putenv("OMPI_MCA_opal_profile=0"); + profile_is_set = true; + /* ensure that I know to turn on my profile receive! */ + putenv("OMPI_MCA_orte_grpcomm_recv_on=1"); + } + + /* Ensure that enough of OPAL is setup for us to be able to run */ + /* + * NOTE: (JJH) + * We need to allow 'mca_base_cmd_line_process_args()' to process command + * line arguments *before* calling opal_init_util() since the command + * line could contain MCA parameters that affect the way opal_init_util() + * functions. AMCA parameters are one such option normally received on the + * command line that affect the way opal_init_util() behaves. + * It is "safe" to call mca_base_cmd_line_process_args() before + * opal_init_util() since mca_base_cmd_line_process_args() does *not* + * depend upon opal_init_util() functionality. 
+ */ /* Need to initialize OPAL so that install_dirs are filled in */ /* * NOTE: (JJH) @@ -1628,24 +1658,14 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, free(param); } } - + /* if profile was set, add it back in */ + if (profile_is_set) { + opal_setenv("OMPI_MCA_opal_profile", "1", true, &app->env); + } + /* add the ompi-server, if provided */ if (NULL != ompi_server) { - bool found_serv = false; - asprintf(&param, "OMPI_MCA_pubsub_orte_server=%s", ompi_server); - /* this shouldn't exist, but if it does... */ - for (i=0; i < opal_argv_count(app->env); i++) { - if (0 == strcmp(param, app->env[i])) { - free(app->env[i]); - app->env[i] = strdup(param); - found_serv = true; - break; - } - } - if (!found_serv) { - opal_argv_append_nosize(&app->env, param); /* add it */ - } - free(param); + opal_setenv("OMPI_MCA_pubsub_orte_server", ompi_server, true, &app->env); } /* Did the user request to export any environment variables? */