From 0411f9772e18d6c87999d1345d1f7ca8137984dd Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 29 Sep 2006 13:19:44 +0000 Subject: [PATCH] Begin instrumenting for scalability tests. I have added a new MCA param (hey, you can't have too many!) called OMPI_MCA_orte_timing. If set to anything other than zero, the system will report out critical timing loops. At the moment, this includes three measurements: 1. Time spent going through the RDS->RAS->RMAPS, setting up triggers, etc. prior to calling the actual PLS launch function. This is reported out as time to setup job. 2. Time spent in MPI_Init from start of that function (well, right after opal_init) to the place where we send all of our info the registry. Reported out as time from start to exec_compound_cmd 3. Time actually spent executing the compound cmd. Reported out as time to exec_compound_cmd. A few additional timing points will be added shortly. These may eventually be removed or (better) setup with a conditional compile flag. This commit was SVN r11892. --- ompi/runtime/ompi_mpi_init.c | 49 +++++++++++++++++++++++++- orte/mca/rmgr/urm/rmgr_urm.c | 26 ++++++++++++-- orte/mca/rmgr/urm/rmgr_urm.h | 2 ++ orte/mca/rmgr/urm/rmgr_urm_component.c | 11 ++++++ 4 files changed, 85 insertions(+), 3 deletions(-) diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 6e222d3375..d76e61646f 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -19,6 +19,10 @@ #include "ompi_config.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif /* HAVE_SYS_TIME_H */ + #include "mpi.h" #include "opal/mca/base/base.h" #include "opal/mca/paffinity/base/base.h" @@ -202,6 +206,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) size_t nprocs; char *error = NULL; bool compound_cmd = false; + bool timing = false; + int param, value; + struct timeval ompistart, ompistop; /* Join the run-time environment - do the things that don't hit the registry */ @@ -211,6 +218,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) goto error; } + /* check to see if we want timing information */ + param = mca_base_param_reg_int_name("orte", "timing", + "Request that critical timing loops be measured", + false, false, 0, &value); + if (value != 0) { + timing = true; + if (0 != gettimeofday(&ompistart, NULL)) { + opal_output(0, "ompi_mpi_init: could not obtain start time"); + ompistart.tv_sec = 0; + ompistart.tv_usec = 0; + } + } + /* Setup ORTE stage 1, note that we are not infrastructre */ if (ORTE_SUCCESS != (ret = orte_init_stage1(false))) { @@ -481,6 +501,22 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) goto error; } + /* check for timing request - get stop time and report elapsed time if so */ + if (timing) { + if (0 != gettimeofday(&ompistop, NULL)) { + opal_output(0, "ompi_mpi_init: could not obtain stop time"); + } else { + opal_output(0, "ompi_mpi_init: time from start to exec_compound_cmd %ld sec %ld usec", + (long int)(ompistop.tv_sec - ompistart.tv_sec), + (long int)(ompistop.tv_usec - ompistart.tv_usec)); + if (0 != gettimeofday(&ompistart, NULL)) { + opal_output(0, "ompi_mpi_init: could not obtain new start time"); + ompistart.tv_sec = ompistop.tv_sec; + ompistart.tv_usec = ompistop.tv_usec; + } + } + } + /* if the compound command is operative, execute it */ if (compound_cmd) { @@ -491,7 +527,18 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) } } - /* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */ + /* check for timing request - get stop time and report elapsed time if so */ + if (timing) { + if (0 != gettimeofday(&ompistop, NULL)) { + opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd"); + } else { + opal_output(0, "ompi_mpi_init: time to exec_compound_cmd %ld sec %ld usec", + (long int)(ompistop.tv_sec - ompistart.tv_sec), + (long int)(ompistop.tv_usec - ompistart.tv_usec)); + } + } + + /* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */ if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL, orte_gpr.deliver_notify_msg, NULL))) { ORTE_ERROR_LOG(ret); diff --git a/orte/mca/rmgr/urm/rmgr_urm.c b/orte/mca/rmgr/urm/rmgr_urm.c index 96a0253810..2d71e5bf2c 100644 --- a/orte/mca/rmgr/urm/rmgr_urm.c +++ b/orte/mca/rmgr/urm/rmgr_urm.c @@ -30,6 +30,7 @@ #endif /* HAVE_STRING_H */ #include "opal/util/trace.h" +#include "opal/util/output.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rds/rds.h" @@ -275,9 +276,19 @@ static int orte_rmgr_urm_spawn_job( { int rc; orte_process_name_t* name; + struct timeval urmstart, urmstop; OPAL_TRACE(1); + /* check for timing request - get start time if so */ + if (mca_rmgr_urm_component.timing) { + if (0 != gettimeofday(&urmstart, NULL)) { + opal_output(0, "rmgr_urm: could not obtain start time"); + urmstart.tv_sec = 0; + urmstart.tv_usec = 0; + } + } + /* * Perform resource discovery. */ @@ -305,7 +316,7 @@ static int orte_rmgr_urm_spawn_job( return rc; } - /* + /* * setup I/O forwarding */ @@ -368,7 +379,18 @@ static int orte_rmgr_urm_spawn_job( } } - /* + /* check for timing request - get stop time and report elapsed time if so */ + if (mca_rmgr_urm_component.timing) { + if (0 != gettimeofday(&urmstop, NULL)) { + opal_output(0, "rmgr_urm: could not obtain stop time"); + } else { + opal_output(0, "rmgr_urm: job setup time is %ld sec %ld usec", + (long int)(urmstop.tv_sec - urmstart.tv_sec), + (long int)(urmstop.tv_usec - urmstart.tv_usec)); + } + } + + /* * launch the job */ if (ORTE_SUCCESS != (rc = orte_pls.launch_job(*jobid))) { diff --git a/orte/mca/rmgr/urm/rmgr_urm.h b/orte/mca/rmgr/urm/rmgr_urm.h index cd966e08f4..14408458eb 100644 --- a/orte/mca/rmgr/urm/rmgr_urm.h +++ b/orte/mca/rmgr/urm/rmgr_urm.h @@ -38,6 +38,8 @@ struct orte_rmgr_urm_component_t { orte_rmgr_base_component_t super; /** Has RDS query been called */ bool urm_rds; + /* timing tests requested */ + bool timing; }; /** Convenience typedef */ typedef struct orte_rmgr_urm_component_t orte_rmgr_urm_component_t; diff --git a/orte/mca/rmgr/urm/rmgr_urm_component.c b/orte/mca/rmgr/urm/rmgr_urm_component.c index 8be72732a1..d295373e63 100644 --- a/orte/mca/rmgr/urm/rmgr_urm_component.c +++ b/orte/mca/rmgr/urm/rmgr_urm_component.c @@ -83,11 +83,22 @@ static int orte_rmgr_urm_open(void) static orte_rmgr_base_module_t *orte_rmgr_urm_init(int* priority) { + int param, value; + /* if we are NOT an HNP, then we do NOT want to be selected */ if(!orte_process_info.seed) { return NULL; } + param = mca_base_param_reg_int_name("orte", "timing", + "Request that critical timing loops be measured", + false, false, 0, &value); + if (value != 0) { + mca_rmgr_urm_component.timing = true; + } else { + mca_rmgr_urm_component.timing = false; + } + /* volunteer to be selected */ *priority = 100; return &orte_rmgr_urm_module;