1
1

Begin instrumenting for scalability tests.

I have added a new MCA param (hey, you can't have too many!) called OMPI_MCA_orte_timing. If set to anything other than zero, the system will report out critical timing loops. At the moment, this includes three measurements:

1. Time spent going through the RDS->RAS->RMAPS, setting up triggers, etc. prior to calling the actual PLS launch function. This is reported out as time to setup job.

2. Time spent in MPI_Init from start of that function (well, right after opal_init) to the place where we send all of our info the registry. Reported out as time from start to exec_compound_cmd

3. Time actually spent executing the compound cmd. Reported out as time to exec_compound_cmd.

A few additional timing points will be added shortly.

These may eventually be removed or (better) setup with a conditional compile flag.

This commit was SVN r11892.
Этот коммит содержится в:
Ralph Castain 2006-09-29 13:19:44 +00:00
родитель db6a93fa63
Коммит 0411f9772e
4 изменённых файлов: 85 добавлений и 3 удалений

Просмотреть файл

@ -19,6 +19,10 @@
#include "ompi_config.h" #include "ompi_config.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "mpi.h" #include "mpi.h"
#include "opal/mca/base/base.h" #include "opal/mca/base/base.h"
#include "opal/mca/paffinity/base/base.h" #include "opal/mca/paffinity/base/base.h"
@ -202,6 +206,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
size_t nprocs; size_t nprocs;
char *error = NULL; char *error = NULL;
bool compound_cmd = false; bool compound_cmd = false;
bool timing = false;
int param, value;
struct timeval ompistart, ompistop;
/* Join the run-time environment - do the things that don't hit /* Join the run-time environment - do the things that don't hit
the registry */ the registry */
@ -211,6 +218,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
goto error; goto error;
} }
/* check to see if we want timing information */
param = mca_base_param_reg_int_name("orte", "timing",
"Request that critical timing loops be measured",
false, false, 0, &value);
if (value != 0) {
timing = true;
if (0 != gettimeofday(&ompistart, NULL)) {
opal_output(0, "ompi_mpi_init: could not obtain start time");
ompistart.tv_sec = 0;
ompistart.tv_usec = 0;
}
}
/* Setup ORTE stage 1, note that we are not infrastructre */ /* Setup ORTE stage 1, note that we are not infrastructre */
if (ORTE_SUCCESS != (ret = orte_init_stage1(false))) { if (ORTE_SUCCESS != (ret = orte_init_stage1(false))) {
@ -481,6 +501,22 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
goto error; goto error;
} }
/* check for timing request - get stop time and report elapsed time if so */
if (timing) {
if (0 != gettimeofday(&ompistop, NULL)) {
opal_output(0, "ompi_mpi_init: could not obtain stop time");
} else {
opal_output(0, "ompi_mpi_init: time from start to exec_compound_cmd %ld sec %ld usec",
(long int)(ompistop.tv_sec - ompistart.tv_sec),
(long int)(ompistop.tv_usec - ompistart.tv_usec));
if (0 != gettimeofday(&ompistart, NULL)) {
opal_output(0, "ompi_mpi_init: could not obtain new start time");
ompistart.tv_sec = ompistop.tv_sec;
ompistart.tv_usec = ompistop.tv_usec;
}
}
}
/* if the compound command is operative, execute it */ /* if the compound command is operative, execute it */
if (compound_cmd) { if (compound_cmd) {
@ -491,7 +527,18 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
} }
} }
/* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */ /* check for timing request - get stop time and report elapsed time if so */
if (timing) {
if (0 != gettimeofday(&ompistop, NULL)) {
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
} else {
opal_output(0, "ompi_mpi_init: time to exec_compound_cmd %ld sec %ld usec",
(long int)(ompistop.tv_sec - ompistart.tv_sec),
(long int)(ompistop.tv_usec - ompistart.tv_usec));
}
}
/* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */
if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL, if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL,
orte_gpr.deliver_notify_msg, NULL))) { orte_gpr.deliver_notify_msg, NULL))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);

Просмотреть файл

@ -30,6 +30,7 @@
#endif /* HAVE_STRING_H */ #endif /* HAVE_STRING_H */
#include "opal/util/trace.h" #include "opal/util/trace.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rds/rds.h" #include "orte/mca/rds/rds.h"
@ -275,9 +276,19 @@ static int orte_rmgr_urm_spawn_job(
{ {
int rc; int rc;
orte_process_name_t* name; orte_process_name_t* name;
struct timeval urmstart, urmstop;
OPAL_TRACE(1); OPAL_TRACE(1);
/* check for timing request - get start time if so */
if (mca_rmgr_urm_component.timing) {
if (0 != gettimeofday(&urmstart, NULL)) {
opal_output(0, "rmgr_urm: could not obtain start time");
urmstart.tv_sec = 0;
urmstart.tv_usec = 0;
}
}
/* /*
* Perform resource discovery. * Perform resource discovery.
*/ */
@ -305,7 +316,7 @@ static int orte_rmgr_urm_spawn_job(
return rc; return rc;
} }
/* /*
* setup I/O forwarding * setup I/O forwarding
*/ */
@ -368,7 +379,18 @@ static int orte_rmgr_urm_spawn_job(
} }
} }
/* /* check for timing request - get stop time and report elapsed time if so */
if (mca_rmgr_urm_component.timing) {
if (0 != gettimeofday(&urmstop, NULL)) {
opal_output(0, "rmgr_urm: could not obtain stop time");
} else {
opal_output(0, "rmgr_urm: job setup time is %ld sec %ld usec",
(long int)(urmstop.tv_sec - urmstart.tv_sec),
(long int)(urmstop.tv_usec - urmstart.tv_usec));
}
}
/*
* launch the job * launch the job
*/ */
if (ORTE_SUCCESS != (rc = orte_pls.launch_job(*jobid))) { if (ORTE_SUCCESS != (rc = orte_pls.launch_job(*jobid))) {

Просмотреть файл

@ -38,6 +38,8 @@ struct orte_rmgr_urm_component_t {
orte_rmgr_base_component_t super; orte_rmgr_base_component_t super;
/** Has RDS query been called */ /** Has RDS query been called */
bool urm_rds; bool urm_rds;
/* timing tests requested */
bool timing;
}; };
/** Convenience typedef */ /** Convenience typedef */
typedef struct orte_rmgr_urm_component_t orte_rmgr_urm_component_t; typedef struct orte_rmgr_urm_component_t orte_rmgr_urm_component_t;

Просмотреть файл

@ -83,11 +83,22 @@ static int orte_rmgr_urm_open(void)
static orte_rmgr_base_module_t *orte_rmgr_urm_init(int* priority) static orte_rmgr_base_module_t *orte_rmgr_urm_init(int* priority)
{ {
int param, value;
/* if we are NOT an HNP, then we do NOT want to be selected */ /* if we are NOT an HNP, then we do NOT want to be selected */
if(!orte_process_info.seed) { if(!orte_process_info.seed) {
return NULL; return NULL;
} }
param = mca_base_param_reg_int_name("orte", "timing",
"Request that critical timing loops be measured",
false, false, 0, &value);
if (value != 0) {
mca_rmgr_urm_component.timing = true;
} else {
mca_rmgr_urm_component.timing = false;
}
/* volunteer to be selected */ /* volunteer to be selected */
*priority = 100; *priority = 100;
return &orte_rmgr_urm_module; return &orte_rmgr_urm_module;