Begin instrumenting for scalability tests.
I have added a new MCA param (hey, you can't have too many!) called OMPI_MCA_orte_timing. If set to anything other than zero, the system will report out critical timing loops. At the moment, this includes three measurements: 1. Time spent going through the RDS->RAS->RMAPS, setting up triggers, etc. prior to calling the actual PLS launch function. This is reported out as time to setup job. 2. Time spent in MPI_Init from start of that function (well, right after opal_init) to the place where we send all of our info the registry. Reported out as time from start to exec_compound_cmd 3. Time actually spent executing the compound cmd. Reported out as time to exec_compound_cmd. A few additional timing points will be added shortly. These may eventually be removed or (better) setup with a conditional compile flag. This commit was SVN r11892.
Этот коммит содержится в:
родитель
db6a93fa63
Коммит
0411f9772e
@ -19,6 +19,10 @@
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/paffinity/base/base.h"
|
||||
@ -202,6 +206,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
size_t nprocs;
|
||||
char *error = NULL;
|
||||
bool compound_cmd = false;
|
||||
bool timing = false;
|
||||
int param, value;
|
||||
struct timeval ompistart, ompistop;
|
||||
|
||||
/* Join the run-time environment - do the things that don't hit
|
||||
the registry */
|
||||
@ -211,6 +218,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* check to see if we want timing information */
|
||||
param = mca_base_param_reg_int_name("orte", "timing",
|
||||
"Request that critical timing loops be measured",
|
||||
false, false, 0, &value);
|
||||
if (value != 0) {
|
||||
timing = true;
|
||||
if (0 != gettimeofday(&ompistart, NULL)) {
|
||||
opal_output(0, "ompi_mpi_init: could not obtain start time");
|
||||
ompistart.tv_sec = 0;
|
||||
ompistart.tv_usec = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Setup ORTE stage 1, note that we are not infrastructre */
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_init_stage1(false))) {
|
||||
@ -481,6 +501,22 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* check for timing request - get stop time and report elapsed time if so */
|
||||
if (timing) {
|
||||
if (0 != gettimeofday(&ompistop, NULL)) {
|
||||
opal_output(0, "ompi_mpi_init: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "ompi_mpi_init: time from start to exec_compound_cmd %ld sec %ld usec",
|
||||
(long int)(ompistop.tv_sec - ompistart.tv_sec),
|
||||
(long int)(ompistop.tv_usec - ompistart.tv_usec));
|
||||
if (0 != gettimeofday(&ompistart, NULL)) {
|
||||
opal_output(0, "ompi_mpi_init: could not obtain new start time");
|
||||
ompistart.tv_sec = ompistop.tv_sec;
|
||||
ompistart.tv_usec = ompistop.tv_usec;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* if the compound command is operative, execute it */
|
||||
|
||||
if (compound_cmd) {
|
||||
@ -491,7 +527,18 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
}
|
||||
}
|
||||
|
||||
/* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */
|
||||
/* check for timing request - get stop time and report elapsed time if so */
|
||||
if (timing) {
|
||||
if (0 != gettimeofday(&ompistop, NULL)) {
|
||||
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
||||
} else {
|
||||
opal_output(0, "ompi_mpi_init: time to exec_compound_cmd %ld sec %ld usec",
|
||||
(long int)(ompistop.tv_sec - ompistart.tv_sec),
|
||||
(long int)(ompistop.tv_usec - ompistart.tv_usec));
|
||||
}
|
||||
}
|
||||
|
||||
/* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL,
|
||||
orte_gpr.deliver_notify_msg, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
|
@ -30,6 +30,7 @@
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rds/rds.h"
|
||||
@ -275,9 +276,19 @@ static int orte_rmgr_urm_spawn_job(
|
||||
{
|
||||
int rc;
|
||||
orte_process_name_t* name;
|
||||
struct timeval urmstart, urmstop;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* check for timing request - get start time if so */
|
||||
if (mca_rmgr_urm_component.timing) {
|
||||
if (0 != gettimeofday(&urmstart, NULL)) {
|
||||
opal_output(0, "rmgr_urm: could not obtain start time");
|
||||
urmstart.tv_sec = 0;
|
||||
urmstart.tv_usec = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform resource discovery.
|
||||
*/
|
||||
@ -305,7 +316,7 @@ static int orte_rmgr_urm_spawn_job(
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* setup I/O forwarding
|
||||
*/
|
||||
|
||||
@ -368,7 +379,18 @@ static int orte_rmgr_urm_spawn_job(
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
/* check for timing request - get stop time and report elapsed time if so */
|
||||
if (mca_rmgr_urm_component.timing) {
|
||||
if (0 != gettimeofday(&urmstop, NULL)) {
|
||||
opal_output(0, "rmgr_urm: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "rmgr_urm: job setup time is %ld sec %ld usec",
|
||||
(long int)(urmstop.tv_sec - urmstart.tv_sec),
|
||||
(long int)(urmstop.tv_usec - urmstart.tv_usec));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* launch the job
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.launch_job(*jobid))) {
|
||||
|
@ -38,6 +38,8 @@ struct orte_rmgr_urm_component_t {
|
||||
orte_rmgr_base_component_t super;
|
||||
/** Has RDS query been called */
|
||||
bool urm_rds;
|
||||
/* timing tests requested */
|
||||
bool timing;
|
||||
};
|
||||
/** Convenience typedef */
|
||||
typedef struct orte_rmgr_urm_component_t orte_rmgr_urm_component_t;
|
||||
|
@ -83,11 +83,22 @@ static int orte_rmgr_urm_open(void)
|
||||
|
||||
static orte_rmgr_base_module_t *orte_rmgr_urm_init(int* priority)
|
||||
{
|
||||
int param, value;
|
||||
|
||||
/* if we are NOT an HNP, then we do NOT want to be selected */
|
||||
if(!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
param = mca_base_param_reg_int_name("orte", "timing",
|
||||
"Request that critical timing loops be measured",
|
||||
false, false, 0, &value);
|
||||
if (value != 0) {
|
||||
mca_rmgr_urm_component.timing = true;
|
||||
} else {
|
||||
mca_rmgr_urm_component.timing = false;
|
||||
}
|
||||
|
||||
/* volunteer to be selected */
|
||||
*priority = 100;
|
||||
return &orte_rmgr_urm_module;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user