Begin instrumenting for scalability tests.
I have added a new MCA param (hey, you can't have too many!) called OMPI_MCA_orte_timing. If set to anything other than zero, the system will report the time spent in critical timing loops. At the moment, this includes three measurements: 1. Time spent going through the RDS->RAS->RMAPS, setting up triggers, etc. prior to calling the actual PLS launch function. This is reported out as time to setup job. 2. Time spent in MPI_Init from the start of that function (well, right after opal_init) to the place where we send all of our info to the registry. Reported out as time from start to exec_compound_cmd. 3. Time actually spent executing the compound cmd. Reported out as time to exec_compound_cmd. A few additional timing points will be added shortly. These may eventually be removed or (better) set up with a conditional compile flag. This commit was SVN r11892.
Этот коммит содержится в:
родитель
db6a93fa63
Коммит
0411f9772e
@ -19,6 +19,10 @@
|
|||||||
|
|
||||||
#include "ompi_config.h"
|
#include "ompi_config.h"
|
||||||
|
|
||||||
|
#ifdef HAVE_SYS_TIME_H
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif /* HAVE_SYS_TIME_H */
|
||||||
|
|
||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
#include "opal/mca/paffinity/base/base.h"
|
#include "opal/mca/paffinity/base/base.h"
|
||||||
@ -202,6 +206,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
size_t nprocs;
|
size_t nprocs;
|
||||||
char *error = NULL;
|
char *error = NULL;
|
||||||
bool compound_cmd = false;
|
bool compound_cmd = false;
|
||||||
|
bool timing = false;
|
||||||
|
int param, value;
|
||||||
|
struct timeval ompistart, ompistop;
|
||||||
|
|
||||||
/* Join the run-time environment - do the things that don't hit
|
/* Join the run-time environment - do the things that don't hit
|
||||||
the registry */
|
the registry */
|
||||||
@ -211,6 +218,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check to see if we want timing information */
|
||||||
|
param = mca_base_param_reg_int_name("orte", "timing",
|
||||||
|
"Request that critical timing loops be measured",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
timing = true;
|
||||||
|
if (0 != gettimeofday(&ompistart, NULL)) {
|
||||||
|
opal_output(0, "ompi_mpi_init: could not obtain start time");
|
||||||
|
ompistart.tv_sec = 0;
|
||||||
|
ompistart.tv_usec = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Setup ORTE stage 1, note that we are not infrastructre */
|
/* Setup ORTE stage 1, note that we are not infrastructre */
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_init_stage1(false))) {
|
if (ORTE_SUCCESS != (ret = orte_init_stage1(false))) {
|
||||||
@ -481,6 +501,22 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
|
if (timing) {
|
||||||
|
if (0 != gettimeofday(&ompistop, NULL)) {
|
||||||
|
opal_output(0, "ompi_mpi_init: could not obtain stop time");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "ompi_mpi_init: time from start to exec_compound_cmd %ld sec %ld usec",
|
||||||
|
(long int)(ompistop.tv_sec - ompistart.tv_sec),
|
||||||
|
(long int)(ompistop.tv_usec - ompistart.tv_usec));
|
||||||
|
if (0 != gettimeofday(&ompistart, NULL)) {
|
||||||
|
opal_output(0, "ompi_mpi_init: could not obtain new start time");
|
||||||
|
ompistart.tv_sec = ompistop.tv_sec;
|
||||||
|
ompistart.tv_usec = ompistop.tv_usec;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* if the compound command is operative, execute it */
|
/* if the compound command is operative, execute it */
|
||||||
|
|
||||||
if (compound_cmd) {
|
if (compound_cmd) {
|
||||||
@ -491,6 +527,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
|
if (timing) {
|
||||||
|
if (0 != gettimeofday(&ompistop, NULL)) {
|
||||||
|
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "ompi_mpi_init: time to exec_compound_cmd %ld sec %ld usec",
|
||||||
|
(long int)(ompistop.tv_sec - ompistart.tv_sec),
|
||||||
|
(long int)(ompistop.tv_usec - ompistart.tv_usec));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */
|
/* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */
|
||||||
if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL,
|
if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL,
|
||||||
orte_gpr.deliver_notify_msg, NULL))) {
|
orte_gpr.deliver_notify_msg, NULL))) {
|
||||||
|
@ -30,6 +30,7 @@
|
|||||||
#endif /* HAVE_STRING_H */
|
#endif /* HAVE_STRING_H */
|
||||||
|
|
||||||
#include "opal/util/trace.h"
|
#include "opal/util/trace.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rds/rds.h"
|
#include "orte/mca/rds/rds.h"
|
||||||
@ -275,9 +276,19 @@ static int orte_rmgr_urm_spawn_job(
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_process_name_t* name;
|
orte_process_name_t* name;
|
||||||
|
struct timeval urmstart, urmstop;
|
||||||
|
|
||||||
OPAL_TRACE(1);
|
OPAL_TRACE(1);
|
||||||
|
|
||||||
|
/* check for timing request - get start time if so */
|
||||||
|
if (mca_rmgr_urm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&urmstart, NULL)) {
|
||||||
|
opal_output(0, "rmgr_urm: could not obtain start time");
|
||||||
|
urmstart.tv_sec = 0;
|
||||||
|
urmstart.tv_usec = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Perform resource discovery.
|
* Perform resource discovery.
|
||||||
*/
|
*/
|
||||||
@ -368,6 +379,17 @@ static int orte_rmgr_urm_spawn_job(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
|
if (mca_rmgr_urm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&urmstop, NULL)) {
|
||||||
|
opal_output(0, "rmgr_urm: could not obtain stop time");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "rmgr_urm: job setup time is %ld sec %ld usec",
|
||||||
|
(long int)(urmstop.tv_sec - urmstart.tv_sec),
|
||||||
|
(long int)(urmstop.tv_usec - urmstart.tv_usec));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* launch the job
|
* launch the job
|
||||||
*/
|
*/
|
||||||
|
@ -38,6 +38,8 @@ struct orte_rmgr_urm_component_t {
|
|||||||
orte_rmgr_base_component_t super;
|
orte_rmgr_base_component_t super;
|
||||||
/** Has RDS query been called */
|
/** Has RDS query been called */
|
||||||
bool urm_rds;
|
bool urm_rds;
|
||||||
|
/* timing tests requested */
|
||||||
|
bool timing;
|
||||||
};
|
};
|
||||||
/** Convenience typedef */
|
/** Convenience typedef */
|
||||||
typedef struct orte_rmgr_urm_component_t orte_rmgr_urm_component_t;
|
typedef struct orte_rmgr_urm_component_t orte_rmgr_urm_component_t;
|
||||||
|
@ -83,11 +83,22 @@ static int orte_rmgr_urm_open(void)
|
|||||||
|
|
||||||
static orte_rmgr_base_module_t *orte_rmgr_urm_init(int* priority)
|
static orte_rmgr_base_module_t *orte_rmgr_urm_init(int* priority)
|
||||||
{
|
{
|
||||||
|
int param, value;
|
||||||
|
|
||||||
/* if we are NOT an HNP, then we do NOT want to be selected */
|
/* if we are NOT an HNP, then we do NOT want to be selected */
|
||||||
if(!orte_process_info.seed) {
|
if(!orte_process_info.seed) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
param = mca_base_param_reg_int_name("orte", "timing",
|
||||||
|
"Request that critical timing loops be measured",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
mca_rmgr_urm_component.timing = true;
|
||||||
|
} else {
|
||||||
|
mca_rmgr_urm_component.timing = false;
|
||||||
|
}
|
||||||
|
|
||||||
/* volunteer to be selected */
|
/* volunteer to be selected */
|
||||||
*priority = 100;
|
*priority = 100;
|
||||||
return &orte_rmgr_urm_module;
|
return &orte_rmgr_urm_module;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user