Add some additional timing reporting:
1. Added reporting points around the xcasts in MPI_Init. Note that these times will include time spent waiting for a trigger to fire, which is why the times between stage gates did NOT include these times initially. The inter-stage-gate times still do NOT include the xcast time - the xcast time is reported separately. 2. Added the process vpid on the MPI_Init timing reports for clarity. 3. Added a report from the xcast function on the HNP that outputs the number of bytes in the message being sent to the processes. This commit was SVN r12422.
Этот коммит содержится в:
родитель
bc93583184
Коммит
60e27c77e7
@ -208,7 +208,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
bool compound_cmd = false;
|
bool compound_cmd = false;
|
||||||
bool timing = false;
|
bool timing = false;
|
||||||
int param, value;
|
int param, value;
|
||||||
struct timeval ompistart, ompistop, stg2start, stg3start;
|
struct timeval ompistart, ompistop;
|
||||||
|
|
||||||
/* Join the run-time environment - do the things that don't hit
|
/* Join the run-time environment - do the things that don't hit
|
||||||
the registry */
|
the registry */
|
||||||
@ -224,11 +224,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
false, false, 0, &value);
|
false, false, 0, &value);
|
||||||
if (value != 0) {
|
if (value != 0) {
|
||||||
timing = true;
|
timing = true;
|
||||||
if (0 != gettimeofday(&ompistart, NULL)) {
|
gettimeofday(&ompistart, NULL);
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain start time");
|
|
||||||
ompistart.tv_sec = 0;
|
|
||||||
ompistart.tv_usec = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Setup ORTE stage 1, note that we are not infrastructre */
|
/* Setup ORTE stage 1, note that we are not infrastructre */
|
||||||
@ -265,6 +261,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
|
if (timing) {
|
||||||
|
gettimeofday(&ompistop, NULL);
|
||||||
|
opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of orte_init %ld usec",
|
||||||
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
|
gettimeofday(&ompistart, NULL);
|
||||||
|
}
|
||||||
/* Once we've joined the RTE, see if any MCA parameters were
|
/* Once we've joined the RTE, see if any MCA parameters were
|
||||||
passed to the MPI level */
|
passed to the MPI level */
|
||||||
|
|
||||||
@ -503,18 +508,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
|
|
||||||
/* check for timing request - get stop time and report elapsed time if so */
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
if (timing) {
|
if (timing) {
|
||||||
if (0 != gettimeofday(&ompistop, NULL)) {
|
gettimeofday(&ompistop, NULL);
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain stop time");
|
opal_output(0, "ompi_mpi_init[%ld]: time from completion of orte_init to exec_compound_cmd %ld usec",
|
||||||
} else {
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
opal_output(0, "ompi_mpi_init: time from start to exec_compound_cmd %ld usec",
|
|
||||||
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
(ompistop.tv_usec - ompistart.tv_usec)));
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
if (0 != gettimeofday(&ompistart, NULL)) {
|
gettimeofday(&ompistart, NULL);
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain new start time");
|
|
||||||
ompistart.tv_sec = ompistop.tv_sec;
|
|
||||||
ompistart.tv_usec = ompistop.tv_usec;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if the compound command is operative, execute it */
|
/* if the compound command is operative, execute it */
|
||||||
@ -529,13 +528,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
|
|
||||||
/* check for timing request - get stop time and report elapsed time if so */
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
if (timing) {
|
if (timing) {
|
||||||
if (0 != gettimeofday(&ompistop, NULL)) {
|
gettimeofday(&ompistop, NULL);
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
opal_output(0, "ompi_mpi_init[%ld]: time to execute compound command %ld usec",
|
||||||
} else {
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
opal_output(0, "ompi_mpi_init: time to execute compound command %ld usec",
|
|
||||||
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
(ompistop.tv_usec - ompistart.tv_usec)));
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
}
|
gettimeofday(&ompistart, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */
|
/* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */
|
||||||
@ -548,9 +546,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
|
|
||||||
/* check for timing request - get start time */
|
/* check for timing request - get start time */
|
||||||
if (timing) {
|
if (timing) {
|
||||||
if (0 != gettimeofday(&stg2start, NULL)) {
|
gettimeofday(&ompistop, NULL);
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
opal_output(0, "ompi_mpi_init[%ld]: time from execute compound command to receive xcast from STG1 %ld usec",
|
||||||
}
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
|
gettimeofday(&ompistart, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* start PTL's */
|
/* start PTL's */
|
||||||
@ -647,13 +648,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
|
|
||||||
/* check for timing request - get stop time and report elapsed time if so */
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
if (timing) {
|
if (timing) {
|
||||||
if (0 != gettimeofday(&ompistop, NULL)) {
|
gettimeofday(&ompistop, NULL);
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
opal_output(0, "ompi_mpi_init[%ld]: time from stage1 to stage2 %ld usec",
|
||||||
} else {
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
opal_output(0, "ompi_mpi_init: time from stage1 to stage2 %ld usec",
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
(long int)((ompistop.tv_sec - stg2start.tv_sec)*1000000 +
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
(ompistop.tv_usec - stg2start.tv_usec)));
|
gettimeofday(&ompistart, NULL);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Second barrier -- wait for message from
|
/* Second barrier -- wait for message from
|
||||||
@ -668,9 +668,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
|
|
||||||
/* check for timing request - get start time */
|
/* check for timing request - get start time */
|
||||||
if (timing) {
|
if (timing) {
|
||||||
if (0 != gettimeofday(&stg3start, NULL)) {
|
gettimeofday(&ompistop, NULL);
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain start time for stg3");
|
opal_output(0, "ompi_mpi_init[%ld]: time to receive xcast from STG2 %ld usec",
|
||||||
}
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
|
gettimeofday(&ompistart, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* new very last step: check whether we have been spawned or not.
|
/* new very last step: check whether we have been spawned or not.
|
||||||
@ -732,13 +735,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
|
|
||||||
/* check for timing request - get stop time and report elapsed time if so */
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
if (timing) {
|
if (timing) {
|
||||||
if (0 != gettimeofday(&ompistop, NULL)) {
|
gettimeofday(&ompistop, NULL);
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain stop time at end");
|
opal_output(0, "ompi_mpi_init[%ld]: time from stage2 xcast to complete mpi_init %ld usec",
|
||||||
} else {
|
(long)ORTE_PROC_MY_NAME->vpid,
|
||||||
opal_output(0, "ompi_mpi_init: time from stage2 to complete mpi_init %ld usec",
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
(long int)((ompistop.tv_sec - stg3start.tv_sec)*1000000 +
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
(ompistop.tv_usec - stg3start.tv_usec)));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
|
@ -43,6 +43,11 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* global flag for use in timing tests
|
||||||
|
*/
|
||||||
|
ORTE_DECLSPEC extern bool orte_oob_base_timing;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Well known address
|
* Well known address
|
||||||
*/
|
*/
|
||||||
|
@ -44,6 +44,7 @@ char* mca_oob_base_exclude = NULL;
|
|||||||
opal_list_t mca_oob_base_components;
|
opal_list_t mca_oob_base_components;
|
||||||
opal_list_t mca_oob_base_modules;
|
opal_list_t mca_oob_base_modules;
|
||||||
opal_list_t mca_oob_base_exception_handlers;
|
opal_list_t mca_oob_base_exception_handlers;
|
||||||
|
bool orte_oob_base_timing = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function for finding and opening either all MCA components, or the one
|
* Function for finding and opening either all MCA components, or the one
|
||||||
@ -51,6 +52,8 @@ opal_list_t mca_oob_base_exception_handlers;
|
|||||||
*/
|
*/
|
||||||
int mca_oob_base_open(void)
|
int mca_oob_base_open(void)
|
||||||
{
|
{
|
||||||
|
int param, value;
|
||||||
|
|
||||||
/* Open up all available components */
|
/* Open up all available components */
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&mca_oob_base_components, opal_list_t);
|
OBJ_CONSTRUCT(&mca_oob_base_components, opal_list_t);
|
||||||
@ -72,6 +75,15 @@ int mca_oob_base_open(void)
|
|||||||
"Components to exclude for oob framework selection",
|
"Components to exclude for oob framework selection",
|
||||||
false, false, NULL, &mca_oob_base_exclude);
|
false, false, NULL, &mca_oob_base_exclude);
|
||||||
|
|
||||||
|
param = mca_base_param_reg_int_name("orte", "timing",
|
||||||
|
"Request that critical timing loops be measured",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
orte_oob_base_timing = true;
|
||||||
|
} else {
|
||||||
|
orte_oob_base_timing = false;
|
||||||
|
}
|
||||||
|
|
||||||
/* All done */
|
/* All done */
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -99,6 +99,11 @@ int mca_oob_xcast(
|
|||||||
if(NULL != root && ORTE_EQUAL == orte_dss.compare(root, orte_process_info.my_name, ORTE_NAME)) {
|
if(NULL != root && ORTE_EQUAL == orte_dss.compare(root, orte_process_info.my_name, ORTE_NAME)) {
|
||||||
mca_oob_xcast_t *xcast = OBJ_NEW(mca_oob_xcast_t);
|
mca_oob_xcast_t *xcast = OBJ_NEW(mca_oob_xcast_t);
|
||||||
xcast->counter = num_peers;
|
xcast->counter = num_peers;
|
||||||
|
/* check for timing request - if so, we want to printout the size of the message being sent */
|
||||||
|
if (orte_oob_base_timing) {
|
||||||
|
opal_output(0, "oob_xcast: message size is %lu bytes", (unsigned long)buffer->bytes_used);
|
||||||
|
}
|
||||||
|
|
||||||
for(i=0; i<num_peers; i++) {
|
for(i=0; i<num_peers; i++) {
|
||||||
/* check status of peer to ensure they are alive */
|
/* check status of peer to ensure they are alive */
|
||||||
if (ORTE_SUCCESS != (rc = orte_smr.get_proc_state(&state, &status, peers+i))) {
|
if (ORTE_SUCCESS != (rc = orte_smr.get_proc_state(&state, &status, peers+i))) {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user