Fix timing framework:
1. Fixes according to (http://www.open-mpi.org/community/lists/devel/2014/09/15869.php) 2. Force mpisync:rank0 to gather results. Now sync info is written by rank0 to the output file. 3. Improve mpirun_prof: 1) adopt to the environment (SLURM/TORQUE); 2) recognize some noteset-related mpirun options. This commit was SVN r32772.
Этот коммит содержится в:
родитель
9c20940190
Коммит
f2e586980b
@ -185,6 +185,7 @@ fi
|
|||||||
AC_DEFINE_UNQUOTED(OPAL_ENABLE_TIMING, $WANT_TIMING,
|
AC_DEFINE_UNQUOTED(OPAL_ENABLE_TIMING, $WANT_TIMING,
|
||||||
[Whether we want developer-level timing framework or not])
|
[Whether we want developer-level timing framework or not])
|
||||||
|
|
||||||
|
AM_CONDITIONAL([OPAL_COMPILE_TIMING], [test "$WANT_TIMING" = "1"])
|
||||||
AM_CONDITIONAL([OPAL_INSTALL_TIMING_BINARIES], [test "$WANT_TIMING" = "1" -a "$enable_binaries" != "no"])
|
AM_CONDITIONAL([OPAL_INSTALL_TIMING_BINARIES], [test "$WANT_TIMING" = "1" -a "$enable_binaries" != "no"])
|
||||||
|
|
||||||
|
|
||||||
|
@ -21,10 +21,16 @@
|
|||||||
|
|
||||||
SUBDIRS += \
|
SUBDIRS += \
|
||||||
tools/ompi_info \
|
tools/ompi_info \
|
||||||
tools/wrappers \
|
tools/wrappers
|
||||||
tools/mpisync
|
|
||||||
|
if OPAL_COMPILE_TIMING
|
||||||
|
SUBDIRS += tools/mpisync
|
||||||
|
endif
|
||||||
|
|
||||||
DIST_SUBDIRS += \
|
DIST_SUBDIRS += \
|
||||||
tools/ompi_info \
|
tools/ompi_info \
|
||||||
tools/wrappers \
|
tools/wrappers
|
||||||
tools/mpisync
|
|
||||||
|
if OPAL_COMPILE_TIMING
|
||||||
|
DIST_SUBDIRS += tools/mpisync
|
||||||
|
endif
|
||||||
|
@ -42,13 +42,24 @@ AM_CFLAGS = \
|
|||||||
-DOPAL_CC_ABSOLUTE="\"@OPAL_CC_ABSOLUTE@\"" \
|
-DOPAL_CC_ABSOLUTE="\"@OPAL_CC_ABSOLUTE@\"" \
|
||||||
-DOMPI_CXX_ABSOLUTE="\"@OMPI_CXX_ABSOLUTE@\""
|
-DOMPI_CXX_ABSOLUTE="\"@OMPI_CXX_ABSOLUTE@\""
|
||||||
|
|
||||||
#if (OPAL_INSTALL_BINARIES && OPAL_ENABLE_TIMING)
|
include $(top_srcdir)/Makefile.ompi-rules
|
||||||
|
|
||||||
if OPAL_INSTALL_TIMING_BINARIES
|
if OPAL_INSTALL_TIMING_BINARIES
|
||||||
|
|
||||||
bin_PROGRAMS = mpisync
|
bin_PROGRAMS = mpisync
|
||||||
bin_SCRIPTS = mpirun_prof ompi_timing_post
|
bin_SCRIPTS = mpirun_prof ompi_timing_post
|
||||||
|
|
||||||
|
man_pages = mpisync.1
|
||||||
|
EXTRA_DIST = $(man_pages:.1=.1in)
|
||||||
|
|
||||||
|
nodist_man_MANS = $(man_pages)
|
||||||
|
|
||||||
|
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
|
||||||
|
|
||||||
|
install-data-hook:
|
||||||
|
(cd $(DESTDIR)$(mandir)/man1; rm -f mpirun_prof.1; $(LN_S) mpisync.1 mpirun_prof.1)
|
||||||
|
(cd $(DESTDIR)$(mandir)/man1; rm -f ompi_timing_post.1; $(LN_S) mpisync.1 ompi_timing_post.1)
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
mpisync_SOURCES = \
|
mpisync_SOURCES = \
|
||||||
|
@ -1,23 +1,78 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Copyright (c) 2014 Artem Polyakov <artpol84@gmail.com>
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
|
||||||
|
function get_jobid()
|
||||||
|
{
|
||||||
|
if [ -n "$SLURM_JOBID" ]; then
|
||||||
|
echo "$SLURM_JOBID"
|
||||||
|
return
|
||||||
|
elif [ -n "$PBS_JOBID" ]; then
|
||||||
|
echo "$PBS_JOBID"
|
||||||
|
return
|
||||||
|
else
|
||||||
|
echo "$$"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
function extract_nodeset_opts()
|
||||||
|
{
|
||||||
|
short_opts="H:h:"
|
||||||
|
long_opts="host:"
|
||||||
|
opts=""
|
||||||
|
|
||||||
|
while [ -n "$1" ]; do
|
||||||
|
case "$1" in
|
||||||
|
-H | -host | --host )
|
||||||
|
opts=$opts" $1 $2";
|
||||||
|
shift 2;;
|
||||||
|
-hostfile | --hostfile )
|
||||||
|
opts=$opts" $1 $2";
|
||||||
|
shift 2;;
|
||||||
|
-machinefile | --machinefile )
|
||||||
|
opts=$opts" $1 $2";
|
||||||
|
shift 2;;
|
||||||
|
*)
|
||||||
|
shift 1;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "$opts"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
ompi_instdir=`dirname $0`
|
ompi_instdir=`dirname $0`
|
||||||
syncfile="ompi_clock_sync_data.$$"
|
jobid=`get_jobid`
|
||||||
tmp_timings=mpirun_prof_timings.out
|
syncfile=`pwd`"/ompi_clock_sync_data.$jobid"
|
||||||
tmp_out=mpirun_prof.out
|
tmp_timings=`pwd`"/ompi_timing_temp_file.$jobid"
|
||||||
|
tmp_out=`pwd`"/ompi_mpirun_prof.$jobid"
|
||||||
|
|
||||||
timing_bkp=$OMPI_MCA_opal_timing_file
|
timing_bkp=$OMPI_MCA_opal_timing_file
|
||||||
export OMPI_MCA_opal_timing_file=$tmp_timings
|
export OMPI_MCA_opal_timing_file=$tmp_timings
|
||||||
${ompi_instdir}/mpirun --npernode 1 ${ompi_instdir}/mpisync -o $syncfile >$tmp_out 2>&1
|
|
||||||
|
|
||||||
export OMPI_MCA_opal_timing_file=$timing_bkp
|
opts=`extract_nodeset_opts $@`
|
||||||
export OMPI_MCA_opal_clksync_file=$syncfile
|
${ompi_instdir}/mpirun --npernode 1 $opts ${ompi_instdir}/mpisync -o $syncfile >$tmp_out 2>&1
|
||||||
|
|
||||||
|
export OMPI_MCA_opal_timing_output=$timing_bkp
|
||||||
|
export OMPI_MCA_opal_timing_sync_file=$syncfile
|
||||||
|
|
||||||
# Remove old output
|
# Remove old output
|
||||||
rm -f $OMPI_MCA_opal_timing_file
|
rm -f $OMPI_MCA_opal_timing_output
|
||||||
|
|
||||||
# Run a program of interest
|
# Run a program of interest
|
||||||
${ompi_instdir}/mpirun $@
|
${ompi_instdir}/mpirun $@
|
||||||
|
|
||||||
|
if [ -n "$timing_bkp" ]; then
|
||||||
|
${ompi_instdir}/ompi_timing_post $timing_bkp $timing_bkp.post
|
||||||
|
fi
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
rm -f $syncfile
|
rm -f $syncfile
|
||||||
rm -f $tmp_timings
|
rm -f $tmp_timings
|
||||||
|
48
ompi/tools/mpisync/mpisync.1in
Обычный файл
48
ompi/tools/mpisync/mpisync.1in
Обычный файл
@ -0,0 +1,48 @@
|
|||||||
|
.\" Copyright (c) 2014 Artem Polyakov <artpol84@gmail.com>. All rights reserved.
|
||||||
|
.TH MPISYNC 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
|
||||||
|
.SH NAME
|
||||||
|
Open MPI timing tools
|
||||||
|
.
|
||||||
|
.SH SYNTAX
|
||||||
|
.B mpisync
|
||||||
|
[\fIoptions\fR]
|
||||||
|
.br
|
||||||
|
.B mpirun_prof
|
||||||
|
[\fIoptions\fR]
|
||||||
|
.br
|
||||||
|
.B ompi_timing_post
|
||||||
|
[\fI<timing-file>\fR] [\fI<processed-file>\fR]
|
||||||
|
.
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.PP
|
||||||
|
.BR mpisync
|
||||||
|
determines clock offsets relative to Head Node Process (HNP). It accepts the following options:
|
||||||
|
.TP
|
||||||
|
\fB\-o\fR, \fB\-\-output\fR
|
||||||
|
The name of output file where offsets related to HNP will be written
|
||||||
|
.TP
|
||||||
|
\fB\-h\fR, \fB\-\-help\fR
|
||||||
|
Print help information
|
||||||
|
.PP
|
||||||
|
.BR ompi_timing_post
|
||||||
|
takes the timing output file as input parameter. The events are sorted by the timestamps. Next, the timestamps are replaced with time offsets relative to the
|
||||||
|
.BR first
|
||||||
|
:
|
||||||
|
.BR previous
|
||||||
|
event.
|
||||||
|
.PP
|
||||||
|
.BR mpirun_prof
|
||||||
|
is a wrapper around
|
||||||
|
.BR mpirun
|
||||||
|
that performs clock synchronisation and post-processing of the timing output file.
|
||||||
|
|
||||||
|
.SH NOTES
|
||||||
|
.PP
|
||||||
|
The mpisync code was derived from MPIPerf project:
|
||||||
|
http://mpiperf.cpct.sibsutis.ru/index.php/Main/Documentation
|
||||||
|
.
|
||||||
|
.SH FILES
|
||||||
|
.PP
|
||||||
|
The output file has following format:
|
||||||
|
.PP
|
||||||
|
<hostname> <round-trip-time> <offset-from-hnp>
|
@ -1,4 +1,11 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Copyright (c) 2014 Artem Polyakov <artpol84@gmail.com>
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
|
||||||
#####################################################################
|
#####################################################################
|
||||||
# Evaluate a floating point number expression.
|
# Evaluate a floating point number expression.
|
||||||
@ -18,16 +25,17 @@ function float_eval(){
|
|||||||
return $stat
|
return $stat
|
||||||
}
|
}
|
||||||
|
|
||||||
if [ -z "$1" ]; then
|
if [ -z "$1" ] || [ -z "$2" ]; then
|
||||||
echo "Need the name of a timing file"
|
echo "Need the name of a timing file and the output file"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
thefile=$1
|
thefile=$1
|
||||||
|
postfile=$2
|
||||||
sed '/^$/d' $thefile > ${thefile}_tmp
|
sed '/^$/d' $thefile > ${thefile}_tmp
|
||||||
sort ${thefile}_tmp > ${thefile}
|
sort ${thefile}_tmp > ${postfile}
|
||||||
|
|
||||||
read line < ${thefile}
|
read line < ${postfile}
|
||||||
first_ts=`echo $line | awk '{ print $1 }' | sed -e 's/s//'`
|
first_ts=`echo $line | awk '{ print $1 }' | sed -e 's/s//'`
|
||||||
prev_ts=$first_ts
|
prev_ts=$first_ts
|
||||||
echo $first_ts
|
echo $first_ts
|
||||||
@ -39,8 +47,8 @@ while read line ; do
|
|||||||
newline=`echo $line | sed -e "s/$cur_ts/$dif1:$dif2/"`
|
newline=`echo $line | sed -e "s/$cur_ts/$dif1:$dif2/"`
|
||||||
prev_ts=$cur_ts
|
prev_ts=$cur_ts
|
||||||
echo $newline
|
echo $newline
|
||||||
done < ${thefile} > ${thefile}_tmp
|
done < ${postfile} > ${thefile}_tmp
|
||||||
|
|
||||||
cat ${thefile}_tmp > ${thefile}
|
cat ${thefile}_tmp > ${postfile}
|
||||||
|
|
||||||
rm -f ${thefile}_tmp
|
rm -f ${thefile}_tmp
|
||||||
|
@ -23,8 +23,6 @@ typedef enum { Gen, Chk } prog_mode_t;
|
|||||||
|
|
||||||
char *filename = NULL;
|
char *filename = NULL;
|
||||||
prog_mode_t mode = Gen;
|
prog_mode_t mode = Gen;
|
||||||
double orig_rtt = 0.0, orig_offs = 0.0;
|
|
||||||
|
|
||||||
void print_help(char *progname);
|
void print_help(char *progname);
|
||||||
int parse_opts(int rank, int argc, char **argv);
|
int parse_opts(int rank, int argc, char **argv);
|
||||||
|
|
||||||
@ -70,14 +68,13 @@ int main(int argc, char **argv)
|
|||||||
MPI_Init(&argc, &argv);
|
MPI_Init(&argc, &argv);
|
||||||
MPI_Comm comm = MPI_COMM_WORLD;
|
MPI_Comm comm = MPI_COMM_WORLD;
|
||||||
int rank, commsize;
|
int rank, commsize;
|
||||||
double offs, rtt;
|
double offs = 0, rtt = 0;
|
||||||
char hname[1024];
|
char hname[1024];
|
||||||
|
|
||||||
MPI_Comm_rank(comm, &rank);
|
MPI_Comm_rank(comm, &rank);
|
||||||
MPI_Comm_size(comm, &commsize);
|
MPI_Comm_size(comm, &commsize);
|
||||||
|
|
||||||
int ret = parse_opts(rank, argc, argv);
|
int ret = parse_opts(rank, argc, argv);
|
||||||
|
|
||||||
if( ret < 0 ){
|
if( ret < 0 ){
|
||||||
// Error exit
|
// Error exit
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
@ -90,49 +87,58 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
if( filename == NULL ){
|
if( filename == NULL ){
|
||||||
if( rank == 0 ){
|
if( rank == 0 ){
|
||||||
fprintf(stderr, "The name of output file wasn't specified. Abort\n");
|
print_help(argv[0]);
|
||||||
}
|
}
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if( gethostname(hname, 1024) ){
|
if( gethostname(hname, 1024) ){
|
||||||
perror("Cannot get hostname");
|
perror("Cannot get hostname. Abort");
|
||||||
MPI_Finalize();
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear output file if it exists
|
|
||||||
if( rank == 0 ){
|
|
||||||
FILE *fp = fopen(filename, "w");
|
|
||||||
if( fp == NULL ){
|
|
||||||
fprintf(stderr,"Cannot open output file %s for writing. Abort: %s\n",
|
|
||||||
filename, strerror(errno));
|
|
||||||
MPI_Finalize();
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
fclose(fp);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int rc = hpctimer_initialize("gettimeofday");
|
int rc = hpctimer_initialize("gettimeofday");
|
||||||
|
|
||||||
if( rc == HPCTIMER_FAILURE ){
|
if( rc == HPCTIMER_FAILURE ){
|
||||||
fprintf(stderr, "Fail to initialize hpc timer. Abort\n");
|
fprintf(stderr, "Fail to initialize hpc timer. Abort\n");
|
||||||
MPI_Finalize();
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
offs = mpigclock_sync_linear(comm, 0, &rtt);
|
offs = mpigclock_sync_linear(comm, 0, &rtt);
|
||||||
|
|
||||||
FILE *fp = fopen(filename,"a");
|
double send[2] = { rtt, offs };
|
||||||
if( fp == NULL ){
|
if( rank == 0 ){
|
||||||
fprintf(stderr, "Cannot open %s for appending. Abort\n", filename);
|
double *measure = malloc(commsize*2*sizeof(double));
|
||||||
MPI_Finalize();
|
if( measure == NULL ){
|
||||||
exit(1);
|
fprintf(stderr, "Fail to allocate memory. Abort\n");
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
char *hnames = malloc(1024*commsize);
|
||||||
|
if( hnames == NULL ){
|
||||||
|
fprintf(stderr, "Fail to allocate memory. Abort\n");
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
MPI_Gather(hname,1024,MPI_CHAR,hnames,1024,MPI_CHAR, 0, MPI_COMM_WORLD);
|
||||||
|
MPI_Gather(send,2,MPI_DOUBLE,measure,2, MPI_DOUBLE, 0, MPI_COMM_WORLD);
|
||||||
|
char tmpname[128];
|
||||||
|
FILE *fp = fopen(filename,"w");
|
||||||
|
if( fp == NULL ){
|
||||||
|
fprintf(stderr, "Fail to open the file %s. Abort\n", filename);
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
double (*m)[2] = (void*)measure;
|
||||||
|
char (*h)[1024] = (void*)hnames;
|
||||||
|
int i;
|
||||||
|
for(i=0; i<commsize;i++){
|
||||||
|
fprintf(fp, "%s %lf %lf\n", h[i], m[i][0], m[i][1]);
|
||||||
|
}
|
||||||
|
fclose(fp);
|
||||||
|
} else {
|
||||||
|
MPI_Gather(hname,1024, MPI_CHAR, NULL, 1024, MPI_CHAR, 0, MPI_COMM_WORLD);
|
||||||
|
MPI_Gather(send,2, MPI_DOUBLE, NULL, 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);
|
||||||
}
|
}
|
||||||
fprintf(fp, "%s %lf %lf\n", hname, rtt, offs);
|
|
||||||
fclose(fp);
|
|
||||||
|
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -49,9 +49,9 @@ char *opal_net_private_ipv4 = NULL;
|
|||||||
char *opal_set_max_sys_limits = NULL;
|
char *opal_set_max_sys_limits = NULL;
|
||||||
|
|
||||||
#if OPAL_ENABLE_TIMING
|
#if OPAL_ENABLE_TIMING
|
||||||
char *opal_clksync_file = NULL;
|
char *opal_timing_sync_file = NULL;
|
||||||
char *opal_timing_file = NULL;
|
char *opal_timing_output = NULL;
|
||||||
bool opal_timing_account_overhead = true;
|
bool opal_timing_overhead = true;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool opal_built_with_cuda_support = OPAL_INT_TO_BOOL(OPAL_CUDA_SUPPORT);
|
bool opal_built_with_cuda_support = OPAL_INT_TO_BOOL(OPAL_CUDA_SUPPORT);
|
||||||
@ -231,26 +231,28 @@ int opal_register_params(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if OPAL_ENABLE_TIMING
|
#if OPAL_ENABLE_TIMING
|
||||||
(void) mca_base_var_register ("opal", "opal", NULL, "clksync_file",
|
(void) mca_base_var_register ("opal", "opal", NULL, "timing_sync_file",
|
||||||
"Mapping of clock offsets from HNP node",
|
"Clock synchronisation information generated by mpisync tool. You don't need to touch this if you use mpirun_prof tool.",
|
||||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
||||||
&opal_clksync_file);
|
&opal_timing_sync_file);
|
||||||
if( opal_timing_clksync_read(opal_clksync_file) ){
|
if( opal_timing_clocksync_read(opal_timing_sync_file) ){
|
||||||
opal_output(0, "Cannot read file %s containing clock synchronisation information\n", opal_clksync_file);
|
opal_output(0, "Cannot read file %s containing clock synchronisation information\n", opal_timing_sync_file);
|
||||||
}
|
}
|
||||||
|
|
||||||
(void) mca_base_var_register ("opal", "opal", NULL, "timing_file",
|
(void) mca_base_var_register ("opal", "opal", NULL, "timing_output",
|
||||||
"OPAL Timing framework output file",
|
"The name of output file for timing information. If this parameter is not set then output will be directed into OPAL debug channel.",
|
||||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
||||||
&opal_timing_file);
|
&opal_timing_output);
|
||||||
|
|
||||||
(void) mca_base_var_register ("opal", "opal", NULL, "timing_overhead",
|
(void) mca_base_var_register ("opal", "opal", NULL, "timing_overhead",
|
||||||
"Whether account measured timing overhead or not (default: true)",
|
"Timing framework introduce additional overhead (malloc's mostly)."
|
||||||
|
" The time spend in such costly routines is measured and may be accounted"
|
||||||
|
" (subtracted from timestamps). 'true' means consider overhead, 'false' - ignore (default: true).",
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
||||||
&opal_timing_account_overhead);
|
&opal_timing_overhead);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
opal_warn_on_fork = true;
|
opal_warn_on_fork = true;
|
||||||
|
@ -31,9 +31,9 @@ extern char *opal_net_private_ipv4;
|
|||||||
extern char *opal_set_max_sys_limits;
|
extern char *opal_set_max_sys_limits;
|
||||||
|
|
||||||
#if OPAL_ENABLE_TIMING
|
#if OPAL_ENABLE_TIMING
|
||||||
extern char *opal_clksync_file;
|
extern char *opal_timing_sync_file;
|
||||||
extern char *opal_timing_file;
|
extern char *opal_timing_output;
|
||||||
extern bool opal_timing_account_overhead;
|
extern bool opal_timing_overhead;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
OPAL_DECLSPEC extern int opal_initialized;
|
OPAL_DECLSPEC extern int opal_initialized;
|
||||||
|
@ -103,9 +103,12 @@ libopalutil_la_SOURCES = \
|
|||||||
stacktrace.c \
|
stacktrace.c \
|
||||||
strncpy.c \
|
strncpy.c \
|
||||||
sys_limits.c \
|
sys_limits.c \
|
||||||
timings.c \
|
|
||||||
uri.c
|
uri.c
|
||||||
|
|
||||||
|
if OPAL_COMPILE_TIMING
|
||||||
|
libopalutil_la_SOURCES += timings.c
|
||||||
|
endif
|
||||||
|
|
||||||
libopalutil_la_LIBADD = \
|
libopalutil_la_LIBADD = \
|
||||||
keyval/libopalutilkeyval.la
|
keyval/libopalutilkeyval.la
|
||||||
libopalutil_la_DEPENDENCIES = \
|
libopalutil_la_DEPENDENCIES = \
|
||||||
|
@ -8,18 +8,16 @@
|
|||||||
* $HEADER$
|
* $HEADER$
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define _GNU_SOURCE
|
#include "opal_config.h"
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include "opal_config.h"
|
#ifdef HAVE_STRING_H
|
||||||
|
|
||||||
// TODO : restore ifdefs
|
|
||||||
//#ifdef HAVE_STRING_H
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
//#endif
|
#endif
|
||||||
|
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#ifdef HAVE_SYS_TYPES_H
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
@ -41,9 +39,6 @@
|
|||||||
#include "opal/util/timings.h"
|
#include "opal/util/timings.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
#if OPAL_ENABLE_TIMING
|
|
||||||
|
|
||||||
|
|
||||||
static void debug_hang(int i)
|
static void debug_hang(int i)
|
||||||
{
|
{
|
||||||
while( i ){
|
while( i ){
|
||||||
@ -65,10 +60,9 @@ opal_mutex_t tm_lock;
|
|||||||
static char *nodename = NULL;
|
static char *nodename = NULL;
|
||||||
static char *jobid = "";
|
static char *jobid = "";
|
||||||
static double hnp_offs = 0;
|
static double hnp_offs = 0;
|
||||||
// TODO use RTT to estimate precise of measurement
|
|
||||||
static double hnp_rtt = 0;
|
static double hnp_rtt = 0;
|
||||||
|
|
||||||
int opal_timing_clksync_read(char *fname)
|
int opal_timing_clocksync_read(char *fname)
|
||||||
{
|
{
|
||||||
int rc = 0;
|
int rc = 0;
|
||||||
FILE *fp = NULL;
|
FILE *fp = NULL;
|
||||||
@ -79,9 +73,8 @@ int opal_timing_clksync_read(char *fname)
|
|||||||
|
|
||||||
char hname[1024];
|
char hname[1024];
|
||||||
if( gethostname(hname, 1024) ){
|
if( gethostname(hname, 1024) ){
|
||||||
rc = -1;
|
opal_output(0, "opal_timing_clocksync_read(%s): Cannot gethostname\n",fname);
|
||||||
opal_output(0, "opal_timing_clksync_read(%s): Cannot gethostname\n",fname);
|
return OPAL_ERROR;
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
nodename = strdup(hname);
|
nodename = strdup(hname);
|
||||||
ptr = strchr(nodename,'.');
|
ptr = strchr(nodename,'.');
|
||||||
@ -95,8 +88,8 @@ int opal_timing_clksync_read(char *fname)
|
|||||||
|
|
||||||
fp = fopen(fname,"r");
|
fp = fopen(fname,"r");
|
||||||
if( fp == NULL ){
|
if( fp == NULL ){
|
||||||
opal_output(0, "opal_timing_clksync_read(%s): Cannot open the file\n",fname);
|
opal_output(0, "opal_timing_clocksync_read(%s): Cannot open the file\n",fname);
|
||||||
return -1;
|
return OPAL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
while( getline(&line,&n,fp) > 0 ){
|
while( getline(&line,&n,fp) > 0 ){
|
||||||
@ -118,8 +111,8 @@ int opal_timing_clksync_read(char *fname)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if( !found ){
|
if( !found ){
|
||||||
opal_output(0,"opal_timing_clksync_read: Can't find my host %s in %s\n", hname, fname);
|
opal_output(0,"opal_timing_clocksync_read: Can't find my host %s in %s\n", hname, fname);
|
||||||
rc = -1;
|
rc = OPAL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
err_exit:
|
err_exit:
|
||||||
@ -138,7 +131,7 @@ int opal_timing_set_jobid(char *jid)
|
|||||||
{
|
{
|
||||||
jobid = strdup(jid);
|
jobid = strdup(jid);
|
||||||
if( jobid == NULL ){
|
if( jobid == NULL ){
|
||||||
return -1;
|
return OPAL_ERROR;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -159,7 +152,8 @@ opal_timing_event_t *opal_timing_event_alloc(opal_timing_t *t)
|
|||||||
|
|
||||||
t->buffer = malloc(sizeof(opal_timing_event_t)*t->buffer_size);
|
t->buffer = malloc(sizeof(opal_timing_event_t)*t->buffer_size);
|
||||||
if( t->buffer == NULL ){
|
if( t->buffer == NULL ){
|
||||||
// TODO: out of memory error process
|
opal_output(0, "opal_timing_event_alloc: Out of memory!\n");
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
memset(t->buffer, 0, sizeof(opal_timing_event_t)*t->buffer_size);
|
memset(t->buffer, 0, sizeof(opal_timing_event_t)*t->buffer_size);
|
||||||
|
|
||||||
@ -193,6 +187,10 @@ void opal_timing_init(opal_timing_t *t)
|
|||||||
opal_timing_prep_t opal_timing_prep_ev(opal_timing_t *t, const char *fmt, ...)
|
opal_timing_prep_t opal_timing_prep_ev(opal_timing_t *t, const char *fmt, ...)
|
||||||
{
|
{
|
||||||
opal_timing_event_t *ev = opal_timing_event_alloc(t);
|
opal_timing_event_t *ev = opal_timing_event_alloc(t);
|
||||||
|
if( ev == NULL ){
|
||||||
|
opal_timing_prep_t p = { t, NULL, OPAL_ERR_OUT_OF_RESOURCE };
|
||||||
|
return p;
|
||||||
|
}
|
||||||
OBJ_CONSTRUCT(ev, opal_timing_event_t);
|
OBJ_CONSTRUCT(ev, opal_timing_event_t);
|
||||||
ev->ts = opal_timing_get_ts();
|
ev->ts = opal_timing_get_ts();
|
||||||
va_list args;
|
va_list args;
|
||||||
@ -200,18 +198,20 @@ opal_timing_prep_t opal_timing_prep_ev(opal_timing_t *t, const char *fmt, ...)
|
|||||||
vsnprintf(ev->descr, OPAL_TIMING_DESCR_MAX - 1, fmt, args);
|
vsnprintf(ev->descr, OPAL_TIMING_DESCR_MAX - 1, fmt, args);
|
||||||
ev->descr[OPAL_TIMING_DESCR_MAX-1] = '\0';
|
ev->descr[OPAL_TIMING_DESCR_MAX-1] = '\0';
|
||||||
va_end( args );
|
va_end( args );
|
||||||
opal_timing_prep_t p = { t, ev };
|
opal_timing_prep_t p = { t, ev, 0 };
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
void opal_timing_add_step(opal_timing_prep_t p,
|
void opal_timing_add_step(opal_timing_prep_t p,
|
||||||
const char *func, const char *file, int line)
|
const char *func, const char *file, int line)
|
||||||
{
|
{
|
||||||
p.ev->func = func;
|
if( !p.errcode ) {
|
||||||
p.ev->file = file;
|
p.ev->func = func;
|
||||||
p.ev->line = line;
|
p.ev->file = file;
|
||||||
p.ev->type = TEVENT;
|
p.ev->line = line;
|
||||||
opal_list_append(p.t->events, (opal_list_item_t*)p.ev);
|
p.ev->type = TEVENT;
|
||||||
|
opal_list_append(p.t->events, (opal_list_item_t*)p.ev);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int opal_timing_report(opal_timing_t *t, bool account_overhead, const char *prefix, char *fname)
|
int opal_timing_report(opal_timing_t *t, bool account_overhead, const char *prefix, char *fname)
|
||||||
@ -221,14 +221,14 @@ int opal_timing_report(opal_timing_t *t, bool account_overhead, const char *pref
|
|||||||
FILE *fp = NULL;
|
FILE *fp = NULL;
|
||||||
char *buf = NULL;
|
char *buf = NULL;
|
||||||
int buf_size = 0;
|
int buf_size = 0;
|
||||||
int rc = 0;
|
int rc = OPAL_SUCCESS;
|
||||||
|
|
||||||
debug_hang(0);
|
debug_hang(0);
|
||||||
|
|
||||||
if( fname != NULL ){
|
if( fname != NULL ){
|
||||||
fp = fopen(fname,"a");
|
fp = fopen(fname,"a");
|
||||||
if( fp == NULL ){
|
if( fp == NULL ){
|
||||||
// TODO: log error
|
opal_output(0, "opal_timing_report: Cannot open %s file for writing timing information!\n",fname);
|
||||||
rc = OPAL_ERROR;
|
rc = OPAL_ERROR;
|
||||||
goto err_exit;
|
goto err_exit;
|
||||||
}
|
}
|
||||||
@ -237,8 +237,8 @@ int opal_timing_report(opal_timing_t *t, bool account_overhead, const char *pref
|
|||||||
|
|
||||||
buf = malloc(OPAL_TIMING_OUTBUF_SIZE+1);
|
buf = malloc(OPAL_TIMING_OUTBUF_SIZE+1);
|
||||||
if( buf == NULL ){
|
if( buf == NULL ){
|
||||||
// TODO: log error
|
opal_output(0, "opal_timing_report: Out of memory!\n");
|
||||||
rc = OPAL_ERROR;
|
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
goto err_exit;
|
goto err_exit;
|
||||||
}
|
}
|
||||||
buf[0] = '\0';
|
buf[0] = '\0';
|
||||||
@ -269,14 +269,16 @@ int opal_timing_report(opal_timing_t *t, bool account_overhead, const char *pref
|
|||||||
ev->descr, nodename, jobid, ev->func, file_name, ev->line);
|
ev->descr, nodename, jobid, ev->func, file_name, ev->line);
|
||||||
}
|
}
|
||||||
if( rc < 0 ){
|
if( rc < 0 ){
|
||||||
// TODO: log mem allocation problems
|
opal_output(0, "opal_timing_report: Cannot asprintf!\n");
|
||||||
|
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
goto err_exit;
|
goto err_exit;
|
||||||
}
|
}
|
||||||
rc = 0;
|
rc = 0;
|
||||||
|
|
||||||
if( strlen(line) > OPAL_TIMING_OUTBUF_SIZE ){
|
if( strlen(line) > OPAL_TIMING_OUTBUF_SIZE ){
|
||||||
// TODO: log buffer overflow
|
opal_output(0, "opal_timing_report: timing output buffer overflow!\n");
|
||||||
free(line);
|
free(line);
|
||||||
|
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||||
goto err_exit;
|
goto err_exit;
|
||||||
}
|
}
|
||||||
if( buf_size + strlen(line) > OPAL_TIMING_OUTBUF_SIZE ){
|
if( buf_size + strlen(line) > OPAL_TIMING_OUTBUF_SIZE ){
|
||||||
@ -313,6 +315,7 @@ err_exit:
|
|||||||
free(buf);
|
free(buf);
|
||||||
}
|
}
|
||||||
if( fp != NULL ){
|
if( fp != NULL ){
|
||||||
|
fflush(fp);
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
@ -346,5 +349,3 @@ void opal_timing_release(opal_timing_t *t)
|
|||||||
OBJ_RELEASE(t->events);
|
OBJ_RELEASE(t->events);
|
||||||
t->events = NULL;
|
t->events = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
|
||||||
|
@ -8,9 +8,8 @@
|
|||||||
* $HEADER$
|
* $HEADER$
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifndef OPAL_UTIL_TIMING_H
|
||||||
#ifndef OPAL_SYS_TIMING_H
|
#define OPAL_UTIL_TIMING_H
|
||||||
#define OPAL_SYS_TIMING_H
|
|
||||||
|
|
||||||
#include "opal/class/opal_list.h"
|
#include "opal/class/opal_list.h"
|
||||||
#include "opal/runtime/opal_params.h"
|
#include "opal/runtime/opal_params.h"
|
||||||
@ -47,40 +46,149 @@ typedef struct opal_timing_t
|
|||||||
typedef struct {
|
typedef struct {
|
||||||
opal_timing_t *t;
|
opal_timing_t *t;
|
||||||
opal_timing_event_t *ev;
|
opal_timing_event_t *ev;
|
||||||
|
int errcode;
|
||||||
} opal_timing_prep_t;
|
} opal_timing_prep_t;
|
||||||
|
|
||||||
int opal_timing_clksync_read(char *opal_clksync_file);
|
/**
|
||||||
|
* Read synchronisation information from the file
|
||||||
|
* provided through the MCA parameter.
|
||||||
|
* Should not be directly used, for service purposes.
|
||||||
|
*
|
||||||
|
* @param sync_file Name of the file to read
|
||||||
|
*
|
||||||
|
* @retval OPAL_SUCCESS On success
|
||||||
|
* @retval OPAL_ERROR On failure
|
||||||
|
*/
|
||||||
|
int opal_timing_clocksync_read(char *sync_file);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pass string representation of ORTE job ID down to the OPAL.
|
||||||
|
* Should not be directly used, for service purposes.
|
||||||
|
*
|
||||||
|
* @param jid job id
|
||||||
|
*
|
||||||
|
* @retval OPAL_SUCCESS On success
|
||||||
|
* @retval OPAL_ERROR On failure
|
||||||
|
*/
|
||||||
int opal_timing_set_jobid(char *jid);
|
int opal_timing_set_jobid(char *jid);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize timing structure.
|
||||||
|
*
|
||||||
|
* @param t pointer to the timing handler structure
|
||||||
|
*
|
||||||
|
* @retval OPAL_SUCCESS On success
|
||||||
|
* @retval OPAL_ERROR On failure
|
||||||
|
*/
|
||||||
void opal_timing_init(opal_timing_t *t);
|
void opal_timing_init(opal_timing_t *t);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare timing event, do all printf-like processing.
|
||||||
|
* Should not be directly used, for service purposes.
|
||||||
|
*
|
||||||
|
* @param t pointer to the timing handler structure
|
||||||
|
* @param fmt printf-like format
|
||||||
|
* @param ... other parameters that should be converted to string representation
|
||||||
|
*
|
||||||
|
* @retval partly filled opal_timing_prep_t structure
|
||||||
|
*/
|
||||||
opal_timing_prep_t opal_timing_prep_ev(opal_timing_t *t, const char *fmt, ...);
|
opal_timing_prep_t opal_timing_prep_ev(opal_timing_t *t, const char *fmt, ...);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enqueue timing event into the list of events in handler 't'.
|
||||||
|
*
|
||||||
|
* @param p result of opal_timing_prep_ev
|
||||||
|
* @param func function name where event occurs
|
||||||
|
* @param file file name where event occurs
|
||||||
|
* @param line line number in the file
|
||||||
|
*
|
||||||
|
* @retval
|
||||||
|
*/
|
||||||
void opal_timing_add_step(opal_timing_prep_t p,
|
void opal_timing_add_step(opal_timing_prep_t p,
|
||||||
const char *func, const char *file, int line);
|
const char *func, const char *file, int line);
|
||||||
|
|
||||||
/*
|
/**
|
||||||
opal_timing_prep_t opal_timing_prep_end(opal_timing_t *t, int id, const char *fmt, ...);
|
* Report all events that were enqueued in the timing handler 't'.
|
||||||
int opal_timing_begin(opal_timing_t *t, char *file, int line);
|
* - if fname == NULL the output will be done using opal_output and
|
||||||
void opal_timing_end(opal_timing_prep_t p, char *file, int line);
|
* each line will be prefixed with "prefix" to ease grep'ing.
|
||||||
*/
|
* - otherwise the corresponding file will be used for output in "append" mode
|
||||||
|
* WARRNING: not all filesystems provide enough support for that feature, some records may
|
||||||
|
* disappear.
|
||||||
|
*
|
||||||
|
* @param t timing handler
|
||||||
|
* @param accovh consider malloc overhead introduced by timing code
|
||||||
|
* @param prefix prefix to use when no fname was specifyed to ease grep'ing
|
||||||
|
* @param fname name of the output file (may be NULL)
|
||||||
|
*
|
||||||
|
* @retval OPAL_SUCCESS On success
|
||||||
|
* @retval OPAL_ERROR or OPAL_ERR_OUT_OF_RESOURCE On failure
|
||||||
|
*/
|
||||||
|
int opal_timing_report(opal_timing_t *t, bool accovh, const char *prefix, char *fname);
|
||||||
|
|
||||||
int opal_timing_report(opal_timing_t *t, bool account_overhead, const char *prefix, char *fname);
|
/**
|
||||||
|
* Release all memory allocated for the timing handler 't'.
|
||||||
|
*
|
||||||
|
* @param t timing handler
|
||||||
|
*
|
||||||
|
* @retval
|
||||||
|
*/
|
||||||
void opal_timing_release(opal_timing_t *t);
|
void opal_timing_release(opal_timing_t *t);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main macro for use in declaring opal timing handler;
|
||||||
|
* will be "compiled out" when OPAL is configured without
|
||||||
|
* --enable-timing.
|
||||||
|
*
|
||||||
|
*/
|
||||||
#define OPAL_TIMING_DECLARE(t) opal_timing_t t; // must have the semicolon here to avoid warnings when not enabled
|
#define OPAL_TIMING_DECLARE(t) opal_timing_t t; // must have the semicolon here to avoid warnings when not enabled
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main macro for use in declaring external opal timing handler;
|
||||||
|
* will be "compiled out" when OPAL is configured without
|
||||||
|
* --enable-timing.
|
||||||
|
*
|
||||||
|
*/
|
||||||
#define OPAL_TIMING_DECLARE_EXT(x, t) x extern opal_timing_t t; // must have the semicolon here to avoid warnings when not enabled
|
#define OPAL_TIMING_DECLARE_EXT(x, t) x extern opal_timing_t t; // must have the semicolon here to avoid warnings when not enabled
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main macro for use in initializing opal timing handler;
|
||||||
|
* will be "compiled out" when OPAL is configured without
|
||||||
|
* --enable-timing.
|
||||||
|
*
|
||||||
|
* @see opal_timing_init()
|
||||||
|
*/
|
||||||
#define OPAL_TIMING_INIT(t) opal_timing_init(t)
|
#define OPAL_TIMING_INIT(t) opal_timing_init(t)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main macro for use in adding new timing event for the specifyed timing handler;
|
||||||
|
* will be "compiled out" when OPAL is configured without
|
||||||
|
* --enable-timing.
|
||||||
|
*
|
||||||
|
* @see opal_timing_add_step()
|
||||||
|
*/
|
||||||
#define OPAL_TIMING_EVENT(x) opal_timing_add_step( opal_timing_prep_ev x, __FUNCTION__, __FILE__, __LINE__)
|
#define OPAL_TIMING_EVENT(x) opal_timing_add_step( opal_timing_prep_ev x, __FUNCTION__, __FILE__, __LINE__)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main macro for use in reporting collected events;
|
||||||
|
* will be "compiled out" when OPAL is configured without
|
||||||
|
* --enable-timing.
|
||||||
|
*
|
||||||
|
* @param enable flag that enables/disables reporting. Used for fine-grained timing.
|
||||||
|
* @see opal_timing_report()
|
||||||
|
*/
|
||||||
#define OPAL_TIMING_REPORT(enable, t, prefix) { \
|
#define OPAL_TIMING_REPORT(enable, t, prefix) { \
|
||||||
if( enable ) { \
|
if( enable ) { \
|
||||||
opal_timing_report(t, opal_timing_account_overhead, prefix, opal_timing_file); \
|
opal_timing_report(t, opal_timing_overhead, prefix, opal_timing_output); \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main macro for use in releasing allocated resources;
|
||||||
|
* will be "compiled out" when OPAL is configured without
|
||||||
|
* --enable-timing.
|
||||||
|
*
|
||||||
|
* @see opal_timing_release()
|
||||||
|
*/
|
||||||
#define OPAL_TIMING_RELEASE(t) opal_timing_release(t)
|
#define OPAL_TIMING_RELEASE(t) opal_timing_release(t)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user