1
1
openmpi/opal/util/timings.c
Artem Polyakov f2e586980b Fix timing framework:
1. Fixes according to (http://www.open-mpi.org/community/lists/devel/2014/09/15869.php)
2. Force mpisync:rank0 to gather results. Now sync info is written by rank0 to the output file.
3. Improve mpirun_prof: 1) adapt to the environment (SLURM/TORQUE); 2) recognize some nodeset-related mpirun options.

This commit was SVN r32772.
2014-09-23 12:59:54 +00:00

352 строки
8.9 KiB
C

/*
* Copyright (C) 2014 Artem Polyakov <artpol84@gmail.com>
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include <stdlib.h>
#include <stdarg.h>
#include <stdio.h>
#include <unistd.h>
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <errno.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/resource.h>
#endif
#include "opal/constants.h"
#include "opal/runtime/opal_params.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_list.h"
#include "opal/util/timings.h"
#include "opal/util/output.h"
/*
 * Debugging aid: keeps sleeping in one-second steps for as long as `i` is
 * non-zero and returns at once when called with 0.  Nothing inside the
 * function changes `i` -- presumably a debugger is expected to attach and
 * clear it by hand (NOTE(review): confirm intended usage).
 */
static void debug_hang(int i)
{
    for( ; 0 != i; ){
        sleep(1);
    }
}
/* Prototypes for routines that are defined further down in this file. */
double opal_timing_get_ts(void);
opal_timing_event_t *opal_timing_event_alloc(opal_timing_t *t);
void opal_timing_init(opal_timing_t *t);
opal_timing_prep_t opal_timing_prep_ev(opal_timing_t *t, const char *fmt, ...);
void opal_timing_release(opal_timing_t *t);
/*
 * Class instance for opal_timing_event_t with opal_list_item_t named as the
 * second argument and NULL for the last two.
 * NOTE(review): presumably parent class / constructor / destructor --
 * confirm against the OBJ_CLASS_INSTANCE definition.
 */
static OBJ_CLASS_INSTANCE(opal_timing_event_t, opal_list_item_t, NULL, NULL);
/* Not referenced by any function in this file as shown -- NOTE(review): confirm it is still needed. */
opal_mutex_t tm_lock;
/*
 * Host name cut at the first '.'; assigned by opal_timing_clocksync_read()
 * and printed in every line produced by opal_timing_report().  NULL until
 * opal_timing_clocksync_read() has run.
 */
static char *nodename = NULL;
/* Job id printed in report lines; opal_timing_set_jobid() replaces it with a strdup()'d copy. */
static char *jobid = "";
/*
 * Second number parsed from this host's line of the clock-sync file;
 * opal_timing_report() adds it to every timestamp it prints.
 */
static double hnp_offs = 0;
/*
 * First number parsed from this host's line of the clock-sync file; it is
 * only stored, no function in this file as shown reads it back.
 */
static double hnp_rtt = 0;
/*
 * Record the local node name and, optionally, load this host's clock
 * synchronization values from a text file.
 *
 * The gethostname() result, cut at the first '.', is stored in `nodename`
 * (a previously stored name is released).  When `fname` is not NULL the file
 * is read line by line.  Every line must contain a space; the text before
 * the first space is compared with the full (uncut) gethostname() result,
 * and on the first match the two numbers that follow are stored in hnp_rtt
 * and hnp_offs, in that order.
 *
 * @param fname  path of the clock-sync file, or NULL to only set the node name
 * @return 0 on success (or when fname is NULL);
 *         OPAL_ERROR if the host name cannot be obtained or copied, the file
 *         cannot be opened, or no line matches this host;
 *         -1 if a line has no space or the matching line lacks two numbers.
 */
int opal_timing_clocksync_read(char *fname)
{
    int rc = 0;
    FILE *fp = NULL;
    char *line = NULL;
    size_t n = 0;           /* getline() allocates when line == NULL and n == 0 */
    bool found = false;
    char *ptr = NULL;
    char *short_name = NULL;
    char hname[1024];

    if( gethostname(hname, sizeof(hname)) ){
        /* fname is allowed to be NULL here; never hand NULL to %s */
        opal_output(0, "opal_timing_clocksync_read(%s): Cannot gethostname\n",
                    (fname != NULL) ? fname : "(null)");
        return OPAL_ERROR;
    }
    /* gethostname() need not terminate the buffer when the name is truncated */
    hname[sizeof(hname) - 1] = '\0';

    short_name = strdup(hname);
    if( short_name == NULL ){
        opal_output(0, "opal_timing_clocksync_read: Out of memory!\n");
        return OPAL_ERROR;
    }
    ptr = strchr(short_name,'.');
    if( ptr != NULL ){
        *ptr = '\0';
    }
    /* nodename is either NULL or a copy made by an earlier call */
    free(nodename);
    nodename = short_name;

    if( fname == NULL ){
        return 0;
    }

    fp = fopen(fname,"r");
    if( fp == NULL ){
        opal_output(0, "opal_timing_clocksync_read(%s): Cannot open the file\n",fname);
        return OPAL_ERROR;
    }

    while( getline(&line,&n,fp) > 0 ){
        /* split "<host> <numbers...>" at the first space */
        ptr = strchr(line,' ');
        if( ptr == NULL ){
            rc = -1;
            goto err_exit;
        }
        *ptr = '\0';
        ptr++;
        if( strcmp(line, hname) == 0 ){
            if( sscanf(ptr,"%lf %lf", &hnp_rtt, &hnp_offs) != 2 ){
                rc = -1;
                goto err_exit;
            }
            found = true;
            break;
        }
    }

    if( !found ){
        opal_output(0,"opal_timing_clocksync_read: Can't find my host %s in %s\n", hname, fname);
        rc = OPAL_ERROR;
    }

err_exit:
    free(line);
    if( fp != NULL ){
        fclose(fp);
    }
    return rc;
}
/*
 * Set the job id that opal_timing_report() prints in every line.
 *
 * The string is copied, so the caller keeps ownership of `jid`.  If the call
 * fails the previously configured job id stays in effect, so `jobid` is
 * never left NULL.
 *
 * @param jid  NUL-terminated job id; NULL is rejected
 * @return 0 on success, OPAL_ERROR if jid is NULL or the copy cannot be
 *         allocated
 */
int opal_timing_set_jobid(char *jid)
{
    /* Copy installed by the last successful call.  Tracked separately
     * because the initial value of jobid is a string literal, which must
     * never be handed to free(). */
    static char *owned_copy = NULL;
    char *copy;

    if( jid == NULL ){
        return OPAL_ERROR;
    }
    copy = strdup(jid);
    if( copy == NULL ){
        return OPAL_ERROR;
    }
    free(owned_copy);
    owned_copy = copy;
    jobid = copy;
    return 0;
}
/*
 * Current time as reported by gettimeofday(), expressed in seconds as a
 * double (whole seconds plus the microsecond part scaled by 1E-6).
 */
double opal_timing_get_ts(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec + now.tv_usec * 1E-6;
}
/*
 * Hand out the next unused event slot of `t`.
 *
 * Slots come from t->buffer, an array of t->buffer_size events.  Once
 * buffer_offset has reached buffer_size a fresh zero-filled array is
 * allocated and becomes the current buffer; its first slot is flagged with
 * fib = 1 and records in ts_ovh how long the allocation took.  The previous
 * array is not freed here.
 *
 * @param t  timing handle
 * @return pointer to the slot, or NULL if the new array cannot be allocated
 *         (t->buffer is NULL in that case)
 */
opal_timing_event_t *opal_timing_event_alloc(opal_timing_t *t)
{
    int slot;

    if( !(t->buffer_offset < t->buffer_size) ){
        /* Current array exhausted: time the cost of getting a new one */
        const size_t bytes = sizeof(opal_timing_event_t)*t->buffer_size;
        double started;
        double finished;

        started = opal_timing_get_ts();
        t->buffer = malloc(bytes);
        if( NULL == t->buffer ){
            opal_output(0, "opal_timing_event_alloc: Out of memory!\n");
            return NULL;
        }
        memset(t->buffer, 0, bytes);
        finished = opal_timing_get_ts();

        t->buffer_offset = 0;
        /* Slot 0 marks the start of the array and carries the overhead */
        t->buffer[0].fib = 1;
        t->buffer[0].ts_ovh = finished - started;
    }

    slot = t->buffer_offset;
    t->buffer_offset++;
    return &t->buffer[slot];
}
/*
 * Bring a timing handle into its initial state: the structure is zeroed, an
 * empty event list is created and the buffer bookkeeping is set up so that
 * the first call to opal_timing_event_alloc() allocates storage.
 *
 * @param t  handle to initialize; any previous contents are overwritten
 */
void opal_timing_init(opal_timing_t *t)
{
    /* Start from an all-zero structure */
    memset(t, 0, sizeof *t);
    t->cur_id = 0;

    /* List that collects the recorded events */
    t->events = OBJ_NEW(opal_list_t);

    /* offset == size means "buffer full", which forces an allocation on the
     * first event */
    t->buffer_offset = t->buffer_size = OPAL_TIMING_BUFSIZE;

    /* OPAL_TIMING_EVENT is defined outside this file; presumably it records
     * this message as the handle's first event -- TODO confirm */
    OPAL_TIMING_EVENT((t,"%p: Created, events = %p, buffer: ptr = %p, offs = %d", t, t->events, t->buffer, t->buffer_size));
}
/*
 * Allocate an event in `t`, stamp it with the current time and fill its
 * description from a printf-style format.
 *
 * The description is limited to the size of ev->descr and is always
 * NUL-terminated.
 *
 * @param t    timing handle the event belongs to
 * @param fmt  printf-style format followed by its arguments
 * @return a { handle, event, error code } triple: error code 0 with the new
 *         event on success, OPAL_ERR_OUT_OF_RESOURCE with a NULL event when
 *         no event could be allocated
 */
opal_timing_prep_t opal_timing_prep_ev(opal_timing_t *t, const char *fmt, ...)
{
    opal_timing_event_t *event = opal_timing_event_alloc(t);

    if( NULL != event ){
        va_list ap;

        OBJ_CONSTRUCT(event, opal_timing_event_t);
        event->ts = opal_timing_get_ts();

        va_start( ap, fmt );
        vsnprintf(event->descr, OPAL_TIMING_DESCR_MAX - 1, fmt, ap);
        va_end( ap );
        event->descr[OPAL_TIMING_DESCR_MAX-1] = '\0';

        opal_timing_prep_t ready = { t, event, 0 };
        return ready;
    }

    opal_timing_prep_t failed = { t, NULL, OPAL_ERR_OUT_OF_RESOURCE };
    return failed;
}
/*
 * Complete an event prepared by opal_timing_prep_ev() and append it to the
 * event list of its timing handle.  Nothing happens when `p` carries a
 * non-zero error code.
 *
 * @param p     result of opal_timing_prep_ev()
 * @param func  function name stored in the event
 * @param file  source file name stored in the event
 * @param line  source line stored in the event
 */
void opal_timing_add_step(opal_timing_prep_t p,
                          const char *func, const char *file, int line)
{
    opal_timing_event_t *event;

    if( p.errcode ){
        /* preparation failed, there is no event to record */
        return;
    }

    event = p.ev;
    event->type = TEVENT;
    event->func = func;
    event->file = file;
    event->line = line;
    opal_list_append(p.t->events, (opal_list_item_t*)event);
}
/*
 * Emit one chunk of accumulated report text: to `fp` followed by a newline
 * when a file is open, otherwise through opal_output() preceded by a newline.
 */
static void opal_timing_report_flush(FILE *fp, const char *buf)
{
    if( fp != NULL ){
        fprintf(fp,"%s", buf);
        fprintf(fp,"\n");
    } else {
        opal_output(0,"\n%s", buf);
    }
}

/*
 * Print the events recorded in `t`.
 *
 * The first event of the list is skipped; every later event becomes one line
 *   [<prefix>:\t]<time>s\t"<descr>"\t|\t<node>\t<jobid>\t<func>\t<file>:<line>
 * where <time> is the event timestamp plus hnp_offs plus the overhead
 * accumulated so far, and <file> is the event's file name without its
 * directory part.  Lines are gathered in a buffer of
 * OPAL_TIMING_OUTBUF_SIZE bytes that is flushed whenever the next line would
 * not fit and once more at the end.
 *
 * @param t                 timing handle whose events are reported
 * @param account_overhead  when true, ts_ovh of every event flagged fib is
 *                          added to the overhead applied to the timestamps
 * @param prefix            text put in front of each line, or NULL; ignored
 *                          when fname is given
 * @param fname             file to append the report to, or NULL to send it
 *                          through opal_output()
 * @return OPAL_SUCCESS on success; OPAL_ERROR if the file cannot be opened
 *         or closing it reports a write failure; OPAL_ERR_OUT_OF_RESOURCE if
 *         memory runs out or a single line exceeds the buffer
 */
int opal_timing_report(opal_timing_t *t, bool account_overhead, const char *prefix, char *fname)
{
    opal_timing_event_t *ev;
    int count = 0;
    FILE *fp = NULL;
    char *buf = NULL;
    size_t buf_used = 0;        /* characters currently held in buf */
    double overhead = 0;
    int rc = OPAL_SUCCESS;
    /* nodename stays NULL until opal_timing_clocksync_read() has run;
     * passing NULL for %s is undefined, so substitute a placeholder */
    const char *node = (nodename != NULL) ? nodename : "(null)";
    const char *job = (jobid != NULL) ? jobid : "(null)";

    debug_hang(0);

    if( fname != NULL ){
        fp = fopen(fname,"a");
        if( fp == NULL ){
            opal_output(0, "opal_timing_report: Cannot open %s file for writing timing information!\n",fname);
            rc = OPAL_ERROR;
            goto err_exit;
        }
        /* lines written to a file carry no prefix */
        prefix=NULL;
    }

    buf = malloc(OPAL_TIMING_OUTBUF_SIZE+1);
    if( buf == NULL ){
        opal_output(0, "opal_timing_report: Out of memory!\n");
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto err_exit;
    }
    buf[0] = '\0';

    OPAL_LIST_FOREACH(ev, t->events, opal_timing_event_t){
        count++;
        if( ev->fib && account_overhead ){
            overhead += ev->ts_ovh;
        }
        /* the first event of the list is never printed */
        if( count > 1){
            char *line = NULL;
            size_t line_len;
            /* keep only what follows the last '/' of the file name */
            const char *file_name = ev->file;
            const char *slash = strrchr(file_name, '/');
            if( slash != NULL ){
                file_name = slash + 1;
            }

            if( prefix != NULL ){
                rc = asprintf(&line,"%s:\t%lfs\t\"%s\"\t|\t%s\t%s\t%s\t%s:%d\n",
                              prefix,ev->ts + hnp_offs + overhead,
                              ev->descr, node, job, ev->func, file_name, ev->line);
            } else {
                rc = asprintf(&line,"%lfs\t\"%s\"\t|\t%s\t%s\t%s\t%s:%d\n",
                              ev->ts + hnp_offs + overhead,
                              ev->descr, node, job, ev->func, file_name, ev->line);
            }
            if( rc < 0 ){
                opal_output(0, "opal_timing_report: Cannot asprintf!\n");
                rc = OPAL_ERR_OUT_OF_RESOURCE;
                goto err_exit;
            }
            rc = OPAL_SUCCESS;

            line_len = strlen(line);
            if( line_len > OPAL_TIMING_OUTBUF_SIZE ){
                opal_output(0, "opal_timing_report: timing output buffer overflow!\n");
                free(line);
                rc = OPAL_ERR_OUT_OF_RESOURCE;
                goto err_exit;
            }
            if( buf_used + line_len > OPAL_TIMING_OUTBUF_SIZE ){
                /* no room for this line: write out what has been gathered */
                opal_timing_report_flush(fp, buf);
                buf[0] = '\0';
                buf_used = 0;
            }
            /* Append at the known end of the text.  Formatting buf into
             * itself with sprintf() copies between overlapping objects,
             * which is undefined behavior. */
            memcpy(buf + buf_used, line, line_len + 1);
            buf_used += line_len;
            free(line);
        }
    }

    if( buf_used > 0 ){
        opal_timing_report_flush(fp, buf);
        buf[0] = '\0';
        buf_used = 0;
    }

err_exit:
    free(buf);
    if( fp != NULL ){
        /* fclose() flushes; a failure here means report data was lost */
        if( fclose(fp) != 0 && rc == OPAL_SUCCESS ){
            opal_output(0, "opal_timing_report: Cannot write timing information to %s!\n", fname);
            rc = OPAL_ERROR;
        }
    }
    return rc;
}
/*
 * Free the event storage of `t` and destroy its event list.
 *
 * Events live in arrays obtained from malloc(); only the first event of each
 * array (flagged fib) is a pointer that may be passed to free().  The list
 * is therefore emptied first, setting the fib events aside, and the arrays
 * are freed only afterwards -- freeing while still unlinking would touch
 * list items located inside memory that is already gone.
 *
 * Calling this on a handle whose event list is already NULL (for example a
 * second release) does nothing.
 *
 * @param t  timing handle to release
 */
void opal_timing_release(opal_timing_t *t)
{
    int cnt;

    if( t->events == NULL ){
        /* nothing to release */
        return;
    }

    cnt = opal_list_get_size(t->events);
    if( cnt > 0 ){
        opal_list_t *tmp = OBJ_NEW(opal_list_t);
        int i;
        /* Pass 1: unlink every event, keeping the array heads */
        for(i=0; i<cnt; i++){
            opal_timing_event_t *ev = (opal_timing_event_t *)opal_list_remove_first(t->events);
            if( ev->fib ){
                opal_list_append(tmp,(opal_list_item_t*)ev);
            }
        }
        /* Pass 2: each head is the start of one malloc()'d array */
        cnt = opal_list_get_size(tmp);
        for(i=0; i<cnt; i++){
            opal_timing_event_t *ev = (opal_timing_event_t *)opal_list_remove_first(tmp);
            free(ev);
        }
        OBJ_RELEASE(tmp);
    } else {
        // Error case. At least one event was inserted at initialization.
    }
    OBJ_RELEASE(t->events);
    t->events = NULL;
    /* t->buffer may point into an array that was just freed: drop it and
     * mark the buffer as full so it cannot be written through again */
    t->buffer = NULL;
    t->buffer_offset = t->buffer_size;
}