1
1

Continue to improve timing instrumentation. Add ability to store timing data directly to a file instead of just to stdout.

This commit was SVN r20229.
Этот коммит содержится в:
Ralph Castain 2009-01-08 14:27:52 +00:00
родитель 007d68becc
Коммит 25f578a7d2
10 изменённых файлов: 269 добавлений и 193 удалений

Просмотреть файл

@ -22,9 +22,6 @@
#include "orte/types.h"
#include <string.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/threads/condition.h"
#include "orte/util/show_help.h"
@ -197,7 +194,6 @@ CLEANUP:
static bool barrier_recvd;
static bool barrier_timer;
static void barrier_recv(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
@ -207,29 +203,17 @@ static void barrier_recv(int status, orte_process_name_t* sender,
barrier_recvd = true;
}
static void barrier_timer_recv(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata)
{
barrier_timer = true;
}
static int barrier(void)
{
opal_buffer_t buf;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_COLL_CMD;
orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_BARRIER;
int rc;
struct timeval ompistart, ompistop;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
"%s grpcomm:bad entering barrier",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (orte_timing && ORTE_PROC_MY_NAME->vpid == 0) {
gettimeofday(&ompistart, NULL);
}
/* everyone sends barrier to local daemon */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
/* tell the daemon to collect the data */
@ -272,44 +256,7 @@ static int barrier(void)
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
"%s grpcomm:bad received barrier release",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (orte_timing) {
if (ORTE_PROC_MY_NAME->vpid == 0) {
/* setup a receive to hear when the rank=N proc has received the data
* release - in most xcast schemes, this will always be the final recvr
*/
barrier_timer = false;
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE_TIMER,
ORTE_RML_NON_PERSISTENT, barrier_timer_recv, NULL);
if (rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
ORTE_PROGRESSED_WAIT(barrier_timer, 0, 1);
gettimeofday(&ompistop, NULL);
opal_output(0, "%s time to complete barrier %ld usec",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
(ompistop.tv_usec - ompistart.tv_usec)));
} else if (ORTE_PROC_MY_NAME->vpid == orte_process_info.num_procs-1) {
/* if we are rank=N, send a message back to indicate
* the xcast completed for timing purposes
*/
orte_process_name_t name;
name.jobid = ORTE_PROC_MY_NAME->jobid;
name.vpid = 0;
OBJ_CONSTRUCT(&buf, opal_buffer_t);
if (0 > (rc = orte_rml.send_buffer(&name,&buf,ORTE_RML_TAG_COLLECTIVE_TIMER,0))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buf);
return rc;
}
rc = ORTE_SUCCESS;
OBJ_DESTRUCT(&buf);
}
}
return ORTE_SUCCESS;
}
@ -333,7 +280,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
int rc;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_COLL_CMD;
struct timeval ompistart, ompistop;
opal_buffer_t coll;
orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_ALLGATHER;
@ -341,10 +287,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
"%s grpcomm:bad entering allgather",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (orte_timing && ORTE_PROC_MY_NAME->vpid == 0) {
gettimeofday(&ompistart, NULL);
}
/* everyone sends data to their local daemon */
OBJ_CONSTRUCT(&coll, opal_buffer_t);
/* tell the daemon to collect the data */
@ -400,49 +342,7 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
return rc;
}
OBJ_RELEASE(allgather_buf);
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
"%s allgather buffer received",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (orte_timing) {
if (ORTE_PROC_MY_NAME->vpid == 0) {
/* setup a receive to hear when the rank=N proc has received the data
* release - in most xcast schemes, this will always be the final recvr
*/
barrier_timer = false;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE_TIMER,
ORTE_RML_NON_PERSISTENT, barrier_timer_recv, NULL);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
ORTE_PROGRESSED_WAIT(barrier_timer, 0, 1);
gettimeofday(&ompistop, NULL);
opal_output(0, "%s allgather: time to complete %ld usec",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
(ompistop.tv_usec - ompistart.tv_usec)));
} else if (ORTE_PROC_MY_NAME->vpid == orte_process_info.num_procs-1) {
/* if we are rank=N, send a message back to indicate
* the xcast completed for timing purposes
*/
orte_process_name_t name;
opal_buffer_t buf;
name.jobid = ORTE_PROC_MY_NAME->jobid;
name.vpid = 0;
OBJ_CONSTRUCT(&buf, opal_buffer_t);
if (0 > (rc = orte_rml.send_buffer(&name,&buf,ORTE_RML_TAG_COLLECTIVE_TIMER,0))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = ORTE_SUCCESS;
OBJ_DESTRUCT(&buf);
}
}
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
"%s grpcomm:bad allgather completed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -22,9 +22,6 @@
#include "orte/types.h"
#include <string.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include <fcntl.h>
#include "opal/threads/condition.h"
@ -257,7 +254,6 @@ CLEANUP:
static bool barrier_recvd;
static bool barrier_timer;
static void barrier_recv(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
@ -267,13 +263,6 @@ static void barrier_recv(int status, orte_process_name_t* sender,
barrier_recvd = true;
}
static void barrier_timer_recv(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata)
{
barrier_timer = true;
}
static int barrier(void)
{
opal_buffer_t buf;
@ -351,7 +340,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
int rc;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_COLL_CMD;
struct timeval ompistart, ompistop;
opal_buffer_t coll;
orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_ALLGATHER;
@ -359,10 +347,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
"%s grpcomm:basic entering allgather",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (orte_timing && ORTE_PROC_MY_NAME->vpid == 0) {
gettimeofday(&ompistart, NULL);
}
/* everyone sends data to their local daemon */
OBJ_CONSTRUCT(&coll, opal_buffer_t);
/* tell the daemon to collect the data */
@ -419,48 +403,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
}
OBJ_RELEASE(allgather_buf);
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
"%s allgather buffer received",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (orte_timing) {
if (ORTE_PROC_MY_NAME->vpid == 0) {
/* setup a receive to hear when the rank=N proc has received the data
* release - in most xcast schemes, this will always be the final recvr
*/
barrier_timer = false;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE_TIMER,
ORTE_RML_NON_PERSISTENT, barrier_timer_recv, NULL);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
ORTE_PROGRESSED_WAIT(barrier_timer, 0, 1);
gettimeofday(&ompistop, NULL);
opal_output(0, "%s allgather: time to complete %ld usec",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
(ompistop.tv_usec - ompistart.tv_usec)));
} else if (ORTE_PROC_MY_NAME->vpid == orte_process_info.num_procs-1) {
/* if we are rank=N, send a message back to indicate
* the xcast completed for timing purposes
*/
orte_process_name_t name;
opal_buffer_t buf;
name.jobid = ORTE_PROC_MY_NAME->jobid;
name.vpid = 0;
OBJ_CONSTRUCT(&buf, opal_buffer_t);
if (0 > (rc = orte_rml.send_buffer(&name,&buf,ORTE_RML_TAG_COLLECTIVE_TIMER,0))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = ORTE_SUCCESS;
OBJ_DESTRUCT(&buf);
}
}
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
"%s grpcomm:basic allgather completed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -24,6 +24,9 @@
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/util/argv.h"
#include "opal/runtime/opal_progress.h"
@ -87,29 +90,34 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
{
opal_byte_object_t bo;
int i;
orte_nid_t **nodes;
opal_pointer_array_t dummy;
orte_nid_t **nd;
opal_list_item_t *item;
orte_attr_t *attr;
/* construct a nodemap */
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OBJ_CONSTRUCT(&dummy, opal_pointer_array_t);
if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo, &dummy))) {
if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* print-out the map */
nodes = (orte_nid_t**)dummy.addr;
for (i=0; i < dummy.size; i++) {
if (NULL != nodes[i]) {
fprintf(stderr, "NIDMAP: name %s daemon %s arch %0x\n",
nodes[i]->name, ORTE_VPID_PRINT(nodes[i]->daemon), nodes[i]->arch);
OBJ_RELEASE(nodes[i]);
nd = (orte_nid_t**)orte_nidmap.addr;
for (i=0; i < orte_nidmap.size && NULL != nd[i]; i++) {
fprintf(stderr, "%s node[%d].name %s daemon %s arch %0x\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i,
(NULL == nd[i]) ? "NULL" : nd[i]->name,
ORTE_VPID_PRINT(nd[i]->daemon),
(NULL == nd[i]) ? 0 : nd[i]->arch);
for (item = opal_list_get_first(&nd[i]->attrs);
item != opal_list_get_end(&nd[i]->attrs);
item = opal_list_get_next(item)) {
attr = (orte_attr_t*)item;
fprintf(stderr, "\tAttribute: %s #bytes: %d\n", attr->name, attr->size);
}
}
OBJ_DESTRUCT(&dummy);
}
#endif
@ -175,6 +183,8 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
return ORTE_SUCCESS;
}
static struct timeval app_launch_start, app_launch_stop;
int orte_plm_base_launch_apps(orte_jobid_t job)
{
orte_job_t *jdata;
@ -189,6 +199,10 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job)));
if (orte_timing) {
gettimeofday(&app_launch_start, NULL);
}
/* find the job's data record */
if (NULL == (jdata = orte_get_job_data_object(job))) {
/* bad jobid */
@ -231,6 +245,26 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
return rc;
}
if (orte_timing) {
unsigned long maxsec, maxusec, minutes, seconds;
float fsecs;
gettimeofday(&app_launch_stop, NULL);
/* subtract starting time to get time in microsecs for test */
maxsec = app_launch_stop.tv_sec - app_launch_start.tv_sec;
maxusec = app_launch_stop.tv_usec - app_launch_start.tv_usec;
/* pretty-print the result */
seconds = maxsec + (maxusec / 1000000l);
minutes = seconds / 60l;
seconds = seconds % 60l;
if (0 == minutes && 0 == seconds) {
fsecs = ((float)(maxsec)*1000000.0 + (float)maxusec) / 1000.0;
fprintf(orte_timing_output, "Time to launch apps: %8.2f millisecs\n", fsecs);
} else {
fprintf(orte_timing_output, "Time to launch apps: %3lu:%02lu min:sec\n", minutes, seconds);
}
}
/* complete wiring up the iof */
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:launch wiring up iof",
@ -257,6 +291,7 @@ static int orted_num_callback;
static bool orted_failed_launch;
static orte_job_t *jdatorted;
static orte_proc_t **pdatorted;
static struct timeval daemonlaunchtime = {0,0}, daemonsetuptime = {0,0}, daemoncbtime = {0,0};
void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
int status, orte_job_state_t state)
@ -350,12 +385,21 @@ static void process_orted_launch_report(int fd, short event, void *data)
int rc, idx;
int32_t arch;
orte_node_t **nodes;
struct timeval recvtime;
long secs, usecs;
int64_t setupsec, setupusec;
int64_t startsec, startusec;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_report_launch from daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&mev->sender)));
/* see if we need to timestamp this receipt */
if (orte_timing) {
gettimeofday(&recvtime, NULL);
}
/* update state */
pdatorted[mev->sender.vpid]->state = ORTE_PROC_STATE_RUNNING;
@ -385,6 +429,81 @@ static void process_orted_launch_report(int fd, short event, void *data)
orted_failed_launch = true;
goto CLEANUP;
}
/* if we are doing a timing test, unload the start and setup times of the daemon */
if (orte_timing) {
/* get the time stamp when the daemon first started */
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &startsec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &startusec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
/* save the latest daemon to start */
if (startsec > daemonlaunchtime.tv_sec) {
daemonlaunchtime.tv_sec = startsec;
daemonlaunchtime.tv_usec = startusec;
} else if (startsec == daemonlaunchtime.tv_sec &&
startusec > daemonlaunchtime.tv_usec) {
daemonlaunchtime.tv_usec = startusec;
}
/* get the time required for the daemon to setup - locally computed by each daemon */
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupsec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupusec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
/* save the longest */
if (setupsec > daemonsetuptime.tv_sec) {
daemonsetuptime.tv_sec = setupsec;
daemonsetuptime.tv_usec = setupusec;
} else if (setupsec == daemonsetuptime.tv_sec &&
setupusec > daemonsetuptime.tv_usec) {
daemonsetuptime.tv_usec = setupusec;
}
/* get the time stamp of when the daemon started to send this message to us */
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupsec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupusec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
/* check the time for the callback to complete and save the longest */
secs = recvtime.tv_sec - setupsec;
if (setupusec <= recvtime.tv_usec) {
usecs = recvtime.tv_usec - setupusec;
} else {
secs--;
usecs = 1000000 - setupusec + recvtime.tv_usec;
}
if (secs > daemoncbtime.tv_sec) {
daemoncbtime.tv_sec = secs;
daemoncbtime.tv_usec = usecs;
} else if (secs == daemoncbtime.tv_sec &&
usecs > daemoncbtime.tv_usec) {
daemoncbtime.tv_usec = usecs;
}
}
/* lookup the node */
nodes = (orte_node_t**)orte_node_pool->addr;
if (NULL == nodes[mev->sender.vpid]) {
@ -446,7 +565,9 @@ static void orted_report_launch(int status, orte_process_name_t* sender,
int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
{
int rc;
unsigned long minutes, seconds;
float fsecs;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:daemon_callback",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -475,6 +596,39 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
return rc;
}
/* if we are timing, output the results */
if (orte_timing) {
seconds = (daemonlaunchtime.tv_sec - orte_plm_globals.daemonlaunchstart.tv_sec) +
((daemonlaunchtime.tv_usec - orte_plm_globals.daemonlaunchstart.tv_usec) / 1000000l);
minutes = seconds / 60l;
seconds = seconds % 60l;
if (0 == minutes && 0 == seconds) {
fsecs = ((float)(daemonlaunchtime.tv_sec - orte_plm_globals.daemonlaunchstart.tv_sec)*1000000.0 +
(float)(daemonlaunchtime.tv_usec - orte_plm_globals.daemonlaunchstart.tv_usec)) / 1000.0;
fprintf(orte_timing_output, "Daemon initial launch was completed in %8.2f millisecs\n", fsecs);
} else {
fprintf(orte_timing_output, "Daemon initial launch was completed in %3lu:%02lu min:sec\n", minutes, seconds);
}
seconds = daemonsetuptime.tv_sec + (daemonsetuptime.tv_usec / 1000000l);
minutes = seconds / 60l;
seconds = seconds % 60l;
if (0 == minutes && 0 == seconds) {
fsecs = ((float)(daemonsetuptime.tv_sec)*1000000.0 + (float)daemonsetuptime.tv_usec) / 1000.0;
fprintf(orte_timing_output, "Daemon setup was completed in a maximum of %8.2f millisecs\n", fsecs);
} else {
fprintf(orte_timing_output, "Daemon setup was completed in a maximum of %3lu:%02lu min:sec\n", minutes, seconds);
}
seconds = daemoncbtime.tv_sec + (daemoncbtime.tv_usec / 1000000l);
minutes = seconds / 60l;
seconds = seconds % 60l;
if (0 == minutes && 0 == seconds) {
fsecs = ((float)(daemoncbtime.tv_sec)*1000000.0 + (float)daemoncbtime.tv_usec) / 1000.0;
fprintf(orte_timing_output, "Daemon callback to HNP was completed in a maximum of %8.3f millisecs\n", fsecs);
} else {
fprintf(orte_timing_output, "Daemon callback to HNP was completed in a maximum of %3lu:%02lu min:sec\n", minutes, seconds);
}
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:daemon_callback completed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -27,6 +27,10 @@
#include "orte_config.h"
#include "orte/types.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/class/opal_list.h"
#include "opal/threads/condition.h"
@ -49,6 +53,8 @@ typedef struct {
opal_condition_t orted_cmd_cond;
/* next jobid */
orte_jobid_t next_jobid;
/* time when daemons started launch */
struct timeval daemonlaunchstart;
} orte_plm_globals_t;
/**
* Global instance of PLM framework data

Просмотреть файл

@ -148,6 +148,11 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
int proc_vpid_index;
orte_jobid_t failed_job;
/* if we are timing, record the start time */
if (orte_timing) {
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
}
/* flag the daemons as failing by default */
failed_job = ORTE_PROC_MY_NAME->jobid;

Просмотреть файл

@ -148,6 +148,11 @@ static int plm_tm_launch_job(orte_job_t *jdata)
mode_t current_umask;
orte_jobid_t failed_job;
/* if we are timing, record the start time */
if (orte_timing) {
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
}
/* default to declaring the daemons as failed */
failed_job = ORTE_PROC_MY_NAME->jobid;

Просмотреть файл

@ -23,9 +23,6 @@
#include <errno.h>
#include <unistd.h>
#include <string.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/util/argv.h"
#include "orte/util/show_help.h"
@ -122,13 +119,7 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
opal_list_item_t* item;
FILE *fp;
char *hostname;
struct timeval start, stop;
/* check for timing request - get start time if so */
if (orte_timing) {
gettimeofday(&start, NULL);
}
/* Ignore anything that the user already specified -- we're
getting nodes only from TM. */
@ -206,15 +197,6 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
nodeid++;
}
/* check for timing request - get stop time and report elapsed time if so */
if (orte_timing) {
gettimeofday(&stop, NULL);
opal_output(0, "ras_tm: time to allocate is %ld usec",
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
(stop.tv_usec - start.tv_usec)));
gettimeofday(&start, NULL);
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -184,7 +184,7 @@ static void send_relay(opal_buffer_t *buf, orte_jobid_t target_job, orte_rml_tag
} else {
/* buffer is already setup - just point to it */
buffer = buf;
/* retain it so that callback function can release it */
/* retain it to keep bookkeeping straight */
OBJ_RETAIN(buffer);
/* tag needs to be set to daemon_tag */
tag = ORTE_RML_TAG_DAEMON;
@ -227,6 +227,7 @@ static void send_relay(opal_buffer_t *buf, orte_jobid_t target_job, orte_rml_tag
"%s orte:daemon:send_relay sending relay msg to %s tag %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&target), tag));
/* retain buffer so callback function can release it */
OBJ_RETAIN(buffer);
if (0 > (ret = orte_rml.send_buffer_nb(&target, buffer, tag, 0,
send_callback, NULL))) {
@ -244,6 +245,7 @@ static void send_relay(opal_buffer_t *buf, orte_jobid_t target_job, orte_rml_tag
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&nm->name)));
/* retain buffer so callback function can release it */
OBJ_RETAIN(buffer);
if (0 > (ret = orte_rml.send_buffer_nb(&nm->name, buffer, tag, 0,
send_callback, NULL))) {

Просмотреть файл

@ -36,6 +36,9 @@
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/event/event.h"
@ -196,7 +199,11 @@ int orte_daemon(int argc, char *argv[])
opal_buffer_t *buffer;
char hostname[100];
char *tmp_env_var = NULL;
struct timeval starttime, setuptime;
/* get our time for first executable */
gettimeofday(&starttime, NULL);
/* initialize the globals */
memset(&orted_globals, 0, sizeof(orted_globals));
/* initialize the singleton died pipe to an illegal value so we can detect it was set */
@ -606,6 +613,56 @@ int orte_daemon(int argc, char *argv[])
OBJ_RELEASE(buffer);
return ret;
}
if (orte_timing) {
int64_t secs, usecs;
/* add our start time */
secs = starttime.tv_sec;
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
}
usecs = starttime.tv_usec;
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
}
/* get and send our setup time */
gettimeofday(&setuptime, NULL);
secs = setuptime.tv_sec - starttime.tv_sec;
if (starttime.tv_usec <= setuptime.tv_usec) {
usecs = setuptime.tv_usec - starttime.tv_usec;
} else {
secs--;
usecs = 1000000 - starttime.tv_usec + setuptime.tv_usec;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
}
/* include the actual timestamp so the HNP can figure out how
* long it took for this message to arrive
*/
secs = setuptime.tv_sec;
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
}
usecs = setuptime.tv_usec;
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
}
}
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer,
ORTE_RML_TAG_ORTED_CALLBACK, 0))) {
ORTE_ERROR_LOG(ret);

Просмотреть файл

@ -24,8 +24,10 @@
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <stdio.h>
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
@ -144,6 +146,27 @@ int orte_register_params(void)
false, false, (int)false, &value);
orte_timing = OPAL_INT_TO_BOOL(value);
if (orte_process_info.hnp) {
char *tmp;
mca_base_param_reg_string_name("orte", "timing_file",
"Name of the file where timing data is to be written (relative or absolute path)",
false, false, NULL, &tmp);
if (orte_timing && NULL == tmp) {
/* send the timing output to stdout */
orte_timing_output = stdout;
} else if (NULL != tmp) {
/* make sure the timing flag is set */
orte_timing = true;
/* send the output to the indicated file */
orte_timing_output = fopen(tmp, "w");
if (NULL == orte_timing_output) {
/* couldn't be opened */
opal_output(0, "File %s could not be opened", tmp);
orte_timing_output = stderr;
}
}
}
/* User-level debugger info string */
mca_base_param_reg_string_name("orte", "base_user_debugger",