Continue to improve timing instrumentation. Add ability to store timing data directly to a file instead of just to stdout.
This commit was SVN r20229.
Этот коммит содержится в:
родитель
007d68becc
Коммит
25f578a7d2
@ -22,9 +22,6 @@
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <string.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
#include "orte/util/show_help.h"
|
||||
@ -197,7 +194,6 @@ CLEANUP:
|
||||
|
||||
|
||||
static bool barrier_recvd;
|
||||
static bool barrier_timer;
|
||||
|
||||
static void barrier_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
@ -207,29 +203,17 @@ static void barrier_recv(int status, orte_process_name_t* sender,
|
||||
barrier_recvd = true;
|
||||
}
|
||||
|
||||
static void barrier_timer_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
{
|
||||
barrier_timer = true;
|
||||
}
|
||||
|
||||
static int barrier(void)
|
||||
{
|
||||
opal_buffer_t buf;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_COLL_CMD;
|
||||
orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_BARRIER;
|
||||
int rc;
|
||||
struct timeval ompistart, ompistop;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||
"%s grpcomm:bad entering barrier",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (orte_timing && ORTE_PROC_MY_NAME->vpid == 0) {
|
||||
gettimeofday(&ompistart, NULL);
|
||||
}
|
||||
|
||||
/* everyone sends barrier to local daemon */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
/* tell the daemon to collect the data */
|
||||
@ -272,44 +256,7 @@ static int barrier(void)
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||
"%s grpcomm:bad received barrier release",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (orte_timing) {
|
||||
if (ORTE_PROC_MY_NAME->vpid == 0) {
|
||||
/* setup a receive to hear when the rank=N proc has received the data
|
||||
* release - in most xcast schemes, this will always be the final recvr
|
||||
*/
|
||||
barrier_timer = false;
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE_TIMER,
|
||||
ORTE_RML_NON_PERSISTENT, barrier_timer_recv, NULL);
|
||||
if (rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
ORTE_PROGRESSED_WAIT(barrier_timer, 0, 1);
|
||||
gettimeofday(&ompistop, NULL);
|
||||
opal_output(0, "%s time to complete barrier %ld usec",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||
} else if (ORTE_PROC_MY_NAME->vpid == orte_process_info.num_procs-1) {
|
||||
/* if we are rank=N, send a message back to indicate
|
||||
* the xcast completed for timing purposes
|
||||
*/
|
||||
orte_process_name_t name;
|
||||
|
||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
name.vpid = 0;
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
if (0 > (rc = orte_rml.send_buffer(&name,&buf,ORTE_RML_TAG_COLLECTIVE_TIMER,0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return rc;
|
||||
}
|
||||
rc = ORTE_SUCCESS;
|
||||
OBJ_DESTRUCT(&buf);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -333,7 +280,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
{
|
||||
int rc;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_COLL_CMD;
|
||||
struct timeval ompistart, ompistop;
|
||||
opal_buffer_t coll;
|
||||
orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_ALLGATHER;
|
||||
|
||||
@ -341,10 +287,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
"%s grpcomm:bad entering allgather",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (orte_timing && ORTE_PROC_MY_NAME->vpid == 0) {
|
||||
gettimeofday(&ompistart, NULL);
|
||||
}
|
||||
|
||||
/* everyone sends data to their local daemon */
|
||||
OBJ_CONSTRUCT(&coll, opal_buffer_t);
|
||||
/* tell the daemon to collect the data */
|
||||
@ -400,49 +342,7 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
return rc;
|
||||
}
|
||||
OBJ_RELEASE(allgather_buf);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||
"%s allgather buffer received",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (orte_timing) {
|
||||
if (ORTE_PROC_MY_NAME->vpid == 0) {
|
||||
/* setup a receive to hear when the rank=N proc has received the data
|
||||
* release - in most xcast schemes, this will always be the final recvr
|
||||
*/
|
||||
barrier_timer = false;
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE_TIMER,
|
||||
ORTE_RML_NON_PERSISTENT, barrier_timer_recv, NULL);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
ORTE_PROGRESSED_WAIT(barrier_timer, 0, 1);
|
||||
gettimeofday(&ompistop, NULL);
|
||||
opal_output(0, "%s allgather: time to complete %ld usec",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||
} else if (ORTE_PROC_MY_NAME->vpid == orte_process_info.num_procs-1) {
|
||||
/* if we are rank=N, send a message back to indicate
|
||||
* the xcast completed for timing purposes
|
||||
*/
|
||||
orte_process_name_t name;
|
||||
opal_buffer_t buf;
|
||||
|
||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
name.vpid = 0;
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
if (0 > (rc = orte_rml.send_buffer(&name,&buf,ORTE_RML_TAG_COLLECTIVE_TIMER,0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = ORTE_SUCCESS;
|
||||
OBJ_DESTRUCT(&buf);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||
"%s grpcomm:bad allgather completed",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -22,9 +22,6 @@
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <string.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
#include <fcntl.h>
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
@ -257,7 +254,6 @@ CLEANUP:
|
||||
|
||||
|
||||
static bool barrier_recvd;
|
||||
static bool barrier_timer;
|
||||
|
||||
static void barrier_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
@ -267,13 +263,6 @@ static void barrier_recv(int status, orte_process_name_t* sender,
|
||||
barrier_recvd = true;
|
||||
}
|
||||
|
||||
static void barrier_timer_recv(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
{
|
||||
barrier_timer = true;
|
||||
}
|
||||
|
||||
static int barrier(void)
|
||||
{
|
||||
opal_buffer_t buf;
|
||||
@ -351,7 +340,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
{
|
||||
int rc;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_COLL_CMD;
|
||||
struct timeval ompistart, ompistop;
|
||||
opal_buffer_t coll;
|
||||
orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_ALLGATHER;
|
||||
|
||||
@ -359,10 +347,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
"%s grpcomm:basic entering allgather",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (orte_timing && ORTE_PROC_MY_NAME->vpid == 0) {
|
||||
gettimeofday(&ompistart, NULL);
|
||||
}
|
||||
|
||||
/* everyone sends data to their local daemon */
|
||||
OBJ_CONSTRUCT(&coll, opal_buffer_t);
|
||||
/* tell the daemon to collect the data */
|
||||
@ -419,48 +403,6 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
}
|
||||
OBJ_RELEASE(allgather_buf);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||
"%s allgather buffer received",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (orte_timing) {
|
||||
if (ORTE_PROC_MY_NAME->vpid == 0) {
|
||||
/* setup a receive to hear when the rank=N proc has received the data
|
||||
* release - in most xcast schemes, this will always be the final recvr
|
||||
*/
|
||||
barrier_timer = false;
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE_TIMER,
|
||||
ORTE_RML_NON_PERSISTENT, barrier_timer_recv, NULL);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
ORTE_PROGRESSED_WAIT(barrier_timer, 0, 1);
|
||||
gettimeofday(&ompistop, NULL);
|
||||
opal_output(0, "%s allgather: time to complete %ld usec",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||
} else if (ORTE_PROC_MY_NAME->vpid == orte_process_info.num_procs-1) {
|
||||
/* if we are rank=N, send a message back to indicate
|
||||
* the xcast completed for timing purposes
|
||||
*/
|
||||
orte_process_name_t name;
|
||||
opal_buffer_t buf;
|
||||
|
||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
name.vpid = 0;
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
if (0 > (rc = orte_rml.send_buffer(&name,&buf,ORTE_RML_TAG_COLLECTIVE_TIMER,0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = ORTE_SUCCESS;
|
||||
OBJ_DESTRUCT(&buf);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||
"%s grpcomm:basic allgather completed",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -24,6 +24,9 @@
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
@ -87,29 +90,34 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
{
|
||||
opal_byte_object_t bo;
|
||||
int i;
|
||||
orte_nid_t **nodes;
|
||||
opal_pointer_array_t dummy;
|
||||
|
||||
orte_nid_t **nd;
|
||||
opal_list_item_t *item;
|
||||
orte_attr_t *attr;
|
||||
|
||||
/* construct a nodemap */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
OBJ_CONSTRUCT(&dummy, opal_pointer_array_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo, &dummy))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(&bo))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* print-out the map */
|
||||
nodes = (orte_nid_t**)dummy.addr;
|
||||
for (i=0; i < dummy.size; i++) {
|
||||
if (NULL != nodes[i]) {
|
||||
fprintf(stderr, "NIDMAP: name %s daemon %s arch %0x\n",
|
||||
nodes[i]->name, ORTE_VPID_PRINT(nodes[i]->daemon), nodes[i]->arch);
|
||||
OBJ_RELEASE(nodes[i]);
|
||||
nd = (orte_nid_t**)orte_nidmap.addr;
|
||||
for (i=0; i < orte_nidmap.size && NULL != nd[i]; i++) {
|
||||
fprintf(stderr, "%s node[%d].name %s daemon %s arch %0x\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i,
|
||||
(NULL == nd[i]) ? "NULL" : nd[i]->name,
|
||||
ORTE_VPID_PRINT(nd[i]->daemon),
|
||||
(NULL == nd[i]) ? 0 : nd[i]->arch);
|
||||
for (item = opal_list_get_first(&nd[i]->attrs);
|
||||
item != opal_list_get_end(&nd[i]->attrs);
|
||||
item = opal_list_get_next(item)) {
|
||||
attr = (orte_attr_t*)item;
|
||||
fprintf(stderr, "\tAttribute: %s #bytes: %d\n", attr->name, attr->size);
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&dummy);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -175,6 +183,8 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static struct timeval app_launch_start, app_launch_stop;
|
||||
|
||||
int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
@ -189,6 +199,10 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job)));
|
||||
|
||||
if (orte_timing) {
|
||||
gettimeofday(&app_launch_start, NULL);
|
||||
}
|
||||
|
||||
/* find the job's data record */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
/* bad jobid */
|
||||
@ -231,6 +245,26 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (orte_timing) {
|
||||
unsigned long maxsec, maxusec, minutes, seconds;
|
||||
float fsecs;
|
||||
|
||||
gettimeofday(&app_launch_stop, NULL);
|
||||
/* subtract starting time to get time in microsecs for test */
|
||||
maxsec = app_launch_stop.tv_sec - app_launch_start.tv_sec;
|
||||
maxusec = app_launch_stop.tv_usec - app_launch_start.tv_usec;
|
||||
/* pretty-print the result */
|
||||
seconds = maxsec + (maxusec / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(maxsec)*1000000.0 + (float)maxusec) / 1000.0;
|
||||
fprintf(orte_timing_output, "Time to launch apps: %8.2f millisecs\n", fsecs);
|
||||
} else {
|
||||
fprintf(orte_timing_output, "Time to launch apps: %3lu:%02lu min:sec\n", minutes, seconds);
|
||||
}
|
||||
}
|
||||
|
||||
/* complete wiring up the iof */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:launch wiring up iof",
|
||||
@ -257,6 +291,7 @@ static int orted_num_callback;
|
||||
static bool orted_failed_launch;
|
||||
static orte_job_t *jdatorted;
|
||||
static orte_proc_t **pdatorted;
|
||||
static struct timeval daemonlaunchtime = {0,0}, daemonsetuptime = {0,0}, daemoncbtime = {0,0};
|
||||
|
||||
void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
|
||||
int status, orte_job_state_t state)
|
||||
@ -350,12 +385,21 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
||||
int rc, idx;
|
||||
int32_t arch;
|
||||
orte_node_t **nodes;
|
||||
struct timeval recvtime;
|
||||
long secs, usecs;
|
||||
int64_t setupsec, setupusec;
|
||||
int64_t startsec, startusec;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:orted_report_launch from daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&mev->sender)));
|
||||
|
||||
/* see if we need to timestamp this receipt */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&recvtime, NULL);
|
||||
}
|
||||
|
||||
/* update state */
|
||||
pdatorted[mev->sender.vpid]->state = ORTE_PROC_STATE_RUNNING;
|
||||
|
||||
@ -385,6 +429,81 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* if we are doing a timing test, unload the start and setup times of the daemon */
|
||||
if (orte_timing) {
|
||||
/* get the time stamp when the daemon first started */
|
||||
idx = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &startsec, &idx, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
idx = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &startusec, &idx, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* save the latest daemon to start */
|
||||
if (startsec > daemonlaunchtime.tv_sec) {
|
||||
daemonlaunchtime.tv_sec = startsec;
|
||||
daemonlaunchtime.tv_usec = startusec;
|
||||
} else if (startsec == daemonlaunchtime.tv_sec &&
|
||||
startusec > daemonlaunchtime.tv_usec) {
|
||||
daemonlaunchtime.tv_usec = startusec;
|
||||
}
|
||||
/* get the time required for the daemon to setup - locally computed by each daemon */
|
||||
idx = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupsec, &idx, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
idx = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupusec, &idx, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* save the longest */
|
||||
if (setupsec > daemonsetuptime.tv_sec) {
|
||||
daemonsetuptime.tv_sec = setupsec;
|
||||
daemonsetuptime.tv_usec = setupusec;
|
||||
} else if (setupsec == daemonsetuptime.tv_sec &&
|
||||
setupusec > daemonsetuptime.tv_usec) {
|
||||
daemonsetuptime.tv_usec = setupusec;
|
||||
}
|
||||
/* get the time stamp of when the daemon started to send this message to us */
|
||||
idx = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupsec, &idx, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
idx = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupusec, &idx, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* check the time for the callback to complete and save the longest */
|
||||
secs = recvtime.tv_sec - setupsec;
|
||||
if (setupusec <= recvtime.tv_usec) {
|
||||
usecs = recvtime.tv_usec - setupusec;
|
||||
} else {
|
||||
secs--;
|
||||
usecs = 1000000 - setupusec + recvtime.tv_usec;
|
||||
}
|
||||
if (secs > daemoncbtime.tv_sec) {
|
||||
daemoncbtime.tv_sec = secs;
|
||||
daemoncbtime.tv_usec = usecs;
|
||||
} else if (secs == daemoncbtime.tv_sec &&
|
||||
usecs > daemoncbtime.tv_usec) {
|
||||
daemoncbtime.tv_usec = usecs;
|
||||
}
|
||||
}
|
||||
|
||||
/* lookup the node */
|
||||
nodes = (orte_node_t**)orte_node_pool->addr;
|
||||
if (NULL == nodes[mev->sender.vpid]) {
|
||||
@ -446,7 +565,9 @@ static void orted_report_launch(int status, orte_process_name_t* sender,
|
||||
int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
|
||||
{
|
||||
int rc;
|
||||
|
||||
unsigned long minutes, seconds;
|
||||
float fsecs;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:daemon_callback",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -475,6 +596,39 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we are timing, output the results */
|
||||
if (orte_timing) {
|
||||
seconds = (daemonlaunchtime.tv_sec - orte_plm_globals.daemonlaunchstart.tv_sec) +
|
||||
((daemonlaunchtime.tv_usec - orte_plm_globals.daemonlaunchstart.tv_usec) / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(daemonlaunchtime.tv_sec - orte_plm_globals.daemonlaunchstart.tv_sec)*1000000.0 +
|
||||
(float)(daemonlaunchtime.tv_usec - orte_plm_globals.daemonlaunchstart.tv_usec)) / 1000.0;
|
||||
fprintf(orte_timing_output, "Daemon initial launch was completed in %8.2f millisecs\n", fsecs);
|
||||
} else {
|
||||
fprintf(orte_timing_output, "Daemon initial launch was completed in %3lu:%02lu min:sec\n", minutes, seconds);
|
||||
}
|
||||
seconds = daemonsetuptime.tv_sec + (daemonsetuptime.tv_usec / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(daemonsetuptime.tv_sec)*1000000.0 + (float)daemonsetuptime.tv_usec) / 1000.0;
|
||||
fprintf(orte_timing_output, "Daemon setup was completed in a maximum of %8.2f millisecs\n", fsecs);
|
||||
} else {
|
||||
fprintf(orte_timing_output, "Daemon setup was completed in a maximum of %3lu:%02lu min:sec\n", minutes, seconds);
|
||||
}
|
||||
seconds = daemoncbtime.tv_sec + (daemoncbtime.tv_usec / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(daemoncbtime.tv_sec)*1000000.0 + (float)daemoncbtime.tv_usec) / 1000.0;
|
||||
fprintf(orte_timing_output, "Daemon callback to HNP was completed in a maximum of %8.3f millisecs\n", fsecs);
|
||||
} else {
|
||||
fprintf(orte_timing_output, "Daemon callback to HNP was completed in a maximum of %3lu:%02lu min:sec\n", minutes, seconds);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:daemon_callback completed",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -27,6 +27,10 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/threads/condition.h"
|
||||
|
||||
@ -49,6 +53,8 @@ typedef struct {
|
||||
opal_condition_t orted_cmd_cond;
|
||||
/* next jobid */
|
||||
orte_jobid_t next_jobid;
|
||||
/* time when daemons started launch */
|
||||
struct timeval daemonlaunchstart;
|
||||
} orte_plm_globals_t;
|
||||
/**
|
||||
* Global instance of PLM framework data
|
||||
|
@ -148,6 +148,11 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
int proc_vpid_index;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
/* if we are timing, record the start time */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
|
||||
}
|
||||
|
||||
/* flag the daemons as failing by default */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
|
@ -148,6 +148,11 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
mode_t current_umask;
|
||||
orte_jobid_t failed_job;
|
||||
|
||||
/* if we are timing, record the start time */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
|
||||
}
|
||||
|
||||
/* default to declaring the daemons as failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
|
@ -23,9 +23,6 @@
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "orte/util/show_help.h"
|
||||
@ -122,13 +119,7 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
|
||||
opal_list_item_t* item;
|
||||
FILE *fp;
|
||||
char *hostname;
|
||||
struct timeval start, stop;
|
||||
|
||||
/* check for timing request - get start time if so */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&start, NULL);
|
||||
}
|
||||
|
||||
/* Ignore anything that the user already specified -- we're
|
||||
getting nodes only from TM. */
|
||||
|
||||
@ -206,15 +197,6 @@ static int discover(opal_list_t* nodelist, char *pbs_jobid)
|
||||
nodeid++;
|
||||
}
|
||||
|
||||
/* check for timing request - get stop time and report elapsed time if so */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&stop, NULL);
|
||||
opal_output(0, "ras_tm: time to allocate is %ld usec",
|
||||
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
|
||||
(stop.tv_usec - start.tv_usec)));
|
||||
gettimeofday(&start, NULL);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -184,7 +184,7 @@ static void send_relay(opal_buffer_t *buf, orte_jobid_t target_job, orte_rml_tag
|
||||
} else {
|
||||
/* buffer is already setup - just point to it */
|
||||
buffer = buf;
|
||||
/* retain it so that callback function can release it */
|
||||
/* retain it to keep bookkeeping straight */
|
||||
OBJ_RETAIN(buffer);
|
||||
/* tag needs to be set to daemon_tag */
|
||||
tag = ORTE_RML_TAG_DAEMON;
|
||||
@ -227,6 +227,7 @@ static void send_relay(opal_buffer_t *buf, orte_jobid_t target_job, orte_rml_tag
|
||||
"%s orte:daemon:send_relay sending relay msg to %s tag %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&target), tag));
|
||||
/* retain buffer so callback function can release it */
|
||||
OBJ_RETAIN(buffer);
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(&target, buffer, tag, 0,
|
||||
send_callback, NULL))) {
|
||||
@ -244,6 +245,7 @@ static void send_relay(opal_buffer_t *buf, orte_jobid_t target_job, orte_rml_tag
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&nm->name)));
|
||||
|
||||
/* retain buffer so callback function can release it */
|
||||
OBJ_RETAIN(buffer);
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(&nm->name, buffer, tag, 0,
|
||||
send_callback, NULL))) {
|
||||
|
@ -36,6 +36,9 @@
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
|
||||
#include "opal/event/event.h"
|
||||
@ -196,7 +199,11 @@ int orte_daemon(int argc, char *argv[])
|
||||
opal_buffer_t *buffer;
|
||||
char hostname[100];
|
||||
char *tmp_env_var = NULL;
|
||||
|
||||
struct timeval starttime, setuptime;
|
||||
|
||||
/* get our time for first executable */
|
||||
gettimeofday(&starttime, NULL);
|
||||
|
||||
/* initialize the globals */
|
||||
memset(&orted_globals, 0, sizeof(orted_globals));
|
||||
/* initialize the singleton died pipe to an illegal value so we can detect it was set */
|
||||
@ -606,6 +613,56 @@ int orte_daemon(int argc, char *argv[])
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
if (orte_timing) {
|
||||
int64_t secs, usecs;
|
||||
/* add our start time */
|
||||
secs = starttime.tv_sec;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
usecs = starttime.tv_usec;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
/* get and send our setup time */
|
||||
gettimeofday(&setuptime, NULL);
|
||||
secs = setuptime.tv_sec - starttime.tv_sec;
|
||||
if (starttime.tv_usec <= setuptime.tv_usec) {
|
||||
usecs = setuptime.tv_usec - starttime.tv_usec;
|
||||
} else {
|
||||
secs--;
|
||||
usecs = 1000000 - starttime.tv_usec + setuptime.tv_usec;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
/* include the actual timestamp so the HNP can figure out how
|
||||
* long it took for this message to arrive
|
||||
*/
|
||||
secs = setuptime.tv_sec;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
usecs = setuptime.tv_usec;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer,
|
||||
ORTE_RML_TAG_ORTED_CALLBACK, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
|
@ -24,8 +24,10 @@
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
@ -144,6 +146,27 @@ int orte_register_params(void)
|
||||
false, false, (int)false, &value);
|
||||
orte_timing = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
if (orte_process_info.hnp) {
|
||||
char *tmp;
|
||||
mca_base_param_reg_string_name("orte", "timing_file",
|
||||
"Name of the file where timing data is to be written (relative or absolute path)",
|
||||
false, false, NULL, &tmp);
|
||||
if (orte_timing && NULL == tmp) {
|
||||
/* send the timing output to stdout */
|
||||
orte_timing_output = stdout;
|
||||
} else if (NULL != tmp) {
|
||||
/* make sure the timing flag is set */
|
||||
orte_timing = true;
|
||||
/* send the output to the indicated file */
|
||||
orte_timing_output = fopen(tmp, "w");
|
||||
if (NULL == orte_timing_output) {
|
||||
/* couldn't be opened */
|
||||
opal_output(0, "File %s could not be opened", tmp);
|
||||
orte_timing_output = stderr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* User-level debugger info string */
|
||||
|
||||
mca_base_param_reg_string_name("orte", "base_user_debugger",
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user