Continue to refine the timing instrumentation to identify where launch time is being spent
This commit was SVN r20244.
Этот коммит содержится в:
родитель
56ad7acc8a
Коммит
2778c13fac
@ -485,6 +485,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
opal_list_append(&orte_local_jobdata, &jobdat->super);
|
||||
}
|
||||
|
||||
/* if we are doing a timing test, store the time the msg was recvd */
|
||||
if (orte_timing) {
|
||||
jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
|
||||
jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec;
|
||||
}
|
||||
|
||||
/* UNPACK JOB-SPECIFIC DATA */
|
||||
/* unpack the number of procs in this launch */
|
||||
cnt=1;
|
||||
@ -896,7 +902,7 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int pack_state_for_proc(opal_buffer_t *alert, bool pack_pid, orte_odls_child_t *child)
|
||||
static int pack_state_for_proc(opal_buffer_t *alert, bool include_startup_info, orte_odls_child_t *child)
|
||||
{
|
||||
int rc;
|
||||
|
||||
@ -905,12 +911,26 @@ static int pack_state_for_proc(opal_buffer_t *alert, bool pack_pid, orte_odls_ch
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* pack its pid if we need to report it */
|
||||
if (pack_pid) {
|
||||
/* pack startup info if we need to report it */
|
||||
if (include_startup_info) {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* if we are timing things, pack the time the proc was launched */
|
||||
if (orte_timing) {
|
||||
int64_t tmp;
|
||||
tmp = child->starttime.tv_sec;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
tmp = child->starttime.tv_usec;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* pack its state */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) {
|
||||
@ -926,7 +946,7 @@ static int pack_state_for_proc(opal_buffer_t *alert, bool pack_pid, orte_odls_ch
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int pack_state_update(opal_buffer_t *alert, bool pack_pid, orte_jobid_t job)
|
||||
static int pack_state_update(opal_buffer_t *alert, bool include_startup_info, orte_odls_job_t *jobdat)
|
||||
{
|
||||
int rc;
|
||||
opal_list_item_t *item;
|
||||
@ -934,17 +954,31 @@ static int pack_state_update(opal_buffer_t *alert, bool pack_pid, orte_jobid_t j
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
|
||||
/* pack the jobid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* if we are timing things, pack the time the launch msg for this job was recvd */
|
||||
if (include_startup_info && orte_timing) {
|
||||
int64_t tmp;
|
||||
tmp = jobdat->launch_msg_recvd.tv_sec;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
tmp = jobdat->launch_msg_recvd.tv_usec;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
/* if this child is part of the job... */
|
||||
if (child->name->jobid == job) {
|
||||
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, pack_pid, child))) {
|
||||
if (child->name->jobid == jobdat->jobid) {
|
||||
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, include_startup_info, child))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -1200,6 +1234,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
}
|
||||
free(param);
|
||||
|
||||
/* if we are timing things, record when we are going to launch this proc */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&child->starttime, NULL);
|
||||
}
|
||||
|
||||
/* must unlock prior to fork to keep things clean in the
|
||||
* event library
|
||||
*/
|
||||
@ -1265,7 +1304,7 @@ CLEANUP:
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job)));
|
||||
/* pack the launch results */
|
||||
if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, job))) {
|
||||
if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
@ -1742,6 +1781,8 @@ static void check_proc_complete(orte_odls_child_t *child)
|
||||
int rc;
|
||||
opal_buffer_t alert;
|
||||
orte_plm_cmd_flag_t cmd=ORTE_PLM_UPDATE_PROC_STATE;
|
||||
opal_list_item_t *item;
|
||||
orte_odls_job_t *jdat;
|
||||
|
||||
/* is this proc fully complete? */
|
||||
if (!child->waitpid_recvd || !child->iof_complete) {
|
||||
@ -1817,7 +1858,17 @@ static void check_proc_complete(orte_odls_child_t *child)
|
||||
goto unlock;
|
||||
}
|
||||
/* pack the data for the job */
|
||||
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, false, child->name->jobid))) {
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jdat = (orte_odls_job_t*)item;
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jdat->jobid == child->name->jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, false, jdat))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto unlock;
|
||||
}
|
||||
|
@ -23,6 +23,10 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/dss/dss_types.h"
|
||||
#include "orte/mca/grpcomm/grpcomm_types.h"
|
||||
@ -89,6 +93,7 @@ typedef struct {
|
||||
char *slot_list; /* list of slots for this child */
|
||||
bool waitpid_recvd; /* waitpid has detected proc termination */
|
||||
bool iof_complete; /* IOF has noted proc terminating all channels */
|
||||
struct timeval starttime; /* when the proc was started - for timing purposes only */
|
||||
} orte_odls_child_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_child_t);
|
||||
|
||||
@ -112,6 +117,7 @@ typedef struct orte_odls_job_t {
|
||||
int32_t num_contributors;
|
||||
int num_participating;
|
||||
int num_collected;
|
||||
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
|
||||
} orte_odls_job_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);
|
||||
|
||||
|
@ -59,6 +59,8 @@
|
||||
|
||||
static int orte_plm_base_report_launched(orte_jobid_t job);
|
||||
|
||||
static char *pretty_print_timing(int64_t secs, int64_t usecs);
|
||||
|
||||
int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_t *jdatorted;
|
||||
@ -183,7 +185,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static struct timeval app_launch_start, app_launch_stop;
|
||||
static struct timeval app_launch_start, app_launch_stop, launch_msg_sent;
|
||||
|
||||
int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
{
|
||||
@ -227,6 +229,11 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we are timing, record the time we send this message */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&launch_msg_sent, NULL);
|
||||
}
|
||||
|
||||
/* send the command to the daemons */
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid,
|
||||
buffer, ORTE_RML_TAG_DAEMON))) {
|
||||
@ -246,23 +253,13 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
}
|
||||
|
||||
if (orte_timing) {
|
||||
unsigned long maxsec, maxusec, minutes, seconds;
|
||||
float fsecs;
|
||||
int64_t maxsec, maxusec;
|
||||
|
||||
gettimeofday(&app_launch_stop, NULL);
|
||||
/* subtract starting time to get time in microsecs for test */
|
||||
maxsec = app_launch_stop.tv_sec - app_launch_start.tv_sec;
|
||||
maxusec = app_launch_stop.tv_usec - app_launch_start.tv_usec;
|
||||
/* pretty-print the result */
|
||||
seconds = maxsec + (maxusec / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(maxsec)*1000000.0 + (float)maxusec) / 1000.0;
|
||||
fprintf(orte_timing_output, "Time to launch apps: %8.2f millisecs\n", fsecs);
|
||||
} else {
|
||||
fprintf(orte_timing_output, "Time to launch apps: %3lu:%02lu min:sec\n", minutes, seconds);
|
||||
}
|
||||
fprintf(orte_timing_output, "Time to launch apps: %s\n", pretty_print_timing(maxsec, maxusec));
|
||||
}
|
||||
|
||||
/* complete wiring up the iof */
|
||||
@ -444,6 +441,9 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (orte_timing_details) {
|
||||
|
||||
}
|
||||
/* save the latest daemon to start */
|
||||
if (startsec > daemonlaunchtime.tv_sec) {
|
||||
@ -488,13 +488,7 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* check the time for the callback to complete and save the longest */
|
||||
secs = recvtime.tv_sec - setupsec;
|
||||
if (setupusec <= recvtime.tv_usec) {
|
||||
usecs = recvtime.tv_usec - setupusec;
|
||||
} else {
|
||||
secs--;
|
||||
usecs = 1000000 - setupusec + recvtime.tv_usec;
|
||||
}
|
||||
ORTE_COMPUTE_TIME_DIFF(secs, usecs, setupsec, setupusec, recvtime.tv_sec, recvtime.tv_usec);
|
||||
if (secs > daemoncbtime.tv_sec) {
|
||||
daemoncbtime.tv_sec = secs;
|
||||
daemoncbtime.tv_usec = usecs;
|
||||
@ -565,8 +559,6 @@ static void orted_report_launch(int status, orte_process_name_t* sender,
|
||||
int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
|
||||
{
|
||||
int rc;
|
||||
unsigned long minutes, seconds;
|
||||
float fsecs;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:daemon_callback",
|
||||
@ -598,35 +590,16 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
|
||||
|
||||
/* if we are timing, output the results */
|
||||
if (orte_timing) {
|
||||
seconds = (daemonlaunchtime.tv_sec - orte_plm_globals.daemonlaunchstart.tv_sec) +
|
||||
((daemonlaunchtime.tv_usec - orte_plm_globals.daemonlaunchstart.tv_usec) / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(daemonlaunchtime.tv_sec - orte_plm_globals.daemonlaunchstart.tv_sec)*1000000.0 +
|
||||
(float)(daemonlaunchtime.tv_usec - orte_plm_globals.daemonlaunchstart.tv_usec)) / 1000.0;
|
||||
fprintf(orte_timing_output, "Daemon initial launch was completed in %8.2f millisecs\n", fsecs);
|
||||
} else {
|
||||
fprintf(orte_timing_output, "Daemon initial launch was completed in %3lu:%02lu min:sec\n", minutes, seconds);
|
||||
}
|
||||
seconds = daemonsetuptime.tv_sec + (daemonsetuptime.tv_usec / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(daemonsetuptime.tv_sec)*1000000.0 + (float)daemonsetuptime.tv_usec) / 1000.0;
|
||||
fprintf(orte_timing_output, "Daemon setup was completed in a maximum of %8.2f millisecs\n", fsecs);
|
||||
} else {
|
||||
fprintf(orte_timing_output, "Daemon setup was completed in a maximum of %3lu:%02lu min:sec\n", minutes, seconds);
|
||||
}
|
||||
seconds = daemoncbtime.tv_sec + (daemoncbtime.tv_usec / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(daemoncbtime.tv_sec)*1000000.0 + (float)daemoncbtime.tv_usec) / 1000.0;
|
||||
fprintf(orte_timing_output, "Daemon callback to HNP was completed in a maximum of %8.3f millisecs\n", fsecs);
|
||||
} else {
|
||||
fprintf(orte_timing_output, "Daemon callback to HNP was completed in a maximum of %3lu:%02lu min:sec\n", minutes, seconds);
|
||||
}
|
||||
int64_t sec, usec;
|
||||
ORTE_COMPUTE_TIME_DIFF(sec, usec, orte_plm_globals.daemonlaunchstart.tv_sec,
|
||||
orte_plm_globals.daemonlaunchstart.tv_usec,
|
||||
daemonlaunchtime.tv_sec, daemonlaunchtime.tv_usec);
|
||||
fprintf(orte_timing_output, "Daemon launch was completed in %s\n",
|
||||
pretty_print_timing(sec, usec));
|
||||
fprintf(orte_timing_output, "Daemon setup (from first exec statement to ready-for-commands) was completed in a maximum of %s\n",
|
||||
pretty_print_timing(daemonsetuptime.tv_sec, daemonsetuptime.tv_usec));
|
||||
fprintf(orte_timing_output, "Daemon callback message to HNP took a maximum time of %s to reach the HNP\n",
|
||||
pretty_print_timing(daemoncbtime.tv_sec, daemoncbtime.tv_usec));
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
@ -646,6 +619,7 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
|
||||
* for their local procs.
|
||||
*/
|
||||
static bool app_launch_failed;
|
||||
static struct timeval max_daemon_launch_msg_recvd = {0,0};
|
||||
|
||||
/* since the HNP also reports launch of procs, we need to separate out
|
||||
* the processing of the message vs its receipt so that the HNP
|
||||
@ -697,6 +671,42 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
|
||||
}
|
||||
procs = (orte_proc_t**)(jdata->procs->addr);
|
||||
|
||||
/* if we are timing, the daemon will have included the time it
|
||||
* recvd the launch msg - the maximum time between when we sent
|
||||
* that message and a daemon recvd it tells us the time reqd
|
||||
* to wireup the daemon comm network
|
||||
*/
|
||||
if (orte_timing) {
|
||||
int64_t tmpsec, tmpusec;
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tmpsec, &cnt, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
app_launch_failed = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tmpusec, &cnt, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
app_launch_failed = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* keep the maximum time */
|
||||
if (tmpsec > max_daemon_launch_msg_recvd.tv_sec) {
|
||||
max_daemon_launch_msg_recvd.tv_sec = tmpsec;
|
||||
max_daemon_launch_msg_recvd.tv_usec = tmpusec;
|
||||
} else if (tmpsec == max_daemon_launch_msg_recvd.tv_sec &&
|
||||
tmpusec > max_daemon_launch_msg_recvd.tv_usec) {
|
||||
max_daemon_launch_msg_recvd.tv_usec = tmpusec;
|
||||
}
|
||||
if (orte_timing_details) {
|
||||
int64_t sec, usec;
|
||||
ORTE_COMPUTE_TIME_DIFF(sec, usec, launch_msg_sent.tv_sec, launch_msg_sent.tv_usec,
|
||||
tmpsec, tmpusec);
|
||||
fprintf(orte_timing_output, "Time for launch msg to reach daemon %s: %s\n",
|
||||
ORTE_VPID_PRINT(mev->sender.vpid), pretty_print_timing(sec, usec));
|
||||
}
|
||||
}
|
||||
|
||||
/* the daemon will report the vpid, state, and pid of each
|
||||
* process it launches - we need the pid in particular so
|
||||
* that any debuggers can attach to the process
|
||||
@ -714,6 +724,32 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
|
||||
app_launch_failed = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* if we are timing things, unpack the time this proc was started */
|
||||
if (orte_timing) {
|
||||
int64_t tmpsec, tmpusec;
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tmpsec, &cnt, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
app_launch_failed = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tmpusec, &cnt, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
app_launch_failed = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (orte_timing_details) {
|
||||
time_t tmptime;
|
||||
char *tmpstr;
|
||||
tmptime = tmpsec;
|
||||
tmpstr = ctime(&tmptime);
|
||||
/* remove the newline and the year at the end */
|
||||
tmpstr[strlen(tmpstr)-6] = '\0';
|
||||
fprintf(orte_timing_output, "Time rank %s was launched: %s.%3lu\n",
|
||||
ORTE_VPID_PRINT(vpid), tmpstr, (unsigned long)(tmpusec/1000));
|
||||
}
|
||||
}
|
||||
/* unpack the state */
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &cnt, ORTE_PROC_STATE))) {
|
||||
@ -1260,3 +1296,24 @@ CHECK_ALL_JOBS:
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static char timestring[128];
|
||||
|
||||
static char *pretty_print_timing(int64_t secs, int64_t usecs)
|
||||
{
|
||||
unsigned long minutes, seconds;
|
||||
float fsecs;
|
||||
|
||||
seconds = secs + (usecs / 1000000l);
|
||||
minutes = seconds / 60l;
|
||||
seconds = seconds % 60l;
|
||||
if (0 == minutes && 0 == seconds) {
|
||||
fsecs = ((float)(secs)*1000000.0 + (float)usecs) / 1000.0;
|
||||
snprintf(timestring, 128, "%8.2f millisecs", fsecs);
|
||||
} else {
|
||||
snprintf(timestring, 128, "%3lu:%02lu min:sec", minutes, seconds);
|
||||
}
|
||||
|
||||
return timestring;
|
||||
}
|
||||
|
||||
|
@ -22,6 +22,10 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
|
||||
@ -38,6 +42,9 @@ ORTE_DECLSPEC void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
/* direct cmd processing entry point - used by HNP */
|
||||
ORTE_DECLSPEC void orte_daemon_cmd_processor(int fd, short event, void *data);
|
||||
|
||||
/* a time flag that needs to be visible elsewhere */
|
||||
ORTE_DECLSPEC extern struct timeval orte_daemon_msg_recvd;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTED_H */
|
||||
|
@ -36,6 +36,9 @@
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
@ -88,6 +91,10 @@ static int process_commands(orte_process_name_t* sender,
|
||||
orte_rml_tag_t tag);
|
||||
|
||||
|
||||
/* instantiate this - it is shared via orted.h */
|
||||
struct timeval orte_daemon_msg_recvd;
|
||||
|
||||
static struct timeval mesg_recvd={0,0};
|
||||
|
||||
/* local callback function for non-blocking sends */
|
||||
static void send_callback(int status, orte_process_name_t *peer,
|
||||
@ -270,6 +277,11 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
|
||||
if (orte_timing) {
|
||||
/* pickup the time the message was recvd by this daemon */
|
||||
gettimeofday(&mesg_recvd, NULL);
|
||||
}
|
||||
|
||||
/* don't process this right away - we need to get out of the recv before
|
||||
* we process the message as it may ask us to do something that involves
|
||||
* more messaging! Instead, setup an event so that the message gets processed
|
||||
@ -343,6 +355,13 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
|
||||
wait_time = 1;
|
||||
num_recursions = 0;
|
||||
|
||||
if (orte_timing && orte_process_info.hnp) {
|
||||
/* if we are doing timing, and we are the HNP, then the message doesn't come
|
||||
* through the RML recv, so we have to pickup the recv time here
|
||||
*/
|
||||
gettimeofday(&mesg_recvd, NULL);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
"%s orte:daemon:cmd:processor called by %s for tag %ld",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -393,6 +412,11 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
|
||||
}
|
||||
/* flag this location */
|
||||
save = buffer->unpack_ptr;
|
||||
/* store the time the cmd was recvd */
|
||||
if (orte_timing) {
|
||||
orte_daemon_msg_recvd.tv_sec = mesg_recvd.tv_sec;
|
||||
orte_daemon_msg_recvd.tv_usec = mesg_recvd.tv_usec;
|
||||
}
|
||||
}
|
||||
|
||||
/* rewind the buffer to the beginning */
|
||||
|
@ -47,6 +47,7 @@
|
||||
/* globals used by RTE */
|
||||
bool orte_timing;
|
||||
FILE *orte_timing_output = NULL;
|
||||
bool orte_timing_details;
|
||||
bool orte_debug_daemons_file_flag = false;
|
||||
bool orte_leave_session_attached;
|
||||
bool orte_do_not_launch = false;
|
||||
|
@ -118,6 +118,21 @@ typedef struct orte_job_t orte_job_t;
|
||||
} while(0);
|
||||
|
||||
|
||||
/* define a macro for computing time differences - used for timing tests
|
||||
* across the code base
|
||||
*/
|
||||
#define ORTE_COMPUTE_TIME_DIFF(r, ur, s1, us1, s2, us2) \
|
||||
do { \
|
||||
(r) = (s2) - (s1); \
|
||||
if ((us2) >= (us1)) { \
|
||||
(ur) = (us2) - (us1); \
|
||||
} else { \
|
||||
(r)--; \
|
||||
(ur) = 1000000 - (us1) + (us2); \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
|
||||
/* global type definitions used by RTE - instanced in orte_globals.c */
|
||||
|
||||
/************
|
||||
@ -400,6 +415,7 @@ ORTE_DECLSPEC orte_job_t* orte_get_job_data_object(orte_jobid_t job);
|
||||
/* global variables used by RTE - instanced in orte_globals.c */
|
||||
ORTE_DECLSPEC extern bool orte_timing;
|
||||
ORTE_DECLSPEC extern FILE *orte_timing_output;
|
||||
ORTE_DECLSPEC extern bool orte_timing_details;
|
||||
ORTE_DECLSPEC extern bool orte_debug_daemons_flag, orte_debug_daemons_file_flag;
|
||||
ORTE_DECLSPEC extern bool orte_leave_session_attached;
|
||||
ORTE_DECLSPEC extern bool orte_do_not_launch;
|
||||
|
@ -145,6 +145,15 @@ int orte_register_params(void)
|
||||
"Request that critical timing loops be measured",
|
||||
false, false, (int)false, &value);
|
||||
orte_timing = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "timing_details",
|
||||
"Request that detailed timing data by reported",
|
||||
false, false, (int)false, &value);
|
||||
orte_timing_details = OPAL_INT_TO_BOOL(value);
|
||||
if (orte_timing_details) {
|
||||
/* ensure the timing flag is set too */
|
||||
orte_timing = true;
|
||||
}
|
||||
|
||||
if (orte_process_info.hnp) {
|
||||
char *tmp;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user