Bring the timing instrumentation to the trunk.
If you want to look at our launch and MPI process startup times, you can do so with two MCA params: OMPI_MCA_orte_timing: set it to anything non-zero and you will get the launch time for different steps in the job launch procedure. The degree of detail depends on the launch environment. rsh will provide you with the average, min, and max launch times for the daemons. SLURM block-launches the daemons, so you only get the time to launch the daemons and the total time to launch the job. Ditto for bproc. TM looks more like rsh. Only those four environments are currently supported - anyone interested in extending this capability to other environments is welcome to do so. In all cases, you also get the time to set up the job for launch. OMPI_MCA_ompi_timing: set it to anything non-zero and you will get the time for mpi_init to reach the compound registry command, the time to execute that command, the time to go from our stage1 barrier to the stage2 barrier, and the time to go from the stage2 barrier to the end of mpi_init. This will be output for each process, so you'll have to compile any statistics on your own. Note: if you develop a nice parser to do so, it would be really appreciated if you would share it! This commit was SVN r12302.
Этот коммит содержится в:
родитель
d6ff14ed61
Коммит
36d4511143
@ -208,7 +208,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
bool compound_cmd = false;
|
bool compound_cmd = false;
|
||||||
bool timing = false;
|
bool timing = false;
|
||||||
int param, value;
|
int param, value;
|
||||||
struct timeval ompistart, ompistop;
|
struct timeval ompistart, ompistop, stg2start, stg3start;
|
||||||
|
|
||||||
/* Join the run-time environment - do the things that don't hit
|
/* Join the run-time environment - do the things that don't hit
|
||||||
the registry */
|
the registry */
|
||||||
@ -219,7 +219,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* check to see if we want timing information */
|
/* check to see if we want timing information */
|
||||||
param = mca_base_param_reg_int_name("orte", "timing",
|
param = mca_base_param_reg_int_name("ompi", "timing",
|
||||||
"Request that critical timing loops be measured",
|
"Request that critical timing loops be measured",
|
||||||
false, false, 0, &value);
|
false, false, 0, &value);
|
||||||
if (value != 0) {
|
if (value != 0) {
|
||||||
@ -506,9 +506,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
if (0 != gettimeofday(&ompistop, NULL)) {
|
if (0 != gettimeofday(&ompistop, NULL)) {
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain stop time");
|
opal_output(0, "ompi_mpi_init: could not obtain stop time");
|
||||||
} else {
|
} else {
|
||||||
opal_output(0, "ompi_mpi_init: time from start to exec_compound_cmd %ld sec %ld usec",
|
opal_output(0, "ompi_mpi_init: time from start to exec_compound_cmd %ld usec",
|
||||||
(long int)(ompistop.tv_sec - ompistart.tv_sec),
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
(long int)(ompistop.tv_usec - ompistart.tv_usec));
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
if (0 != gettimeofday(&ompistart, NULL)) {
|
if (0 != gettimeofday(&ompistart, NULL)) {
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain new start time");
|
opal_output(0, "ompi_mpi_init: could not obtain new start time");
|
||||||
ompistart.tv_sec = ompistop.tv_sec;
|
ompistart.tv_sec = ompistop.tv_sec;
|
||||||
@ -532,9 +532,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
if (0 != gettimeofday(&ompistop, NULL)) {
|
if (0 != gettimeofday(&ompistop, NULL)) {
|
||||||
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
||||||
} else {
|
} else {
|
||||||
opal_output(0, "ompi_mpi_init: time to exec_compound_cmd %ld sec %ld usec",
|
opal_output(0, "ompi_mpi_init: time to execute compound command %ld usec",
|
||||||
(long int)(ompistop.tv_sec - ompistart.tv_sec),
|
(long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
|
||||||
(long int)(ompistop.tv_usec - ompistart.tv_usec));
|
(ompistop.tv_usec - ompistart.tv_usec)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -546,6 +546,13 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get start time */
|
||||||
|
if (timing) {
|
||||||
|
if (0 != gettimeofday(&stg2start, NULL)) {
|
||||||
|
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* start PTL's */
|
/* start PTL's */
|
||||||
ret = MCA_PML_CALL(enable(true));
|
ret = MCA_PML_CALL(enable(true));
|
||||||
if( OMPI_SUCCESS != ret ) {
|
if( OMPI_SUCCESS != ret ) {
|
||||||
@ -638,6 +645,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
|
if (timing) {
|
||||||
|
if (0 != gettimeofday(&ompistop, NULL)) {
|
||||||
|
opal_output(0, "ompi_mpi_init: could not obtain stop time after compound_cmd");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "ompi_mpi_init: time from stage1 to stage2 %ld usec",
|
||||||
|
(long int)((ompistop.tv_sec - stg2start.tv_sec)*1000000 +
|
||||||
|
(ompistop.tv_usec - stg2start.tv_usec)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Second barrier -- wait for message from
|
/* Second barrier -- wait for message from
|
||||||
RMGR_PROC_STAGE_GATE_MGR to arrive */
|
RMGR_PROC_STAGE_GATE_MGR to arrive */
|
||||||
|
|
||||||
@ -648,6 +666,13 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get start time */
|
||||||
|
if (timing) {
|
||||||
|
if (0 != gettimeofday(&stg3start, NULL)) {
|
||||||
|
opal_output(0, "ompi_mpi_init: could not obtain start time for stg3");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* new very last step: check whether we have been spawned or not.
|
/* new very last step: check whether we have been spawned or not.
|
||||||
We introduce that at the very end, since we need collectives,
|
We introduce that at the very end, since we need collectives,
|
||||||
datatypes, ptls etc. up and running here.... */
|
datatypes, ptls etc. up and running here.... */
|
||||||
@ -705,5 +730,16 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
/* Do we need to wait for a TotalView-like debugger? */
|
/* Do we need to wait for a TotalView-like debugger? */
|
||||||
ompi_wait_for_totalview();
|
ompi_wait_for_totalview();
|
||||||
|
|
||||||
|
/* check for timing request - get stop time and report elapsed time if so */
|
||||||
|
if (timing) {
|
||||||
|
if (0 != gettimeofday(&ompistop, NULL)) {
|
||||||
|
opal_output(0, "ompi_mpi_init: could not obtain stop time at end");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "ompi_mpi_init: time from stage2 to complete mpi_init %ld usec",
|
||||||
|
(long int)((ompistop.tv_sec - stg3start.tv_sec)*1000000 +
|
||||||
|
(ompistop.tv_usec - stg3start.tv_usec)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -511,13 +511,15 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
|
|||||||
opal_list_t daemons;
|
opal_list_t daemons;
|
||||||
orte_pls_daemon_info_t *dmn;
|
orte_pls_daemon_info_t *dmn;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
|
struct timeval joblaunchstart, launchstart, launchstop;
|
||||||
|
|
||||||
OPAL_TRACE(1);
|
OPAL_TRACE(1);
|
||||||
|
|
||||||
/* clean out any MCA component selection directives that
|
if (mca_pls_bproc_component.timing) {
|
||||||
* won't work on remote nodes
|
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||||
*/
|
opal_output(0, "pls_bproc: could not obtain start time");
|
||||||
orte_pls_base_purge_mca_params(envp);
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* setup a list that will contain the info for all the daemons
|
/* setup a list that will contain the info for all the daemons
|
||||||
* so we can store it on the registry when done
|
* so we can store it on the registry when done
|
||||||
@ -633,7 +635,25 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
|
|||||||
|
|
||||||
/* launch the daemons */
|
/* launch the daemons */
|
||||||
mca_pls_bproc_component.num_daemons += num_daemons;
|
mca_pls_bproc_component.num_daemons += num_daemons;
|
||||||
|
|
||||||
|
if (mca_pls_bproc_component.timing) {
|
||||||
|
if (0 != gettimeofday(&launchstart, NULL)) {
|
||||||
|
opal_output(0, "pls_bproc: could not obtain start time");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp);
|
rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp);
|
||||||
|
|
||||||
|
if (mca_pls_bproc_component.timing) {
|
||||||
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||||
|
opal_output(0, "pls_bproc: could not obtain stop time");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "pls_bproc: daemon launch time is %ld usec",
|
||||||
|
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||||
|
(launchstop.tv_usec - launchstart.tv_usec));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if(rc != num_daemons) {
|
if(rc != num_daemons) {
|
||||||
opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true,
|
opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true,
|
||||||
num_daemons, rc, orted_path);
|
num_daemons, rc, orted_path);
|
||||||
@ -699,6 +719,16 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
|
|||||||
}
|
}
|
||||||
*num_launched = num_daemons;
|
*num_launched = num_daemons;
|
||||||
|
|
||||||
|
if (mca_pls_bproc_component.timing) {
|
||||||
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||||
|
opal_output(0, "pls_bproc: could not obtain stop time");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "pls_bproc: total job launch time is %ld usec",
|
||||||
|
(launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
|
||||||
|
(launchstop.tv_usec - joblaunchstart.tv_usec));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if(NULL != argv) {
|
if(NULL != argv) {
|
||||||
opal_argv_free(argv);
|
opal_argv_free(argv);
|
||||||
|
@ -111,6 +111,8 @@ struct orte_pls_bproc_component_t {
|
|||||||
* we will look for it in the user's path */
|
* we will look for it in the user's path */
|
||||||
int debug;
|
int debug;
|
||||||
/**< If greater than 0 print debugging information */
|
/**< If greater than 0 print debugging information */
|
||||||
|
bool timing;
|
||||||
|
/**< If true, report launch timing info */
|
||||||
int num_procs;
|
int num_procs;
|
||||||
/**< The number of processes that are running */
|
/**< The number of processes that are running */
|
||||||
int priority;
|
int priority;
|
||||||
|
@ -54,7 +54,7 @@ orte_pls_bproc_component_t mca_pls_bproc_component = {
|
|||||||
* finishes setting up the component struct.
|
* finishes setting up the component struct.
|
||||||
*/
|
*/
|
||||||
int orte_pls_bproc_component_open(void) {
|
int orte_pls_bproc_component_open(void) {
|
||||||
int rc;
|
int rc, tmp, value;
|
||||||
char *policy;
|
char *policy;
|
||||||
|
|
||||||
/* init parameters */
|
/* init parameters */
|
||||||
@ -89,6 +89,15 @@ int orte_pls_bproc_component_open(void) {
|
|||||||
mca_pls_bproc_component.bynode = false;
|
mca_pls_bproc_component.bynode = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tmp = mca_base_param_reg_int_name("orte", "timing",
|
||||||
|
"Request that critical timing loops be measured",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
mca_pls_bproc_component.timing = true;
|
||||||
|
} else {
|
||||||
|
mca_pls_bproc_component.timing = false;
|
||||||
|
}
|
||||||
|
|
||||||
/* init the list to hold the daemon names */
|
/* init the list to hold the daemon names */
|
||||||
rc = orte_pointer_array_init(&mca_pls_bproc_component.daemon_names, 8, 200000, 8);
|
rc = orte_pointer_array_init(&mca_pls_bproc_component.daemon_names, 8, 200000, 8);
|
||||||
/* init the list to hold the daemon names */
|
/* init the list to hold the daemon names */
|
||||||
|
@ -63,6 +63,7 @@ struct orte_pls_rsh_component_t {
|
|||||||
bool debug;
|
bool debug;
|
||||||
bool debug_malloc;
|
bool debug_malloc;
|
||||||
bool debug_daemons;
|
bool debug_daemons;
|
||||||
|
bool timing;
|
||||||
bool reap;
|
bool reap;
|
||||||
bool assume_same_shell;
|
bool assume_same_shell;
|
||||||
int delay;
|
int delay;
|
||||||
|
@ -108,7 +108,7 @@ orte_pls_rsh_component_t mca_pls_rsh_component = {
|
|||||||
|
|
||||||
int orte_pls_rsh_component_open(void)
|
int orte_pls_rsh_component_open(void)
|
||||||
{
|
{
|
||||||
int tmp;
|
int tmp, value;
|
||||||
mca_base_component_t *c = &mca_pls_rsh_component.super.pls_version;
|
mca_base_component_t *c = &mca_pls_rsh_component.super.pls_version;
|
||||||
|
|
||||||
/* initialize globals */
|
/* initialize globals */
|
||||||
@ -140,12 +140,20 @@ int orte_pls_rsh_component_open(void)
|
|||||||
false, false, false, &tmp);
|
false, false, false, &tmp);
|
||||||
mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp);
|
mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
mca_base_param_reg_int_name("orte", "debug_daemons",
|
mca_base_param_reg_int_name("orte", "debug_daemons",
|
||||||
"Whether or not to enable debugging daemons (0 or 1)",
|
"Whether or not to enable debugging of daemons (0 or 1)",
|
||||||
false, false, false, &tmp);
|
false, false, false, &tmp);
|
||||||
mca_pls_rsh_component.debug_daemons = OPAL_INT_TO_BOOL(tmp);
|
mca_pls_rsh_component.debug_daemons = OPAL_INT_TO_BOOL(tmp);
|
||||||
|
|
||||||
|
tmp = mca_base_param_reg_int_name("orte", "timing",
|
||||||
|
"Request that critical timing loops be measured",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
mca_pls_rsh_component.timing = true;
|
||||||
|
} else {
|
||||||
|
mca_pls_rsh_component.timing = false;
|
||||||
|
}
|
||||||
|
|
||||||
mca_base_param_reg_string(c, "orted",
|
mca_base_param_reg_string(c, "orted",
|
||||||
"The command name that the rsh pls component will invoke for the ORTE daemon",
|
"The command name that the rsh pls component will invoke for the ORTE daemon",
|
||||||
false, false, "orted",
|
false, false, "orted",
|
||||||
|
@ -41,9 +41,6 @@
|
|||||||
#ifdef HAVE_SYS_TYPES_H
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAVE_UNISTD_H
|
|
||||||
#include <unistd.h>
|
|
||||||
#endif
|
|
||||||
#ifdef HAVE_SYS_STAT_H
|
#ifdef HAVE_SYS_STAT_H
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#endif
|
#endif
|
||||||
@ -129,8 +126,14 @@ static const char * orte_pls_rsh_shell_name[] = {
|
|||||||
"unknown"
|
"unknown"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* local global storage of timing variables */
|
||||||
|
static unsigned long mintime=999999999, miniter, maxtime=0, maxiter;
|
||||||
|
static float avgtime=0.0;
|
||||||
|
static struct timeval *launchstart;
|
||||||
|
static struct timeval joblaunchstart, joblaunchstop;
|
||||||
|
|
||||||
/* local global storage of the list of active daemons */
|
/* local global storage of the list of active daemons */
|
||||||
opal_list_t active_daemons;
|
static opal_list_t active_daemons;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -281,6 +284,8 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
|||||||
orte_mapped_proc_t *proc;
|
orte_mapped_proc_t *proc;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
int rc;
|
int rc;
|
||||||
|
unsigned long deltat;
|
||||||
|
struct timeval launchstop;
|
||||||
|
|
||||||
/* if ssh exited abnormally, set the child processes to aborted
|
/* if ssh exited abnormally, set the child processes to aborted
|
||||||
and print something useful to the user. The usual reasons for
|
and print something useful to the user. The usual reasons for
|
||||||
@ -345,9 +350,6 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
|||||||
} else {
|
} else {
|
||||||
opal_output(0, "No extra status information is available: %d.", status);
|
opal_output(0, "No extra status information is available: %d.", status);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/* release any waiting threads */
|
|
||||||
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
|
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
|
||||||
/* tell the system that this daemon is gone */
|
/* tell the system that this daemon is gone */
|
||||||
if (ORTE_SUCCESS != (rc = orte_pls_base_remove_daemon(info))) {
|
if (ORTE_SUCCESS != (rc = orte_pls_base_remove_daemon(info))) {
|
||||||
@ -357,12 +359,57 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
|||||||
/* remove the daemon from our local list */
|
/* remove the daemon from our local list */
|
||||||
opal_list_remove_item(&active_daemons, &info->super);
|
opal_list_remove_item(&active_daemons, &info->super);
|
||||||
OBJ_RELEASE(info);
|
OBJ_RELEASE(info);
|
||||||
|
OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
|
||||||
|
} /* if abnormal exit */
|
||||||
|
|
||||||
|
/* release any waiting threads */
|
||||||
|
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
|
||||||
|
/* first check timing request */
|
||||||
|
if (mca_pls_rsh_component.timing) {
|
||||||
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||||
|
opal_output(0, "pls_rsh: could not obtain stop time");
|
||||||
|
} else {
|
||||||
|
deltat = (launchstop.tv_sec - launchstart[info->name->vpid].tv_sec)*1000000 +
|
||||||
|
(launchstop.tv_usec - launchstart[info->name->vpid].tv_usec);
|
||||||
|
avgtime = avgtime + deltat;
|
||||||
|
if (deltat < mintime) {
|
||||||
|
mintime = deltat;
|
||||||
|
miniter = (unsigned long)info->name->vpid;
|
||||||
|
}
|
||||||
|
if (deltat > maxtime) {
|
||||||
|
maxtime = deltat;
|
||||||
|
maxiter = (unsigned long)info->name->vpid;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (mca_pls_rsh_component.num_children-- >=
|
if (mca_pls_rsh_component.num_children-- >=
|
||||||
mca_pls_rsh_component.num_concurrent ||
|
mca_pls_rsh_component.num_concurrent ||
|
||||||
mca_pls_rsh_component.num_children == 0) {
|
mca_pls_rsh_component.num_children == 0) {
|
||||||
opal_condition_signal(&mca_pls_rsh_component.cond);
|
opal_condition_signal(&mca_pls_rsh_component.cond);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (mca_pls_rsh_component.timing && mca_pls_rsh_component.num_children == 0) {
|
||||||
|
if (0 != gettimeofday(&joblaunchstop, NULL)) {
|
||||||
|
opal_output(0, "pls_rsh: could not obtain job launch stop time");
|
||||||
|
} else {
|
||||||
|
deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
|
||||||
|
(joblaunchstop.tv_usec - joblaunchstart.tv_usec);
|
||||||
|
opal_output(0, "pls_rsh: total time to launch job is %lu usec", deltat);
|
||||||
|
if (mintime < 999999999) {
|
||||||
|
/* had at least one non-local node */
|
||||||
|
avgtime = avgtime/opal_list_get_size(&active_daemons);
|
||||||
|
opal_output(0, "pls_rsh: average time to launch one daemon %f usec", avgtime);
|
||||||
|
opal_output(0, "pls_rsh: min time to launch a daemon was %lu usec for iter %lu", mintime, miniter);
|
||||||
|
opal_output(0, "pls_rsh: max time to launch a daemon was %lu usec for iter %lu", maxtime, maxiter);
|
||||||
|
} else {
|
||||||
|
opal_output(0, "No nonlocal launches to report for timing info");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(launchstart);
|
||||||
|
}
|
||||||
|
|
||||||
OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
|
OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -387,7 +434,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
char *uri, *param;
|
char *uri, *param;
|
||||||
char **argv, **tmp;
|
char **argv, **tmp;
|
||||||
char *prefix_dir;
|
char *prefix_dir;
|
||||||
char **env;
|
|
||||||
int argc;
|
int argc;
|
||||||
int rc;
|
int rc;
|
||||||
sigset_t sigs;
|
sigset_t sigs;
|
||||||
@ -397,6 +443,14 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
char *lib_base = NULL, *bin_base = NULL;
|
char *lib_base = NULL, *bin_base = NULL;
|
||||||
orte_pls_daemon_info_t *dmn;
|
orte_pls_daemon_info_t *dmn;
|
||||||
|
|
||||||
|
if (mca_pls_rsh_component.timing) {
|
||||||
|
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||||
|
opal_output(0, "pls_rsh: could not obtain start time");
|
||||||
|
joblaunchstart.tv_sec = 0;
|
||||||
|
joblaunchstart.tv_usec = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* setup a list that will contain the info for all the daemons
|
/* setup a list that will contain the info for all the daemons
|
||||||
* so we can store it on the registry when done and use it
|
* so we can store it on the registry when done and use it
|
||||||
* locally to track their state
|
* locally to track their state
|
||||||
@ -414,10 +468,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mca_pls_rsh_component.debug) {
|
|
||||||
orte_dss.dump(0, map, ORTE_JOB_MAP);
|
|
||||||
}
|
|
||||||
|
|
||||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
|
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
|
||||||
|
|
||||||
if (mca_pls_rsh_component.debug_daemons &&
|
if (mca_pls_rsh_component.debug_daemons &&
|
||||||
@ -648,63 +698,14 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
lib_base = opal_basename(OPAL_LIBDIR);
|
lib_base = opal_basename(OPAL_LIBDIR);
|
||||||
bin_base = opal_basename(OPAL_BINDIR);
|
bin_base = opal_basename(OPAL_BINDIR);
|
||||||
|
|
||||||
/* copy the environment so we can modify it with opal functions. The
|
|
||||||
* environment is the same for all daemons, so we only need to do
|
|
||||||
* this once
|
|
||||||
*/
|
|
||||||
env = opal_argv_copy(environ);
|
|
||||||
|
|
||||||
/* If we have a prefix, then modify the PATH and
|
|
||||||
LD_LIBRARY_PATH environment variables
|
|
||||||
*/
|
|
||||||
if (NULL != prefix_dir) {
|
|
||||||
char *oldenv, *newenv;
|
|
||||||
|
|
||||||
/* Reset PATH */
|
|
||||||
newenv = opal_os_path( false, prefix_dir, bin_base, NULL );
|
|
||||||
oldenv = getenv("PATH");
|
|
||||||
if (NULL != oldenv) {
|
|
||||||
char *temp;
|
|
||||||
asprintf(&temp, "%s:%s", newenv, oldenv );
|
|
||||||
free( newenv );
|
|
||||||
newenv = temp;
|
|
||||||
}
|
|
||||||
opal_setenv("PATH", newenv, true, &env);
|
|
||||||
if (mca_pls_rsh_component.debug) {
|
|
||||||
opal_output(0, "pls:rsh: reset PATH: %s", newenv);
|
|
||||||
}
|
|
||||||
free(newenv);
|
|
||||||
|
|
||||||
/* Reset LD_LIBRARY_PATH */
|
|
||||||
newenv = opal_os_path( false, prefix_dir, lib_base, NULL );
|
|
||||||
oldenv = getenv("LD_LIBRARY_PATH");
|
|
||||||
if (NULL != oldenv) {
|
|
||||||
char* temp;
|
|
||||||
asprintf(&temp, "%s:%s", newenv, oldenv);
|
|
||||||
free(newenv);
|
|
||||||
newenv = temp;
|
|
||||||
}
|
|
||||||
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
|
|
||||||
if (mca_pls_rsh_component.debug) {
|
|
||||||
opal_output(0, "pls:rsh: reset LD_LIBRARY_PATH: %s",
|
|
||||||
newenv);
|
|
||||||
}
|
|
||||||
free(newenv);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ensure we aren't the seed */
|
|
||||||
param = mca_base_param_environ_variable("seed",NULL,NULL);
|
|
||||||
opal_setenv(param, "0", true, &env);
|
|
||||||
free(param);
|
|
||||||
|
|
||||||
/* clean out any MCA component selection directives that
|
|
||||||
* won't work on remote nodes
|
|
||||||
*/
|
|
||||||
orte_pls_base_purge_mca_params(&env);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Iterate through each of the nodes
|
* Iterate through each of the nodes
|
||||||
*/
|
*/
|
||||||
|
if (mca_pls_rsh_component.timing) {
|
||||||
|
/* allocate space to track the start times */
|
||||||
|
launchstart = (struct timeval*)malloc((num_nodes+vpid) * sizeof(struct timeval));
|
||||||
|
}
|
||||||
|
|
||||||
for(n_item = opal_list_get_first(&map->nodes);
|
for(n_item = opal_list_get_first(&map->nodes);
|
||||||
n_item != opal_list_get_end(&map->nodes);
|
n_item != opal_list_get_end(&map->nodes);
|
||||||
n_item = opal_list_get_next(n_item)) {
|
n_item = opal_list_get_next(n_item)) {
|
||||||
@ -715,6 +716,12 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
|
|
||||||
rmaps_node = (orte_mapped_node_t*)n_item;
|
rmaps_node = (orte_mapped_node_t*)n_item;
|
||||||
|
|
||||||
|
if (mca_pls_rsh_component.timing) {
|
||||||
|
if (0 != gettimeofday(&launchstart[vpid], NULL)) {
|
||||||
|
opal_output(0, "pls_rsh: could not obtain start time");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* new daemon - setup to record its info */
|
/* new daemon - setup to record its info */
|
||||||
dmn = OBJ_NEW(orte_pls_daemon_info_t);
|
dmn = OBJ_NEW(orte_pls_daemon_info_t);
|
||||||
dmn->active_job = jobid;
|
dmn->active_job = jobid;
|
||||||
@ -750,7 +757,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* rsh a child to exec the rsh/ssh session */
|
/* fork a child to exec the rsh/ssh session */
|
||||||
|
|
||||||
/* set the process state to "launched" */
|
/* set the process state to "launched" */
|
||||||
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) {
|
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) {
|
||||||
@ -767,6 +774,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
/* child */
|
/* child */
|
||||||
if (pid == 0) {
|
if (pid == 0) {
|
||||||
char* name_string;
|
char* name_string;
|
||||||
|
char** env;
|
||||||
char* var;
|
char* var;
|
||||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||||
|
|
||||||
@ -795,6 +803,15 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
opal_output(0, "pls:rsh: %s is a LOCAL node\n",
|
opal_output(0, "pls:rsh: %s is a LOCAL node\n",
|
||||||
rmaps_node->nodename);
|
rmaps_node->nodename);
|
||||||
}
|
}
|
||||||
|
if (mca_pls_rsh_component.timing) {
|
||||||
|
/* since this is a local launch, the daemon will never reach
|
||||||
|
* the waitpid callback - so set the start value to
|
||||||
|
* something nonsensical
|
||||||
|
*/
|
||||||
|
launchstart[vpid].tv_sec = 0;
|
||||||
|
launchstart[vpid].tv_usec = 0;
|
||||||
|
}
|
||||||
|
|
||||||
exec_argv = &argv[local_exec_index];
|
exec_argv = &argv[local_exec_index];
|
||||||
exec_path = opal_path_findv(exec_argv[0], 0, environ, NULL);
|
exec_path = opal_path_findv(exec_argv[0], 0, environ, NULL);
|
||||||
|
|
||||||
@ -816,6 +833,44 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If we have a prefix, then modify the PATH and
|
||||||
|
LD_LIBRARY_PATH environment variables. We're
|
||||||
|
already in the child process, so it's ok to modify
|
||||||
|
environ. */
|
||||||
|
if (NULL != prefix_dir) {
|
||||||
|
char *oldenv, *newenv;
|
||||||
|
|
||||||
|
/* Reset PATH */
|
||||||
|
newenv = opal_os_path( false, prefix_dir, bin_base, NULL );
|
||||||
|
oldenv = getenv("PATH");
|
||||||
|
if (NULL != oldenv) {
|
||||||
|
char *temp;
|
||||||
|
asprintf(&temp, "%s:%s", newenv, oldenv );
|
||||||
|
free( newenv );
|
||||||
|
newenv = temp;
|
||||||
|
}
|
||||||
|
opal_setenv("PATH", newenv, true, &environ);
|
||||||
|
if (mca_pls_rsh_component.debug) {
|
||||||
|
opal_output(0, "pls:rsh: reset PATH: %s", newenv);
|
||||||
|
}
|
||||||
|
free(newenv);
|
||||||
|
|
||||||
|
/* Reset LD_LIBRARY_PATH */
|
||||||
|
newenv = opal_os_path( false, prefix_dir, lib_base, NULL );
|
||||||
|
oldenv = getenv("LD_LIBRARY_PATH");
|
||||||
|
if (NULL != oldenv) {
|
||||||
|
char* temp;
|
||||||
|
asprintf(&temp, "%s:%s", newenv, oldenv);
|
||||||
|
free(newenv);
|
||||||
|
newenv = temp;
|
||||||
|
}
|
||||||
|
opal_setenv("LD_LIBRARY_PATH", newenv, true, &environ);
|
||||||
|
if (mca_pls_rsh_component.debug) {
|
||||||
|
opal_output(0, "pls:rsh: reset LD_LIBRARY_PATH: %s",
|
||||||
|
newenv);
|
||||||
|
}
|
||||||
|
free(newenv);
|
||||||
|
}
|
||||||
|
|
||||||
/* Since this is a local execution, we need to
|
/* Since this is a local execution, we need to
|
||||||
potentially whack the final ")" in the argv (if
|
potentially whack the final ")" in the argv (if
|
||||||
@ -846,7 +901,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
}
|
}
|
||||||
/* Ignore errors -- what are we going to do?
|
/* Ignore errors -- what are we going to do?
|
||||||
(and we ignore errors on the remote nodes
|
(and we ignore errors on the remote nodes
|
||||||
in the odls, so this is consistent) */
|
in the fork pls, so this is consistent) */
|
||||||
chdir(var);
|
chdir(var);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -938,6 +993,11 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
sigprocmask(0, 0, &sigs);
|
sigprocmask(0, 0, &sigs);
|
||||||
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
sigprocmask(SIG_UNBLOCK, &sigs, 0);
|
||||||
|
|
||||||
|
/* setup environment */
|
||||||
|
env = opal_argv_copy(environ);
|
||||||
|
var = mca_base_param_environ_variable("seed",NULL,NULL);
|
||||||
|
opal_setenv(var, "0", true, &env);
|
||||||
|
|
||||||
/* exec the daemon */
|
/* exec the daemon */
|
||||||
if (mca_pls_rsh_component.debug) {
|
if (mca_pls_rsh_component.debug) {
|
||||||
param = opal_argv_join(exec_argv, ' ');
|
param = opal_argv_join(exec_argv, ' ');
|
||||||
@ -946,7 +1006,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
free(param);
|
free(param);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
execve(exec_path, exec_argv, env);
|
execve(exec_path, exec_argv, env);
|
||||||
opal_output(0, "pls:rsh: execv failed with errno=%d\n", errno);
|
opal_output(0, "pls:rsh: execv failed with errno=%d\n", errno);
|
||||||
exit(-1);
|
exit(-1);
|
||||||
@ -1001,7 +1060,6 @@ cleanup:
|
|||||||
|
|
||||||
free(jobid_string); /* done with this variable */
|
free(jobid_string); /* done with this variable */
|
||||||
opal_argv_free(argv);
|
opal_argv_free(argv);
|
||||||
opal_argv_free(env);
|
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -32,6 +32,7 @@ extern "C" {
|
|||||||
orte_pls_base_component_t super;
|
orte_pls_base_component_t super;
|
||||||
int priority;
|
int priority;
|
||||||
int debug;
|
int debug;
|
||||||
|
bool timing;
|
||||||
char *orted;
|
char *orted;
|
||||||
char *custom_args;
|
char *custom_args;
|
||||||
};
|
};
|
||||||
|
@ -103,6 +103,7 @@ orte_pls_slurm_component_t mca_pls_slurm_component = {
|
|||||||
static int pls_slurm_open(void)
|
static int pls_slurm_open(void)
|
||||||
{
|
{
|
||||||
mca_base_component_t *comp = &mca_pls_slurm_component.super.pls_version;
|
mca_base_component_t *comp = &mca_pls_slurm_component.super.pls_version;
|
||||||
|
int tmp, value;
|
||||||
|
|
||||||
mca_base_param_reg_int(comp, "debug", "Enable debugging of slurm pls",
|
mca_base_param_reg_int(comp, "debug", "Enable debugging of slurm pls",
|
||||||
false, false, 0,
|
false, false, 0,
|
||||||
@ -122,6 +123,15 @@ static int pls_slurm_open(void)
|
|||||||
false, false, "orted",
|
false, false, "orted",
|
||||||
&mca_pls_slurm_component.orted);
|
&mca_pls_slurm_component.orted);
|
||||||
|
|
||||||
|
tmp = mca_base_param_reg_int_name("orte", "timing",
|
||||||
|
"Request that critical timing loops be measured",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
mca_pls_slurm_component.timing = true;
|
||||||
|
} else {
|
||||||
|
mca_pls_slurm_component.timing = false;
|
||||||
|
}
|
||||||
|
|
||||||
mca_base_param_reg_string(comp, "args",
|
mca_base_param_reg_string(comp, "args",
|
||||||
"Custom arguments to srun",
|
"Custom arguments to srun",
|
||||||
false, false, NULL,
|
false, false, NULL,
|
||||||
|
@ -38,6 +38,9 @@
|
|||||||
#ifdef HAVE_SYS_TYPES_H
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef HAVE_SYS_TIME_H
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
#ifdef HAVE_SYS_STAT_H
|
#ifdef HAVE_SYS_STAT_H
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#endif
|
#endif
|
||||||
@ -132,6 +135,13 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
|
|||||||
char *cur_prefix;
|
char *cur_prefix;
|
||||||
opal_list_t daemons;
|
opal_list_t daemons;
|
||||||
orte_pls_daemon_info_t *dmn;
|
orte_pls_daemon_info_t *dmn;
|
||||||
|
struct timeval joblaunchstart, launchstart, launchstop;
|
||||||
|
|
||||||
|
if (mca_pls_slurm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||||
|
opal_output(0, "pls_slurm: could not obtain job start time");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* setup a list that will contain the info for all the daemons
|
/* setup a list that will contain the info for all the daemons
|
||||||
* so we can store it on the registry when done
|
* so we can store it on the registry when done
|
||||||
@ -369,13 +379,28 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
|
|||||||
var = mca_base_param_environ_variable("seed", NULL, NULL);
|
var = mca_base_param_environ_variable("seed", NULL, NULL);
|
||||||
opal_setenv(var, "0", true, &env);
|
opal_setenv(var, "0", true, &env);
|
||||||
|
|
||||||
/* clean out any MCA component selection directives that
|
if (mca_pls_slurm_component.timing) {
|
||||||
* won't work on remote nodes
|
if (0 != gettimeofday(&launchstart, NULL)) {
|
||||||
*/
|
opal_output(0, "pls_slurm: could not obtain start time");
|
||||||
orte_pls_base_purge_mca_params(&env);
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* exec the daemon */
|
/* exec the daemon */
|
||||||
rc = pls_slurm_start_proc(argc, argv, env, cur_prefix);
|
rc = pls_slurm_start_proc(argc, argv, env, cur_prefix);
|
||||||
|
|
||||||
|
if (mca_pls_slurm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||||
|
opal_output(0, "pls_slurm: could not obtain stop time");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "pls_slurm: daemon block launch time is %ld usec",
|
||||||
|
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||||
|
(launchstop.tv_usec - launchstart.tv_usec));
|
||||||
|
opal_output(0, "pls_slurm: total job launch time is %ld usec",
|
||||||
|
(launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
|
||||||
|
(launchstop.tv_usec - joblaunchstart.tv_usec));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (ORTE_SUCCESS != rc) {
|
if (ORTE_SUCCESS != rc) {
|
||||||
opal_output(0, "pls:slurm: start_procs returned error %d", rc);
|
opal_output(0, "pls:slurm: start_procs returned error %d", rc);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
|
@ -37,6 +37,7 @@ extern "C" {
|
|||||||
bool want_path_check;
|
bool want_path_check;
|
||||||
char *orted;
|
char *orted;
|
||||||
char **checked_paths;
|
char **checked_paths;
|
||||||
|
bool timing;
|
||||||
};
|
};
|
||||||
typedef struct orte_pls_tm_component_t orte_pls_tm_component_t;
|
typedef struct orte_pls_tm_component_t orte_pls_tm_component_t;
|
||||||
|
|
||||||
|
@ -95,7 +95,7 @@ orte_pls_tm_component_t mca_pls_tm_component = {
|
|||||||
|
|
||||||
static int pls_tm_open(void)
|
static int pls_tm_open(void)
|
||||||
{
|
{
|
||||||
int tmp;
|
int tmp, value;
|
||||||
mca_base_component_t *comp = &mca_pls_tm_component.super.pls_version;
|
mca_base_component_t *comp = &mca_pls_tm_component.super.pls_version;
|
||||||
|
|
||||||
mca_base_param_reg_int(comp, "debug", "Enable debugging of the TM pls",
|
mca_base_param_reg_int(comp, "debug", "Enable debugging of the TM pls",
|
||||||
@ -115,6 +115,15 @@ static int pls_tm_open(void)
|
|||||||
false, false, (int) true, &tmp);
|
false, false, (int) true, &tmp);
|
||||||
mca_pls_tm_component.want_path_check = (bool) tmp;
|
mca_pls_tm_component.want_path_check = (bool) tmp;
|
||||||
|
|
||||||
|
tmp = mca_base_param_reg_int_name("orte", "timing",
|
||||||
|
"Request that critical timing loops be measured",
|
||||||
|
false, false, 0, &value);
|
||||||
|
if (value != 0) {
|
||||||
|
mca_pls_tm_component.timing = true;
|
||||||
|
} else {
|
||||||
|
mca_pls_tm_component.timing = false;
|
||||||
|
}
|
||||||
|
|
||||||
mca_pls_tm_component.checked_paths = NULL;
|
mca_pls_tm_component.checked_paths = NULL;
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
@ -30,11 +30,18 @@
|
|||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_SYS_WAIT_H
|
||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
|
#endif
|
||||||
#ifdef HAVE_SCHED_H
|
#ifdef HAVE_SCHED_H
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef HAVE_SYS_TIME_H
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <tm.h>
|
#include <tm.h>
|
||||||
|
|
||||||
@ -138,6 +145,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
|
|||||||
tm_event_t event;
|
tm_event_t event;
|
||||||
opal_list_t daemons;
|
opal_list_t daemons;
|
||||||
orte_pls_daemon_info_t *dmn;
|
orte_pls_daemon_info_t *dmn;
|
||||||
|
struct timeval launchstart, launchstop, completionstart, completionstop;
|
||||||
|
struct timeval jobstart, jobstop;
|
||||||
|
int maxtime=0, mintime=99999999, maxiter, miniter, deltat;
|
||||||
|
float avgtime=0.0;
|
||||||
|
|
||||||
|
/* check for timing request - get start time if so */
|
||||||
|
if (mca_pls_tm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&jobstart, NULL)) {
|
||||||
|
opal_output(0, "pls_tm: could not obtain job start time");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Query the map for this job.
|
/* Query the map for this job.
|
||||||
* We need the entire mapping for a couple of reasons:
|
* We need the entire mapping for a couple of reasons:
|
||||||
@ -397,6 +415,15 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get start time if so */
|
||||||
|
if (mca_pls_tm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&launchstart, NULL)) {
|
||||||
|
opal_output(0, "pls_tm: could not obtain start time");
|
||||||
|
launchstart.tv_sec = 0;
|
||||||
|
launchstart.tv_usec = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
rc = pls_tm_start_proc(node->nodename, argc, argv, env,
|
rc = pls_tm_start_proc(node->nodename, argc, argv, env,
|
||||||
tm_task_ids + launched,
|
tm_task_ids + launched,
|
||||||
tm_events + launched);
|
tm_events + launched);
|
||||||
@ -404,6 +431,25 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
|
|||||||
opal_output(0, "pls:tm: start_procs returned error %d", rc);
|
opal_output(0, "pls:tm: start_procs returned error %d", rc);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
/* check for timing request - get stop time and process if so */
|
||||||
|
if (mca_pls_tm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||||
|
opal_output(0, "pls_tm: could not obtain stop time");
|
||||||
|
} else {
|
||||||
|
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||||
|
(launchstop.tv_usec - launchstart.tv_usec);
|
||||||
|
avgtime = avgtime + deltat / num_nodes;
|
||||||
|
if (deltat < mintime) {
|
||||||
|
mintime = deltat;
|
||||||
|
miniter = launched;
|
||||||
|
}
|
||||||
|
if (deltat > maxtime) {
|
||||||
|
maxtime = deltat;
|
||||||
|
maxiter = launched;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
launched++;
|
launched++;
|
||||||
++vpid;
|
++vpid;
|
||||||
free(name);
|
free(name);
|
||||||
@ -415,6 +461,15 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
|
|||||||
opal_output(0, "pls:tm:launch: finished spawning orteds\n");
|
opal_output(0, "pls:tm:launch: finished spawning orteds\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get start time for launch completion */
|
||||||
|
if (mca_pls_tm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&completionstart, NULL)) {
|
||||||
|
opal_output(0, "pls_tm: could not obtain completion start time");
|
||||||
|
completionstart.tv_sec = 0;
|
||||||
|
completionstart.tv_usec = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* all done, so store the daemon info on the registry */
|
/* all done, so store the daemon info on the registry */
|
||||||
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
|
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -430,6 +485,22 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get stop time for launch completion and report */
|
||||||
|
if (mca_pls_tm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&completionstop, NULL)) {
|
||||||
|
opal_output(0, "pls_tm: could not obtain completion stop time");
|
||||||
|
} else {
|
||||||
|
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||||
|
(launchstop.tv_usec - launchstart.tv_usec);
|
||||||
|
opal_output(0, "pls_tm: launch completion required %d usec", deltat);
|
||||||
|
}
|
||||||
|
opal_output(0, "pls_tm: Launch statistics:");
|
||||||
|
opal_output(0, "pls_tm: Average time to launch an orted: %f usec", avgtime);
|
||||||
|
opal_output(0, "pls_tm: Max time to launch an orted: %d usec at iter %d", maxtime, maxiter);
|
||||||
|
opal_output(0, "pls_tm: Min time to launch an orted: %d usec at iter %d", mintime, miniter);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
OBJ_RELEASE(map);
|
OBJ_RELEASE(map);
|
||||||
|
|
||||||
@ -456,6 +527,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
|
|||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&daemons);
|
OBJ_DESTRUCT(&daemons);
|
||||||
|
|
||||||
|
/* check for timing request - get stop time and process if so */
|
||||||
|
if (mca_pls_tm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&jobstop, NULL)) {
|
||||||
|
opal_output(0, "pls_tm: could not obtain stop time");
|
||||||
|
} else {
|
||||||
|
deltat = (jobstop.tv_sec - jobstart.tv_sec)*1000000 +
|
||||||
|
(jobstop.tv_usec - jobstart.tv_usec);
|
||||||
|
opal_output(0, "pls_tm: launch of entire job required %d usec", deltat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (mca_pls_tm_component.debug) {
|
if (mca_pls_tm_component.debug) {
|
||||||
opal_output(0, "pls:tm:launch: finished\n");
|
opal_output(0, "pls:tm:launch: finished\n");
|
||||||
}
|
}
|
||||||
|
@ -408,9 +408,9 @@ static int orte_rmgr_urm_spawn_job(
|
|||||||
if (0 != gettimeofday(&urmstop, NULL)) {
|
if (0 != gettimeofday(&urmstop, NULL)) {
|
||||||
opal_output(0, "rmgr_urm: could not obtain stop time");
|
opal_output(0, "rmgr_urm: could not obtain stop time");
|
||||||
} else {
|
} else {
|
||||||
opal_output(0, "rmgr_urm: job setup time is %ld sec %ld usec",
|
opal_output(0, "rmgr_urm: job setup time is %ld usec",
|
||||||
(long int)(urmstop.tv_sec - urmstart.tv_sec),
|
(long int)((urmstop.tv_sec - urmstart.tv_sec)*1000000 +
|
||||||
(long int)(urmstop.tv_usec - urmstart.tv_usec));
|
(urmstop.tv_usec - urmstart.tv_usec)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -422,6 +422,17 @@ static int orte_rmgr_urm_spawn_job(
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for timing request - get start time if so */
|
||||||
|
if (mca_rmgr_urm_component.timing) {
|
||||||
|
if (0 != gettimeofday(&urmstart, NULL)) {
|
||||||
|
opal_output(0, "rmgr_urm: could not obtain launch stop time");
|
||||||
|
} else {
|
||||||
|
opal_output(0, "rmgr_urm: launch time is %ld usec",
|
||||||
|
(long int)((urmstart.tv_sec - urmstop.tv_sec)*1000000 +
|
||||||
|
(urmstart.tv_usec - urmstop.tv_usec)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user