First cut at direct launch for TM. Able to launch non-ORTE procs and detect their completion for a clean shutdown.
This commit was SVN r17732.
This commit is contained in:
parent 10c2ce7d35
commit 06d3145fe4
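How the completion detection works, in outline: the launcher saves the tm_task_id returned for each spawned proc, and a one-second timer callback (check_job_complete) repeatedly connects to the MOM, posts a tm_obit() request for every task, reaps the answers with tm_poll(), and marks the job terminated once all procs are gone. The sketch below is only an illustration of that pattern, not part of the diff; it assumes the standard PBS/Torque TM API (tm.h) and strips the ORTE job bookkeeping down to a simple counter.

    /* Illustrative sketch only -- mirrors the obit/poll loop added in
     * check_job_complete() below, with ORTE state updates reduced to a count. */
    #include <stdlib.h>
    #include <tm.h>

    static tm_task_id *tids;   /* saved from tm_spawn() at launch time */
    static int num_tids;       /* number of spawned procs */

    static int all_procs_done(void)
    {
        tm_event_t *events = malloc(sizeof(tm_event_t) * num_tids);
        int *exit_codes = malloc(sizeof(int) * num_tids);
        tm_event_t polled;
        int i, local_err, done = 0;

        /* ask the MOM for an obituary on every task... */
        for (i = 0; i < num_tids; i++) {
            tm_obit(tids[i], &exit_codes[i], &events[i]);
        }
        /* ...then reap the replies; each one carries a task's exit status */
        for (i = 0; i < num_tids; i++) {
            if (TM_SUCCESS == tm_poll(TM_NULL_EVENT, &polled, 1, &local_err)) {
                done++;
            }
        }
        free(events);
        free(exit_codes);
        return done == num_tids;   /* caller re-arms the 1-second timer if false */
    }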
@@ -64,8 +64,10 @@
 #include "opal/runtime/opal_progress.h"
 
 #include "orte/util/name_fns.h"
+#include "orte/util/context_fns.h"
 #include "orte/runtime/orte_globals.h"
 #include "orte/runtime/orte_wait.h"
+#include "orte/runtime/orte_wakeup.h"
 #include "orte/mca/errmgr/errmgr.h"
 #include "orte/mca/rmaps/rmaps.h"
 
@@ -73,7 +75,8 @@
 #include "orte/mca/plm/base/plm_private.h"
 #include "plm_tmd.h"
 
+/* define frequency of checking job complete */
+#define PLM_CHECK_COMPLETE_TIME 1
 
 /*
  * Local functions
@@ -88,6 +91,13 @@ static int plm_tmd_finalize(void)
 static int plm_tmd_connect(void);
 static int plm_tmd_disconnect(void);
 
+/*
+ * Local "global" variables
+ */
+static tm_task_id *tm_task_ids = NULL;
+static int num_tids;
+static orte_job_t *active_jdata;
+
 /*
  * Global variable
  */
@@ -115,6 +125,89 @@ static int plm_tmd_init(void)
 }
 
 
+/* check for job completion */
+static void check_job_complete(int fd, short evnt, void *arg)
+{
+    tm_event_t event, *tm_events;
+    int *exit_codes, i, rc, local_err;
+    orte_vpid_t v;
+    orte_proc_t **procs;
+
+    opal_output(0, "checking job completion");
+
+    tm_events = malloc(sizeof(tm_event_t) * num_tids);
+    exit_codes = malloc(sizeof(int) * num_tids);
+
+    /* connect to the mom */
+    rc = plm_tmd_connect();
+    if (ORTE_SUCCESS != rc) {
+        opal_output(0, "CANNOT CONNECT TO MOM");
+        goto cleanup;
+    }
+
+    /* cycle through the task_id's and check for complete */
+    for (i=0; i < num_tids; i++) {
+        if (ORTE_SUCCESS != (rc = tm_obit(tm_task_ids[i], exit_codes + i, tm_events + i))) {
+            opal_output(0, "CANNOT CHECK PROC %d", i);
+        }
+    }
+    opal_output(0, "obits requested");
+
+    /* cycle through and poll the events */
+    for (i=0; i < num_tids; i++) {
+        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
+        if (TM_SUCCESS != rc) {
+            errno = local_err;
+            opal_output(0, "plm:tmd: failed to poll obit, return status = %d", rc);
+        }
+    }
+    opal_output(0, "obits polled");
+
+    /* store the exit codes */
+    procs = (orte_proc_t**)active_jdata->procs->addr;
+    for (v=0; v < active_jdata->num_procs; v++) {
+        procs[v]->exit_code = exit_codes[v];
+        opal_output(0, "rank %d ecode %d", (int)v, exit_codes[v]);
+        if (WIFEXITED(exit_codes[v])) {
+            if (procs[v]->state < ORTE_PROC_STATE_TERMINATED) {
+                procs[v]->state = ORTE_PROC_STATE_TERMINATED;
+                active_jdata->num_terminated++;
+            }
+        } else {
+            procs[v]->state = ORTE_PROC_STATE_ABORTED_BY_SIG;
+            active_jdata->abort = true;
+            active_jdata->aborted_proc = procs[v];
+            active_jdata->state = ORTE_JOB_STATE_ABORTED;
+        }
+    }
+
+    /* disconnect from mom */
+    plm_tmd_disconnect();
+
+cleanup:
+    /* free tm_events */
+    free(tm_events);
+    free(exit_codes);
+
+    /* check for completion */
+    if (active_jdata->num_terminated >= active_jdata->num_procs) {
+        active_jdata->state = ORTE_JOB_STATE_TERMINATED;
+        orte_wakeup(0);
+    } else if (active_jdata->state == ORTE_JOB_STATE_ABORTED &&
+               !orte_abnormal_term_ordered && !orte_abort_in_progress) {
+        orte_errmgr.proc_aborted(&(active_jdata->aborted_proc->name),
+                                 active_jdata->aborted_proc->exit_code);
+    }
+
+    /* reset the timer */
+    ORTE_TIMER_EVENT(1, check_job_complete);
+    opal_output(0, "timer reset");
+}
+
+
 /* When working in this function, ALWAYS jump to "cleanup" if
  * you encounter an error so that orterun will be woken up and
  * the job can cleanly terminate
@@ -124,35 +217,22 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
     orte_job_map_t *map = NULL;
     orte_app_context_t **apps;
     orte_node_t **nodes;
-    int node_name_index;
-    int proc_vpid_index;
-    char *param;
+    orte_proc_t **procs;
+    char *param, *param2;
     char **env = NULL;
-    char *var;
-    char **argv = NULL;
-    int argc;
     int rc;
     bool connected = false;
-    orte_std_cntr_t launched = 0, i;
+    orte_std_cntr_t launched = 0, i, n;
     char *bin_base = NULL, *lib_base = NULL;
-    tm_event_t *tm_events = NULL;
-    tm_task_id *tm_task_ids = NULL;
     int local_err;
     tm_event_t event;
-    struct timeval launchstart, launchstop, completionstart, completionstop;
-    struct timeval jobstart, jobstop;
-    int maxtime=0, mintime=99999999, maxiter = 0, miniter = 0, deltat;
-    float avgtime=0.0;
     bool failed_launch = true;
-    mode_t current_umask;
-
-    /* check for timing request - get start time if so */
-    if (orte_timing) {
-        if (0 != gettimeofday(&jobstart, NULL)) {
-            opal_output(0, "plm_tmd: could not obtain job start time");
-        }
-    }
+    orte_vpid_t v;
+    tm_event_t *tm_events = NULL;
+
+    /* record who we are working on */
+    active_jdata = jdata;
+
     /* create a jobid for this job */
     if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
         ORTE_ERROR_LOG(rc);
@@ -178,175 +258,299 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
     }
     apps = (orte_app_context_t**)jdata->apps->addr;
     nodes = (orte_node_t**)map->nodes->addr;
+    procs = (orte_proc_t**)jdata->procs->addr;
 
-    if (0 == map->num_new_daemons) {
-        /* have all the daemons we need - launch app */
-        goto launch_apps;
-    }
+    /* Allocate a bunch of TM events to use for tm_spawn()ing - we will
+     * need one/process since we are not launching daemons
+     */
+    tm_events = malloc(sizeof(tm_event_t) * jdata->num_procs);
 
-    /* Allocate a bunch of TM events to use for tm_spawn()ing */
-    tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
     if (NULL == tm_events) {
         rc = ORTE_ERR_OUT_OF_RESOURCE;
         ORTE_ERROR_LOG(rc);
         goto cleanup;
     }
-    tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
+    tm_task_ids = malloc(sizeof(tm_task_id) * jdata->num_procs);
     if (NULL == tm_task_ids) {
         rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
         goto cleanup;
     }
+    num_tids = jdata->num_procs;
 
-    /* Add basic orted command line options */
-    orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
-                                          &proc_vpid_index,
-                                          &node_name_index);
-
-    if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
-        param = opal_argv_join(argv, ' ');
-        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
-                             "%s plm:tm: final top-level argv:\n\t%s",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                             (NULL == param) ? "NULL" : param));
-        if (NULL != param) free(param);
-    }
+    /* Figure out the basenames for the libdir and bindir.  There is a
+       lengthy comment about this in plm_rsh_module.c explaining all
+       the rationale for how / why we're doing this. */
+    lib_base = opal_basename(opal_install_dirs.libdir);
+    bin_base = opal_basename(opal_install_dirs.bindir);
+
+    /* setup the environment for each app_context - we need to insert enviro
+     * values that tell the procs how to bootstrap themselves
+     */
+    for (n=0; n < jdata->num_apps; n++) {
+        orte_app_context_t *context = apps[n];
+        char ***env = &context->env;
+
+        /* setup base environment: copy the current environ and merge
+         * in the app context environ
+         */
+        if (NULL != context->env) {
+            *env = opal_environ_merge(orte_launch_environ, context->env);
+        } else {
+            *env = opal_argv_copy(orte_launch_environ);
+        }
+
+        /* Try to change to the context cwd and check that the app
+         * exists and is executable The function will
+         * take care of outputting a pretty error message, if required
+         */
+        if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(context, true))) {
+            /* do not ERROR_LOG - it will be reported elsewhere */
+            goto cleanup;
+        }
+        if (ORTE_SUCCESS != (rc = orte_util_check_context_app(context))) {
+            /* do not ERROR_LOG - it will be reported elsewhere */
+            goto cleanup;
+        }
+
+        /* special case handling for --prefix: this is somewhat icky,
+         * but at least some users do this. :-\ It is possible that
+         * when using --prefix, the user will also "-x PATH" and/or
+         * "-x LD_LIBRARY_PATH", which would therefore clobber the
+         * work that was done in the prior pls to ensure that we have
+         * the prefix at the beginning of the PATH and
+         * LD_LIBRARY_PATH. So examine the context->env and see if we
+         * find PATH or LD_LIBRARY_PATH. If found, that means the
+         * prior work was clobbered, and we need to re-prefix those
+         * variables.
+         */
+        for (i = 0; NULL != context->env && NULL != context->env[i]; ++i) {
+            char *newenv;
+
+            /* Reset PATH */
+            if (0 == strncmp("PATH=", context->env[i], 5)) {
+                asprintf(&newenv, "%s/%s:%s",
+                         apps[n]->prefix_dir, bin_base, context->env[i] + 5);
+                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
+                                     "%s plm:tmd: resetting PATH: %s",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                     newenv));
+                opal_setenv("PATH", newenv, true, env);
+                free(newenv);
+            }
+
+            /* Reset LD_LIBRARY_PATH */
+            else if (0 == strncmp("LD_LIBRARY_PATH=", context->env[i], 16)) {
+                asprintf(&newenv, "%s/%s:%s",
+                         apps[n]->prefix_dir, lib_base, context->env[i] + 16);
+                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
+                                     "%s plm:tm: resetting LD_LIBRARY_PATH: %s",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                     newenv));
+                opal_setenv("LD_LIBRARY_PATH", newenv, true, env);
+                free(newenv);
+            }
+        }
+
+        /* there will be no local daemon */
+        param = mca_base_param_environ_variable("orte","local_daemon","uri");
+        opal_unsetenv(param, env);
+        free(param);
+
+        /* pass the hnp's contact info to the local proc */
+        param = mca_base_param_environ_variable("orte","hnp","uri");
+        opal_setenv(param, orte_process_info.my_hnp_uri, true, env);
+        free(param);
+
+        /* set the app_context number into the environment */
+        param = mca_base_param_environ_variable("orte","app","num");
+        asprintf(&param2, "%ld", (long)context->idx);
+        opal_setenv(param, param2, true, env);
+        free(param);
+        free(param2);
+
+        /* set the universe size in the environment */
+        param = mca_base_param_environ_variable("orte","universe","size");
+        asprintf(&param2, "%ld", (long)jdata->total_slots_alloc);
+        opal_setenv(param, param2, true, env);
+        free(param);
+
+        /* although the total_slots_alloc is the universe size, users
+         * would appreciate being given a public environmental variable
+         * that also represents this value - something MPI specific - so
+         * do that here.
+         *
+         * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
+         * We know - just live with it
+         */
+        opal_setenv("OMPI_UNIVERSE_SIZE", param2, true, env);
+        free(param2);
+
+        /* tell the proc to use the "envd" - i.e., the environment direct
+         * ESS module to set itself up
+         */
+        param = mca_base_param_environ_variable("ess",NULL,"NULL");
+        opal_setenv(param, "envd", true, env);
+        free(param);
+
+        /* since we want to pass the name as separate components, make sure
+         * that the "name" environmental variable is cleared!
+         */
+        if(NULL == (param = mca_base_param_environ_variable("orte","ess","name"))) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+        opal_unsetenv(param, env);
+        free(param);
+
+        /* tell the proc the jobid - same for everyone */
+        if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&param2, jdata->jobid))) {
+            ORTE_ERROR_LOG(rc);
+            goto cleanup;
+        }
+        if(NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            rc = ORTE_ERR_OUT_OF_RESOURCE;
+            goto cleanup;
+        }
+        opal_setenv(param, param2, true, env);
+        free(param);
+        free(param2);
+
+        /* tell the proc the #procs in this job */
+        asprintf(&param2, "%ld", (long) jdata->num_procs);
+        if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_procs"))) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+        opal_setenv(param, param2, true, env);
+        free(param);
+
+        /* although the num_procs is the comm_world size, users
+         * would appreciate being given a public environmental variable
+         * that also represents this value - something MPI specific - so
+         * do that here.
+         *
+         * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
+         * We know - just live with it
+         */
+        opal_setenv("OMPI_COMM_WORLD_SIZE", param2, true, env);
+        free(param2);
     }
 
     rc = plm_tmd_connect();
     if (ORTE_SUCCESS != rc) {
         goto cleanup;
     }
     connected = true;
 
-    /* Figure out the basenames for the libdir and bindir.  There is a
-       lengthy comment about this in plm_rsh_module.c explaining all
-       the rationale for how / why we're doing this. */
-    lib_base = opal_basename(opal_install_dirs.libdir);
-    bin_base = opal_basename(opal_install_dirs.bindir);
-
-    /* setup environment */
-    env = opal_argv_copy(environ);
-
-    /* add our umask -- see big note in orted.c */
-    current_umask = umask(0);
-    umask(current_umask);
-    asprintf(&var, "0%o", current_umask);
-    opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
-    free(var);
-
-    /* If we have a prefix, then modify the PATH and
-       LD_LIBRARY_PATH environment variables.  We only allow
-       a single prefix to be specified.  Since there will
-       always be at least one app_context, we take it from
-       there
-    */
-    if (NULL != apps[0]->prefix_dir) {
-        char *newenv;
-
-        for (i = 0; NULL != env && NULL != env[i]; ++i) {
-            /* Reset PATH */
-            if (0 == strncmp("PATH=", env[i], 5)) {
-                asprintf(&newenv, "%s/%s:%s",
-                         apps[0]->prefix_dir, bin_base, env[i] + 5);
-                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
-                                     "%s plm:tm: resetting PATH: %s",
-                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                     newenv));
-                opal_setenv("PATH", newenv, true, &env);
-                free(newenv);
-            }
-
-            /* Reset LD_LIBRARY_PATH */
-            else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
-                asprintf(&newenv, "%s/%s:%s",
-                         apps[0]->prefix_dir, lib_base, env[i] + 16);
-                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
-                                     "%s plm:tm: resetting LD_LIBRARY_PATH: %s",
-                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                     newenv));
-                opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
-                free(newenv);
-            }
-        }
-    }
-
-    /* Iterate through each of the nodes and spin
-     * up a daemon.
-     */
-    for (i = 0; i < map->num_nodes; i++) {
-        orte_node_t* node = nodes[i];
-        char* vpid_string;
-
-        /* if this daemon already exists, don't launch it! */
-        if (node->daemon_launched) {
-            continue;
+    /* Iterate through each of the procs and launch it */
+    for (v = 0; v < jdata->num_procs; v++) {
+        orte_proc_t *proc = procs[v];
+        orte_node_t* node = proc->node;
+        orte_app_context_t *context = apps[proc->app_idx];
+        char **environ_copy;
+        int argc;
+
+        /* copy the environment */
+        environ_copy = opal_argv_copy(context->env);
+
+        /* pass the proc's vpid */
+        if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&param2, proc->name.vpid))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
         }
-
-        /* setup node name */
-        free(argv[node_name_index]);
-        argv[node_name_index] = strdup(node->name);
+        if(NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            rc = ORTE_ERR_OUT_OF_RESOURCE;
+            goto cleanup;
+        }
+        opal_setenv(param, param2, true, &environ_copy);
+        free(param);
+        /* although the vpid IS the process' rank within the job, users
+         * would appreciate being given a public environmental variable
+         * that also represents this value - something MPI specific - so
+         * do that here.
+         *
+         * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
+         * We know - just live with it
+         */
+        opal_setenv("OMPI_COMM_WORLD_RANK", param2, true, &environ_copy);
+        free(param2);  /* done with this now */
+
+        /* pass the node's launch_id to the process as a "nodeid" so it can
+         * identify which procs are local to it
+         */
+        asprintf(&param2, "%ld", (long)node->launch_id);
+        param = mca_base_param_environ_variable("orte","nodeid",NULL);
+        opal_setenv(param, param2, true, &environ_copy);
+        free(param);
+        free(param2);
+        /* ensure we use the same nodename */
+        param = mca_base_param_environ_variable("orte", "base", "nodename");
+        opal_setenv(param, node->name, true, &environ_copy);
+        free(param);
+
+        /* set the local rank */
+        asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
+        if(NULL == (param = mca_base_param_environ_variable("orte","ess","local_rank"))) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            rc = ORTE_ERR_OUT_OF_RESOURCE;
+            goto cleanup;
+        }
+        opal_setenv(param, param2, true, &environ_copy);
+        free(param);
+        free(param2);
+
+        /* setup yield schedule and processor affinity
+         * We default here to always setting the affinity processor if we want
+         * it. The processor affinity system then determines
+         * if processor affinity is enabled/requested - if so, it then uses
+         * this value to select the process to which the proc is "assigned".
+         * Otherwise, the paffinity subsystem just ignores this value anyway
+         */
+        param = mca_base_param_environ_variable("mpi", NULL,
+                                                "paffinity_processor");
+        asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
+        opal_setenv(param, param2, true, &environ_copy);
+        free(param);
+        free(param2);
+
+        if (node->oversubscribed) {
+            param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
+            opal_setenv(param, "1", false, &environ_copy);
+        } else {
+            param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
+            opal_setenv(param, "0", false, &environ_copy);
+        }
+        free(param);
+
+        asprintf(&param2, "%ld", (long) node->num_procs);
+        if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_local_procs"))) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+        opal_setenv(param, param2, true, &environ_copy);
+        free(param);
+        free(param2);
 
         OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                              "%s plm:tm: launching on node %s",
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                              node->name));
 
-        /* setup process name */
-        rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid);
-        if (ORTE_SUCCESS != rc) {
-            opal_output(0, "plm:tm: unable to get daemon vpid as string");
-            exit(-1);
-        }
-        free(argv[proc_vpid_index]);
-        argv[proc_vpid_index] = strdup(vpid_string);
-        free(vpid_string);
-
-        /* exec the daemon */
-        if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
-            param = opal_argv_join(argv, ' ');
-            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
-                                 "%s plm:tm: executing:\n\t%s",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                 (NULL == param) ? "NULL" : param));
-            if (NULL != param) free(param);
-        }
-
-        /* check for timing request - get start time if so */
-        if (orte_timing) {
-            if (0 != gettimeofday(&launchstart, NULL)) {
-                opal_output(0, "plm_tmd: could not obtain start time");
-                launchstart.tv_sec = 0;
-                launchstart.tv_usec = 0;
-            }
-        }
-
-        rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
+        /* set the argc count */
+        argc = opal_argv_count(context->argv);
+
+        rc = tm_spawn(argc, context->argv, environ_copy, node->launch_id, tm_task_ids + launched, tm_events + launched);
         if (TM_SUCCESS != rc) {
             opal_show_help("help-plm-tm.txt", "tm-spawn-failed",
-                           true, argv[0], node->name, node->launch_id);
+                           true, context->argv[0], node->name, node->launch_id);
             rc = ORTE_ERROR;
             goto cleanup;
         }
+        opal_argv_free(environ_copy);
 
-        /* check for timing request - get stop time and process if so */
-        if (orte_timing) {
-            if (0 != gettimeofday(&launchstop, NULL)) {
-                opal_output(0, "plm_tmd: could not obtain stop time");
-            } else {
-                deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
-                    (launchstop.tv_usec - launchstart.tv_usec);
-                avgtime = avgtime + deltat / map->num_new_daemons;
-                if (deltat < mintime) {
-                    mintime = deltat;
-                    miniter = launched;
-                }
-                if (deltat > maxtime) {
-                    maxtime = deltat;
-                    maxiter = launched;
-                }
-            }
-        }
-
         launched++;
 
         /* Allow some progress to occur */
@@ -354,18 +558,9 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
     }
 
     OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
-                         "%s plm:tm:launch: finished spawning orteds",
+                         "%s plm:tm:launch: finished spawning procs",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 
-    /* check for timing request - get start time for launch completion */
-    if (orte_timing) {
-        if (0 != gettimeofday(&completionstart, NULL)) {
-            opal_output(0, "plm_tmd: could not obtain completion start time");
-            completionstart.tv_sec = 0;
-            completionstart.tv_usec = 0;
-        }
-    }
-
     /* TM poll for all the spawns */
     for (i = 0; i < launched; ++i) {
         rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
@@ -375,17 +570,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
             goto cleanup;
         }
     }
-
-    /* wait for daemons to callback */
-    if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
-        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
-                             "%s plm:tm: daemon launch failed for job %s on error %s",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                             ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
-        goto cleanup;
-    }
-
-launch_apps:
+#if 0
     if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
         OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                              "%s plm:tm: launch of apps failed for job %s on error %s",
@@ -393,30 +578,19 @@ launch_apps:
                             ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
        goto cleanup;
     }
+#endif
     /* if we get here, then everything launched okay - record that fact */
     failed_launch = false;
 
-    /* check for timing request - get stop time for launch completion and report */
-    if (orte_timing) {
-        if (0 != gettimeofday(&completionstop, NULL)) {
-            opal_output(0, "plm_tmd: could not obtain completion stop time");
-        } else {
-            deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
-                (launchstop.tv_usec - launchstart.tv_usec);
-            opal_output(0, "plm_tmd: launch completion required %d usec", deltat);
-        }
-        opal_output(0, "plm_tmd: Launch statistics:");
-        opal_output(0, "plm_tmd: Average time to launch an orted: %f usec", avgtime);
-        opal_output(0, "plm_tmd: Max time to launch an orted: %d usec at iter %d", maxtime, maxiter);
-        opal_output(0, "plm_tmd: Min time to launch an orted: %d usec at iter %d", mintime, miniter);
-    }
+    /* setup periodic timer so we check for job completion */
+    ORTE_TIMER_EVENT(1, check_job_complete);
 
 
 cleanup:
-    if (NULL != argv) {
-        opal_argv_free(argv);
+    /* can't reuse the events, so may as well get rid of them */
+    if (NULL != tm_events) {
+        free(tm_events);
     }
 
     if (NULL != env) {
         opal_argv_free(env);
     }
@@ -424,13 +598,6 @@ launch_apps:
     if (connected) {
         plm_tmd_disconnect();
     }
-    if (NULL != tm_events) {
-        free(tm_events);
-    }
-    if (NULL != tm_task_ids) {
-        free(tm_task_ids);
-    }
-
     if (NULL != lib_base) {
         free(lib_base);
     }
@@ -443,17 +610,6 @@ launch_apps:
         orte_plm_base_launch_failed(jdata->jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
     }
 
-    /* check for timing request - get stop time and process if so */
-    if (orte_timing) {
-        if (0 != gettimeofday(&jobstop, NULL)) {
-            opal_output(0, "plm_tmd: could not obtain stop time");
-        } else {
-            deltat = (jobstop.tv_sec - jobstart.tv_sec)*1000000 +
-                (jobstop.tv_usec - jobstart.tv_usec);
-            opal_output(0, "plm_tmd: launch of entire job required %d usec", deltat);
-        }
-    }
-
     OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                          "%s plm:tm:launch: finished",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -465,41 +621,119 @@ launch_apps:
 static int plm_tmd_terminate_job(orte_jobid_t jobid)
 {
     int rc;
+    int i, local_err;
+    tm_event_t event, *tm_events;
 
-    /* order all of the daemons to kill their local procs for this job */
-    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(jobid))) {
+    tm_events = malloc(sizeof(tm_event_t) * num_tids);
+    if (NULL == tm_events) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
         ORTE_ERROR_LOG(rc);
+        return rc;
     }
 
-    return rc;
+    /* connect to the mom */
+    rc = plm_tmd_connect();
+    if (ORTE_SUCCESS != rc) {
+        opal_output(0, "CANNOT CONNECT TO MOM");
+        goto cleanup;
+    }
+
+    /* cycle through the task_id's and kill them */
+    for (i=0; i < num_tids; i++) {
+        if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], SIGTERM, &tm_events[i]))) {
+            opal_output(0, "CANNOT KILL PROC %d", i);
+        }
+    }
+
+    /* cycle through and poll the events */
+    for (i=0; i < num_tids; i++) {
+        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
+        if (TM_SUCCESS != rc) {
+            errno = local_err;
+            opal_output(0, "plm:tmd: failed to kill proc, return status = %d", rc);
+        }
+    }
+
+    /* disconnect from mom */
+    plm_tmd_disconnect();
+
+cleanup:
+    /* free tm_events */
+    free(tm_events);
+
+    return ORTE_SUCCESS;
 }
 
 
 /**
  * Terminate the orteds for a given job
  */
-int plm_tmd_terminate_orteds(void)
+static int plm_tmd_terminate_orteds(void)
 {
-    int rc;
-
-    /* now tell them to die! */
-    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) {
-        ORTE_ERROR_LOG(rc);
+    orte_job_t *daemons;
+    int data=1;
+
+    /* fake the system into thinking the orteds
+     * reported their clean termination
+     */
+    if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return ORTE_ERR_NOT_FOUND;
     }
+    daemons->num_terminated = daemons->num_procs;
 
-    return rc;
+    /* fire the trigger indicating that the orteds are gone */
+    write(orteds_exit, &data, sizeof(int));
+    opal_progress();
+
+    return ORTE_SUCCESS;
 }
 
 static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal)
 {
     int rc;
+    int i, local_err;
+    tm_event_t event, *tm_events;
 
-    /* order them to pass this signal to their local procs */
-    if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
+    tm_events = malloc(sizeof(tm_event_t) * num_tids);
+    if (NULL == tm_events) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
         ORTE_ERROR_LOG(rc);
+        return rc;
     }
 
-    return rc;
+    /* connect to the mom */
+    rc = plm_tmd_connect();
+    if (ORTE_SUCCESS != rc) {
+        opal_output(0, "CANNOT CONNECT TO MOM");
+        goto cleanup;
+    }
+
+    /* cycle through the task_id's and kill them */
+    for (i=0; i < num_tids; i++) {
+        if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], signal, &tm_events[i]))) {
+            opal_output(0, "CANNOT SIGNAL PROC %d", i);
+        }
+    }
+
+    /* cycle through and poll the events */
+    for (i=0; i < num_tids; i++) {
+        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
+        if (TM_SUCCESS != rc) {
+            errno = local_err;
+            opal_output(0, "plm:tmd: failed to signal proc, return status = %d", rc);
+            goto cleanup;
+        }
+    }
+
+    /* disconnect from mom */
+    plm_tmd_disconnect();
+
+cleanup:
+    /* free tm_events */
+    free(tm_events);
+
+    return ORTE_SUCCESS;
 }
 
 
@@ -515,6 +749,10 @@ static int plm_tmd_finalize(void)
         ORTE_ERROR_LOG(rc);
     }
 
+    if (NULL != tm_task_ids) {
+        free(tm_task_ids);
+    }
+
     return ORTE_SUCCESS;
 }
 
@@ -245,6 +245,27 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
         *(event) = tmp; \
     }while(0); \
 
 
+/**
+ * There are places in the code where we just want to periodically
+ * wakeup to do something, and then go back to sleep again. Setting
+ * a timer allows us to do this
+ */
+#define ORTE_TIMER_EVENT(time, cbfunc) \
+    do { \
+        struct timeval now; \
+        opal_event_t *tmp; \
+        tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
+        opal_evtimer_set(tmp, (cbfunc), tmp); \
+        now.tv_sec = (time); \
+        now.tv_usec = 0; \
+        OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
+                             "defining timer event: %ld sec", \
+                             (long)now.tv_sec)); \
+        opal_evtimer_add(tmp, &now); \
+    }while(0); \
+
+
 /**
  * \internal
  *
@@ -923,8 +923,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
  */
 static void abort_signal_callback(int fd, short flags, void *arg)
 {
-    opal_event_t *event;
-
     /* if we have already ordered this once, or we are already
     * aborting the job, don't keep doing it to avoid race conditions
     */
@@ -942,7 +940,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
        (which is a Bad Thing), so we can't call it directly.
        Instead, we have to exit this handler and setup to call
       job_completed() after this. */
-    ORTE_DETECT_TIMEOUT(&event, 0, 0, 1, abort_exit_callback);
+    ORTE_TIMER_EVENT(0, abort_exit_callback);
 }
 
 /**