
First cut at direct launch for TM. Able to launch non-ORTE procs and detect their completion for a clean shutdown.

This commit was SVN r17732.
This commit is contained in:
Ralph Castain 2008-03-05 13:51:32 +00:00
parent 10c2ce7d35
commit 06d3145fe4
3 changed files with 487 additions and 230 deletions
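For reference, the completion detection added in this commit leans on the PBS/Torque TM obituary interface. Below is a minimal, self-contained sketch of that obit/poll pattern, mirroring what check_job_complete() in the diff does with the module-level tm_task_ids array. It is not the committed code: the helper name poll_for_exit_codes and the simplified error handling are illustrative only.

#include <stdlib.h>
#include <tm.h>   /* Torque/PBS task management API */

/* Sketch: request an obituary for every TM task, then reap one event per
 * request so the MOM reports each task's exit status into exit_codes[]. */
static int poll_for_exit_codes(tm_task_id *tids, int ntids, int *exit_codes)
{
    tm_event_t *events = malloc(sizeof(tm_event_t) * ntids);
    tm_event_t completed;
    int i, tm_errno, bad = 0;

    if (NULL == events) {
        return -1;
    }
    for (i = 0; i < ntids; i++) {
        /* exit_codes[i] is filled in when the matching obit event completes */
        if (TM_SUCCESS != tm_obit(tids[i], &exit_codes[i], &events[i])) {
            bad++;
        }
    }
    for (i = 0; i < ntids - bad; i++) {
        /* wait flag of 1 blocks until some outstanding event completes */
        if (TM_SUCCESS != tm_poll(TM_NULL_EVENT, &completed, 1, &tm_errno)) {
            /* tm_errno carries the TM-specific error code */
        }
    }
    free(events);
    return bad ? -1 : 0;
}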

View file

@@ -64,8 +64,10 @@
#include "opal/runtime/opal_progress.h"
#include "orte/util/name_fns.h"
#include "orte/util/context_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
@@ -73,7 +75,8 @@
#include "orte/mca/plm/base/plm_private.h"
#include "plm_tmd.h"
/* define frequency of checking job complete */
#define PLM_CHECK_COMPLETE_TIME 1
/*
* Local functions
@@ -88,6 +91,13 @@ static int plm_tmd_finalize(void);
static int plm_tmd_connect(void);
static int plm_tmd_disconnect(void);
/*
* Local "global" variables
*/
static tm_task_id *tm_task_ids = NULL;
static int num_tids;
static orte_job_t *active_jdata;
/*
* Global variable
*/
@@ -115,6 +125,89 @@ static int plm_tmd_init(void)
}
/* check for job completion */
static void check_job_complete(int fd, short evnt, void *arg)
{
tm_event_t event, *tm_events;
int *exit_codes, i, rc, local_err;
orte_vpid_t v;
orte_proc_t **procs;
opal_output(0, "checking job completion");
tm_events = malloc(sizeof(tm_event_t) * num_tids);
exit_codes = malloc(sizeof(int) * num_tids);
/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}
/* cycle through the task_id's and check for complete */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_obit(tm_task_ids[i], exit_codes + i, tm_events + i))) {
opal_output(0, "CANNOT CHECK PROC %d", i);
}
}
opal_output(0, "obits requested");
/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to poll obit, return status = %d", rc);
}
}
opal_output(0, "obits polled");
/* store the exit codes */
procs = (orte_proc_t**)active_jdata->procs->addr;
for (v=0; v < active_jdata->num_procs; v++) {
procs[v]->exit_code = exit_codes[v];
opal_output(0, "rank %d ecode %d", (int)v, exit_codes[v]);
if (WIFEXITED(exit_codes[v])) {
if (procs[v]->state < ORTE_PROC_STATE_TERMINATED) {
procs[v]->state = ORTE_PROC_STATE_TERMINATED;
active_jdata->num_terminated++;
}
} else {
procs[v]->state = ORTE_PROC_STATE_ABORTED_BY_SIG;
active_jdata->abort = true;
active_jdata->aborted_proc = procs[v];
active_jdata->state = ORTE_JOB_STATE_ABORTED;
}
}
/* disconnect from mom */
plm_tmd_disconnect();
cleanup:
/* free tm_events */
free(tm_events);
free(exit_codes);
/* check for completion */
if (active_jdata->num_terminated >= active_jdata->num_procs) {
active_jdata->state = ORTE_JOB_STATE_TERMINATED;
orte_wakeup(0);
} else if (active_jdata->state == ORTE_JOB_STATE_ABORTED &&
!orte_abnormal_term_ordered && !orte_abort_in_progress) {
orte_errmgr.proc_aborted(&(active_jdata->aborted_proc->name),
active_jdata->aborted_proc->exit_code);
}
/* reset the timer */
ORTE_TIMER_EVENT(1, check_job_complete);
opal_output(0, "timer reset");
}
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
@@ -124,35 +217,22 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
orte_job_map_t *map = NULL;
orte_app_context_t **apps;
orte_node_t **nodes;
int node_name_index;
int proc_vpid_index;
char *param;
orte_proc_t **procs;
char *param, *param2;
char **env = NULL;
char *var;
char **argv = NULL;
int argc;
int rc;
bool connected = false;
orte_std_cntr_t launched = 0, i;
orte_std_cntr_t launched = 0, i, n;
char *bin_base = NULL, *lib_base = NULL;
tm_event_t *tm_events = NULL;
tm_task_id *tm_task_ids = NULL;
int local_err;
tm_event_t event;
struct timeval launchstart, launchstop, completionstart, completionstop;
struct timeval jobstart, jobstop;
int maxtime=0, mintime=99999999, maxiter = 0, miniter = 0, deltat;
float avgtime=0.0;
bool failed_launch = true;
mode_t current_umask;
orte_vpid_t v;
tm_event_t *tm_events = NULL;
/* record who we are working on */
active_jdata = jdata;
/* check for timing request - get start time if so */
if (orte_timing) {
if (0 != gettimeofday(&jobstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain job start time");
}
}
/* create a jobid for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
ORTE_ERROR_LOG(rc);
@@ -178,175 +258,299 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
}
apps = (orte_app_context_t**)jdata->apps->addr;
nodes = (orte_node_t**)map->nodes->addr;
procs = (orte_proc_t**)jdata->procs->addr;
if (0 == map->num_new_daemons) {
/* have all the daemons we need - launch app */
goto launch_apps;
}
/* Allocate a bunch of TM events to use for tm_spawn()ing */
tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
/* Allocate a bunch of TM events to use for tm_spawn()ing - we will
* need one/process since we are not launching daemons
*/
tm_events = malloc(sizeof(tm_event_t) * jdata->num_procs);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
tm_task_ids = malloc(sizeof(tm_task_id) * jdata->num_procs);
if (NULL == tm_task_ids) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
num_tids = jdata->num_procs;
/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
&node_name_index);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: final top-level argv:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
/* Figure out the basenames for the libdir and bindir. There is a
lengthy comment about this in plm_rsh_module.c explaining all
the rationale for how / why we're doing this. */
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);
/* setup the environment for each app_context - we need to insert enviro
* values that tell the procs how to bootstrap themselves
*/
for (n=0; n < jdata->num_apps; n++) {
orte_app_context_t *context = apps[n];
char ***env = &context->env;
/* setup base environment: copy the current environ and merge
* in the app context environ
*/
if (NULL != context->env) {
*env = opal_environ_merge(orte_launch_environ, context->env);
} else {
*env = opal_argv_copy(orte_launch_environ);
}
/* Try to change to the context cwd and check that the app
* exists and is executable The function will
* take care of outputting a pretty error message, if required
*/
if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(context, true))) {
/* do not ERROR_LOG - it will be reported elsewhere */
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_util_check_context_app(context))) {
/* do not ERROR_LOG - it will be reported elsewhere */
goto cleanup;
}
/* special case handling for --prefix: this is somewhat icky,
* but at least some users do this. :-\ It is possible that
* when using --prefix, the user will also "-x PATH" and/or
* "-x LD_LIBRARY_PATH", which would therefore clobber the
* work that was done in the prior pls to ensure that we have
* the prefix at the beginning of the PATH and
* LD_LIBRARY_PATH. So examine the context->env and see if we
* find PATH or LD_LIBRARY_PATH. If found, that means the
* prior work was clobbered, and we need to re-prefix those
* variables.
*/
for (i = 0; NULL != context->env && NULL != context->env[i]; ++i) {
char *newenv;
/* Reset PATH */
if (0 == strncmp("PATH=", context->env[i], 5)) {
asprintf(&newenv, "%s/%s:%s",
apps[n]->prefix_dir, bin_base, context->env[i] + 5);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tmd: resetting PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("PATH", newenv, true, env);
free(newenv);
}
/* Reset LD_LIBRARY_PATH */
else if (0 == strncmp("LD_LIBRARY_PATH=", context->env[i], 16)) {
asprintf(&newenv, "%s/%s:%s",
apps[n]->prefix_dir, lib_base, context->env[i] + 16);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting LD_LIBRARY_PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("LD_LIBRARY_PATH", newenv, true, env);
free(newenv);
}
}
/* there will be no local daemon */
param = mca_base_param_environ_variable("orte","local_daemon","uri");
opal_unsetenv(param, env);
free(param);
/* pass the hnp's contact info to the local proc */
param = mca_base_param_environ_variable("orte","hnp","uri");
opal_setenv(param, orte_process_info.my_hnp_uri, true, env);
free(param);
/* set the app_context number into the environment */
param = mca_base_param_environ_variable("orte","app","num");
asprintf(&param2, "%ld", (long)context->idx);
opal_setenv(param, param2, true, env);
free(param);
free(param2);
/* set the universe size in the environment */
param = mca_base_param_environ_variable("orte","universe","size");
asprintf(&param2, "%ld", (long)jdata->total_slots_alloc);
opal_setenv(param, param2, true, env);
free(param);
/* although the total_slots_alloc is the universe size, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_UNIVERSE_SIZE", param2, true, env);
free(param2);
/* tell the proc to use the "envd" - i.e., the environment direct
* ESS module to set itself up
*/
param = mca_base_param_environ_variable("ess",NULL,"NULL");
opal_setenv(param, "envd", true, env);
free(param);
/* since we want to pass the name as separate components, make sure
* that the "name" environmental variable is cleared!
*/
if(NULL == (param = mca_base_param_environ_variable("orte","ess","name"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_unsetenv(param, env);
free(param);
/* tell the proc the jobid - same for everyone */
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&param2, jdata->jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if(NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, env);
free(param);
free(param2);
/* tell the proc the #procs in this job */
asprintf(&param2, "%ld", (long) jdata->num_procs);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, param2, true, env);
free(param);
/* although the num_procs is the comm_world size, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_SIZE", param2, true, env);
free(param2);
}
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
connected = true;
/* Figure out the basenames for the libdir and bindir. There is a
lengthy comment about this in plm_rsh_module.c explaining all
the rationale for how / why we're doing this. */
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);
/* setup environment */
env = opal_argv_copy(environ);
/* add our umask -- see big note in orted.c */
current_umask = umask(0);
umask(current_umask);
asprintf(&var, "0%o", current_umask);
opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
free(var);
/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables. We only allow
a single prefix to be specified. Since there will
always be at least one app_context, we take it from
there
*/
if (NULL != apps[0]->prefix_dir) {
char *newenv;
/* Iterate through each of the procs and launch it */
for (v = 0; v < jdata->num_procs; v++) {
orte_proc_t *proc = procs[v];
orte_node_t* node = proc->node;
orte_app_context_t *context = apps[proc->app_idx];
char **environ_copy;
int argc;
for (i = 0; NULL != env && NULL != env[i]; ++i) {
/* Reset PATH */
if (0 == strncmp("PATH=", env[i], 5)) {
asprintf(&newenv, "%s/%s:%s",
apps[0]->prefix_dir, bin_base, env[i] + 5);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("PATH", newenv, true, &env);
free(newenv);
}
/* Reset LD_LIBRARY_PATH */
else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
asprintf(&newenv, "%s/%s:%s",
apps[0]->prefix_dir, lib_base, env[i] + 16);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting LD_LIBRARY_PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
free(newenv);
}
}
}
/* Iterate through each of the nodes and spin
* up a daemon.
*/
for (i = 0; i < map->num_nodes; i++) {
orte_node_t* node = nodes[i];
char* vpid_string;
/* copy the environment */
environ_copy = opal_argv_copy(context->env);
/* if this daemon already exists, don't launch it! */
if (node->daemon_launched) {
continue;
/* pass the proc's vpid */
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&param2, proc->name.vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* setup node name */
free(argv[node_name_index]);
argv[node_name_index] = strdup(node->name);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
/* although the vpid IS the process' rank within the job, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_RANK", param2, true, &environ_copy);
free(param2); /* done with this now */
/* pass the node's launch_id to the process as a "nodeid" so it can
* identify which procs are local to it
*/
asprintf(&param2, "%ld", (long)node->launch_id);
param = mca_base_param_environ_variable("orte","nodeid",NULL);
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);
/* ensure we use the same nodename */
param = mca_base_param_environ_variable("orte", "base", "nodename");
opal_setenv(param, node->name, true, &environ_copy);
free(param);
/* set the local rank */
asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","local_rank"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);
/* setup yield schedule and processor affinity
* We default here to always setting the affinity processor if we want
* it. The processor affinity system then determines
* if processor affinity is enabled/requested - if so, it then uses
* this value to select the process to which the proc is "assigned".
* Otherwise, the paffinity subsystem just ignores this value anyway
*/
param = mca_base_param_environ_variable("mpi", NULL,
"paffinity_processor");
asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);
if (node->oversubscribed) {
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(param, "1", false, &environ_copy);
} else {
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(param, "0", false, &environ_copy);
}
free(param);
asprintf(&param2, "%ld", (long) node->num_procs);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_local_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));
/* setup process name */
rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid);
if (ORTE_SUCCESS != rc) {
opal_output(0, "plm:tm: unable to get daemon vpid as string");
exit(-1);
}
free(argv[proc_vpid_index]);
argv[proc_vpid_index] = strdup(vpid_string);
free(vpid_string);
/* set the argc count */
argc = opal_argv_count(context->argv);
/* exec the daemon */
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: executing:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
}
/* check for timing request - get start time if so */
if (orte_timing) {
if (0 != gettimeofday(&launchstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain start time");
launchstart.tv_sec = 0;
launchstart.tv_usec = 0;
}
}
rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
rc = tm_spawn(argc, context->argv, environ_copy, node->launch_id, tm_task_ids + launched, tm_events + launched);
if (TM_SUCCESS != rc) {
opal_show_help("help-plm-tm.txt", "tm-spawn-failed",
true, argv[0], node->name, node->launch_id);
true, context->argv[0], node->name, node->launch_id);
rc = ORTE_ERROR;
goto cleanup;
}
/* check for timing request - get stop time and process if so */
if (orte_timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain stop time");
} else {
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
(launchstop.tv_usec - launchstart.tv_usec);
avgtime = avgtime + deltat / map->num_new_daemons;
if (deltat < mintime) {
mintime = deltat;
miniter = launched;
}
if (deltat > maxtime) {
maxtime = deltat;
maxiter = launched;
}
}
}
opal_argv_free(environ_copy);
launched++;
/* Allow some progress to occur */
@@ -354,18 +558,9 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm:launch: finished spawning orteds",
"%s plm:tm:launch: finished spawning procs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* check for timing request - get start time for launch completion */
if (orte_timing) {
if (0 != gettimeofday(&completionstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain completion start time");
completionstart.tv_sec = 0;
completionstart.tv_usec = 0;
}
}
/* TM poll for all the spawns */
for (i = 0; i < launched; ++i) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
@@ -375,17 +570,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
goto cleanup;
}
}
/* wait for daemons to callback */
if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: daemon launch failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
goto cleanup;
}
launch_apps:
#if 0
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launch of apps failed for job %s on error %s",
@@ -393,30 +578,19 @@ launch_apps:
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
goto cleanup;
}
#endif
/* if we get here, then everything launched okay - record that fact */
failed_launch = false;
/* check for timing request - get stop time for launch completion and report */
if (orte_timing) {
if (0 != gettimeofday(&completionstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain completion stop time");
} else {
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
(launchstop.tv_usec - launchstart.tv_usec);
opal_output(0, "plm_tmd: launch completion required %d usec", deltat);
}
opal_output(0, "plm_tmd: Launch statistics:");
opal_output(0, "plm_tmd: Average time to launch an orted: %f usec", avgtime);
opal_output(0, "plm_tmd: Max time to launch an orted: %d usec at iter %d", maxtime, maxiter);
opal_output(0, "plm_tmd: Min time to launch an orted: %d usec at iter %d", mintime, miniter);
}
/* setup periodic timer so we check for job completion */
ORTE_TIMER_EVENT(1, check_job_complete);
cleanup:
if (NULL != argv) {
opal_argv_free(argv);
/* can't reuse the events, so may as well get rid of them */
if (NULL != tm_events) {
free(tm_events);
}
if (NULL != env) {
opal_argv_free(env);
}
@@ -424,13 +598,6 @@ launch_apps:
if (connected) {
plm_tmd_disconnect();
}
if (NULL != tm_events) {
free(tm_events);
}
if (NULL != tm_task_ids) {
free(tm_task_ids);
}
if (NULL != lib_base) {
free(lib_base);
}
@@ -443,17 +610,6 @@ launch_apps:
orte_plm_base_launch_failed(jdata->jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
}
/* check for timing request - get stop time and process if so */
if (orte_timing) {
if (0 != gettimeofday(&jobstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain stop time");
} else {
deltat = (jobstop.tv_sec - jobstart.tv_sec)*1000000 +
(jobstop.tv_usec - jobstart.tv_usec);
opal_output(0, "plm_tmd: launch of entire job required %d usec", deltat);
}
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm:launch: finished",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -465,41 +621,119 @@ launch_apps:
static int plm_tmd_terminate_job(orte_jobid_t jobid)
{
int rc;
int i, local_err;
tm_event_t event, *tm_events;
/* order all of the daemons to kill their local procs for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(jobid))) {
tm_events = malloc(sizeof(tm_event_t) * num_tids);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
return rc;
}
return rc;
/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}
/* cycle through the task_id's and kill them */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], SIGTERM, &tm_events[i]))) {
opal_output(0, "CANNOT KILL PROC %d", i);
}
}
/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to kill proc, return status = %d", rc);
}
}
/* disconnect from mom */
plm_tmd_disconnect();
cleanup:
/* free tm_events */
free(tm_events);
return ORTE_SUCCESS;
}
/**
* Terminate the orteds for a given job
*/
int plm_tmd_terminate_orteds(void)
static int plm_tmd_terminate_orteds(void)
{
int rc;
/* now tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) {
ORTE_ERROR_LOG(rc);
orte_job_t *daemons;
int data=1;
/* fake the system into thinking the orteds
* reported their clean termination
*/
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
daemons->num_terminated = daemons->num_procs;
return rc;
/* fire the trigger indicating that the orteds are gone */
write(orteds_exit, &data, sizeof(int));
opal_progress();
return ORTE_SUCCESS;
}
static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal)
{
int rc;
int i, local_err;
tm_event_t event, *tm_events;
/* order them to pass this signal to their local procs */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
tm_events = malloc(sizeof(tm_event_t) * num_tids);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
return rc;
}
return rc;
/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}
/* cycle through the task_id's and kill them */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], signal, &tm_events[i]))) {
opal_output(0, "CANNOT SIGNAL PROC %d", i);
}
}
/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to signal proc, return status = %d", rc);
goto cleanup;
}
}
/* disconnect from mom */
plm_tmd_disconnect();
cleanup:
/* free tm_events */
free(tm_events);
return ORTE_SUCCESS;
}
@@ -515,6 +749,10 @@ static int plm_tmd_finalize(void)
ORTE_ERROR_LOG(rc);
}
if (NULL != tm_task_ids) {
free(tm_task_ids);
}
return ORTE_SUCCESS;
}

View file

@@ -245,6 +245,27 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
*(event) = tmp; \
}while(0); \
/**
* There are places in the code where we just want to periodically
* wakeup to do something, and then go back to sleep again. Setting
* a timer allows us to do this
*/
#define ORTE_TIMER_EVENT(time, cbfunc) \
do { \
struct timeval now; \
opal_event_t *tmp; \
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
opal_evtimer_set(tmp, (cbfunc), tmp); \
now.tv_sec = (time); \
now.tv_usec = 0; \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining timer event: %ld sec", \
(long)now.tv_sec)); \
opal_evtimer_add(tmp, &now); \
}while(0); \
/**
* \internal
*
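The new ORTE_TIMER_EVENT macro added above allocates and arms a one-shot libevent timer, so a callback that wants a true periodic wakeup has to re-register itself, which is exactly what check_job_complete() does in the first file. Below is a minimal illustration of that self-rearming idiom; the callback name is hypothetical, the defining header above is assumed to be included, and the free(arg) line is not in the committed callback but releases the per-invocation event the macro mallocs.

static void periodic_check(int fd, short event, void *arg)
{
    /* ... the periodic work goes here ... */

    /* the macro passed the malloc'ed one-shot event as arg; it has already
     * fired, so it can be released here (the committed code omits this) */
    free(arg);

    /* arm the next wakeup; the diff uses a literal 1 second, with
     * PLM_CHECK_COMPLETE_TIME defined for the same purpose */
    ORTE_TIMER_EVENT(1, periodic_check);
}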

View file

@@ -923,8 +923,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
*/
static void abort_signal_callback(int fd, short flags, void *arg)
{
opal_event_t *event;
/* if we have already ordered this once, or we are already
* aborting the job, don't keep doing it to avoid race conditions
*/
@@ -942,7 +940,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
(which is a Bad Thing), so we can't call it directly.
Instead, we have to exit this handler and setup to call
job_completed() after this. */
ORTE_DETECT_TIMEOUT(&event, 0, 0, 1, abort_exit_callback);
ORTE_TIMER_EVENT(0, abort_exit_callback);
}
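The same macro also covers the one-shot case shown just above: a signal handler that must not run the full cleanup path directly can schedule it with a zero-second delay and let the event loop invoke it after the handler returns. A hedged sketch of that deferral follows; the function names are illustrative, not orterun's.

static void deferred_cleanup(int fd, short flags, void *arg)
{
    /* the heavyweight teardown runs here, outside the signal handler */
    free(arg);   /* release the event allocated by ORTE_TIMER_EVENT */
}

static void on_abort_signal(int fd, short flags, void *arg)
{
    /* defer: deferred_cleanup fires on the next trip through the event loop */
    ORTE_TIMER_EVENT(0, deferred_cleanup);
}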
/**