First cut at direct launch for TM. Able to launch non-ORTE procs and detect their completion for a clean shutdown.
This commit was SVN r17732.
This commit is contained in:
parent 10c2ce7d35
commit 06d3145fe4
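Note: the diff below drives the PBS/Torque TM API directly from the HNP instead of launching orted daemons on the allocated nodes. A minimal sketch of the spawn half of that pattern, using the same TM calls the module relies on (tm_spawn() only queues a request, tm_poll() reaps the completion event); launch_one() and its error handling are illustrative only and are not part of this commit:

#include <stdio.h>
#include "tm.h"    /* PBS/Torque task management API */

/* Spawn one process on a TM node and wait for the spawn event to
 * complete.  tm_spawn() only queues the request; the returned event
 * must be reaped with tm_poll() before the task id is usable.
 */
static int launch_one(int argc, char **argv, char **env,
                      tm_node_id where, tm_task_id *tid)
{
    tm_event_t event, polled;
    int rc, local_err;

    rc = tm_spawn(argc, argv, env, where, tid, &event);
    if (TM_SUCCESS != rc) {
        fprintf(stderr, "tm_spawn failed: %d\n", rc);
        return -1;
    }

    /* block until the outstanding spawn event completes */
    rc = tm_poll(TM_NULL_EVENT, &polled, 1, &local_err);
    if (TM_SUCCESS != rc) {
        fprintf(stderr, "tm_poll failed: rc=%d tm_errno=%d\n", rc, local_err);
        return -1;
    }
    return 0;
}

The module batches all of its tm_spawn() requests first and then polls once per launched task, which is the same idea applied to the whole job.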
@@ -64,8 +64,10 @@
#include "opal/runtime/opal_progress.h"

#include "orte/util/name_fns.h"
#include "orte/util/context_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"

@@ -73,7 +75,8 @@
#include "orte/mca/plm/base/plm_private.h"
#include "plm_tmd.h"


/* define frequency of checking job complete */
#define PLM_CHECK_COMPLETE_TIME 1

/*
* Local functions
@@ -88,6 +91,13 @@ static int plm_tmd_finalize(void);
static int plm_tmd_connect(void);
static int plm_tmd_disconnect(void);

/*
* Local "global" variables
*/
static tm_task_id *tm_task_ids = NULL;
static int num_tids;
static orte_job_t *active_jdata;

/*
* Global variable
*/
@@ -115,6 +125,89 @@ static int plm_tmd_init(void)
}


/* check for job completion */
static void check_job_complete(int fd, short evnt, void *arg)
{
tm_event_t event, *tm_events;
int *exit_codes, i, rc, local_err;
orte_vpid_t v;
orte_proc_t **procs;

opal_output(0, "checking job completion");

tm_events = malloc(sizeof(tm_event_t) * num_tids);
exit_codes = malloc(sizeof(int) * num_tids);

/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}

/* cycle through the task_id's and check for complete */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_obit(tm_task_ids[i], exit_codes + i, tm_events + i))) {
opal_output(0, "CANNOT CHECK PROC %d", i);
}
}
opal_output(0, "obits requested");


/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to poll obit, return status = %d", rc);
}
}
opal_output(0, "obits polled");


/* store the exit codes */
procs = (orte_proc_t**)active_jdata->procs->addr;
for (v=0; v < active_jdata->num_procs; v++) {
procs[v]->exit_code = exit_codes[v];
opal_output(0, "rank %d ecode %d", (int)v, exit_codes[v]);
if (WIFEXITED(exit_codes[v])) {
if (procs[v]->state < ORTE_PROC_STATE_TERMINATED) {
procs[v]->state = ORTE_PROC_STATE_TERMINATED;
active_jdata->num_terminated++;
}
} else {
procs[v]->state = ORTE_PROC_STATE_ABORTED_BY_SIG;
active_jdata->abort = true;
active_jdata->aborted_proc = procs[v];
active_jdata->state = ORTE_JOB_STATE_ABORTED;
}
}

/* disconnect from mom */
plm_tmd_disconnect();

cleanup:
/* free tm_events */
free(tm_events);
free(exit_codes);

/* check for completion */
if (active_jdata->num_terminated >= active_jdata->num_procs) {
active_jdata->state = ORTE_JOB_STATE_TERMINATED;
orte_wakeup(0);
} else if (active_jdata->state == ORTE_JOB_STATE_ABORTED &&
!orte_abnormal_term_ordered && !orte_abort_in_progress) {
orte_errmgr.proc_aborted(&(active_jdata->aborted_proc->name),
active_jdata->aborted_proc->exit_code);
}

/* reset the timer */
ORTE_TIMER_EVENT(1, check_job_complete);
opal_output(0, "timer reset");

}


/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
@@ -124,35 +217,22 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
orte_job_map_t *map = NULL;
orte_app_context_t **apps;
orte_node_t **nodes;
int node_name_index;
int proc_vpid_index;
char *param;
orte_proc_t **procs;
char *param, *param2;
char **env = NULL;
char *var;
char **argv = NULL;
int argc;
int rc;
bool connected = false;
orte_std_cntr_t launched = 0, i;
orte_std_cntr_t launched = 0, i, n;
char *bin_base = NULL, *lib_base = NULL;
tm_event_t *tm_events = NULL;
tm_task_id *tm_task_ids = NULL;
int local_err;
tm_event_t event;
struct timeval launchstart, launchstop, completionstart, completionstop;
struct timeval jobstart, jobstop;
int maxtime=0, mintime=99999999, maxiter = 0, miniter = 0, deltat;
float avgtime=0.0;
bool failed_launch = true;
mode_t current_umask;
orte_vpid_t v;
tm_event_t *tm_events = NULL;

/* record who we are working on */
active_jdata = jdata;


/* check for timing request - get start time if so */
if (orte_timing) {
if (0 != gettimeofday(&jobstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain job start time");
}
}
/* create a jobid for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
ORTE_ERROR_LOG(rc);
@@ -178,175 +258,299 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
}
apps = (orte_app_context_t**)jdata->apps->addr;
nodes = (orte_node_t**)map->nodes->addr;
procs = (orte_proc_t**)jdata->procs->addr;

if (0 == map->num_new_daemons) {
/* have all the daemons we need - launch app */
goto launch_apps;
}

/* Allocate a bunch of TM events to use for tm_spawn()ing */
tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
/* Allocate a bunch of TM events to use for tm_spawn()ing - we will
* need one/process since we are not launching daemons
*/
tm_events = malloc(sizeof(tm_event_t) * jdata->num_procs);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
tm_task_ids = malloc(sizeof(tm_task_id) * jdata->num_procs);
if (NULL == tm_task_ids) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
num_tids = jdata->num_procs;

/* Add basic orted command line options */
orte_plm_base_orted_append_basic_args(&argc, &argv, "env",
&proc_vpid_index,
&node_name_index);

if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: final top-level argv:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
/* Figure out the basenames for the libdir and bindir. There is a
lengthy comment about this in plm_rsh_module.c explaining all
the rationale for how / why we're doing this. */
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);


/* setup the environment for each app_context - we need to insert enviro
* values that tell the procs how to bootstrap themselves
*/
for (n=0; n < jdata->num_apps; n++) {
orte_app_context_t *context = apps[n];
char ***env = &context->env;

/* setup base environment: copy the current environ and merge
* in the app context environ
*/
if (NULL != context->env) {
*env = opal_environ_merge(orte_launch_environ, context->env);
} else {
*env = opal_argv_copy(orte_launch_environ);
}

/* Try to change to the context cwd and check that the app
* exists and is executable The function will
* take care of outputting a pretty error message, if required
*/
if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(context, true))) {
/* do not ERROR_LOG - it will be reported elsewhere */
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_util_check_context_app(context))) {
/* do not ERROR_LOG - it will be reported elsewhere */
goto cleanup;
}

/* special case handling for --prefix: this is somewhat icky,
* but at least some users do this. :-\ It is possible that
* when using --prefix, the user will also "-x PATH" and/or
* "-x LD_LIBRARY_PATH", which would therefore clobber the
* work that was done in the prior pls to ensure that we have
* the prefix at the beginning of the PATH and
* LD_LIBRARY_PATH. So examine the context->env and see if we
* find PATH or LD_LIBRARY_PATH. If found, that means the
* prior work was clobbered, and we need to re-prefix those
* variables.
*/
for (i = 0; NULL != context->env && NULL != context->env[i]; ++i) {
char *newenv;

/* Reset PATH */
if (0 == strncmp("PATH=", context->env[i], 5)) {
asprintf(&newenv, "%s/%s:%s",
apps[n]->prefix_dir, bin_base, context->env[i] + 5);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tmd: resetting PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("PATH", newenv, true, env);
free(newenv);
}

/* Reset LD_LIBRARY_PATH */
else if (0 == strncmp("LD_LIBRARY_PATH=", context->env[i], 16)) {
asprintf(&newenv, "%s/%s:%s",
apps[n]->prefix_dir, lib_base, context->env[i] + 16);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting LD_LIBRARY_PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("LD_LIBRARY_PATH", newenv, true, env);
free(newenv);
}
}

/* there will be no local daemon */
param = mca_base_param_environ_variable("orte","local_daemon","uri");
opal_unsetenv(param, env);
free(param);

/* pass the hnp's contact info to the local proc */
param = mca_base_param_environ_variable("orte","hnp","uri");
opal_setenv(param, orte_process_info.my_hnp_uri, true, env);
free(param);

/* set the app_context number into the environment */
param = mca_base_param_environ_variable("orte","app","num");
asprintf(&param2, "%ld", (long)context->idx);
opal_setenv(param, param2, true, env);
free(param);
free(param2);

/* set the universe size in the environment */
param = mca_base_param_environ_variable("orte","universe","size");
asprintf(&param2, "%ld", (long)jdata->total_slots_alloc);
opal_setenv(param, param2, true, env);
free(param);

/* although the total_slots_alloc is the universe size, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_UNIVERSE_SIZE", param2, true, env);
free(param2);

/* tell the proc to use the "envd" - i.e., the environment direct
* ESS module to set itself up
*/
param = mca_base_param_environ_variable("ess",NULL,"NULL");
opal_setenv(param, "envd", true, env);
free(param);

/* since we want to pass the name as separate components, make sure
* that the "name" environmental variable is cleared!
*/
if(NULL == (param = mca_base_param_environ_variable("orte","ess","name"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_unsetenv(param, env);
free(param);

/* tell the proc the jobid - same for everyone */
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&param2, jdata->jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if(NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, env);
free(param);
free(param2);

/* tell the proc the #procs in this job */
asprintf(&param2, "%ld", (long) jdata->num_procs);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, param2, true, env);
free(param);

/* although the num_procs is the comm_world size, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_SIZE", param2, true, env);
free(param2);

}


rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
connected = true;

/* Figure out the basenames for the libdir and bindir. There is a
lengthy comment about this in plm_rsh_module.c explaining all
the rationale for how / why we're doing this. */
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);

/* setup environment */
env = opal_argv_copy(environ);

/* add our umask -- see big note in orted.c */
current_umask = umask(0);
umask(current_umask);
asprintf(&var, "0%o", current_umask);
opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
free(var);

/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables. We only allow
a single prefix to be specified. Since there will
always be at least one app_context, we take it from
there
*/
if (NULL != apps[0]->prefix_dir) {
char *newenv;
/* Iterate through each of the procs and launch it */
for (v = 0; v < jdata->num_procs; v++) {
orte_proc_t *proc = procs[v];
orte_node_t* node = proc->node;
orte_app_context_t *context = apps[proc->app_idx];
char **environ_copy;
int argc;

for (i = 0; NULL != env && NULL != env[i]; ++i) {
/* Reset PATH */
if (0 == strncmp("PATH=", env[i], 5)) {
asprintf(&newenv, "%s/%s:%s",
apps[0]->prefix_dir, bin_base, env[i] + 5);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("PATH", newenv, true, &env);
free(newenv);
}

/* Reset LD_LIBRARY_PATH */
else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
asprintf(&newenv, "%s/%s:%s",
apps[0]->prefix_dir, lib_base, env[i] + 16);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: resetting LD_LIBRARY_PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
free(newenv);
}
}
}

/* Iterate through each of the nodes and spin
* up a daemon.
*/
for (i = 0; i < map->num_nodes; i++) {
orte_node_t* node = nodes[i];
char* vpid_string;
/* copy the environment */
environ_copy = opal_argv_copy(context->env);

/* if this daemon already exists, don't launch it! */
if (node->daemon_launched) {
continue;
/* pass the proc's vpid */
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&param2, proc->name.vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}

/* setup node name */
free(argv[node_name_index]);
argv[node_name_index] = strdup(node->name);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
/* although the vpid IS the process' rank within the job, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_RANK", param2, true, &environ_copy);
free(param2); /* done with this now */

/* pass the node's launch_id to the process as a "nodeid" so it can
* identify which procs are local to it
*/
asprintf(&param2, "%ld", (long)node->launch_id);
param = mca_base_param_environ_variable("orte","nodeid",NULL);
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);
/* ensure we use the same nodename */
param = mca_base_param_environ_variable("orte", "base", "nodename");
opal_setenv(param, node->name, true, &environ_copy);
free(param);

/* set the local rank */
asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","local_rank"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);

/* setup yield schedule and processor affinity
* We default here to always setting the affinity processor if we want
* it. The processor affinity system then determines
* if processor affinity is enabled/requested - if so, it then uses
* this value to select the process to which the proc is "assigned".
* Otherwise, the paffinity subsystem just ignores this value anyway
*/
param = mca_base_param_environ_variable("mpi", NULL,
"paffinity_processor");
asprintf(&param2, "%lu", (unsigned long) proc->local_rank);
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);

if (node->oversubscribed) {
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(param, "1", false, &environ_copy);
} else {
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(param, "0", false, &environ_copy);
}
free(param);

asprintf(&param2, "%ld", (long) node->num_procs);
if(NULL == (param = mca_base_param_environ_variable("orte","ess","num_local_procs"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);

OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));

/* setup process name */
rc = orte_util_convert_vpid_to_string(&vpid_string, nodes[i]->daemon->name.vpid);
if (ORTE_SUCCESS != rc) {
opal_output(0, "plm:tm: unable to get daemon vpid as string");
exit(-1);
}
free(argv[proc_vpid_index]);
argv[proc_vpid_index] = strdup(vpid_string);
free(vpid_string);
/* set the argc count */
argc = opal_argv_count(context->argv);

/* exec the daemon */
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: executing:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
}

/* check for timing request - get start time if so */
if (orte_timing) {
if (0 != gettimeofday(&launchstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain start time");
launchstart.tv_sec = 0;
launchstart.tv_usec = 0;
}
}

rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
rc = tm_spawn(argc, context->argv, environ_copy, node->launch_id, tm_task_ids + launched, tm_events + launched);
if (TM_SUCCESS != rc) {
opal_show_help("help-plm-tm.txt", "tm-spawn-failed",
true, argv[0], node->name, node->launch_id);
true, context->argv[0], node->name, node->launch_id);
rc = ORTE_ERROR;
goto cleanup;
}

/* check for timing request - get stop time and process if so */
if (orte_timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain stop time");
} else {
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
(launchstop.tv_usec - launchstart.tv_usec);
avgtime = avgtime + deltat / map->num_new_daemons;
if (deltat < mintime) {
mintime = deltat;
miniter = launched;
}
if (deltat > maxtime) {
maxtime = deltat;
maxiter = launched;
}
}
}

opal_argv_free(environ_copy);
launched++;

/* Allow some progress to occur */
@@ -354,18 +558,9 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
}

OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm:launch: finished spawning orteds",
"%s plm:tm:launch: finished spawning procs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

/* check for timing request - get start time for launch completion */
if (orte_timing) {
if (0 != gettimeofday(&completionstart, NULL)) {
opal_output(0, "plm_tmd: could not obtain completion start time");
completionstart.tv_sec = 0;
completionstart.tv_usec = 0;
}
}

/* TM poll for all the spawns */
for (i = 0; i < launched; ++i) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
@@ -375,17 +570,7 @@ static int plm_tmd_launch_job(orte_job_t *jdata)
goto cleanup;
}
}

/* wait for daemons to callback */
if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: daemon launch failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
goto cleanup;
}

launch_apps:
#if 0
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launch of apps failed for job %s on error %s",
@@ -393,30 +578,19 @@ launch_apps:
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
goto cleanup;
}

#endif
/* if we get here, then everything launched okay - record that fact */
failed_launch = false;

/* check for timing request - get stop time for launch completion and report */
if (orte_timing) {
if (0 != gettimeofday(&completionstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain completion stop time");
} else {
deltat = (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
(launchstop.tv_usec - launchstart.tv_usec);
opal_output(0, "plm_tmd: launch completion required %d usec", deltat);
}
opal_output(0, "plm_tmd: Launch statistics:");
opal_output(0, "plm_tmd: Average time to launch an orted: %f usec", avgtime);
opal_output(0, "plm_tmd: Max time to launch an orted: %d usec at iter %d", maxtime, maxiter);
opal_output(0, "plm_tmd: Min time to launch an orted: %d usec at iter %d", mintime, miniter);
}

/* setup periodic timer so we check for job completion */
ORTE_TIMER_EVENT(1, check_job_complete);

cleanup:
if (NULL != argv) {
opal_argv_free(argv);
/* can't reuse the events, so may as well get rid of them */
if (NULL != tm_events) {
free(tm_events);
}

if (NULL != env) {
opal_argv_free(env);
}
@@ -424,13 +598,6 @@ launch_apps:
if (connected) {
plm_tmd_disconnect();
}
if (NULL != tm_events) {
free(tm_events);
}
if (NULL != tm_task_ids) {
free(tm_task_ids);
}

if (NULL != lib_base) {
free(lib_base);
}
@@ -443,17 +610,6 @@ launch_apps:
orte_plm_base_launch_failed(jdata->jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
}

/* check for timing request - get stop time and process if so */
if (orte_timing) {
if (0 != gettimeofday(&jobstop, NULL)) {
opal_output(0, "plm_tmd: could not obtain stop time");
} else {
deltat = (jobstop.tv_sec - jobstart.tv_sec)*1000000 +
(jobstop.tv_usec - jobstart.tv_usec);
opal_output(0, "plm_tmd: launch of entire job required %d usec", deltat);
}
}

OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm:launch: finished",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@@ -465,41 +621,119 @@ launch_apps:
static int plm_tmd_terminate_job(orte_jobid_t jobid)
{
int rc;
int i, local_err;
tm_event_t event, *tm_events;

/* order all of the daemons to kill their local procs for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(jobid))) {
tm_events = malloc(sizeof(tm_event_t) * num_tids);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
return rc;
}

return rc;
/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}

/* cycle through the task_id's and kill them */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], SIGTERM, &tm_events[i]))) {
opal_output(0, "CANNOT KILL PROC %d", i);
}
}

/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to kill proc, return status = %d", rc);
}
}

/* disconnect from mom */
plm_tmd_disconnect();

cleanup:
/* free tm_events */
free(tm_events);

return ORTE_SUCCESS;
}


/**
* Terminate the orteds for a given job
*/
int plm_tmd_terminate_orteds(void)
static int plm_tmd_terminate_orteds(void)
{
int rc;

/* now tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit())) {
ORTE_ERROR_LOG(rc);
orte_job_t *daemons;
int data=1;

/* fake the system into thinking the orteds
* reported their clean termination
*/
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
daemons->num_terminated = daemons->num_procs;

return rc;
/* fire the trigger indicating that the orteds are gone */
write(orteds_exit, &data, sizeof(int));
opal_progress();

return ORTE_SUCCESS;
}

static int plm_tmd_signal_job(orte_jobid_t jobid, int32_t signal)
{
int rc;
int i, local_err;
tm_event_t event, *tm_events;

/* order them to pass this signal to their local procs */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
tm_events = malloc(sizeof(tm_event_t) * num_tids);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
return rc;
}

return rc;
/* connect to the mom */
rc = plm_tmd_connect();
if (ORTE_SUCCESS != rc) {
opal_output(0, "CANNOT CONNECT TO MOM");
goto cleanup;
}

/* cycle through the task_id's and kill them */
for (i=0; i < num_tids; i++) {
if (ORTE_SUCCESS != (rc = tm_kill(tm_task_ids[i], signal, &tm_events[i]))) {
opal_output(0, "CANNOT SIGNAL PROC %d", i);
}
}

/* cycle through and poll the events */
for (i=0; i < num_tids; i++) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "plm:tmd: failed to signal proc, return status = %d", rc);
goto cleanup;
}
}

/* disconnect from mom */
plm_tmd_disconnect();

cleanup:
/* free tm_events */
free(tm_events);

return ORTE_SUCCESS;
}


@@ -515,6 +749,10 @@ static int plm_tmd_finalize(void)
ORTE_ERROR_LOG(rc);
}

if (NULL != tm_task_ids) {
free(tm_task_ids);
}

return ORTE_SUCCESS;
}

@@ -245,6 +245,27 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
*(event) = tmp; \
}while(0); \


/**
* There are places in the code where we just want to periodically
* wakeup to do something, and then go back to sleep again. Setting
* a timer allows us to do this
*/
#define ORTE_TIMER_EVENT(time, cbfunc) \
do { \
struct timeval now; \
opal_event_t *tmp; \
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
opal_evtimer_set(tmp, (cbfunc), tmp); \
now.tv_sec = (time); \
now.tv_usec = 0; \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining timer event: %ld sec", \
(long)now.tv_sec)); \
opal_evtimer_add(tmp, &now); \
}while(0); \


/**
* \internal
*
@@ -923,8 +923,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
*/
static void abort_signal_callback(int fd, short flags, void *arg)
{
opal_event_t *event;

/* if we have already ordered this once, or we are already
* aborting the job, don't keep doing it to avoid race conditions
*/
@@ -942,7 +940,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
(which is a Bad Thing), so we can't call it directly.
Instead, we have to exit this handler and setup to call
job_completed() after this. */
ORTE_DETECT_TIMEOUT(&event, 0, 0, 1, abort_exit_callback);
ORTE_TIMER_EVENT(0, abort_exit_callback);
}

/**
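Note: the completion-detection half (check_job_complete() in the diff above) uses the same request/poll pattern, but with obituaries instead of spawns. A minimal standalone sketch, assuming the task ids were filled in by tm_spawn(); wait_for_exits() is a hypothetical helper, and unlike this sketch the module does not block in a loop but re-arms ORTE_TIMER_EVENT and classifies each exit code with WIFEXITED():

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include "tm.h"    /* PBS/Torque task management API */

/* Request an obituary for every spawned task, then reap one event per
 * successful request.  exit_codes[i] is only valid once its obit event
 * has been polled; WIFEXITED() mirrors how the module classifies exits.
 */
static int wait_for_exits(tm_task_id *tids, int num_tids, int *exit_codes)
{
    tm_event_t *events, polled;
    int i, rc, local_err, requested = 0, failures = 0;

    events = (tm_event_t*)malloc(sizeof(tm_event_t) * num_tids);
    if (NULL == events) {
        return -1;
    }

    /* queue an obit request per task id */
    for (i = 0; i < num_tids; i++) {
        exit_codes[i] = 0;
        if (TM_SUCCESS != tm_obit(tids[i], &exit_codes[i], &events[i])) {
            fprintf(stderr, "tm_obit failed for task %d\n", i);
            failures++;
        } else {
            requested++;
        }
    }

    /* reap one completion per outstanding request */
    for (i = 0; i < requested; i++) {
        rc = tm_poll(TM_NULL_EVENT, &polled, 1, &local_err);
        if (TM_SUCCESS != rc) {
            fprintf(stderr, "tm_poll failed: rc=%d tm_errno=%d\n", rc, local_err);
            failures++;
        }
    }

    /* anything not exiting normally corresponds to the ABORTED_BY_SIG path */
    for (i = 0; i < num_tids; i++) {
        if (!WIFEXITED(exit_codes[i])) {
            failures++;
        }
    }

    free(events);
    return failures;
}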