1
1
For some time, ORTE has had the ability to launch daemons on all nodes prior to launching an application. It has largely been used outside of the OMPI community, and so was never explicitly turned "on" inside OMPI releases. Nevertheless, the code has been there.

Allowing VM launches does not require ANY changes to existing PLM components. All that is required is to have orterun launch the daemons via a separate call to orte_plm.spawn -prior- to launching the applications. The rest of the VM support code resides in the rmaps framework:

(a) a check when asked to map a job to see if it is the daemon job, and

(b) a separate "setup_virtual_machine" mapper in the rmaps base that creates the required map so the PLMs will do the right thing.

To support users who have no RM allocation but prefer to specify the allocation via a -host or -hostfile argument to their application, there is a little more code in orterun and the setup_virtual_machine mapper to capture information passed in that manner.

This has been tested in rsh and slurm environments and, since there is nothing environment-specific in the implementation, it should work in others as well - but that still needs to be proven.

This commit was SVN r24524.
Этот коммит содержится в:
Ralph Castain 2011-03-12 22:50:53 +00:00
родитель 80265b472e
Коммит dc6f616599
4 изменённых файлов: 127 добавлений и 18 удалений

Просмотреть файл

@ -138,6 +138,7 @@ static int rte_init(void)
orte_job_t *jdata;
orte_node_t *node;
orte_proc_t *proc;
orte_app_context_t *app;
int value;
/* run the prolog */
@ -474,6 +475,11 @@ static int rte_init(void)
jdata->jobid = ORTE_PROC_MY_NAME->jobid;
opal_pointer_array_set_item(orte_job_data, 0, jdata);
/* every job requires at least one app */
app = OBJ_NEW(orte_app_context_t);
opal_pointer_array_set_item(jdata->apps, 0, app);
jdata->num_apps++;
/* create and store a node object where we are */
node = OBJ_NEW(orte_node_t);
node->name = strdup(orte_process_info.nodename);

Просмотреть файл

@ -813,6 +813,7 @@ int orte_rmaps_base_define_daemons(orte_job_t *jdata)
int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
{
orte_job_t *jdat;
orte_node_t *node;
orte_proc_t *proc;
orte_job_map_t *map;
@ -820,8 +821,9 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
opal_list_item_t *item;
orte_app_context_t *app;
orte_std_cntr_t num_slots;
int rc;
int rc, i, n;
bool ignored;
/* get the daemon app if provided - may include -host or hostfile
* info about available nodes
*/
@ -839,9 +841,50 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
OBJ_DESTRUCT(&node_list);
return rc;
}
/* check all other known jobs to see if they have something to
* add to the allocation - we won't have seen these and the
* daemon job won't have any in its app
*/
for (i=0; i < orte_job_data->size; i++) {
if (NULL == (jdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
continue;
}
for (n=0; n < jdat->apps->size; n++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdat->apps, n))) {
continue;
}
if (NULL != app->hostfile) {
/* hostfile was specified - parse it and add it to the list. The
* function automatically ignores duplicates
*/
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&node_list,
&ignored,
app->hostfile))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&node_list);
return rc;
}
}
if (NULL != app->dash_host) {
/* parse and add to list, ignoring duplicates */
if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&node_list,
&ignored,
app->dash_host))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&node_list);
return rc;
}
}
}
}
/* add all these nodes to the map */
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
/* if this is my node, ignore it - we are already here */
if (0 == strcmp(node->name, orte_process_info.nodename)) {
continue;
}
opal_pointer_array_add(map->nodes, (void*)node);
++(map->num_nodes);
/* if this node already has a daemon, release that object

Просмотреть файл

@ -80,6 +80,7 @@
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -429,6 +430,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_INT,
"Max number of times to restart a failed process" },
{ "orte", "vm", "launch", '\0', "vm", "vm", 0,
&orterun_globals.launch_vm, OPAL_CMD_LINE_TYPE_BOOL,
"Launch daemons on all nodes at start to create a virtual machine [Default = false]" },
#if OPAL_ENABLE_CRDEBUG == 1
{ "opal", "cr", "enable_crdebug", '\0', "crdebug", "crdebug", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
@ -462,6 +467,7 @@ int orterun(int argc, char *argv[])
opal_cmd_line_t cmd_line;
char * tmp_env_var = NULL;
orte_debugger_breakpoint_fn_t foo;
orte_job_t *daemons;
/* find our basename (the name of the executable) so that we can
use it in pretty-print error messages */
@ -472,7 +478,7 @@ int orterun(int argc, char *argv[])
opal_cmd_line_create(&cmd_line, cmd_line_init);
mca_base_cmd_line_setup(&cmd_line);
if (ORTE_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true,
argc, argv)) ) {
argc, argv)) ) {
return rc;
}
@ -585,7 +591,7 @@ int orterun(int argc, char *argv[])
if (0 == jdata->num_apps) {
/* This should never happen -- this case should be caught in
create_app(), but let's just double check... */
create_app(), but let's just double check... */
orte_show_help("help-orterun.txt", "orterun:nothing-to-do",
true, orte_basename);
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
@ -655,23 +661,23 @@ int orterun(int argc, char *argv[])
}
/* Change the default behavior of libevent such that we want to
continually block rather than blocking for the default timeout
and then looping around the progress engine again. There
should be nothing in the orted that cannot block in libevent
until "something" happens (i.e., there's no need to keep
cycling through progress because the only things that should
happen will happen in libevent). This is a minor optimization,
but what the heck... :-) */
continually block rather than blocking for the default timeout
and then looping around the progress engine again. There
should be nothing in the orted that cannot block in libevent
until "something" happens (i.e., there's no need to keep
cycling through progress because the only things that should
happen will happen in libevent). This is a minor optimization,
but what the heck... :-) */
opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables in our copy. This
will ensure that any locally-spawned children will
have our executables and libraries in their path
LD_LIBRARY_PATH environment variables in our copy. This
will ensure that any locally-spawned children will
have our executables and libraries in their path
For now, default to the prefix_dir provided in the first app_context.
Since there always MUST be at least one app_context, we are safe in
doing this.
For now, default to the prefix_dir provided in the first app_context.
Since there always MUST be at least one app_context, we are safe in
doing this.
*/
if (NULL != ((orte_app_context_t*)jdata->apps->addr[0])->prefix_dir) {
char *oldenv, *newenv, *lib_base, *bin_base;
@ -778,6 +784,58 @@ int orterun(int argc, char *argv[])
}
}
/* if we are launching the vm, now is the time to do so */
if (orterun_globals.launch_vm) {
int32_t ljob, i;
orte_app_context_t *app;
/* we may need to look at the apps for the user's job
* to get our full list of nodes, so prep the job for
* launch. This duplicates some code in orte_plm_base_setup_job
* that won't run if we do this here - eventually, we'll want
* to refactor the plm_base routine to avoid the duplication
*/
/* get a jobid for it */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) {
ORTE_ERROR_LOG(rc);
goto DONE;
}
/* store it on the global job data pool - this is the key
* step required before we launch the daemons. It allows
* the orte_rmaps_base_setup_virtual_machine routine to
* search all apps for any hosts to be used by the vm
*/
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* set the job state */
jdata->state = ORTE_JOB_STATE_INIT;
/* if job recovery is not defined, set it to default */
if (!jdata->recovery_defined) {
/* set to system default */
jdata->enable_recovery = orte_enable_recovery;
}
/* if app recovery is not defined, set apps to defaults */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (!app->recovery_defined) {
app->max_restarts = orte_max_restarts;
}
}
/* get the daemon job object */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
/* launch the daemons */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(daemons))) {
fprintf(stderr, "%s: UNABLE TO LAUNCH VIRTUAL MACHINE\n", orte_basename);
goto DONE;
}
/* ensure all future jobs use the VM */
orte_default_mapping_policy |= ORTE_MAPPING_USE_VM;
}
/* setup for debugging */
orte_debugger.init_before_spawn(jdata);
@ -793,7 +851,7 @@ int orterun(int argc, char *argv[])
/* we only reach this point by jumping there due
* to an error - so just cleanup and leave
*/
DONE:
DONE:
ORTE_UPDATE_EXIT_STATUS(orte_exit_status);
orte_quit();
@ -816,6 +874,7 @@ static int init_globals(void)
orterun_globals.report_pid = NULL;
orterun_globals.report_uri = NULL;
orterun_globals.disable_recovery = false;
orterun_globals.launch_vm = false;
}
/* Reset the other fields every time */

Просмотреть файл

@ -67,6 +67,7 @@ struct orterun_globals_t {
char *sstore_load;
#endif
bool disable_recovery;
bool launch_vm;
};
/**