1
1

Add a binomial tree-based launch to ssh, turned "on" only when the plm_rsh_tree_spawned mca param is set to a non-zero value. This probably isn't a very optimized capability, but it does execute a tree-based launch that may scale better than linear at high node counts.

Add the daemon map capability to the ODLS to create and save a map of daemon vpid vs nodename from the launch message.

Cleanup a few places in the base plm launch support where we didn't adequately protect rml recv's from potentially executing sends.

This commit was SVN r18143.
Этот коммит содержится в:
Ralph Castain 2008-04-14 18:26:08 +00:00
родитель 0f311ed824
Коммит 7c7304466c
21 изменённых файлов: 1272 добавлений и 394 удалений

Просмотреть файл

@ -62,6 +62,21 @@ int orte_ess_base_orted_setup(void)
char *error = NULL;
char *jobid_str, *procid_str;
/* some environments allow remote launches - e.g., ssh - so
* open the PLM and select something
*/
if (ORTE_SUCCESS != (ret = orte_plm_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_select";
goto error;
}
/* Setup the communication infrastructure */
/* Runtime Messaging Layer */
@ -263,9 +278,8 @@ int orte_ess_base_orted_finalize(void)
orte_wait_finalize();
orte_iof_base_close();
/* finalize selected modules so they can de-register
* any receives
*/
/* finalize selected modules */
orte_plm_base_close();
orte_errmgr_base_close();
/* now can close the rml and its friendly group comm */

Просмотреть файл

@ -24,6 +24,9 @@
#include "opal/util/trace.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
@ -32,6 +35,9 @@
int orte_odls_base_close(void)
{
int i;
char **nodes;
OPAL_TRACE(5);
/* cleanup globals */
@ -40,6 +46,14 @@ int orte_odls_base_close(void)
OBJ_DESTRUCT(&orte_odls_globals.children);
OBJ_DESTRUCT(&orte_odls_globals.jobs);
nodes = (char**)orte_daemonmap.addr;
for (i=0; i < orte_daemonmap.size; i++) {
if (NULL != nodes[i]) {
free(nodes[i]);
}
}
OBJ_DESTRUCT(&orte_daemonmap);
/* if no components are available, then punt */
if (!orte_odls_base.components_available) {
return ORTE_SUCCESS;

Просмотреть файл

@ -37,6 +37,7 @@
#include "opal/util/num_procs.h"
#include "opal/util/sys_limits.h"
#include "opal/util/show_help.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
@ -66,6 +67,7 @@
#include "orte/mca/odls/base/odls_private.h"
/* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN
* THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW
*/
@ -80,10 +82,11 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
orte_std_cntr_t i;
orte_vpid_t j;
orte_vpid_t invalid_vpid=ORTE_VPID_INVALID;
char *nodename;
opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr;
int32_t numbytes;
/* get wireup info for daemons per the selected routing module */
wireup = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(ORTE_PROC_MY_NAME->jobid, wireup))) {
@ -195,6 +198,27 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc;
}
/* if we are not keeping FQDN hostnames, abbreviate
* the nodename as required
*/
if (!orte_keep_fqdn_hostnames) {
char *ptr;
nodename = strdup(node->name);
if (NULL != (ptr = strchr(nodename, '.'))) {
*ptr = '\0';
}
} else {
nodename = strdup(node->name);
}
/* pack the nodename so that all daemons know where this one is located */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &nodename, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
free(nodename);
return rc;
}
free(nodename);
/* pack the number of procs on this node */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &node->num_procs, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
@ -275,7 +299,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_odls_child_t *child;
orte_std_cntr_t cnt, j, num_nodes, app_idx;
orte_process_name_t proc, daemon;
char *slot_str;
char *slot_str, *nodename;
bool node_oversubscribed;
orte_odls_job_t *jobdat;
opal_buffer_t wireup;
@ -400,6 +424,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
return rc;
}
/* set the size of the daemonmap to minimize realloc's */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_daemonmap, num_nodes))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* setup the proc and daemon names */
proc.jobid = *job;
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
@ -413,6 +443,21 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
return rc;
}
/* unpack the name of the node so we know where this daemon is located */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &nodename, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* is this daemon already known to us? */
if (NULL == opal_pointer_array_get_item(&orte_daemonmap, daemon.vpid)) {
/* record it */
if (ORTE_SUCCESS != (opal_pointer_array_set_item(&orte_daemonmap, daemon.vpid, strdup(nodename)))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* if daemon participation is sparse, add this daemon to the
* list of those participating
*/
@ -2148,4 +2193,3 @@ CLEANUP:
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
return rc;
}

Просмотреть файл

@ -27,6 +27,7 @@
#include "opal/util/trace.h"
#include "opal/util/argv.h"
#include "opal/class/opal_value_array.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
@ -51,7 +52,6 @@
*/
orte_odls_base_module_t orte_odls;
/* instance the child list object */
static void orte_odls_child_constructor(orte_odls_child_t *ptr)
{
@ -125,6 +125,10 @@ int orte_odls_base_open(void)
OBJ_CONSTRUCT(&orte_odls_globals.children, opal_list_t);
OBJ_CONSTRUCT(&orte_odls_globals.jobs, opal_list_t);
/* initialize and setup the daemonmap */
OBJ_CONSTRUCT(&orte_daemonmap, opal_pointer_array_t);
opal_pointer_array_init(&orte_daemonmap, 8, INT32_MAX, 8);
/* Open up all available components */
if (ORTE_SUCCESS !=

Просмотреть файл

@ -28,6 +28,7 @@
#include "orte/types.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
@ -91,7 +92,8 @@ typedef struct {
} orte_odls_globals_t;
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
/*
* Default functions that are common to most environments - can
* be overridden by specific environments if they need something
@ -164,6 +166,11 @@ ORTE_DECLSPEC int orte_odls_base_preload_files_app_context(orte_app_context_t* c
*/
ORTE_DECLSPEC int orte_odls_base_default_collect_data(orte_process_name_t *proc, opal_buffer_t *buf);
/*
* Retrive the daemon map
*/
ORTE_DECLSPEC opal_pointer_array_t* orte_odls_base_get_daemon_map(void);
END_C_DECLS
#endif

Просмотреть файл

@ -35,28 +35,29 @@ typedef uint8_t orte_daemon_cmd_flag_t;
#define ORTE_DAEMON_CONTACT_QUERY_CMD (orte_daemon_cmd_flag_t) 1
#define ORTE_DAEMON_KILL_LOCAL_PROCS (orte_daemon_cmd_flag_t) 2
#define ORTE_DAEMON_SIGNAL_LOCAL_PROCS (orte_daemon_cmd_flag_t) 3
#define ORTE_DAEMON_ADD_LOCAL_PROCS (orte_daemon_cmd_flag_t) 4
#define ORTE_DAEMON_HEARTBEAT_CMD (orte_daemon_cmd_flag_t) 5
#define ORTE_DAEMON_EXIT_CMD (orte_daemon_cmd_flag_t) 6
#define ORTE_DAEMON_PROCESS_AND_RELAY_CMD (orte_daemon_cmd_flag_t) 7
#define ORTE_DAEMON_MESSAGE_LOCAL_PROCS (orte_daemon_cmd_flag_t) 8
#define ORTE_DAEMON_NULL_CMD (orte_daemon_cmd_flag_t) 9
#define ORTE_DAEMON_SYNC_BY_PROC (orte_daemon_cmd_flag_t) 10
#define ORTE_DAEMON_ADD_LOCAL_PROC (orte_daemon_cmd_flag_t) 4
#define ORTE_DAEMON_ADD_AND_SPAWN (orte_daemon_cmd_flag_t) 5
#define ORTE_DAEMON_HEARTBEAT_CMD (orte_daemon_cmd_flag_t) 6
#define ORTE_DAEMON_EXIT_CMD (orte_daemon_cmd_flag_t) 7
#define ORTE_DAEMON_PROCESS_AND_RELAY_CMD (orte_daemon_cmd_flag_t) 8
#define ORTE_DAEMON_MESSAGE_LOCAL_PROCS (orte_daemon_cmd_flag_t) 9
#define ORTE_DAEMON_NULL_CMD (orte_daemon_cmd_flag_t) 10
#define ORTE_DAEMON_SYNC_BY_PROC (orte_daemon_cmd_flag_t) 11
/* commands for use by tools */
#define ORTE_DAEMON_REPORT_JOB_INFO_CMD (orte_daemon_cmd_flag_t) 11
#define ORTE_DAEMON_REPORT_NODE_INFO_CMD (orte_daemon_cmd_flag_t) 12
#define ORTE_DAEMON_REPORT_PROC_INFO_CMD (orte_daemon_cmd_flag_t) 13
#define ORTE_DAEMON_ATTACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 14
#define ORTE_DAEMON_ATTACH_STDERR_CMD (orte_daemon_cmd_flag_t) 15
#define ORTE_DAEMON_DETACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 16
#define ORTE_DAEMON_DETACH_STDERR_CMD (orte_daemon_cmd_flag_t) 17
#define ORTE_DAEMON_SPAWN_JOB_CMD (orte_daemon_cmd_flag_t) 18
#define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 19
#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 20
#define ORTE_DAEMON_REPORT_JOB_INFO_CMD (orte_daemon_cmd_flag_t) 12
#define ORTE_DAEMON_REPORT_NODE_INFO_CMD (orte_daemon_cmd_flag_t) 13
#define ORTE_DAEMON_REPORT_PROC_INFO_CMD (orte_daemon_cmd_flag_t) 14
#define ORTE_DAEMON_ATTACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 15
#define ORTE_DAEMON_ATTACH_STDERR_CMD (orte_daemon_cmd_flag_t) 16
#define ORTE_DAEMON_DETACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 17
#define ORTE_DAEMON_DETACH_STDERR_CMD (orte_daemon_cmd_flag_t) 18
#define ORTE_DAEMON_SPAWN_JOB_CMD (orte_daemon_cmd_flag_t) 19
#define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 20
#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 21
/* collective-based cmds */
#define ORTE_DAEMON_COLL_CMD (orte_daemon_cmd_flag_t) 21
#define ORTE_DAEMON_COLL_CMD (orte_daemon_cmd_flag_t) 22
END_C_DECLS

Просмотреть файл

@ -235,18 +235,17 @@ WAKEUP:
orte_wakeup();
}
static void orted_report_launch(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata)
static void process_orted_launch_report(int fd, short event, void *data)
{
orte_message_event_t *mev = (orte_message_event_t*)data;
opal_buffer_t *buffer = mev->buffer;
char *rml_uri;
int rc, idx;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_report_launch from daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
ORTE_NAME_PRINT(&mev->sender)));
/* unpack its contact info */
idx = 1;
@ -264,31 +263,23 @@ static void orted_report_launch(int status, orte_process_name_t* sender,
goto CLEANUP;
}
/* lookup and record this daemon's contact info */
pdatorted[sender->vpid]->rml_uri = strdup(rml_uri);
pdatorted[mev->sender.vpid]->rml_uri = strdup(rml_uri);
free(rml_uri);
/* set the route to be direct */
if (ORTE_SUCCESS != (rc = orte_routed.update_route(sender, sender))) {
if (ORTE_SUCCESS != (rc = orte_routed.update_route(&mev->sender, &mev->sender))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
/* reissue the recv */
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
ORTE_RML_NON_PERSISTENT, orted_report_launch, NULL);
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
}
CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_report_launch %s for daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orted_failed_launch ? "failed" : "completed",
ORTE_NAME_PRINT(sender)));
ORTE_NAME_PRINT(&mev->sender)));
if (orted_failed_launch) {
orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, jdatorted->aborted_proc->exit_code);
@ -298,6 +289,33 @@ CLEANUP:
}
static void orted_report_launch(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata)
{
int rc;
/* don't process this right away - we need to get out of the recv before
* we process the message as it may ask us to do something that involves
* more messaging! Instead, setup an event so that the message gets processed
* as soon as we leave the recv.
*
* The macro makes a copy of the buffer, which we release when processed - the incoming
* buffer, however, is NOT released here, although its payload IS transferred
* to the message buffer for later processing
*/
ORTE_MESSAGE_EVENT(sender, buffer, tag, process_orted_launch_report);
/* reissue the recv */
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
ORTE_RML_NON_PERSISTENT, orted_report_launch, NULL);
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
}
}
int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
{
int rc;
@ -324,6 +342,12 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
ORTE_PROGRESSED_WAIT(orted_failed_launch, orted_num_callback, num_daemons);
/* cancel the lingering recv */
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:daemon_callback completed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -520,6 +544,12 @@ static int orte_plm_base_report_launched(orte_jobid_t job)
ORTE_PROGRESSED_WAIT(app_launch_failed, jdata->num_launched, jdata->num_procs);
/* cancel the lingering recv */
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_APP_LAUNCH_CALLBACK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:report_launched all apps reported",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -548,6 +578,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
int i, cnt;
orte_job_t *jdata;
char *rml_uri;
unsigned long num_procs;
/* check for debug flags */
if (orte_debug_flag) {
@ -584,15 +615,24 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
}
/* pass the total number of daemons that will be in the system */
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (orte_process_info.hnp) {
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
num_procs = jdata->num_procs;
} else {
num_procs = orte_process_info.num_procs;
}
opal_argv_append(argc, argv, "-mca");
opal_argv_append(argc, argv, "orte_ess_num_procs");
asprintf(&param, "%lu", (unsigned long)(jdata->num_procs));
asprintf(&param, "%lu", num_procs);
opal_argv_append(argc, argv, param);
free(param);
/* pass the uri of the hnp */
rml_uri = orte_rml.get_contact_info();
if (orte_process_info.hnp) {
rml_uri = orte_rml.get_contact_info();
} else {
rml_uri = orte_process_info.my_hnp_uri;
}
asprintf(&param, "\"%s\"", rml_uri);
opal_argv_append(argc, argv, "--hnp-uri");
opal_argv_append(argc, argv, param);
@ -609,21 +649,23 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
* being sure to "purge" any that would cause problems
* on backend nodes
*/
cnt = opal_argv_count(orted_cmd_line);
for (i=0; i < cnt; i+=3) {
/* if the specified option is more than one word, we don't
* have a generic way of passing it as some environments ignore
* any quotes we add, while others don't - so we ignore any
* such options. In most cases, this won't be a problem as
* they typically only apply to things of interest to the HNP
*/
if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
continue;
if (orte_process_info.hnp) {
cnt = opal_argv_count(orted_cmd_line);
for (i=0; i < cnt; i+=3) {
/* if the specified option is more than one word, we don't
* have a generic way of passing it as some environments ignore
* any quotes we add, while others don't - so we ignore any
* such options. In most cases, this won't be a problem as
* they typically only apply to things of interest to the HNP
*/
if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
continue;
}
/* must be okay - pass it along */
opal_argv_append(argc, argv, orted_cmd_line[i]);
opal_argv_append(argc, argv, orted_cmd_line[i+1]);
opal_argv_append(argc, argv, orted_cmd_line[i+2]);
}
/* must be okay - pass it along */
opal_argv_append(argc, argv, orted_cmd_line[i]);
opal_argv_append(argc, argv, orted_cmd_line[i+1]);
opal_argv_append(argc, argv, orted_cmd_line[i+2]);
}
/*

Просмотреть файл

@ -58,6 +58,7 @@ orte_plm_base_module_t orte_plm = {
orte_plm_proxy_init,
NULL, /* cannot set hnp name in a proxy */
orte_plm_proxy_spawn,
NULL, /* cannot remotely spawn by default */
NULL, /* cannot terminate job from a proxy */
NULL, /* cannot terminate orteds from a proxy */
NULL, /* cannot signal job from a proxy */

Просмотреть файл

@ -78,8 +78,6 @@ int orte_plm_base_comm_start(void)
int orte_plm_base_comm_stop(void)
{
int rc;
if (!recv_issued) {
return ORTE_SUCCESS;
}
@ -88,12 +86,10 @@ int orte_plm_base_comm_stop(void)
"%s plm:base:receive stop comm",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLM))) {
ORTE_ERROR_LOG(rc);
}
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLM);
recv_issued = false;
return rc;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -34,6 +34,8 @@
#include "orte/types.h"
#include "opal/mca/mca.h"
#include "opal/dss/dss_types.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/runtime/orte_globals.h"
@ -55,6 +57,11 @@ typedef int (*orte_plm_base_module_init_fn_t)(void);
*/
typedef int (*orte_plm_base_module_spawn_fn_t)(orte_job_t *jdata);
/*
* Remote spawn - spawn called by a daemon to launch a process on its own
*/
typedef int (*orte_plm_base_module_remote_spawn_fn_t)(opal_buffer_t *launch);
/*
* Entry point to set the HNP name
*/
@ -90,6 +97,7 @@ struct orte_plm_base_module_1_0_0_t {
orte_plm_base_module_init_fn_t init;
orte_plm_base_module_set_hnp_name_fn_t set_hnp_name;
orte_plm_base_module_spawn_fn_t spawn;
orte_plm_base_module_remote_spawn_fn_t remote_spawn;
orte_plm_base_module_terminate_job_fn_t terminate_job;
orte_plm_base_module_terminate_orteds_fn_t terminate_orteds;
orte_plm_base_module_signal_job_fn_t signal_job;

Просмотреть файл

@ -72,6 +72,8 @@ struct orte_plm_rsh_component_t {
int agent_argc;
char* agent_path;
char* orted;
bool tree_spawn;
opal_list_t children;
orte_std_cntr_t num_children;
orte_std_cntr_t num_concurrent;
opal_mutex_t lock;

Просмотреть файл

@ -117,6 +117,7 @@ int orte_plm_rsh_component_open(void)
mca_plm_rsh_component.agent_argv = NULL;
mca_plm_rsh_component.agent_argc = 0;
mca_plm_rsh_component.agent_path = NULL;
OBJ_CONSTRUCT(&mca_plm_rsh_component.children, opal_list_t);
/* lookup parameters */
mca_base_param_reg_int(c, "num_concurrent",
@ -157,6 +158,11 @@ int orte_plm_rsh_component_open(void)
false, false, "ssh : rsh",
&mca_plm_rsh_component.agent_param);
mca_base_param_reg_int(c, "tree_spawn",
"If set to 1, launch via a tree-based topology",
false, false, (int)false, &tmp);
mca_plm_rsh_component.tree_spawn = OPAL_INT_TO_BOOL(tmp);
return ORTE_SUCCESS;
}
@ -229,6 +235,7 @@ int orte_plm_rsh_component_close(void)
/* cleanup state */
OBJ_DESTRUCT(&mca_plm_rsh_component.lock);
OBJ_DESTRUCT(&mca_plm_rsh_component.cond);
OBJ_DESTRUCT(&mca_plm_rsh_component.children);
if (NULL != mca_plm_rsh_component.orted) {
free(mca_plm_rsh_component.orted);
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -93,6 +93,7 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
plm_slurm_init,
orte_plm_base_set_hnp_name,
plm_slurm_launch_job,
NULL,
plm_slurm_terminate_job,
plm_slurm_terminate_orteds,
plm_slurm_signal_job,

Просмотреть файл

@ -79,25 +79,28 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
OPAL_OUTPUT_VERBOSE((5, orte_rml_base_output,
"%s rml:base:update:contact:info got uri %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
rml_uri));
NULL == rml_uri ? "NULL" : rml_uri));
/* set the contact info into the hash table */
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
ORTE_ERROR_LOG(rc);
if (NULL != rml_uri) {
/* set the contact info into the hash table */
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
ORTE_ERROR_LOG(rc);
free(rml_uri);
return(rc);
}
/* extract the proc's name */
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) {
ORTE_ERROR_LOG(rc);
free(rml_uri);
return rc;
}
free(rml_uri);
return(rc);
/* update the route - in this case, always set it to direct routing
* since we were given the contact info
*/
orte_routed.update_route(&name, &name);
}
/* extract the proc's name */
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) {
ORTE_ERROR_LOG(rc);
free(rml_uri);
return rc;
}
free(rml_uri);
/* update the route - in this case, always set it to direct routing
* since we were given the contact info
*/
orte_routed.update_route(&name, &name);
/* we only get an update from a single jobid - the command
* that creates these doesn't cross jobid boundaries - so
* record it here
@ -119,7 +122,7 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
*/
if (ORTE_PROC_MY_NAME->jobid == jobid &&
orte_process_info.daemon &&
orte_process_info.num_procs != num_procs) {
orte_process_info.num_procs < num_procs) {
orte_process_info.num_procs = num_procs;
/* if we changed it, then we better update the trees in the
* grpcomm so daemon collectives work correctly

Просмотреть файл

@ -61,41 +61,42 @@ BEGIN_C_DECLS
#define ORTE_RML_TAG_RML_INFO_UPDATE 9
#define ORTE_RML_TAG_ORTED_CALLBACK 10
#define ORTE_RML_TAG_APP_LAUNCH_CALLBACK 11
#define ORTE_RML_TAG_REPORT_REMOTE_LAUNCH 12
#define ORTE_RML_TAG_CKPT 12
#define ORTE_RML_TAG_CKPT 13
#define ORTE_RML_TAG_RML_ROUTE 13
#define ORTE_RML_TAG_RML_ROUTE 14
#define ORTE_RML_TAG_ALLGATHER 14
#define ORTE_RML_TAG_BARRIER 15
#define ORTE_RML_TAG_ALLGATHER 15
#define ORTE_RML_TAG_BARRIER 16
#define ORTE_RML_TAG_INIT_ROUTES 16
#define ORTE_RML_TAG_UPDATE_ROUTES 17
#define ORTE_RML_TAG_SYNC 18
#define ORTE_RML_TAG_INIT_ROUTES 17
#define ORTE_RML_TAG_UPDATE_ROUTES 18
#define ORTE_RML_TAG_SYNC 19
/* For FileM Base */
#define ORTE_RML_TAG_FILEM_BASE 19
#define ORTE_RML_TAG_FILEM_BASE_RESP 20
#define ORTE_RML_TAG_FILEM_BASE 20
#define ORTE_RML_TAG_FILEM_BASE_RESP 21
/* For FileM RSH Component */
#define ORTE_RML_TAG_FILEM_RSH 21
#define ORTE_RML_TAG_FILEM_RSH 22
/* For SnapC Framework */
#define ORTE_RML_TAG_SNAPC 22
#define ORTE_RML_TAG_SNAPC_FULL 23
#define ORTE_RML_TAG_SNAPC 23
#define ORTE_RML_TAG_SNAPC_FULL 24
/* For tools */
#define ORTE_RML_TAG_TOOL 24
#define ORTE_RML_TAG_TOOL 25
/* support data store/lookup */
#define ORTE_RML_TAG_DATA_SERVER 25
#define ORTE_RML_TAG_DATA_CLIENT 26
#define ORTE_RML_TAG_DATA_SERVER 26
#define ORTE_RML_TAG_DATA_CLIENT 27
/* timing related */
#define ORTE_RML_TAG_COLLECTIVE_TIMER 27
#define ORTE_RML_TAG_COLLECTIVE_TIMER 28
/* daemon collectives */
#define ORTE_RML_TAG_DAEMON_COLLECTIVE 28
#define ORTE_RML_TAG_DAEMON_COLLECTIVE 29
#define ORTE_RML_TAG_MAX 100

Просмотреть файл

@ -38,6 +38,7 @@
#include <signal.h>
#include "opal/class/opal_pointer_array.h"
#include "opal/event/event.h"
#include "opal/mca/base/base.h"
#include "opal/threads/mutex.h"
@ -85,6 +86,7 @@ static int process_commands(orte_process_name_t* sender,
orte_rml_tag_t tag);
/* local callback function for non-blocking sends */
static void send_callback(int status, orte_process_name_t *peer,
opal_buffer_t *buf, orte_rml_tag_t tag,
@ -370,6 +372,30 @@ static int process_commands(orte_process_name_t* sender,
}
break;
/**** ADD_AND_SPAWN ****/
case ORTE_DAEMON_ADD_AND_SPAWN:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received add_and_spawn",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* launch the local processes */
if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(buffer))) {
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s orted:comm:add_procs failed to launch on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret)));
}
/* rewind the buffer so the plm can reuse it */
buffer->unpack_ptr = buffer->base_ptr;
/* if the PLM supports remote spawn, pass it all along */
if (NULL != orte_plm.remote_spawn) {
if (ORTE_SUCCESS != (ret = orte_plm.remote_spawn(buffer))) {
ORTE_ERROR_LOG(ret);
}
} else {
opal_output(0, "%s remote spawn is NULL!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
break;
/**** DELIVER A MESSAGE TO THE LOCAL PROCS ****/
case ORTE_DAEMON_MESSAGE_LOCAL_PROCS:
if (orte_debug_daemons_flag) {

Просмотреть файл

@ -67,6 +67,7 @@
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/ras/ras.h"
#include "orte/mca/routed/routed.h"
/* need access to the create_jobid fn used by plm components
* so we can set singleton name, if necessary
@ -106,6 +107,7 @@ static struct {
char* num_procs;
int uri_pipe;
int singleton_died_pipe;
char* parent;
} orted_globals;
/*
@ -165,6 +167,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
&orted_globals.singleton_died_pipe, OPAL_CMD_LINE_TYPE_INT,
"Watch on indicated pipe for singleton termination"},
{ NULL, NULL, NULL, '\0', NULL, "parent", 1,
&orted_globals.parent, OPAL_CMD_LINE_TYPE_STRING,
"Parent vpid for tree-based spawns"},
/* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
@ -273,15 +279,6 @@ int orte_daemon(int argc, char *argv[])
/* set ourselves to be just a daemon */
orte_process_info.hnp = false;
orte_process_info.daemon = true;
#if 0
/* since I am a daemon, I need to ensure that orte_init selects
* the rsh PLM module to support local spawns, if an rsh agent is
* available
*/
param = mca_base_param_environ_variable("plm","rsh",NULL);
putenv(param);
free(param);
#endif
}
#if OPAL_ENABLE_FT == 1
@ -523,6 +520,8 @@ int orte_daemon(int argc, char *argv[])
* We need to do this at the last possible second as the HNP
* can turn right around and begin issuing orders to us
*/
orte_process_name_t parent;
buffer = OBJ_NEW(opal_buffer_t);
rml_uri = orte_rml.get_contact_info();
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
@ -530,7 +529,19 @@ int orte_daemon(int argc, char *argv[])
OBJ_RELEASE(buffer);
return ret;
}
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer,
/* if no parent was specified, send to my hnp */
if (NULL == orted_globals.parent) {
parent.vpid = ORTE_PROC_MY_HNP->vpid;
parent.jobid = ORTE_PROC_MY_HNP->jobid;
} else {
/* set the parent's contact info into our hash tables */
orte_rml.set_contact_info(orted_globals.parent);
/* extract the parent's name */
orte_rml_base_parse_uris(orted_globals.parent, &parent, NULL);
/* set the route to be direct */
orte_routed.update_route(&parent, &parent);
}
if (0 > (ret = orte_rml.send_buffer(&parent, buffer,
ORTE_RML_TAG_ORTED_CALLBACK, 0))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);

Просмотреть файл

@ -53,6 +53,7 @@ bool orte_keep_fqdn_hostnames = false;
int orte_debug_output = -1;
char **orte_launch_environ;
opal_pointer_array_t orte_daemonmap;
char **orted_cmd_line=NULL;
int orte_exit, orteds_exit;
int orte_exit_status = 0;

Просмотреть файл

@ -291,6 +291,8 @@ ORTE_DECLSPEC extern int orte_debug_output;
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
ORTE_DECLSPEC extern char **orted_cmd_line;
ORTE_DECLSPEC extern int orte_exit, orteds_exit;
ORTE_DECLSPEC extern int orte_exit_status;

Просмотреть файл

@ -119,7 +119,7 @@ int main(int argc, char* argv[])
orte_init(ORTE_TOOL);
num_procs = 4;
num_procs = 32;
for (i=0; i < num_procs; i++) {
OBJ_CONSTRUCT(&children, opal_list_t);