
Bring in the generalized xcast communication system along with the correspondingly revised orted launch. I will send a message out to developers explaining the basic changes. In brief:

1. Generalized orte_rml.xcast into a broadcast-like messaging system. Messages can now be sent to any tag on the daemons or processes. Note that any message sent via xcast will be delivered to ALL processes in the specified job - you don't get to pick and choose. At a later date, we will introduce an augmented capability that will use the daemons as relays while allowing you to send to a specified array of process names. (A usage sketch follows this list.)

2. Extended orte_rml.xcast to support more scalable message routing methods. At the moment, we support three: (a) direct, which sends the message directly to all recipients; (b) linear, which sends the message to the local daemon on each node, which then relays it to its own local procs; and (c) binomial, which sends the message via a binomial algorithm across all the daemons, each of which then relays it to its own local procs. The crossover points between the algorithms are adjustable via MCA parameters, or you can simply demand that a specific algorithm be used (see the selection sketch below the list).

3. Orteds no longer exhibit two types of behavior (bootproxy vs. VM). Orteds now always behave as if they are part of a virtual machine - they simply launch a job when mpirun tells them to do so. This is another step towards an "orteboot" capability, and it also provides a clean system for supporting message relaying.
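
As a usage illustration for point 1 - a minimal sketch, not code from this commit: orte_rml.xcast takes (jobid, buffer, tag), exactly as the pls base code in the diff below calls it, and the blocking form returns once all of its non-blocking sends have completed. The function name here is an invented caller; everything else appears in the diff.

```c
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"

/* Hypothetical caller: deliver `buffer` to ALL processes of `job`,
 * addressed to `tag` - remember, xcast offers no pick-and-choose. */
static int broadcast_to_job(orte_jobid_t job, orte_buffer_t *buffer,
                            orte_rml_tag_t tag)
{
    int rc;
    if (ORTE_SUCCESS != (rc = orte_rml.xcast(job, buffer, tag))) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
```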

Note one major impact of this commit: multiple daemons on a node can no longer be supported! Only a single daemon per node is now allowed.
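
To make point 2 concrete, here is a self-contained sketch of the selection logic this commit adds to mca_oob_xcast and mca_oob_xcast_nb (see those hunks below). The defaults come from ORTE_OOB_XCAST_LINEAR_XOVER_DEFAULT (10) and ORTE_OOB_XCAST_BINOMIAL_XOVER_DEFAULT (INT_MAX); setting the oob_xcast_mode MCA param simply pins both crossovers to force one algorithm.

```c
#include <limits.h>

enum xcast_mode { XCAST_DIRECT, XCAST_LINEAR, XCAST_BINOMIAL };

/* linear_xover / binomial_xover correspond to the oob_xcast_linear_xover
 * and oob_xcast_binomial_xover MCA params (defaults: 10 and INT_MAX). */
static enum xcast_mode select_xcast_mode(long num_daemons,
                                         long linear_xover,
                                         long binomial_xover)
{
    if (num_daemons < 2) {
        /* singleton/HNP-startup case: direct is the only option and
         * must be preserved regardless of user-set crossovers */
        return XCAST_DIRECT;
    }
    if (num_daemons < linear_xover) {
        return XCAST_DIRECT;
    } else if (num_daemons < binomial_xover) {
        return XCAST_LINEAR;
    }
    return XCAST_BINOMIAL;
}
```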

This commit is known to break support for the following environments: POE, Xgrid, Xcpu, Windows. It has been tested on rsh, SLURM, and Bproc. Modifications for TM support have been made but could not be verified due to machine problems at LANL. Modifications for SGE have been made but could not be verified. The developers for the non-verified environments will be separately notified along with suggestions on how to fix the problems.

This commit was SVN r15007.
This commit is contained in:
Ralph Castain 2007-06-12 13:28:54 +00:00
parent 8e7cce813e
commit 85df3bd92f
67 changed files with 2113 additions and 1761 deletions

View file

@ -112,6 +112,7 @@ typedef int32_t orte_gpr_trigger_id_t;
#define ORTE_GPR_STRIPPED (uint16_t)0x2000 /**< Return values should contain no descriptive info */
#define ORTE_GPR_OVERWRITE (uint16_t)0x8000 /**< Allow overwrite of existing info */
#define ORTE_GPR_NO_OVERWRITE (uint16_t)0x0000 /**< Do not allow overwrite of existing info */
#define ORTE_GPR_NO_DUPLICATE (uint16_t)0x4000 /**< Do not duplicate an existing entry - just ignore the new one */
typedef uint16_t orte_gpr_addr_mode_t;
#define ORTE_GPR_ADDR_MODE_T ORTE_UINT16
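
For orientation - a hypothetical illustration, not part of this commit: the new ORTE_GPR_NO_DUPLICATE bit is OR'd into the same orte_gpr_addr_mode_t word as the token/key addressing flags. The function name and segment string below are invented; the flags and the orte_gpr.create_value usage follow calls visible elsewhere in this diff.

```c
#include "orte/mca/gpr/gpr.h"

/* Request AND-token addressing plus "ignore duplicates" semantics: with
 * neither ORTE_GPR_OVERWRITE nor duplication allowed, put_fn (changed
 * below) silently skips a keyval whose key already exists. */
static int store_unique_example(orte_gpr_value_t **value)
{
    orte_gpr_addr_mode_t mode = ORTE_GPR_TOKENS_AND | ORTE_GPR_NO_DUPLICATE;
    return orte_gpr.create_value(value, mode, "my-segment", 1, 0);
}
```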

View file

@ -42,7 +42,7 @@ int orte_gpr_replica_arith(orte_gpr_addr_mode_t addr_mode,
orte_data_value_t *operand)
{
int rc;
orte_std_cntr_t num_tokens, num_keys;
orte_std_cntr_t num_tokens=0, num_keys=0;
orte_gpr_replica_segment_t *seg=NULL;
orte_gpr_replica_itag_t *itags=NULL, *keytags=NULL;

View file

@ -121,7 +121,7 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
orte_gpr_replica_itag_t itag;
orte_gpr_replica_addr_mode_t tok_mode;
orte_gpr_replica_itagval_t *iptr, **iptrs;
bool overwrite, overwritten;
bool overwrite, duplicate, overwritten;
int rc;
orte_std_cntr_t i, j, k, m, n, index;
@ -151,9 +151,13 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
/* extract the token address mode and overwrite permissions */
overwrite = false;
duplicate = true;
if (addr_mode & ORTE_GPR_OVERWRITE) {
overwrite = true;
} else if (addr_mode & ORTE_GPR_NO_DUPLICATE) {
duplicate = false;
}
tok_mode = ORTE_GPR_REPLICA_TOKMODE(addr_mode);
if (0x00 == tok_mode) { /* default tokens addressing mode to AND */
tok_mode = ORTE_GPR_REPLICA_AND;
@ -205,9 +209,7 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
ORTE_GPR_REPLICA_OR,
&itag, 1, cptr[j])) {
if (0 < orte_gpr_replica_globals.num_srch_ival) {
/* this key already exists - overwrite, if permission given
* else add this keyval to the container as a new entry
*/
/* this key already exists - overwrite, if permission given */
if (overwrite) {
/* check to see if we have already overwritten this keyval. if so,
* then we add the remaining values - otherwise, only the
@ -220,7 +222,7 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
if (NULL != iptrs[m]) {
n++;
if (iptrs[m]->itag == itag) {
/* keyval was previously overwritten */
/* keyval was previously overwritten so just add this one as another entry */
if (ORTE_SUCCESS != (rc = orte_gpr_replica_add_keyval(&iptr, seg, cptr[j], keyvals[i]))) {
ORTE_ERROR_LOG(rc);
return rc;
@ -248,7 +250,10 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
}
(orte_gpr_replica_globals.num_overwritten)++;
}
} else {
} else if (duplicate) {
/* no overwrite permission - add this keyval to the container as a new entry
* if we are allowed to duplicate. Otherwise, we just ignore it.
*/
if (ORTE_SUCCESS != (rc = orte_gpr_replica_add_keyval(&iptr, seg, cptr[j], keyvals[i]))) {
ORTE_ERROR_LOG(rc);
return rc;

View file

@ -55,7 +55,6 @@
* Initialization of the bproc_orted module with all the needed function pointers
*/
orte_odls_base_module_t orte_odls_bproc_module = {
orte_odls_bproc_subscribe_launch_data,
orte_odls_bproc_get_add_procs_data,
orte_odls_bproc_launch_local_procs,
orte_odls_bproc_kill_local_procs,
@ -77,7 +76,108 @@ static int odls_bproc_setup_stdio(orte_process_name_t *proc_name,
int orte_odls_bproc_get_add_procs_data(orte_gpr_notify_data_t **data, orte_job_map_t *map)
{
return ORTE_ERR_NOT_IMPLEMENTED;
orte_gpr_notify_data_t *ndat;
orte_gpr_value_t **values, *value;
orte_std_cntr_t cnt;
opal_list_item_t *item, *m_item;
orte_mapped_node_t *node;
orte_mapped_proc_t *proc;
int rc;
/* set default answer */
*data = NULL;
ndat = OBJ_NEW(orte_gpr_notify_data_t);
if (NULL == ndat) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* construct a fake trigger name so that we can extract the jobid from it later */
if (ORTE_SUCCESS != (rc = orte_schema.get_std_trigger_name(&(ndat->target), "bogus", map->job))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
return rc;
}
/* our required info is in the mapped_node objects, so all we
* have to do is transfer it over
*/
for (m_item = opal_list_get_first(&map->nodes);
m_item != opal_list_get_end(&map->nodes);
m_item = opal_list_get_next(m_item)) {
node = (orte_mapped_node_t*)m_item;
for (item = opal_list_get_first(&node->procs);
item != opal_list_get_end(&node->procs);
item = opal_list_get_next(item)) {
proc = (orte_mapped_proc_t*)item;
/* must not have any tokens so that launch_procs can process it correctly */
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, 0, "bogus", 5, 0))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]),
ORTE_PROC_NAME_KEY,
ORTE_NAME, &proc->name))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]),
ORTE_PROC_APP_CONTEXT_KEY,
ORTE_STD_CNTR, &proc->app_idx))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]),
ORTE_NODE_NAME_KEY,
ORTE_STRING, node->nodename))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[3]),
ORTE_PROC_LOCAL_RANK_KEY,
ORTE_VPID, &proc->local_rank))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[4]),
ORTE_NODE_NUM_PROCS_KEY,
ORTE_STD_CNTR, &node->num_procs))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&cnt, ndat->values, value))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(values[0]);
return rc;
}
ndat->cnt += 1;
}
}
*data = ndat;
return ORTE_SUCCESS;
}
@ -432,89 +532,6 @@ cleanup:
}
/* this entire function gets called within a GPR compound command,
* so the subscription actually doesn't get done until the orted
* executes the compound command
*/
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc)
{
char *segment;
orte_gpr_value_t *values[1];
orte_gpr_subscription_t *subs, sub=ORTE_GPR_SUBSCRIPTION_EMPTY;
orte_gpr_trigger_t *trigs, trig=ORTE_GPR_TRIGGER_EMPTY;
char* keys[] = {
ORTE_PROC_NAME_KEY,
ORTE_PROC_APP_CONTEXT_KEY,
ORTE_NODE_NAME_KEY,
};
int num_keys = 3;
int i, rc;
/* get the job segment name */
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* attach ourselves to the "standard" orted trigger */
if (ORTE_SUCCESS !=
(rc = orte_schema.get_std_trigger_name(&(trig.name),
ORTED_LAUNCH_STAGE_GATE_TRIGGER, job))) {
ORTE_ERROR_LOG(rc);
free(segment);
return rc;
}
/* ask for return of all data required for launching local processes */
subs = &sub;
sub.action = ORTE_GPR_NOTIFY_DELETE_AFTER_TRIG;
if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&(sub.name),
ORTED_LAUNCH_STG_SUB,
job))) {
ORTE_ERROR_LOG(rc);
free(segment);
free(trig.name);
return rc;
}
sub.cnt = 1;
sub.values = values;
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[0]), ORTE_GPR_KEYS_OR | ORTE_GPR_TOKENS_OR,
segment, num_keys, 0))) {
ORTE_ERROR_LOG(rc);
free(segment);
free(sub.name);
free(trig.name);
return rc;
}
for (i=0; i < num_keys; i++) {
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[0]->keyvals[i]),
keys[i], ORTE_UNDEF, NULL))) {
ORTE_ERROR_LOG(rc);
free(segment);
free(sub.name);
free(trig.name);
OBJ_RELEASE(values[0]);
return rc;
}
}
sub.cbfunc = cbfunc;
trigs = &trig;
/* do the subscription */
if (ORTE_SUCCESS != (rc = orte_gpr.subscribe(1, &subs, 1, &trigs))) {
ORTE_ERROR_LOG(rc);
}
free(segment);
free(sub.name);
free(trig.name);
OBJ_RELEASE(values[0]);
return rc;
}
/**
* Setup io for the current node, then tell orterun we are ready for the actual
* processes.
@ -538,6 +555,8 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
int cycle = 0;
char *job_str=NULL, *vpid_str, *uri_file, *my_uri=NULL, *session_dir=NULL;
FILE *fp;
orte_vpid_t *vptr;
bool node_included;
/* first, retrieve the job number we are to launch from the
* returned data - we can extract the jobid directly from the
@ -545,7 +564,8 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
*/
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&jobid, data->target))) {
ORTE_ERROR_LOG(rc);
return rc;
src = rc;
goto CALLHOME;
}
/**
@ -555,6 +575,9 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
*/
setpgid(0,0);
/* set the flag indicating this node is not included in the launch data */
node_included = false;
/* loop through the returned data to find the global info and
* the info for processes going onto this node
*/
@ -574,11 +597,15 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
* so we can access it */
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&node_name, kval->value, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
src = rc;
goto CALLHOME;
}
/* if this is our node...must also protect against a zero-length string */
if (NULL != node_name && 0 == strcmp(node_name, orte_system_info.nodename)) {
/* ...harvest the info into a new child structure */
/* indicate that there is something for us to do */
node_included = true;
/* setup and populate the child object */
child = OBJ_NEW(odls_bproc_child_t);
for (kv2 = 0; kv2 < value->cnt; kv2++) {
kval = value->keyvals[kv2];
@ -586,18 +613,38 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
/* copy the name into the child object */
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(child->name), kval->value->data, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
return rc;
src = rc;
goto CALLHOME;
}
continue;
}
if(strcmp(kval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
src = rc;
goto CALLHOME;
}
child->app_idx = *sptr; /* save the index into the app_context objects */
continue;
}
if(strcmp(kval->key, ORTE_PROC_LOCAL_RANK_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, kval->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
src = rc;
goto CALLHOME;
}
child->local_rank = *vptr; /* save the local_rank */
continue;
}
if(strcmp(kval->key, ORTE_NODE_NUM_PROCS_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
src = rc;
goto CALLHOME;
}
child->num_procs = *sptr; /* save the number of procs from this job on this node */
continue;
}
} /* kv2 */
/* protect operation on the global list of children */
OPAL_THREAD_LOCK(&mca_odls_bproc_component.mutex);
@ -611,6 +658,14 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
} /* for j */
}
/* if there is nothing for us to do, we still have to report back
* before we just return
*/
if (!node_included) {
rc = ORTE_SUCCESS;
goto CALLHOME;
}
/* setup some values we'll need to drop my uri for each child */
orte_ns.convert_jobid_to_string(&job_str, jobid);
my_uri = orte_rml.get_uri();
@ -639,7 +694,8 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
connect_stdin);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
src = rc;
goto CALLHOME;
}
/* record my uri in a file within the session directory so the child can contact me */
@ -649,26 +705,32 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
if (ORTE_SUCCESS != (rc = orte_session_dir(true, NULL, NULL, NULL,
NULL, NULL, job_str, vpid_str))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
src = rc;
goto CALLHOME;
}
/* get the session dir name so we can put the file there */
if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&session_dir, NULL, NULL, NULL,
NULL, NULL, NULL, job_str, vpid_str))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
src = rc;
goto CALLHOME;
}
free(vpid_str);
/* create the file and put my uri into it */
/* create the file and put my uri, this child's local rank, and the
* number of local procs into it */
uri_file = opal_os_path(false, session_dir, "orted-uri.txt", NULL);
fp = fopen(uri_file, "w");
if (NULL == fp) {
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
rc = ORTE_ERR_FILE_OPEN_FAILURE;
goto cleanup;
src = rc;
goto CALLHOME;
}
fprintf(fp, "%s\n", my_uri);
fprintf(fp, "%ld\n", (long)child->local_rank);
fprintf(fp, "%ld\n", (long)child->num_procs);
fclose(fp);
free(uri_file);
@ -679,6 +741,7 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
free(job_str);
free(my_uri);
CALLHOME:
/* message to indicate that we are ready */
ack = OBJ_NEW(orte_buffer_t);
rc = orte_dss.pack(ack, &src, 1, ORTE_INT);
@ -738,13 +801,18 @@ int orte_odls_bproc_deliver_message(orte_jobid_t job, orte_buffer_t *buffer, ort
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* see if this is one of the chosen */
if (job == child->name->jobid) {
/* if so, send the message */
rc = orte_rml.send_buffer(child->name, buffer, tag, 0);
if (rc < 0) {
ORTE_ERROR_LOG(rc);
}
/* do we have a child from the specified job? Because the
* job could be given as a WILDCARD value, we must use
* the dss.compare function to check for equality.
*/
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
continue;
}
/* if so, send the message */
rc = orte_rml.send_buffer(child->name, buffer, tag, 0);
if (rc < 0) {
ORTE_ERROR_LOG(rc);
}
}

View file

@ -59,7 +59,6 @@ int orte_odls_bproc_finalize(void);
/*
* Interface
*/
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
int orte_odls_bproc_get_add_procs_data(orte_gpr_notify_data_t **data, orte_job_map_t *map);
int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ);
int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state);
@ -100,6 +99,8 @@ typedef struct odls_bproc_child_t {
pid_t pid; /* local pid of the proc */
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
bool alive; /* is this proc alive? */
orte_vpid_t local_rank; /* local rank of this proc */
orte_std_cntr_t num_procs; /* number of local procs sharing this node */
} odls_bproc_child_t;
OBJ_CLASS_DECLARATION(odls_bproc_child_t);

View file

@ -36,6 +36,8 @@ static void odls_bproc_child_constructor(odls_bproc_child_t *ptr)
ptr->name = NULL;
ptr->app_idx = -1;
ptr->alive = false;
ptr->local_rank = ORTE_VPID_INVALID;
ptr->num_procs = 0;
}
static void odls_bproc_child_destructor(odls_bproc_child_t *ptr)
{

View file

@ -101,10 +101,10 @@ static int orte_pls_fork_preload_append_binary(orte_app_context_t* context,
static int orte_pls_fork_preload_append_files(orte_app_context_t* context,
orte_filem_base_request_t *filem_request);
static bool is_preload_local_dup(char *local_ref, orte_filem_base_request_t *filem_request);
/*
* External Interface
*/
static int orte_odls_default_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
static int orte_odls_default_get_add_procs_data(orte_gpr_notify_data_t **data, orte_job_map_t *map);
static int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ);
static int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state);
@ -115,7 +115,6 @@ static int orte_odls_default_deliver_message(orte_jobid_t job, orte_buffer_t *bu
static void set_handler_default(int sig);
orte_odls_base_module_t orte_odls_default_module = {
orte_odls_default_subscribe_launch_data,
orte_odls_default_get_add_procs_data,
orte_odls_default_launch_local_procs,
orte_odls_default_kill_local_procs,
@ -123,123 +122,6 @@ orte_odls_base_module_t orte_odls_default_module = {
orte_odls_default_deliver_message
};
/* this entire function gets called within a GPR compound command,
* so the subscription actually doesn't get done until the orted
* executes the compound command
*/
int orte_odls_default_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc)
{
char *segment;
orte_gpr_value_t *values[2];
orte_gpr_subscription_t *subs, sub=ORTE_GPR_SUBSCRIPTION_EMPTY;
orte_gpr_trigger_t *trigs, trig=ORTE_GPR_TRIGGER_EMPTY;
char *glob_keys[] = {
ORTE_JOB_APP_CONTEXT_KEY,
ORTE_JOB_VPID_START_KEY,
ORTE_JOB_VPID_RANGE_KEY,
ORTE_JOB_OVERSUBSCRIBE_OVERRIDE_KEY
};
int num_glob_keys = 4;
char* keys[] = {
ORTE_PROC_NAME_KEY,
ORTE_PROC_LOCAL_RANK_KEY,
ORTE_PROC_APP_CONTEXT_KEY,
ORTE_NODE_NAME_KEY,
ORTE_NODE_NUM_PROCS_KEY,
ORTE_NODE_OVERSUBSCRIBED_KEY
};
int num_keys = 6;
int i, rc;
/* get the job segment name */
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* attach ourselves to the "standard" orted trigger */
if (ORTE_SUCCESS !=
(rc = orte_schema.get_std_trigger_name(&(trig.name),
ORTED_LAUNCH_STAGE_GATE_TRIGGER, job))) {
ORTE_ERROR_LOG(rc);
free(segment);
return rc;
}
/* ask for return of all data required for launching local processes */
subs = &sub;
sub.action = ORTE_GPR_NOTIFY_DELETE_AFTER_TRIG;
if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&(sub.name),
ORTED_LAUNCH_STG_SUB,
job))) {
ORTE_ERROR_LOG(rc);
free(segment);
free(trig.name);
return rc;
}
sub.cnt = 2;
sub.values = values;
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[0]), ORTE_GPR_KEYS_OR | ORTE_GPR_TOKENS_OR,
segment, num_glob_keys, 1))) {
ORTE_ERROR_LOG(rc);
free(segment);
free(sub.name);
free(trig.name);
return rc;
}
values[0]->tokens[0] = strdup(ORTE_JOB_GLOBALS);
for (i=0; i < num_glob_keys; i++) {
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[0]->keyvals[i]),
glob_keys[i], ORTE_UNDEF, NULL))) {
ORTE_ERROR_LOG(rc);
free(segment);
free(sub.name);
free(trig.name);
OBJ_RELEASE(values[0]);
return rc;
}
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[1]), ORTE_GPR_KEYS_OR | ORTE_GPR_TOKENS_OR | ORTE_GPR_STRIPPED,
segment, num_keys, 0))) {
ORTE_ERROR_LOG(rc);
free(segment);
free(sub.name);
free(trig.name);
OBJ_RELEASE(values[0]);
return rc;
}
for (i=0; i < num_keys; i++) {
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[1]->keyvals[i]),
keys[i], ORTE_UNDEF, NULL))) {
ORTE_ERROR_LOG(rc);
free(segment);
free(sub.name);
free(trig.name);
OBJ_RELEASE(values[0]);
OBJ_RELEASE(values[1]);
return rc;
}
}
sub.cbfunc = cbfunc;
trigs = &trig;
/* do the subscription */
if (ORTE_SUCCESS != (rc = orte_gpr.subscribe(1, &subs, 1, &trigs))) {
ORTE_ERROR_LOG(rc);
}
free(segment);
free(sub.name);
free(trig.name);
OBJ_RELEASE(values[0]);
OBJ_RELEASE(values[1]);
return rc;
}
int orte_odls_default_get_add_procs_data(orte_gpr_notify_data_t **data,
orte_job_map_t *map)
{
@ -453,30 +335,33 @@ int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: checking child process [%ld,%ld,%ld]",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
/* do we have a child from the specified job? Because the
* job could be given as a WILDCARD value, we must use
* the dss.compare function to check for equality.
*/
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
continue;
}
/* remove the child from the list since it is either already dead or soon going to be dead */
opal_list_remove_item(&orte_odls_default.children, item);
/* is this process alive? if not, then nothing for us
* to do to it
*/
if (!child->alive) {
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: child [%ld,%ld,%ld] is not alive",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
continue;
/* ensure, though, that the state is terminated so we don't lockup if
* the proc never started
*/
goto MOVEON;
}
/* do we have a child from the specified job? Because the
* job could be given as a WILDCARD value, we must use
* the dss.compare function to check for equality.
*/
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
continue;
}
/* remove the child from the list since it is going to be dead */
opal_list_remove_item(&orte_odls_default.children, item);
/* de-register the SIGCHILD callback for this pid */
if (ORTE_SUCCESS != (rc = orte_wait_cb_cancel(child->pid))) {
ORTE_ERROR_LOG(rc);
continue;
/* no need to error_log this - it just means that the pid is already gone */
goto MOVEON;
}
/* Send a sigterm to the process. If we get ESRCH back, that
@ -614,10 +499,12 @@ GOTCHILD:
}
opal_output(orte_odls_globals.output, "orted sent IOF unpub message!\n");
#if 0
/* Note that the svc IOF component will detect an exception on the
oob because we're shutting it down, so it will take care of
closing down any streams that it has open to us. */
orte_iof.iof_flush();
#endif
/* determine the state of this process */
aborted = false;

View file

@ -46,11 +46,6 @@
extern "C" {
#endif
/**
* Subscribe to receive the launch data for local processes
*/
typedef int (*orte_odls_base_module_subscribe_launch_data_fn_t)(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
/*
* Construct a notify data object for use in adding local processes
* In order to reuse daemons, we need a way for the HNP to construct a notify_data object that
@ -89,7 +84,6 @@ typedef int (*orte_odls_base_module_deliver_message_fn_t)(orte_jobid_t job, orte
* pls module version 1.3.0
*/
struct orte_odls_base_module_1_3_0_t {
orte_odls_base_module_subscribe_launch_data_fn_t subscribe_launch_data;
orte_odls_base_module_get_add_procs_data_fn_t get_add_procs_data;
orte_odls_base_module_launch_local_processes_fn_t launch_local_procs;
orte_odls_base_module_kill_local_processes_fn_t kill_local_procs;

View file

@ -48,6 +48,7 @@ typedef uint8_t orte_daemon_cmd_flag_t;
#define ORTE_DAEMON_ROUTE_NONE (orte_daemon_cmd_flag_t) 11
#define ORTE_DAEMON_ROUTE_BINOMIAL (orte_daemon_cmd_flag_t) 12
#define ORTE_DAEMON_WARMUP_LOCAL_CONN (orte_daemon_cmd_flag_t) 13
#define ORTE_DAEMON_NULL_CMD (orte_daemon_cmd_flag_t) 14
/* define some useful attributes for dealing with orteds */
#define ORTE_DAEMON_SOFT_KILL "orted-soft-kill"

View file

@ -48,11 +48,12 @@ extern "C" {
/*
* global flag for use in timing tests
*/
ORTE_DECLSPEC extern int mca_oob_base_output;
ORTE_DECLSPEC extern bool orte_oob_base_timing;
ORTE_DECLSPEC extern bool orte_oob_xcast_timing;
ORTE_DECLSPEC extern int orte_oob_xcast_mode;
ORTE_DECLSPEC extern opal_mutex_t orte_oob_xcast_mutex;
ORTE_DECLSPEC extern opal_condition_t orte_oob_xcast_cond;
ORTE_DECLSPEC extern int orte_oob_xcast_linear_xover, orte_oob_xcast_binomial_xover;
/*
* Flag indicating if this framework has been opened

View file

@ -47,14 +47,17 @@ int mca_oob_base_output = -1;
opal_list_t mca_oob_base_components;
opal_list_t mca_oob_base_modules;
opal_list_t mca_oob_base_exception_handlers;
bool orte_oob_base_timing;
bool orte_oob_xcast_timing;
int orte_oob_xcast_mode;
opal_mutex_t orte_oob_xcast_mutex;
opal_condition_t orte_oob_xcast_cond;
int orte_oob_xcast_linear_xover;
int orte_oob_xcast_binomial_xover;
bool orte_oob_base_already_opened = false;
#define ORTE_OOB_XCAST_LINEAR_XOVER_DEFAULT 10
#define ORTE_OOB_XCAST_BINOMIAL_XOVER_DEFAULT INT_MAX
/**
* Function for finding and opening either all MCA components, or the one
* that was specifically requested via a MCA parameter.
@ -63,7 +66,7 @@ int mca_oob_base_open(void)
{
int param, value;
char *mode;
/* Sanity check. This may be able to be removed when the rml/oob
interface is re-worked (the current infrastructure may invoke
this function twice: once as a standalone, and once via the rml
@ -76,49 +79,50 @@ int mca_oob_base_open(void)
OBJ_CONSTRUCT(&orte_oob_xcast_mutex, opal_mutex_t);
OBJ_CONSTRUCT(&orte_oob_xcast_cond, opal_condition_t);
/* Open up all available components */
OBJ_CONSTRUCT(&mca_oob_base_components, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_base_modules, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_base_exception_handlers, opal_list_t);
if (ORTE_SUCCESS !=
mca_base_components_open("oob", mca_oob_base_output,
mca_oob_base_static_components,
&mca_oob_base_components, true)) {
return ORTE_ERROR;
}
/* register parameters */
param = mca_base_param_reg_int_name("orte", "timing",
"Request that critical timing loops be measured",
false, false, 0, &value);
if (value != 0) {
orte_oob_base_timing = true;
} else {
orte_oob_base_timing = false;
}
param = mca_base_param_reg_int_name("oob", "xcast_timing",
"Request that xcast timing loops be measured",
false, false, 0, &value);
if (value != 0) {
orte_oob_xcast_timing = true;
} else {
orte_oob_xcast_timing = false;
}
param = mca_base_param_reg_string_name("oob", "xcast_mode",
"Select xcast mode (\"linear\" | \"binomial\" | \"direct [default] \")",
false, false, "direct", &mode);
if (0 == strcmp(mode, "binomial")) {
orte_oob_xcast_mode = 0;
} else if (0 == strcmp(mode, "linear")) {
orte_oob_xcast_mode = 1;
} else if (0 == strcmp(mode, "direct")) {
orte_oob_xcast_mode = 2;
/* register parameters */
param = mca_base_param_reg_int_name("oob", "base_verbose",
"Verbosity level for the oob framework",
false, false, 0, &value);
if (value != 0) {
mca_oob_base_output = opal_output_open(NULL);
} else {
opal_output(0, "oob_xcast_mode: unknown option %s", mode);
mca_oob_base_output = -1;
}
param = mca_base_param_reg_int_name("oob", "xcast_linear_xover",
"Number of daemons where use of linear xcast mode is to begin",
false, false, ORTE_OOB_XCAST_LINEAR_XOVER_DEFAULT, &orte_oob_xcast_linear_xover);
param = mca_base_param_reg_int_name("oob", "xcast_binomial_xover",
"Number of daemons where use of binomial xcast mode is to begin",
false, false, ORTE_OOB_XCAST_BINOMIAL_XOVER_DEFAULT, &orte_oob_xcast_binomial_xover);
param = mca_base_param_reg_string_name("oob", "xcast_mode",
"Select xcast mode (\"linear\" | \"binomial\" | \"direct\")",
false, false, "none", &mode);
if (0 == strcmp(mode, "binomial")) {
orte_oob_xcast_binomial_xover = 0;
orte_oob_xcast_linear_xover = 0;
} else if (0 == strcmp(mode, "linear")) {
orte_oob_xcast_linear_xover = 0;
orte_oob_xcast_binomial_xover = INT_MAX;
} else if (0 == strcmp(mode, "direct")) {
orte_oob_xcast_binomial_xover = INT_MAX;
orte_oob_xcast_linear_xover = INT_MAX;
} else if (0 != strcmp(mode, "none")) {
opal_output(0, "oob_xcast_mode: unknown option %s - using defaults", mode);
}
/* Open up all available components */
OBJ_CONSTRUCT(&mca_oob_base_components, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_base_modules, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_base_exception_handlers, opal_list_t);
if (ORTE_SUCCESS !=
mca_base_components_open("oob", mca_oob_base_output,
mca_oob_base_static_components,
&mca_oob_base_components, true)) {
return ORTE_ERROR;
}

View file

@ -38,6 +38,7 @@
#include "orte/mca/smr/smr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/runtime/params.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
@ -46,7 +47,6 @@
/* Local variables */
static orte_std_cntr_t xcast_num_active;
static bool xcast_in_progress=false;
static char *mode_string[] = {"binomial", "linear", "direct", "unknown"};
/* Local functions */
static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
@ -61,7 +61,6 @@ static int mca_oob_xcast_direct(orte_jobid_t job,
orte_buffer_t *buffer,
orte_rml_tag_t tag);
/* define a callback function for use by the blocking version
* of xcast so we can "hold" the caller here until all non-blocking
* sends have completed
@ -96,6 +95,7 @@ int mca_oob_xcast_nb(orte_jobid_t job,
{
int rc = ORTE_SUCCESS;
struct timeval start, stop;
orte_vpid_t num_daemons;
/* if there is no message to send, then just return ok */
if (NULL == buffer) {
@ -113,30 +113,58 @@ int mca_oob_xcast_nb(orte_jobid_t job,
xcast_num_active = 0;
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
if (orte_oob_xcast_timing) {
if (orte_timing) {
gettimeofday(&start, NULL);
}
switch(orte_oob_xcast_mode) {
case 0: /* binomial tree */
rc = mca_oob_xcast_binomial_tree(job, buffer, tag);
break;
case 1: /* linear */
rc = mca_oob_xcast_linear(job, buffer, tag);
break;
case 2: /* direct */
rc = mca_oob_xcast_direct(job, buffer, tag);
break;
/* get the number of daemons currently in the system so we can
* select the "optimal" algorithm
*/
if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_range(0, &num_daemons))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (orte_oob_xcast_timing) {
opal_output(mca_oob_base_output, "oob_xcast_nb: num_daemons %ld linear xover: %ld binomial xover: %ld",
(long)num_daemons, (long)orte_oob_xcast_linear_xover, (long)orte_oob_xcast_binomial_xover);
if (num_daemons < 2) {
/* if there is only one daemon in the system, then we must
* use the direct mode - there is no other option. Note that
* since the HNP is the one that typically does xcast sends,
* only one daemon means that the HNP itself is sending to
* itself. This is required in singletons - where the
* singleton acts as the HNP - and as an HNP starts
* itself up
*
* NOTE: although we allow users to alter crossover points
* for selecting specific xcast modes, this required
* use-case behavior MUST always be retained or else
* singletons and HNP startup will fail!
*/
rc = mca_oob_xcast_direct(job, buffer, tag);
goto DONE;
}
/* now use the crossover points to select the proper transmission
* mode. We have built-in default crossover points for this
* decision tree, but the user is free to alter them as
* they wish via MCA params
*/
if (num_daemons < orte_oob_xcast_linear_xover) {
rc = mca_oob_xcast_direct(job, buffer, tag);
} else if (num_daemons < orte_oob_xcast_binomial_xover) {
rc = mca_oob_xcast_linear(job, buffer, tag);
} else {
rc = mca_oob_xcast_binomial_tree(job, buffer, tag);
}
DONE:
if (orte_timing) {
gettimeofday(&stop, NULL);
opal_output(0, "xcast_nb [%ld,%ld,%ld]: mode %s time %ld usec", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME),
(orte_oob_xcast_mode < 0 || orte_oob_xcast_mode > 2) ?
mode_string[3] : mode_string[orte_oob_xcast_mode],
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
opal_output(0, "xcast_nb [%ld,%ld,%ld]: time %ld usec", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME),
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
(stop.tv_usec - start.tv_usec)));
}
@ -150,6 +178,7 @@ int mca_oob_xcast(orte_jobid_t job,
{
int rc = ORTE_SUCCESS;
struct timeval start, stop;
orte_vpid_t num_daemons;
/* if there is no message to send, then just return ok */
if (NULL == buffer) {
@ -167,24 +196,54 @@ int mca_oob_xcast(orte_jobid_t job,
xcast_num_active = 0;
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
if (orte_oob_xcast_timing) {
if (orte_timing) {
gettimeofday(&start, NULL);
}
switch(orte_oob_xcast_mode) {
case 0: /* binomial tree */
rc = mca_oob_xcast_binomial_tree(job, buffer, tag);
break;
case 1: /* linear */
rc = mca_oob_xcast_linear(job, buffer, tag);
break;
case 2: /* direct */
rc = mca_oob_xcast_direct(job, buffer, tag);
break;
/* get the number of daemons currently in the system so we can
* select the "optimal" algorithm
*/
if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_range(0, &num_daemons))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_output(mca_oob_base_output, "oob_xcast: num_daemons %ld linear xover: %ld binomial xover: %ld",
(long)num_daemons, (long)orte_oob_xcast_linear_xover, (long)orte_oob_xcast_binomial_xover);
if (num_daemons < 2) {
/* if there is only one daemon in the system, then we must
* use the direct mode - there is no other option. Note that
* since the HNP is the one that typically does xcast sends,
* only one daemon means that the HNP itself is sending to
* itself. This is required in singletons - where the
* singleton acts as the HNP - and as an HNP starts
* itself up
*
* NOTE: although we allow users to alter crossover points
* for selecting specific xcast modes, this required
* use-case behavior MUST always be retained or else
* singletons and HNP startup will fail!
*/
rc = mca_oob_xcast_direct(job, buffer, tag);
goto DONE;
}
/* now use the crossover points to select the proper transmission
* mode. We have built-in default crossover points for this
* decision tree, but the user is free to alter them as
* they wish via MCA params
*/
if (num_daemons < orte_oob_xcast_linear_xover) {
rc = mca_oob_xcast_direct(job, buffer, tag);
} else if (num_daemons < orte_oob_xcast_binomial_xover) {
rc = mca_oob_xcast_linear(job, buffer, tag);
} else {
rc = mca_oob_xcast_binomial_tree(job, buffer, tag);
}
DONE:
/* now go to sleep until woken up */
OPAL_THREAD_LOCK(&orte_oob_xcast_mutex);
if (xcast_num_active > 0) {
@ -192,15 +251,12 @@ int mca_oob_xcast(orte_jobid_t job,
}
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
if (orte_oob_xcast_timing) {
if (orte_timing) {
gettimeofday(&stop, NULL);
opal_output(0, "xcast_nb [%ld,%ld,%ld]: mode %s time %ld usec", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME),
(orte_oob_xcast_mode < 0 || orte_oob_xcast_mode > 2) ?
mode_string[3] : mode_string[orte_oob_xcast_mode],
opal_output(0, "xcast [%ld,%ld,%ld]: time %ld usec", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME),
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
(stop.tv_usec - start.tv_usec)));
}
return rc;
}
@ -208,16 +264,17 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
orte_buffer_t *buffer,
orte_rml_tag_t tag)
{
orte_daemon_cmd_flag_t command, mode;
orte_std_cntr_t i;
int rc;
int peer, size, rank, hibit, mask;
orte_process_name_t target;
orte_buffer_t *buf;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_MESSAGE_LOCAL_PROCS;
orte_daemon_cmd_flag_t mode=ORTE_DAEMON_ROUTE_BINOMIAL;
orte_vpid_t num_daemons;
int bitmap;
opal_output(mca_oob_base_output, "oob_xcast_mode: binomial");
/* this is the HNP end, so it starts the procedure. Since the HNP is always the
* vpid=0 at this time, we take advantage of that fact to figure out who we
* should send this to on the first step
@ -227,34 +284,15 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
*/
buf = OBJ_NEW(orte_buffer_t);
/* ======== LOAD THE VALUES THAT ARE COMMON TO ALL NON-DIRECT MESSAGE PATHS ======== */
/* tell the daemon this is a message for its local procs */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* tell the daemon the routing algorithm is binomial so it can figure
* out who to forward the message down the tree
/* tell the daemon the routing algorithm so it can figure
* out how to forward the message down the tree, if at all
*/
mode = ORTE_DAEMON_ROUTE_BINOMIAL;
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &mode, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
return rc;
}
/* tell the daemon the jobid of the procs that are to receive the message */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* tell the daemon the tag where the message is to be sent */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &tag, 1, ORTE_RML_TAG))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* ======= DONE WITH COMMON VALUES ====== */
/* get the number of daemons currently in the system and tell the daemon so
* it can properly route
*/
@ -267,6 +305,31 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
goto CLEANUP;
}
/* if this isn't intended for the daemon command tag, then we better
* tell the daemon to deliver it to the procs, and what job is supposed
* to get it - this occurs when a caller just wants to send something
* to all the procs in a job. In that use-case, the caller doesn't know
* anything about inserting daemon commands or what routing algo might
* be used, so we have to help them out a little. Functions that are
* sending commands to the daemons themselves are smart enough to know
* what they need to do.
*/
if (ORTE_RML_TAG_DAEMON != tag) {
command = ORTE_DAEMON_MESSAGE_LOCAL_PROCS;
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &tag, 1, ORTE_RML_TAG))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
}
/* copy the payload into the new buffer - this is non-destructive, so our
* caller is still responsible for releasing any memory in the buffer they
* gave to us
@ -276,7 +339,7 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
goto CLEANUP;
}
if (orte_oob_xcast_timing) {
if (orte_timing) {
opal_output(0, "xcast [%ld,%ld,%ld]: mode binomial buffer size %ld",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
}
@ -285,34 +348,6 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
target.cellid = ORTE_PROC_MY_NAME->cellid;
target.jobid = 0;
/* we have to account for all of the messages we are about to send
* because the non-blocking send can come back almost immediately - before
* we would get the chance to increment the num_active. This causes us
* to not correctly wakeup and reset the xcast_in_progress flag
*/
OPAL_THREAD_LOCK(&orte_oob_xcast_mutex);
xcast_num_active = num_daemons;
if (orte_process_info.daemon ||
orte_process_info.seed ||
orte_process_info.singleton) {
/* we never send to ourselves,
* so we need to adjust the number of sends
* we are expecting to complete
*/
xcast_num_active--;
if (xcast_num_active <= 0) {
/* if we aren't going to send anything at all, we
* need to reset the xcast_in_progress flag so
* we don't block the entire system and return
*/
xcast_in_progress = false;
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
rc = ORTE_SUCCESS;
goto CLEANUP;
}
}
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
/* compute the bitmap */
bitmap = opal_cube_dim((int)num_daemons);
rank = 0;
@ -320,7 +355,36 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
hibit = opal_hibit(rank, bitmap);
--bitmap;
/* we have to account for all of the messages we are about to send
* because the non-blocking send can come back almost immediately - before
* we would get the chance to increment the num_active. This causes us
* to not correctly wakeup and reset the xcast_in_progress flag
*/
OPAL_THREAD_LOCK(&orte_oob_xcast_mutex);
/* compute the number of sends we are going to do - it would be nice
* to have a simple algo to do this, but for now just brute force
* is fine
*/
xcast_num_active = 0;
for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
peer = rank | mask;
if (peer < size) {
++xcast_num_active;
}
}
if (xcast_num_active == 0) {
/* if we aren't going to send anything at all, we
* need to reset the xcast_in_progress flag so
* we don't block the entire system and return
*/
xcast_in_progress = false;
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
rc = ORTE_SUCCESS;
goto CLEANUP;
}
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
target.cellid = ORTE_PROC_MY_NAME->cellid;
target.jobid = 0;
for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
@ -328,7 +392,7 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
if (peer < size) {
target.vpid = (orte_vpid_t)peer;
opal_output(0, "[%ld,%ld,%ld] xcast to [%ld,%ld,%ld]", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(&target));
if (0 > (rc = mca_oob_send_packed_nb(&target, buf, ORTE_RML_TAG_PLS_ORTED,
if (0 > (rc = mca_oob_send_packed_nb(&target, buf, ORTE_RML_TAG_ORTED_ROUTED,
0, mca_oob_xcast_send_cb, NULL))) {
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
@ -346,8 +410,8 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
}
}
CLEANUP:
OBJ_RELEASE(buf); /* done with this object */
CLEANUP:
OBJ_RELEASE(buf);
return rc;
}
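
As an aside, a self-contained illustration of the fan-out computed above; the two helpers stand in for opal_cube_dim and opal_hibit under the assumption that they return the cube dimension of `size` and the highest set bit of `rank` (-1 for rank 0), which matches how the peer loop uses them:

```c
#include <stdio.h>

static int cube_dim(int size)      /* smallest d with (1 << d) >= size */
{
    int d = 0;
    while ((1 << d) < size) ++d;
    return d;
}

static int hibit(int value)        /* index of highest set bit; -1 if none */
{
    int h = -1;
    while (value) { value >>= 1; ++h; }
    return h;
}

int main(void)
{
    int size = 13;                 /* pretend num_daemons */
    int rank = 0;                  /* the HNP is vpid 0 in this scheme */
    int bitmap = cube_dim(size);
    int hb = hibit(rank);
    --bitmap;
    /* same loop shape as the send loop above: vpid 0 sends to 1, 2, 4, 8 */
    for (int i = hb + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        int peer = rank | mask;
        if (peer < size) printf("send to vpid %d\n", peer);
    }
    return 0;
}
```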
@ -358,11 +422,12 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
{
int rc;
orte_buffer_t *buf;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_MESSAGE_LOCAL_PROCS;
orte_daemon_cmd_flag_t mode=ORTE_DAEMON_ROUTE_NONE;
orte_daemon_cmd_flag_t command, mode=ORTE_DAEMON_ROUTE_NONE;
orte_vpid_t i, range;
orte_process_name_t dummy;
opal_output(mca_oob_base_output, "oob_xcast_mode: linear");
/* since we have to pack some additional info into the buffer to be
* sent to the daemons, we create a new buffer into which we will
* put the intermediate payload - i.e., the info that goes to the
@ -371,26 +436,35 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
*/
buf = OBJ_NEW(orte_buffer_t);
/* tell the daemon this is a message for its local procs */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* tell the daemon that no further routing required */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &mode, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* tell the daemon the jobid of the procs that are to receive the message */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* tell the daemon the tag where the message is to be sent */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &tag, 1, ORTE_RML_TAG))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
/* if this isn't intended for the daemon command tag, then we better
* tell the daemon to deliver it to the procs, and what job is supposed
* to get it - this occurs when a caller just wants to send something
* to all the procs in a job. In that use-case, the caller doesn't know
* anything about inserting daemon commands or what routing algo might
* be used, so we have to help them out a little. Functions that are
* sending commands to the daemons themselves are smart enough to know
* what they need to do.
*/
if (ORTE_RML_TAG_DAEMON != tag) {
command = ORTE_DAEMON_MESSAGE_LOCAL_PROCS;
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &tag, 1, ORTE_RML_TAG))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
}
/* copy the payload into the new buffer - this is non-destructive, so our
@ -402,7 +476,7 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
goto CLEANUP;
}
if (orte_oob_xcast_timing) {
if (orte_timing) {
opal_output(0, "xcast [%ld,%ld,%ld]: mode linear buffer size %ld",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
}
@ -444,7 +518,7 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
for (i=0; i < range; i++) {
if (ORTE_PROC_MY_NAME->vpid != i) { /* don't send to myself */
dummy.vpid = i;
if (0 > (rc = mca_oob_send_packed_nb(&dummy, buf, ORTE_RML_TAG_PLS_ORTED,
if (0 > (rc = mca_oob_send_packed_nb(&dummy, buf, ORTE_RML_TAG_ORTED_ROUTED,
0, mca_oob_xcast_send_cb, NULL))) {
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
@ -465,8 +539,7 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
/* cleanup */
CLEANUP:
OBJ_RELEASE(buf); /* done with this object */
OBJ_RELEASE(buf);
return rc;
}
@ -477,10 +550,12 @@ static int mca_oob_xcast_direct(orte_jobid_t job,
orte_std_cntr_t i;
int rc;
orte_process_name_t *peers=NULL;
orte_std_cntr_t n=0;
orte_std_cntr_t n;
opal_list_t attrs;
opal_list_item_t *item;
opal_output(mca_oob_base_output, "oob_xcast_mode: direct");
/* need to get the job peers so we know who to send the message to */
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_NS_USE_JOBID, ORTE_JOBID, &job, ORTE_RMGR_ATTR_OVERRIDE);
@ -493,11 +568,7 @@ static int mca_oob_xcast_direct(orte_jobid_t job,
OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
/* no need to re-pack the msg for sending - no routing info here as this message
* goes DIRECTLY to the processes
*/
if (orte_oob_xcast_timing) {
if (orte_timing) {
opal_output(0, "xcast [%ld,%ld,%ld]: mode direct buffer size %ld",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buffer->bytes_used);
}

View file

@ -53,10 +53,6 @@ extern "C" {
opal_mutex_t orted_cmd_lock;
/* orted cmd cond */
opal_condition_t orted_cmd_cond;
/** reuse daemons flag */
bool reuse_daemons;
/** request for timing measurement reports */
bool timing;
} orte_pls_base_t;
/**

View file

@ -84,7 +84,6 @@ void orte_pls_base_purge_mca_params(char ***env)
int orte_pls_base_orted_append_basic_args(int *argc, char ***argv,
int *proc_name_index,
int *node_name_index,
char *jobid_string,
orte_std_cntr_t num_procs)
{
char *param = NULL, *uri = NULL;
@ -95,10 +94,6 @@ int orte_pls_base_orted_append_basic_args(int *argc, char ***argv,
/* check for debug flags */
orte_pls_base_mca_argv(argc, argv);
/* Bootproxy */
opal_argv_append(argc, argv, "--bootproxy");
opal_argv_append(argc, argv, jobid_string);
/* Name */
if( NULL != proc_name_index ) {
opal_argv_append(argc, argv, "--name");

View file

@ -53,8 +53,6 @@ orte_pls_base_module_t orte_pls;
*/
int orte_pls_base_open(void)
{
int value;
/* Debugging / verbose output. Always have stream open, with
verbose set by the mca open system... */
orte_pls_base.pls_output = opal_output_open(NULL);
@ -66,26 +64,6 @@ int orte_pls_base_open(void)
OBJ_CONSTRUCT(&orte_pls_base.orted_cmd_lock, opal_mutex_t);
OBJ_CONSTRUCT(&orte_pls_base.orted_cmd_cond, opal_condition_t);
/* check for reuse of daemons */
mca_base_param_reg_int_name("pls", "base_reuse_daemons",
"If nonzero, reuse daemons to launch dynamically spawned processes. If zero, do not reuse daemons (default)",
false, false, (int)false, &value);
if (false == value) {
orte_pls_base.reuse_daemons = false;
} else {
orte_pls_base.reuse_daemons = true;
}
/* check for timing requests */
mca_base_param_reg_int_name("orte", "timing",
"Request that critical timing loops be measured",
false, false, 0, &value);
if (value != 0) {
orte_pls_base.timing = true;
} else {
orte_pls_base.timing = false;
}
/* Open up all the components that we can find */
if (ORTE_SUCCESS !=

View file

@ -41,143 +41,6 @@
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"
static orte_std_cntr_t orted_cmd_num_active;
static int completion_status;
static void orte_pls_base_orted_default_wakeup(int fd, short event, void *arg)
{
/* protect for threads */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
/* cancel the receive - we didn't get everyone's response in time */
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
/* set the completion status to reflect timeout error */
completion_status = ORTE_ERR_TIMEOUT;
/* declare us "done" so we can exit cleanly */
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
/* unlock us */
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
}
static void orte_pls_base_orted_send_cb(int status,
orte_process_name_t* peer,
orte_buffer_t* req,
orte_rml_tag_t tag,
void* cbdata)
{
/* nothing to do here - this just catches the callback when
* the send is received on the far end
*/
return;
}
static void orte_pls_base_cmd_ack(int status, orte_process_name_t* sender,
orte_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
int ret;
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
orted_cmd_num_active--;
if (orted_cmd_num_active == 0) {
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
} else {
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK,
ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL);
if (ret != ORTE_SUCCESS) {
ORTE_ERROR_LOG(ret);
return;
}
}
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
return;
}
static int send_cmd(orte_buffer_t *cmd, struct timeval *timeout)
{
opal_event_t* event = NULL;
orte_vpid_t i, range;
orte_process_name_t dummy;
int rc;
/* get the number of daemons out there */
orte_ns.get_vpid_range(0, &range);
/* send the commands as fast as we can */
dummy.cellid = ORTE_PROC_MY_NAME->cellid;
dummy.jobid = 0;
for (i=0; i < range; i++) {
if (ORTE_PROC_MY_NAME->vpid != i) { /* don't kill myself */
dummy.vpid = i;
if (0 > (rc = orte_rml.send_buffer_nb(&dummy, cmd, ORTE_RML_TAG_PLS_ORTED,
0, orte_pls_base_orted_send_cb, NULL))) {
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
} else {
/* be sure to protect the global variable */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
orted_cmd_num_active++;
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
}
}
}
/* post the receive for the ack's */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
if (orted_cmd_num_active > 0) {
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK,
ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL);
if (rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
return rc;
}
}
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
/* define the default completion status */
completion_status = ORTE_SUCCESS;
/* wait for all commands to have been received */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
if (orted_cmd_num_active > 0) {
/* setup a delay to give the orteds time to complete their departure - wake us up if they
* don't exit by the prescribed time
*/
if (NULL != timeout && /* only do this if the user gave us a time to wait */
NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL);
opal_evtimer_add(event, timeout);
}
/* now go to sleep until woken up */
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
}
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
/* log an error if one occurred */
if (ORTE_SUCCESS != completion_status) {
ORTE_ERROR_LOG(completion_status);
}
/* if started, kill the timer event so it doesn't hit us later */
if (NULL != event) {
opal_evtimer_del(event);
free(event);
}
/* we're done! */
return completion_status;
}
static int get_jobids(orte_jobid_t **jobs, orte_std_cntr_t *num_jobs, bool *allocated,
orte_jobid_t job, opal_list_t *attrs)
{
@ -214,28 +77,6 @@ static int get_jobids(orte_jobid_t **jobs, orte_std_cntr_t *num_jobs, bool *allo
return ORTE_SUCCESS;
}
int orte_pls_base_orted_cancel_operation(void)
{
/* protect for threads */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
/* cancel any waiting receive - we don't want to hear it */
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
/* set the completion status to reflect cancellation -- no need to
print anything */
completion_status = ORTE_ERR_SILENT;
/* declare us "done" so we can exit cleanly */
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
/* unlock us */
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
return ORTE_SUCCESS;
}
int orte_pls_base_orted_exit(struct timeval *timeout, opal_list_t *attrs)
{
int rc;
@ -264,7 +105,7 @@ int orte_pls_base_orted_exit(struct timeval *timeout, opal_list_t *attrs)
}
/* send it! */
if (ORTE_SUCCESS != (rc = send_cmd(&cmd, timeout))) {
if (ORTE_SUCCESS != (rc = orte_rml.xcast(0, &cmd, ORTE_RML_TAG_DAEMON))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&cmd);
@ -318,7 +159,7 @@ int orte_pls_base_orted_kill_local_procs(orte_jobid_t job, struct timeval *timeo
if (allocated) free(jobs); /* not needed any more */
/* send it! */
if (ORTE_SUCCESS != (rc = send_cmd(&cmd, timeout))) {
if (ORTE_SUCCESS != (rc = orte_rml.xcast(0, &cmd, ORTE_RML_TAG_DAEMON))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&cmd);
@ -381,7 +222,7 @@ int orte_pls_base_orted_signal_local_procs(orte_jobid_t job, int32_t signal, opa
}
/* send it! */
if (ORTE_SUCCESS != (rc = send_cmd(&cmd, NULL))) {
if (ORTE_SUCCESS != (rc = orte_rml.xcast(0, &cmd, ORTE_RML_TAG_DAEMON))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&cmd);
@ -389,37 +230,3 @@ int orte_pls_base_orted_signal_local_procs(orte_jobid_t job, int32_t signal, opa
/* we're done! */
return ORTE_SUCCESS;
}
int orte_pls_base_orted_add_local_procs(orte_gpr_notify_data_t *ndat)
{
int rc;
orte_buffer_t cmd;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_ADD_LOCAL_PROCS;
OPAL_TRACE(1);
/* pack the command */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&cmd);
return rc;
}
/* pack the launch data for the daemons */
if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &ndat, 1, ORTE_GPR_NOTIFY_DATA))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&cmd);
return rc;
}
/* send it! */
if (ORTE_SUCCESS != (rc = send_cmd(&cmd, NULL))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&cmd);
return rc;
}

View file

@ -259,13 +259,6 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
}
break;
case ORTE_PLS_CANCEL_OPERATION_CMD:
/* issue the command */
if (ORTE_SUCCESS != (rc = orte_pls.cancel_operation())) {
ORTE_ERROR_LOG(rc);
}
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
}

View file

@ -20,117 +20,56 @@
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/dss/dss.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/pls/base/pls_private.h"
/* Since we now send the "add_procs" command using xcast to all daemons
* as our standard launch procedure, all we need do for launching on
* existing daemons is correctly increment the launch counter so that
* trigger will fire and the launch message will be sent
*/
int orte_pls_base_launch_on_existing_daemons(orte_job_map_t *map)
{
orte_gpr_value_t **values; /* the gpr initializes this to NULL */
orte_gpr_keyval_t *kv;
orte_std_cntr_t cnt, i;
char *keys[] = {
ORTE_NODE_NAME_KEY,
orte_std_cntr_t num_reused;
orte_data_value_t dval = ORTE_DATA_VALUE_EMPTY;
char *trig_tokens[] = {
ORTE_JOB_GLOBALS,
NULL
};
char *to_launch_keys[] = {
ORTE_PROC_NUM_LAUNCHED,
NULL
};
opal_list_item_t *item2, *next;
orte_mapped_node_t *node;
orte_gpr_notify_data_t *ndat;
bool found;
char *nodename;
int rc;
/* check the number of new daemons vs the number of nodes in the job
* if num_new_daemons < num_nodes, then we are reusing some existing
* daemons and we need to increment the launch counter
*/
if (map->num_nodes == map->num_new_daemons) {
return ORTE_SUCCESS;
}
/* query the daemon info */
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
"orte-job-0", /* the daemon job segment */
NULL, /* all containers */
keys,
&cnt, &values))) {
/* compute the number of daemons that are being reused */
num_reused = map->num_nodes - map->num_new_daemons;
/* setup the arithmetic operand */
if (ORTE_SUCCESS != (rc = orte_dss.set(&dval, (void*)&num_reused, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if no daemons are around (except HNP), then don't worry about this */
if (cnt <= 1) {
rc = ORTE_SUCCESS;
goto CLEANUP;
}
/* get here if some daemons, other than HNP, exist
* go through the list, checking nodenames against what is in the
* map. If nodes match, then construct and send an appropriate command
* to that daemon to launch the local procs - remove that node structure
* from the map so that the main launcher doesn't also try to start procs
* on that node!
*/
found = false;
item2 = opal_list_get_first(&map->nodes);
while (item2 != opal_list_get_end(&map->nodes)) {
node = (orte_mapped_node_t*)item2;
/* save the next position in case we remove this one */
next = opal_list_get_next(item2);
/* check the returned values and see if the nodenames match */
for (i=0; i < cnt; i++) {
kv = values[i]->keyvals[0];
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&nodename, kv->value, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (0 == strcmp(node->nodename, nodename)) {
/* get the launch message only once - do it the first time
* through so all the nodes are still on the map!
*/
if (!found) {
if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(&ndat, map))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
return rc;
}
/* indicate that at least one daemon was found */
found = true;
}
/* procs on this node will be taken care of, so remove it from
* the map list so the main launcher won't try to launch them
*/
opal_list_remove_item(&map->nodes, item2);
OBJ_RELEASE(item2);
}
}
/* move to next position */
item2 = next;
}
if (!found) {
/* if no daemons were reused, then just return */
rc = ORTE_SUCCESS;
goto CLEANUP;
}
/* launch any procs that are using existing daemons */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_add_local_procs(ndat))) {
/* update the counter */
if (ORTE_SUCCESS != (rc = orte_gpr.arith(ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR,
"orte-job-0", trig_tokens, to_launch_keys,
ORTE_DSS_ADD, &dval))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OBJ_RELEASE(ndat);
CLEANUP:
for (i=0; i < cnt; i++) {
if (NULL != values[i]) OBJ_RELEASE(values[i]);
}
if (NULL != values) free(values);
return rc;
return ORTE_SUCCESS;
}
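Every launcher touched in this commit calls the routine above the same way before deciding how many daemons to start. A condensed sketch of that caller pattern (error paths trimmed; it mirrors the rsh/SLURM/TM hunks below):
/* Common caller pattern; "map" is the orte_job_map_t being launched */
int rc;
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
    ORTE_ERROR_LOG(rc);
    return rc;
}
if (0 == map->num_new_daemons) {
    return ORTE_SUCCESS;   /* all procs will run on reused daemons */
}
/* otherwise launch map->num_new_daemons new orteds, whose names start
 * at vpid map->daemon_vpid_start */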

View file

@ -56,7 +56,6 @@ typedef uint8_t orte_pls_cmd_flag_t;
#define ORTE_PLS_SIGNAL_JOB_CMD 4
#define ORTE_PLS_SIGNAL_PROC_CMD 5
#define ORTE_PLS_TERMINATE_ORTEDS_CMD 6
#define ORTE_PLS_CANCEL_OPERATION_CMD 7
/*
* object for daemon information
@ -116,7 +115,6 @@ typedef uint8_t orte_pls_cmd_flag_t;
char ***argv,
int *proc_name_index,
int *node_name_index,
char *jobid_string,
orte_std_cntr_t num_procs);
#if defined(c_plusplus) || defined(__cplusplus)

View file

@ -102,7 +102,6 @@ orte_pls_base_module_t orte_pls_bproc_module = {
orte_pls_bproc_terminate_proc,
orte_pls_bproc_signal_job,
orte_pls_bproc_signal_proc,
orte_pls_bproc_cancel_operation,
orte_pls_bproc_finalize
};
@ -442,6 +441,7 @@ static void orte_pls_bproc_setup_env(char *** env)
static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
int * daemon_list = NULL;
int num_daemons = 0;
int total_num_daemons = 0;
int rc, i;
int * pids = NULL;
int argc;
@ -450,14 +450,13 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
char * var;
int stride;
char * orted_path;
orte_vpid_t daemon_vpid_start;
orte_std_cntr_t idx;
struct stat buf;
struct timeval joblaunchstart, launchstart, launchstop;
opal_list_item_t* item;
OPAL_TRACE(1);
if (orte_pls_base.timing) {
if (orte_timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) {
opal_output(0, "pls_bproc: could not obtain start time");
}
@ -466,14 +465,34 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
/* indicate that the daemons have not completely launched yet */
daemons_launched = false;
/* get the number of nodes in this job and allocate an array for
/* get the total number of daemons involved in this job - we need
* this number because all of the daemons involved in the job
* are going to send us back a message indicating they have
* finished preparing their node for the arrival of the procs themselves.
* Since we are going to "hold" until all the messages have arrived,
* we need to know how many are coming
*/
total_num_daemons = map->num_nodes;
/* account for any reuse of daemons */
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* get the number of new daemons to be launched for this job and allocate an array for
* their names so we can pass that to bproc - populate the list
* with the node names
*/
num_daemons = map->num_nodes;
num_daemons = map->num_new_daemons;
if (0 == num_daemons) {
/* nothing to do */
return ORTE_SUCCESS;
/* nothing to do - but we still need to wait for all the
* existing daemons to report back if we are going to launch!
*/
if (mca_pls_bproc_component.do_not_launch) {
return ORTE_SUCCESS;
}
goto WAITFORCOMM;
}
if(NULL == (daemon_list = (int*)malloc(sizeof(int) * num_daemons))) {
@ -497,30 +516,9 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
goto cleanup;
}
/* allocate a range of vpids for the daemons */
rc = orte_ns.reserve_range(0, num_daemons, &daemon_vpid_start);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(map->job, num_daemons, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup the daemon environment */
orte_pls_bproc_setup_env(envp);
/* direct the daemons to drop contact files so the local procs
* can learn how to contact them - this is used for routing
* OOB messaging
*/
var = mca_base_param_environ_variable("odls","base","drop_contact_file");
opal_setenv(var,"1", true, envp);
free(var);
/* daemons calculate their process name using a "stride" of one, so
* push that value into their environment */
stride = 1;
@ -531,7 +529,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
free(var);
/* set up the base environment so the daemons can get their names once launched */
rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, 0, daemon_vpid_start,
rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, 0, map->daemon_vpid_start,
0, num_daemons, ORTE_VPID_INVALID, 1, envp);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
@ -548,11 +546,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
}
#endif
opal_argv_append(&argc, &argv, "--bootproxy");
orte_ns.convert_jobid_to_string(&param, map->job);
opal_argv_append(&argc, &argv, param);
free(param);
/* pass along the universe name and location info */
opal_argv_append(&argc, &argv, "--universe");
asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
@ -591,7 +584,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
}
/* launch the daemons */
if (orte_pls_base.timing) {
if (orte_timing) {
if (0 != gettimeofday(&launchstart, NULL)) {
opal_output(0, "pls_bproc: could not obtain start time");
}
@ -604,7 +597,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp);
}
if (orte_pls_base.timing) {
if (orte_timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "pls_bproc: could not obtain stop time");
} else {
@ -666,10 +659,11 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
}
}
WAITFORCOMM:
/* wait for communication back from the daemons, which indicates they have
* successfully set up the pty/pipes and IO forwarding which the user apps
* will use */
for(i = 0; i < num_daemons; i++) {
for(i = 0; i < total_num_daemons; i++) {
orte_buffer_t ack;
int src[4];
OBJ_CONSTRUCT(&ack, orte_buffer_t);
@ -706,7 +700,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
/* indicate that the daemons have now launched */
daemons_launched = true;
if (orte_pls_base.timing) {
if (orte_timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "pls_bproc: could not obtain stop time");
} else {
@ -1084,8 +1078,8 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
/* save the current working directory */
if (NULL == getcwd(cwd_save, sizeof(cwd_save))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
cwd_save[sizeof(cwd_save) - 1] = '\0';
@ -1110,18 +1104,21 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
have been specified as a relative path to the wdir */
rc = orte_rmgr.check_context_cwd(map->apps[i], true);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* Check that the app exists and is executable */
rc = orte_rmgr.check_context_app(map->apps[i]);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* Return to the original dir */
if (0 != chdir(cwd_save)) {
rc = ORTE_ERR_IN_ERRNO;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
@ -1341,22 +1338,6 @@ int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t sig
return ORTE_SUCCESS;
}
/**
* Cancel an operation involving comm to an orted
*/
int orte_pls_bproc_cancel_operation(void)
{
int rc;
OPAL_TRACE(1);
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/**
* Module cleanup

View file

@ -80,7 +80,6 @@ int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_bproc_terminate_orteds(struct timeval *timeout, opal_list_t*);
int orte_pls_bproc_signal_job(orte_jobid_t, int32_t, opal_list_t*);
int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t);
int orte_pls_bproc_cancel_operation(void);
/* Utility routine to get/set process pid */
ORTE_DECLSPEC int orte_pls_bproc_set_proc_pid(const orte_process_name_t*, pid_t, int);

View file

@ -123,7 +123,6 @@ int orte_pls_gridengine_terminate_orteds(struct timeval *timeout, opal_list_t *a
int orte_pls_gridengine_terminate_proc(const orte_process_name_t*);
int orte_pls_gridengine_signal_job(orte_jobid_t, int32_t, opal_list_t *attrs);
int orte_pls_gridengine_signal_proc(const orte_process_name_t*, int32_t);
int orte_pls_gridengine_cancel_operation(void);
/**
* PLS Component

View file

@ -74,6 +74,7 @@
#include "opal/util/output.h"
#include "opal/util/basename.h"
#include "orte/runtime/params.h"
#include "orte/util/univ_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/sys_info.h"
@ -99,7 +100,6 @@ orte_pls_base_module_t orte_pls_gridengine_module = {
orte_pls_gridengine_terminate_proc,
orte_pls_gridengine_signal_job,
orte_pls_gridengine_signal_proc,
orte_pls_gridengine_cancel_operation,
orte_pls_gridengine_finalize
};
@ -227,16 +227,6 @@ int orte_pls_gridengine_launch_job(orte_jobid_t jobid)
goto cleanup;
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
if (num_nodes == 0) {
/* job must have been launched on existing daemons - just return */
@ -245,27 +235,6 @@ int orte_pls_gridengine_launch_job(orte_jobid_t jobid)
goto cleanup;
}
/*
* Allocate a range of vpids for the daemons.
*/
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* need integer value for command line parameter */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/*
* Build argv array
*/
@ -293,7 +262,6 @@ int orte_pls_gridengine_launch_job(orte_jobid_t jobid)
orte_pls_base_orted_append_basic_args(&argc, &argv,
&proc_name_index,
&node_name_index2,
jobid_string,
(vpid + num_nodes)
);
@ -784,20 +752,6 @@ int orte_pls_gridengine_signal_proc(const orte_process_name_t* proc, int32_t sig
return ORTE_ERR_NOT_IMPLEMENTED;
}
/**
* Cancel an operation involving comm to an orted
*/
int orte_pls_gridengine_cancel_operation(void)
{
int rc;
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/**
* Finalize

View file

@ -225,11 +225,6 @@ typedef int (*orte_pls_base_module_signal_job_fn_t)(orte_jobid_t, int32_t, opal_
*/
typedef int (*orte_pls_base_module_signal_proc_fn_t)(const orte_process_name_t*, int32_t);
/**
* Cancel an ongoing operation involving communication to the orteds
*/
typedef int (*orte_pls_base_module_cancel_operation_fn_t)(void);
/**
* Cleanup all resources held by the module
*/
@ -245,7 +240,6 @@ struct orte_pls_base_module_1_3_0_t {
orte_pls_base_module_terminate_proc_fn_t terminate_proc;
orte_pls_base_module_signal_job_fn_t signal_job;
orte_pls_base_module_signal_proc_fn_t signal_proc;
orte_pls_base_module_cancel_operation_fn_t cancel_operation;
orte_pls_base_module_finalize_fn_t finalize;
};

View file

@ -498,64 +498,3 @@ int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal)
OBJ_RELEASE(answer);
return ORTE_SUCCESS;
}
int orte_pls_proxy_cancel_operation(void)
{
orte_buffer_t* cmd;
orte_buffer_t* answer;
orte_pls_cmd_flag_t command, ret_cmd;
orte_std_cntr_t count;
int rc;
OPAL_TRACE(1);
command = ORTE_PLS_CANCEL_OPERATION_CMD;
cmd = OBJ_NEW(orte_buffer_t);
if (cmd == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_PLS_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_pls_proxy_replica, cmd, ORTE_RML_TAG_PLS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(cmd);
answer = OBJ_NEW(orte_buffer_t);
if(answer == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_pls_proxy_replica, answer, ORTE_RML_TAG_PLS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ret_cmd, &count, ORTE_PLS_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
if (ret_cmd != command) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(answer);
return ORTE_SUCCESS;
}

View file

@ -57,7 +57,6 @@ int orte_pls_proxy_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
int orte_pls_proxy_terminate_proc(const orte_process_name_t* name);
int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs);
int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal);
int orte_pls_proxy_cancel_operation(void);
#if defined(c_plusplus) || defined(__cplusplus)

View file

@ -66,7 +66,6 @@ static orte_pls_base_module_t orte_pls_proxy_module = {
orte_pls_proxy_terminate_proc,
orte_pls_proxy_signal_job,
orte_pls_proxy_signal_proc,
orte_pls_proxy_cancel_operation,
orte_pls_proxy_finalize
};

View file

@ -59,7 +59,6 @@ int orte_pls_rsh_terminate_orteds(struct timeval *timeout, opal_list_t*);
int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_rsh_signal_job(orte_jobid_t, int32_t, opal_list_t*);
int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t);
int orte_pls_rsh_cancel_operation(void);
/**
* PLS Component

View file

@ -77,6 +77,7 @@
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/params.h"
#include "orte/dss/dss.h"
#include "orte/mca/ns/ns.h"
@ -108,7 +109,6 @@ orte_pls_base_module_t orte_pls_rsh_module = {
orte_pls_rsh_terminate_proc,
orte_pls_rsh_signal_job,
orte_pls_rsh_signal_proc,
orte_pls_rsh_cancel_operation,
orte_pls_rsh_finalize
};
@ -384,19 +384,15 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
goto cleanup;
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* account for any reuse of daemons */
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
num_nodes = map->num_new_daemons;
if (0 == num_nodes) {
/* nothing left to do - just return */
/* nothing to do - just return */
failed_launch = false;
rc = ORTE_SUCCESS;
goto cleanup;
@ -442,32 +438,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
*/
prefix_dir = map->apps[0]->prefix_dir;
/*
* Allocate a range of vpids for the daemons.
*/
if (num_nodes == 0) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
rc = ORTE_ERR_BAD_PARAM;
goto cleanup;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* need integer value for command line parameter */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* What is our local shell? */
p = getpwuid(getuid());
if( NULL == p ) {
@ -564,7 +534,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
orte_pls_base_orted_append_basic_args(&argc, &argv,
&proc_name_index,
&node_name_index2,
jobid_string,
(vpid + num_nodes));
local_exec_index_end = argc;
@ -614,13 +583,17 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
for(n_item = opal_list_get_first(&map->nodes);
n_item != opal_list_get_end(&map->nodes);
n_item = opal_list_get_next(n_item)) {
orte_process_name_t* name;
pid_t pid;
char *exec_path;
char **exec_argv;
rmaps_node = (orte_mapped_node_t*)n_item;
/* if this daemon already exists, don't launch it! */
if (rmaps_node->daemon_preexists) {
continue;
}
/* setup node name */
free(argv[node_name_index1]);
if (NULL != rmaps_node->username &&
@ -634,13 +607,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
free(argv[node_name_index2]);
argv[node_name_index2] = strdup(rmaps_node->nodename);
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, rmaps_node->cell, 0, vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* fork a child to exec the rsh/ssh session */
pid = fork();
@ -759,7 +725,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
/* Since this is a local execution, we need to
potentially whack the final ")" in the argv (if
sh/csh conditionals, from above). Note that we're
modifying the argv[] in the child process, so
there's no need to save this and restore it
afterward -- the parent's argv[] is unmodified. */
@ -845,9 +811,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
/* setup process name */
rc = orte_ns.get_proc_name_string(&name_string, name);
rc = orte_ns.get_proc_name_string(&name_string, rmaps_node->daemon);
if (ORTE_SUCCESS != rc) {
opal_output(0, "orte_pls_rsh: unable to create process name");
opal_output(0, "orte_pls_rsh: unable to get daemon name as string");
exit(-1);
}
free(argv[proc_name_index]);
@ -906,6 +872,12 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
exit(-1);
} else { /* father */
/* indicate this daemon has been launched in case anyone is sitting on that trigger */
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(rmaps_node->daemon, ORTE_PROC_STATE_LAUNCHED, 0))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
/* This situation can lead to a deadlock if '--debug-daemons' is set.
* However, the deadlock condition is tested at the beginning of this
@ -928,7 +900,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
vpid++;
}
free(name);
}
/* get here if launch went okay */
failed_launch = false;
@ -1026,23 +997,6 @@ int orte_pls_rsh_signal_proc(const orte_process_name_t* proc, int32_t signal)
return ORTE_ERR_NOT_IMPLEMENTED;
}
/**
* Cancel an operation involving comm to an orted
*/
int orte_pls_rsh_cancel_operation(void)
{
int rc;
OPAL_TRACE(1);
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
int orte_pls_rsh_finalize(void)
{
int rc;

View file

@ -84,7 +84,6 @@ static int pls_slurm_terminate_proc(const orte_process_name_t *name);
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal);
static int pls_slurm_finalize(void);
static int pls_slurm_cancel_operation(void);
static int pls_slurm_start_proc(int argc, char **argv, char **env,
char *prefix);
@ -100,7 +99,6 @@ orte_pls_base_module_1_3_0_t orte_pls_slurm_module = {
pls_slurm_terminate_proc,
pls_slurm_signal_job,
pls_slurm_signal_proc,
pls_slurm_cancel_operation,
pls_slurm_finalize
};
@ -120,7 +118,6 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
orte_job_map_t *map = NULL;
opal_list_item_t *item;
size_t num_nodes;
orte_vpid_t vpid;
char *jobid_string = NULL;
char *param;
char **argv = NULL;
@ -132,7 +129,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
char *nodelist_flat;
char **nodelist_argv;
int nodelist_argc;
orte_process_name_t* name;
orte_process_name_t name;
char *name_string;
char **custom_strings;
int num_args, i;
@ -161,40 +158,21 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* account for any reuse of daemons */
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/*
* Allocate a range of vpids for the daemons.
*/
num_nodes = opal_list_get_size(&map->nodes);
num_nodes = map->num_new_daemons;
if (num_nodes == 0) {
/* nothing further to do - job must have been launched
* on existing daemons, so we can just return
*/
OBJ_RELEASE(map);
return ORTE_SUCCESS;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
/* nothing to do - just return */
failed_launch = false;
rc = ORTE_SUCCESS;
goto cleanup;
}
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* need integer value for command line parameter */
asprintf(&jobid_string, "%lu", (unsigned long) jobid);
@ -238,6 +216,16 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
item = opal_list_get_next(item)) {
orte_mapped_node_t* node = (orte_mapped_node_t*)item;
/* if the daemon already exists on this node, then
* don't include it
*/
if (node->daemon_preexists) {
continue;
}
/* otherwise, add it to the list of nodes upon which
* we need to launch a daemon
*/
opal_argv_append(&nodelist_argc, &nodelist_argv, node->nodename);
}
nodelist_flat = opal_argv_join(nodelist_argv, ',');
@ -259,7 +247,6 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
orte_pls_base_orted_append_basic_args(&argc, &argv,
&proc_name_index,
NULL,
jobid_string,
num_nodes
);
@ -267,20 +254,17 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
opal_argv_append(&argc, &argv, "--ns-nds");
opal_argv_append(&argc, &argv, "slurm");
/* set orte process name to be the base of the name list for the daemons */
rc = orte_ns.create_process_name(&name,
orte_process_info.my_name->cellid,
0, vpid);
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end
*/
name.cellid = ORTE_PROC_MY_NAME->cellid;
name.jobid = 0;
name.vpid = map->daemon_vpid_start;
rc = orte_ns.get_proc_name_string(&name_string, &name);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
opal_output(0, "pls_slurm: unable to create process name");
goto cleanup;
}
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
opal_output(0, "orte_pls_rsh: unable to create process name");
goto cleanup;
}
free(name);
free(argv[proc_name_index]);
argv[proc_name_index] = strdup(name_string);
@ -342,7 +326,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
}
/* exec the daemon */
/* exec the daemon(s) */
if (ORTE_SUCCESS != (rc = pls_slurm_start_proc(argc, argv, env, cur_prefix))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -476,21 +460,6 @@ static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal
}
/**
* Cancel an operation involving comm to an orted
*/
static int pls_slurm_cancel_operation(void)
{
int rc;
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
static int pls_slurm_finalize(void)
{
int rc;

View file

@ -87,7 +87,6 @@ static int pls_tm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_tm_terminate_proc(const orte_process_name_t *name);
static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal);
static int pls_tm_cancel_operation(void);
static int pls_tm_finalize(void);
static int pls_tm_connect(void);
@ -108,7 +107,6 @@ orte_pls_base_module_t orte_pls_tm_module = {
pls_tm_terminate_proc,
pls_tm_signal_job,
pls_tm_signal_proc,
pls_tm_cancel_operation,
pls_tm_finalize
};
@ -124,7 +122,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
orte_vpid_t vpid;
int node_name_index;
int proc_name_index;
char *jobid_string;
char *param;
char **env = NULL;
char *var;
@ -178,35 +175,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
}
}
/* if the user requested that we re-use daemons,
* launch the procs on any existing, re-usable daemons
*/
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
num_nodes = opal_list_get_size(&map->nodes);
if (0 == num_nodes) {
/* must have been launched on existing daemons - just return */
OBJ_RELEASE(map);
return ORTE_SUCCESS;
}
/*
* Allocate a range of vpids for the daemons.
*/
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
/* account for any reuse of daemons */
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
num_nodes = map->num_new_daemons;
if (0 == num_nodes) {
/* must have been launched on existing daemons - just return */
failed_launch = false;
rc = ORTE_SUCCESS;
goto cleanup;
}
@ -224,9 +203,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
goto cleanup;
}
/* need integer value for command line parameter */
asprintf(&jobid_string, "%lu", (unsigned long) jobid);
/* add the daemon command (as specified by user) */
argv = opal_argv_split(mca_pls_tm_component.orted, ' ');
argc = opal_argv_count(argv);
@ -237,9 +213,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
orte_pls_base_orted_append_basic_args(&argc, &argv,
&proc_name_index,
&node_name_index,
jobid_string,
(vpid + num_nodes)
);
(vpid + num_nodes));
if (mca_pls_tm_component.debug) {
param = opal_argv_join(argv, ' ');
@ -555,21 +529,6 @@ static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal)
}
/**
* Cancel an operation involving comm to an orted
*/
static int pls_tm_cancel_operation(void)
{
int rc;
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/*
* Free stuff
*/

View file

@ -89,6 +89,9 @@ int orte_rmaps_base_copy_map(orte_job_map_t **dest, orte_job_map_t *src, orte_da
opal_list_append(&((*dest)->nodes), &nodeptr->super);
}
(*dest)->num_new_daemons = src->num_new_daemons;
(*dest)->daemon_vpid_start = src->daemon_vpid_start;
return ORTE_SUCCESS;
}
@ -163,7 +166,8 @@ int orte_rmaps_base_copy_mapped_node(orte_mapped_node_t **dest, orte_mapped_node
return rc;
}
}
(*dest)->daemon_preexists = src->daemon_preexists;
(*dest)->oversubscribed = src->oversubscribed;
(*dest)->num_procs = src->num_procs;

View file

@ -101,6 +101,19 @@ int orte_rmaps_base_pack_map(orte_buffer_t *buffer, void *src,
}
}
}
/* pack the number of new daemons */
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(maps[i]->num_new_daemons), 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the daemon starting vpid */
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(maps[i]->daemon_vpid_start), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
return ORTE_SUCCESS;
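The DSS consumes fields in pack order, so the unpack side (a later hunk in this commit) must mirror this sequence exactly. A minimal pairing sketch, error checks elided:
/* pack side, as above: */
orte_dss_pack_buffer(buffer, &(map->num_new_daemons), 1, ORTE_STD_CNTR);
orte_dss_pack_buffer(buffer, &(map->daemon_vpid_start), 1, ORTE_VPID);
/* matching unpack side: */
orte_std_cntr_t n = 1;
orte_dss_unpack_buffer(buffer, &(map->num_new_daemons), &n, ORTE_STD_CNTR);
n = 1;
orte_dss_unpack_buffer(buffer, &(map->daemon_vpid_start), &n, ORTE_VPID);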
@ -203,6 +216,12 @@ int orte_rmaps_base_pack_mapped_node(orte_buffer_t *buffer, void *src,
return rc;
}
/* pack the daemon_preexists flag */
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(nodes[i]->daemon_preexists), 1, ORTE_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the oversubscribed flag */
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, &(nodes[i]->oversubscribed), 1, ORTE_BOOL))) {
ORTE_ERROR_LOG(rc);

View file

@ -90,8 +90,12 @@ int orte_rmaps_base_print_map(char **output, char *prefix, orte_job_map_t *src,
tmp = tmp3;
}
asprintf(&tmp2, "%s\n%sNum new daemons: %ld\tNew daemon starting vpid %ld", tmp, pfx,
(long)src->num_new_daemons, (long)src->daemon_vpid_start);
free(tmp);
/* set the return */
*output = tmp;
*output = tmp2;
free(pfx);
return ORTE_SUCCESS;
@ -173,8 +177,8 @@ int orte_rmaps_base_print_mapped_node(char **output, char *prefix, orte_mapped_n
return rc;
}
asprintf(&tmp3, "%s\n\t%s\n%sOversubscribed: %s\tNum procs from this job on node: %ld", tmp, tmp2, pfx,
(src->oversubscribed ? "True" : "False"), (long)src->num_procs);
asprintf(&tmp3, "%s\n\t%s\n%s\tPreexists: %s\n%sOversubscribed: %s\tNum procs from this job on node: %ld", tmp, tmp2, pfx,
(src->daemon_preexists ? "True" : "False"), pfx, (src->oversubscribed ? "True" : "False"), (long)src->num_procs);
free(tmp);
free(tmp2);

View file

@ -122,6 +122,21 @@ int orte_rmaps_base_unpack_map(orte_buffer_t *buffer, void *dest,
}
opal_list_append(&(maps[i]->nodes), &node->super);
}
/* unpack the number of daemons to be created */
n = 1;
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &(maps[i]->num_new_daemons), &n, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the starting vpid of the new daemons */
n = 1;
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, &(maps[i]->daemon_vpid_start), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
return ORTE_SUCCESS;
@ -253,6 +268,14 @@ int orte_rmaps_base_unpack_mapped_node(orte_buffer_t *buffer, void *dest,
return rc;
}
/* unpack the daemon_preexists flag */
n = 1;
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,
&(nodes[i]->daemon_preexists), &n, ORTE_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the oversubscribed flag */
n = 1;
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,

View file

@ -43,6 +43,8 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
{
orte_job_map_t *mapping;
orte_mapped_proc_t *proc;
orte_mapped_node_t *node;
opal_list_item_t *item;
orte_cellid_t *cellptr, cell=ORTE_CELLID_INVALID;
orte_vpid_t *vptr;
orte_std_cntr_t *sptr;
@ -50,12 +52,12 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
pid_t *pidptr;
orte_process_name_t *pptr;
int32_t *i32, launch_id;
char *segment;
char *segment=NULL;
char *node_name=NULL;
char *username=NULL;
orte_gpr_value_t **values, *value;
orte_gpr_value_t **values=NULL, **dvalues=NULL, *value;
orte_gpr_keyval_t* keyval;
orte_std_cntr_t v, kv, num_values;
orte_std_cntr_t v, kv, num_values=0, num_dvalues=0;
int rc;
char* keys[] = {
ORTE_PROC_RANK_KEY,
@ -70,6 +72,8 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
ORTE_JOB_VPID_START_KEY,
ORTE_JOB_VPID_RANGE_KEY,
ORTE_JOB_MAPPING_MODE_KEY,
ORTE_JOB_NUM_NEW_DAEMONS_KEY,
ORTE_JOB_DAEMON_VPID_START_KEY,
#if OPAL_ENABLE_FT == 1
ORTE_PROC_CKPT_STATE_KEY,
ORTE_PROC_CKPT_SNAPSHOT_REF_KEY,
@ -77,7 +81,11 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
#endif
NULL
};
char* dkeys[] = {
ORTE_PROC_NAME_KEY,
ORTE_NODE_NAME_KEY,
NULL
};
OPAL_TRACE(1);
/* define default answer */
@ -107,21 +115,16 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
}
/* query the process list from the registry */
rc = orte_gpr.get(
ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment,
NULL,
keys,
&num_values,
&values);
if(ORTE_SUCCESS != rc) {
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment,
NULL,
keys,
&num_values,
&values))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(mapping);
free(segment);
return rc;
goto cleanup;
}
free(segment);
/* build the node and proc lists. each value corresponds
* to a process in the map
@ -158,6 +161,22 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
}
continue;
}
if(strcmp(value->keyvals[kv]->key, ORTE_JOB_NUM_NEW_DAEMONS_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, value->keyvals[kv]->value, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
mapping->num_new_daemons = *sptr;
continue;
}
if(strcmp(value->keyvals[kv]->key, ORTE_JOB_DAEMON_VPID_START_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, value->keyvals[kv]->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
mapping->daemon_vpid_start = *vptr;
continue;
}
}
}
@ -284,6 +303,55 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
}
}
/* query the daemon info from the registry */
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
"orte-job-0",
NULL,
dkeys,
&num_dvalues,
&dvalues))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* process the results, storing info in the mapped_node objects */
for(v=0; v<num_dvalues; v++) {
value = dvalues[v];
node_name = NULL;
for(kv = 0; kv<value->cnt; kv++) {
keyval = value->keyvals[kv];
if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
/* use the dss.copy function here to protect us against zero-length strings */
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&node_name, keyval->value->data, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
continue;
}
if (strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pptr, keyval->value, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
continue;
}
}
if (NULL == node_name) continue;
/* find this node on the map */
for (item = opal_list_get_first(&mapping->nodes);
item != opal_list_get_end(&mapping->nodes);
item = opal_list_get_next(item)) {
node = (orte_mapped_node_t*)item;
if (strcmp(node->nodename, node_name) == 0) {
/* got it! store the daemon name here */
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&node->daemon, pptr, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
}
}
/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(mapping, mapping->vpid_range))) {
ORTE_ERROR_LOG(rc);
@ -294,17 +362,24 @@ int orte_rmaps_base_get_job_map(orte_job_map_t **map, orte_jobid_t jobid)
/* all done */
*map = mapping;
rc = ORTE_SUCCESS;
cleanup:
if(rc != ORTE_SUCCESS) {
OBJ_RELEASE(mapping);
}
if (NULL != segment) free(segment);
for (v=0; v < num_values; v++) {
OBJ_RELEASE(values[v]);
}
if (NULL != values) free(values);
for (v=0; v < num_dvalues; v++) {
OBJ_RELEASE(dvalues[v]);
}
if (NULL != dvalues) free(dvalues);
return rc;
}
@ -353,13 +428,13 @@ int orte_rmaps_base_get_node_map(orte_mapped_node_t **node, orte_cellid_t cell,
int orte_rmaps_base_put_job_map(orte_job_map_t *map)
{
orte_std_cntr_t i, j;
orte_std_cntr_t index=0;
orte_std_cntr_t num_procs = 0;
orte_std_cntr_t i;
orte_std_cntr_t index;
orte_std_cntr_t num_procs = 0, num_vals;
int rc = ORTE_SUCCESS;
opal_list_item_t *item, *item2;
orte_gpr_value_t **values, *value;
char *segment;
orte_gpr_value_t **values=NULL, *value;
char *segment=NULL;
orte_mapped_node_t *node;
orte_mapped_proc_t *proc;
orte_proc_state_t proc_state=ORTE_PROC_STATE_INIT;
@ -370,7 +445,7 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
node = (orte_mapped_node_t*)item;
num_procs += (orte_std_cntr_t)opal_list_get_size(&node->procs);
num_procs += node->num_procs;
}
if(num_procs == 0) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
@ -378,38 +453,39 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
}
/**
* allocate value array. We need to reserve one extra spot so we can set the counter
* for the process INIT state to indicate that all procs are at that state. This will
* allow the INIT trigger to fire.
* allocate value array. We need enough spots to allow us to store all of the
* proc info on the job segment and all of the daemon info on the job-0 segment.
* In addition, we need to reserve one extra spot so we can set the counter
* on the job segment for the process INIT state (indicating that all procs
* are at that state, which allows the INIT trigger to fire) and store some
* map-level information such as the number of new daemons
*/
values = (orte_gpr_value_t**)malloc((1+num_procs) * sizeof(orte_gpr_value_t*));
num_vals = 1+num_procs+map->num_nodes;
values = (orte_gpr_value_t**)malloc(num_vals * sizeof(orte_gpr_value_t*));
if(NULL == values) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* let's deal with the procs first - start by getting their job segment name */
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, map->job))) {
ORTE_ERROR_LOG(rc);
free(values);
return rc;
goto cleanup;
}
/** preallocate the appropriate number of containers on the segment */
/** preallocate the appropriate number of containers on that segment */
if (ORTE_SUCCESS != (rc = orte_gpr.preallocate_segment(segment, num_procs + 1))) {
ORTE_ERROR_LOG(rc);
free(values);
return rc;
goto cleanup;
}
/** setup the last value in the array to store the vpid start/range and update the INIT counter */
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[num_procs]),
ORTE_GPR_OVERWRITE|ORTE_GPR_TOKENS_AND,
segment, 4, 1))) {
segment, 6, 1))) {
ORTE_ERROR_LOG(rc);
free(values);
free(segment);
return rc;
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[num_procs]->keyvals[0]), ORTE_PROC_NUM_AT_INIT, ORTE_STD_CNTR, &num_procs))) {
ORTE_ERROR_LOG(rc);
@ -427,6 +503,14 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[num_procs]->keyvals[4]), ORTE_JOB_NUM_NEW_DAEMONS_KEY, ORTE_STD_CNTR, &map->num_new_daemons))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[num_procs]->keyvals[5]), ORTE_JOB_DAEMON_VPID_START_KEY, ORTE_VPID, &map->daemon_vpid_start))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
values[num_procs]->tokens[0] = strdup(ORTE_JOB_GLOBALS); /* counter is in the job's globals container */
@ -440,16 +524,12 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
#endif
0))) {
ORTE_ERROR_LOG(rc);
for(j=0; j<i; j++) {
OBJ_RELEASE(values[j]);
}
free(values);
free(segment);
return rc;
goto cleanup;
}
}
/* iterate through all processes and initialize value array */
index = 0;
for(item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
@ -557,13 +637,60 @@ int orte_rmaps_base_put_job_map(orte_job_map_t *map)
}
}
/* now let's deal with the daemons. We know this info goes onto the job-0 segment, so
* let's begin by preallocating the appropriate number of containers on that segment
* to make sure it is just big enough
*/
if (ORTE_SUCCESS != (rc = orte_gpr.preallocate_segment("orte-job-0", map->num_nodes + 1))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* now iterate through the nodes and create a value object for each daemon
* being sure to start from the right place in the array
*/
i = num_procs+1;
for(item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
node = (orte_mapped_node_t*)item;
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]),
ORTE_GPR_OVERWRITE|ORTE_GPR_TOKENS_AND,
"orte-job-0", 2,
0))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* store the node name and the daemon's name */
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[0]), ORTE_NODE_NAME_KEY, ORTE_STRING, node->nodename))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[1]), ORTE_PROC_NAME_KEY, ORTE_NAME, node->daemon))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set the tokens */
if (ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->daemon))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* move to next position */
++i;
}
/* insert all values in one call */
if (ORTE_SUCCESS != (rc = orte_gpr.put((1+num_procs), values))) {
if (ORTE_SUCCESS != (rc = orte_gpr.put(num_vals, values))) {
ORTE_ERROR_LOG(rc);
}
cleanup:
for(i=0; i<=num_procs; i++) {
for(i=0; i < num_vals; i++) {
if(NULL != values[i]) {
OBJ_RELEASE(values[i]);
}
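For orientation, the values[] array assembled by this function lays out as follows (a sketch of the indexing implied by the allocation comment above):
/* values[0 .. num_procs-1]          one entry per mapped proc (job segment)
 * values[num_procs]                 ORTE_JOB_GLOBALS entry: INIT counter,
 *                                   vpid start/range, num_new_daemons,
 *                                   daemon_vpid_start
 * values[num_procs+1 .. num_vals-1] one entry per daemon ("orte-job-0")
 * where num_vals = 1 + num_procs + map->num_nodes, all stored by the
 * single orte_gpr.put(num_vals, values) call. */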

View file

@ -29,6 +29,7 @@
#include "opal/util/if.h"
#include "opal/util/show_help.h"
#include "orte/runtime/params.h"
#include "orte/util/sys_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/smr/smr_types.h"
@ -502,3 +503,119 @@ int orte_rmaps_base_compute_usage(orte_job_map_t *map, orte_std_cntr_t num_procs
return ORTE_SUCCESS;
}
int orte_rmaps_base_define_daemons(orte_job_map_t *map)
{
opal_list_item_t *item;
orte_mapped_node_t *node;
orte_vpid_t vpid;
orte_std_cntr_t num_daemons;
char* dkeys[] = {
ORTE_PROC_NAME_KEY,
ORTE_NODE_NAME_KEY,
NULL
};
orte_gpr_value_t **dvalues=NULL, *value;
orte_gpr_keyval_t* keyval;
orte_std_cntr_t v, kv, num_dvalues=0;
char *node_name;
orte_process_name_t *pptr;
int rc;
/* save the default number of daemons we will need */
num_daemons = map->num_nodes;
/* with the new launch system based on xcast messages to all daemons,
* there is no choice but to reuse existing daemons for dynamic spawns
*/
/* get the current list of daemons off of the registry */
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
"orte-job-0",
NULL,
dkeys,
&num_dvalues,
&dvalues))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* process the results, storing info in the mapped_node objects */
for(v=0; v<num_dvalues; v++) {
value = dvalues[v];
node_name = NULL;
for(kv = 0; kv<value->cnt; kv++) {
keyval = value->keyvals[kv];
if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
/* use the dss.copy function here to protect us against zero-length strings */
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&node_name, keyval->value->data, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
continue;
}
if (strcmp(keyval->key, ORTE_PROC_NAME_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&pptr, keyval->value, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
return rc;
}
continue;
}
}
if (NULL == node_name) continue;
/* find this node on the map */
for (item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
node = (orte_mapped_node_t*)item;
if (strcmp(node->nodename, node_name) == 0) {
/* got it! store the daemon name here */
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&node->daemon, pptr, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* flag that this daemon already exists */
node->daemon_preexists = true;
/* decrease the number of daemons we will need to start */
--num_daemons;
}
}
}
/* do we need to create any new ones? */
if (0 >= num_daemons) {
/* nope - we are done! */
return ORTE_SUCCESS;
}
/* get a vpid range for the daemons still to be created */
if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(0, num_daemons, &vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* store the info in the map */
map->num_new_daemons = num_daemons;
map->daemon_vpid_start = vpid;
/* for each node being used by this job... */
for (item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
node = (orte_mapped_node_t*)item;
/* if the daemon already exists...do nothing */
if (node->daemon_preexists) continue;
/* otherwise, create the daemon's process name and store it on the mapped_node... */
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&node->daemon, ORTE_PROC_MY_NAME->cellid,
0, vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* ...and increment the vpid for the next one */
++vpid;
}
return ORTE_SUCCESS;
}
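A hedged usage sketch of the new routine (the round-robin mapper hunk below makes exactly this call; the node counts are illustrative):
/* Illustrative: a 3-node map where one node already hosts a daemon */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(map))) {
    ORTE_ERROR_LOG(rc);
    return rc;
}
/* afterwards:
 *   - the reused node has daemon_preexists == true plus its existing name
 *   - map->num_new_daemons == 2
 *   - map->daemon_vpid_start == first vpid from orte_ns.reserve_range()
 *   - the two remaining nodes carry freshly created daemon names */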

View file

@ -100,6 +100,7 @@ static void orte_rmaps_mapped_node_construct(orte_mapped_node_t* node)
node->launch_id = -1;
node->username = NULL;
node->daemon = NULL;
node->daemon_preexists = false;
node->oversubscribed = false;
node->num_procs = 0;
OBJ_CONSTRUCT(&node->procs, opal_list_t);
@ -145,6 +146,8 @@ static void orte_rmaps_job_map_construct(orte_job_map_t* map)
map->num_apps = 0;
map->apps = NULL;
map->num_nodes = 0;
map->num_new_daemons = 0;
map->daemon_vpid_start = ORTE_VPID_INVALID;
OBJ_CONSTRUCT(&map->nodes, opal_list_t);
}

View file

@ -181,6 +181,7 @@ ORTE_DECLSPEC int orte_rmaps_base_proxy_map_job(orte_jobid_t job, opal_list_t *a
ORTE_DECLSPEC int orte_rmaps_base_compute_usage(orte_job_map_t *map, orte_std_cntr_t num_procs);
ORTE_DECLSPEC int orte_rmaps_base_define_daemons(orte_job_map_t *map);
/** Local data type functions */
void orte_rmaps_base_std_obj_release(orte_data_value_t *value);

View file

@ -77,6 +77,7 @@ struct orte_mapped_node_t {
orte_process_name_t *daemon; /* name of the daemon on this node
* NULL => daemon not assigned yet
*/
bool daemon_preexists; /* whether or not the daemon already exists */
bool oversubscribed; /* whether or not the #procs > #process slots on this node */
orte_std_cntr_t num_procs; /* #procs on this node - just the length of the procs list, but
* stored here so we don't have to keep recomputing it elsewhere
@ -101,6 +102,8 @@ struct orte_job_map_t {
orte_std_cntr_t num_nodes; /* #nodes in this map - just the length of the nodes list, but
* stored here so we don't have to keep recomputing it elsewhere
*/
orte_std_cntr_t num_new_daemons;
orte_vpid_t daemon_vpid_start;
opal_list_t nodes; /* list of mapped_node_t */
};
typedef struct orte_job_map_t orte_job_map_t;

View file

@ -708,6 +708,12 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
goto cleanup;
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(map))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* save mapping to the registry */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_put_job_map(map))) {
ORTE_ERROR_LOG(rc);

View file

@ -241,6 +241,21 @@ CLEANUP_SPAWN:
}
break;
case ORTE_RMGR_SETUP_ORTED_GATES_CMD:
/* get the jobid */
count = 1;
if(ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto SEND_ANSWER;
}
/* setup the stage gates */
if (ORTE_SUCCESS != (rc = orte_rmgr_base_orted_stage_gate_init(job))) {
ORTE_ERROR_LOG(rc);
goto SEND_ANSWER;
}
break;
case ORTE_RMGR_XCONNECT_CMD:
/* get the child jobid */
count = 1;

View file

@ -36,6 +36,8 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/smr/smr.h"
#include "orte/runtime/runtime.h"
@ -64,21 +66,7 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
OPAL_TRACE(1);
/* check to see if this came from a trigger that we ignore because
* that stage gate does NOT set an xcast barrier - processes simply
* record their state and continue processing. The only triggers that
* involve an xcast barrier are the ORTE_STARTUP_TRIGGER,
* STGx, and FINALIZED ones - ignore the rest.
*/
if (!orte_schema.check_std_trigger_name(msg->target, ORTE_STARTUP_TRIGGER) &&
!orte_schema.check_std_trigger_name(msg->target, ORTE_STG1_TRIGGER) &&
!orte_schema.check_std_trigger_name(msg->target, ORTE_STG2_TRIGGER) &&
!orte_schema.check_std_trigger_name(msg->target, ORTE_STG3_TRIGGER) &&
!orte_schema.check_std_trigger_name(msg->target, ORTE_NUM_FINALIZED_TRIGGER)) {
return ORTE_SUCCESS;
}
/* All stage gate triggers are named, so we can extract the jobid
* directly from the trigger name
*/
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target))) {
@ -89,7 +77,12 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
OPAL_TRACE_ARG1(1, job);
/* set the job state to the appropriate level */
if (orte_schema.check_std_trigger_name(msg->target, ORTE_STARTUP_TRIGGER)) {
if (orte_schema.check_std_trigger_name(msg->target, ORTE_ALL_INIT_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_INIT))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_STARTUP_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_ORTE_STARTUP_COMPLETE))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
@ -99,6 +92,11 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_ALL_RUNNING_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_RUNNING))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_STG1_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_AT_STG1))) {
ORTE_ERROR_LOG(rc);
@ -114,13 +112,39 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_NUM_FINALIZED_TRIGGER)) {
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_ALL_FINALIZED_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_FINALIZED))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_ALL_TERMINATED_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_TERMINATED))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_NUM_ABORTED_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_ABORTED))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_FAILED_TO_START_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
}
/* check to see if this came from a trigger that does not require we send
* out a message
*/
if (!orte_schema.check_std_trigger_name(msg->target, ORTE_STARTUP_TRIGGER) &&
!orte_schema.check_std_trigger_name(msg->target, ORTE_STG1_TRIGGER) &&
!orte_schema.check_std_trigger_name(msg->target, ORTE_STG2_TRIGGER) &&
!orte_schema.check_std_trigger_name(msg->target, ORTE_STG3_TRIGGER) &&
!orte_schema.check_std_trigger_name(msg->target, ORTE_ALL_FINALIZED_TRIGGER)) {
return ORTE_SUCCESS;
}
/* set the message type to SUBSCRIPTION. When we give this to the processes, we want
* them to break the message down and deliver it to the various subsystems.
*/
@ -129,13 +153,18 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
/* setup the buffer */
buffer = OBJ_NEW(orte_buffer_t);
/* load the payload */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &msg, 1, ORTE_GPR_NOTIFY_MSG))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
}
/* send the message */
/* send the message to the xcast_barrier tag for handling - this is the
* destination here since these messages are intended to release
* a process from an xcast gate
*/
if (ORTE_SUCCESS != (rc = orte_rml.xcast(job, buffer, ORTE_RML_TAG_XCAST_BARRIER))) {
ORTE_ERROR_LOG(rc);
}
@ -145,3 +174,167 @@ CLEANUP:
return rc;
}
int orte_rmgr_base_orted_stage_gate_init(orte_jobid_t job)
{
orte_job_map_t *map;
int rc;
/* get the map for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps.get_job_map(&map, job))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(map->job, map->num_new_daemons,
orte_rmgr_base_orted_stage_gate_mgr, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
int orte_rmgr_base_orted_stage_gate_mgr(orte_gpr_notify_message_t *msg)
{
int rc;
orte_jobid_t job, *jptr;
orte_job_map_t *map;
orte_daemon_cmd_flag_t command;
orte_buffer_t *buffer;
orte_gpr_notify_data_t *launch_data;
char* keys[] = {
ORTE_JOB_BEING_LAUNCHED_KEY,
NULL
};
char* tokens[] = {
ORTE_JOB_GLOBALS,
NULL
};
orte_gpr_value_t **values=NULL;
orte_std_cntr_t num_values=0;
OPAL_TRACE(1);
/* set the job state to the appropriate level */
if (orte_schema.check_std_trigger_name(msg->target, ORTE_STARTUP_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(0, ORTE_JOB_ORTE_STARTUP_COMPLETE))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_ALL_RUNNING_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(0, ORTE_JOB_STATE_RUNNING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* check to see if this came from a trigger that requires we send a message - if
* so, then handle it as required
*/
if (orte_schema.check_std_trigger_name(msg->target, ORTE_STARTUP_TRIGGER)) {
/* the startup trigger is intended for the sharing of contact info
* across all orteds. The new orteds will be sitting at their startup
* stage gate and need an xcast_barrier message to release them, so we
* send the message to that RML tag. Orteds that have already started will
* have posted a persistent RML receive on that tag so they can "catch"
* this message as well - they use that mechanism to update their RML
* contact info so they can talk to the other daemons since the xcast
* goes to ALL members of the specified job.
*/
/* set the message type to SUBSCRIPTION. When we give this to the processes, we want
* them to break the message down and deliver it to the various subsystems.
*/
msg->msg_type = ORTE_GPR_SUBSCRIPTION_MSG;
msg->id = ORTE_GPR_TRIGGER_ID_MAX;
/* setup the buffer */
buffer = OBJ_NEW(orte_buffer_t);
/* pack the msg to be delivered */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &msg, 1, ORTE_GPR_NOTIFY_MSG))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
}
/* send the message to the xcast_barrier since the orte_startup trigger
* is a blocking action
*/
if (ORTE_SUCCESS != (rc = orte_rml.xcast(0, buffer, ORTE_RML_TAG_XCAST_BARRIER))) {
ORTE_ERROR_LOG(rc);
}
OBJ_RELEASE(buffer);
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_ALL_RUNNING_TRIGGER)) {
/* the running trigger indicates that we are ready to launch a job - get the
* job being launched from the registry, get the launch data, and then send it out to
* all the orteds
*/
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
"orte-job-0", tokens, keys,
&num_values, &values))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (1 != num_values || 1 != values[0]->cnt) { /* can only be one value returned */
ORTE_ERROR_LOG(ORTE_ERR_GPR_DATA_CORRUPT);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, values[0]->keyvals[0]->value, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
job = *jptr;
OBJ_RELEASE(values[0]);
/* get the job map */
if (ORTE_SUCCESS != (rc = orte_rmaps.get_job_map(&map, job))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* let the local launcher provide its required data */
if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(&launch_data, map))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OBJ_RELEASE(map); /* done with this */
/* setup the buffer */
buffer = OBJ_NEW(orte_buffer_t);
/* pack the add_local_procs command */
command = ORTE_DAEMON_ADD_LOCAL_PROCS;
if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
}
/* pack the launch data */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &launch_data, 1, ORTE_GPR_NOTIFY_DATA))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
}
/* send the command to the daemon */
if (ORTE_SUCCESS != (rc = orte_rml.xcast(0, buffer, ORTE_RML_TAG_DAEMON))) {
ORTE_ERROR_LOG(rc);
}
OBJ_RELEASE(buffer);
cleanup:
if (NULL != values) free(values);
}
return ORTE_SUCCESS;
}
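Condensed for orientation, the ALL_RUNNING branch above amounts to the following relay; this sketch uses only calls that appear in this commit, and the function name is hypothetical:
/* Hypothetical condensation of the ALL_RUNNING launch relay */
static int relay_launch_to_daemons(orte_jobid_t job)
{
    orte_job_map_t *map;
    orte_gpr_notify_data_t *launch_data;
    orte_buffer_t *buffer;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_ADD_LOCAL_PROCS;
    int rc;
    /* fetch the map for the job being launched */
    if (ORTE_SUCCESS != (rc = orte_rmaps.get_job_map(&map, job))) {
        return rc;
    }
    /* let the local launcher build the launch data */
    rc = orte_odls.get_add_procs_data(&launch_data, map);
    OBJ_RELEASE(map);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }
    /* pack the command plus launch data and broadcast to all daemons */
    buffer = OBJ_NEW(orte_buffer_t);
    if (ORTE_SUCCESS == (rc = orte_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD)) &&
        ORTE_SUCCESS == (rc = orte_dss.pack(buffer, &launch_data, 1, ORTE_GPR_NOTIFY_DATA))) {
        rc = orte_rml.xcast(0, buffer, ORTE_RML_TAG_DAEMON);
    }
    OBJ_RELEASE(buffer);
    return rc;
}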

View file

@ -34,13 +34,14 @@
#include "orte/dss/dss.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rmgr/base/rmgr_private.h"
int orte_rmgr_base_xconnect(orte_jobid_t child, orte_jobid_t parent)
{
orte_rml_cmd_flag_t command=ORTE_RML_UPDATE_CMD;
orte_rml_cmd_flag_t command;
orte_gpr_notify_data_t *data=NULL;
orte_process_name_t name;
orte_buffer_t *buf;
@ -60,7 +61,8 @@ int orte_rmgr_base_xconnect(orte_jobid_t child, orte_jobid_t parent)
/* send that info to everyone in the parent */
if (NULL != data) {
buf = OBJ_NEW(orte_buffer_t);
/* pack the update command */
/* pack the update-RML command */
command = ORTE_RML_UPDATE_CMD;
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &command, 1, ORTE_RML_CMD))) {
ORTE_ERROR_LOG(rc);
}
@ -94,7 +96,8 @@ int orte_rmgr_base_xconnect(orte_jobid_t child, orte_jobid_t parent)
/* send that info to everyone in the child */
if (NULL != data) {
buf = OBJ_NEW(orte_buffer_t);
/* pack the update command */
/* pack the update-RML command */
command = ORTE_RML_UPDATE_CMD;
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &command, 1, ORTE_RML_CMD))) {
ORTE_ERROR_LOG(rc);
}
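
Note why the commit assigns command immediately before each pack instead of at the declaration: the same variable feeds two separate buffers in this function, and each must carry ORTE_RML_UPDATE_CMD on the wire. A hedged round-trip sketch of what the receiving side relies on (buffer and variable names are illustrative):

/* Illustrative pack/unpack round trip for the RML update command. */
orte_rml_cmd_flag_t cmd = ORTE_RML_UPDATE_CMD, echoed;
orte_std_cntr_t n = 1;
orte_buffer_t *b = OBJ_NEW(orte_buffer_t);
orte_dss.pack(b, &cmd, 1, ORTE_RML_CMD);        /* sender side   */
orte_dss.unpack(b, &echoed, &n, ORTE_RML_CMD);  /* receiver side */
if (ORTE_RML_UPDATE_CMD != echoed) {            /* would indicate a stale or missing pack */
    ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
OBJ_RELEASE(b);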

View file

@ -33,6 +33,7 @@
#include "opal/mca/mca.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/rmgr/rmgr.h"
@ -47,10 +48,11 @@ extern "C" {
/*
* Constants for command values
*/
#define ORTE_RMGR_SETUP_JOB_CMD 1
#define ORTE_RMGR_SPAWN_JOB_CMD 2
#define ORTE_RMGR_SETUP_GATES_CMD 3
#define ORTE_RMGR_XCONNECT_CMD 4
#define ORTE_RMGR_SETUP_JOB_CMD 1
#define ORTE_RMGR_SPAWN_JOB_CMD 2
#define ORTE_RMGR_SETUP_GATES_CMD 3
#define ORTE_RMGR_XCONNECT_CMD 4
#define ORTE_RMGR_SETUP_ORTED_GATES_CMD 5
#define ORTE_RMGR_CMD ORTE_UINT8
typedef uint8_t orte_rmgr_cmd_t;
@ -130,6 +132,10 @@ ORTE_DECLSPEC int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job);
ORTE_DECLSPEC int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg);
ORTE_DECLSPEC int orte_rmgr_base_orted_stage_gate_init(orte_jobid_t job);
ORTE_DECLSPEC int orte_rmgr_base_orted_stage_gate_mgr(orte_gpr_notify_message_t *msg);
ORTE_DECLSPEC int orte_rmgr_base_xconnect(orte_jobid_t child, orte_jobid_t parent);
ORTE_DECLSPEC int orte_rmgr_base_comm_start(void);

View file

@ -49,6 +49,8 @@ static int orte_rmgr_proxy_setup_job(orte_app_context_t** app_context,
static int orte_rmgr_proxy_setup_stage_gates(orte_jobid_t jobid);
static int orte_rmgr_proxy_orted_stage_gate_init(orte_jobid_t jobid);
static int orte_rmgr_proxy_spawn_job(
orte_app_context_t** app_context,
orte_std_cntr_t num_context,
@ -170,6 +172,67 @@ static int orte_rmgr_proxy_setup_job(orte_app_context_t** app_context,
return rc;
}
static int orte_rmgr_proxy_orted_stage_gate_init(orte_jobid_t jobid)
{
orte_buffer_t cmd;
orte_buffer_t rsp;
orte_std_cntr_t count;
orte_rmgr_cmd_t command=ORTE_RMGR_SETUP_ORTED_GATES_CMD;
int rc;
OPAL_TRACE(1);
/* construct command */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
/* pack the command */
if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_RMGR_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&cmd);
return rc;
}
/* pack the jobid */
if(ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&cmd);
return rc;
}
/* send the command */
if(0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &cmd, ORTE_RML_TAG_RMGR, 0))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&cmd);
return rc;
}
OBJ_DESTRUCT(&cmd);
/* wait for response */
OBJ_CONSTRUCT(&rsp, orte_buffer_t);
if(0 > (rc = orte_rml.recv_buffer(ORTE_PROC_MY_HNP, &rsp, ORTE_RML_TAG_RMGR))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&rsp);
return rc;
}
/* get the returned command */
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(&rsp, &command, &count, ORTE_RMGR_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&rsp);
return rc;
}
/* and check it to ensure valid comm */
if (ORTE_RMGR_SETUP_ORTED_GATES_CMD != command) {
OBJ_DESTRUCT(&rsp);
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&rsp);
return rc;
}
static int orte_rmgr_proxy_setup_stage_gates(orte_jobid_t jobid)
{
orte_buffer_t cmd;
@ -634,6 +697,15 @@ static int orte_rmgr_proxy_spawn_job(
return ORTE_SUCCESS;
}
/* setup the orted's stage gate triggers - do this here as, if there are no
* new orteds to launch, the trigger will fire immediately and launch
* the procs
*/
if (ORTE_SUCCESS != (rc = orte_rmgr_proxy_orted_stage_gate_init(*jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/*
* launch the job
*/

View file

@ -589,7 +589,16 @@ static int orte_rmgr_urm_spawn_job(
}
#endif
/*
/* setup the orted's stage gate triggers - do this here as, if there are no
* new orteds to launch, the trigger will fire immediately and launch
* the procs
*/
if (ORTE_SUCCESS != (rc = orte_rmgr_base_orted_stage_gate_init(*jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/*
* launch the job
*/
if (ORTE_SUCCESS != (rc = orte_pls.launch_job(*jobid))) {

View file

@ -52,24 +52,22 @@ typedef uint32_t orte_rml_tag_t;
#define ORTE_RML_TAG_IOF_SVC 5
#define ORTE_RML_TAG_IOF_CLNT 6
#define ORTE_RML_TAG_XCAST_BARRIER 7
#define ORTE_RML_TAG_XCAST_NB 8
#define ORTE_RML_TAG_ORTED_ROUTED 8
#define ORTE_RML_TAG_RMGR 9
#define ORTE_RML_TAG_PROBE 10
#define ORTE_RML_TAG_RDS 11
#define ORTE_RML_TAG_RAS 12
#define ORTE_RML_TAG_RMAPS 13
#define ORTE_RML_TAG_PLS 14
#define ORTE_RML_TAG_PLS_ORTED 15
#define ORTE_RML_TAG_PLS_ORTED_ACK 16
#define ORTE_RML_TAG_ERRMGR 17
#define ORTE_RML_TAG_BPROC 18
#define ORTE_RML_TAG_BPROC_ABORT 19
#define ORTE_RML_TAG_SM_BACK_FILE_CREATED 20
#define ORTE_RML_TAG_WIREUP 21
#define ORTE_RML_TAG_RML 22
#define ORTE_RML_TAG_ERRMGR 15
#define ORTE_RML_TAG_BPROC 16
#define ORTE_RML_TAG_BPROC_ABORT 17
#define ORTE_RML_TAG_SM_BACK_FILE_CREATED 18
#define ORTE_RML_TAG_WIREUP 19
#define ORTE_RML_TAG_RML 20
#define ORTE_RML_TAG_FILEM 23
#define ORTE_RML_TAG_CKPT 24
#define ORTE_RML_TAG_FILEM 21
#define ORTE_RML_TAG_CKPT 22
/* For CRCP Coord Component */
#define OMPI_CRCP_COORD_BOOKMARK_TAG 4242

View file

@ -89,6 +89,9 @@
#define ORTE_JOB_STATE_KEY "orte-job-state"
#define ORTE_JOB_MAPPING_MODE_KEY "orte-job-mapping-mode"
#define ORTE_JOB_PARENT_JOBID_KEY "orte-job-parent-jobid"
#define ORTE_JOB_NUM_NEW_DAEMONS_KEY "orte-job-num-new-daemons"
#define ORTE_JOB_DAEMON_VPID_START_KEY "orte-job-daemon-vpid-start"
#define ORTE_JOB_BEING_LAUNCHED_KEY "orte-job-being-launched"
/* PROCESS specific keys */
#define ORTE_PROC_NAME_KEY "orte-proc-name"
@ -139,8 +142,8 @@
#define ORTE_STG1_TRIGGER "orte-stage1"
#define ORTE_STG2_TRIGGER "orte-stage2"
#define ORTE_STG3_TRIGGER "orte-stage3"
#define ORTE_NUM_FINALIZED_TRIGGER "orte-num-finalized"
#define ORTE_NUM_TERMINATED_TRIGGER "orte-num-terminated"
#define ORTE_ALL_FINALIZED_TRIGGER "orte-finalized-trig"
#define ORTE_ALL_TERMINATED_TRIGGER "orte-terminated-trig"
#define ORTE_JOB_CKPT_STATE_TRIGGER "orte-job-ckpt-trig"
#define ORTE_PROC_CKPT_STATE_TRIGGER "orte-proc-ckpt-trig"
@ -149,13 +152,12 @@
#define ORTE_NUM_ABORTED_TRIGGER "orte-num-aborted"
#define ORTE_FAILED_TO_START_TRIGGER "orte-failed-start-trig"
/*
* ORTED (ORTE DAEMON) TRIGGER DEFINITIONS
*/
#define ORTED_LAUNCH_STAGE_GATE_TRIGGER "orted-launch-gate"
#define ORTED_LAUNCH_STG_SUB "orted-launch-sub"
#define ORTED_LAUNCH_STAGE_GATE_CNTR "orted-num-at-launch-gate"
#define ORTED_NUM_TO_BE_LAUNCHED "orted-num-to-be-launched"
#define ORTED_LAUNCH_STG_SUB "orted-launch-sub"
/*
* BPROC-SPECIFIC SEGMENT FOR STORING CLUSTER-WIDE NODE STATES

View file

@ -68,7 +68,7 @@ int orte_sds_base_contact_orted(char *orted_uri)
/* do the send - it will be ignored on the far end, so don't worry about
* getting a response
*/
if (0 > orte_rml.send_buffer(&orted, &buffer, ORTE_RML_TAG_PLS_ORTED, 0)) {
if (0 > orte_rml.send_buffer(&orted, &buffer, ORTE_RML_TAG_DAEMON, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_CONNECTION_FAILED);
OBJ_DESTRUCT(&buffer);
return ORTE_ERR_CONNECTION_FAILED;

View file

@ -87,6 +87,7 @@ orte_sds_base_basic_contact_universe(void)
/* user-specified name - abort */
opal_output(0, "orte_init: could not contact the specified universe name %s",
orte_universe_info.name);
ORTE_ERROR_LOG(ret);
return ORTE_ERR_UNREACH;
}
orte_process_info.seed = true;

View file

@ -198,22 +198,6 @@ int orte_sds_bproc_set_name(void)
cleanup_vpid_string = true;
}
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to an invalid value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "local_rank", NULL, ORTE_VPID_INVALID);
mca_base_param_lookup_int(id, &local_rank);
orte_process_info.local_rank = (orte_vpid_t)local_rank;
/* it is okay for this param not to be found - for example, we don't bother
* to set it for orteds - so just set it to a value which indicates
* it wasn't found if it isn't there
*/
id = mca_base_param_register_int("ns", "nds", "num_local_procs", NULL, 0);
mca_base_param_lookup_int(id, &num_local_procs);
orte_process_info.num_local_procs = (orte_std_cntr_t)num_local_procs;
/* if we are NOT a daemon, then lookup our local daemon's contact info
* and setup that link
*/
@ -236,8 +220,14 @@ int orte_sds_bproc_set_name(void)
}
fgets(orted_uri, 1024, fp);
orted_uri[strlen(orted_uri)-1] = '\0';
/* now get the local rank */
fscanf(fp, "%d", &local_rank);
orte_process_info.local_rank = (orte_vpid_t)local_rank;
/* and the number of local procs */
fscanf(fp, "%d", &num_local_procs);
orte_process_info.num_local_procs = (orte_std_cntr_t)num_local_procs;
fclose(fp);
/* setup the link */
/* setup the link to the local orted */
if (ORTE_SUCCESS != (rc = orte_sds_base_contact_orted(orted_uri))) {
ORTE_ERROR_LOG(rc);
return(rc);

View file

@ -71,36 +71,36 @@ static void orte_smr_base_quick_print(char **output, char *type_name, char *pref
case 1:
ui8 = (uint8_t*)src;
if (NULL == prefix) {
asprintf(output, "Data type: %s\tValue: %d", type_name, (int) *ui8);
asprintf(output, "Data type: %s\tData size: 8-bit\tValue: %d", type_name, (int) *ui8);
} else {
asprintf(output, "%sData type: %s\tValue: %d", prefix, type_name, (int) *ui8);
asprintf(output, "%sData type: %s\tData size: 8-bit\tValue: %d", prefix, type_name, (int) *ui8);
}
break;
case 2:
ui16 = (uint16_t*)src;
if (NULL == prefix) {
asprintf(output, "Data type: %s\tValue: %d", type_name, (int) *ui16);
asprintf(output, "Data type: %s\tData size: 16-bit\tValue: %d", type_name, (int) *ui16);
} else {
asprintf(output, "%sData type: %s\tValue: %d", prefix, type_name, (int) *ui16);
asprintf(output, "%sData type: %s\tData size: 16-bit\tValue: %d", prefix, type_name, (int) *ui16);
}
break;
case 4:
ui32 = (uint32_t*)src;
if (NULL == prefix) {
asprintf(output, "Data type: %s\tValue: %lu", type_name, (unsigned long) *ui32);
asprintf(output, "Data type: %s\tData size: 32-bit\tValue: %lu", type_name, (unsigned long) *ui32);
} else {
asprintf(output, "%sData type: %s\tValue: %lu", prefix, type_name, (unsigned long) *ui32);
asprintf(output, "%sData type: %s\tData size: 32-bit\tValue: %lu", prefix, type_name, (unsigned long) *ui32);
}
break;
case 8:
ui64 = (uint64_t*)src;
if (NULL == prefix) {
asprintf(output, "Data type: %s\tValue: %lu", type_name, (unsigned long) *ui64);
asprintf(output, "Data type: %s\tData size: 64-bit\tValue: %lu", type_name, (unsigned long) *ui64);
} else {
asprintf(output, "%sData type: %s\tValue: %lu", prefix, type_name, (unsigned long) *ui64);
asprintf(output, "%sData type: %s\tData size: 64-bit\tValue: %lu", prefix, type_name, (unsigned long) *ui64);
}
break;

View file

@ -89,16 +89,10 @@ int orte_smr_base_set_proc_state(orte_process_name_t *proc,
}
OBJ_RELEASE(value);
/* check to see if we need to increment orte-standard counters */
if (ORTE_PROC_ORTE_STARTUP_COMPLETE == state ||
ORTE_PROC_STATE_LAUNCHED == state ||
ORTE_PROC_STATE_AT_STG1 == state ||
ORTE_PROC_STATE_AT_STG2 == state ||
ORTE_PROC_STATE_AT_STG3 == state ||
ORTE_PROC_STATE_FINALIZED == state ||
ORTE_PROC_STATE_TERMINATED == state ||
ORTE_PROC_STATE_FAILED_TO_START == state ||
ORTE_PROC_STATE_ABORTED == state) {
/* we don't need to increment the INIT counter as this is done
* prior to process launch
*/
if (ORTE_PROC_STATE_INIT != state) {
/* If we're setting ABORTED or FAILED_TO_START, we're also setting TERMINATED, so we
need 2 keyvals. Everything else only needs 1 keyval. */
@ -120,6 +114,7 @@ int orte_smr_base_set_proc_state(orte_process_name_t *proc,
}
}
/* all counters are in the JOB_GLOBALS container */
value->tokens[0] = strdup(ORTE_JOB_GLOBALS);
/* see which state we are in - let that determine the counter */
@ -138,13 +133,20 @@ int orte_smr_base_set_proc_state(orte_process_name_t *proc,
}
break;
case ORTE_PROC_STATE_RUNNING:
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_PROC_NUM_RUNNING, ORTE_UNDEF, NULL))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
break;
case ORTE_PROC_STATE_AT_STG1:
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_PROC_NUM_AT_STG1, ORTE_UNDEF, NULL))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
break;
case ORTE_PROC_STATE_AT_STG2:
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_PROC_NUM_AT_STG2, ORTE_UNDEF, NULL))) {
ORTE_ERROR_LOG(rc);
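
One consequence of the comment above deserves spelling out: terminal-failure states are double-counted by design, so the "all terminated" style triggers still fire when procs abort rather than exit cleanly. A sketch of the keyval-count decision (num_keyvals is a hypothetical name; the real allocation happens in the create_value call above):

/* Illustrative only: aborts/failed-starts bump their own counter AND the
 * terminated counter, so a dying job still trips ORTE_ALL_TERMINATED_TRIGGER. */
orte_std_cntr_t num_keyvals;
if (ORTE_PROC_STATE_ABORTED == state || ORTE_PROC_STATE_FAILED_TO_START == state) {
    num_keyvals = 2;   /* the state's own counter plus ORTE_PROC_NUM_TERMINATED */
} else {
    num_keyvals = 1;   /* every other state maps to exactly one counter */
}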

View file

@ -45,9 +45,13 @@ int orte_smr_base_init_job_stage_gates(orte_jobid_t job,
orte_std_cntr_t zero=0;
int rc, num_start_routing;
orte_gpr_value_t *value;
char *segment, *trig_name, *tokens[2], *trig_keys[2];
char *segment, *trig_name, *trig_keys[2];
orte_gpr_trigger_id_t id;
orte_gpr_trigger_action_t trig_mode, trig_mode_routed;
char* tokens[] = {
ORTE_JOB_GLOBALS,
NULL
};
char* keys[] = {
/* changes to this ordering need to be reflected in code below */
/* We need to set up counters for all the defined ORTE process states, even though
@ -70,13 +74,13 @@ int orte_smr_base_init_job_stage_gates(orte_jobid_t job,
ORTE_ALL_INIT_TRIGGER,
ORTE_ALL_LAUNCHED_TRIGGER,
ORTE_ALL_RUNNING_TRIGGER,
ORTE_NUM_TERMINATED_TRIGGER,
ORTE_ALL_TERMINATED_TRIGGER,
/* the following triggers need data routed through them */
ORTE_STARTUP_TRIGGER,
ORTE_STG1_TRIGGER,
ORTE_STG2_TRIGGER,
ORTE_STG3_TRIGGER,
ORTE_NUM_FINALIZED_TRIGGER,
ORTE_ALL_FINALIZED_TRIGGER
};
@ -91,9 +95,15 @@ int orte_smr_base_init_job_stage_gates(orte_jobid_t job,
return rc;
}
/* setup the counters - set initial values to 0 */
/* setup the counters - set initial values to 0. Since the counters may pre-exist
* if the procs already got started, we want to ensure we do NOT disturb their current value.
* So, we just indicate that we do NOT want to overwrite them by not setting the
* ORTE_GPR_OVERWRITE flag, and do NOT want duplicate entries on the registry by setting
* the ORTE_GPR_NO_DUPLICATE flag. Thus, we will ONLY write the counters on the registry
* if they do not previously exist.
*/
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value,
ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
ORTE_GPR_NO_DUPLICATE | ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
segment, num_counters, 1))) {
ORTE_ERROR_LOG(rc);
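
The flag pairing here is the whole trick: with ORTE_GPR_NO_DUPLICATE set and ORTE_GPR_OVERWRITE left clear, a put becomes "write only if absent". A hedged sketch of seeding a single counter that way (segment is assumed to hold the job segment name, as above):

/* Illustrative only: seed a counter to zero without clobbering a live value. */
orte_std_cntr_t zero = 0;
orte_gpr_value_t *val;
if (ORTE_SUCCESS == orte_gpr.create_value(&val,
        ORTE_GPR_NO_DUPLICATE | ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
        segment, 1, 1)) {
    val->tokens[0] = strdup(ORTE_JOB_GLOBALS);
    orte_gpr.create_keyval(&(val->keyvals[0]), ORTE_PROC_NUM_AT_STG1,
                           ORTE_STD_CNTR, &zero);
    orte_gpr.put(1, &val);      /* ignored by the registry if the keyval already exists */
    OBJ_RELEASE(val);
}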
@ -123,9 +133,6 @@ int orte_smr_base_init_job_stage_gates(orte_jobid_t job,
* can get required information for notifying processes. Other
* subscriptions will then attach to them.
*/
tokens[0] = strdup(ORTE_JOB_GLOBALS);
tokens[1] = NULL;
trig_keys[0] = strdup(ORTE_JOB_SLOTS_KEY);
trig_mode = ORTE_GPR_TRIG_INCLUDE_TRIG_CNTRS | ORTE_GPR_TRIG_ONE_SHOT |
ORTE_GPR_TRIG_CMP_LEVELS;
@ -136,7 +143,6 @@ int orte_smr_base_init_job_stage_gates(orte_jobid_t job,
if (ORTE_SUCCESS != (rc = orte_schema.get_std_trigger_name(&trig_name,
trig_names[i], job))) {
ORTE_ERROR_LOG(rc);
free(tokens[0]);
free(segment);
free(trig_keys[0]);
free(trig_keys[1]);
@ -153,7 +159,6 @@ int orte_smr_base_init_job_stage_gates(orte_jobid_t job,
segment, tokens, 2, trig_keys,
NULL, NULL))) {
ORTE_ERROR_LOG(rc);
free(tokens[0]);
free(segment);
free(trig_name);
free(trig_keys[0]);
@ -166,7 +171,6 @@ int orte_smr_base_init_job_stage_gates(orte_jobid_t job,
segment, tokens, 2, trig_keys,
cbfunc, user_tag))) {
ORTE_ERROR_LOG(rc);
free(tokens[0]);
free(segment);
free(trig_name);
free(trig_keys[0]);
@ -178,7 +182,6 @@ int orte_smr_base_init_job_stage_gates(orte_jobid_t job,
free(trig_keys[1]);
}
free(trig_keys[0]);
free(tokens[0]);
free(segment);
return ORTE_SUCCESS;
@ -194,85 +197,179 @@ int orte_smr_base_init_orted_stage_gates(orte_jobid_t job,
orte_gpr_trigger_cb_fn_t cbfunc,
void *user_tag)
{
char *segment;
char *trig_name;
orte_gpr_value_t *value;
char *segment=NULL;
char *trig_name=NULL;
orte_gpr_value_t *value=NULL;
orte_std_cntr_t zero=0;
orte_std_cntr_t i, num_counters, num_named_trigs, num_start_routing;
char *trig_tokens[] = {
ORTE_JOB_GLOBALS,
NULL
};
char *to_launch_keys[] = {
ORTE_JOB_SLOTS_KEY,
NULL
};
char* keys[] = {
/* changes to this ordering need to be reflected in code below */
/* We need to set up counters for defined ORTED process states, even though
* a given launch system may not actually use them all. This must be done so that
* callbacks can be generated - otherwise, they won't happen!
*/
ORTE_PROC_NUM_AT_INIT,
ORTE_PROC_NUM_LAUNCHED,
ORTE_PROC_NUM_FINALIZED,
ORTE_PROC_NUM_TERMINATED,
/* the following stage gates need data routed through them */
ORTE_PROC_NUM_AT_ORTE_STARTUP,
ORTE_PROC_NUM_RUNNING
};
char* trig_names[] = {
/* this ordering needs to be identical to that in the array above! */
ORTE_ALL_INIT_TRIGGER,
ORTE_ALL_LAUNCHED_TRIGGER,
ORTE_ALL_FINALIZED_TRIGGER,
ORTE_ALL_TERMINATED_TRIGGER,
/* the following triggers need data routed through them */
ORTE_STARTUP_TRIGGER,
ORTE_ALL_RUNNING_TRIGGER
};
char *trig_keys[] = {
ORTED_NUM_TO_BE_LAUNCHED,
ORTED_LAUNCH_STAGE_GATE_CNTR,
ORTE_JOB_SLOTS_KEY,
NULL, /* placeholder */
NULL
};
int rc;
orte_gpr_trigger_id_t id;
orte_gpr_trigger_action_t trig_mode, trig_mode_routed;
orte_data_value_t dval = ORTE_DATA_VALUE_EMPTY;
/** get the segment name where all of this is taking place */
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
num_counters = sizeof(keys)/sizeof(keys[0]);
num_named_trigs= sizeof(trig_names)/sizeof(trig_names[0]);
/* the index where the triggers that need data routed through them begin */
num_start_routing = 4;
/* increment the total number of orteds in the system. This MUST be done first
* or else the triggers will all immediately fire! Since there may be a pre-existing value here, we
* will use the arith function and do an ADD - this will either (a) add to a pre-existing value, or
* (b) initialize a location to the provided value
*/
if (ORTE_SUCCESS != (rc = orte_dss.set(&dval, (void*)&num_orteds, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.arith(ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR,
"orte-job-0", trig_tokens, to_launch_keys,
ORTE_DSS_ADD, &dval))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* reset the data value so it can be reused */
dval.type = ORTE_UNDEF;
dval.data = NULL;
/* setup to store two values - the number of orteds and the counter */
/* setup the counters - set initial values to 0. All reside on the "orte-job-0" segment. Since
* the counters may pre-exist, we want to ensure we do NOT disturb their current value.
* So, we just indicate that we do NOT want to overwrite them by not setting the
* ORTE_GPR_OVERWRITE flag, and do NOT want duplicate entries on the registry by setting
* the ORTE_GPR_NO_DUPLICATE flag. Thus, we will ONLY write the counters on the registry
* if they do not previously exist.
*/
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value,
ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
segment, 2, 1))) {
ORTE_GPR_NO_DUPLICATE | ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
"orte-job-0", num_counters, 1))) {
ORTE_ERROR_LOG(rc);
free(segment);
return rc;
}
value->tokens[0] = strdup(ORTE_JOB_GLOBALS); /* put counters in the job's globals container */
/** store the number of orteds */
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]),
ORTED_NUM_TO_BE_LAUNCHED, ORTE_STD_CNTR, &num_orteds))) {
ORTE_ERROR_LOG(rc);
free(segment);
OBJ_RELEASE(value);
return rc;
}
/** initialize the counter to zero */
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTED_LAUNCH_STAGE_GATE_CNTR, ORTE_STD_CNTR, &zero))) {
ORTE_ERROR_LOG(rc);
free(segment);
OBJ_RELEASE(value);
return rc;
/** initialize the counters to zero */
for (i=0; i < num_counters; i++) {
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[i]), keys[i], ORTE_STD_CNTR, &zero))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
/* put the counters on the registry */
/* store them on the registry */
if (ORTE_SUCCESS != (rc = orte_gpr.put(1, &value))) {
ORTE_ERROR_LOG(rc);
free(segment);
OBJ_RELEASE(value);
return rc;
goto cleanup;
}
/* record which job we are trying to launch so we can retrieve it later - this cannot
* be included in the prior put as we must overwrite any pre-existing info
*/
if (ORTE_SUCCESS != (rc = orte_dss.set(&dval, (void*)&job, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_gpr.put_1(ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
"orte-job-0", trig_tokens, ORTE_JOB_BEING_LAUNCHED_KEY,
&dval))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OBJ_RELEASE(value);
/* now define a trigger based on those counters */
if (ORTE_SUCCESS != (rc = orte_schema.get_std_trigger_name(&trig_name,
ORTED_LAUNCH_STAGE_GATE_TRIGGER, job))) {
ORTE_ERROR_LOG(rc);
free(segment);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.define_trigger(&id, trig_name,
ORTE_GPR_TRIG_INCLUDE_TRIG_CNTRS | ORTE_GPR_TRIG_ONE_SHOT |
ORTE_GPR_TRIG_CMP_LEVELS,
ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
segment, trig_tokens, 2, trig_keys,
cbfunc, user_tag))) {
ORTE_ERROR_LOG(rc);
free(segment);
/* now define the standard orted triggers. If these triggers already
* exist, the registry will overwrite them with the new information.
* The standard triggers will return the trigger counters so that we
* can get required information for notifying processes. Other
* subscriptions will then attach to them.
*
* NOTE: if a seed or a virtual machine is being setup, then the
* jobid=0.
*/
trig_mode = ORTE_GPR_TRIG_INCLUDE_TRIG_CNTRS | ORTE_GPR_TRIG_ONE_SHOT |
ORTE_GPR_TRIG_CMP_LEVELS;
trig_mode_routed = trig_mode | ORTE_GPR_TRIG_ROUTE_DATA_THRU_ME;
for (i=0; i < num_named_trigs; i++) {
trig_keys[1] = strdup(keys[i]);
if (ORTE_SUCCESS != (rc = orte_schema.get_std_trigger_name(&trig_name,
trig_names[i], 0))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
if (i < num_start_routing) {
/* the first set of triggers do NOT have anything routed to them.
* They are setup here strictly for users to attach to them.
* Hence, we do not pass a trigger callback function and
* set the trig actions to "not route data through me"
*/
if (ORTE_SUCCESS != (rc = orte_gpr.define_trigger(&id, trig_name, trig_mode,
ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
"orte-job-0", trig_tokens, 2, trig_keys,
cbfunc, user_tag))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
} else {
/* these triggers do need data routed through them, so use
* the appropriate trigger mode
*/
if (ORTE_SUCCESS != (rc = orte_gpr.define_trigger(&id, trig_name, trig_mode_routed,
ORTE_GPR_TOKENS_XAND | ORTE_GPR_KEYS_OR,
"orte-job-0", trig_tokens, 2, trig_keys,
cbfunc, user_tag))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
free(trig_name);
return rc;
trig_name = NULL;
free(trig_keys[1]);
trig_keys[1] = NULL;
}
free(segment);
free(trig_name);
cleanup:
if (NULL != segment) free(segment);
if (NULL != trig_name) free(trig_name);
if (NULL != value) OBJ_RELEASE(value);
if (NULL != trig_keys[1]) free(trig_keys[1]);
return ORTE_SUCCESS;
}
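
For readers new to the GPR trigger machinery: ORTE_GPR_TRIG_CMP_LEVELS means the trigger compares counter levels and fires when they match, which is why the code above must add to ORTE_JOB_SLOTS_KEY before defining any trigger (otherwise both counters sit at zero and everything fires at once, as the earlier comment warns). Conceptually (this check lives inside the registry, not in this file):

/* Conceptual only: with CMP_LEVELS, a stage gate fires when the arriving
 * counter catches up to the expected total. */
if (num_at_gate == num_expected) {   /* e.g. ORTE_PROC_NUM_AT_ORTE_STARTUP vs ORTE_JOB_SLOTS_KEY */
    /* fire: deliver the notify message to subscribers, routing data thru
     * the trigger if ORTE_GPR_TRIG_ROUTE_DATA_THRU_ME was set */
}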
@ -372,7 +469,7 @@ int orte_smr_base_job_stage_gate_subscribe(orte_jobid_t job,
{
orte_std_cntr_t i;
int rc;
char *segment, *trig_name, *tokens[2];
char *segment, *trig_name;
orte_proc_state_t conditions;
orte_gpr_subscription_id_t id;
/** the order of the next three definitions MUST match */
@ -406,8 +503,12 @@ int orte_smr_base_job_stage_gate_subscribe(orte_jobid_t job,
ORTE_STG1_TRIGGER,
ORTE_STG2_TRIGGER,
ORTE_STG3_TRIGGER,
ORTE_NUM_FINALIZED_TRIGGER,
ORTE_NUM_TERMINATED_TRIGGER
ORTE_ALL_FINALIZED_TRIGGER,
ORTE_ALL_TERMINATED_TRIGGER
};
char* tokens[] = {
ORTE_JOB_GLOBALS,
NULL
};
orte_std_cntr_t num_counters = sizeof(keys)/sizeof(keys[0]);
@ -417,11 +518,7 @@ int orte_smr_base_job_stage_gate_subscribe(orte_jobid_t job,
return rc;
}
/* setup the tokens */
tokens[0]=ORTE_JOB_GLOBALS;
tokens[1]=NULL;
conditions = cb_conditions;
conditions = cb_conditions;
for (i=0; i < num_counters; i++) {
if (state[i] & conditions) {
/** want this one - attach ourselves to the appropriate standard trigger */

View file

@ -554,15 +554,21 @@ int orte_init_stage1(bool infrastructure)
}
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
}
/* for singleton or seed, need to define our stage gates and fire the LAUNCHED gate
* to ensure that everything in the rest of the system runs smoothly
*/
if (ORTE_SUCCESS != (ret = orte_rmgr_base_proc_stage_gate_init(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ret);
error = "singleton/seed orte_rmgr_base_proc_stage_gate_init";
goto error;
/* need to define our stage gates and fire the LAUNCHED gate
* to ensure that everything in the rest of the system runs smoothly
*/
if (ORTE_SUCCESS != (ret = orte_rmgr_base_proc_stage_gate_init(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ret);
error = "singleton orte_rmgr_base_proc_stage_gate_init";
goto error;
}
} else { /* if we are an HNP, then we need to define the orted stage gates */
if (ORTE_SUCCESS != (ret = orte_rmgr_base_orted_stage_gate_init(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ret);
error = "seed orte_rmgr_base_orted_stage_gate_init";
goto error;
}
}
/* set our state to LAUNCHED */

View file

@ -31,6 +31,7 @@
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/runtime/orte_cr.h"
@ -80,6 +81,17 @@ int orte_init_stage2(char *trigger)
/* Since we are now finished with init, change the state to running */
orte_universe_info.state = ORTE_UNIVERSE_STATE_RUNNING;
/* for singleton or seed, need to fire the RUNNING gate
* to ensure that everything in the rest of the system runs smoothly
*/
if (orte_process_info.seed || orte_process_info.singleton) {
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name, ORTE_PROC_STATE_RUNNING, 0))) {
ORTE_ERROR_LOG(ret);
error_str = "singleton/seed could not set RUNNING state";
goto return_error;
}
}
/* startup the receive if we are not the HNP - unless we are a singleton,
* in which case we must start it up in case we do a comm_spawn!
*/

View file

@ -30,7 +30,7 @@
#include "orte/runtime/params.h"
/* globals used by RTE */
bool orte_debug_flag;
bool orte_debug_flag, orte_timing;
struct timeval orte_abort_timeout;
@ -48,7 +48,7 @@ int orte_register_params(bool infrastructure)
false, false, (int)false, &value);
orte_debug_flag = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte_debug", "daemons_file",
mca_base_param_reg_int_name("orte", "debug_daemons_file",
"Whether want stdout/stderr of daemons to go to a file or not",
false, false, (int)false, NULL);
@ -56,7 +56,7 @@ int orte_register_params(bool infrastructure)
"Whether to properly daemonize the ORTE daemons or not",
false, false, (int)false, NULL);
mca_base_param_reg_int_name("orte_debug", "daemons",
mca_base_param_reg_int_name("orte", "debug_daemons",
"Whether to debug the ORTE daemons or not",
false, false, (int)false, NULL);
@ -64,6 +64,12 @@ int orte_register_params(bool infrastructure)
"Whether we are ORTE infrastructure or an ORTE application",
true, true, (int)infrastructure, NULL);
/* check for timing requests */
mca_base_param_reg_int_name("orte", "timing",
"Request that critical timing loops be measured",
false, false, (int)false, &value);
orte_timing = OPAL_INT_TO_BOOL(value);
/* User-level debugger info string */
mca_base_param_reg_string_name("orte", "base_user_debugger",

View file

@ -240,6 +240,8 @@ int orte_universe_search(opal_list_t *universe_list, bool report_broken_files, b
static int orte_universe_check_connect(orte_universe_t *uni)
{
int rc;
if (!orte_universe_info.console) { /* if we aren't trying to connect a console */
if (!uni->persistence || /* if the target universe is not persistent... */
(0 == strncmp(uni->scope, "exclusive", strlen("exclusive")))) { /* ...or no connection allowed */
@ -258,11 +260,16 @@ static int orte_universe_check_connect(orte_universe_t *uni)
opal_output(0, "connect_uni: contact info to set: %s", uni->seed_uri);
}
/* insert the universe contact info into the RML hash tables */
if (ORTE_SUCCESS != (rc = orte_rml.set_uri(uni->seed_uri))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
/* ping to verify it's alive */
if (ORTE_SUCCESS != orte_rml.ping(uni->seed_uri, &ompi_rte_ping_wait)) {
if (ORTE_SUCCESS != (rc = orte_rml.ping(uni->seed_uri, &ompi_rte_ping_wait))) {
if (orte_debug_flag) {
ORTE_ERROR_LOG(ORTE_ERR_CONNECTION_FAILED);
ORTE_ERROR_LOG(rc);
}
return ORTE_ERR_CONNECTION_FAILED;
}

View file

@ -39,7 +39,7 @@ extern "C" {
/* globals used by RTE - instanced in orte_params.c */
ORTE_DECLSPEC extern bool orte_debug_flag;
ORTE_DECLSPEC extern bool orte_debug_flag, orte_reuse_daemons, orte_timing;
ORTE_DECLSPEC extern struct timeval orte_abort_timeout;

View file

@ -1,4 +1,4 @@
PROGS = gpr_dt_buffer gpr_dt_cmp gpr_dt_copy gpr_dt_print gpr_dt_release gpr_dt_size
PROGS = gpr_arith gpr_dt_buffer gpr_dt_cmp gpr_dt_copy gpr_dt_print gpr_dt_release gpr_dt_size
all: $(PROGS)

orte/test/unit/gpr/gpr_arith.c (new file, 157 lines)
View file

@ -0,0 +1,157 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open MPI general purpose registry - unit test
*
*/
/*
* includes
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include "orte/orte_constants.h"
#include "opal/runtime/opal.h"
#include "opal/util/malloc.h"
#include "opal/util/output.h"
#include "orte/dss/dss.h"
#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/gpr/base/base.h"
static bool test_add(void);
static bool test_sub(void);
static bool test_mult(void);
static bool test_div(void);
FILE *test_out;
int main(int argc, char **argv)
{
int ret;
orte_init(ORTE_INFRASTRUCTURE, ORTE_NON_BARRIER);
/* Now do the tests */
fprintf(stderr, "executing test_add\n");
if (test_add()) {
fprintf(stderr, "test_add succeeded\n");
}
else {
fprintf(stderr, "test_add failed\n");
}
fprintf(stderr, "executing test_sub\n");
if (test_sub()) {
fprintf(stderr, "test_sub succeeded\n");
}
else {
fprintf(stderr, "test_sub failed\n");
}
fprintf(stderr, "executing test_mult\n");
if (test_mult()) {
fprintf(stderr, "test_mult succeeded\n");
}
else {
fprintf(stderr, "test_mult failed\n");
}
fprintf(stderr, "executing test_div\n");
if (test_div()) {
fprintf(stderr, "test_div succeeded\n");
}
else {
fprintf(stderr, "test_div failed\n");
}
orte_dss_close();
mca_base_close();
opal_malloc_finalize();
opal_output_finalize();
opal_class_finalize();
return 0;
}
static bool test_add(void)
{
orte_data_value_t dval = ORTE_DATA_VALUE_EMPTY;
orte_std_cntr_t testval=5;
char *tokens[] = {
ORTE_JOB_GLOBALS,
NULL
};
char *keys[] = {
ORTE_JOB_SLOTS_KEY,
NULL
};
int rc;
/* increment a value that doesn't already exist */
if (ORTE_SUCCESS != (rc = orte_dss.set(&dval, (void*)&testval, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return false;
}
if (ORTE_SUCCESS != (rc = orte_gpr.arith(ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR,
"test-seg", tokens, keys,
ORTE_DSS_ADD, &dval))) {
ORTE_ERROR_LOG(rc);
return false;
}
orte_gpr.dump_segment("test-seg");
/* increment the existing value */
if (ORTE_SUCCESS != (rc = orte_gpr.arith(ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR,
"test-seg", tokens, keys,
ORTE_DSS_ADD, &dval))) {
ORTE_ERROR_LOG(rc);
return false;
}
orte_gpr.dump_segment("test-seg");
return (true);
}
static bool test_sub(void)
{
return (true);
}
static bool test_mult(void)
{
return(true);
}
static bool test_div(void)
{
return (true);
}

View file

@ -91,7 +91,6 @@ static struct opal_event term_handler;
static struct opal_event int_handler;
static void signal_callback(int fd, short flags, void *arg);
static void orted_local_cb_launcher(orte_gpr_notify_data_t *data, void *user_tag);
/*
* define the orted context table for obtaining parameters
@ -122,10 +121,6 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
&orted_globals.debug_daemons_file, OPAL_CMD_LINE_TYPE_BOOL,
"Enable debugging of OpenRTE daemons, storing output in files" },
{ "rmgr", "bootproxy", "jobid", '\0', NULL, "bootproxy", 1,
&orted_globals.bootproxy, OPAL_CMD_LINE_TYPE_INT,
"Run as boot proxy for <job-id>" },
{ NULL, NULL, NULL, '\0', NULL, "set-sid", 0,
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
"Direct the orted to separate from the current session"},
@ -195,10 +190,7 @@ int main(int argc, char *argv[])
char *log_path = NULL;
char log_file[PATH_MAX];
char *jobidstring;
orte_gpr_value_t *value;
char *segment;
int i;
orte_buffer_t answer;
char * orted_amca_param_path = NULL;
/* initialize the globals */
@ -320,7 +312,7 @@ int main(int argc, char *argv[])
/* Okay, now on to serious business! */
/* Ensure the process info structure in instantiated and initialized
/* Ensure the process info structure is instantiated and initialized
* and set the daemon flag to true
*/
orte_process_info.daemon = true;
@ -383,13 +375,70 @@ int main(int argc, char *argv[])
opal_daemon_init(NULL);
}
/* Initialize the Open RTE */
/* Set the flag telling orte_init that I am NOT a
/* Initialize OPAL */
if (ORTE_SUCCESS != (ret = opal_init())) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* Set the flag telling OpenRTE that I am NOT a
* singleton, but am "infrastructure" - prevents setting
* up incorrect infrastructure that only a singleton would
* require
* require.
*/
if (ORTE_SUCCESS != (ret = orte_init(ORTE_INFRASTRUCTURE, ORTE_NON_BARRIER))) {
if (ORTE_SUCCESS != (ret = orte_init_stage1(ORTE_INFRASTRUCTURE))) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* setup our receive functions - this will allow us to relay messages
* during start for better scalability
*/
/* register the daemon main receive functions */
/* setup to listen for broadcast commands via routed messaging algorithms */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_ROUTED,
ORTE_RML_NON_PERSISTENT, orte_daemon_recv_routed, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* setup to listen for commands sent specifically to me */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* Complete initializing the rte - begin recording registry actions */
if (ORTE_SUCCESS != (ret = orte_gpr.begin_compound_cmd())) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_init_stage2(ORTE_STARTUP_TRIGGER))) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* indicate we are at the ORTE_STARTUP_COMPLETE state */
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(ORTE_PROC_MY_NAME,
ORTE_PROC_ORTE_STARTUP_COMPLETE, 0))) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* send the information */
if (ORTE_SUCCESS != (ret = orte_gpr.exec_compound_cmd())) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* Use the barrier capability to hold us
* in orte_init until the orte_setup state is achieved. This
* will allow us to obtain a complete set of contact info
* for all of our fellow daemons
*/
if (ORTE_SUCCESS != (ret = orte_rml.xcast_gate(orte_gpr.deliver_notify_msg))) {
ORTE_ERROR_LOG(ret);
return ret;
}
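
The begin/exec pair bracketing stage2 above is the GPR compound-command pattern: every registry call in between is recorded locally and shipped to the HNP as a single exchange, which matters when hundreds of daemons start at once. Distilled (illustrative; the real calls are the ones above):

/* Illustrative only: batch registry traffic into a single exchange. */
orte_gpr.begin_compound_cmd();   /* start recording - nothing goes out yet */
/* ... puts, subscriptions, and state updates accumulate here ...          */
orte_gpr.exec_compound_cmd();    /* ship the whole batch in one message    */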
@ -461,147 +510,18 @@ int main(int argc, char *argv[])
OBJ_CONSTRUCT(&orted_globals.mutex, opal_mutex_t);
OBJ_CONSTRUCT(&orted_globals.condition, opal_condition_t);
/* register the daemon main receive functions */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL);
/* a daemon should *always* yield the processor when idle */
opal_progress_set_yield_when_idle(true);
/* setup to listen for xcast stage gate commands. We need to do this because updates to the
* contact info for dynamically spawned daemons will come to the gate RML-tag
*/
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_XCAST_BARRIER,
ORTE_RML_NON_PERSISTENT, orte_daemon_recv_gate, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
return ret;
}
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* check to see if I'm a bootproxy */
if (orted_globals.bootproxy) { /* perform bootproxy-specific things */
/* a daemon should *always* yield the processor when idle */
opal_progress_set_yield_when_idle(true);
/* attach a subscription to the orted standard trigger so I can get
* information on the processes I am to locally launch as soon as all
* the orteds for this job are started.
*
* Once the registry gets to 2.0, we will be able to setup the
* subscription so we only get our own launch info back. In the interim,
* we setup the subscription so that ALL launch info for this job
* is returned. We will then have to parse that message to get our
* own local launch info.
*
* Since we have chosen this approach, we can take advantage of the
* fact that the callback function will directly receive this data.
* By setting up that callback function to actually perform the launch
* based on the received data, all we have to do here is go into our
* conditioned wait until the job completes!
*
* Sometimes, life can be good! :-)
*/
/** put all this registry stuff in a compound command to limit communications */
if (ORTE_SUCCESS != (ret = orte_gpr.begin_compound_cmd())) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* let the local launcher setup a subscription for its required data. We
* pass the local_cb_launcher function so that this gets called back - this
* allows us to wakeup the orted so it can exit cleanly if the callback
* generates an error
*/
if (ORTE_SUCCESS != (ret = orte_odls.subscribe_launch_data(orted_globals.bootproxy, orted_local_cb_launcher))) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* THIS IS A TEMPORARY PATCH - REPORT NODE AND PROC NAME FOR THIS DAEMON */
if (ORTE_SUCCESS != (ret = orte_gpr.create_value(&value, ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_AND,
"orte-job-0", 2, 0))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_schema.get_proc_tokens(&(value->tokens), &(value->num_tokens), ORTE_PROC_MY_NAME))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_NODE_NAME_KEY, ORTE_STRING, orte_system_info.nodename))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_PROC_NAME_KEY, ORTE_NAME, ORTE_PROC_MY_NAME))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_gpr.put(1, &value))) {
ORTE_ERROR_LOG(ret);
return ret;
}
OBJ_RELEASE(value);
/* get the job segment name */
if (ORTE_SUCCESS != (ret = orte_schema.get_job_segment_name(&segment, orted_globals.bootproxy))) {
ORTE_ERROR_LOG(ret);
return ret;
}
/** increment the orted stage gate counter */
if (ORTE_SUCCESS != (ret = orte_gpr.create_value(&value, ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_AND,
segment, 1, 1))) {
ORTE_ERROR_LOG(ret);
return ret;
}
free(segment); /* done with this now */
value->tokens[0] = strdup(ORTE_JOB_GLOBALS);
if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[0]), ORTED_LAUNCH_STAGE_GATE_CNTR, ORTE_UNDEF, NULL))) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* do the increment */
if (ORTE_SUCCESS != (ret = orte_gpr.increment_value(value))) {
ORTE_ERROR_LOG(ret);
return ret;
}
OBJ_RELEASE(value); /* done with this now */
/** send the compound command */
if (ORTE_SUCCESS != (ret = orte_gpr.exec_compound_cmd())) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* setup and enter the event monitor to wait for a wakeup call */
OPAL_THREAD_LOCK(&orted_globals.mutex);
while (false == orted_globals.exit_condition) {
opal_condition_wait(&orted_globals.condition, &orted_globals.mutex);
}
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
/* make sure our local procs are dead - but don't update their state
* on the HNP as this may be redundant
*/
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
/* cleanup their session directory */
orte_session_dir_cleanup(orted_globals.bootproxy);
/* send an ack - we are as close to done as we can be while
* still able to communicate
*/
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.send_buffer(ORTE_PROC_MY_HNP, &answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
OBJ_DESTRUCT(&answer);
/* Finalize and clean up ourselves */
if (ORTE_SUCCESS != (ret = orte_finalize())) {
ORTE_ERROR_LOG(ret);
}
exit(ret);
}
/*
* Set my process status to "running". Note that this must be done
@ -614,15 +534,7 @@ int main(int argc, char *argv[])
}
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* go through the universe fields and see what else I need to do
* - could be setup a virtual machine, spawn a console, etc.
*/
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] ompid: setting up event monitor", ORTE_NAME_ARGS(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] orted: up and running - waiting for commands!", ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* setup and enter the event monitor */
@ -643,46 +555,21 @@ int main(int argc, char *argv[])
unlink(log_path);
}
/* finalize the system */
orte_finalize();
/* make sure our local procs are dead - but don't update their state
* on the HNP as this may be redundant
*/
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted: done - exiting", ORTE_NAME_ARGS(orte_process_info.my_name));
/* cleanup any lingering session directories */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
/* Finalize and clean up ourselves */
if (ORTE_SUCCESS != (ret = orte_finalize())) {
ORTE_ERROR_LOG(ret);
}
exit(0);
exit(ret);
}
/* this function receives the trigger callback from the orted launch stage gate
* and passes it to the orted local launcher for processing. We do this intermediate
* step so that we can get an error code if anything went wrong and, if so, wakeup the
* orted so we can gracefully die
*/
static void orted_local_cb_launcher(orte_gpr_notify_data_t *data, void *user_tag)
{
int rc;
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted: received launch callback", ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* pass the data to the orted_local_launcher and get a report on
* success or failure of the launch
*/
if (ORTE_SUCCESS != (rc = orte_odls.launch_local_procs(data, orted_globals.saved_environ))) {
/* if there was an error, report it.
* NOTE: it is absolutely imperative that we do not cause the orted to EXIT when
* this happens!!! If we do, then the HNP will "hang" as the orted will no longer
* be around to receive messages telling it what to do in response to the failure
*/
ORTE_ERROR_LOG(rc);
}
/* all done - return and let the orted sleep until something happens */
return;
}
static void signal_callback(int fd, short flags, void *arg)
{
OPAL_TRACE(1);

View file

@ -42,13 +42,11 @@ typedef struct {
char* num_procs;
char* universe;
char **saved_environ;
int bootproxy;
int uri_pipe;
opal_mutex_t mutex;
opal_condition_t condition;
bool exit_condition;
bool spin;
int reap;
} orted_globals_t;
ORTE_DECLSPEC extern orted_globals_t orted_globals;
@ -58,9 +56,14 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
void orte_daemon_recv_routed(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
void orte_daemon_recv_gate(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
#if defined(c_plusplus) || defined(__cplusplus)
}

View file

@ -49,6 +49,7 @@
#include "orte/util/univ_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
@ -58,14 +59,134 @@
#include "orte/tools/orted/orted.h"
static int binomial_route_msg(orte_buffer_t *buf, orte_jobid_t job, orte_rml_tag_t target_tag);
static int binomial_route_msg(orte_process_name_t *sender,
orte_buffer_t *buf,
orte_rml_tag_t tag);
void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
static int process_commands(orte_process_name_t* sender,
orte_buffer_t *buffer,
orte_rml_tag_t tag);
void orte_daemon_recv_routed(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
{
orte_daemon_cmd_flag_t command, routing_mode;
orte_buffer_t answer, *relay;
orte_daemon_cmd_flag_t routing_mode;
int ret;
orte_std_cntr_t n;
OPAL_TRACE(1);
OPAL_THREAD_LOCK(&orted_globals.mutex);
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_routed: received message from [%ld,%ld,%ld]",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(sender));
}
/* unpack the routing algorithm */
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &routing_mode, &n, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* if the mode is BINOMIAL, then handle that elsewhere */
if (ORTE_DAEMON_ROUTE_BINOMIAL == routing_mode) {
if (ORTE_SUCCESS != (ret = binomial_route_msg(sender, buffer, tag))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
} else {
/* process the command locally */
if (ORTE_SUCCESS != (ret = process_commands(sender, buffer, tag))) {
ORTE_ERROR_LOG(ret);
}
}
CLEANUP:
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
/* reissue the non-blocking receive */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_ROUTED,
ORTE_RML_NON_PERSISTENT, orte_daemon_recv_routed, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
}
}
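
For reference, the binomial mode dispatched above fans a message out in log2(N) relay steps. A sketch of the standard binomial-tree math it implies, assuming daemon vpids run contiguously from 0 with the root at vpid 0 (this is not the binomial_route_msg implementation, which follows later in this file):

/* Illustrative only: which vpids would this daemon relay to in a binomial
 * tree rooted at vpid 0? Each daemon receives on its lowest set bit, then
 * relays to every peer below that bit that is still in range. */
static void binomial_relay_targets(uint32_t me, uint32_t num_daemons)
{
    uint32_t mask = 1;
    /* locate the bit on which I received (the root never breaks out early) */
    while (mask < num_daemons && 0 == (me & mask)) {
        mask <<= 1;
    }
    /* relay downward: to me+mask for each smaller mask inside the job */
    for (mask >>= 1; mask > 0; mask >>= 1) {
        if (me + mask < num_daemons) {
            /* relay the buffer to the daemon at vpid (me + mask) */
        }
    }
}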
void orte_daemon_recv(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
{
int ret;
OPAL_TRACE(1);
OPAL_THREAD_LOCK(&orted_globals.mutex);
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_cmd: received message from [%ld,%ld,%ld]",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(sender));
}
/* process the command */
if (ORTE_SUCCESS != (ret = process_commands(sender, buffer, tag))) {
ORTE_ERROR_LOG(ret);
}
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
/* reissue the non-blocking receive */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
}
}
void orte_daemon_recv_gate(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
{
int rc;
orte_std_cntr_t i;
orte_gpr_notify_message_t *mesg;
mesg = OBJ_NEW(orte_gpr_notify_message_t);
if (NULL == mesg) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return;
}
i=1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &mesg, &i, ORTE_GPR_NOTIFY_MSG))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(mesg);
return;
}
if (ORTE_SUCCESS != (rc = orte_gpr.deliver_notify_msg(mesg))) {
ORTE_ERROR_LOG(rc);
}
OBJ_RELEASE(mesg);
/* reissue the non-blocking receive */
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_XCAST_BARRIER,
ORTE_RML_NON_PERSISTENT, orte_daemon_recv_gate, NULL);
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(rc);
}
}
static int process_commands(orte_process_name_t* sender,
orte_buffer_t *buffer,
orte_rml_tag_t tag)
{
orte_daemon_cmd_flag_t command;
orte_buffer_t *relay;
int ret;
orte_std_cntr_t n;
int32_t signal;
@ -73,24 +194,22 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
orte_jobid_t *jobs, job;
orte_std_cntr_t num_jobs;
orte_rml_tag_t target_tag;
OPAL_TRACE(1);
OPAL_THREAD_LOCK(&orted_globals.mutex);
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received message from [%ld,%ld,%ld]",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(sender));
}
opal_list_t attrs;
opal_list_item_t *item;
char *contact_info;
orte_buffer_t *answer;
orte_rml_cmd_flag_t rml_cmd;
orte_gpr_notify_message_t *mesg;
char *unpack_ptr;
/* unpack the command */
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
return ret;
}
/* now process the command locally */
switch(command) {
/**** KILL_LOCAL_PROCS ****/
@ -105,12 +224,13 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
jobs = (orte_jobid_t*)malloc(num_jobs * sizeof(orte_jobid_t));
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, jobs, &num_jobs, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
free(jobs);
goto CLEANUP;
}
for (n=0; n < num_jobs; n++) {
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs for job %ld",
opal_output(0, "[%lu,%lu,%lu] orted_cmd: received kill_local_procs for job %ld",
ORTE_NAME_ARGS(orte_process_info.my_name), (long)jobs[n]);
}
@ -124,7 +244,7 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
/**** SIGNAL_LOCAL_PROCS ****/
case ORTE_DAEMON_SIGNAL_LOCAL_PROCS:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received signal_local_procs",
opal_output(0, "[%lu,%lu,%lu] orted_cmd: received signal_local_procs",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* unpack the number of jobids */
@ -137,6 +257,7 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
jobs = (orte_jobid_t*)malloc(num_jobs * sizeof(orte_jobid_t));
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, jobs, &num_jobs, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
free(jobs);
goto CLEANUP;
}
@ -144,6 +265,7 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &signal, &n, ORTE_INT32))) {
ORTE_ERROR_LOG(ret);
free(jobs);
goto CLEANUP;
}
@ -157,7 +279,7 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
/**** ADD_LOCAL_PROCS ****/
case ORTE_DAEMON_ADD_LOCAL_PROCS:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received add_local_procs",
opal_output(0, "[%lu,%lu,%lu] orted_cmd: received add_local_procs",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* unpack the notify data object */
@ -179,17 +301,10 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
/**** DELIVER A MESSAGE TO THE LOCAL PROCS ****/
case ORTE_DAEMON_MESSAGE_LOCAL_PROCS:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received message_local_procs",
opal_output(0, "[%lu,%lu,%lu] orted_cmd: received message_local_procs",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* unpack the routing algorithm */
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &routing_mode, &n, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* unpack the jobid of the procs that are to receive the message */
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
@ -204,21 +319,80 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
goto CLEANUP;
}
/* if the mode is BINOMIAL, then relay it on before doing anything else */
if (ORTE_DAEMON_ROUTE_BINOMIAL == routing_mode) {
if (ORTE_SUCCESS != (ret = binomial_route_msg(buffer, job, target_tag))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
}
relay = OBJ_NEW(orte_buffer_t);
orte_dss.copy_payload(relay, buffer);
/* now deliver the message to our children */
if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(job, relay, target_tag))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
/* if job=0, then this message is for us and not for our children */
if (0 == job) {
/* if the target tag is our xcast_barrier or rml_update, then we have
* to handle the message as a special case. The RML has logic in it
* intended to make it easier to use. This special logic mandates that
* any message we "send" actually only goes into the queue for later
* transmission. Thus, since we are already in a recv when we enter
* the "process_commands" function, any attempt to "send" the relay
* buffer to ourselves will only be added to the queue - it won't
* actually be delivered until *after* we conclude the processing
* of the current recv.
*
* The problem here is that, for messages where we need to relay
* them along the orted chain, the xcast_barrier and rml_update
* messages contain contact info we may well need in order to do
* the relay! So we need to process those messages immediately.
* The only way to accomplish that is to (a) detect that the
* buffer is intended for those tags, and then (b) process
* those buffers here.
*
* NOTE: in the case of xcast_barrier, we *also* must send the
* message along anyway so that it will release us from the
* barrier! So we will process that info twice - can't be helped
* and won't harm anything
*/
if (ORTE_RML_TAG_XCAST_BARRIER == target_tag) {
/* need to preserve the relay buffer's pointers so it can be
* unpacked again at the barrier
*/
unpack_ptr = relay->unpack_ptr;
mesg = OBJ_NEW(orte_gpr_notify_message_t);
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(relay, &mesg, &n, ORTE_GPR_NOTIFY_MSG))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(mesg);
goto CLEANUP;
}
orte_gpr.deliver_notify_msg(mesg);
OBJ_RELEASE(mesg);
/* restore the unpack ptr in the buffer */
relay->unpack_ptr = unpack_ptr;
/* make sure we queue this up for later delivery to release us from the barrier */
if ((ret = orte_rml.send_buffer(ORTE_PROC_MY_NAME, relay, target_tag, 0)) < 0) {
ORTE_ERROR_LOG(ret);
} else {
ret = ORTE_SUCCESS;
}
} else if (ORTE_RML_TAG_RML == target_tag) {
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(relay, &rml_cmd, &n, ORTE_RML_CMD))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = orte_dss.unpack(relay, &ndat, &n, ORTE_GPR_NOTIFY_DATA))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
orte_rml.update_contact_info(ndat, NULL);
} else {
/* just deliver it to ourselves */
if ((ret = orte_rml.send_buffer(ORTE_PROC_MY_NAME, relay, target_tag, 0)) < 0) {
ORTE_ERROR_LOG(ret);
} else {
ret = ORTE_SUCCESS;
}
}
} else {
/* must be for our children - deliver the message */
if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(job, relay, target_tag))) {
ORTE_ERROR_LOG(ret);
}
}
OBJ_RELEASE(relay);
break;
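
The unpack_ptr save/restore above is the one subtle move in this case: it lets the daemon peek at a payload that a later consumer (the barrier release) must still be able to unpack from the top. The pattern in isolation (illustrative; relay is the buffer from the code above):

/* Illustrative peek: remember the cursor, consume a copy, rewind. */
char *saved = relay->unpack_ptr;   /* cursor before peeking                    */
/* ... orte_dss.unpack(relay, ...) to inspect the message ...                  */
relay->unpack_ptr = saved;         /* later unpacks see the full payload again */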
@ -230,195 +404,120 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
* the same as an exit_vm "hard kill" command
*/
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received exit",
opal_output(0, "[%lu,%lu,%lu] orted_cmd: received exit",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* no response to send here - we'll send it when nearly exit'd */
orted_globals.exit_condition = true;
opal_condition_signal(&orted_globals.condition);
/* have to unlock here as we are waking up and will
* do things inside the orted
*/
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
return;
return ORTE_SUCCESS;
break;
/**** HALT VM COMMAND ****/
case ORTE_DAEMON_HALT_VM_CMD:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received halt vm",
opal_output(0, "[%lu,%lu,%lu] orted_cmd: received halt vm",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* no response to send here - we'll send it when nearly exit'd */
/* if we are the HNP, then terminate all orteds reporting to us */
if (orte_process_info.seed) {
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_DAEMON_HARD_KILL, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
ret = orte_pls.terminate_orteds(&orte_abort_timeout, &attrs);
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
}
/* wake up so we can exit too */
orted_globals.exit_condition = true;
opal_condition_signal(&orted_globals.condition);
/* have to unlock here as we are waking up and will
* do things inside the orted
*/
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
return;
return ORTE_SUCCESS;
break;
/**** CONTACT QUERY COMMAND ****/
case ORTE_DAEMON_CONTACT_QUERY_CMD:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_cmd: received contact query",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* send back contact info */
contact_info = orte_rml.get_uri();
if (NULL == contact_info) {
ORTE_ERROR_LOG(ORTE_ERROR);
ret = ORTE_ERROR;
goto CLEANUP;
}
/* setup buffer with answer */
answer = OBJ_NEW(orte_buffer_t);
if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ret = ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(answer);
break;
/**** HOSTFILE COMMAND ****/
case ORTE_DAEMON_HOSTFILE_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
ret = ORTE_ERR_NOT_IMPLEMENTED;
break;
/**** SCRIPTFILE COMMAND ****/
case ORTE_DAEMON_SCRIPTFILE_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
ret = ORTE_ERR_NOT_IMPLEMENTED;
break;
/**** HEARTBEAT COMMAND ****/
case ORTE_DAEMON_HEARTBEAT_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
ret = ORTE_ERR_NOT_IMPLEMENTED;
break;
/**** WARMUP CONNECTION TO LOCAL PROC ****/
case ORTE_DAEMON_WARMUP_LOCAL_CONN:
/* nothing to do here - just ignore it */
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received connection from local proc",
opal_output(0, "[%lu,%lu,%lu] orted_recv: received connection from local proc",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
ret = ORTE_SUCCESS;
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
break;
ret = ORTE_ERR_BAD_PARAM;
}
CLEANUP:
/* send an ack that command is done */
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.send_buffer(sender, &answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
/* reissue the non-blocking receive */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
}
return;
}
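Note the obligation at the tail of this function: the receive was posted ORTE_RML_NON_PERSISTENT, so it is consumed as soon as one message arrives, and the handler must re-post it before returning or the daemon goes deaf to further commands. A runnable sketch of that re-arm pattern follows; recv_once() and deliver() are hypothetical stand-ins for the RML machinery, not the actual API.

#include <stdio.h>

typedef void (*msg_handler_fn)(int tag, const char *msg);

/* One-shot registration slot, cleared on delivery - a hypothetical
 * analogue of a non-persistent recv_buffer_nb() posting */
static msg_handler_fn pending_handler = NULL;

static int recv_once(int tag, msg_handler_fn fn)
{
    (void)tag;                 /* single tag in this toy version */
    pending_handler = fn;
    return 0;
}

static void deliver(int tag, const char *msg)
{
    msg_handler_fn fn = pending_handler;
    pending_handler = NULL;    /* one-shot: consumed on delivery */
    if (fn) fn(tag, msg);
}

static void command_handler(int tag, const char *msg)
{
    printf("handled '%s' on tag %d\n", msg, tag);
    /* re-post before returning, exactly as the orted does above;
     * skip this and the second deliver() below is silently dropped */
    recv_once(tag, command_handler);
}

int main(void)
{
    recv_once(42, command_handler);
    deliver(42, "exit");
    deliver(42, "halt_vm");    /* still delivered: handler re-armed itself */
    return 0;
}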
static void halt_vm(void)
{
int ret;
opal_list_t attrs;
opal_list_item_t *item;
/* terminate the vm - this will also wake us up so we can exit */
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_DAEMON_HARD_KILL, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
ret = orte_pls.terminate_orteds(&orte_abort_timeout, &attrs);
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
/* Trigger the normal exit conditions */
orted_globals.exit_condition = true;
opal_condition_signal(&orted_globals.condition);
}
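halt_vm() also shows the attribute-list lifecycle used throughout this file: construct the list, add the ORTE_DAEMON_HARD_KILL attribute, hand it to terminate_orteds(), then drain every element with remove_first (releasing each one) before destructing the list itself. A toy illustration of that drain idiom, using a plain malloc'd list in place of opal_list_t and the OBJ_* refcounting macros:

#include <stdlib.h>

typedef struct node { struct node *next; int payload; } node_t;
typedef struct { node_t *head; } list_t;

static void list_push(list_t *l, int payload)   /* cf. add_attribute() */
{
    node_t *n = malloc(sizeof(*n));
    if (NULL == n) return;
    n->payload = payload;
    n->next = l->head;
    l->head = n;
}

static node_t *list_remove_first(list_t *l)
{
    node_t *n = l->head;
    if (n) l->head = n->next;
    return n;
}

int main(void)
{
    list_t attrs = { NULL };
    node_t *item;

    list_push(&attrs, 1);      /* the "hard kill" attribute above */

    /* ... the consumer (terminate_orteds above) reads the list ... */

    /* drain before teardown: each element is released individually,
     * mirroring the while/OBJ_RELEASE loop in the daemon code */
    while (NULL != (item = list_remove_first(&attrs))) {
        free(item);
    }
    return 0;
}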
void orte_daemon_recv(int status, orte_process_name_t* sender,
orte_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
{
orte_buffer_t *answer;
orte_daemon_cmd_flag_t command;
int ret;
orte_std_cntr_t n;
char *contact_info;
OPAL_TRACE(1);
OPAL_THREAD_LOCK(&orted_globals.mutex);
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv: received message from [%ld,%ld,%ld]",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(sender));
}
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
return;
}
answer = OBJ_NEW(orte_buffer_t);
if (NULL == answer) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto DONE;
}
switch(command) {
/**** EXIT COMMAND ****/
case ORTE_DAEMON_EXIT_CMD:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv: received exit",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
orted_globals.exit_condition = true;
opal_condition_signal(&orted_globals.condition);
break;
/**** HALT VM COMMAND ****/
case ORTE_DAEMON_HALT_VM_CMD:
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv: received halt vm",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
halt_vm();
break;
/**** CONTACT QUERY COMMAND ****/
case ORTE_DAEMON_CONTACT_QUERY_CMD:
/* send back contact info */
contact_info = orte_rml.get_uri();
if (NULL == contact_info) {
ORTE_ERROR_LOG(ORTE_ERROR);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(answer, &contact_info, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
break;
/**** HOSTFILE COMMAND ****/
case ORTE_DAEMON_HOSTFILE_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
break;
/**** SCRIPTFILE COMMAND ****/
case ORTE_DAEMON_SCRIPTFILE_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
break;
/**** HEARTBEAT COMMAND ****/
case ORTE_DAEMON_HEARTBEAT_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
}
CLEANUP:
OBJ_RELEASE(answer);
DONE:
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
/* reissue the non-blocking receive */
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
}
return;
return ret;
}
static int binomial_route_msg(orte_buffer_t *buf, orte_jobid_t job, orte_rml_tag_t target_tag)
static int binomial_route_msg(orte_process_name_t *sender,
orte_buffer_t *buf,
orte_rml_tag_t tag)
{
orte_daemon_cmd_flag_t mode;
orte_std_cntr_t n, num_daemons;
int i, bitmap, peer, size, rank, hibit, mask;
orte_process_name_t target;
orte_daemon_cmd_flag_t command;
orte_buffer_t *relay=NULL;
orte_buffer_t *relay;
int ret;
/* initialize the relay buffer */
@ -428,33 +527,14 @@ static int binomial_route_msg(orte_buffer_t *buf, orte_jobid_t job, orte_rml_tag
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* tell the downstream daemons this is a message for their local procs */
command=ORTE_DAEMON_MESSAGE_LOCAL_PROCS;
if (ORTE_SUCCESS != (ret = orte_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* tell the downstream daemons the routing algorithm is binomial */
command = ORTE_DAEMON_ROUTE_BINOMIAL;
if (ORTE_SUCCESS != (ret = orte_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
mode = ORTE_DAEMON_ROUTE_BINOMIAL;
if (ORTE_SUCCESS != (ret = orte_dss.pack(relay, &mode, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* tell the downstream daemons the jobid of the procs that are to receive the message */
if (ORTE_SUCCESS != (ret = orte_dss.pack(relay, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* tell the downstream daemons the tag where the message is to be delivered */
if (ORTE_SUCCESS != (ret = orte_dss.pack(relay, &target_tag, 1, ORTE_RML_TAG))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* unpack the current number of daemons */
/* unpack the current number of daemons - we need it here! */
n = 1;
if (ORTE_SUCCESS != (ret = orte_dss.unpack(buf, &num_daemons, &n, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(ret);
@ -467,12 +547,24 @@ static int binomial_route_msg(orte_buffer_t *buf, orte_jobid_t job, orte_rml_tag
goto CLEANUP;
}
/* copy the message payload to the relay buffer - this is non-destructive */
/* copy the message payload to the relay buffer - this is non-destructive.
 * Note that the payload still includes the target job and target tag
 * data required for its eventual delivery
 */
if (ORTE_SUCCESS != (ret = orte_dss.copy_payload(relay, buf))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* process the command locally - we need to do this prior to attempting
* to send the message to the next recipient in case this message
* contains address information for that recipient. If we don't, then
* the send will fail
*/
if (ORTE_SUCCESS != (ret = process_commands(sender, buf, tag))) {
ORTE_ERROR_LOG(ret);
}
/* compute the bitmap */
bitmap = opal_cube_dim((int)num_daemons);
rank = (int)ORTE_PROC_MY_NAME->vpid;
@ -488,7 +580,7 @@ static int binomial_route_msg(orte_buffer_t *buf, orte_jobid_t job, orte_rml_tag
if (peer < size) {
target.vpid = (orte_vpid_t)peer;
opal_output(0, "[%ld,%ld,%ld] relaying to [%ld,%ld,%ld]", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(&target));
if (0 > (ret = orte_rml.send_buffer(&target, relay, ORTE_RML_TAG_PLS_ORTED, 0))) {
if (0 > (ret = orte_rml.send_buffer(&target, relay, ORTE_RML_TAG_ORTED_ROUTED, 0))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
@ -496,7 +588,7 @@ static int binomial_route_msg(orte_buffer_t *buf, orte_jobid_t job, orte_rml_tag
}
CLEANUP:
if (NULL != relay) OBJ_RELEASE(relay);
OBJ_RELEASE(relay);
return ORTE_SUCCESS;
}
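The relay targets computed above form a standard binomial tree: each daemon forwards to every peer obtained by setting one bit above its own highest set bit, so all num_daemons ranks are reached in roughly log2(num_daemons) rounds with no duplicate deliveries. A self-contained sketch of that child computation, assuming vpid 0 is the root and using simple stand-ins for opal_cube_dim()/opal_hibit():

#include <stdio.h>

static int cube_dim(int n)     /* smallest d such that (1 << d) >= n */
{
    int d = 0;
    while ((1 << d) < n) d++;
    return d;
}

static int hibit(int value)    /* index of highest set bit, -1 for 0 */
{
    int i = -1;
    while (value) { value >>= 1; i++; }
    return i;
}

int main(void)
{
    const int size = 6;        /* number of daemons */
    const int dim = cube_dim(size);

    for (int rank = 0; rank < size; rank++) {
        printf("rank %d relays to:", rank);
        for (int i = hibit(rank) + 1; i < dim; i++) {
            int peer = rank | (1 << i);   /* set one bit above hibit */
            if (peer < size) printf(" %d", peer);
        }
        printf("\n");
    }
    return 0;
}

For size = 6 this prints rank 0 relaying to 1, 2, and 4, rank 1 relaying to 3 and 5, and nothing for the leaves: every daemon is reached exactly once.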

View file

@ -300,10 +300,6 @@ opal_cmd_line_init_t cmd_line_init[] = {
&orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL,
"Perform all necessary operations to prepare to launch the application, but do not actually launch it" },
{ "pls", "base", "reuse_daemons", '\0', "reuse-daemons", "reuse-daemons", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"If set, reuse daemons to launch dynamically spawned processes"},
{ NULL, NULL, NULL, '\0', NULL, "prefix", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Prefix where Open MPI is installed on remote nodes" },
@ -898,13 +894,6 @@ static void abort_signal_callback(int fd, short flags, void *arg)
orterun_basename);
}
/* tell the pls to cancel the terminate request -
* obviously, something is wrong at this point
*/
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
ORTE_ERROR_LOG(ret);
}
/* We are in an event handler; exit_callback() will delete
the handler that is currently running (which is a Bad
Thing), so we can't call it directly. Instead, we have
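The comment above (cut off at the hunk boundary) describes a classic event-loop hazard: a callback must not destroy its own registration while the dispatcher is still holding it, so cleanup has to be deferred until after the callback returns. A hypothetical flag-and-reap sketch of that rule - illustrative only, not the opal_event/libevent API:

typedef struct handler {
    void (*fn)(struct handler *self, void *arg);
    int   pending_delete;      /* set by the callback itself */
} handler_t;

static void abort_cb(handler_t *self, void *arg)
{
    (void)arg;
    /* WRONG here: freeing/deregistering self - the dispatcher
     * still dereferences it after we return.
     * RIGHT: flag it and let the loop reap it afterwards. */
    self->pending_delete = 1;
}

static void dispatch(handler_t *h, void *arg)
{
    h->fn(h, arg);             /* h must stay valid until this returns... */
    if (h->pending_delete) {
        /* ...so teardown happens only now, outside the callback
         * (free(h) in a real, heap-allocated implementation) */
    }
}

int main(void)
{
    handler_t h = { abort_cb, 0 };
    dispatch(&h, 0);           /* safe: deletion deferred past the call */
    return 0;
}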

View file

@ -518,32 +518,44 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
{
int rc;
char *tmp;
char *job, *job_session_dir;
char *job=NULL, *job_session_dir=NULL;
/* need to setup the top_session_dir with the prefix */
tmp = opal_os_path(false,
orte_process_info.tmpdir_base,
orte_process_info.top_session_dir, NULL);
/* define the proc and job session directories for this process */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&job, jobid))) {
ORTE_ERROR_LOG(rc);
free(tmp);
return rc;
}
job_session_dir = opal_os_path( false, orte_process_info.universe_session_dir,
job, NULL );
if( NULL == job_session_dir ) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(tmp);
free(job);
return ORTE_ERR_OUT_OF_RESOURCE;
if (ORTE_JOBID_WILDCARD != jobid) {
/* define the proc and job session directories for this process */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&job, jobid))) {
ORTE_ERROR_LOG(rc);
free(tmp);
return rc;
}
job_session_dir = opal_os_path( false, orte_process_info.universe_session_dir,
job, NULL );
if( NULL == job_session_dir ) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(tmp);
free(job);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_os_dirpath_destroy(job_session_dir,
true, orte_dir_check_file);
/* take out the universe session dir, but only if there
* are no remaining job session dirs around
*/
opal_os_dirpath_destroy(orte_process_info.universe_session_dir,
false, orte_dir_check_file);
} else {
/* if we want the session_dir removed for ALL jobids, then
* just recursively blow the whole universe session away
*/
opal_os_dirpath_destroy(orte_process_info.universe_session_dir,
true, orte_dir_check_file);
}
opal_os_dirpath_destroy(job_session_dir,
true, orte_dir_check_file);
opal_os_dirpath_destroy(orte_process_info.universe_session_dir,
false, orte_dir_check_file);
opal_os_dirpath_destroy(tmp,
false, orte_dir_check_file);
@ -584,8 +596,8 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
CLEANUP:
free(tmp);
free(job);
free(job_session_dir);
if (NULL != job) free(job);
if (NULL != job_session_dir) free(job_session_dir);
return ORTE_SUCCESS;
}
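The reworked logic above reduces to two policies: for a specific jobid, recursively remove only that job's session tree and then attempt a non-recursive sweep of the universe dir (which only completes once no other job dirs remain); for ORTE_JOBID_WILDCARD, recursively remove the entire universe session dir in one shot. A sketch with the filesystem calls stubbed out - the paths and the dirpath_destroy() stand-in are illustrative, not the opal_os_dirpath_destroy() signature:

#include <stdbool.h>
#include <stdio.h>

#define JOBID_WILDCARD (-1)

/* stand-in: print instead of actually deleting anything */
static void dirpath_destroy(const char *path, bool recursive)
{
    printf("destroy %s (recursive=%s)\n", path, recursive ? "yes" : "no");
}

static void session_dir_cleanup(int jobid)
{
    if (JOBID_WILDCARD != jobid) {
        char job_dir[64];
        snprintf(job_dir, sizeof(job_dir), "universe/%d", jobid);
        dirpath_destroy(job_dir, true);     /* this job's tree only */
        dirpath_destroy("universe", false); /* no-op while other jobs remain */
    } else {
        dirpath_destroy("universe", true);  /* everything, all jobids */
    }
}

int main(void)
{
    session_dir_cleanup(7);
    session_dir_cleanup(JOBID_WILDCARD);
    return 0;
}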