1
1

* Merge changes from the tim branch, r4821 through r4892. The tree can now
  run MPI and non-ORTE applications for RSH on one node, with or without
  threads. I think we're approaching convergence with the tim branch.

This commit was SVN r4895.
Этот коммит содержится в:
Brian Barrett 2005-03-18 03:43:59 +00:00
родитель e89b6ba7f3
Коммит 77c65d69cc
90 изменённых файлов: 2953 добавлений и 1022 удалений

Просмотреть файл

@ -1,6 +1,112 @@
This file contains information on merging the branches/tim tree into the
trunk.
UPDATE MERGE:
svn merge -r4821:4892 svn+ssh://svn.open-mpi.org/l/svn/ompi/branches/tim .
RESULTS:
M test/mca/gpr
M test/mca/gpr/gpr_test.c
M test/mca/gpr/gpr_test_proxy.c
M test/mca/gpr/Makefile.am
A + test/mca/gpr/gpr_test_overwrite.c
M RTE_MERGE_README
? src/event/event.c.working
? src/event/event.c.merge-right.r4892
? src/event/event.c.merge-left.r4821
M src/event/signal.c
C src/event/event.c
M src/event/event.h
M src/runtime/orte_wait.h
M src/runtime/orte_restart.c
M src/runtime/ompi_progress.c
M src/runtime/orte_universe_exists.c
M src/runtime/orte_init.c
M src/runtime/ompi_progress.h
M src/tools/ompi_info/components.cc
M src/tools/orted/orted.c
M src/tools/orterun/orterun.c
M src/tools/orterun/help-orterun.txt
M src/tools/orterun/Makefile.am
M src/mca/oob/base/oob_base_init.c
M src/mca/ns/replica/src/ns_replica_component.c
M src/mca/ns/proxy/src/ns_proxy.c
M src/mca/ns/base/base.h
M src/mca/ns/base/ns_base_nds.h
M src/mca/ns/base/ns_base_nds_env.c
M src/mca/ns/base/ns_base_nds_pipe.c
M src/mca/ras/lsf_bproc/ras_lsf_bproc_component.c
M src/mca/ras/bjs/ras_bjs_component.c
M src/mca/pml/base/pml_base_close.c
M src/mca/gpr/replica/api_layer/gpr_replica_segment_ops_api.c
M src/mca/gpr/replica/api_layer/gpr_replica_dump_api.c
M src/mca/gpr/replica/api_layer/gpr_replica_put_get_api.c
M src/mca/gpr/replica/api_layer/gpr_replica_arithmetic_ops_api.c
M src/mca/gpr/replica/communications/gpr_replica_arithmetic_ops_cm.c
M src/mca/gpr/replica/communications/gpr_replica_del_index_cm.c
M src/mca/gpr/replica/communications/gpr_replica_subscribe_cm.c
M src/mca/gpr/replica/communications/gpr_replica_cleanup_cm.c
M src/mca/gpr/replica/communications/gpr_replica_put_get_cm.c
M src/mca/gpr/replica/functional_layer/gpr_replica_dump_fn.c
M src/mca/gpr/replica/functional_layer/gpr_replica_trig_ops_fn.c
M src/mca/gpr/replica/functional_layer/gpr_replica_put_get_fn.c
M src/mca/gpr/replica/functional_layer/gpr_replica_messaging_fn.c
M src/mca/gpr/replica/functional_layer/gpr_replica_subscribe_fn.c
M src/mca/gpr/replica/functional_layer/gpr_replica_segment_fn.c
M src/mca/gpr/proxy/gpr_proxy_dump.c
M src/mca/gpr/proxy/gpr_proxy_compound_cmd.c
M src/mca/gpr/proxy/gpr_proxy_put_get.c
M src/mca/gpr/proxy/gpr_proxy_cleanup.c
M src/mca/gpr/proxy/gpr_proxy_arithmetic_ops.c
M src/mca/gpr/proxy/gpr_proxy_del_index.c
M src/mca/gpr/proxy/gpr_proxy_subscribe.c
M src/mca/gpr/proxy/gpr_proxy_component.c
M src/mca/gpr/base/base.h
M src/mca/gpr/base/unpack_api_response/gpr_base_dump_notify.c
M src/mca/rds/hostfile/rds_hostfile.c
M src/mca/rds/hostfile/rds_hostfile_component.c
M src/mca/rds/hostfile/rds_hostfile.h
M src/mca/rml/rml_types.h
M src/mca/errmgr/base/errmgr_base_fns.c
A + src/mca/pls/tm/src/pls_tm_child.c
M src/mca/pls/tm/src/pls_tm_module.c
M src/mca/pls/tm/src/pls_tm.h
M src/mca/pls/tm/src/pls_tm_registry.c
M src/mca/pls/tm/src/Makefile.am
M src/mca/pls/fork/pls_fork_module.c
M src/mca/pls/bproc_seed/pls_bproc_seed.c
M src/mca/pls/bproc_seed/pls_bproc_seed_component.c
M src/mca/pls/base/pls_base_open.c
M src/mca/pls/rsh/pls_rsh_module.c
M src/mca/pls/rsh/pls_rsh.h
M src/mca/pls/rsh/pls_rsh_component.c
? src/mca/io/base/io_base_component_list.c.merge-right.r4892
? src/mca/io/base/io_base_component_list.c.merge-left.r4821
? src/mca/io/base/io_base_component_list.c.working
C src/mca/io/base/io_base_component_list.c
M src/mca/base/mca_base_module_exchange.c
M src/mca/base/mca_base_param.c
M src/mca/base/mca_base_cmd_line.c
M src/mca/iof/proxy/iof_proxy_component.c
M src/mca/iof/base/base.h
M src/mca/iof/base/iof_base_select.c
M src/mca/iof/base/iof_base_open.c
M src/mca/iof/base/iof_base_close.c
A + src/mca/iof/null
A + src/mca/iof/null/configure.params
A + src/mca/iof/null/Makefile.am
A + src/mca/iof/null/iof_null_component.c
A + src/mca/iof/null/iof_null_module.c
A + src/mca/iof/null/iof_null.h
A + src/mca/iof/null/configure.stub
M src/mca/iof/svc/iof_svc_component.c
M src/attribute/attribute_predefined.c
M src/util/sys_info.c
M src/util/path.c
INITIAL MERGE:
svn merge -r3853:4821 svn+ssh://svn.open-mpi.org/l/svn/ompi/branches/tim .

Просмотреть файл

@ -227,7 +227,6 @@ void ompi_attr_create_predefined_callback(
}
}
}
OBJ_RELEASE(data);
/* DO NOT CHANGE THE ORDER OF CREATING THESE KEYVALS! This order
strictly adheres to the order in mpi.h. If you change the

Просмотреть файл

@ -218,6 +218,16 @@ static void* ompi_event_run(ompi_object_t* arg)
int rc = ompi_event_loop(0);
assert(rc >= 0);
#endif
#if OMPI_ENABLE_PROGRESS_THREADS
OMPI_THREAD_LOCK(&ompi_event_lock);
ompi_event_del_i(&ompi_event_pipe_event);
close(ompi_event_pipe[0]);
close(ompi_event_pipe[1]);
ompi_event_pipe[0] = -1;
ompi_event_pipe[1] = -1;
OMPI_THREAD_UNLOCK(&ompi_event_lock);
#endif
return NULL;
}
@ -260,22 +270,6 @@ ompi_event_init(void)
errx(1, "%s: no event mechanism available", __func__);
#if OMPI_ENABLE_PROGRESS_THREADS
if(ompi_using_threads()) {
if(pipe(ompi_event_pipe) != 0) {
ompi_output(0, "ompi_event_init: pipe() failed with errno=%d\n", errno);
return OMPI_ERROR;
}
ompi_event_pipe_signalled = 1;
ompi_event_set(
&ompi_event_pipe_event,
ompi_event_pipe[0],
OMPI_EV_READ|OMPI_EV_PERSIST,
ompi_event_pipe_handler,
0);
ompi_event_add_i(&ompi_event_pipe_event, 0);
ompi_event_pipe_signalled = 0;
}
#endif
ompi_event_enable();
@ -288,21 +282,7 @@ ompi_event_init(void)
int ompi_event_fini(void)
{
#if OMPI_ENABLE_PROGRESS_THREADS
if(ompi_using_threads()) {
OMPI_THREAD_LOCK(&ompi_event_lock);
if(ompi_event_inited > 0 && ompi_event_enabled) {
unsigned char byte = 0;
ompi_event_enabled = false;
write(ompi_event_pipe[1], &byte, 1);
OMPI_THREAD_UNLOCK(&ompi_event_lock);
ompi_thread_join(&ompi_event_thread, NULL);
ompi_event_pipe_signalled = 1;
} else {
OMPI_THREAD_UNLOCK(&ompi_event_lock);
}
}
#endif
ompi_event_disable();
ompi_event_inited--;
return OMPI_SUCCESS;
}
@ -312,19 +292,20 @@ int ompi_event_disable(void)
#if OMPI_ENABLE_PROGRESS_THREADS
if(ompi_using_threads()) {
OMPI_THREAD_LOCK(&ompi_event_lock);
if(ompi_event_inited > 0 && ompi_event_enabled) {
ompi_event_enabled = false;
if(ompi_event_pipe_signalled == 0) {
unsigned char byte = 0;
if(write(ompi_event_pipe[1], &byte, 1) != 1)
ompi_output(0, "ompi_event_add: write() to ompi_event_pipe[1] failed with errno=%d\n", errno);
ompi_event_pipe_signalled++;
}
OMPI_THREAD_UNLOCK(&ompi_event_lock);
ompi_thread_join(&ompi_event_thread, NULL);
} else {
OMPI_THREAD_UNLOCK(&ompi_event_lock);
if(ompi_event_inited > 0 && ompi_event_enabled == false) {
OMPI_THREAD_LOCK(&ompi_event_lock);
return OMPI_SUCCESS;
}
ompi_event_enabled = false;
if(ompi_event_pipe_signalled == 0) {
unsigned char byte = 0;
if(write(ompi_event_pipe[1], &byte, 1) != 1)
ompi_output(0, "ompi_event_add: write() to ompi_event_pipe[1] failed with errno=%d\n", errno);
ompi_event_pipe_signalled++;
}
OMPI_THREAD_UNLOCK(&ompi_event_lock);
ompi_thread_join(&ompi_event_thread, NULL);
} else {
ompi_event_enabled = false;
}
@ -339,19 +320,42 @@ int ompi_event_enable(void)
#if OMPI_ENABLE_PROGRESS_THREADS
if(ompi_using_threads()) {
int rc;
/* spin up a thread to dispatch events */
OMPI_THREAD_LOCK(&ompi_event_lock);
if(ompi_event_inited > 0 && ompi_event_enabled == false) {
OBJ_CONSTRUCT(&ompi_event_thread, ompi_thread_t);
ompi_event_enabled = true;
ompi_event_thread.t_run = ompi_event_run;
if((rc = ompi_thread_start(&ompi_event_thread)) != OMPI_SUCCESS) {
OMPI_THREAD_UNLOCK(&ompi_event_lock);
return rc;
}
if(ompi_event_inited > 0 && ompi_event_enabled == true) {
OMPI_THREAD_LOCK(&ompi_event_lock);
return OMPI_SUCCESS;
}
/* create a pipe to signal the event thread */
if(pipe(ompi_event_pipe) != 0) {
ompi_output(0, "ompi_event_init: pipe() failed with errno=%d\n", errno);
OMPI_THREAD_UNLOCK(&ompi_event_lock);
return OMPI_ERROR;
}
ompi_event_pipe_signalled = 1;
ompi_event_set(
&ompi_event_pipe_event,
ompi_event_pipe[0],
OMPI_EV_READ|OMPI_EV_PERSIST,
ompi_event_pipe_handler,
0);
ompi_event_add_i(&ompi_event_pipe_event, 0);
ompi_event_pipe_signalled = 0;
/* spin up a thread to dispatch events */
OBJ_CONSTRUCT(&ompi_event_thread, ompi_thread_t);
ompi_event_enabled = true;
ompi_event_thread.t_run = ompi_event_run;
if((rc = ompi_thread_start(&ompi_event_thread)) != OMPI_SUCCESS) {
OMPI_THREAD_UNLOCK(&ompi_event_lock);
return rc;
}
OMPI_THREAD_UNLOCK(&ompi_event_lock);
} else {
ompi_event_pipe[0] = -1;
ompi_event_pipe[1] = -1;
ompi_event_enabled = true;
}
#else
@ -363,8 +367,21 @@ int ompi_event_enable(void)
int ompi_event_restart(void)
{
int rc;
if((rc = ompi_event_enable()) != OMPI_SUCCESS)
return rc;
#if OMPI_ENABLE_PROGRESS_THREADS
OMPI_THREAD_LOCK(&ompi_event_lock);
if(ompi_event_pipe[0] >= 0) {
ompi_event_del_i(&ompi_event_pipe_event);
/* do not close pipes - in case of bproc_vrfork they are not open
* and we may close something else
*/
ompi_event_pipe[0] = -1;
ompi_event_pipe[1] = -1;
}
ompi_event_enabled = false;
OMPI_THREAD_UNLOCK(&ompi_event_lock);
#endif
ompi_event_enable();
if((rc = ompi_evsignal_restart()) != 0)
return OMPI_ERROR;
return (OMPI_SUCCESS);

Просмотреть файл

@ -189,6 +189,7 @@ OMPI_DECLSPEC int ompi_event_add_i(struct ompi_event *, struct timeval *);
OMPI_DECLSPEC int ompi_event_del_i(struct ompi_event *);
OMPI_DECLSPEC void ompi_event_active_i(struct ompi_event*, int, short);
OMPI_DECLSPEC extern ompi_mutex_t ompi_event_lock;
OMPI_DECLSPEC extern int ompi_evsignal_restart(void);
/* public functions */
static inline void

Просмотреть файл

@ -93,7 +93,8 @@ ompi_evsignal_add(sigset_t *evsigmask, struct ompi_event *ev)
int evsignal;
if (!initialized) {
int i;
ompi_event_signal_count = 0;
#if 0
/* Must delay the event add until after init() because
it may trigger poll events that are not yet setup
to be triggered. */
@ -104,7 +105,7 @@ ompi_evsignal_add(sigset_t *evsigmask, struct ompi_event *ev)
ompi_event_signal_pipe_handler,
0);
ompi_event_add_i(&ompi_event_signal_pipe_event, 0);
ompi_event_signal_count = 0;
#endif
initialized = true;
}
@ -119,6 +120,8 @@ int
ompi_evsignal_restart(void)
{
if(initialized) {
ompi_event_signal_count = 0;
#if 0
int rc;
ompi_event_del_i(&ompi_event_signal_pipe_event);
if ((rc = pipe(ompi_event_signal_pipe)) != 0) {
@ -129,8 +132,8 @@ ompi_evsignal_restart(void)
OMPI_EV_READ|OMPI_EV_PERSIST,
ompi_event_signal_pipe_handler,
0);
ompi_event_signal_count = 0;
ompi_event_add_i(&ompi_event_signal_pipe_event, 0);
#endif
}
return (0);
}
@ -154,8 +157,9 @@ ompi_evsignal_del(sigset_t *evsigmask, struct ompi_event *ev)
void
ompi_evsignal_handler(int sig)
{
const char errmsg[] = "ompi_evsignal_handler: error in write\n";
#if 0
unsigned char byte = 0;
#endif
ompi_evsigcaught[sig]++;
ompi_evsignal_caught = 1;

Просмотреть файл

@ -37,8 +37,8 @@ static char **mca_value_argv = NULL;
*/
int mca_base_cmd_line_setup(ompi_cmd_line_t *cmd)
{
return ompi_cmd_line_make_opt(cmd, 'm', "mca", 2,
"General mechanism to pass MCA parameters");
return ompi_cmd_line_make_opt3(cmd, '\0', "mca", "mca", 2,
"General mechanism to pass MCA parameters");
}
@ -48,39 +48,37 @@ int mca_base_cmd_line_setup(ompi_cmd_line_t *cmd)
int mca_base_cmd_line_process_args(ompi_cmd_line_t *cmd)
{
int i, num_insts;
char *buf = 0;
int buflen = 0;
char *buf, *name;
/* If no "-mca" parameters were given, just return */
if (!ompi_cmd_line_is_taken(cmd, "mca"))
return OMPI_SUCCESS;
if (!ompi_cmd_line_is_taken(cmd, "mca")) {
return OMPI_SUCCESS;
}
/* Otherwise, assemble them into an argc/argv */
num_insts = ompi_cmd_line_get_ninsts(cmd, "mca");
for (i = 0; i < num_insts; ++i)
mca_base_cmd_line_process_arg(ompi_cmd_line_get_param(cmd, "mca", i, 0),
ompi_cmd_line_get_param(cmd, "mca", i, 1));
for (i = 0; i < num_insts; ++i) {
mca_base_cmd_line_process_arg(ompi_cmd_line_get_param(cmd, "mca", i, 0),
ompi_cmd_line_get_param(cmd, "mca", i, 1));
}
/* Now put that argc/argv in the environment */
if (NULL == mca_param_argv)
return OMPI_SUCCESS;
if (NULL == mca_param_argv) {
return OMPI_SUCCESS;
}
/* Loop through all the -mca args that we've gotten and make env
vars of the form OMPI_MCA_*=value. This is a memory leak, but
that's how putenv works. :-( */
for (i = 0; NULL != mca_param_argv[i]; ++i) {
buflen = strlen(mca_param_argv[i]) + strlen(mca_value_argv[i]) + 32;
buf = malloc(buflen);
if (NULL == buf)
return OMPI_ERR_OUT_OF_RESOURCE;
snprintf(buf, buflen, "OMPI_MCA_%s=%s", mca_param_argv[i],
mca_value_argv[i]);
putenv(buf);
name = mca_base_param_environ_variable(mca_param_argv[i], NULL, NULL);
asprintf(&buf, "%s=%s", name, mca_value_argv[i]);
putenv(buf);
free(name);
}
return OMPI_SUCCESS;

Просмотреть файл

@ -26,6 +26,7 @@
#include "mca/errmgr/errmgr.h"
#include "mca/rml/rml.h"
#include "mca/gpr/gpr.h"
#include "mca/gpr/base/base.h"
#include "mca/ns/ns.h"
#include "mca/pml/pml.h"
#include "mca/base/mca_base_module_exchange.h"
@ -212,6 +213,12 @@ static void mca_base_modex_registry_callback(
bool isnew = false;
int rc;
#if 0
ompi_output(0, "[%d,%d,%d] mca_base_modex_registry_callback\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
orte_gpr_base_dump_notify_data(data,0);
#endif
/* process the callback */
value = data->values;
for (i=0; i < data->cnt; i++) {
@ -350,6 +357,15 @@ static void mca_base_modex_registry_callback(
modex_module->module_data_size = num_bytes;
#endif
modex_module->module_data_avail = true;
#if 0
ompi_output(0, "[%d,%d,%d] mca_base_modex_registry_callback: %s-%s-%d-%d received %d bytes\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
component.mca_type_name,
component.mca_component_name,
component.mca_component_major_version,
component.mca_component_minor_version,
num_bytes);
#endif
ompi_condition_signal(&modex_module->module_data_cond);
}
OMPI_THREAD_UNLOCK(&proc->proc_lock);
@ -510,7 +526,7 @@ int mca_base_modex_send(
const void *data,
size_t size)
{
char *jobidstring;
orte_jobid_t jobid;
orte_gpr_value_t *value;
int rc;
orte_buffer_t buffer;
@ -522,28 +538,23 @@ int mca_base_modex_send(
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid_string(&jobidstring, orte_process_info.my_name))) {
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 > asprintf(&(value->segment), "%s-%s", ORTE_JOB_SEGMENT, jobidstring)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
value->tokens = (char**)malloc(sizeof(char*));
if (NULL == value->tokens) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
value->addr_mode = ORTE_GPR_TOKENS_AND | ORTE_GPR_OVERWRITE;
if (ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&(value->tokens[0]), orte_process_info.my_name))) {
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&(value->segment), jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
value->num_tokens = 1;
if (ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&(value->tokens),
&(value->num_tokens), orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
value->addr_mode = ORTE_GPR_TOKENS_AND | ORTE_GPR_KEYS_OR;
value->cnt = 1;
value->keyvals = (orte_gpr_keyval_t**)malloc(sizeof(orte_gpr_keyval_t*));
value->keyvals[0] = OBJ_NEW(orte_gpr_keyval_t);
@ -642,6 +653,14 @@ int mca_base_modex_recv(
/* wait until data is available */
while(modex_module->module_data_avail == false) {
#if 0
ompi_output(0, "[%d,%d,%d] mca_base_modex_registry_callback: waiting for %s-%s-%d-%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
component->mca_type_name,
component->mca_component_name,
component->mca_component_major_version,
component->mca_component_minor_version);
#endif
ompi_condition_wait(&modex_module->module_data_cond, &proc->proc_lock);
}

Просмотреть файл

@ -547,11 +547,9 @@ int mca_base_param_build_env(char ***env, int *num_env, bool internal)
asprintf(&str, "%s=%s", array[i].mbp_env_var_name,
storage.stringval);
free(storage.stringval);
} else {
asprintf(&str, "%s=", array[i].mbp_env_var_name);
}
ompi_argv_append(num_env, env, str);
free(str);
ompi_argv_append(num_env, env, str);
free(str);
}
} else {
goto cleanup;
}

Просмотреть файл

@ -71,7 +71,7 @@ void orte_errmgr_base_abort()
orte_wait_kill(9);
/* abnormal exit */
_exit(-1);
exit(-1);
}
int orte_errmgr_base_register_job(orte_jobid_t job)

Просмотреть файл

@ -141,6 +141,7 @@ extern "C" {
OMPI_DECLSPEC int orte_gpr_base_pack_dump_segments(orte_buffer_t *cmd);
OMPI_DECLSPEC int orte_gpr_base_pack_dump_triggers(orte_buffer_t *cmd);
OMPI_DECLSPEC int orte_gpr_base_print_dump(orte_buffer_t *buffer, int output_id);
OMPI_DECLSPEC void orte_gpr_base_dump_keyval_value(orte_gpr_keyval_t *iptr, int output_id);
OMPI_DECLSPEC int orte_gpr_base_dump_notify_msg(orte_gpr_notify_message_t *msg, int output_id);
OMPI_DECLSPEC int orte_gpr_base_dump_notify_data(orte_gpr_notify_data_t *data, int output_id);

Просмотреть файл

@ -31,7 +31,6 @@
#include "mca/gpr/base/base.h"
static void orte_gpr_base_dump_data(orte_gpr_notify_data_t *data, int output_id);
static void orte_gpr_base_dump_keyval_value(orte_gpr_keyval_t *iptr, int output_id);
int orte_gpr_base_dump_notify_msg(orte_gpr_notify_message_t *msg, int output_id)
{
@ -145,7 +144,7 @@ static void orte_gpr_base_dump_data(orte_gpr_notify_data_t *data, int output_id)
}
static void orte_gpr_base_dump_keyval_value(orte_gpr_keyval_t *iptr, int output_id)
void orte_gpr_base_dump_keyval_value(orte_gpr_keyval_t *iptr, int output_id)
{
switch(iptr->type) {

Просмотреть файл

@ -67,7 +67,7 @@ int orte_gpr_proxy_increment_value(orte_gpr_value_t *value)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -80,7 +80,7 @@ int orte_gpr_proxy_increment_value(orte_gpr_value_t *value)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
@ -128,7 +128,7 @@ int orte_gpr_proxy_decrement_value(orte_gpr_value_t *value)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -141,7 +141,7 @@ int orte_gpr_proxy_decrement_value(orte_gpr_value_t *value)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;

Просмотреть файл

@ -59,7 +59,7 @@ int orte_gpr_proxy_cleanup_job(orte_jobid_t jobid)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -72,7 +72,7 @@ int orte_gpr_proxy_cleanup_job(orte_jobid_t jobid)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
@ -115,7 +115,7 @@ int orte_gpr_proxy_cleanup_proc(orte_process_name_t *proc)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -128,7 +128,7 @@ int orte_gpr_proxy_cleanup_proc(orte_process_name_t *proc)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;

Просмотреть файл

@ -247,7 +247,7 @@ orte_gpr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_
int orte_gpr_proxy_module_init(void)
{
/* issue the non-blocking receive */
return orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, MCA_OOB_TAG_GPR_NOTIFY, 0, orte_gpr_proxy_notify_recv, NULL);
return orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR_NOTIFY, 0, orte_gpr_proxy_notify_recv, NULL);
}
@ -266,7 +266,7 @@ int orte_gpr_proxy_finalize(void)
}
/* All done */
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, MCA_OOB_TAG_GPR_NOTIFY);
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR_NOTIFY);
return ORTE_SUCCESS;
}
@ -354,7 +354,7 @@ void orte_gpr_proxy_notify_recv(int status, orte_process_name_t* sender,
RETURN_ERROR:
/* reissue non-blocking receive */
orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, MCA_OOB_TAG_GPR_NOTIFY, 0, orte_gpr_proxy_notify_recv, NULL);
orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR_NOTIFY, 0, orte_gpr_proxy_notify_recv, NULL);
}

Просмотреть файл

@ -108,7 +108,7 @@ int orte_gpr_proxy_exec_compound_cmd(void)
OMPI_THREAD_LOCK(&orte_gpr_proxy_globals.wait_for_compound_mutex);
rc = ORTE_SUCCESS;
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, orte_gpr_proxy_globals.compound_cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, orte_gpr_proxy_globals.compound_cmd, ORTE_RML_TAG_GPR, 0)) {
rc = ORTE_ERR_COMM_FAILURE;
goto CLEANUP;
}
@ -119,7 +119,7 @@ int orte_gpr_proxy_exec_compound_cmd(void)
goto CLEANUP;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
OBJ_RELEASE(answer);
rc = ORTE_ERR_COMM_FAILURE;
goto CLEANUP;

Просмотреть файл

@ -60,7 +60,7 @@ int orte_gpr_proxy_delete_segment(char *segment)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -73,7 +73,7 @@ int orte_gpr_proxy_delete_segment(char *segment)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
@ -136,7 +136,7 @@ int orte_gpr_proxy_delete_entries(orte_gpr_addr_mode_t mode,
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -149,7 +149,7 @@ int orte_gpr_proxy_delete_entries(orte_gpr_addr_mode_t mode,
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
@ -203,7 +203,7 @@ int orte_gpr_proxy_index(char *segment, size_t *cnt, char **index)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -216,7 +216,7 @@ int orte_gpr_proxy_index(char *segment, size_t *cnt, char **index)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;

Просмотреть файл

@ -67,7 +67,7 @@ int orte_gpr_proxy_dump_all(int output_id)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
@ -78,7 +78,7 @@ int orte_gpr_proxy_dump_all(int output_id)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
@ -128,7 +128,7 @@ int orte_gpr_proxy_dump_segments(int output_id)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
@ -139,7 +139,7 @@ int orte_gpr_proxy_dump_segments(int output_id)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
@ -189,7 +189,7 @@ int orte_gpr_proxy_dump_triggers(int output_id)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
@ -200,7 +200,7 @@ int orte_gpr_proxy_dump_triggers(int output_id)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}

Просмотреть файл

@ -65,7 +65,7 @@ int orte_gpr_proxy_put(int cnt, orte_gpr_value_t **values)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
@ -77,7 +77,7 @@ int orte_gpr_proxy_put(int cnt, orte_gpr_value_t **values)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
@ -133,7 +133,7 @@ int orte_gpr_proxy_get(orte_gpr_addr_mode_t mode,
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}
@ -144,7 +144,7 @@ int orte_gpr_proxy_get(orte_gpr_addr_mode_t mode,
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
return ORTE_ERR_COMM_FAILURE;
}

Просмотреть файл

@ -131,7 +131,7 @@ orte_gpr_proxy_subscribe(orte_gpr_notify_action_t action,
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_LOCK(&orte_gpr_proxy_globals.mutex);
orte_gpr_proxy_remove_notify_request(idtag, &remote_idtag);
@ -148,7 +148,7 @@ orte_gpr_proxy_subscribe(orte_gpr_notify_action_t action,
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_LOCK(&orte_gpr_proxy_globals.mutex);
orte_gpr_proxy_remove_notify_request(idtag, &remote_idtag);
@ -232,7 +232,7 @@ int orte_gpr_proxy_unsubscribe(orte_gpr_notify_id_t sub_number)
return rc;
}
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, cmd, ORTE_RML_TAG_GPR, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -245,7 +245,7 @@ int orte_gpr_proxy_unsubscribe(orte_gpr_notify_id_t sub_number)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, MCA_OOB_TAG_GPR)) {
if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;

Просмотреть файл

@ -77,14 +77,15 @@ int orte_gpr_replica_increment_value(orte_gpr_value_t *value)
free(itags);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS == rc) {
if (ORTE_SUCCESS !=
(rc = orte_gpr_replica_check_subscriptions(seg, ORTE_GPR_REPLICA_VALUE_INCREMENTED))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
orte_gpr_replica_process_callbacks();
}
@ -138,14 +139,15 @@ int orte_gpr_replica_decrement_value(orte_gpr_value_t *value)
free(itags);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS == rc) {
if (ORTE_SUCCESS !=
(rc = orte_gpr_replica_check_subscriptions(seg, ORTE_GPR_REPLICA_VALUE_DECREMENTED))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
orte_gpr_replica_process_callbacks();
}

Просмотреть файл

@ -62,13 +62,13 @@ int orte_gpr_replica_dump_all(int output_id)
ORTE_ERROR_LOG(rc);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS == rc) {
orte_gpr_base_print_dump(buffer, output_id);
}
OBJ_RELEASE(buffer);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
@ -102,13 +102,13 @@ int orte_gpr_replica_dump_segments(int output_id)
ORTE_ERROR_LOG(rc);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS == rc) {
orte_gpr_base_print_dump(buffer, output_id);
}
OBJ_RELEASE(buffer);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
@ -142,12 +142,12 @@ int orte_gpr_replica_dump_triggers(int output_id)
ORTE_ERROR_LOG(rc);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS == rc) {
orte_gpr_base_print_dump(buffer, output_id);
}
OBJ_RELEASE(buffer);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}

Просмотреть файл

@ -87,7 +87,9 @@ int orte_gpr_replica_put(int cnt, orte_gpr_value_t **values)
goto CLEANUP;
}
free(itags);
if (NULL != itags) {
free(itags);
}
itags = NULL;
}

Просмотреть файл

@ -49,5 +49,7 @@ int orte_gpr_replica_preallocate_segment(char *name, int num_slots)
orte_gpr_replica_globals.max_size,
orte_gpr_replica_globals.block_size);
return rc;
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}

Просмотреть файл

@ -62,8 +62,7 @@ int orte_gpr_replica_recv_increment_value_cmd(orte_buffer_t *cmd, orte_buffer_t
if (ORTE_SUCCESS != (rc = orte_gpr_replica_get_itag_list(&itags, seg,
value->tokens, &(value->num_tokens)))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
ret = rc;
ret = rc;
goto RETURN_ERROR;
}
@ -77,12 +76,11 @@ int orte_gpr_replica_recv_increment_value_cmd(orte_buffer_t *cmd, orte_buffer_t
free(itags);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS == ret) {
if (ORTE_SUCCESS !=
(rc = orte_gpr_replica_check_subscriptions(seg, ORTE_GPR_REPLICA_VALUE_INCREMENTED))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
}
@ -90,9 +88,12 @@ int orte_gpr_replica_recv_increment_value_cmd(orte_buffer_t *cmd, orte_buffer_t
RETURN_ERROR:
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}
@ -145,12 +146,11 @@ int orte_gpr_replica_recv_decrement_value_cmd(orte_buffer_t *cmd, orte_buffer_t
free(itags);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS == ret) {
if (ORTE_SUCCESS !=
(rc = orte_gpr_replica_check_subscriptions(seg, ORTE_GPR_REPLICA_VALUE_DECREMENTED))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
}
@ -158,8 +158,11 @@ int orte_gpr_replica_recv_decrement_value_cmd(orte_buffer_t *cmd, orte_buffer_t
RETURN_ERROR:
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}

Просмотреть файл

@ -41,25 +41,28 @@ int orte_gpr_replica_recv_cleanup_job_cmd(orte_buffer_t *input_buffer,
return rc;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(input_buffer, &jobid, &n, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(input_buffer, &jobid, &n, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
ret = orte_gpr_replica_cleanup_job_fn(jobid);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
}
RETURN_ERROR:
if (ORTE_SUCCESS != (rc = orte_dps.pack(output_buffer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}
@ -77,25 +80,28 @@ int orte_gpr_replica_recv_cleanup_proc_cmd(orte_buffer_t *input_buffer,
return rc;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(input_buffer, &proc, &n, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
return rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(input_buffer, &proc, &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
ret = orte_gpr_replica_cleanup_proc_fn(&proc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
}
RETURN_ERROR:
if (ORTE_SUCCESS != (rc = orte_dps.pack(output_buffer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}

Просмотреть файл

@ -41,19 +41,16 @@ int orte_gpr_replica_recv_delete_segment_cmd(orte_buffer_t *buffer, orte_buffer_
return rc;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, &segment, &n, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
ret = rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(buffer, &segment, &n, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != (rc = orte_gpr_replica_find_seg(&seg, false, segment))) {
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
ORTE_ERROR_LOG(rc);
ret = rc;
if (ORTE_SUCCESS != (ret = orte_gpr_replica_find_seg(&seg, false, segment))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
@ -61,14 +58,15 @@ int orte_gpr_replica_recv_delete_segment_cmd(orte_buffer_t *buffer, orte_buffer_
ORTE_ERROR_LOG(ret);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
RETURN_ERROR:
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}
@ -87,24 +85,23 @@ int orte_gpr_replica_recv_delete_entries_cmd(orte_buffer_t *buffer, orte_buffer_
return rc;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, &addr_mode, &n, ORTE_GPR_ADDR_MODE))) {
ORTE_ERROR_LOG(rc);
ret = rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(buffer, &addr_mode, &n, ORTE_GPR_ADDR_MODE))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, &segment, &n, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
ret = rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(buffer, &segment, &n, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, &num_tokens, &n, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
ret = rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(buffer, &num_tokens, &n, ORTE_INT))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
@ -117,17 +114,15 @@ int orte_gpr_replica_recv_delete_entries_cmd(orte_buffer_t *buffer, orte_buffer_
ret = ORTE_ERR_OUT_OF_RESOURCE;
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, tokens, (size_t*)&num_tokens, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
ret = rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(buffer, tokens, (size_t*)&num_tokens, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
}
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, &num_keys, &n, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
ret = rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(buffer, &num_keys, &n, ORTE_INT))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
@ -140,34 +135,25 @@ int orte_gpr_replica_recv_delete_entries_cmd(orte_buffer_t *buffer, orte_buffer_
ret = ORTE_ERR_OUT_OF_RESOURCE;
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, keys, (size_t*)&num_keys, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
ret = rc;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(buffer, keys, (size_t*)&num_keys, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
/* locate the segment */
if (ORTE_SUCCESS != (rc = orte_gpr_replica_find_seg(&seg, false, segment))) {
ORTE_ERROR_LOG(rc);
ret = rc;
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != (ret = orte_gpr_replica_find_seg(&seg, false, segment))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_gpr_replica_get_itag_list(&token_itags, seg, tokens, &num_tokens))) {
ORTE_ERROR_LOG(rc);
ret = rc;
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != (ret = orte_gpr_replica_get_itag_list(&token_itags, seg, tokens, &num_tokens))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_gpr_replica_get_itag_list(&key_itags, seg, keys, &num_keys))) {
ORTE_ERROR_LOG(rc);
ret = rc;
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != (ret = orte_gpr_replica_get_itag_list(&key_itags, seg, keys, &num_keys))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
@ -179,8 +165,6 @@ int orte_gpr_replica_recv_delete_entries_cmd(orte_buffer_t *buffer, orte_buffer_
orte_gpr_replica_check_subscriptions(seg, ORTE_GPR_REPLICA_ENTRY_DELETED);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
RETURN_ERROR:
if (NULL != segment) {
@ -211,9 +195,12 @@ int orte_gpr_replica_recv_delete_entries_cmd(orte_buffer_t *buffer, orte_buffer_
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}
@ -232,55 +219,55 @@ int orte_gpr_replica_recv_index_cmd(orte_buffer_t *buffer,
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dps.peek(buffer, &type, &n))) {
ORTE_ERROR_LOG(rc);
ret = rc;
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != (ret = orte_dps.peek(buffer, &type, &n))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_STRING != type) { /* get index of segment names */
seg = NULL;
} else {
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, &segment, &n, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
ret = rc;
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != (ret = orte_dps.unpack(buffer, &segment, &n, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
/* locate the segment */
if (ORTE_SUCCESS != (rc = orte_gpr_replica_find_seg(&seg, false, segment))) {
ORTE_ERROR_LOG(rc);
ret = rc;
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != (ret = orte_gpr_replica_find_seg(&seg, false, segment))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
}
if (ORTE_SUCCESS != (ret = orte_gpr_replica_index_fn(seg, &cnt, index))) {
ORTE_ERROR_LOG(ret);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
goto RETURN_ERROR;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, &cnt, 1, ORTE_SIZE))) {
ORTE_ERROR_LOG(rc);
ret = rc;
goto RETURN_ERROR;
goto RETURN_PACK_ERROR;
}
if (0 < cnt) { /* got a non-zero answer back */
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, index, cnt, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
ret = rc;
goto RETURN_ERROR;
goto RETURN_PACK_ERROR;
}
}
RETURN_ERROR:
/* ensure that the minimum response is generated */
cnt = 0;
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, &cnt, 1, ORTE_SIZE))) {
ORTE_ERROR_LOG(rc);
ret = rc;
}
RETURN_PACK_ERROR:
if (NULL != segment) {
free(segment);
}
@ -294,8 +281,11 @@ int orte_gpr_replica_recv_index_cmd(orte_buffer_t *buffer,
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}

Просмотреть файл

@ -34,7 +34,6 @@ int orte_gpr_replica_recv_put_cmd(orte_buffer_t *buffer, orte_buffer_t *answer)
orte_gpr_value_t **values = NULL, *val;
orte_gpr_replica_segment_t *seg=NULL;
orte_gpr_replica_itag_t *itags=NULL;
orte_gpr_addr_mode_t addr_mode;
orte_data_type_t type;
int8_t action_taken=0;
int i=0, rc, ret;
@ -92,8 +91,8 @@ int orte_gpr_replica_recv_put_cmd(orte_buffer_t *buffer, orte_buffer_t *answer)
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (ret = orte_gpr_replica_put_fn(addr_mode, seg, itags, val->num_tokens,
val->cnt, val->keyvals, &action_taken))) {
if (ORTE_SUCCESS != (ret = orte_gpr_replica_put_fn(val->addr_mode, seg, itags,
val->num_tokens, val->cnt, val->keyvals, &action_taken))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
@ -102,6 +101,7 @@ int orte_gpr_replica_recv_put_cmd(orte_buffer_t *buffer, orte_buffer_t *answer)
if (ORTE_SUCCESS !=
(rc = orte_gpr_replica_check_subscriptions(seg, action_taken))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
}
@ -111,8 +111,6 @@ int orte_gpr_replica_recv_put_cmd(orte_buffer_t *buffer, orte_buffer_t *answer)
}
RETURN_ERROR:
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
/* release list of itags */
if (NULL != itags) {
free(itags);
@ -130,9 +128,11 @@ int orte_gpr_replica_recv_put_cmd(orte_buffer_t *buffer, orte_buffer_t *answer)
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}
@ -154,6 +154,8 @@ int orte_gpr_replica_recv_get_cmd(orte_buffer_t *input_buffer,
return rc;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
n = 1;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(input_buffer, &addr_mode, &n, ORTE_GPR_ADDR_MODE))) {
ORTE_ERROR_LOG(ret);
@ -176,7 +178,8 @@ int orte_gpr_replica_recv_get_cmd(orte_buffer_t *input_buffer,
tokens = (char**)malloc(num_tokens*sizeof(char*));
if (NULL == tokens) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
ret = ORTE_ERR_OUT_OF_RESOURCE;
goto RETURN_ERROR;
}
n = num_tokens;
if (ORTE_SUCCESS != (ret = orte_dps.unpack(input_buffer, tokens, &n, ORTE_STRING))) {
@ -210,8 +213,6 @@ int orte_gpr_replica_recv_get_cmd(orte_buffer_t *input_buffer,
keys = NULL;
}
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
/* find the segment */
if (ORTE_SUCCESS != (ret = orte_gpr_replica_find_seg(&seg, true, segment))) {
ORTE_ERROR_LOG(ret);
@ -257,8 +258,6 @@ int orte_gpr_replica_recv_get_cmd(orte_buffer_t *input_buffer,
}
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
if (NULL != segment) {
free(segment);
}
@ -288,9 +287,10 @@ int orte_gpr_replica_recv_get_cmd(orte_buffer_t *input_buffer,
/* pack response code */
if (ORTE_SUCCESS != (rc = orte_dps.pack(output_buffer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return ret;
}

Просмотреть файл

@ -35,7 +35,7 @@ int orte_gpr_replica_recv_subscribe_cmd(orte_process_name_t* sender,
{
orte_gpr_cmd_flag_t command=ORTE_GPR_SUBSCRIBE_CMD;
orte_data_type_t type;
orte_gpr_notify_id_t local_idtag=0, idtag=0;
orte_gpr_notify_id_t local_idtag=ORTE_GPR_NOTIFY_ID_MAX, idtag=ORTE_GPR_NOTIFY_ID_MAX;
int rc, ret, num_subs, num_trigs;
size_t n;
orte_gpr_notify_action_t action;
@ -47,57 +47,63 @@ int orte_gpr_replica_recv_subscribe_cmd(orte_process_name_t* sender,
return rc;
}
/******* LOCK *****/
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(input_buffer, &action, &n, ORTE_NOTIFY_ACTION))) {
ORTE_ERROR_LOG(rc);
return rc;
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dps.peek(input_buffer, &type, &n))) {
ORTE_ERROR_LOG(rc);
return rc;
goto RETURN_ERROR;
}
/* create the space for the subscriptions */
subscriptions = (orte_gpr_subscription_t**)malloc(n * sizeof(orte_gpr_subscription_t*));
if (NULL == subscriptions) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_dps.unpack(input_buffer, subscriptions, &n, ORTE_GPR_SUBSCRIPTION))) {
ORTE_ERROR_LOG(rc);
return rc;
if (0 < n) {
/* create the space for the subscriptions */
subscriptions = (orte_gpr_subscription_t**)malloc(n * sizeof(orte_gpr_subscription_t*));
if (NULL == subscriptions) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dps.unpack(input_buffer, subscriptions, &n, ORTE_GPR_SUBSCRIPTION))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
}
num_subs = (int)n;
if (ORTE_SUCCESS != (rc = orte_dps.peek(input_buffer, &type, &n))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* create the space for the triggers */
trigs = (orte_gpr_value_t**)malloc(n * sizeof(orte_gpr_value_t*));
if (NULL == trigs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != orte_dps.unpack(input_buffer, trigs, &n, ORTE_GPR_VALUE)) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
num_trigs = (int)n;
if (0 < n) {
/* create the space for the triggers */
trigs = (orte_gpr_value_t**)malloc(n * sizeof(orte_gpr_value_t*));
if (NULL == trigs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != orte_dps.unpack(input_buffer, trigs, &n, ORTE_GPR_VALUE)) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
}
num_trigs = (int)n;
n = 1;
if (ORTE_SUCCESS != orte_dps.unpack(input_buffer, &idtag, &n, ORTE_GPR_NOTIFY_ID)) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
/******* LOCK *****/
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
if (NULL != sender) { /* remote sender */
if (orte_gpr_replica_globals.debug) {
@ -109,7 +115,6 @@ int orte_gpr_replica_recv_subscribe_cmd(orte_process_name_t* sender,
if (ORTE_SUCCESS != (rc = orte_gpr_replica_enter_notify_request(&local_idtag,
sender, idtag, num_subs, subscriptions))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
goto RETURN_ERROR;
}
@ -119,14 +124,6 @@ int orte_gpr_replica_recv_subscribe_cmd(orte_process_name_t* sender,
num_trigs, trigs,
local_idtag))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
goto RETURN_ERROR;
}
/* pack the local idtag for return to sender */
if (ORTE_SUCCESS != (rc = orte_dps.pack(output_buffer, &local_idtag, 1, ORTE_GPR_NOTIFY_ID))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
goto RETURN_ERROR;
}
@ -137,28 +134,28 @@ int orte_gpr_replica_recv_subscribe_cmd(orte_process_name_t* sender,
num_trigs, trigs,
idtag))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
goto RETURN_ERROR;
}
/* pack the local idtag for return to local sender */
if (ORTE_SUCCESS != (rc = orte_dps.pack(output_buffer, &idtag, 1, ORTE_GPR_NOTIFY_ID))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
goto RETURN_ERROR;
}
/* set the local idtag for return to local sender */
local_idtag = idtag;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
/****** UNLOCK ******/
RETURN_ERROR:
/* pack the local idtag for return to sender */
if (ORTE_SUCCESS != (rc = orte_dps.pack(output_buffer, &local_idtag, 1, ORTE_GPR_NOTIFY_ID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (ret = orte_dps.pack(output_buffer, &rc, 1, ORTE_INT))) {
ORTE_ERROR_LOG(ret);
return ret;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
/****** UNLOCK ******/
return rc;
}
@ -175,28 +172,31 @@ int orte_gpr_replica_recv_unsubscribe_cmd(orte_buffer_t *input_buffer,
return rc;
}
/******* LOCK *****/
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
n = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(input_buffer, &sub_number, &n, ORTE_GPR_NOTIFY_ID))) {
ORTE_ERROR_LOG(rc);
return rc;
ret = rc;
goto RETURN_ERROR;
}
/******* LOCK *****/
OMPI_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
ret = orte_gpr_replica_unsubscribe_fn(sub_number);
if (ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
/****** UNLOCK ******/
RETURN_ERROR:
if (ORTE_SUCCESS != (rc = orte_dps.pack(output_buffer, &ret, 1, ORTE_INT))) {
ORTE_ERROR_LOG(rc);
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return rc;
}
OMPI_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
/****** UNLOCK ******/
return ret;
}

Просмотреть файл

@ -485,7 +485,12 @@ void orte_gpr_replica_dump_itagval_value(orte_buffer_t *buffer,
break;
}
orte_gpr_replica_dump_load_string(buffer, &tmp);
if (NULL == buffer) {
ompi_output(0, "%s", tmp);
free(tmp);
} else {
orte_gpr_replica_dump_load_string(buffer, &tmp);
}
}

Просмотреть файл

@ -32,6 +32,7 @@
#include "mca/ns/ns.h"
#include "mca/errmgr/errmgr.h"
#include "mca/gpr/base/base.h"
#include "mca/gpr/replica/communications/gpr_replica_comm.h"
#include "gpr_replica_fn.h"
@ -112,8 +113,8 @@ int orte_gpr_replica_register_callback(orte_gpr_replica_triggers_t *trig)
if (trig->requestor == cb->requestor) { /* same destination - add to existing callback */
if (ORTE_SUCCESS != (rc = orte_gpr_replica_construct_notify_message(&(cb->message), trig))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return rc;
}
}
/* got a new callback, generate the request */
@ -256,6 +257,10 @@ int orte_gpr_replica_add_values(orte_gpr_notify_data_t **data,
num_tokens = (int)orte_value_array_get_size(&(sptr->tokentags));
num_keys = (int) orte_value_array_get_size(&(sptr->keytags));
if (orte_gpr_replica_globals.debug) {
ompi_output(0, "add_values: performing a get");
}
if (ORTE_SUCCESS != (rc = orte_gpr_replica_get_fn(sptr->addr_mode, sptr->seg,
ORTE_VALUE_ARRAY_GET_BASE(&(sptr->tokentags), orte_gpr_replica_itag_t),
num_tokens,
@ -265,6 +270,21 @@ int orte_gpr_replica_add_values(orte_gpr_notify_data_t **data,
ORTE_ERROR_LOG(rc);
return rc;
}
if (orte_gpr_replica_globals.debug) {
ompi_output(0, "add_values: get returned %d values", cnt);
for (i=0; i < cnt; i++) {
ompi_output(0, "Data for value %d from segment %s\nTokens:", i, values[i]->segment);
for (j=0; j < values[i]->num_tokens; j++) {
ompi_output(0, "\ttoken num: %d\tToken: %s", j, values[i]->tokens[j]);
}
ompi_output(0, "\tGot %d keyals:", values[i]->cnt);
for (j=0; j < values[i]->cnt; j++) {
ompi_output(0, "\tValue num: %d\tKey: %s", j, (values[i]->keyvals[j])->key);
orte_gpr_base_dump_keyval_value(values[i]->keyvals[j], 0);
}
}
}
/* store these values in the notify_data structure, combining data
* where containers match

Просмотреть файл

@ -199,7 +199,6 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
if (ORTE_SUCCESS != (rc = orte_gpr_replica_update_keyval(seg, cptr[j], keyvals[i]))) {
return rc;
}
overwrite = false; /* only do it for the first one - rest get added */
*action_taken = *action_taken | ORTE_GPR_REPLICA_ENTRY_CHANGED;
} else {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_add_keyval(&iptr, seg, cptr[j], keyvals[i]))) {
@ -251,9 +250,35 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
orte_gpr_keyval_t **kptr;
orte_gpr_replica_addr_mode_t tokmode, keymode;
int rc, i, j, num_found;
char *token;
if (orte_gpr_replica_globals.debug) {
ompi_output(0, "[%d,%d,%d] gpr replica: get entered", ORTE_NAME_ARGS(orte_process_info.my_name));
ompi_output(0, "\tGetting data from segment %s wiht %d tokens and %d keys",
seg->name, num_tokens, num_keys);
for (i=0; i < num_tokens; i++) {
if (ORTE_SUCCESS != orte_gpr_replica_dict_reverse_lookup(
&token, seg, tokentags[i])) {
ompi_output(0, "\t\ttoken num %d: No entry found for itag %X",
i, tokentags[i]);
} else {
ompi_output(0, "\t\ttoken num %d: itag %d\tToken: %s",
i, tokentags[i], token);
free(token);
}
}
for (i=0; i < num_keys; i++) {
if (ORTE_SUCCESS != orte_gpr_replica_dict_reverse_lookup(
&token, seg, keytags[i])) {
ompi_output(0, "\t\tkey num %d: No entry found for itag %X",
i, keytags[i]);
} else {
ompi_output(0, "\t\tkey num %d: itag %d\tKey: %s",
i, keytags[i], token);
free(token);
}
}
}
/* initialize the list of findings */

Просмотреть файл

@ -165,8 +165,6 @@ int orte_gpr_replica_delete_itagval(orte_gpr_replica_segment_t *seg,
orte_gpr_replica_container_t *cptr,
orte_gpr_replica_itagval_t *iptr)
{
int i, rc;
/* see if anyone cares that this value is deleted */
/* trig = (orte_gpr_replica_triggers_t**)((orte_gpr_replica.triggers)->addr);
@ -184,15 +182,6 @@ int orte_gpr_replica_delete_itagval(orte_gpr_replica_segment_t *seg,
*/
/* remove the itag from the container;s list of itags */
for (i=0; i < cptr->num_itags; i++) {
if (ORTE_VALUE_ARRAY_GET_ITEM(&(cptr->itaglist), orte_gpr_replica_itag_t, i) == iptr->itag) {
orte_value_array_remove_item(&(cptr->itaglist), i);
(cptr->num_itags)--;
break;
}
}
/* remove the entry from the container's itagval array */
orte_pointer_array_set_item(cptr->itagvals, iptr->index, NULL);

Просмотреть файл

@ -51,6 +51,18 @@ int orte_gpr_replica_subscribe_fn(orte_gpr_notify_action_t action, int num_subs,
if (orte_gpr_replica_globals.debug) {
ompi_output(0, "[%d,%d,%d] gpr replica: subscribe entered",
ORTE_NAME_ARGS(orte_process_info.my_name));
ompi_output(0, "Received %d subscriptions", num_subs);
for (i=0; i < num_subs; i++) {
ompi_output(0, "Subscription %d on segment %s with %d tokens, %d keys",
i, subscriptions[i]->segment, subscriptions[i]->num_tokens,
subscriptions[i]->num_keys);
for (j=0; j < subscriptions[i]->num_tokens; j++) {
ompi_output(0, "\tToken num: %d\tToken: %s", j, subscriptions[i]->tokens[j]);
}
for (j=0; j < subscriptions[i]->num_keys; j++) {
ompi_output(0, "\tKey num: %d\tKey: %s", j, subscriptions[i]->keys[j]);
}
}
}
trig = (orte_gpr_replica_triggers_t*)((orte_gpr_replica.triggers)->addr[idtag]);
@ -59,7 +71,6 @@ int orte_gpr_replica_subscribe_fn(orte_gpr_notify_action_t action, int num_subs,
return ORTE_ERR_BAD_PARAM;
}
trig->action = action;
trig->num_subscribed_data = num_subs;
for (i=0; i < num_subs; i++) {
/* find the subscribed_data entry in the trigger pointer array - placed

Просмотреть файл

@ -76,7 +76,8 @@ orte_gpr_replica_enter_notify_request(orte_gpr_notify_id_t *local_idtag,
}
data->index = i;
}
trig->num_subscribed_data = cnt;
if (0 > (rc = orte_pointer_array_add(orte_gpr_replica.triggers, trig))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
@ -153,34 +154,13 @@ int orte_gpr_replica_check_subscriptions(orte_gpr_replica_segment_t *seg,
trig = (orte_gpr_replica_triggers_t**)((orte_gpr_replica.triggers)->addr);
for (i=0; i < (orte_gpr_replica.triggers)->size; i++) {
if (NULL != trig[i]) {
sptr = (orte_gpr_replica_subscribed_data_t**)((trig[i]->subscribed_data)->addr);
n = (trig[i]->subscribed_data)->size;
for (j=0; j < n; j++) {
if (NULL != sptr[j] && seg == sptr[j]->seg) {
#if 0
if (ORTE_GPR_NOTIFY_ANY & trig[i]->action &&
!(ORTE_GPR_TRIG_NOTIFY_START & trig[i]->action)) { /* notify exists and is active */
if (((ORTE_GPR_NOTIFY_ADD_ENTRY & trig[i]->action) && (ORTE_GPR_REPLICA_ENTRY_ADDED == action_taken)) ||
((ORTE_GPR_NOTIFY_DEL_ENTRY & trig[i]->action) && (ORTE_GPR_REPLICA_ENTRY_DELETED == action_taken)) ||
((ORTE_GPR_NOTIFY_VALUE_CHG & trig[i]->action) && (ORTE_GPR_REPLICA_ENTRY_CHANGED == action_taken)) ||
((ORTE_GPR_NOTIFY_VALUE_CHG_TO & trig[i]->action) && (ORTE_GPR_REPLICA_ENTRY_CHG_TO == action_taken)) ||
((ORTE_GPR_NOTIFY_VALUE_CHG_FRM & trig[i]->action) && (ORTE_GPR_REPLICA_ENTRY_CHG_FRM == action_taken))) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_register_callback(trig[i]))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
#endif
/* check if trigger is on this subscription - if so, check it */
if (ORTE_GPR_TRIG_ANY & trig[i]->action) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_check_trig(trig[i]))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
} /* if sptr not NULL */
} /* for j */
/* check if trigger is on this subscription - if so, check it */
if (ORTE_GPR_TRIG_ANY & trig[i]->action) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_check_trig(trig[i]))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
} /* if trig not NULL */
}
return ORTE_SUCCESS;
@ -217,6 +197,9 @@ int orte_gpr_replica_check_trig(orte_gpr_replica_triggers_t *trig)
}
}
if (fire) { /* all levels were equal */
if (orte_gpr_replica_globals.debug) {
ompi_output(0, "REGISTERING CALLBACK FOR TRIG %d", trig->index);
}
if (ORTE_SUCCESS != (rc = orte_gpr_replica_register_callback(trig))) {
ORTE_ERROR_LOG(rc);
return rc;

Просмотреть файл

@ -207,6 +207,9 @@ int mca_io_base_component_finalize(void)
{
initialized = false;
ompi_progress_unregister(mca_io_base_component_run_progress);
OBJ_DESTRUCT(&components_in_use);
return OMPI_SUCCESS;

Просмотреть файл

@ -44,6 +44,7 @@ extern "C" {
struct orte_iof_base_t {
int iof_output;
ompi_list_t iof_components_opened;
bool iof_component_selected;
ompi_list_t iof_endpoints;
ompi_mutex_t iof_lock;
ompi_condition_t iof_condition;

Просмотреть файл

@ -29,8 +29,13 @@ int orte_iof_base_close(void)
{
ompi_list_item_t* item;
/* flush all pending output */
orte_iof_base_flush();
/* We only need to flush if an iof component was successfully
selected */
if (orte_iof_base.iof_component_selected) {
orte_iof_base_flush();
orte_iof_base.iof_component_selected = false;
}
/* shutdown any remaining opened components */
if (0 != ompi_list_get_size(&orte_iof_base.iof_components_opened)) {

Просмотреть файл

@ -57,6 +57,7 @@ int orte_iof_base_open(void)
OBJ_CONSTRUCT(&orte_iof_base.iof_condition, ompi_condition_t);
OBJ_CONSTRUCT(&orte_iof_base.iof_fragments, ompi_free_list_t);
orte_iof_base.iof_waiting = 0;
orte_iof_base.iof_component_selected = false;
/* lookup common parameters */
id = mca_base_param_register_int("iof","base","window_size",NULL,ORTE_IOF_BASE_MSG_MAX << 1);

Просмотреть файл

@ -100,6 +100,7 @@ int orte_iof_base_select(void)
/* setup reference to selected module */
if (NULL != selected_module) {
orte_iof = *selected_module;
orte_iof_base.iof_component_selected = true;
return ORTE_SUCCESS;
}

45
src/mca/iof/null/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,45 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Use the top-level Makefile.options
include $(top_ompi_srcdir)/config/Makefile.options
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the OMPI_BUILD_iof_null_DSO conditional.
if OMPI_BUILD_iof_null_DSO
component_noinst =
component_install = mca_iof_null.la
else
component_noinst = libmca_iof_null.la
component_install =
endif
# Source list shared by both the DSO and the static library targets below.
null_SOURCES = \
iof_null.h \
iof_null_module.c \
iof_null_component.c
# DSO build: the component is listed for installation under
# $(libdir)/openmpi.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_iof_null_la_SOURCES = $(null_SOURCES)
mca_iof_null_la_LIBADD =
mca_iof_null_la_LDFLAGS = -module -avoid-version
# Static build: the component is built as a noinst (not installed) libtool
# library from the same sources.
noinst_LTLIBRARIES = $(component_noinst)
libmca_iof_null_la_SOURCES = $(null_SOURCES)
libmca_iof_null_la_LIBADD =
libmca_iof_null_la_LDFLAGS = -module -avoid-version

20
src/mca/iof/null/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,20 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_INIT_FILE=iof_null.h
PARAM_CONFIG_HEADER_FILE="iof_null_config.h"
PARAM_CONFIG_FILES="Makefile"

50
src/mca/iof/null/configure.stub Обычный файл
Просмотреть файл

@ -0,0 +1,50 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AC_DEFUN([MCA_CONFIGURE_STUB],[
# README README README README README README README README README
#
# The NULL iof component is here for debugging some things when
# using TM. It should not be used anywhere else, as you won't
# get I/O. So check for tm and follow the tm pls lead.
#
# README README README README README README README README README
#
# boot tm configure.stub
# NOTE(review): the checks below look borrowed from the tm pls
# configure.stub -- confirm they are kept in sync with it.
#
# Declare the --with-pls-tm=DIR option; its value is read below
# as the shell variable with_pls_tm.
AC_ARG_WITH(pls-tm,
AC_HELP_STRING([--with-pls-tm=DIR],
[directory where the tm software was installed]))
#
# Need to find tm.h - note that we don't care about CPPFLAGS being reset
# if this doesn't work, that's all she wrote :)
#
# When a directory was given, add its include subdirectory to the
# preprocessor search path before probing for the header.
if test ! -z "$with_pls_tm"; then
CPPFLAGS="$CPPFLAGS -I$with_pls_tm/include"
fi
# Hard requirement: configure stops with an error when tm.h is not found.
AC_CHECK_HEADERS(tm.h,,
AC_MSG_ERROR([*** Cannot find working tm.h.]))
#
# done with tm configure.stub
#
])dnl

143
src/mca/iof/null/iof_null.h Обычный файл
Просмотреть файл

@ -0,0 +1,143 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef ORTE_IOF_NULL_H
#define ORTE_IOF_NULL_H
#include "mca/iof/iof.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
* Publish a local file descriptor as an endpoint that is logically
* associated with the specified process name (e.g. master side of a
* pipe/pty connected to a child process).
*
* The null component's implementation ignores all of its arguments
* and simply reports success.
*
* @param name Process name the endpoint is associated with.
* @param mode Endpoint mode; semantics are defined by
*             orte_iof_base_mode_t (not visible in this file).
* @param tag Tag identifying the endpoint.
* @param fd Local file descriptor.
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_publish(
const orte_process_name_t* name,
orte_iof_base_mode_t mode,
orte_iof_base_tag_t tag,
int fd
);
/**
* Remove all registrations matching the specified process
* name, mask and tag values.
*
* The null component's implementation ignores all of its arguments
* and simply reports success.
*
* @param name Process name to match.
* @param mask Mask that specifies how name is interpreted.
* @param tag Tag to match.
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_unpublish(
const orte_process_name_t* name,
orte_ns_cmp_bitmask_t mask,
orte_iof_base_tag_t tag
);
/**
* Explicitly push data from the specified file descriptor
* to the indicated set of peers.
*
* The null component's implementation forwards nothing: it ignores
* all of its arguments and simply reports success.
*
* @param dst_name Name used to qualify set of peers.
* @param dst_mask Mask that specifies how name is interpreted.
* @param dst_tag Match a specific peer endpoint.
* @param fd Local file descriptor.
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_push(
const orte_process_name_t* dst_name,
orte_ns_cmp_bitmask_t dst_mask,
orte_iof_base_tag_t dst_tag,
int fd
);
/**
* Explicitly pull data from the specified set of peers
* and dump to the indicated file descriptor.
*
* The null component's implementation pulls nothing: it ignores
* all of its arguments and simply reports success.
*
* @param src_name Name used to qualify set of peers.
* @param src_mask Mask that specifies how name is interpreted.
* @param src_tag Match a specific peer endpoint.
* @param fd Local file descriptor.
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_pull(
const orte_process_name_t* src_name,
orte_ns_cmp_bitmask_t src_mask,
orte_iof_base_tag_t src_tag,
int fd
);
/**
* Setup buffering for a specified set of endpoints.
*
* The null component's implementation ignores all of its arguments
* and simply reports success.
*
* @param src_name Name used to qualify set of peers.
* @param src_mask Mask that specifies how name is interpreted.
* @param src_tag Match a specific peer endpoint.
* @param buffer_size Requested buffer size (units not shown here;
*                    presumably bytes -- confirm).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_buffer(
const orte_process_name_t* src_name,
orte_ns_cmp_bitmask_t src_mask,
orte_iof_base_tag_t src_tag,
size_t buffer_size
);
/**
* Subscribe to receive a callback on receipt of data
* from a specified set of peers.
*
* The null component's implementation registers nothing: it ignores
* all of its arguments (the callback is never invoked by this
* component) and simply reports success.
*
* @param src_name Name used to qualify set of peers.
* @param src_mask Mask that specifies how name is interpreted.
* @param src_tag Match a specific peer endpoint.
* @param cb Callback to invoke on receipt of data.
* @param cbdata Opaque caller data (presumably handed back to cb
*               -- confirm against orte_iof_base_callback_fn_t).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_subscribe(
const orte_process_name_t* src_name,
orte_ns_cmp_bitmask_t src_mask,
orte_iof_base_tag_t src_tag,
orte_iof_base_callback_fn_t cb,
void* cbdata
);
/**
* Presumably the counterpart of orte_iof_null_subscribe() (inferred
* from the name -- confirm against the iof framework interface).
*
* The null component's implementation ignores all of its arguments
* and simply reports success.
*
* @param src_name Name used to qualify set of peers.
* @param src_mask Mask that specifies how name is interpreted.
* @param src_tag Match a specific peer endpoint.
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_unsubscribe(
const orte_process_name_t* src_name,
orte_ns_cmp_bitmask_t src_mask,
orte_iof_base_tag_t src_tag
);
/**
* IOF null Component
*
* Component descriptor for the "null" iof component.  The single
* instance is mca_iof_null_component; the module function table it
* hands out from its init function is orte_iof_null_module.
*/
struct orte_iof_null_component_t {
/* Generic iof component descriptor; must stay the first member so the
   initializer's leading brace group maps onto it. */
orte_iof_base_component_t super;
/* Debug setting, populated from MCA parameters in the component's
   open function. */
int null_debug;
/* One-element iovec, statically initialized to {NULL, 0}.  Not used by
   any code visible in this component -- NOTE(review): confirm it is
   still needed. */
struct iovec null_iov[1];
};
typedef struct orte_iof_null_component_t orte_iof_null_component_t;
/* The component instance and its module function table. */
OMPI_COMP_EXPORT extern orte_iof_null_component_t mca_iof_null_component;
OMPI_COMP_EXPORT extern orte_iof_base_module_t orte_iof_null_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

126
src/mca/iof/null/iof_null_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,126 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "util/proc_info.h"
#include "util/output.h"
#include "runtime/ompi_progress.h"
#include "mca/rml/rml.h"
#include "mca/base/base.h"
#include "mca/base/mca_base_param.h"
#include "mca/iof/base/base.h"
#include "mca/iof/base/iof_base_endpoint.h"
#include "iof_null.h"
/*
* Local functions
*/
static int orte_iof_null_open(void);
static int orte_iof_null_close(void);
static orte_iof_base_module_t* orte_iof_null_init(
int* priority,
bool *allow_multi_user_threads,
bool *have_hidden_threads);
/*
 * The one and only instance of the null iof component.  Members are
 * initialized positionally, in the order declared by
 * orte_iof_null_component_t: super, null_debug, null_iov.
 */
orte_iof_null_component_t mca_iof_null_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
/* Indicate that we are a iof v1.0.0 component (which also
implies a specific MCA version) */
ORTE_IOF_BASE_VERSION_1_0_0,
"null", /* MCA component name */
1, /* MCA component major version */
0, /* MCA component minor version */
0, /* MCA component release version */
orte_iof_null_open, /* component open */
orte_iof_null_close /* component close */
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
},
/* Component init function: decides whether this component is
   selected and returns its module */
orte_iof_null_init
},
/* null_debug: placeholder (0) until orte_iof_null_open() assigns it */
false,
/* null_iov: a single empty iovec (base NULL, length 0) */
{{NULL, 0}}
};
/**
 * Register an integer MCA parameter in the "iof" / "null" namespace and
 * return the value the parameter system reports for it.
 *
 * @param param_name    Name of the parameter (third component of the
 *                      full parameter name).
 * @param default_value Value to register as the default; it is also what
 *                      is returned if the lookup does not store a value.
 * @return The looked-up parameter value, or default_value if the lookup
 *         left the output untouched.
 */
static int orte_iof_null_param_register_int(
    const char* param_name,
    int default_value)
{
    int value = default_value;
    int index;

    index = mca_base_param_register_int("iof", "null", param_name,
                                        NULL, default_value);
    mca_base_param_lookup_int(index, &value);

    return value;
}
/**
 * Component open function.
 *
 * Registers the component's two MCA parameters:
 *   - "debug"    (default 1): stored in mca_iof_null_component.null_debug.
 *   - "override" (default 0): only registered here; it is looked up again
 *                 by name in orte_iof_null_init().
 *
 * @return OMPI_SUCCESS, unconditionally.
 */
static int orte_iof_null_open(void)
{
    mca_iof_null_component.null_debug =
        orte_iof_null_param_register_int("debug", 1);

    /* Register "override" so that init can find it.  Its value must NOT be
       written to null_debug: doing so would discard the "debug" setting
       read just above (the component has no field for "override"). */
    (void) orte_iof_null_param_register_int("override", 0);

    return OMPI_SUCCESS;
}
/**
 * Component init function: decide whether the null iof component may be
 * used in this process.
 *
 * The component is selected when either the "iof"/"null"/"override" MCA
 * parameter is non-zero, or both the PBS_ENVIRONMENT and PBS_JOBID
 * environment variables are set.
 *
 * @param priority                 Set to 50 when the component is selected.
 * @param allow_multi_user_threads Set to true when selected.
 * @param have_hidden_threads      Set to false when selected.
 * @return Pointer to orte_iof_null_module when selected, NULL otherwise
 *         (the output parameters are left untouched in that case).
 */
static orte_iof_base_module_t*
orte_iof_null_init(int* priority, bool *allow_multi_user_threads,
                   bool *have_hidden_threads)
{
    int param;
    /* Default to "no override" so the test below never reads an
       indeterminate value if the parameter cannot be found/looked up
       (assumes a failed lookup leaves its output untouched -- confirm). */
    int override = 0;

    param = mca_base_param_find("iof", "null", "override");
    mca_base_param_lookup_int(param, &override);

    /* Only be used in a PBS environment -- this component is
       currently *only* for debugging */
    if (0 != override ||
        (NULL != getenv("PBS_ENVIRONMENT") &&
         NULL != getenv("PBS_JOBID"))) {
        *priority = 50;
        *allow_multi_user_threads = true;
        *have_hidden_threads = false;
        return &orte_iof_null_module;
    }

    return NULL;
}
/**
* Component close function.
*
* The null component holds no resources, so there is nothing to
* release.
*
* @return ORTE_SUCCESS, unconditionally.
*/
static int orte_iof_null_close(void)
{
return ORTE_SUCCESS;
}

157
src/mca/iof/null/iof_null_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,157 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include "include/constants.h"
#include "util/output.h"
#include "mca/iof/iof.h"
#include "mca/rml/rml.h"
#include "mca/rml/rml_types.h"
#include "mca/iof/iof.h"
#include "mca/iof/base/base.h"
#include "mca/iof/base/iof_base_endpoint.h"
#include "iof_null.h"
/*
 * Module function table returned by the component's init function.
 * Entries are positional; the slot layout is defined by
 * orte_iof_base_module_t (not visible in this file).  Every
 * null-specific entry is a stub that returns ORTE_SUCCESS; the final
 * slot uses the base implementation orte_iof_base_flush.
 *
 * NOTE(review): orte_iof_null_buffer() is defined below but is not
 * installed in this table -- confirm whether that is intentional.
 */
orte_iof_base_module_t orte_iof_null_module = {
orte_iof_null_publish,
orte_iof_null_unpublish,
orte_iof_null_push,
orte_iof_null_pull,
orte_iof_null_subscribe,
orte_iof_null_unsubscribe,
orte_iof_base_flush
};
/**
* Publish a local file descriptor as an endpoint that is logically
* associated with the specified process name (e.g. master side of a
* pipe/pty connected to a child process).
*
* Null implementation: all arguments are ignored and nothing is
* published.
*
* @param name Process name the endpoint is associated with (unused).
* @param mode Endpoint mode (unused).
* @param tag Tag identifying the endpoint (unused).
* @param fd Local file descriptor (unused; it is not closed here).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_publish(
const orte_process_name_t* name,
orte_iof_base_mode_t mode,
orte_iof_base_tag_t tag,
int fd)
{
return ORTE_SUCCESS;
}
/**
* Remove all registrations matching the specified process
* name, mask and tag values.
*
* Null implementation: all arguments are ignored; there are never
* any registrations to remove.
*
* @param name Process name to match (unused).
* @param mask Mask that specifies how name is interpreted (unused).
* @param tag Tag to match (unused).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_unpublish(
const orte_process_name_t* name,
orte_ns_cmp_bitmask_t mask,
orte_iof_base_tag_t tag)
{
return ORTE_SUCCESS;
}
/**
* Explicitly push data from the specified file descriptor
* to the indicated set of peers.
*
* Null implementation: all arguments are ignored and no data is
* read from fd or forwarded.
*
* @param dst_name Name used to qualify set of peers (unused).
* @param dst_mask Mask that specifies how name is interpreted (unused).
* @param dst_tag Match a specific peer endpoint (unused).
* @param fd Local file descriptor (unused).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_push(
const orte_process_name_t* dst_name,
orte_ns_cmp_bitmask_t dst_mask,
orte_iof_base_tag_t dst_tag,
int fd)
{
return ORTE_SUCCESS;
}
/**
* Explicitly pull data from the specified set of peers
* and dump to the indicated file descriptor.
*
* Null implementation: all arguments are ignored and nothing is
* written to fd.
*
* @param src_name Name used to qualify set of peers (unused).
* @param src_mask Mask that specifies how name is interpreted (unused).
* @param src_tag Match a specific peer endpoint (unused).
* @param fd Local file descriptor (unused).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_pull(
const orte_process_name_t* src_name,
orte_ns_cmp_bitmask_t src_mask,
orte_iof_base_tag_t src_tag,
int fd)
{
return ORTE_SUCCESS;
}
/**
* Setup buffering for a specified set of endpoints.
*
* Null implementation: all arguments are ignored and no buffering
* is configured.
*
* @param src_name Name used to qualify set of peers (unused).
* @param src_mask Mask that specifies how name is interpreted (unused).
* @param src_tag Match a specific peer endpoint (unused).
* @param buffer_size Requested buffer size (unused).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_buffer(
const orte_process_name_t* src_name,
orte_ns_cmp_bitmask_t src_mask,
orte_iof_base_tag_t src_tag,
size_t buffer_size)
{
return ORTE_SUCCESS;
}
/**
* Subscribe to receive a callback on receipt of data
* from a specified set of peers.
*
* Null implementation: all arguments are ignored; the callback is
* not recorded and is never invoked by this component.
*
* @param src_name Name used to qualify set of peers (unused).
* @param src_mask Mask that specifies how name is interpreted (unused).
* @param src_tag Match a specific peer endpoint (unused).
* @param cb Callback to invoke on receipt of data (unused).
* @param cbdata Opaque caller data (unused).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_subscribe(
const orte_process_name_t* src_name,
orte_ns_cmp_bitmask_t src_mask,
orte_iof_base_tag_t src_tag,
orte_iof_base_callback_fn_t cb,
void* cbdata)
{
return ORTE_SUCCESS;
}
/**
* Presumably the counterpart of orte_iof_null_subscribe() (inferred
* from the name -- confirm against the iof framework interface).
*
* Null implementation: all arguments are ignored; there is never a
* subscription to cancel.
*
* @param src_name Name used to qualify set of peers (unused).
* @param src_mask Mask that specifies how name is interpreted (unused).
* @param src_tag Match a specific peer endpoint (unused).
* @return ORTE_SUCCESS, unconditionally.
*/
int orte_iof_null_unsubscribe(
const orte_process_name_t* src_name,
orte_ns_cmp_bitmask_t src_mask,
orte_iof_base_tag_t src_tag)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -35,6 +35,12 @@ static orte_iof_base_module_t* orte_iof_proxy_init(
bool *have_hidden_threads);
/*
* Local variables
*/
static bool initialized = false;
orte_iof_proxy_component_t mca_iof_proxy_component = {
{
/* First, the mca_base_component_t struct containing meta
@ -127,6 +133,7 @@ orte_iof_proxy_init(int* priority, bool *allow_multi_user_threads, bool *have_hi
ompi_output(0, "orte_iof_proxy_init: unable to post non-blocking recv");
return NULL;
}
initialized = true;
return &orte_iof_proxy_module;
}
@ -136,7 +143,12 @@ orte_iof_proxy_init(int* priority, bool *allow_multi_user_threads, bool *have_hi
static int orte_iof_proxy_close(void)
{
return orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_IOF_SVC);
int rc = ORTE_SUCCESS;
if (initialized) {
rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_IOF_SVC);
}
return rc;
}

Просмотреть файл

@ -33,6 +33,11 @@ static orte_iof_base_module_t* orte_iof_svc_init(
bool *allow_multi_user_threads,
bool *have_hidden_threads);
/*
* Local variables
*/
static bool initialized = false;
orte_iof_svc_component_t mca_iof_svc_component = {
{
@ -92,26 +97,27 @@ static int orte_iof_svc_param_register_int(
static int orte_iof_svc_open(void)
{
mca_iof_svc_component.svc_debug = orte_iof_svc_param_register_int("debug", 1);
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_subscribed, ompi_list_t);
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_published, ompi_list_t);
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_lock, ompi_mutex_t);
return OMPI_SUCCESS;
return ORTE_SUCCESS;
}
static int orte_iof_svc_close(void)
{
ompi_list_item_t* item;
OMPI_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
while((item = ompi_list_remove_first(&mca_iof_svc_component.svc_subscribed)) != NULL) {
OBJ_RELEASE(item);
if (initialized) {
OMPI_THREAD_LOCK(&mca_iof_svc_component.svc_lock);
while((item = ompi_list_remove_first(&mca_iof_svc_component.svc_subscribed)) != NULL) {
OBJ_RELEASE(item);
}
while((item = ompi_list_remove_first(&mca_iof_svc_component.svc_published)) != NULL) {
OBJ_RELEASE(item);
}
OMPI_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_IOF_SVC);
}
while((item = ompi_list_remove_first(&mca_iof_svc_component.svc_published)) != NULL) {
OBJ_RELEASE(item);
}
OMPI_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_IOF_SVC);
return OMPI_SUCCESS;
return ORTE_SUCCESS;
}
@ -121,13 +127,18 @@ static orte_iof_base_module_t*
orte_iof_svc_init(int* priority, bool *allow_multi_user_threads, bool *have_hidden_threads)
{
int rc;
if(orte_process_info.seed == false)
if (false == orte_process_info.seed) {
return NULL;
}
*priority = 1;
*allow_multi_user_threads = true;
*have_hidden_threads = false;
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_subscribed, ompi_list_t);
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_published, ompi_list_t);
OBJ_CONSTRUCT(&mca_iof_svc_component.svc_lock, ompi_mutex_t);
/* post non-blocking recv */
mca_iof_svc_component.svc_iov[0].iov_base = NULL;
mca_iof_svc_component.svc_iov[0].iov_len = 0;
@ -145,6 +156,7 @@ orte_iof_svc_init(int* priority, bool *allow_multi_user_threads, bool *have_hidd
ompi_output(0, "orte_iof_svc_init: unable to post non-blocking recv");
return NULL;
}
initialized = true;
return &orte_iof_svc_module;
}

Просмотреть файл

@ -58,11 +58,7 @@ typedef uint8_t orte_ns_cmd_bitmask_t;
/* CAUTION - any changes here must also change corresponding
* typedefs above
*/
#define ORTE_NS_OOB_PACK_JOBID ORTE_INT32
#define ORTE_NS_OOB_PACK_CELLID ORTE_INT32
#define ORTE_NS_OOB_PACK_VPID ORTE_INT32
#define ORTE_NS_OOB_PACK_CMD ORTE_INT16
#define ORTE_NS_OOB_PACK_OOB_TAG ORTE_INT32
#define ORTE_NS_CMD ORTE_INT16
/*
* define flag values for remote commands - only used internally

Просмотреть файл

@ -73,7 +73,7 @@ int orte_ns_nds_env_put(const orte_process_name_t* proc,
char ***environ);
int orte_ns_nds_pipe_get(void);
int orte_ns_nds_pipe_put(const orte_process_name_t* proc, orte_vpid_t vpid_start, size_t num_procs);
int orte_ns_nds_pipe_put(const orte_process_name_t* proc, orte_vpid_t vpid_start, size_t num_procs, int fd);
#if defined(c_plusplus) || defined(__cplusplus)
}

Просмотреть файл

@ -48,30 +48,45 @@ int orte_ns_nds_env_get(void)
} else {
int cellid;
int jobid;
int vpid;
orte_cellid_t cellid;
orte_jobid_t jobid;
orte_vpid_t vpid;
char* cellid_string;
char* jobid_string;
char* vpid_string;
id = mca_base_param_register_int("ns", "nds", "cellid", NULL, -1);
mca_base_param_lookup_int(id, &cellid);
if (cellid < 0) {
id = mca_base_param_register_string("ns", "nds", "cellid", NULL, NULL);
mca_base_param_lookup_string(id, &cellid_string);
if (NULL == cellid_string) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_cellid(&cellid, cellid_string))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
id = mca_base_param_register_int("ns", "nds", "jobid", NULL, -1);
mca_base_param_lookup_int(id, &jobid);
if (jobid < 0) {
id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
mca_base_param_lookup_string(id, &jobid_string);
if (NULL == jobid_string) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_jobid(&jobid, jobid_string))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
id = mca_base_param_register_int("ns", "nds", "vpid", NULL, -1);
mca_base_param_lookup_int(id, &vpid);
if (vpid < 0) {
id = mca_base_param_register_string("ns", "nds", "vpid", NULL, NULL);
mca_base_param_lookup_string(id, &vpid_string);
if (NULL == vpid_string) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_vpid(&vpid, vpid_string))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(
&(orte_process_info.my_name),

Просмотреть файл

@ -15,6 +15,9 @@
*/
#include "orte_config.h"
#include <unistd.h>
#include <stdlib.h>
#include "include/orte_constants.h"
#include "util/proc_info.h"
#include "mca/base/mca_base_param.h"
@ -26,11 +29,64 @@
int orte_ns_nds_pipe_get(void)
{
return ORTE_ERR_NOT_IMPLEMENTED;
int rc, fd, id;
orte_process_name_t name;
size_t num_procs;
/* lookup the fd to use */
id = mca_base_param_register_int("nds","pipe","fd", NULL, 3);
mca_base_param_lookup_int(id, &fd);
rc = read(fd,&name,sizeof(name));
if(rc != sizeof(name)) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if(ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&orte_process_info.my_name, &name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = read(fd,&orte_process_info.vpid_start, sizeof(orte_process_info.vpid_start));
if(rc != sizeof(orte_process_info.vpid_start)) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
rc = read(fd,&num_procs, sizeof(num_procs));
if(rc != sizeof(orte_process_info.num_procs)) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
orte_process_info.num_procs = num_procs;
close(fd);
return ORTE_SUCCESS;
}
int orte_ns_nds_pipe_put(const orte_process_name_t* name, orte_vpid_t vpid_start, size_t num_procs)
int orte_ns_nds_pipe_put(const orte_process_name_t* name, orte_vpid_t vpid_start, size_t num_procs, int fd)
{
return ORTE_ERR_NOT_IMPLEMENTED;
int rc;
rc = write(fd,name,sizeof(orte_process_name_t));
if(rc != sizeof(orte_process_name_t)) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_NOT_FOUND;
}
rc = write(fd,&vpid_start, sizeof(vpid_start));
if(rc != sizeof(vpid_start)) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_NOT_FOUND;
}
rc = write(fd,&num_procs, sizeof(num_procs));
if(rc != sizeof(num_procs)) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_NOT_FOUND;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -20,6 +20,7 @@
#include "include/orte_types.h"
#include "mca/mca.h"
#include "dps/dps.h"
#include "mca/errmgr/errmgr.h"
#include "mca/rml/rml.h"
#include "ns_proxy.h"
@ -46,44 +47,44 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid)
cmd = OBJ_NEW(orte_buffer_t);
if(cmd == NULL) {
return ORTE_ERR_OUT_OF_RESOURCE;
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (OMPI_SUCCESS != orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD)) {
if (ORTE_SUCCESS != orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD)) {
OBJ_RELEASE(cmd);
return ORTE_ERR_PACK_FAILURE;
return ORTE_ERR_PACK_FAILURE;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(cmd);
answer = OBJ_NEW(orte_buffer_t);
if(answer == NULL) {
return ORTE_ERR_OUT_OF_RESOURCE;
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) {
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
return ORTE_ERR_COMM_FAILURE;
}
count = 1;
if ((OMPI_SUCCESS != orte_dps.unpack(answer, &command, &count, ORTE_NS_OOB_PACK_CMD))
|| (ORTE_NS_CREATE_CELLID_CMD != command)) {
if ((ORTE_SUCCESS != orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))
|| (ORTE_NS_CREATE_CELLID_CMD != command)) {
OBJ_RELEASE(answer);
return ORTE_ERR_UNPACK_FAILURE;
return ORTE_ERR_UNPACK_FAILURE;
}
count = 1;
if (OMPI_SUCCESS != orte_dps.unpack(answer, cellid, &count, ORTE_NS_OOB_PACK_CELLID)) {
if (ORTE_SUCCESS != orte_dps.unpack(answer, cellid, &count, ORTE_CELLID)) {
OBJ_RELEASE(answer);
return ORTE_ERR_UNPACK_FAILURE;
return ORTE_ERR_UNPACK_FAILURE;
}
OBJ_RELEASE(answer);
return ORTE_SUCCESS;
return ORTE_SUCCESS;
}
@ -98,44 +99,44 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job)
*job = ORTE_JOBID_MAX;
if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
return ORTE_ERR_OUT_OF_RESOURCE;
return ORTE_ERR_OUT_OF_RESOURCE;
}
command = ORTE_NS_CREATE_JOBID_CMD;
if (OMPI_SUCCESS != orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD)) { /* got a problem */
if (ORTE_SUCCESS != orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD)) { /* got a problem */
OBJ_RELEASE(cmd);
return OMPI_ERR_PACK_FAILURE;
return ORTE_ERR_PACK_FAILURE;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(cmd);
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
OBJ_RELEASE(answer);
return ORTE_ERR_OUT_OF_RESOURCE;
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) {
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
return ORTE_ERR_COMM_FAILURE;
}
count = 1;
if ((OMPI_SUCCESS != orte_dps.unpack(answer, &command, &count, ORTE_NS_OOB_PACK_CMD))
|| (ORTE_NS_CREATE_JOBID_CMD != command)) {
if ((ORTE_SUCCESS != orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))
|| (ORTE_NS_CREATE_JOBID_CMD != command)) {
OBJ_RELEASE(answer);
return ORTE_ERR_UNPACK_FAILURE;
return ORTE_ERR_UNPACK_FAILURE;
}
count = 1;
if (OMPI_SUCCESS != orte_dps.unpack(answer, job, &count, ORTE_NS_OOB_PACK_JOBID)) {
if (ORTE_SUCCESS != orte_dps.unpack(answer, job, &count, ORTE_JOBID)) {
OBJ_RELEASE(answer);
return ORTE_ERR_UNPACK_FAILURE;
return ORTE_ERR_UNPACK_FAILURE;
}
OBJ_RELEASE(answer);
return ORTE_SUCCESS;
return ORTE_SUCCESS;
}
@ -145,60 +146,69 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t
orte_buffer_t* answer;
orte_ns_cmd_flag_t command;
size_t count;
int rc;
/* set default return value */
*starting_vpid = ORTE_VPID_MAX;
if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
return ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
command = ORTE_NS_RESERVE_RANGE_CMD;
if (OMPI_SUCCESS != orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD)) { /* got a problem */
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return ORTE_ERR_PACK_FAILURE;
return rc;
}
if (OMPI_SUCCESS != orte_dps.pack(cmd, (void*)&job, 1, ORTE_NS_OOB_PACK_JOBID)) { /* got a problem */
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&job, 1, ORTE_JOBID))) { /* got a problem */
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return ORTE_ERR_PACK_FAILURE;
return rc;
}
if (OMPI_SUCCESS != orte_dps.pack(cmd, (void*)&range, 1, ORTE_NS_OOB_PACK_VPID)) { /* got a problem */
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&range, 1, ORTE_VPID))) { /* got a problem */
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return ORTE_ERR_PACK_FAILURE;
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
if (0 > (rc = orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
return rc;
}
OBJ_RELEASE(cmd);
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
return ORTE_ERR_OUT_OF_RESOURCE;
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) {
if (0 > (rc = orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
return rc;
}
count = 1;
if ((OMPI_SUCCESS != orte_dps.unpack(answer, &command, &count, ORTE_NS_OOB_PACK_CMD))
|| (ORTE_NS_RESERVE_RANGE_CMD != command)) {
if ((ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD)))
|| (ORTE_NS_RESERVE_RANGE_CMD != command)) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return ORTE_ERR_UNPACK_FAILURE;
return rc;
}
count = 1;
if (OMPI_SUCCESS != orte_dps.unpack(answer, starting_vpid, &count, ORTE_NS_OOB_PACK_VPID)) {
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, starting_vpid, &count, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return ORTE_ERR_UNPACK_FAILURE;
return ORTE_ERR_UNPACK_FAILURE;
}
OBJ_RELEASE(answer);
return ORTE_SUCCESS;
return ORTE_SUCCESS;
}
@ -235,7 +245,7 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (OMPI_SUCCESS != orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD)) {
if (ORTE_SUCCESS != orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD)) {
OBJ_RELEASE(cmd);
return ORTE_ERR_PACK_FAILURE;
}
@ -265,14 +275,14 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
}
count = 1;
if ((OMPI_SUCCESS != orte_dps.unpack(answer, &command, &count, ORTE_NS_OOB_PACK_CMD))
if ((ORTE_SUCCESS != orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))
|| (ORTE_NS_CREATE_CELLID_CMD != command)) {
OBJ_RELEASE(answer);
return ORTE_ERR_UNPACK_FAILURE;
}
count = 1;
if (OMPI_SUCCESS != orte_dps.unpack(answer, tag, &count, ORTE_NS_OOB_PACK_OOB_TAG)) {
if (ORTE_SUCCESS != orte_dps.unpack(answer, tag, &count, ORTE_UINT32)) {
OBJ_RELEASE(answer);
return ORTE_ERR_UNPACK_FAILURE;
}

Просмотреть файл

@ -36,6 +36,7 @@
#include "mca/mca.h"
#include "mca/base/mca_base_param.h"
#include "mca/errmgr/errmgr.h"
#include "mca/rml/rml.h"
#include "ns_replica.h"
@ -200,7 +201,7 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority)
/* initialize the taglist */
OBJ_CONSTRUCT(&orte_ns_replica_taglist, ompi_list_t);
orte_ns_replica_next_rml_tag = ORTE_OOB_TAG_START_LIST;
orte_ns_replica_next_rml_tag = ORTE_RML_TAG_DYNAMIC;
/* setup the thread lock */
OBJ_CONSTRUCT(&orte_ns_replica_mutex, ompi_mutex_t);
@ -278,115 +279,129 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
char *tagname;
orte_rml_tag_t oob_tag;
size_t count;
int32_t return_code=ORTE_SUCCESS;
int rc=ORTE_SUCCESS, ret;
count = 1;
if (ORTE_SUCCESS != orte_dps.unpack(buffer, (void*)&command, &count, ORTE_NS_OOB_PACK_CMD)) {
return_code = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(buffer, (void*)&command, &count, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
if ((answer = OBJ_NEW(orte_buffer_t)) != NULL) {
return_code = ORTE_ERR_OUT_OF_RESOURCE;
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto RETURN_ERROR;
}
if (ORTE_NS_CREATE_CELLID_CMD == command) { /* got a command to create a cellid */
if (OMPI_SUCCESS != (return_code = orte_dps.pack(answer, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD))) {
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, (void*)&command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (return_code = orte_ns_replica_create_cellid(&cell))) {
goto RETURN_ERROR;
}
rc = orte_ns_replica_create_cellid(&cell);
if (OMPI_SUCCESS != (return_code = orte_dps.pack(answer, (void*)&cell, 1, ORTE_NS_OOB_PACK_CELLID))) {
if (ORTE_SUCCESS != (ret = orte_dps.pack(answer, (void*)&cell, 1, ORTE_CELLID))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
/* RHC -- not sure what to do if the return send fails */
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
} else if (ORTE_NS_CREATE_JOBID_CMD == command) { /* got command to create jobid */
if (OMPI_SUCCESS != (return_code = orte_dps.pack(answer, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD))) {
if (ORTE_SUCCESS != (rc = orte_dps.pack(answer, (void*)&command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (return_code = orte_ns_replica_create_jobid(&job))) {
if (ORTE_SUCCESS != (rc = orte_ns_replica_create_jobid(&job))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (OMPI_SUCCESS != (return_code = orte_dps.pack(answer, (void*)&job, 1, ORTE_NS_OOB_PACK_JOBID))) {
if (OMPI_SUCCESS != (rc = orte_dps.pack(answer, (void*)&job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
/* RHC -- not sure what to do if the return send fails */
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
} else if (ORTE_NS_RESERVE_RANGE_CMD == command) { /* got command to reserve vpid range */
count = 1;
if (OMPI_SUCCESS != (return_code = orte_dps.unpack(buffer, (void*)&job, &count, ORTE_NS_OOB_PACK_JOBID))) {
if (OMPI_SUCCESS != (rc = orte_dps.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
count = 1;
if (OMPI_SUCCESS != (return_code = orte_dps.unpack(buffer, (void*)&range, &count, ORTE_NS_OOB_PACK_VPID))) {
if (OMPI_SUCCESS != (rc = orte_dps.unpack(buffer, (void*)&range, &count, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (return_code = orte_ns_replica_reserve_range(job, range, &startvpid))) {
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_ns_replica_reserve_range(job, range, &startvpid))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (OMPI_SUCCESS != (return_code = orte_dps.pack(answer, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD))) {
if (OMPI_SUCCESS != (rc = orte_dps.pack(answer, (void*)&command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (OMPI_SUCCESS != (return_code = orte_dps.pack(answer, (void*)&startvpid, 1, ORTE_NS_OOB_PACK_VPID))) {
if (OMPI_SUCCESS != (rc = orte_dps.pack(answer, (void*)&startvpid, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
/* RHC -- not sure what to do if the return send fails */
if (0 > (rc = orte_rml.send_buffer(sender, answer, tag, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
} else if (ORTE_NS_ASSIGN_OOB_TAG_CMD == command) { /* got command to assign an OOB tag */
count = 1;
if (0 > orte_dps.unpack(buffer, &tagname, &count, ORTE_STRING)) {
return_code = ORTE_ERR_UNPACK_FAILURE;
rc = ORTE_ERR_UNPACK_FAILURE;
goto RETURN_ERROR;
}
if (0 == strncmp(tagname, "NULL", 4)) {
if (ORTE_SUCCESS != (return_code = orte_ns_replica_assign_rml_tag(&oob_tag, NULL))) {
if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, NULL))) {
goto RETURN_ERROR;
}
} else {
if (ORTE_SUCCESS != (return_code = orte_ns_replica_assign_rml_tag(&oob_tag, tagname))) {
if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, tagname))) {
goto RETURN_ERROR;
}
}
if (OMPI_SUCCESS != (return_code = orte_dps.pack(answer, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD))) {
if (OMPI_SUCCESS != (rc = orte_dps.pack(answer, (void*)&command, 1, ORTE_NS_CMD))) {
goto RETURN_ERROR;
}
if (OMPI_SUCCESS != (return_code = orte_dps.pack(answer, (void*)&oob_tag, 1, ORTE_NS_OOB_PACK_OOB_TAG))) {
if (OMPI_SUCCESS != (rc = orte_dps.pack(answer, (void*)&oob_tag, 1, ORTE_UINT32))) {
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, answer, tag, 0)) {
/* RHC -- not sure what to do if the return send fails */
}
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
} else { /* got an unrecognized command */
RETURN_ERROR:
OBJ_CONSTRUCT(&error_answer, orte_buffer_t);
orte_dps.pack(&error_answer, (void*)&command, 1, ORTE_NS_OOB_PACK_CMD);
orte_dps.pack(&error_answer, (void*)&return_code, 1, ORTE_INT32);
orte_dps.pack(&error_answer, (void*)&command, 1, ORTE_NS_CMD);
orte_dps.pack(&error_answer, (void*)&rc, 1, ORTE_INT32);
orte_rml.send_buffer(sender, &error_answer, tag, 0);
OBJ_DESTRUCT(&error_answer);
}

Просмотреть файл

@ -12,7 +12,8 @@
* $HEADER$
*/
#include "ompi_config.h"
#include "orte_config.h"
#include "include/orte_constants.h"
#include <stdio.h>
#include <string.h>
@ -56,19 +57,22 @@ int mca_oob_parse_contact_info(
char*** uri)
{
orte_process_name_t* proc_name;
int rc;
/* parse the process name */
char* cinfo = strdup(contact_info);
char* ptr = strchr(cinfo, ';');
if(NULL == ptr) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
free(cinfo);
return OMPI_ERR_BAD_PARAM;
return ORTE_ERR_BAD_PARAM;
}
*ptr = '\0';
ptr++;
if (ORTE_SUCCESS != orte_ns.convert_string_to_process_name(&proc_name, cinfo)) {
name = NULL;
return OMPI_ERROR;
if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_process_name(&proc_name, cinfo))) {
ORTE_ERROR_LOG(rc);
free(cinfo);
return rc;
}
*name = *proc_name;
free(proc_name);
@ -78,7 +82,7 @@ int mca_oob_parse_contact_info(
*uri = ompi_argv_split(ptr, ';');
}
free(cinfo);
return OMPI_SUCCESS;
return ORTE_SUCCESS;
}
@ -164,11 +168,11 @@ int mca_oob_base_init(void)
/* set the global variable to point to the first initialize module */
if(s_module == NULL) {
ompi_output(0, "mca_oob_base_init: no OOB modules available\n");
return OMPI_ERROR;
return ORTE_ERROR;
}
mca_oob = *s_module;
return OMPI_SUCCESS;
return ORTE_SUCCESS;
}
@ -216,7 +220,7 @@ int mca_oob_set_contact_info(const char* contact_info)
char** uri;
char** ptr;
int rc = mca_oob_parse_contact_info(contact_info, &name, &uri);
if(rc != OMPI_SUCCESS)
if(rc != ORTE_SUCCESS)
return rc;
for(ptr = uri; ptr != NULL && *ptr != NULL; ptr++) {
@ -234,7 +238,7 @@ int mca_oob_set_contact_info(const char* contact_info)
if(uri != NULL) {
ompi_argv_free(uri);
}
return OMPI_SUCCESS;
return ORTE_SUCCESS;
}
/**
@ -254,6 +258,6 @@ int mca_oob_base_module_init(void)
if (NULL != base->oob_module->oob_init)
base->oob_module->oob_init();
}
return OMPI_SUCCESS;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -118,7 +118,6 @@ int orte_pls_base_open(void)
/* Sort the resulting available list in priority order */
ompi_list_sort(&orte_pls_base.pls_available, compare);
/* All done */
return ORTE_SUCCESS;

Просмотреть файл

@ -22,16 +22,20 @@
#include <sys/wait.h>
#include <fcntl.h>
#include "util/argv.h"
#include "util/output.h"
#include "util/environ.h"
#include "util/proc_info.h"
#include "event/event.h"
#include "runtime/orte_wait.h"
#include "runtime/runtime.h"
#include "mca/ns/base/base.h"
#include "mca/ns/base/ns_base_nds.h"
#include "mca/pls/base/base.h"
#include "mca/base/mca_base_param.h"
#include "mca/iof/iof.h"
#include "mca/rmgr/base/base.h"
#include "mca/rmaps/base/base.h"
#include "mca/rml/rml.h"
#include "mca/errmgr/errmgr.h"
#include "mca/soh/soh.h"
@ -40,7 +44,6 @@
#include "pls_bproc_seed.h"
extern int ompi_evsignal_restart(void);
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
int orte_pls_bproc_seed_launch_threaded(orte_jobid_t);
#endif
@ -160,7 +163,13 @@ cleanup:
* Spew out a new child based on the in-memory process image.
*/
static int orte_pls_bproc_undump(orte_rmaps_base_proc_t* proc, uint8_t* image, size_t image_len, pid_t* pid)
static int orte_pls_bproc_undump(
orte_rmaps_base_proc_t* proc,
orte_vpid_t vpid_start,
orte_vpid_t vpid_range,
uint8_t* image,
size_t image_len,
pid_t* pid)
{
int p_name[2];
int p_stdout[2];
@ -266,7 +275,7 @@ static int orte_pls_bproc_undump(orte_rmaps_base_proc_t* proc, uint8_t* image, s
close(p_image[1]);
/* write the process name */
write(p_name[1], &proc->proc_name, sizeof(proc->proc_name));
orte_ns_nds_pipe_put(&proc->proc_name, vpid_start, vpid_range, p_name[1]);
close(p_name[1]);
return ORTE_SUCCESS;
}
@ -309,14 +318,14 @@ static void orte_pls_bproc_wait_node(pid_t pid, int status, void* cbdata)
for(item = ompi_list_get_first(&node->node_procs);
item != ompi_list_get_end(&node->node_procs);
item = ompi_list_get_next(item)) {
orte_rmaps_base_proc_t* proc = (orte_rmaps_base_proc_t*)item;
orte_rmaps_base_proc_t* proc = (orte_rmaps_base_proc_t*)cbdata;
int rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_TERMINATED, 0);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_RELEASE(node);
/* OBJ_RELEASE(node); */
/* release any waiting threads */
OMPI_THREAD_LOCK(&mca_pls_bproc_seed_component.lock);
@ -333,16 +342,23 @@ static void orte_pls_bproc_wait_node(pid_t pid, int status, void* cbdata)
* on each of the nodes.
*/
static int orte_pls_bproc_launch_app(orte_jobid_t jobid, orte_rmaps_base_map_t* map)
static int orte_pls_bproc_launch_app(
orte_jobid_t jobid,
orte_rmaps_base_map_t* map,
orte_vpid_t vpid_start,
orte_vpid_t vpid_range)
{
uint8_t* image = NULL;
size_t image_len;
int* node_list = NULL;
int* daemon_pids = NULL;
size_t num_nodes;
orte_vpid_t daemon_vpid_start;
orte_vpid_t daemon_vpid_start = 0;
int rc, index;
char* uri;
char *var, *value;
char *env[4];
char **new_env;
/* convert node names to bproc nodelist */
if(ORTE_SUCCESS != (rc = orte_pls_bproc_nodelist(map, &node_list, &num_nodes))) {
@ -360,6 +376,43 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid, orte_rmaps_base_map_t*
goto cleanup;
}
var = mca_base_param_environ_variable("ns","nds",NULL);
asprintf(&value, "%s=pipe", var);
env[0] = value;
/* ns replica contact info */
if(NULL == orte_process_info.ns_replica) {
rc = orte_ns.copy_process_name(&orte_process_info.ns_replica,orte_process_info.my_name);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
orte_process_info.ns_replica_uri = orte_rml.get_uri();
}
var = mca_base_param_environ_variable("ns","replica","uri");
asprintf(&value, "%s=uri%s", var, orte_process_info.ns_replica_uri);
env[1] = value;
/* gpr replica contact info */
if(NULL == orte_process_info.gpr_replica) {
rc = orte_ns.copy_process_name(&orte_process_info.gpr_replica,orte_process_info.my_name);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
orte_process_info.gpr_replica_uri = orte_rml.get_uri();
}
var = mca_base_param_environ_variable("gpr","replica","uri");
asprintf(&value, "%s=uri%s", var, orte_process_info.gpr_replica_uri);
env[2] = value;
env[3] = NULL;
/* overwrite previously specified values with the above settings */
new_env = ompi_environ_merge(map->app->env, env);
ompi_argv_free(map->app->env);
map->app->env = new_env;
map->app->num_env = ompi_argv_count(new_env);
/* read process image */
if(ORTE_SUCCESS != (rc = orte_pls_bproc_dump(map->app, &image, &image_len))) {
ORTE_ERROR_LOG(rc);
@ -392,6 +445,7 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid, orte_rmaps_base_map_t*
orte_rmaps_base_node_t* node = NULL;
orte_process_name_t* daemon_name;
int fd;
int rank = rc;
/* connect stdin to /dev/null */
fd = open("/dev/null", O_RDWR);
@ -413,13 +467,17 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid, orte_rmaps_base_map_t*
} else {
_exit(-1);
}
if(mca_pls_bproc_seed_component.debug) {
ompi_output(0, "orte_pls_bproc: rank=%d\n", rank);
}
/* find this node */
index = 0;
for(item = ompi_list_get_first(&map->nodes);
item != ompi_list_get_end(&map->nodes);
item = ompi_list_get_next(item)) {
if(index++ == rc) {
if(index++ == rank) {
node = (orte_rmaps_base_node_t*)item;
break;
}
@ -432,14 +490,17 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid, orte_rmaps_base_map_t*
/* setup the daemons process name */
rc = orte_ns.create_process_name(
&daemon_name, orte_process_info.my_name->cellid, 0, daemon_vpid_start + rc);
&daemon_name, orte_process_info.my_name->cellid, 0, daemon_vpid_start + rank);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
_exit(-1);
}
if(mca_pls_bproc_seed_component.debug) {
ompi_output(0, "orte_pls_bproc: node=%s name=%d.%d.%d\n",
node->node_name, orte_process_info.my_name->cellid, 0, daemon_vpid_start+rc);
ompi_output(0, "orte_pls_bproc: node=%s name=%d.%d.%d procs=%d\n",
node->node_name,
orte_process_info.my_name->cellid, 0,
daemon_vpid_start+rank,
ompi_list_get_size(&node->node_procs));
}
/* restart the daemon w/ the new process name */
@ -467,7 +528,7 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid, orte_rmaps_base_map_t*
if(mca_pls_bproc_seed_component.debug) {
ompi_output(0, "orte_pls_bproc: starting: %d.%d.%d\n", ORTE_NAME_ARGS(&proc->proc_name));
}
rc = orte_pls_bproc_undump(proc, image, image_len, &pid);
rc = orte_pls_bproc_undump(proc, vpid_start, vpid_range, image, image_len, &pid);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
_exit(1);
@ -514,7 +575,6 @@ static int orte_pls_bproc_launch_app(orte_jobid_t jobid, orte_rmaps_base_map_t*
OMPI_THREAD_LOCK(&mca_pls_bproc_seed_component.lock);
mca_pls_bproc_seed_component.num_children++;
OMPI_THREAD_UNLOCK(&mca_pls_bproc_seed_component.lock);
orte_wait_cb(daemon_pids[index++], orte_pls_bproc_wait_node, node);
}
@ -538,10 +598,12 @@ cleanup:
* w/ a distinct set of daemons.
*/
static int orte_pls_bproc_seed_launch(orte_jobid_t jobid)
int orte_pls_bproc_seed_launch(orte_jobid_t jobid)
{
ompi_list_item_t* item;
ompi_list_t mapping;
orte_vpid_t vpid_start;
orte_vpid_t vpid_range;
int rc;
/* query for the application context and allocated nodes */
@ -550,13 +612,17 @@ static int orte_pls_bproc_seed_launch(orte_jobid_t jobid)
ORTE_ERROR_LOG(rc);
return rc;
}
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_vpid_range(jobid, &vpid_start, &vpid_range))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* for each application context - launch across the first n nodes required */
for(item = ompi_list_get_first(&mapping);
item != ompi_list_get_end(&mapping);
item = ompi_list_get_next(item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)item;
rc = orte_pls_bproc_launch_app(jobid, map);
rc = orte_pls_bproc_launch_app(jobid, map, vpid_start, vpid_range);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -681,9 +747,31 @@ static OBJ_CLASS_INSTANCE(
static void orte_pls_bproc_seed_launch_cb(int fd, short event, void* args)
{
orte_pls_bproc_stack_t *stack = (orte_pls_bproc_stack_t*)args;
orte_vpid_t child_vpid;
orte_process_name_t* child_name;
char* uri;
int pid;
int rc;
/* setup the daemons process name */
rc = orte_ns.reserve_range(orte_process_info.my_name->jobid,1,&child_vpid);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
stack->rc = rc;
goto complete;
}
rc = orte_ns.create_process_name(
&child_name, orte_process_info.my_name->cellid,
orte_process_info.my_name->jobid,
child_vpid);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
stack->rc = rc;
goto complete;
}
uri = orte_rml.get_uri();
/* fork the child */
pid = fork();
if(pid < 0) {
ompi_output(0, "orte_pls_bproc: fork failed with errno=%d\n", errno);
@ -693,7 +781,31 @@ static void orte_pls_bproc_seed_launch_cb(int fd, short event, void* args)
pthread_kill_other_threads_np();
ompi_set_using_threads(false);
rc = ompi_event_restart();
if(NULL == orte_process_info.ns_replica) {
rc = orte_ns.copy_process_name(&orte_process_info.ns_replica,orte_process_info.my_name);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
exit(rc);
}
orte_process_info.ns_replica_uri = orte_rml.get_uri();
}
if(NULL == orte_process_info.gpr_replica) {
rc = orte_ns.copy_process_name(&orte_process_info.gpr_replica,orte_process_info.my_name);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
exit(rc);
}
orte_process_info.gpr_replica_uri = orte_rml.get_uri();
}
/* restart the daemon w/ the new process name */
rc = orte_restart(child_name, uri);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
exit(rc);
}
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
exit(rc);
@ -712,6 +824,7 @@ static void orte_pls_bproc_seed_launch_cb(int fd, short event, void* args)
stack->rc = ORTE_SUCCESS;
}
complete:
OMPI_THREAD_LOCK(&stack->mutex);
stack->complete = true;
ompi_condition_signal(&stack->cond);

Просмотреть файл

@ -67,19 +67,26 @@ static int orte_pls_bproc_param_register_int(
int orte_pls_bproc_seed_component_open(void)
{
int id;
/* init globals */
OBJ_CONSTRUCT(&mca_pls_bproc_seed_component.lock, ompi_mutex_t);
OBJ_CONSTRUCT(&mca_pls_bproc_seed_component.condition, ompi_condition_t);
mca_pls_bproc_seed_component.num_children = 0;
/* init parameters */
mca_pls_bproc_seed_component.debug = orte_pls_bproc_param_register_int("debug", 0);
mca_pls_bproc_seed_component.debug = orte_pls_bproc_param_register_int("debug", 1);
mca_pls_bproc_seed_component.reap = orte_pls_bproc_param_register_int("reap", 1);
mca_pls_bproc_seed_component.image_frag_size = orte_pls_bproc_param_register_int("image_frag_size", 256*1024);
mca_pls_bproc_seed_component.name_fd = orte_pls_bproc_param_register_int("name_fd", 3);
mca_pls_bproc_seed_component.priority = orte_pls_bproc_param_register_int("priority", 100);
mca_pls_bproc_seed_component.terminate_sig = orte_pls_bproc_param_register_int("terminate_sig", 9);
id = mca_base_param_find("nds", "pipe", "fd");
if(id > 0) {
mca_base_param_lookup_int(id, &mca_pls_bproc_seed_component.name_fd);
} else {
mca_pls_bproc_seed_component.name_fd = 3;
}
return ORTE_SUCCESS;
}
@ -103,6 +110,7 @@ orte_pls_base_module_t* orte_pls_bproc_seed_init(
{
int ret;
struct bproc_version_t version;
/* are we the seed */
if(orte_process_info.seed == false)
@ -119,9 +127,6 @@ orte_pls_base_module_t* orte_pls_bproc_seed_init(
return NULL;
}
/* post a non-blocking receive */
*priority = mca_pls_bproc_seed_component.priority;
return &orte_pls_bproc_seed_module;
}

Просмотреть файл

@ -73,7 +73,11 @@ static void orte_pls_fork_wait_proc(pid_t pid, int status, void* cbdata)
int rc;
/* set the state of this process */
rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_TERMINATED, status);
if(WIFEXITED(status)) {
rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_TERMINATED, status);
} else {
rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_ABORTED, status);
}
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
@ -98,10 +102,6 @@ static int orte_pls_fork_proc(
int p_stderr[2];
int rc;
if(mca_pls_fork_component.debug) {
ompi_output(0, "orte_pls_fork: starting %d.%d.%d\n", ORTE_NAME_ARGS(&proc->proc_name));
}
if(pipe(p_stdout) < 0 ||
pipe(p_stderr) < 0) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
@ -167,6 +167,8 @@ static int orte_pls_fork_proc(
new_env = ompi_environ_merge(context->env, environ_copy);
ompi_argv_free(environ_copy);
execve(context->app, context->argv, new_env);
ompi_output(0, "orte_pls_fork: %s - %s\n", context->app,
ompi_argv_join(context->argv, ' '));
ompi_output(0, "orte_pls_fork: execv failed with errno=%d\n", errno);
exit(-1);

Просмотреть файл

@ -51,6 +51,7 @@ int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name);
struct orte_pls_rsh_component_t {
orte_pls_base_component_t super;
int debug;
int delay;
int reap;
int priority;
char** argv;

Просмотреть файл

@ -114,7 +114,6 @@ static char* orte_pls_rsh_param_register_string(
int orte_pls_rsh_component_open(void)
{
char* param;
/* initialize globals */
OBJ_CONSTRUCT(&mca_pls_rsh_component.lock, ompi_mutex_t);
OBJ_CONSTRUCT(&mca_pls_rsh_component.cond, ompi_condition_t);
@ -123,8 +122,16 @@ int orte_pls_rsh_component_open(void)
/* lookup parameters */
mca_pls_rsh_component.debug = orte_pls_rsh_param_register_int("debug",0);
mca_pls_rsh_component.num_concurrent = orte_pls_rsh_param_register_int("num_concurrent",128);
if(mca_pls_rsh_component.debug == 0) {
int id = mca_base_param_register_int("debug",NULL,NULL,NULL,0);
int value;
mca_base_param_lookup_int(id,&value);
mca_pls_rsh_component.debug = (value > 0) ? 1 : 0;
}
mca_pls_rsh_component.orted = orte_pls_rsh_param_register_string("orted","orted");
mca_pls_rsh_component.priority = orte_pls_rsh_param_register_int("priority",10);
mca_pls_rsh_component.delay = orte_pls_rsh_param_register_int("delay",1);
mca_pls_rsh_component.reap = orte_pls_rsh_param_register_int("reap",1);
param = orte_pls_rsh_param_register_string("agent","ssh");
@ -139,6 +146,9 @@ orte_pls_base_module_t *orte_pls_rsh_component_init(int *priority)
extern char **environ;
/* If we didn't find the agent in the path, then don't use this component */
if (NULL == mca_pls_rsh_component.argv || NULL == mca_pls_rsh_component.argv[0]) {
return NULL;
}
mca_pls_rsh_component.path = ompi_path_findv(mca_pls_rsh_component.argv[0], 0, environ, NULL);
if (NULL == mca_pls_rsh_component.path) {
return NULL;

Просмотреть файл

@ -65,24 +65,6 @@ orte_pls_base_module_1_0_0_t orte_pls_rsh_module = {
static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
#if 0
orte_rmaps_base_node_t* node = (orte_rmaps_base_node_t*)cbdata;
ompi_list_item_t *item;
int rc;
/* set the state of all processes launched by this daemon */
for(item = ompi_list_get_first(&node->node_procs);
item != ompi_list_get_end(&node->node_procs);
item = ompi_list_get_next(item)) {
orte_rmaps_base_proc_t* proc = (orte_rmaps_base_proc_t*)item;
rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_TERMINATED, status);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_RELEASE(node);
#endif
/* release any waiting threads */
OMPI_THREAD_LOCK(&mca_pls_rsh_component.lock);
if(mca_pls_rsh_component.num_children-- >= NUM_CONCURRENT ||
@ -98,8 +80,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
ompi_list_t nodes;
ompi_list_item_t* item;
size_t num_nodes;
orte_vpid_t vpid_start;
int node_name_index;
orte_vpid_t vpid;
int node_name_index1;
int node_name_index2;
int proc_name_index;
char *jobid_string;
char *uri, *param;
@ -125,7 +108,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
if(num_nodes == 0) {
return ORTE_ERR_BAD_PARAM;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid_start);
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if(ORTE_SUCCESS != rc) {
goto cleanup;
}
@ -139,7 +122,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
*/
argv = ompi_argv_copy(mca_pls_rsh_component.argv);
argc = mca_pls_rsh_component.argc;
node_name_index = argc;
node_name_index1 = argc;
ompi_argv_append(&argc, &argv, ""); /* placeholder for node name */
/* application */
@ -152,6 +135,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
ompi_argv_append(&argc, &argv, "--name");
proc_name_index = argc;
ompi_argv_append(&argc, &argv, "");
ompi_argv_append(&argc, &argv, "--nodename");
node_name_index2 = argc;
ompi_argv_append(&argc, &argv, "");
/* setup ns contact info */
ompi_argv_append(&argc, &argv, "--nsreplica");
@ -187,7 +173,8 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
pid_t pid;
/* setup node name */
argv[node_name_index] = node->node_name;
argv[node_name_index1] = node->node_name;
argv[node_name_index2] = node->node_name;
/* rsh a child to exec the rsh/ssh session */
pid = fork();
@ -201,10 +188,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
orte_process_name_t* name;
char* name_string;
int fd = open("/dev/null", O_RDWR);
/* setup process name */
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid_start);
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
if(ORTE_SUCCESS != rc) {
ompi_output(0, "orte_pls_rsh: unable to create process name");
exit(-1);
@ -216,17 +202,20 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
argv[proc_name_index] = name_string;
/* debug output */
if(mca_pls_rsh_component.debug) {
if (mca_pls_rsh_component.debug > 1) {
/* debug output */
char* cmd = ompi_argv_join(argv, ' ');
ompi_output(0, "orte_pls_rsh: %s\n", cmd);
}
}
/* setup stdin/stdout/stderr */
dup2(fd, 0);
dup2(fd, 1);
dup2(fd, 2);
close(fd);
if (mca_pls_rsh_component.debug == 0) {
/* setup stdin/stdout/stderr */
int fd = open("/dev/null", O_RDWR);
dup2(fd, 0);
dup2(fd, 1);
dup2(fd, 2);
close(fd);
}
/* exec the daemon */
execv(mca_pls_rsh_component.path, argv);
@ -242,6 +231,12 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
OBJ_RETAIN(node);
orte_wait_cb(pid, orte_pls_rsh_wait_daemon, node);
vpid++;
/* if required - add delay to avoid problems w/ X11 authentication */
if (mca_pls_rsh_component.debug && mca_pls_rsh_component.delay) {
sleep(mca_pls_rsh_component.delay);
}
}
}

Просмотреть файл

@ -22,6 +22,7 @@ AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include \
noinst_LTLIBRARIES = libmca_pls_tm.la
libmca_pls_tm_la_SOURCES = \
pls_tm.h \
pls_tm_child.c \
pls_tm_component.c \
pls_tm_module.c \
pls_tm_registry.c

Просмотреть файл

@ -26,15 +26,17 @@
extern "C" {
#endif
/*
* Globally exported variables
*/
/* Globally exported variables */
OMPI_COMP_EXPORT extern orte_pls_base_component_1_0_0_t
orte_pls_tm_component;
OMPI_COMP_EXPORT extern orte_pls_base_module_1_0_0_t
orte_pls_tm_module;
/* Global, but not exported variables */
extern bool orte_pls_tm_connected;
/* Internal struct */
typedef struct pls_tm_proc_state_t {
@ -47,7 +49,14 @@ extern "C" {
int orte_pls_tm_put_tid(const orte_process_name_t* name,
tm_task_id tid, int state);
int orte_pls_tm_get_tids(orte_jobid_t jobid, tm_task_id **tids,
size_t *num_tids);
orte_process_name_t **names, size_t *size);
/* Child process functions */
int orte_pls_tm_child_init(void);
int orte_pls_tm_child_launch(orte_jobid_t jobid);
int orte_pls_tm_child_wait(orte_jobid_t jobid);
int orte_pls_tm_child_finalize(void);
#if defined(c_plusplus) || defined(__cplusplus)
}

590
src/mca/pls/tm/src/pls_tm_child.c Обычный файл
Просмотреть файл

@ -0,0 +1,590 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <signal.h>
#include "include/orte_constants.h"
#include "include/orte_types.h"
#include "util/argv.h"
#include "util/output.h"
#include "util/environ.h"
#include "runtime/runtime.h"
#include "runtime/orte_wait.h"
#include "mca/base/mca_base_param.h"
#include "mca/rmgr/base/base.h"
#include "mca/rmaps/base/rmaps_base_map.h"
#include "mca/pls/pls.h"
#include "mca/pls/base/base.h"
#include "mca/errmgr/errmgr.h"
#include "mca/soh/soh_types.h"
#include "mca/gpr/gpr.h"
#include "mca/ns/base/ns_base_nds.h"
#include "mca/soh/soh.h"
#include "mca/rml/rml.h"
#include "mca/ns/ns.h"
#include "pls_tm.h"
/*
 * Local functions
 *
 * do_tm_resolve()      - map one hostname to its TM node ID
 * query_tm_hostnames() - fill the tm_node_ids / tm_hostnames tables
 * get_tm_hostname()    - look up the hostname string for one TM node ID
 */
static int do_tm_resolve(char *hostnames, tm_node_id *tnodeid);
static int query_tm_hostnames(void);
static char* get_tm_hostname(tm_node_id node);
/*
 * Local variables.  Note that these are only used *per child
 * process*, so we're guaranteed that only one thread will be using
 * these -- no need for locking.
 *
 * tm_hostnames / num_tm_hostnames: argv-style array of hostnames built
 *     by query_tm_hostnames(); entry i names the node tm_node_ids[i].
 * tm_node_ids / num_node_ids: node ID array filled in by tm_nodeinfo().
 * task_ids: one TM task ID per spawned process, allocated by
 *     orte_pls_tm_child_launch().
 * names: process names of the spawned processes; points into the same
 *     allocation as task_ids (directly after the task ID array).
 * num_spawned: number of processes spawned so far by
 *     orte_pls_tm_child_launch(); also the number of obituaries
 *     orte_pls_tm_child_wait() waits for.
 */
static char **tm_hostnames = NULL;
static tm_node_id *tm_node_ids = NULL;
static tm_task_id *task_ids = NULL;
static size_t num_spawned = 0;
static int num_tm_hostnames, num_node_ids;
static orte_process_name_t *names = NULL;
/*
 * Turn the calling (child) process into a new ORTE process.
 *
 * Threading is switched off, the current RML contact URI is fetched,
 * and a fresh process name is built from this process's cell ID and
 * job ID plus a newly reserved vpid.  orte_restart() is then invoked
 * with that name and URI.
 *
 * Any failure terminates the process with exit(-1); the function only
 * returns (with ORTE_SUCCESS) when the restart succeeded.
 */
int orte_pls_tm_child_init(void)
{
    orte_process_name_t *child_name;
    orte_cellid_t cellid;
    orte_jobid_t jobid;
    orte_vpid_t vpid;
    char *contact_uri;
    int rc;

    /* Re-start us as a new ORTE process */
    ompi_set_using_threads(false);
    ompi_output(orte_pls_base.pls_output,
                "pls:tm:launch:child: starting");

    contact_uri = orte_rml.get_uri();
    if (NULL == contact_uri) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit(-1);
    }
    ompi_output(orte_pls_base.pls_output,
                "pls:tm:launch:child: got uri: %s", contact_uri);

    /* Derive the pieces of the new name from our own name; the vpid
       starts out as 1 and is then replaced by the reserved value */
    orte_ns.get_cellid(&cellid, orte_process_info.my_name);
    orte_ns.get_jobid(&jobid, orte_process_info.my_name);
    vpid = 1;
    orte_ns.reserve_range(jobid, 1, &vpid);

    /* Any component sitting at its *_MAX value is rejected */
    if (ORTE_JOBID_MAX == jobid ||
        ORTE_CELLID_MAX == cellid ||
        ORTE_VPID_MAX == vpid) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        exit(-1);
    }

    rc = orte_ns.create_process_name(&child_name, cellid, jobid, vpid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        exit(-1);
    }

    ompi_output(orte_pls_base.pls_output,
                "pls:tm:launch:child: restarting ORTE");
    rc = orte_restart(child_name, contact_uri);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        exit(-1);
    }
    ompi_output(orte_pls_base.pls_output,
                "pls:tm:launch:child: am now a new ORTE process");

    /* All done */
    return ORTE_SUCCESS;
}
/*
 * Launch every process of the given job through TM.
 *
 * The job's process map is read with orte_rmaps_base_get_map().  For
 * each application context in the map an environment is assembled
 * (app env + MCA parameters + our own environ, with "." guaranteed to
 * be in PATH) and each mapped process is started with tm_spawn() on
 * the TM node matching its node name.  The TM task ID of every
 * spawned process is stored in task_ids[] and published through
 * orte_pls_tm_put_tid(); the process name is stored in names[] for
 * orte_pls_tm_child_wait().
 *
 * If spawning a process fails, the job is terminated through
 * orte_pls_tm_module.terminate_job() and no further apps are started.
 *
 * Returns ORTE_SUCCESS when everything was launched, otherwise the
 * ORTE error code of the first failure.  The TM connection opened
 * here is closed again before returning.
 */
int orte_pls_tm_child_launch(orte_jobid_t jobid)
{
    int ret, local_errno;
    size_t i, j;
    tm_event_t event;
    char *flat = NULL;
    char old_cwd[OMPI_PATH_MAX];
    ompi_list_t mapping;
    bool mapping_valid = false;
    ompi_list_item_t *item;
    char **mca_env = NULL, **tmp_env, **local_env = NULL;
    char *path, *new_path;
    size_t path_len;
    int num_mca_env;
    orte_rmaps_base_proc_t *proc;
    orte_app_context_t *app;
    bool failure;
    tm_node_id tnodeid;
    struct tm_roots tm_root;

    /* Open up our connection to tm */
    ret = tm_init(NULL, &tm_root);
    if (TM_SUCCESS != ret) {
        return ORTE_ERR_RESOURCE_BUSY;
    }
    orte_pls_tm_connected = true;

    /* Get the hostnames from the output of the mapping.  Since we
       have to cross reference against TM, it's much more efficient to
       do all the nodes in the entire map all at once. */
    OBJ_CONSTRUCT(&mapping, ompi_list_t);
    if (ORTE_SUCCESS != (ret = orte_rmaps_base_get_map(jobid, &mapping))) {
        goto cleanup;
    }
    mapping_valid = true;

    /* Count how many processes we're starting so that we can allocate
       space for all the tid's */
    for (failure = false, i = 0, item = ompi_list_get_first(&mapping);
         !failure && item != ompi_list_get_end(&mapping);
         item = ompi_list_get_next(item)) {
        orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item;
        i += map->num_procs;
    }
    /* i is a size_t; cast so that it matches the %d conversion */
    ompi_output(orte_pls_base.pls_output,
                "pls:tm:launch:child: found a total of %d procs", (int) i);

    /* One allocation holds the task ID array followed by the process
       name array */
    task_ids = malloc((sizeof(tm_task_id) * i) +
                      (sizeof(orte_process_name_t) * i));
    if (NULL == task_ids) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }
    names = (orte_process_name_t*) (task_ids + i);
    memset(names, 0, sizeof(orte_process_name_t) * i);

    /* Make up an environment for all the job processes. */
    mca_env = NULL;
    num_mca_env = 0;
    mca_base_param_build_env(&mca_env, &num_mca_env, true);

    /* Remember where we are so that we can come back after probing
       each app's cwd.  Without a valid old_cwd the chdir() back below
       would read an uninitialized buffer, so fail early. */
    if (NULL == getcwd(old_cwd, OMPI_PATH_MAX)) {
        ret = ORTE_ERR_NOT_FOUND;
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    /* While we're traversing these data structures, also set up the
       proc_status array for a later "put" to the registry */
    failure = false;
    for (num_spawned = i = 0, item = ompi_list_get_first(&mapping);
         !failure && item != ompi_list_get_end(&mapping);
         item = ompi_list_get_next(item), ++i) {
        orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item;
        app = map->app;

        /* See if the app cwd exists; try changing to the cwd and then
           changing back */
        if (0 != chdir(app->cwd)) {
            ret = ORTE_ERR_NOT_FOUND;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
        ompi_output(orte_pls_base.pls_output,
                    "pls:tm:launch:child: app %d cwd (%s) exists",
                    (int) i, app->cwd);

        /* Get a full pathname for argv[0] -- tm won't spawn without
           an absolute pathname.  :-( app->app is already an absolute
           pathname, so don't even bother to check -- just replace
           argv[0] with app->app. */
        free(app->argv[0]);
        app->argv[0] = strdup(app->app);
        flat = ompi_argv_join(app->argv, ' ');

        /* Make a global env for the app */
        tmp_env = ompi_environ_merge(app->env, mca_env);
        local_env = ompi_environ_merge(environ, tmp_env);
        if (NULL != tmp_env) {
            ompi_argv_free(tmp_env);
        }

        /* Ensure "." is in the PATH.  If it's not there, add it at
           the end.  "." counts as present when PATH is exactly ".",
           starts with ".:", contains ":.:" or ends with ":.". */
        for (j = 0; NULL != local_env[j]; ++j) {
            if (0 == strncmp("PATH=", local_env[j], 5)) {
                path = local_env[j] + 5;
                path_len = strlen(path);
                if (0 != strcmp(".", path) &&
                    0 != strncmp(".:", path, 2) &&
                    /* search *in* path for ":.:" (haystack first) */
                    NULL == strstr(path, ":.:") &&
                    /* only look at the last two characters when
                       there are at least two of them */
                    (path_len < 2 ||
                     0 != strncmp(":.", path + path_len - 2, 2))) {
                    if (asprintf(&new_path, "PATH=%s:.", path) < 0) {
                        ret = ORTE_ERR_OUT_OF_RESOURCE;
                        ORTE_ERROR_LOG(ret);
                        goto cleanup;
                    }
                    free(local_env[j]);
                    local_env[j] = new_path;
                    ompi_output(orte_pls_base.pls_output,
                                "pls:tm:launch:child: appended \".\" to PATH");
                    break;
                }
            }
        }

        /* Now iterate through all the procs in this app and launch them */
        for (j = 0; j < map->num_procs; ++j, ++num_spawned) {
            proc = map->procs[j];

            /* Get a TM node ID for the node for this proc */
            if (ORTE_SUCCESS != do_tm_resolve(proc->proc_node->node_name,
                                              &tnodeid)) {
                ret = ORTE_ERR_NOT_FOUND;
                ORTE_ERROR_LOG(ret);
                goto cleanup;
            }

            /* Set the job name in the environment */
            orte_ns_nds_env_put(&proc->proc_name, num_spawned, 1, &local_env);

            /* Launch it */
            ompi_output(orte_pls_base.pls_output,
                        "pls:tm:launch:child: starting process %d (%s) on %s (TM node id %d)",
                        (int) num_spawned, flat, proc->proc_node->node_name,
                        tnodeid);
            if (TM_SUCCESS != tm_spawn(app->argc, app->argv,
                                       local_env, tnodeid,
                                       &task_ids[num_spawned], &event)) {
                ret = ORTE_ERR_RESOURCE_BUSY;
                ORTE_ERROR_LOG(ret);
                goto loop_error;
            }

            ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
            if (TM_SUCCESS != ret) {
                ret = ORTE_ERR_RESOURCE_BUSY;
                ORTE_ERROR_LOG(ret);
                goto loop_error;
            }
            ompi_output(orte_pls_base.pls_output,
                        "pls:tm:launch:child: launch successful (tid %d); posting to registry", task_ids[num_spawned]);

            /* Write this proc's TID to the registry (so that we can
               kill it if we need to) */
            orte_pls_tm_put_tid(&(proc->proc_name), task_ids[num_spawned],
                                ORTE_PROC_STATE_LAUNCHED);

            /* Bastardize this function to set our state to
               ORTE_PROC_STATE_LAUNCHED with a bogus PID (make it
               equal this proc's index in the overall job -- i.e.,
               rank in MPI_COMM_WORLD) */
            ret = orte_pls_base_set_proc_pid(&(proc->proc_name), num_spawned);
            if (ORTE_SUCCESS != ret) {
                ret = ORTE_ERR_RESOURCE_BUSY;
                ORTE_ERROR_LOG(ret);
                goto loop_error;
            }

            /* Save the name so that we can use it later */
            names[num_spawned] = proc->proc_name;

            /* Ok, we succeeded in launching that process.  Loop around
               to get the next. */
            continue;

        loop_error:
            /* Hack so that we don't have to make the
               pls_tm_terminate_job globally scoped */
            orte_pls_tm_module.terminate_job(jobid);
            failure = true;
            break;
        }

        /* Now go back to the original cwd */
        if (0 != chdir(old_cwd)) {
            ret = ORTE_ERR_NOT_FOUND;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        /* Free things from the last app.  The pointers are reset so
           that the cleanup section below does not free them twice. */
        ompi_argv_free(local_env);
        local_env = NULL;
        free(flat);
        flat = NULL;
    }

    /* All done */
 cleanup:
    ompi_output(orte_pls_base.pls_output,
                "pls:tm:launch:child: launched %d processes",
                (int) num_spawned);

    /* Only non-NULL here when we jumped out of the middle of the app
       loop on an error */
    if (NULL != local_env) {
        ompi_argv_free(local_env);
    }
    free(flat);

    if (NULL != mca_env) {
        ompi_argv_free(mca_env);
    }
    if (mapping_valid) {
        while (NULL != (item = ompi_list_remove_first(&mapping))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&mapping);
    }
    tm_finalize();
    orte_pls_tm_connected = false;
    return ret;
}
/*
 * Wait for the death of all the tm_spawn'ed processes.
 *
 * Opens a TM connection, requests an obituary (tm_obit) for each of
 * the num_spawned task IDs recorded by orte_pls_tm_child_launch(),
 * then polls once per spawned process.  Each obituary that arrives is
 * matched to its task and the process's state-of-health is set to
 * ORTE_PROC_STATE_TERMINATED with the reported exit status.
 *
 * The jobid argument is not used by this function.
 *
 * Failures are logged, but the function always returns ORTE_SUCCESS.
 * NOTE(review): callers therefore cannot detect a failed wait --
 * confirm whether that is intended before changing it.
 */
int orte_pls_tm_child_wait(orte_jobid_t jobid)
{
    size_t i, j;
    int ret, local_errno, *exit_statuses;
    /* events must start out NULL: the cleanup section frees it, and
       we can get there before it has been allocated */
    tm_event_t event, *events = NULL;
    struct tm_roots tm_root;

    ompi_output(orte_pls_base.pls_output,
                "pls:tm:wait:child: waiting for processes to exit");

    /* Open up our connection to tm */
    ret = tm_init(NULL, &tm_root);
    if (TM_SUCCESS != ret) {
        ret = ORTE_ERR_RESOURCE_BUSY;
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }
    orte_pls_tm_connected = true;

    /* Setup to wait for all the tid's to die.  One allocation holds
       the event array followed by the exit status array. */
    events = malloc((sizeof(tm_event_t) * num_spawned) +
                    (sizeof(int) * num_spawned));
    if (NULL == events) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }
    exit_statuses = (int*) (events + num_spawned);

    /* Ask for all obituaries */
    for (i = 0; i < num_spawned; ++i) {
        ret = tm_obit(task_ids[i], &exit_statuses[i], &events[i]);
        if (TM_SUCCESS != ret) {
            ompi_output(orte_pls_base.pls_output,
                        "pls:tm:kill: tm_obit failed with %d", ret);
            ret = ORTE_ERROR;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    /* Poll until we get all obituaries */
    for (i = 0; i < num_spawned; ++i) {
        ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
        if (TM_SUCCESS != ret) {
            /* event was not filled in; do not try to match it */
            ompi_output(orte_pls_base.pls_output,
                        "pls:tm:wait:child: tm_poll failed with %d", ret);
            continue;
        }

        for (j = 0; j < num_spawned; ++j) {
            if (event == events[j]) {
                ompi_output(orte_pls_base.pls_output,
                            "pls:tm:wait:child: caught obit for tid %d",
                            task_ids[j]);
                ret = orte_soh.set_proc_soh(&names[j],
                                            ORTE_PROC_STATE_TERMINATED,
                                            exit_statuses[j]);
                /* mark this obituary as consumed */
                events[j] = TM_NULL_EVENT;
                break;
            }
        }
        if (j >= num_spawned) {
            fprintf(stderr, "Whoops!  Didn't find return event!\n");
        }
    }

 cleanup:
    ompi_output(orte_pls_base.pls_output,
                "pls:tm:wait:child: done waiting for process obits");
    free(events);

    /* All done */
    if (orte_pls_tm_connected) {
        tm_finalize();
    }
    orte_pls_tm_connected = false;
    return ORTE_SUCCESS;
}
int orte_pls_tm_child_finalize(void)
{
if (NULL != tm_hostnames) {
ompi_argv_free(tm_hostnames);
tm_hostnames = NULL;
}
if (NULL != tm_node_ids) {
free(tm_node_ids);
tm_node_ids = NULL;
}
/* All done */
ompi_output(orte_pls_base.pls_output,
"pls:tm:finalize:child: all done -- exiting");
orte_finalize();
return ORTE_SUCCESS;
}
/***********************************************************************/
/*
 * Resolve one hostname to its TM node ID.
 *
 * The first call populates the hostname / node ID tables through
 * query_tm_hostnames(); later calls reuse them.  The lookup is a
 * plain linear scan, which is fine because this is not a
 * performance-critical section of code.
 *
 * On a match *tnodeid is set and ORTE_SUCCESS is returned.  If the
 * tables cannot be built, the error from query_tm_hostnames() is
 * returned; if the hostname is not in the table, ORTE_ERR_NOT_FOUND.
 */
static int do_tm_resolve(char *hostname, tm_node_id *tnodeid)
{
    int idx;

    /* Have we already queried TM for all the node info? */
    if (NULL == tm_hostnames) {
        int rc = query_tm_hostnames();
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    /* Find the TM ID of the hostname that we're looking for */
    idx = 0;
    while (idx < num_tm_hostnames) {
        if (0 == strcmp(hostname, tm_hostnames[idx])) {
            *tnodeid = tm_node_ids[idx];
            ompi_output(orte_pls_base.pls_output,
                        "pls:tm:launch: resolved host %s to node ID %d",
                        hostname, tm_node_ids[idx]);
            return ORTE_SUCCESS;
        }
        ++idx;
    }

    /* Ran off the end of the table without a match */
    return ORTE_ERR_NOT_FOUND;
}
static int query_tm_hostnames(void)
{
char *h;
int i, ret;
/* Get the list of nodes allocated in this PBS job */
ret = tm_nodeinfo(&tm_node_ids, &num_node_ids);
if (TM_SUCCESS != ret) {
return ORTE_ERR_NOT_FOUND;
}
/* TM "nodes" may actually correspond to PBS "VCPUs", which means
there may be multiple "TM nodes" that correspond to the same
physical node. This doesn't really affect what we're doing
here (we actually ignore the fact that they're duplicates --
slightly inefficient, but no big deal); just mentioned for
completeness... */
tm_hostnames = NULL;
num_tm_hostnames = 0;
for (i = 0; i < num_node_ids; ++i) {
h = get_tm_hostname(tm_node_ids[i]);
ompi_argv_append(&num_tm_hostnames, &tm_hostnames, h);
free(h);
}
/* All done */
return ORTE_SUCCESS;
}
/*
 * For a given TM node ID, get the string hostname corresponding to
 * it.
 *
 * Returns a newly allocated string that the caller must free(), or
 * NULL if the TM query fails or its answer does not contain a
 * hostname field.
 */
static char* get_tm_hostname(tm_node_id node)
{
    int ret, local_errno;
    char *hostname = NULL;
    tm_event_t event;
    char buffer[256];
    char **argv;

    /* Start from an empty string so that we never parse stack
       garbage if TM writes nothing into the buffer */
    memset(buffer, 0, sizeof(buffer));

    /* Get the info string corresponding to this TM node ID */
    ret = tm_rescinfo(node, buffer, sizeof(buffer) - 1, &event);
    if (TM_SUCCESS != ret) {
        return NULL;
    }

    /* Now wait for that event to happen */
    ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
    if (TM_SUCCESS != ret) {
        return NULL;
    }

    /* According to the TM man page, we get back a space-separated
       string array.  The hostname is the second item.  Use a cheap
       trick to get it. */
    buffer[sizeof(buffer) - 1] = '\0';
    argv = ompi_argv_split(buffer, ' ');
    if (NULL == argv) {
        return NULL;
    }

    /* Only take the second item if there really are two items;
       otherwise argv[1] is the terminating NULL (or out of bounds) */
    if (NULL != argv[0] && NULL != argv[1]) {
        hostname = strdup(argv[1]);
    }
    ompi_argv_free(argv);

    /* All done */
    return hostname;
}

Просмотреть файл

@ -24,12 +24,16 @@
#include <unistd.h>
#endif
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include "include/orte_constants.h"
#include "include/orte_types.h"
#include "util/argv.h"
#include "util/output.h"
#include "util/environ.h"
#include "runtime/runtime.h"
#include "runtime/orte_wait.h"
#include "mca/base/mca_base_param.h"
#include "mca/rmgr/base/base.h"
#include "mca/rmaps/base/rmaps_base_map.h"
@ -39,6 +43,9 @@
#include "mca/soh/soh_types.h"
#include "mca/gpr/gpr.h"
#include "mca/ns/base/ns_base_nds.h"
#include "mca/soh/soh.h"
#include "mca/rml/rml.h"
#include "mca/ns/ns.h"
#include "pls_tm.h"
@ -50,10 +57,9 @@ static int pls_tm_terminate_job(orte_jobid_t jobid);
static int pls_tm_terminate_proc(const orte_process_name_t *name);
static int pls_tm_finalize(void);
static int do_tm_resolve(char *hostnames, tm_node_id *tnodeid);
static int query_tm_hostnames(void);
static char* get_tm_hostname(tm_node_id node);
static int kill_tids(tm_task_id *tids, int num_tids);
static void do_wait_proc(pid_t pid, int status, void* cbdata);
static int kill_tids(tm_task_id *tids, orte_process_name_t *names,
size_t num_tids);
/*
@ -65,6 +71,7 @@ orte_pls_base_module_1_0_0_t orte_pls_tm_module = {
pls_tm_terminate_proc,
pls_tm_finalize
};
bool orte_pls_tm_connected = false;
extern char **environ;
#define NUM_SIGNAL_POLL_ITERS 50
@ -73,187 +80,106 @@ extern char **environ;
/*
* Local variables
*/
static bool tm_connected = false;
static struct tm_roots tm_root;
static char **tm_hostnames = NULL;
static tm_node_id *tm_node_ids;
static int num_node_ids, num_tm_hostnames;
static bool wait_cb_set = false;
static pid_t child_pid = -1;
static int pls_tm_launch(orte_jobid_t jobid)
{
int ret, local_errno;
size_t i, j, count;
tm_event_t event;
char *flat;
char old_cwd[OMPI_PATH_MAX];
ompi_list_t mapping;
bool mapping_valid = false;
ompi_list_item_t *item;
char **mca_env, **tmp_env, **local_env;
int num_mca_env;
orte_rmaps_base_proc_t *proc;
orte_app_context_t *app;
bool failure;
tm_node_id tnodeid;
tm_task_id tid;
orte_jobid_t *save;
/* Open up our connection to tm */
/* Copy the jobid */
ret = tm_init(NULL, &tm_root);
if (TM_SUCCESS != ret) {
return ORTE_ERR_RESOURCE_BUSY;
save = malloc(sizeof(orte_jobid_t));
if (NULL == save) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
tm_connected = true;
memcpy(save, &jobid, sizeof(orte_jobid_t));
/* Get the hostnames from the output of the mapping. Since we
have to cross reference against TM, it's much more efficient to
do all the nodes in the entire map all at once. */
/* Child */
OBJ_CONSTRUCT(&mapping, ompi_list_t);
if (ORTE_SUCCESS != (ret = orte_rmaps_base_get_map(jobid, &mapping))) {
goto cleanup;
ompi_output(orte_pls_base.pls_output,
"pls:tm:launch: launching child to do the work");
child_pid = fork();
if (0 == child_pid) {
if (ORTE_SUCCESS != orte_pls_tm_child_init() ||
ORTE_SUCCESS != orte_pls_tm_child_launch(jobid) ||
ORTE_SUCCESS != orte_pls_tm_child_wait(jobid) ||
ORTE_SUCCESS != orte_pls_tm_child_finalize()) {
/* Bogus logic just to stop at the first failure */
child_pid++;
}
exit(0);
}
mapping_valid = true;
printf("tm child PID: %d\n", child_pid);
fflush(stdout);
/* While we're traversing these data structures, also setup the
proc_status array for later a "put" to the registry */
/* Parent */
getcwd(old_cwd, OMPI_PATH_MAX);
failure = false;
for (count = i = 0, item = ompi_list_get_first(&mapping);
!failure && item != ompi_list_get_end(&mapping);
item = ompi_list_get_next(item), ++i) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item;
app = map->app;
orte_wait_cb(child_pid, do_wait_proc, save);
wait_cb_set = true;
/* See if the app cwd exists; try changing to the cwd and then
changing back */
if (0 != chdir(app->cwd) ||
0 != chdir(old_cwd)) {
ret = ORTE_ERR_NOT_FOUND;
ORTE_ERROR_LOG(ret);
goto cleanup;
}
ompi_output(orte_pls_base.pls_output,
"pls:tm:launch: app %d cwd (%s) exists", i, app->cwd);
/* A few things global to the app */
flat = ompi_argv_join(app->argv, ' ');
num_mca_env = 0;
mca_env = ompi_argv_copy(environ);
mca_base_param_build_env(&mca_env, &num_mca_env, true);
tmp_env = ompi_environ_merge(app->env, mca_env);
local_env = ompi_environ_merge(environ, tmp_env);
if (NULL != mca_env) {
ompi_argv_free(mca_env);
}
if (NULL != tmp_env) {
ompi_argv_free(tmp_env);
}
/* Now iterate through all the procs in this app and launch them */
for (j = 0; j < map->num_procs; ++j, ++count) {
proc = map->procs[j];
/* Get a TM node ID for the node for this proc */
if (ORTE_SUCCESS != do_tm_resolve(proc->proc_node->node_name,
&tnodeid)) {
ret = ORTE_ERR_NOT_FOUND;
ORTE_ERROR_LOG(ret);
goto cleanup;
}
/* Set the job name in the environment */
orte_ns_nds_env_put(&proc->proc_name, count, 1, &local_env);
/* Launch it */
ompi_output(orte_pls_base.pls_output,
"pls:tm:launch: starting process %d (%s) on %s (TM node id %d)",
count, flat, proc->proc_node->node_name, tnodeid);
if (TM_SUCCESS != tm_spawn(app->argc, app->argv,
local_env, tnodeid, &tid, &event)) {
ret = ORTE_ERR_RESOURCE_BUSY;
ORTE_ERROR_LOG(ret);
goto loop_error;
}
ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
if (TM_SUCCESS != ret) {
ret = ORTE_ERR_RESOURCE_BUSY;
ORTE_ERROR_LOG(ret);
goto loop_error;
}
/* Write this proc's status to the registry */
orte_pls_tm_put_tid(&(proc->proc_name), tid,
ORTE_PROC_STATE_LAUNCHED);
continue;
loop_error:
pls_tm_terminate_job(jobid);
failure = true;
break;
}
/* Free things from the last app */
ompi_argv_free(local_env);
free(flat);
}
/* All done */
cleanup:
if (mapping_valid) {
while (NULL != (item = ompi_list_remove_first(&mapping))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mapping);
}
tm_finalize();
tm_connected = false;
return ret;
return ORTE_SUCCESS;
}
static int pls_tm_terminate_job(orte_jobid_t jobid)
{
struct tm_roots tm_root;
tm_task_id *tids;
size_t num_tids;
orte_process_name_t *names;
size_t size;
int ret;
/* If we have a child, that child is potentially sitting inside
tm_poll(), and we won't be able to tm_init(). Sigh. So kill
the child. */
if (child_pid > 0) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:terminate_job: killing tm shephard");
kill(child_pid, SIGKILL);
waitpid(child_pid, NULL, 0);
child_pid = -1;
sleep(1);
}
/* Open up our connection to tm. Note that we may be called from
launch, above, in which case we don't need to tm_init */
if (!tm_connected) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:terminate_job: killing jobid %d", jobid);
if (!orte_pls_tm_connected) {
ret = tm_init(NULL, &tm_root);
if (TM_SUCCESS != ret) {
return ORTE_ERR_RESOURCE_BUSY;
ret = ORTE_ERR_RESOURCE_BUSY;
ORTE_ERROR_LOG(ret);
return ret;
}
}
/* Get the TIDs from the registry */
ret = orte_pls_tm_get_tids(jobid, &tids, &num_tids);
if (ORTE_SUCCESS == ret) {
ret = kill_tids(tids, num_tids);
free(tids);
ret = orte_pls_tm_get_tids(jobid, &tids, &names, &size);
if (ORTE_SUCCESS == ret && size > 0) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:terminate_job: got %d tids from registry", size);
ret = kill_tids(tids, names, size);
if (NULL != names) {
free(names);
}
if (NULL != tids) {
free(tids);
}
} else {
ompi_output(orte_pls_base.pls_output,
"pls:tm:terminate_job: got no tids from registry -- nothing to kill");
}
/* All done */
if (!tm_connected) {
if (!orte_pls_tm_connected) {
tm_finalize();
}
return ret;
@ -265,6 +191,9 @@ static int pls_tm_terminate_job(orte_jobid_t jobid)
*/
static int pls_tm_terminate_proc(const orte_process_name_t *name)
{
    /* Terminating a single process is not implemented here: emit a
       diagnostic, log the error, and hand the same code to the caller */
    const int rc = ORTE_ERR_NOT_SUPPORTED;

    ompi_output(orte_pls_base.pls_output,
                "pls:tm:terminate_proc: not supported");
    ORTE_ERROR_LOG(rc);

    return rc;
}
@ -274,133 +203,22 @@ static int pls_tm_terminate_proc(const orte_process_name_t *name)
*/
static int pls_tm_finalize(void)
{
if (NULL != tm_hostnames) {
free(tm_node_ids);
ompi_argv_free(tm_hostnames);
tm_hostnames = NULL;
if (wait_cb_set) {
orte_wait_cb_cancel(child_pid);
}
return ORTE_SUCCESS;
}
/*
* Take a list of hostnames and return their corresponding TM node
* ID's. This is not the most efficient method of doing this, but
* it's not much of an issue here (this is not a performance-critical
* section of code)
*/
static int do_tm_resolve(char *hostname, tm_node_id *tnodeid)
static void do_wait_proc(pid_t pid, int status, void *cbdata)
{
int i, ret;
orte_jobid_t *jobid = (orte_jobid_t *) cbdata;
/* Have we already queried TM for all the node info? */
printf("Child TM proc has exited!\n");
fflush(stdout);
if (NULL == tm_hostnames) {
ret = query_tm_hostnames();
if (ORTE_SUCCESS != ret) {
return ret;
}
}
/* Find the TM ID of the hostname that we're looking for */
for (i = 0; i < num_tm_hostnames; ++i) {
if (0 == strcmp(hostname, tm_hostnames[i])) {
*tnodeid = tm_node_ids[i];
ompi_output(orte_pls_base.pls_output,
"pls:tm:launch: resolved host %s to node ID %d",
hostname, tm_node_ids[i]);
break;
}
}
/* All done */
if (i < num_tm_hostnames) {
ret = ORTE_SUCCESS;
} else {
ret = ORTE_ERR_NOT_FOUND;
}
return ret;
}
static int query_tm_hostnames(void)
{
char *h;
int i, ret;
/* Get the list of nodes allocated in this PBS job */
ret = tm_nodeinfo(&tm_node_ids, &num_node_ids);
if (TM_SUCCESS != ret) {
return ORTE_ERR_NOT_FOUND;
}
/* TM "nodes" may actually correspond to PBS "VCPUs", which means
there may be multiple "TM nodes" that correspond to the same
physical node. This doesn't really affect what we're doing
here (we actually ignore the fact that they're duplicates --
slightly inefficient, but no big deal); just mentioned for
completeness... */
tm_hostnames = NULL;
num_tm_hostnames = 0;
for (i = 0; i < num_node_ids; ++i) {
h = get_tm_hostname(tm_node_ids[i]);
ompi_argv_append(&num_tm_hostnames, &tm_hostnames, h);
free(h);
}
/* All done */
return ORTE_SUCCESS;
}
/*
* For a given TM node ID, get the string hostname corresponding to
* it.
*/
static char* get_tm_hostname(tm_node_id node)
{
int ret, local_errno;
char *hostname;
tm_event_t event;
char buffer[256];
char **argv;
/* Get the info string corresponding to this TM node ID */
ret = tm_rescinfo(node, buffer, sizeof(buffer) - 1, &event);
if (TM_SUCCESS != ret) {
return NULL;
}
/* Now wait for that event to happen */
ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
if (TM_SUCCESS != ret) {
return NULL;
}
/* According to the TM man page, we get back a space-separated
string array. The hostname is the second item. Use a cheap
trick to get it. */
buffer[sizeof(buffer) - 1] = '\0';
argv = ompi_argv_split(buffer, ' ');
if (NULL == argv) {
return NULL;
}
hostname = strdup(argv[1]);
ompi_argv_free(argv);
/* All done */
return hostname;
free(cbdata);
}
@ -408,105 +226,139 @@ static char* get_tm_hostname(tm_node_id node)
* Kill a bunch of tids. Don't care about errors here -- just make a
* best attempt to kill kill kill; if we fail, oh well.
*/
static int kill_tids(tm_task_id *tids, int num_tids)
static int kill_tids(tm_task_id *tids, orte_process_name_t *names, size_t size)
{
int j, i, ret, local_errno, exit_status;
size_t i;
int j, ret, local_errno, exit_status;
tm_event_t event;
bool killed;
bool died;
for (i = 0; i < num_tids; ++i) {
for (i = 0; i < size; ++i) {
died = false;
/* First, kill with SIGTERM */
ompi_output(orte_pls_base.pls_output,
"pls:tm:terminate:kill_tids: killing tid %d", tids[i]);
ret = tm_kill(tids[i], SIGTERM, &event);
if (TM_SUCCESS != ret) {
/* If we didn't find the tid, then just continue -- it may
have exited on its own */
if (TM_ENOTFOUND == ret) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:terminate:kill_tids: tid %d not found (already dead?)",
tids[i]);
died = true;
} else if (TM_SUCCESS != ret) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tm_kill failed with %d", ret);
ret = ORTE_ERROR;
ORTE_ERROR_LOG(ret);
return ret;
}
tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: killed TID %d with SIGTERM", tids[i]);
/* Did it die? */
ret = tm_obit(tids[i], &exit_status, &event);
if (TM_SUCCESS != ret) {
if (!died) {
tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tm_obit failed with %d", ret);
ret = ORTE_ERROR;
ORTE_ERROR_LOG(ret);
return ret;
"pls:tm:kill: killed tid %d with SIGTERM", tids[i]);
/* Did it die? */
ret = tm_obit(tids[i], &exit_status, &event);
if (TM_SUCCESS != ret) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tm_obit failed with %d", ret);
ret = ORTE_ERROR;
ORTE_ERROR_LOG(ret);
return ret;
}
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
/* If it's dead, save the state */
if (TM_NULL_EVENT != event) {
died = true;
}
/* It didn't seem to die right away; poll a few times */
else {
for (j = 0; j < NUM_SIGNAL_POLL_ITERS; ++j) {
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
if (TM_NULL_EVENT != event) {
died = true;
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tid %d died", tids[i]);
break;
}
usleep(1);
}
/* No, it did not die. Try with SIGKILL */
if (!died) {
ret = tm_kill(tids[i], SIGKILL, &event);
if (TM_SUCCESS != ret) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tm_kill failed with %d",
ret);
ret = ORTE_ERROR;
ORTE_ERROR_LOG(ret);
return ret;
}
tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: killed tid %d with SIGKILL",
tids[i]);
/* Did it die this time? */
ret = tm_obit(tids[i], &exit_status, &event);
if (TM_SUCCESS != ret) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tm_obit failed with %d",
ret);
ret = ORTE_ERROR;
ORTE_ERROR_LOG(ret);
return ret;
}
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
/* No -- poll a few times -- just to try to clean it
up... If we don't get it here, oh well. Just let
the resources hang; TM will clean them up when the
job completed */
if (TM_NULL_EVENT == event) {
for (j = 0; j < NUM_SIGNAL_POLL_ITERS; ++j) {
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
if (TM_NULL_EVENT != event) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tid %d (finally) died",
tids[i]);
died = true;
break;
}
usleep(1);
}
if (j >= NUM_SIGNAL_POLL_ITERS) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tid %d did not die!",
tids[i]);
}
}
}
}
}
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
/* If it's dead, update the registry */
/* It didn't seem to die right away; poll a few times */
if (TM_NULL_EVENT == event) {
killed = false;
for (j = 0; j < NUM_SIGNAL_POLL_ITERS; ++j) {
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
if (TM_NULL_EVENT != event) {
killed = true;
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: TID %d died", tids[i]);
break;
}
usleep(1);
}
/* No, it did not die. Try with SIGKILL */
if (!killed) {
ret = tm_kill(tids[i], SIGKILL, &event);
if (TM_SUCCESS != ret) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tm_kill failed with %d", ret);
ret = ORTE_ERROR;
ORTE_ERROR_LOG(ret);
return ret;
}
tm_poll(TM_NULL_EVENT, &event, 1, &local_errno);
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: killed TID %d with SIGKILL", tids[i]);
/* Did it die this time? */
ret = tm_obit(tids[i], &exit_status, &event);
if (TM_SUCCESS != ret) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: tm_obit failed with %d", ret);
ret = ORTE_ERROR;
ORTE_ERROR_LOG(ret);
return ret;
}
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
/* No -- poll a few times -- just to try to clean it
up... If we don't get it here, oh well. Just let
the resources hang; TM will clean them up when the
job completed */
if (TM_NULL_EVENT == event) {
for (j = 0; j < NUM_SIGNAL_POLL_ITERS; ++j) {
tm_poll(TM_NULL_EVENT, &event, 0, &local_errno);
if (TM_NULL_EVENT != event) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: TID %d died", tids[i]);
break;
}
usleep(1);
}
if (j >= NUM_SIGNAL_POLL_ITERS) {
ompi_output(orte_pls_base.pls_output,
"pls:tm:kill: TID %d did not die!", tids[i]);
}
}
}
if (died) {
ret = orte_soh.set_proc_soh(&names[i],
ORTE_PROC_STATE_TERMINATED,
exit_status);
}
}

Просмотреть файл

@ -65,7 +65,7 @@ int orte_pls_tm_put_tid(const orte_process_name_t* name,
rc = orte_gpr.put(1, values);
free(value.segment);
for(i=0; i<value.num_tokens; i++) {
for (i = 0; i < value.num_tokens; ++i) {
free(value.tokens[i]);
}
free(value.tokens);
@ -76,27 +76,36 @@ int orte_pls_tm_put_tid(const orte_process_name_t* name,
/**
* Retrieve all process tids for the specified job.
*/
#include <unistd.h>
int orte_pls_tm_get_tids(orte_jobid_t jobid, tm_task_id **tids,
size_t* num_tids)
orte_process_name_t **names, size_t* size)
{
char *segment;
char *keys[2];
char *segment = NULL;
char *keys[3];
orte_gpr_value_t** values = NULL;
int i, num_values = 0;
int i, j, num_values = 0;
int rc;
/* query the job segment on the registry */
/* Zero out in case of error */
*tids = NULL;
*names = NULL;
*size = 0;
/* Query the job segment on the registry */
if (ORTE_SUCCESS !=
(rc = orte_schema.get_job_segment_name(&segment, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
keys[0] = TID_KEY;
keys[1] = NULL;
keys[1] = ORTE_PROC_NAME_KEY;
keys[2] = NULL;
rc = orte_gpr.get(
ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
ORTE_GPR_KEYS_AND,
segment,
NULL,
keys,
@ -108,17 +117,29 @@ int orte_pls_tm_get_tids(orte_jobid_t jobid, tm_task_id **tids,
return rc;
}
if (0 == num_values) {
rc = ORTE_ERR_NOT_FOUND;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* If we got values back (both TID and the process names have to
exist), then process them */
*tids = (tm_task_id*) malloc(sizeof(tm_task_id) * num_values);
for (i = 0; i < num_values; ++i) {
(*tids)[i] = values[i]->keyvals[0]->value.ui32;
if (num_values > 0) {
*tids = malloc(sizeof(tm_task_id) * num_values);
*names = malloc(sizeof(orte_process_name_t) * num_values);
if (NULL == *tids || NULL == *names) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
for (i = 0; i < num_values; ++i) {
for (j = 0; j < values[i]->cnt; ++j) {
if (0 == strcmp(values[i]->keyvals[j]->key, TID_KEY)) {
(*tids)[i] = values[i]->keyvals[j]->value.ui32;
} else if (0 == strcmp(values[i]->keyvals[j]->key,
ORTE_PROC_NAME_KEY)) {
(*names)[i] = values[i]->keyvals[j]->value.proc;
}
}
}
*size = num_values;
}
*num_tids = num_values;
cleanup:
if (NULL != values) {
@ -127,6 +148,8 @@ cleanup:
}
free(values);
}
free(segment);
if (NULL != segment) {
free(segment);
}
return rc;
}

Просмотреть файл

@ -25,6 +25,9 @@
int mca_pml_base_close(void)
{
/* turn off the progress code for the pml */
ompi_progress_unregister(mca_pml.pml_progress);
/* Blatantly ignore the return code (what would we do to recover,
anyway? This module is going away, so errors don't matter
anymore) */

Просмотреть файл

@ -96,7 +96,7 @@ static int orte_ras_bjs_open(void)
}
static orte_ras_base_module_t *orte_ras_bjs_init(void)
static orte_ras_base_module_t *orte_ras_bjs_init(int* priority)
{
if(getenv("NODES") == NULL) {
return NULL;

Просмотреть файл

@ -26,7 +26,7 @@
static int orte_ras_lsf_bproc_open(void);
static int orte_ras_lsf_bproc_close(void);
static orte_ras_base_module_t* orte_ras_lsf_bproc_init(void);
static orte_ras_base_module_t* orte_ras_lsf_bproc_init(int* priority);
orte_ras_lsf_bproc_component_t mca_ras_lsf_bproc_component = {

Просмотреть файл

@ -154,7 +154,6 @@ static int orte_rds_hostfile_parse(const char *hostfile, ompi_list_t* existing,
orte_rds_hostfile_done = false;
orte_rds_hostfile_in = fopen(hostfile, "r");
if (NULL == orte_rds_hostfile_in) {
ompi_output(0, "orte_rds_hostfile: could not open %s (%s)\n", hostfile, strerror(errno));
rc = ORTE_ERR_NOT_FOUND;
goto unlock;
}
@ -211,7 +210,11 @@ static int orte_rds_hostfile_query(void)
rc = orte_rds_hostfile_parse(mca_rds_hostfile_component.path, &existing, &updates);
if (ORTE_ERR_NOT_FOUND == rc) {
rc = ORTE_SUCCESS;
if(mca_rds_hostfile_component.default_hostfile) {
rc = ORTE_SUCCESS;
} else {
ompi_output(0, "orte_rds_hostfile: could not open %s\n", mca_rds_hostfile_component.path);
}
goto cleanup;
}
rc = orte_ras_base_node_insert(&updates);

Просмотреть файл

@ -34,6 +34,7 @@ struct orte_rds_hostfile_component_t {
orte_rds_base_component_t super;
int debug;
char* path;
bool default_hostfile;
ompi_mutex_t lock;
};
typedef struct orte_rds_hostfile_component_t orte_rds_hostfile_component_t;

Просмотреть файл

@ -95,6 +95,7 @@ static int orte_rds_hostfile_open(void)
OBJ_CONSTRUCT(&mca_rds_hostfile_component.lock, ompi_mutex_t);
mca_rds_hostfile_component.debug = orte_rds_hostfile_param_register_int("debug",1);
mca_rds_hostfile_component.path = orte_rds_hostfile_param_register_string("path", path);
mca_rds_hostfile_component.default_hostfile = (strcmp(mca_rds_hostfile_component.path,path) == 0);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -39,6 +39,7 @@ typedef uint32_t orte_rml_tag_t;
#define ORTE_RML_TAG_XCAST 7
#define ORTE_RML_TAG_BPROC_SVC 8
#define ORTE_RML_TAG_BPROC_CLT 9
#define ORTE_RML_TAG_DYNAMIC 2000
#define ORTE_RML_TAG_MAX UINT32_MAX

Просмотреть файл

@ -73,9 +73,11 @@ void ompi_progress(void)
#endif
for (i = 0 ; i < callbacks_len ; ++i) {
ret = (callbacks[i])();
if (ret > 0) {
events += ret;
if (NULL != callbacks[i]) {
ret = (callbacks[i])();
if (ret > 0) {
events += ret;
}
}
}
@ -139,3 +141,18 @@ ompi_progress_register(ompi_progress_callback_t cb)
return OMPI_SUCCESS;
}
/*
 * Remove a progress callback from the callback table.
 *
 * The first slot holding cb is set to NULL (the table is not
 * compacted) and OMPI_SUCCESS is returned.  If no slot holds cb,
 * OMPI_ERR_NOT_FOUND is returned and the table is left untouched.
 */
int
ompi_progress_unregister(ompi_progress_callback_t cb)
{
    size_t slot = 0;

    while (slot < callbacks_len) {
        if (callbacks[slot] == cb) {
            callbacks[slot] = NULL;
            return OMPI_SUCCESS;
        }
        ++slot;
    }

    return OMPI_ERR_NOT_FOUND;
}

Просмотреть файл

@ -28,6 +28,8 @@ typedef int (*ompi_progress_callback_t)(void);
OMPI_DECLSPEC int ompi_progress_register(ompi_progress_callback_t cb);
OMPI_DECLSPEC int ompi_progress_unregister(ompi_progress_callback_t cb);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -286,7 +286,7 @@ int orte_init(void)
ORTE_ERROR_LOG(ret);
return ret;
}
/* setup my session directory */
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
ORTE_ERROR_LOG(ret);
@ -341,6 +341,7 @@ int orte_init(void)
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_soh_base_open())) {
ORTE_ERROR_LOG(ret);
return ret;

Просмотреть файл

@ -80,11 +80,11 @@ int orte_restart(orte_process_name_t *name, const char* uri)
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_rml_base_close())) {
if (ORTE_SUCCESS != (rc = orte_ns_base_close())) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_close())) {
if (ORTE_SUCCESS != (rc = orte_rml_base_close())) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -149,6 +149,18 @@ int orte_restart(orte_process_name_t *name, const char* uri)
ORTE_ERROR_LOG(rc);
return rc;
}
if (NULL != orte_process_info.ns_replica_uri) {
if (ORTE_SUCCESS != (rc = orte_rml.set_uri(orte_process_info.ns_replica_uri))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (NULL != orte_process_info.gpr_replica_uri) {
if (ORTE_SUCCESS != (rc = orte_rml.set_uri(orte_process_info.gpr_replica_uri))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/*
* Re-init selected modules.

Просмотреть файл

@ -58,15 +58,23 @@ int orte_universe_exists()
/* if both ns_replica and gpr_replica were provided, check for contact with them */
if (NULL != orte_process_info.ns_replica_uri && NULL != orte_process_info.gpr_replica_uri) {
orte_process_name_t name;
if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(orte_process_info.ns_replica_uri, &name, NULL)))
if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(orte_process_info.ns_replica_uri, &name, NULL))) {
ORTE_ERROR_LOG(ret);
return ret;
if(ORTE_SUCCESS != (ret = orte_ns.copy_process_name(&orte_process_info.ns_replica, &name)))
}
if(ORTE_SUCCESS != (ret = orte_ns.copy_process_name(&orte_process_info.ns_replica, &name))) {
ORTE_ERROR_LOG(ret);
return ret;
if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(orte_process_info.gpr_replica_uri, &name, NULL)))
}
if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(orte_process_info.gpr_replica_uri, &name, NULL))) {
ORTE_ERROR_LOG(ret);
return ret;
if(ORTE_SUCCESS != (ret = orte_ns.copy_process_name(&orte_process_info.gpr_replica, &name)))
}
if(ORTE_SUCCESS != (ret = orte_ns.copy_process_name(&orte_process_info.gpr_replica, &name))) {
ORTE_ERROR_LOG(ret);
return ret;
return ORTE_SUCCESS;
}
return ORTE_SUCCESS;
}
/* /\* ping to verify ns_replica alive *\/ */

Просмотреть файл

@ -55,8 +55,9 @@ pid_t orte_waitpid(pid_t wpid, int *status, int options);
* time.
*
* If a thread is already blocked in \c ompi_rte_waitpid for \c wpid,
* this function will return \c OMPI_ERR_EXISTS. It is legal for
* multiple callbacks to be registered for a single \c wpid.
* this function will return \c OMPI_ERR_EXISTS. It is illegal for
* multiple callbacks to be registered for a single \c wpid
* (\c OMPI_ERR_EXISTS will be returned in this case).
*
* \warning It is not legal for \c wpid to be -1 when registering a
* callback.

Просмотреть файл

@ -124,6 +124,44 @@ void ompi_info::open_components()
component_map["base"] = NULL;
// ORTE frameworks
mca_oob_base_open();
component_map["oob"] = &mca_oob_base_components;
orte_errmgr_base_open();
component_map["errmgr"] = &orte_errmgr_base_components_available;
orte_gpr_base_open();
component_map["gpr"] = &orte_gpr_base_components_available;
orte_iof_base_open();
component_map["iof"] = &orte_iof_base.iof_components_opened;
orte_ns_base_open();
component_map["ns"] = &mca_ns_base_components_available;
orte_ras_base_open();
component_map["ras"] = &orte_ras_base.ras_opened;
orte_rds_base_open();
component_map["rds"] = &orte_rds_base.rds_components;
orte_rmaps_base_open();
component_map["rmaps"] = &orte_rmaps_base.rmaps_opened;
orte_rmgr_base_open();
component_map["rmgr"] = &orte_rmgr_base.rmgr_components;
orte_rml_base_open();
component_map["rml"] = &orte_rml_base.rml_components;
orte_pls_base_open();
component_map["pls"] = &orte_pls_base.pls_opened;
orte_soh_base_open();
component_map["soh"] = &orte_soh_base.soh_components;
// MPI frameworks
mca_allocator_base_open();
@ -147,44 +185,6 @@ void ompi_info::open_components()
mca_topo_base_open();
component_map["topo"] = &mca_topo_base_components_opened;
// ORTE frameworks
orte_errmgr_base_open();
component_map["errmgr"] = &orte_errmgr_base_components_available;
orte_gpr_base_open();
component_map["gpr"] = &orte_gpr_base_components_available;
orte_iof_base_open();
component_map["iof"] = &orte_iof_base.iof_components_opened;
orte_ns_base_open();
component_map["ns"] = &mca_ns_base_components_available;
mca_oob_base_open();
component_map["oob"] = &mca_oob_base_components;
orte_ras_base_open();
component_map["ras"] = &orte_ras_base.ras_opened;
orte_rds_base_open();
component_map["rds"] = &orte_rds_base.rds_components;
orte_rmaps_base_open();
component_map["rmaps"] = &orte_rmaps_base.rmaps_opened;
orte_rmgr_base_open();
component_map["rmgr"] = &orte_rmgr_base.rmgr_components;
orte_rml_base_open();
component_map["rml"] = &orte_rml_base.rml_components;
orte_pls_base_open();
component_map["pls"] = &orte_pls_base.pls_opened;
orte_soh_base_open();
component_map["soh"] = &orte_soh_base.soh_components;
// All done
opened_components = true;
@ -193,24 +193,36 @@ void ompi_info::open_components()
void ompi_info::close_components()
{
if (opened_components) {
mca_oob_base_close();
orte_ns_base_close();
orte_gpr_base_close();
#if 0
// JMS waiting for ralph to finish
mca_soh_base_close();
#endif
mca_coll_base_close();
mca_pml_base_close();
mca_ptl_base_close();
mca_topo_base_close();
mca_mpool_base_close();
mca_allocator_base_close();
mca_base_close();
if (opened_components) {
component_map.clear();
}
// Note that the order of shutdown here doesn't matter because
// we aren't *using* any components -- none were selected, so
// there are no dependencies between the frameworks. We list
// them generally "in order", but it doesn't really matter.
opened_components = false;
mca_topo_base_close();
mca_ptl_base_close();
mca_pml_base_close();
mca_mpool_base_close();
mca_io_base_close();
mca_coll_base_close();
mca_allocator_base_close();
orte_iof_base_close();
orte_soh_base_close();
orte_pls_base_close();
orte_rml_base_close();
orte_rmgr_base_close();
orte_rmaps_base_close();
orte_rds_base_close();
orte_ras_base_close();
orte_ns_base_close();
orte_gpr_base_close();
orte_errmgr_base_close();
mca_oob_base_close();
component_map.clear();
}
opened_components = false;
}

Просмотреть файл

@ -91,7 +91,9 @@ ompi_cmd_line_init_t orte_cmd_line_opts[] = {
{ NULL, NULL, NULL, '\0', NULL, "gprreplica", 1,
&orte_process_info.gpr_replica_uri, OMPI_CMD_LINE_TYPE_STRING,
"Registry contact information."},
{ NULL, NULL, NULL, '\0', NULL, "nodename", 1,
&orte_system_info.nodename, OMPI_CMD_LINE_TYPE_STRING,
"Node name as specified by host/resource description." },
/* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OMPI_CMD_LINE_TYPE_NULL, NULL }
@ -168,7 +170,7 @@ int main(int argc, char *argv[])
}
/* setup stdin/stdout/stderr */
if (orted_globals.debug == false || orted_globals.bootproxy > 0) {
if (orted_globals.debug == false) {
int fd;
char log_file[PATH_MAX];

Просмотреть файл

@ -28,3 +28,11 @@ orterun_SOURCES = \
orterun_LDADD = $(libs)
orterun_DEPENDENCIES = $(libs)
install-exec-hook:
(cd $(DESTDIR)$(bindir); rm -f mpirun; ln -s orterun mpirun)
(cd $(DESTDIR)$(bindir); rm -f mpiexec; ln -s orterun mpiexec)
uninstall-local:
rm -f $(DESTDIR)$(bindir)/mpirun \
$(DESTDIR)$(bindir)/mpiexec

Просмотреть файл

@ -30,18 +30,22 @@ requsted more nodes than exist in your cluster).
While probably only useful to Open RTE developers, the error returned
was %d.
[orterun:no-application]
[orterun:executable-not-found]
%s could not find the executable "%s".
This may mean you forgot to specify the application to start on the
command line. Or it could mean that you have an error with another
option on the command line and confused the command line parser. "%s
--help" will provide a detailed usage guide.
Please check your PATH and ensure that the executable is able to be
found and executed.
[orterun:error-spawning]
%s was unable to start the specified application. An attempt has been
made to clean up all processes that did start. The error returned was
%d.
[orterun:appfile-not-found]
orterun was unable to open the appfile "%s".
Unable to open the appfile:
%s
Double check that this file exists and is readable.
[orterun:executable-not-specified]
No executable was specified on the %s command line.
Aborting.

Просмотреть файл

@ -72,6 +72,7 @@ struct globals_t {
bool verbose;
bool exit;
bool no_wait_for_job_completion;
bool debug;
int num_procs;
char *hostfile;
char *env_val;
@ -91,6 +92,9 @@ ompi_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, '\0', NULL, "version", 0,
&orterun_globals.version, OMPI_CMD_LINE_TYPE_BOOL,
"Show the orterun version" },
{ "orte", "debug", NULL, 'd', NULL, "debug", 0,
&orterun_globals.debug, OMPI_CMD_LINE_TYPE_BOOL,
"Enable debugging" },
{ NULL, NULL, NULL, 'v', NULL, "verbose", 0,
&orterun_globals.verbose, OMPI_CMD_LINE_TYPE_BOOL,
"Be verbose" },
@ -299,6 +303,7 @@ static int init_globals(void)
false,
false,
false,
false,
-1,
NULL,
NULL,
@ -350,6 +355,11 @@ static int parse_globals(int argc, char* argv[])
wait_for_job_completion = false;
}
/* debug */
if (orterun_globals.debug) {
int id = mca_base_param_register_int("debug",NULL,NULL,NULL,0);
mca_base_param_set_int(id,orterun_globals.debug);
}
OBJ_DESTRUCT(&cmd_line);
return ORTE_SUCCESS;
}
@ -515,6 +525,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
init_globals();
ompi_cmd_line_create(&cmd_line, cmd_line_init);
mca_base_cmd_line_setup(&cmd_line);
cmd_line_made = true;
ompi_cmd_line_make_opt3(&cmd_line, '\0', NULL, "rawmap", 2,
"Hidden / internal parameter -- users should not use this!");
@ -524,6 +535,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
if (OMPI_SUCCESS != rc) {
goto cleanup;
}
mca_base_cmd_line_process_args(&cmd_line);
/* Is there an appfile in here? */
@ -540,8 +552,8 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
/* See if we have anything left */
if (0 == app->argc) {
ompi_show_help("help-orterun.txt", "orterun:no-application", true,
argv[0], argv[0]);
ompi_show_help("help-orterun.txt", "orterun:executable-not-specified",
true, argv[0], argv[0]);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
@ -638,8 +650,8 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
app->app = ompi_path_findv(app->argv[0], 0, environ, app->cwd);
if (NULL == app->app) {
ompi_show_help("help-orterun.txt", "orterun:no-application", true,
argv[0], app->argv[0], argv[0]);
ompi_show_help("help-orterun.txt", "orterun:executable-not-found",
true, argv[0], app->argv[0], argv[0]);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}

Просмотреть файл

@ -97,7 +97,6 @@ char *ompi_path_find(char *fname, char **pathv, int mode, char **envv)
return fullpath;
}
/*
* Locates a file with certain permissions from a list of search paths
*/
@ -141,6 +140,8 @@ char *ompi_path_findv(char *fname, int mode, char **envv, char *wrkdir)
ompi_argv_append(&dirc, &dirv, wrkdir);
}
if(NULL == dirv)
return NULL;
fullpath = ompi_path_find(fname, dirv, mode, envv);
ompi_argv_free(dirv);
return fullpath;

Просмотреть файл

@ -87,7 +87,9 @@ int orte_sys_info(void)
return OMPI_ERROR;
} else {
orte_system_info.sysname = strdup(sys_info.sysname);
orte_system_info.nodename = strdup(sys_info.nodename);
if(NULL == orte_system_info.nodename) {
orte_system_info.nodename = strdup(sys_info.nodename);
}
orte_system_info.release = strdup(sys_info.release);
orte_system_info.version = strdup(sys_info.version);
orte_system_info.machine = strdup(sys_info.machine);
@ -111,7 +113,6 @@ int orte_sys_info(void)
/* get the name of the user */
#ifndef WIN32
if ((pwdent = getpwuid(getuid())) != 0) {

Просмотреть файл

@ -19,7 +19,8 @@ AM_CPPFLAGS = -I$(top_srcdir)/test/support -DOMPI_ENABLE_DEBUG_OVERRIDE=1 -g
noinst_PROGRAMS = \
gpr_test \
gpr_test_trigs \
gpr_test_proxy
gpr_test_proxy \
gpr_test_overwrite
gpr_test_SOURCES = gpr_test.c
gpr_test_LDADD = \
@ -38,3 +39,9 @@ gpr_test_proxy_LDADD = \
$(top_builddir)/src/libmpi.la \
$(top_builddir)/test/support/libsupport.la
gpr_test_proxy_DEPENDENCIES = $(gpr_test_proxy_LDADD)
gpr_test_overwrite_SOURCES = gpr_test_overwrite.c
gpr_test_overwrite_LDADD = \
$(top_builddir)/src/libmpi.la \
$(top_builddir)/test/support/libsupport.la
gpr_test_overwrite_DEPENDENCIES = $(gpr_test_overwrite_LDADD)

Просмотреть файл

@ -34,6 +34,7 @@
#include "dps/dps.h"
#include "runtime/runtime.h"
#include "util/proc_info.h"
#include "util/sys_info.h"
#include "mca/gpr/base/base.h"
#include "mca/gpr/replica/api_layer/gpr_replica_api.h"
@ -55,7 +56,7 @@ int main(int argc, char **argv)
orte_gpr_replica_itag_t itag[10], itag2, *itaglist;
orte_gpr_replica_container_t *cptr=NULL, **cptrs=NULL;
orte_gpr_keyval_t *kptr=NULL, **kvals;
orte_gpr_replica_itagval_t **ivals=NULL;
orte_gpr_replica_itagval_t **ivals=NULL, *iptr;
orte_gpr_value_t **values, *val;
orte_process_name_t seed={0,0,0};
bool found;
@ -73,7 +74,29 @@ int main(int argc, char **argv)
/* ENSURE THE REPLICA IS ISOLATED */
setenv("OMPI_MCA_gpr_replica_isolate", "1", 1);
ompi_init(argc, argv);
/* Open up the output streams */
if (!ompi_output_init()) {
return OMPI_ERROR;
}
/*
* If threads are supported - assume that we are using threads - and reset otherwise.
*/
ompi_set_using_threads(OMPI_HAVE_THREADS);
/* For malloc debugging */
ompi_malloc_init();
/* Ensure the system_info structure is instantiated and initialized */
if (ORTE_SUCCESS != (rc = orte_sys_info())) {
return rc;
}
/* Ensure the process info structure is instantiated and initialized */
if (ORTE_SUCCESS != (rc = orte_proc_info())) {
return rc;
}
orte_process_info.seed = true;
orte_process_info.my_name = &seed;
@ -227,7 +250,7 @@ int main(int argc, char **argv)
kptr->key = strdup("stupid-value");
kptr->type = ORTE_INT16;
kptr->value.i16 = 21;
if (ORTE_SUCCESS != (rc = orte_gpr_replica_add_keyval(seg, cptr, kptr))) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_add_keyval(&iptr, seg, cptr, kptr))) {
fprintf(test_out, "gpr_test: add keyval failed with error code %s\n",
ORTE_ERROR_NAME(rc));
test_failure("gpr_test: add keyval failed");
@ -250,9 +273,9 @@ int main(int argc, char **argv)
kptr->type = ORTE_STRING;
kptr->value.strptr = strdup("try-string-value");
orte_gpr_replica_create_itag(&itag2, seg, kptr->key);
if (ORTE_SUCCESS != (rc = orte_gpr_replica_search_container(&found, ORTE_GPR_REPLICA_OR,
if (ORTE_SUCCESS != (rc = orte_gpr_replica_search_container(&num_found, ORTE_GPR_REPLICA_OR,
&itag2, 1, cptr) ||
!found)) {
0 >= num_found)) {
fprintf(test_out, "gpr_test: search container for single entry failed - returned %s for itag %d\n",
ORTE_ERROR_NAME(rc), itag2);
test_failure("gpr_test: search container for single entry failed");
@ -332,6 +355,7 @@ int main(int argc, char **argv)
fprintf(stderr, "put one value with single keyval\n");
val = OBJ_NEW(orte_gpr_value_t);
val->addr_mode = ORTE_GPR_NO_OVERWRITE | ORTE_GPR_TOKENS_XAND;
val->cnt = 1;
val->segment = strdup("test-put-segment");
val->num_tokens = 14;
@ -344,8 +368,7 @@ int main(int argc, char **argv)
(val->keyvals[0])->key = strdup("stupid-value-next-one");
(val->keyvals[0])->type = ORTE_INT32;
(val->keyvals[0])->value.i32 = 654321;
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(ORTE_GPR_NO_OVERWRITE | ORTE_GPR_TOKENS_XAND,
1, &val))) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(1, &val))) {
fprintf(test_out, "gpr_test: put of 1 value/1 keyval failed with error code %s\n",
ORTE_ERROR_NAME(rc));
test_failure("gpr_test: put of 1 value/1 keyval failed");
@ -358,6 +381,7 @@ int main(int argc, char **argv)
fprintf(stderr, "put one value with multiple keyvals\n");
val = OBJ_NEW(orte_gpr_value_t);
val->addr_mode = ORTE_GPR_NO_OVERWRITE | ORTE_GPR_TOKENS_XAND;
val->cnt = 20;
val->segment = strdup("test-put-segment");
val->num_tokens = 14;
@ -372,8 +396,7 @@ int main(int argc, char **argv)
(val->keyvals[i])->type = ORTE_UINT32;
(val->keyvals[i])->value.ui32 = (uint32_t)i;
}
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(ORTE_GPR_NO_OVERWRITE | ORTE_GPR_TOKENS_XAND,
1, &val))) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(1, &val))) {
fprintf(test_out, "gpr_test: put 1 value/multiple keyval failed with error code %s\n",
ORTE_ERROR_NAME(rc));
test_failure("gpr_test: put 1 value/multiple keyval failed");
@ -385,8 +408,7 @@ int main(int argc, char **argv)
fprintf(stderr, "put 1 value/multiple keyvals - second container\n");
val->num_tokens = 10;
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(ORTE_GPR_NO_OVERWRITE | ORTE_GPR_TOKENS_XAND,
1, &val))) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(1, &val))) {
fprintf(test_out, "gpr_test: put 1 value/multiple keyval in second container failed with error code %s\n",
ORTE_ERROR_NAME(rc));
test_failure("gpr_test: put 1 value/multiple keyval in second container failed");
@ -398,7 +420,7 @@ int main(int argc, char **argv)
OBJ_RELEASE(val);
fprintf(stderr, "dump\n");
if (ORTE_SUCCESS != (rc = orte_gpr_replica_dump(0))) {
if (ORTE_SUCCESS != (rc = orte_gpr.dump_all(0))) {
fprintf(test_out, "gpr_test: dump failed with error code %s\n",
ORTE_ERROR_NAME(rc));
test_failure("gpr_test: dump failed");
@ -479,6 +501,7 @@ int main(int argc, char **argv)
fprintf(stderr, "put multiple copies of same entry in single container\n");
val = OBJ_NEW(orte_gpr_value_t);
val->addr_mode = ORTE_GPR_NO_OVERWRITE | ORTE_GPR_TOKENS_XAND;
val->cnt = 1;
val->segment = strdup("test-put-segment");
val->num_tokens = 5;
@ -493,8 +516,7 @@ int main(int argc, char **argv)
(val->keyvals[0])->value.strptr = strdup("try-string-value");
for (i = 0; i < 10; i++) {
fprintf(stderr, "\tputting copy %d\n", i);
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(ORTE_GPR_NO_OVERWRITE | ORTE_GPR_TOKENS_XAND,
1, &val))) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(1, &val))) {
fprintf(test_out, "gpr_test: put multiple copies of one keyval in a container failed with error code %s\n",
ORTE_ERROR_NAME(rc));
test_failure("gpr_test: put multiple copies of one keyval in a container failed");
@ -504,7 +526,7 @@ int main(int argc, char **argv)
}
OBJ_RELEASE(val);
orte_gpr_replica_dump(0);
orte_gpr.dump_all(0);
fprintf(stderr, "update multiple keyvals in a container\n");
if(ORTE_SUCCESS != orte_gpr_replica_find_seg(&seg, false, "test-put-segment")) {
@ -532,10 +554,11 @@ int main(int argc, char **argv)
fprintf(test_out, "gpr_test: update multiple keyvals passed\n");
}
orte_gpr_replica_dump(0);
orte_gpr.dump_all(0);
fprintf(stderr, "put with no tokens puts in every container\n");
val = OBJ_NEW(orte_gpr_value_t);
val->addr_mode = ORTE_GPR_NO_OVERWRITE;
val->cnt = 1;
val->segment = strdup("test-put-segment");
val->num_tokens = 0;
@ -545,8 +568,7 @@ int main(int argc, char **argv)
(val->keyvals[0])->key = strdup("stupid-value-next-one");
(val->keyvals[0])->type = ORTE_STRING;
(val->keyvals[0])->value.strptr = strdup("try-string-value");
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(ORTE_GPR_NO_OVERWRITE,
1, &val))) {
if (ORTE_SUCCESS != (rc = orte_gpr_replica_put(1, &val))) {
fprintf(test_out, "gpr_test: put with no tokens failed - returned %s\n",
ORTE_ERROR_NAME(rc));
test_failure("gpr_test: put with no tokens failed");
@ -556,7 +578,7 @@ int main(int argc, char **argv)
}
OBJ_RELEASE(val);
orte_gpr_replica_dump(0);
orte_gpr.dump_all(0);
fprintf(stderr, "\nreleasing segment\n");
if (ORTE_SUCCESS != (rc = orte_gpr_replica_release_segment(&seg)) ||

363
test/mca/gpr/gpr_test_overwrite.c Обычный файл
Просмотреть файл

@ -0,0 +1,363 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "include/orte_constants.h"
#include "include/orte_types.h"
#include "include/orte_schema.h"
#include <stdio.h>
#include <string.h>
#include "support.h"
#include "util/proc_info.h"
#include "util/sys_info.h"
#include "mca/errmgr/errmgr.h"
#include "mca/ns/ns_types.h"
#include "mca/gpr/gpr.h"
#include "dps/dps.h"
#include "mca/gpr/base/base.h"
#include "mca/gpr/replica/api_layer/gpr_replica_api.h"
#include "mca/gpr/replica/functional_layer/gpr_replica_fn.h"
#include "mca/gpr/replica/communications/gpr_replica_comm.h"
#include "mca/gpr/replica/transition_layer/gpr_replica_tl.h"
/* Stream that receives this test's progress and error messages.
 * main() points it at stderr before anything is written to it. */
static FILE *test_out=NULL;
/**
* Test-local description of one compute node.
*
* main() builds an ompi_list_t of these and test_overwrite() publishes
* them to the registry.  Only the name, architecture, state, cell ID,
* slots and slots_max fields are written by test_overwrite();
* node_slots_inuse and node_slots_alloc are set by main() but are not
* published.
*/
struct test_node_t {
/** Base list item.  Kept as the first member because the code casts
between ompi_list_item_t* and test_node_t* when walking the list. */
ompi_list_item_t super;
/** Heap-allocated node name string; freed by test_node_destruct() */
char *node_name;
/** Heap-allocated string naming the architecture of the node.  This is
permitted to be NULL if it is not known. */
char *node_arch;
/** The cell ID of this node */
orte_cellid_t node_cellid;
/** State of this node; see include/orte_types.h */
orte_node_state_t node_state;
/** A "soft" limit on the number of slots available on the node.
This will typically correspond to the number of physical CPUs
that we have been allocated on this node and would be the
"ideal" number of processes for us to launch. */
size_t node_slots;
/** How many processes have already been launched, used by one or
more jobs on this node. */
size_t node_slots_inuse;
/** This represents the number of slots we (the allocator) are
attempting to allocate to the current job - or the number of
slots allocated to a specific job on a query for the job's
allocations */
size_t node_slots_alloc;
/** A "hard" limit (if set -- a value of 0 implies no hard limit)
on the number of slots that can be allocated on a given
node.  In some environments (e.g. grid) there may be
fixed limits on the number of slots that can be used.
This value also could have been a boolean - but we may want to
allow the hard limit to be different from the soft limit - in
other words allow the node to be oversubscribed up to a
specified limit. For example, if we have two processors, we
may want to allow up to four processes but no more. */
size_t node_slots_max;
};
/**
* Convenience typedef so the struct can be named without the
* "struct" keyword.
*/
typedef struct test_node_t test_node_t;
/**
 * Constructor for test_node_t: puts every test-specific field into a
 * known "empty" state (no strings, cell 0, unknown state, all slot
 * counters zero).  The embedded list item is not touched here.
 *
 * @param node  Object being constructed.
 */
static void test_node_construct(test_node_t* node)
{
    /* slot bookkeeping: all counters start at zero */
    node->node_slots_max =
        node->node_slots_inuse =
        node->node_slots_alloc =
        node->node_slots = 0;

    /* location and status */
    node->node_state = ORTE_NODE_STATE_UNKNOWN;
    node->node_cellid = 0;

    /* owned strings: none allocated yet */
    node->node_arch = NULL;
    node->node_name = NULL;
}
/**
 * Destructor for test_node_t: releases the two heap strings the node
 * owns.  Either may be NULL; free(NULL) is a no-op, so no guard is
 * needed.
 *
 * @param node  Object being destroyed.
 */
static void test_node_destruct(test_node_t* node)
{
    free(node->node_name);
    free(node->node_arch);
}
/* Class instance for test_node_t.  The arguments name the type, its
 * parent type (ompi_list_item_t), and the constructor/destructor
 * defined above.
 * NOTE(review): argument meaning is read off the names used here --
 * confirm against the OBJ_CLASS_INSTANCE definition. */
OBJ_CLASS_INSTANCE(
test_node_t,
ompi_list_item_t,
test_node_construct,
test_node_destruct);
/* Forward declaration; the definition follows main(). */
static int test_overwrite(ompi_list_t* nodes);
/**
 * Registry overwrite test driver.
 *
 * Brings up the pieces the replica registry needs (output streams,
 * system/process info, MCA, GPR, DPS), builds a list of five test
 * nodes, publishes them with test_overwrite(), mutates some fields and
 * publishes them again, dumping the registry after each put so the
 * overwrite can be inspected.
 *
 * @param argc  Unused.
 * @param argv  Unused.
 * @return 0 on success; an ORTE/OMPI error code if initialization or a
 *         put fails.  Exits with status 1 if a framework cannot be
 *         started or a test node cannot be allocated.
 *
 * NOTE(review): test_failure()/test_finalize() are used here without a
 * prior test_init() call -- confirm whether the support library
 * requires one.
 */
int main(int argc, char **argv)
{
    ompi_list_t nodes;
    test_node_t *node;
    orte_process_name_t seed={0,0,0};
    int i, rc;

    /* Output goes to stderr; the NULL check is kept for the case where
     * this is switched back to a file opened with
     * fopen("test_gpr_replica_out", "w+"). */
    test_out = stderr;
    if( test_out == NULL ) {
        test_failure("gpr_test couldn't open test file failed");
        test_finalize();
        exit(1);
    }

    /* ENSURE THE REPLICA IS ISOLATED */
    setenv("OMPI_MCA_gpr_replica_isolate", "1", 1);

    /* Open up the output streams */
    if (!ompi_output_init()) {
        return OMPI_ERROR;
    }

    /*
     * If threads are supported - assume that we are using threads - and reset otherwise.
     */
    ompi_set_using_threads(OMPI_HAVE_THREADS);

    /* For malloc debugging */
    ompi_malloc_init();

    /* Ensure the system_info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (rc = orte_sys_info())) {
        return rc;
    }

    /* Ensure the process info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (rc = orte_proc_info())) {
        return rc;
    }

    /* Run as the seed process under the name {0,0,0} */
    orte_process_info.seed = true;
    orte_process_info.my_name = &seed;

    /* startup the MCA */
    if (OMPI_SUCCESS == mca_base_open()) {
        fprintf(test_out, "MCA started\n");
    } else {
        fprintf(test_out, "MCA could not start\n");
        exit (1);
    }

    if (ORTE_SUCCESS == orte_gpr_base_open()) {
        fprintf(test_out, "GPR started\n");
    } else {
        fprintf(test_out, "GPR could not start\n");
        exit (1);
    }

    if (ORTE_SUCCESS == orte_gpr_base_select()) {
        fprintf(test_out, "GPR replica selected\n");
    } else {
        fprintf(test_out, "GPR replica could not be selected\n");
        exit (1);
    }

    if (ORTE_SUCCESS == orte_dps_open()) {
        fprintf(test_out, "DPS started\n");
    } else {
        fprintf(test_out, "DPS could not start\n");
        exit (1);
    }

    /* setup a node list */
    OBJ_CONSTRUCT(&nodes, ompi_list_t);
    for (i=0; i < 5; i++) {
        node = OBJ_NEW(test_node_t);
        if (NULL == node) {
            fprintf(test_out, "could not allocate test node %d\n", i);
            exit (1);
        }
        /* asprintf() leaves the target pointer undefined on failure, and
         * test_overwrite() strdup()s these strings, so stop here rather
         * than carry a bad pointer forward. */
        if (0 > asprintf(&(node->node_name), "node-%d", i)) {
            node->node_name = NULL;
            fprintf(test_out, "could not allocate name for test node %d\n", i);
            exit (1);
        }
        if (0 > asprintf(&(node->node_arch), "arch-%d", i)) {
            node->node_arch = NULL;
            fprintf(test_out, "could not allocate arch for test node %d\n", i);
            exit (1);
        }
        node->node_cellid = 0;
        node->node_state = ORTE_NODE_STATE_UP;
        node->node_slots = i;
        node->node_slots_alloc = i%2;
        node->node_slots_inuse = i % 3;
        node->node_slots_max = i * 5;
        ompi_list_append(&nodes, &node->super);
    }

    fprintf(test_out, "putting initial set of values on registry\n");
    if (ORTE_SUCCESS != (rc = test_overwrite(&nodes))) {
        fprintf(test_out, "initial put of values failed with error %s\n",
                ORTE_ERROR_NAME(rc));
        return rc;
    } else {
        fprintf(test_out, "initial put of values successful\n");
    }

    orte_gpr.dump_all(0);

    fprintf(test_out, "changing values for overwrite test\n");
    /* change the arch, state, and slots_inuse values.
     * NOTE(review): test_overwrite() does not publish slots_inuse, so
     * only the arch and state changes are visible in the registry. */
    for (i=0, node = (test_node_t*)ompi_list_get_first(&nodes);
         node != (test_node_t*)ompi_list_get_end(&nodes);
         node = (test_node_t*)ompi_list_get_next(node), i++) {
        free(node->node_arch);
        if (0 > asprintf(&(node->node_arch), "new-arch-%d", i*10)) {
            /* old string is already freed; don't leave it dangling */
            node->node_arch = NULL;
            fprintf(test_out, "could not allocate new arch for test node %d\n", i);
            exit (1);
        }
        node->node_state = ORTE_NODE_STATE_DOWN;
        node->node_slots_inuse = node->node_slots_inuse * 20;
    }

    fprintf(test_out, "putting second set of values on registry to test overwrite\n");
    if (ORTE_SUCCESS != (rc = test_overwrite(&nodes))) {
        fprintf(test_out, "second put of values failed with error %s\n",
                ORTE_ERROR_NAME(rc));
        return rc;
    } else {
        fprintf(test_out, "second put of values successful\n");
    }

    orte_gpr.dump_all(0);

    /* test_out currently aliases stderr; closing it would close the
     * process's standard error before test_finalize() runs.  Only close
     * a stream this test opened itself, otherwise just flush. */
    if (stderr != test_out) {
        fclose( test_out );
    } else {
        fflush( test_out );
    }

    test_finalize();
    return(0);
}
/**
 * Publish every node on the list to the registry's node segment with a
 * single orte_gpr.put(), using ORTE_GPR_OVERWRITE addressing.
 *
 * One orte_gpr_value_t is built per node, holding six keyvals (name,
 * arch, state, cell ID, slots, slots_max) and the tokens returned by
 * orte_schema.get_node_tokens() for that node.  All values created
 * here are released before returning, on success and on every error
 * path.
 *
 * @param nodes  List whose items are test_node_t objects.
 * @return ORTE_ERR_BAD_PARAM if the list is empty or a node has no
 *         name; ORTE_ERR_OUT_OF_RESOURCE if an allocation fails; the
 *         error from orte_schema.get_node_tokens() if token generation
 *         fails; otherwise whatever orte_gpr.put() returns.
 */
int test_overwrite(ompi_list_t* nodes)
{
    /* number of keyvals stored for each node */
    enum { NUM_NODE_KEYVALS = 6 };

    ompi_list_item_t* item;
    orte_gpr_value_t **values;
    int rc, num_values, i, j;
    int num_created = 0;    /* entries of values[] that hold an object */
    test_node_t* node;

    num_values = ompi_list_get_size(nodes);
    if (0 >= num_values) {
        return ORTE_ERR_BAD_PARAM;
    }

    values = (orte_gpr_value_t**)malloc(num_values * sizeof(orte_gpr_value_t*));
    if (NULL == values) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* Pass 1: allocate one value, with its empty keyvals, per node. */
    for (i=0; i < num_values; i++) {
        orte_gpr_value_t* value = values[i] = OBJ_NEW(orte_gpr_value_t);
        if (NULL == value) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        num_created = i + 1;

        value->addr_mode = ORTE_GPR_OVERWRITE;
        /* cnt always equals the number of keyvals[] slots that hold a
         * real object, so releasing a half-built value never walks
         * uninitialized pointers. */
        value->cnt = 0;
        value->segment = strdup(ORTE_NODE_SEGMENT);
        if (NULL == value->segment) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        value->keyvals = (orte_gpr_keyval_t**)malloc(NUM_NODE_KEYVALS*sizeof(orte_gpr_keyval_t*));
        if (NULL == value->keyvals) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        for (j=0; j < NUM_NODE_KEYVALS; j++) {
            value->keyvals[j] = OBJ_NEW(orte_gpr_keyval_t);
            if (NULL == value->keyvals[j]) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            value->cnt = j + 1;
        }
    }

    /* Pass 2: fill each value from the corresponding list item. */
    for(i=0, item = ompi_list_get_first(nodes);
        i < num_values && item != ompi_list_get_end(nodes);
        i++, item = ompi_list_get_next(item)) {
        orte_gpr_value_t* value = values[i];
        node = (test_node_t*)item;

        /* the name is both stored and used to build the tokens */
        if (NULL == node->node_name) {
            rc = ORTE_ERR_BAD_PARAM;
            goto cleanup;
        }

        j = 0;
        (value->keyvals[j])->key = strdup(ORTE_NODE_NAME_KEY);
        (value->keyvals[j])->type = ORTE_STRING;
        (value->keyvals[j])->value.strptr = strdup(node->node_name);
        if (NULL == (value->keyvals[j])->value.strptr) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }

        ++j;
        (value->keyvals[j])->key = strdup(ORTE_NODE_ARCH_KEY);
        (value->keyvals[j])->type = ORTE_STRING;
        if (NULL != node->node_arch) {
            (value->keyvals[j])->value.strptr = strdup(node->node_arch);
        } else {
            /* unknown architecture is stored as an empty string */
            (value->keyvals[j])->value.strptr = strdup("");
        }
        if (NULL == (value->keyvals[j])->value.strptr) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }

        ++j;
        (value->keyvals[j])->key = strdup(ORTE_NODE_STATE_KEY);
        (value->keyvals[j])->type = ORTE_NODE_STATE;
        (value->keyvals[j])->value.node_state = node->node_state;

        ++j;
        (value->keyvals[j])->key = strdup(ORTE_CELLID_KEY);
        (value->keyvals[j])->type = ORTE_CELLID;
        (value->keyvals[j])->value.cellid = node->node_cellid;

        ++j;
        (value->keyvals[j])->key = strdup(ORTE_NODE_SLOTS_KEY);
        (value->keyvals[j])->type = ORTE_UINT32;
        (value->keyvals[j])->value.ui32 = node->node_slots;

        ++j;
        (value->keyvals[j])->key = strdup(ORTE_NODE_SLOTS_MAX_KEY);
        (value->keyvals[j])->type = ORTE_UINT32;
        (value->keyvals[j])->value.ui32 = node->node_slots_max;

        /* every key above came from strdup(); make sure none failed */
        for (j=0; j < NUM_NODE_KEYVALS; j++) {
            if (NULL == (value->keyvals[j])->key) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
        }

        /* setup index/keys for this node */
        rc = orte_schema.get_node_tokens(&value->tokens, &value->num_tokens, node->node_cellid, node->node_name);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
    }

    /* try the insert */
    rc = orte_gpr.put(num_values, values);

cleanup:
    /* Release every value that was created, not just those up to the
     * point of failure: pass 1 allocates all of them before pass 2
     * starts. */
    for (j=0; j < num_created; j++) {
        OBJ_RELEASE(values[j]);
    }
    free(values);
    return rc;
}

Просмотреть файл

@ -49,16 +49,14 @@
/* output files needed by the test */
static FILE *test_out=NULL;
static char *cmd_str="diff ./test_gpr_replica_out ./test_gpr_replica_out_std";
static void test_cbfunc(orte_gpr_notify_data_t *data, void *user_tag);
int main(int argc, char **argv)
{
ompi_cmd_line_t cmd_line;
int rc, num_names, num_found, num_counters=6;
int i, j, cnt, ret;
int rc, num_counters=6;
int i;
orte_gpr_value_t *values, value, trig, *trigs;
orte_gpr_subscription_t *subscription;
orte_gpr_notify_id_t sub;
@ -72,7 +70,7 @@ int main(int argc, char **argv)
ORTE_PROC_NUM_TERMINATED
};
test_init("test_gpr_replica_trigs");
test_init("test_gpr_proxy");
/* test_out = fopen( "test_gpr_replica_out", "w+" ); */
test_out = stderr;
@ -82,8 +80,7 @@ int main(int argc, char **argv)
exit(1);
}
OBJ_CONSTRUCT(&cmd_line, ompi_cmd_line_t);
if (ORTE_SUCCESS != (rc = orte_init(&cmd_line, argc, argv))) {
if (ORTE_SUCCESS != (rc = orte_init())) {
fprintf(test_out, "orte_init failed to start started\n");
} else {
fprintf(test_out, "orte_init_started\n");
@ -113,7 +110,7 @@ int main(int argc, char **argv)
fprintf(test_out, "gpr_test_trigs: subscribe on seg registered\n");
}
orte_gpr.dump(0);
orte_gpr.dump_all(0);
/* setup some test counters */
OBJ_CONSTRUCT(&value, orte_gpr_value_t);
@ -159,7 +156,7 @@ int main(int argc, char **argv)
return rc;
}
orte_gpr.dump(0);
orte_gpr.dump_all(0);
fprintf(test_out, "incrementing all counters\n");
@ -170,7 +167,7 @@ int main(int argc, char **argv)
return rc;
}
orte_gpr.dump(0);
orte_gpr.dump_all(0);
fprintf(test_out, "decrementing all counters\n");
@ -182,7 +179,7 @@ int main(int argc, char **argv)
}
OBJ_DESTRUCT(&value);
orte_gpr.dump(0);
orte_gpr.dump_all(0);
/* for testing the trigger, we'll just use the prior subscription setup.
@ -225,7 +222,7 @@ int main(int argc, char **argv)
return rc;
}
orte_gpr.dump(0);
orte_gpr.dump_all(0);
fprintf(test_out, "incrementing until trigger\n");
@ -266,7 +263,7 @@ int main(int argc, char **argv)
}
}
orte_gpr.dump(0);
orte_gpr.dump_all(0);
fclose( test_out );
/* result = system( cmd_str );
@ -288,7 +285,7 @@ void test_cbfunc(orte_gpr_notify_data_t *data, void *tag)
/* fprintf(test_out, "\tSegment: %s\tNumber of values: %d\n", (msg->values[0])->segment, msg->cnt);
*/
orte_gpr.dump(0);
orte_gpr.dump_all(0);
OBJ_RELEASE(data);
}