1
1

* sync with a revision close to the trunk, but right before the gm changes that we

actually need.  More interesting stuff coming soon.

This commit was SVN r4933.
Этот коммит содержится в:
Brian Barrett 2005-03-18 23:40:08 +00:00
родитель 77c65d69cc
Коммит a1d11d9b54
31 изменённых файлов: 488 добавлений и 130 удалений

52
ISSUES Обычный файл
Просмотреть файл

@ -0,0 +1,52 @@
Undecided timing:
-----------------
- if an MPI process fails (e.g., it seg faults), it causes orterun to
hang. This is with the rsh pls.
- if the daemon is not found or fails to start, orterun will hang. No
indication is given to the users that something went wrong.
- $prefix/etc/hosts vs. $prefix/etc/openmpi-default-hostfile
Pre-milestone:
--------------
- singleton mpi doesn't work
Post-milestone:
---------------
- ras_base_alloc: doesn't allow for oversubscribing like this:
eddie: cpu=2
vogon: cpu=2 max-slots=4
mpirun -np 6 uptime
It barfs because it tries to evenly divide the remaining unallocated
procs across all nodes (i.e., 1 each on eddie/vogon) rather than
seeing that vogon can take the remaining 2.
- Jeff: TM needs to be re-written to use daemons (can't hog TM
connection forever)
- Jeff: make the mapper be able to handle app->map_data
- Jeff: add function callback in cmd_line_t stuff
- Jeff: does cmd_line_t need to *get* MCA params if a command line
param is not taken but an MCA param is available?
- consider empty string problem...
- ?: Friendlier error messages (e.g., if no nodes -- need something
meaningful to tell the user)
- ?: Populate orte_finalize()
- Ralph: compare and set function in GPR
- Jeff: collapse MCA params from 3 names to 1 name
- ?: Apply LANL copyright to trunk (post all merging activity)

Просмотреть файл

@ -1,12 +1,47 @@
This file contains information on merging the branches/tim tree into the
trunk.
UPDATE MERGE:
UPDATE MERGE 2:
svn merge -r4892:4922 svn+ssh://svn.open-mpi.org/l/svn/ompi/branches/tim .
U src/event/poll.c
U src/event/select.c
U src/runtime/orte_finalize.c
U src/runtime/orte_restart.c
U src/tools/orterun/orterun.c
U src/mca/oob/tcp/oob_tcp_send.c
U src/mca/oob/tcp/oob_tcp_peer.c
U src/mca/oob/tcp/oob_tcp_recv.c
C src/mca/oob/tcp/oob_tcp.c
C src/mca/oob/tcp/oob_tcp.h
U src/mca/ras/base/ras_base_alloc.c
U src/mca/rds/hostfile/rds_hostfile.c
U src/mca/rds/hostfile/rds_hostfile_lex.l
U src/mca/soh/base/soh_base_get_proc_soh.c
U src/mca/pls/bproc_seed/pls_bproc_seed.c
U src/mca/pls/rsh/pls_rsh_component.c
C src/mca/ptl/gm/src/ptl_gm_priv.c
U src/mca/rmgr/base/rmgr_base_stage_gate.c
U src/mca/base/mca_base_param.c
U src/mca/iof/base/base.h
U src/mca/iof/base/iof_base_select.c
U src/mca/iof/base/iof_base_open.c
U src/mca/iof/base/iof_base_close.c
U src/util/univ_info.h
U src/util/proc_info.c
U src/util/proc_info.h
U src/util/sys_info.c
U src/util/sys_info.h
U src/util/cmd_line.c
U src/util/univ_info.c
A ISSUES
UPDATE MERGE 1:
svn merge -r4821:4892 svn+ssh://svn.open-mpi.org/l/svn/ompi/branches/tim .
RESULTS:
M test/mca/gpr
M test/mca/gpr/gpr_test.c
M test/mca/gpr/gpr_test_proxy.c

Просмотреть файл

@ -58,6 +58,7 @@
#endif
#include "event.h"
#include "util/output.h"
#if OMPI_EVENT_USE_SIGNALS
#include "evsignal.h"
#endif
@ -206,7 +207,7 @@ poll_dispatch(void *arg, struct timeval *tv)
if (res == -1) {
if (errno != EINTR) {
log_error("poll");
ompi_output(0, "poll failed with errno=%d\n", errno);
return (-1);
}

Просмотреть файл

@ -215,6 +215,29 @@ select_dispatch(void *arg, struct timeval *tv)
#endif
if (res == -1) {
if (errno == EBADF) {
/* poll each of the file descriptors individually to determine
* which is bad
*/
for (ev = TAILQ_FIRST(&ompi_eventqueue); ev != NULL; ev = next) {
next = TAILQ_NEXT(ev, ev_next);
tv->tv_sec = 0;
tv->tv_usec = 0;
memset(sop->event_readset, 0, sop->event_fdsz);
memset(sop->event_writeset, 0, sop->event_fdsz);
if (ev->ev_events & OMPI_EV_WRITE)
FD_SET(ev->ev_fd, sop->event_writeset);
if (ev->ev_events & OMPI_EV_READ)
FD_SET(ev->ev_fd, sop->event_readset);
res = select(sop->event_fds + 1, sop->event_readset,
sop->event_writeset, NULL, tv);
if(res < 0) {
ompi_output(0, "bad file descriptor: %d\n", ev->ev_fd);
ompi_event_del_i(ev);
}
}
}
if (errno != EINTR) {
ompi_output(0, "select failed with errno=%d\n", errno);
return (-1);

Просмотреть файл

@ -159,7 +159,6 @@ int mca_base_param_register_int(const char *type_name,
int default_value)
{
mca_base_param_storage_t storage;
mca_base_param_find(type_name,component_name,param_name);
storage.intval = default_value;
return param_register(type_name, component_name, param_name, mca_param_name,
@ -177,7 +176,6 @@ int mca_base_param_register_string(const char *type_name,
const char *default_value)
{
mca_base_param_storage_t storage;
mca_base_param_find(type_name,component_name,param_name);
if (NULL != default_value) {
storage.stringval = (char *) default_value;

Просмотреть файл

@ -44,7 +44,7 @@ extern "C" {
struct orte_iof_base_t {
int iof_output;
ompi_list_t iof_components_opened;
bool iof_component_selected;
bool iof_flush;
ompi_list_t iof_endpoints;
ompi_mutex_t iof_lock;
ompi_condition_t iof_condition;

Просмотреть файл

@ -32,9 +32,9 @@ int orte_iof_base_close(void)
/* We only need to flush if an iof component was successfully
selected */
if (orte_iof_base.iof_component_selected) {
if (orte_iof_base.iof_flush) {
orte_iof_base_flush();
orte_iof_base.iof_component_selected = false;
orte_iof_base.iof_flush = false;
}
/* shutdown any remaining opened components */

Просмотреть файл

@ -57,7 +57,7 @@ int orte_iof_base_open(void)
OBJ_CONSTRUCT(&orte_iof_base.iof_condition, ompi_condition_t);
OBJ_CONSTRUCT(&orte_iof_base.iof_fragments, ompi_free_list_t);
orte_iof_base.iof_waiting = 0;
orte_iof_base.iof_component_selected = false;
orte_iof_base.iof_flush = false;
/* lookup common parameters */
id = mca_base_param_register_int("iof","base","window_size",NULL,ORTE_IOF_BASE_MSG_MAX << 1);

Просмотреть файл

@ -100,7 +100,7 @@ int orte_iof_base_select(void)
/* setup reference to selected module */
if (NULL != selected_module) {
orte_iof = *selected_module;
orte_iof_base.iof_component_selected = true;
orte_iof_base.iof_flush = true;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -192,11 +192,16 @@ int mca_oob_tcp_component_open(void)
mca_oob_tcp_component.tcp_peer_retries =
mca_oob_tcp_param_register_int("peer_retries", 60);
mca_oob_tcp_component.tcp_debug =
mca_oob_tcp_param_register_int("debug", 1);
mca_oob_tcp_param_register_int("debug", 0);
mca_oob_tcp_component.tcp_include =
mca_oob_tcp_param_register_str("include", NULL);
mca_oob_tcp_component.tcp_exclude =
mca_oob_tcp_param_register_str("exclude", NULL);
mca_oob_tcp_component.tcp_include =
mca_oob_tcp_param_register_str("include", NULL);
mca_oob_tcp_component.tcp_exclude =
mca_oob_tcp_param_register_str("exclude", NULL);
/* initialize state */
mca_oob_tcp_component.tcp_listen_sd = -1;
@ -254,6 +259,14 @@ static void mca_oob_tcp_accept(void)
ompi_output(0, "mca_oob_tcp_accept: accept() failed with errno %d.", ompi_socket_errno);
return;
}
/* log the accept */
if(mca_oob_tcp_component.tcp_debug) {
ompi_output(0, "[%d,%d,%d] mca_oob_tcp_accept: %s:%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
inet_ntoa(addr.sin_addr),
addr.sin_port);
}
/* wait for receipt of peers process identifier to complete this connection */
event = OBJ_NEW(mca_oob_tcp_event_t);
@ -279,13 +292,6 @@ static int mca_oob_tcp_create_listen(void)
ompi_output(0,"mca_oob_tcp_component_init: socket() failed with errno=%d", ompi_socket_errno);
return OMPI_ERROR;
}
/* allow port to be re-used - for temporary fixed port numbers */
if (setsockopt(
mca_oob_tcp_component.tcp_listen_sd, SOL_SOCKET, SO_REUSEADDR, (char *)&optval, sizeof(optval)) < 0) {
ompi_output(0, "mca_oob_tcp_create_listen: setsockopt(SO_REUSEADDR) failed with errno=%d\n",
ompi_socket_errno);
}
memset(&inaddr, 0, sizeof(inaddr));
inaddr.sin_family = AF_INET;
inaddr.sin_addr.s_addr = INADDR_ANY;
@ -356,7 +362,7 @@ static void mca_oob_tcp_recv_handler(int sd, short flags, void* user)
/* recv the process identifier */
while((rc = recv(sd, (char *)guid, sizeof(guid), 0)) != sizeof(guid)) {
if(rc >= 0) {
if(mca_oob_tcp_component.tcp_debug > 3) {
if(mca_oob_tcp_component.tcp_debug > 1) {
ompi_output(0, "[%d,%d,%d] mca_oob_tcp_recv_handler: peer closed connection",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
@ -411,7 +417,7 @@ static void mca_oob_tcp_recv_handler(int sd, short flags, void* user)
}
/* is the peer instance willing to accept this connection */
if(mca_oob_tcp_peer_accept(peer, sd) == false) {
if(mca_oob_tcp_component.tcp_debug > 1) {
if(mca_oob_tcp_component.tcp_debug > 0) {
ompi_output(0, "[%d,%d,%d]-[%d,%d,%d] mca_oob_tcp_recv_handler: "
"rejected connection from [%d,%d,%d] connection state %d",
ORTE_NAME_ARGS(orte_process_info.my_name),
@ -629,8 +635,10 @@ int mca_oob_tcp_init(void)
int rc;
ompi_list_item_t* item;
/* get my jobid */
/* random delay to stagger connections back to seed */
usleep((orte_process_info.num_procs % 100) * 10000);
/* get my jobid */
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid,
orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
@ -656,7 +664,7 @@ int mca_oob_tcp_init(void)
ompi_list_append(&mca_oob_tcp_component.tcp_subscriptions, &subscription->item);
OMPI_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
if(mca_oob_tcp_component.tcp_debug > 1) {
if(mca_oob_tcp_component.tcp_debug > 2) {
ompi_output(0, "[%d,%d,%d] mca_oob_tcp_init: calling orte_gpr.subscribe\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
@ -800,7 +808,7 @@ int mca_oob_tcp_init(void)
return rc;
}
if(mca_oob_tcp_component.tcp_debug > 1) {
if(mca_oob_tcp_component.tcp_debug > 2) {
ompi_output(0, "[%d,%d,%d] mca_oob_tcp_init: calling orte_gpr.put(%s)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
value->segment);

Просмотреть файл

@ -241,8 +241,8 @@ void mca_oob_tcp_registry_callback(
*/
struct mca_oob_tcp_component_t {
mca_oob_base_component_1_0_0_t super; /**< base OOB component */
char* tcp_include; /**< list of interfaces to include */
char* tcp_exclude; /**< list of interfaces to exclude */
char* tcp_include; /**< list of ip interfaces to include */
char* tcp_exclude; /**< list of ip interfaces to exclude */
int tcp_listen_sd; /**< listen socket for incoming connection requests */
unsigned short tcp_listen_port; /**< listen port */
ompi_list_t tcp_subscriptions; /**< list of registry subscriptions */

Просмотреть файл

@ -284,7 +284,7 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
ompi_socket_errno);
mca_oob_tcp_peer_close(peer);
mca_oob_tcp_peer_shutdown(peer);
ompi_evtimer_add(&peer->peer_timer_event, &tv);
return OMPI_ERR_UNREACH;
}
@ -316,10 +316,11 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
return rc;
}
if(mca_oob_tcp_component.tcp_debug > 2) {
ompi_output(0, "[%d,%d,%d]-[%d,%d,%d] mca_oob_tcp_peer_start_connect: connecting to: %s:%d\n",
if(mca_oob_tcp_component.tcp_debug > 0) {
ompi_output(0, "[%d,%d,%d]-[%d,%d,%d] mca_oob_tcp_peer_start_connect: connecting port %d to: %s:%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
ntohs(mca_oob_tcp_component.tcp_listen_port),
inet_ntoa(inaddr.sin_addr),
ntohs(inaddr.sin_port));
}
@ -382,16 +383,14 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer)
if(so_error == EINPROGRESS) {
ompi_event_add(&peer->peer_send_event, 0);
return;
} else if (so_error == ECONNREFUSED) {
} else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) {
struct timeval tv = { 1,0 };
ompi_output(0, "[%d,%d,%d]-[%d,%d,%d] mca_oob_tcp_peer_complete_connect: "
"connection refused - retrying\n",
"connection failed (errno=%d) - retrying (pid=%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)));
mca_oob_tcp_peer_close(peer);
if(peer->peer_retries > mca_oob_tcp_component.tcp_peer_retries) {
return;
}
ORTE_NAME_ARGS(&(peer->peer_name)),
so_error, getpid());
mca_oob_tcp_peer_shutdown(peer);
ompi_evtimer_add(&peer->peer_timer_event, &tv);
return;
} else if(so_error != 0) {
@ -438,7 +437,7 @@ static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer)
*/
void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
{
if(mca_oob_tcp_component.tcp_debug > 2) {
if(mca_oob_tcp_component.tcp_debug > 0) {
ompi_output(0, "[%d,%d,%d]-[%d,%d,%d] mca_oob_tcp_peer_close(%p) sd %d state %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
@ -537,7 +536,7 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer)
/* connected */
mca_oob_tcp_peer_connected(peer);
if(mca_oob_tcp_component.tcp_debug > 2) {
if(mca_oob_tcp_component.tcp_debug > 0) {
mca_oob_tcp_peer_dump(peer, "connected");
}
return OMPI_SUCCESS;
@ -557,7 +556,7 @@ static int mca_oob_tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, void* data,
/* remote closed connection */
if(retval == 0) {
if(mca_oob_tcp_component.tcp_debug > 3) {
if(mca_oob_tcp_component.tcp_debug > 0) {
ompi_output(0, "[%d,%d,%d]-[%d,%d,%d] mca_oob_tcp_peer_recv_blocking: "
"peer closed connection: peer state %d",
ORTE_NAME_ARGS(orte_process_info.my_name),
@ -841,7 +840,7 @@ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer, int sd)
mca_oob_tcp_peer_connected(peer);
ompi_event_add(&peer->peer_recv_event, 0);
if(mca_oob_tcp_component.tcp_debug > 2) {
if(mca_oob_tcp_component.tcp_debug > 0) {
mca_oob_tcp_peer_dump(peer, "accepted");
}
OMPI_THREAD_UNLOCK(&peer->peer_lock);
@ -875,6 +874,7 @@ static void mca_oob_tcp_peer_timer_handler(int sd, short flags, void* user)
{
/* start the connection to the peer */
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)user;
ompi_output(0, "mca_oob_tcp_peer_timer_handler\n");
OMPI_THREAD_LOCK(&peer->peer_lock);
if(peer->peer_state == MCA_OOB_TCP_CLOSED)
mca_oob_tcp_peer_start_connect(peer);

Просмотреть файл

@ -39,7 +39,7 @@ int mca_oob_tcp_recv(
mca_oob_tcp_msg_t *msg;
int i, rc = 0, size = 0;
if(mca_oob_tcp_component.tcp_debug > 1) {
if(mca_oob_tcp_component.tcp_debug > 3) {
ompi_output(0, "[%d,%d,%d]-[%d,%d,%d] mca_oob_tcp_recv: tag %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(peer),

Просмотреть файл

@ -70,7 +70,7 @@ int mca_oob_tcp_send(
if(NULL == peer)
return OMPI_ERR_UNREACH;
if(mca_oob_tcp_component.tcp_debug > 1) {
if(mca_oob_tcp_component.tcp_debug > 3) {
ompi_output(0, "[%d,%d,%d]-[%d,%d,%d] mca_oob_tcp_send: tag %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),

Просмотреть файл

@ -356,9 +356,7 @@ static int orte_pls_bproc_launch_app(
orte_vpid_t daemon_vpid_start = 0;
int rc, index;
char* uri;
char *var, *value;
char *env[4];
char **new_env;
char *var;
/* convert node names to bproc nodelist */
if(ORTE_SUCCESS != (rc = orte_pls_bproc_nodelist(map, &node_list, &num_nodes))) {
@ -377,8 +375,8 @@ static int orte_pls_bproc_launch_app(
}
var = mca_base_param_environ_variable("ns","nds",NULL);
asprintf(&value, "%s=pipe", var);
env[0] = value;
ompi_setenv(var, "pipe", true, &map->app->env);
free(var);
/* ns replica contact info */
if(NULL == orte_process_info.ns_replica) {
@ -390,8 +388,8 @@ static int orte_pls_bproc_launch_app(
orte_process_info.ns_replica_uri = orte_rml.get_uri();
}
var = mca_base_param_environ_variable("ns","replica","uri");
asprintf(&value, "%s=uri%s", var, orte_process_info.ns_replica_uri);
env[1] = value;
ompi_setenv(var,orte_process_info.ns_replica_uri, true, &map->app->env);
free(var);
/* gpr replica contact info */
if(NULL == orte_process_info.gpr_replica) {
@ -403,15 +401,11 @@ static int orte_pls_bproc_launch_app(
orte_process_info.gpr_replica_uri = orte_rml.get_uri();
}
var = mca_base_param_environ_variable("gpr","replica","uri");
asprintf(&value, "%s=uri%s", var, orte_process_info.gpr_replica_uri);
env[2] = value;
env[3] = NULL;
ompi_setenv(var,orte_process_info.gpr_replica_uri, true, &map->app->env);
free(var);
/* overwrite previously specified values with the above settings */
new_env = ompi_environ_merge(map->app->env, env);
ompi_argv_free(map->app->env);
map->app->env = new_env;
map->app->num_env = ompi_argv_count(new_env);
map->app->num_env = ompi_argv_count(map->app->env);
/* read process image */
if(ORTE_SUCCESS != (rc = orte_pls_bproc_dump(map->app, &image, &image_len))) {

Просмотреть файл

@ -137,7 +137,13 @@ int orte_pls_rsh_component_open(void)
param = orte_pls_rsh_param_register_string("agent","ssh");
mca_pls_rsh_component.argv = ompi_argv_split(param, ' ');
mca_pls_rsh_component.argc = ompi_argv_count(mca_pls_rsh_component.argv);
return (mca_pls_rsh_component.argc > 0) ? ORTE_SUCCESS : ORTE_ERR_BAD_PARAM;
if (mca_pls_rsh_component.argc > 0) {
mca_pls_rsh_component.path = strdup(mca_pls_rsh_component.argv[0]);
return ORTE_SUCCESS;
} else {
mca_pls_rsh_component.path = NULL;
return ORTE_ERR_BAD_PARAM;
}
}
@ -170,7 +176,10 @@ int orte_pls_rsh_component_close(void)
OBJ_DESTRUCT(&mca_pls_rsh_component.lock);
OBJ_DESTRUCT(&mca_pls_rsh_component.cond);
ompi_argv_free(mca_pls_rsh_component.argv);
if(NULL != mca_pls_rsh_component.argv)
ompi_argv_free(mca_pls_rsh_component.argv);
if(NULL != mca_pls_rsh_component.path)
free(mca_pls_rsh_component.path);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -178,6 +178,13 @@ validate:
}
}
/* If we still didn't get enough, it's an error */
if (num_allocated < num_requested) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
rc = orte_ras_base_node_assign(&allocated, jobid);
cleanup:

Просмотреть файл

@ -25,6 +25,7 @@
#include "mca/mca.h"
#include "mca/base/base.h"
#include "mca/ras/base/ras_base_node.h"
#include "mca/errmgr/errmgr.h"
#include "rds_hostfile.h"
#include "rds_hostfile_lex.h"
@ -105,10 +106,14 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
OBJ_RELEASE(node);
return OMPI_ERROR;
}
if(node->node_slots != (size_t)rc) {
if (node->node_slots != (size_t)rc) {
node->node_slots = rc;
update++;
}
/* Ensure that node_slots_max >= node_slots */
if (node->node_slots_max < node->node_slots) {
node->node_slots_max = node->node_slots;
}
break;
case ORTE_RDS_HOSTFILE_SLOTS_MAX:
@ -116,10 +121,17 @@ static int orte_rds_hostfile_parse_line(int token, ompi_list_t* existing, ompi_l
if (rc < 0) {
OBJ_RELEASE(node);
return OMPI_ERROR;
}
if(node->node_slots_max != (size_t)rc) {
node->node_slots_max = rc;
update++;
}
/* Only take this update if it puts us > node_slots */
if (((size_t) rc) > node->node_slots) {
if (node->node_slots_max != (size_t)rc) {
node->node_slots_max = rc;
update++;
}
} else {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
OBJ_RELEASE(node);
return OMPI_ERROR;
}
break;
@ -208,6 +220,8 @@ static int orte_rds_hostfile_query(void)
goto cleanup;
}
rc = mca_base_param_find("rds", "hostfile", "path");
mca_base_param_lookup_string(rc, &mca_rds_hostfile_component.path);
rc = orte_rds_hostfile_parse(mca_rds_hostfile_component.path, &existing, &updates);
if (ORTE_ERR_NOT_FOUND == rc) {
if(mca_rds_hostfile_component.default_hostfile) {

Просмотреть файл

@ -53,7 +53,13 @@ count { orte_rds_hostfile_value.sval = yytext;
return ORTE_RDS_HOSTFILE_COUNT; }
slots { orte_rds_hostfile_value.sval = yytext;
return ORTE_RDS_HOSTFILE_SLOTS; }
slots-max { orte_rds_hostfile_value.sval = yytext;
"slots-max" { orte_rds_hostfile_value.sval = yytext;
return ORTE_RDS_HOSTFILE_SLOTS_MAX; }
slots_max { orte_rds_hostfile_value.sval = yytext;
return ORTE_RDS_HOSTFILE_SLOTS_MAX; }
"max-slots" { orte_rds_hostfile_value.sval = yytext;
return ORTE_RDS_HOSTFILE_SLOTS_MAX; }
max_slots { orte_rds_hostfile_value.sval = yytext;
return ORTE_RDS_HOSTFILE_SLOTS_MAX; }
[0-9]+ { orte_rds_hostfile_value.ival = atol(yytext);
@ -62,7 +68,7 @@ slots-max { orte_rds_hostfile_value.sval = yytext;
[A-Za-z0-9_\-\.]* { orte_rds_hostfile_value.sval = yytext;
return ORTE_RDS_HOSTFILE_STRING; }
. { orte_rds_hostfile_value.sval = yytext;
. { orte_rds_hostfile_value.sval = yytext;
return ORTE_RDS_HOSTFILE_ERROR; }
%%

Просмотреть файл

@ -131,7 +131,7 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
trig.tokens = (char**)malloc(sizeof(char*));
if (NULL == trig.tokens) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -141,21 +141,21 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
trig.keyvals = (orte_gpr_keyval_t**)malloc(2*sizeof(orte_gpr_keyval_t*));
if (NULL == trig.keyvals) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
trig.keyvals[0] = OBJ_NEW(orte_gpr_keyval_t);
if (NULL == trig.keyvals[0]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
trig.keyvals[1] = OBJ_NEW(orte_gpr_keyval_t);
if (NULL == trig.keyvals[1]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -167,7 +167,7 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
trig.keyvals[0]->key = strdup(ORTE_JOB_SLOTS_KEY);
if (NULL == trig.keyvals[0]->key) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -188,7 +188,7 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
trig.keyvals[1]->key = strdup(keys[i]);
if (NULL == trig.keyvals[1]->key) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -232,21 +232,21 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
trig.keyvals = (orte_gpr_keyval_t**)malloc(sizeof(orte_gpr_keyval_t**));
if (NULL == trig.keyvals) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
trig.keyvals[0] = OBJ_NEW(orte_gpr_keyval_t);
if (NULL == trig.keyvals[0]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
trig.keyvals[0]->key = strdup(ORTE_PROC_NUM_ABORTED);
if (NULL == trig.keyvals[0]->key) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&value);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -499,6 +499,55 @@ int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_c
free(trig.keyvals[1]->key);
trig.keyvals[1]->key = NULL;
}
/* Now do the abort trigger */
sub.keys[0] = strdup(ORTE_PROC_NUM_ABORTED);
if (NULL == sub.keys[0]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
OBJ_RELEASE(trig.keyvals[0]);
OBJ_RELEASE(trig.keyvals[1]);
free(trig.keyvals);
trig.cnt = 1;
trig.keyvals = (orte_gpr_keyval_t**)malloc(sizeof(orte_gpr_keyval_t**));
if (NULL == trig.keyvals) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
trig.keyvals[0] = OBJ_NEW(orte_gpr_keyval_t);
if (NULL == trig.keyvals[0]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
trig.keyvals[0]->key = strdup(ORTE_PROC_NUM_ABORTED);
if (NULL == trig.keyvals[0]->key) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_ERR_OUT_OF_RESOURCE;
}
trig.keyvals[0]->type = ORTE_INT32;
trig.keyvals[0]->value.i32 = 1; /* trigger on the first process that aborts */
subs = &sub;
trigs = &trig;
rc = orte_gpr.subscribe(
ORTE_GPR_TRIG_ALL_AT,
1, &subs,
1, &trigs,
&rc);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&sub);
OBJ_DESTRUCT(&trig);
return ORTE_SUCCESS;

Просмотреть файл

@ -90,7 +90,12 @@ CLEANUP:
}
free(tokens);
if (NULL != values) OBJ_RELEASE(values);
if (NULL != values) {
for (i=0; i < cnt; i++) {
OBJ_RELEASE(values[i]);
}
free(values);
}
return rc;
}

Просмотреть файл

@ -26,6 +26,9 @@
#include "mca/iof/base/base.h"
#include "mca/rmgr/base/base.h"
#include "util/session_dir.h"
#include "util/sys_info.h"
#include "util/proc_info.h"
#include "util/univ_info.h"
/**
* Leave ORTE.
@ -52,6 +55,11 @@ int orte_finalize(void)
orte_session_dir_finalize();
#endif
/* clean out the global structures */
orte_sys_info_finalize();
orte_proc_info_finalize();
orte_univ_info_finalize();
ompi_malloc_finalize();
return ORTE_SUCCESS;

Просмотреть файл

@ -47,9 +47,9 @@
int orte_restart(orte_process_name_t *name, const char* uri)
{
int rc;
orte_process_name_t* old_name;
orte_process_name_t* new_name;
if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&old_name, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
@ -72,6 +72,7 @@ int orte_restart(orte_process_name_t *name, const char* uri)
* Close selected components.
*/
orte_iof_base.iof_flush = false;
if (ORTE_SUCCESS != (rc = orte_iof_base_close())) {
ORTE_ERROR_LOG(rc);
return rc;

Просмотреть файл

@ -114,8 +114,11 @@ ompi_cmd_line_init_t cmd_line_init[] = {
"Number of processes to run" },
/* Set a hostfile */
{ "hostfile", NULL, NULL, '\0', NULL, "hostfile", 1,
&orterun_globals.num_procs, OMPI_CMD_LINE_TYPE_INT,
{ "rds", "hostfile", "path", '\0', "hostfile", "hostfile", 1,
NULL, OMPI_CMD_LINE_TYPE_STRING,
"Provide a hostfile" },
{ "rds", "hostfile", "path", '\0', "machinefile", "machinefile", 1,
NULL, OMPI_CMD_LINE_TYPE_STRING,
"Provide a hostfile" },
/* Don't wait for the process to finish before exiting */

Просмотреть файл

@ -55,7 +55,7 @@ struct cmd_line_option_t {
char *clo_description;
ompi_cmd_line_type_t clo_type;
int clo_mca_param_id;
char *clo_mca_param_env_var;
void *clo_variable_dest;
bool clo_variable_set;
};
@ -409,7 +409,7 @@ int ompi_cmd_line_parse(ompi_cmd_line_t *cmd, bool ignore_unknown,
variable dest and/or MCA parameter */
if (0 == j &&
(option->clo_mca_param_id >= 0 ||
(NULL != option->clo_mca_param_env_var ||
NULL != option->clo_variable_dest)) {
set_dest(option, cmd->lcl_argv[i]);
}
@ -805,7 +805,7 @@ static void option_constructor(cmd_line_option_t *o)
o->clo_description = NULL;
o->clo_type = OMPI_CMD_LINE_TYPE_NULL;
o->clo_mca_param_id = -1;
o->clo_mca_param_env_var = NULL;
o->clo_variable_dest = NULL;
o->clo_variable_set = false;
}
@ -822,6 +822,9 @@ static void option_destructor(cmd_line_option_t *o)
if (NULL != o->clo_description) {
free(o->clo_description);
}
if (NULL != o->clo_mca_param_env_var) {
free(o->clo_mca_param_env_var);
}
}
@ -930,10 +933,10 @@ static int make_opt(ompi_cmd_line_t *cmd, ompi_cmd_line_init_t *e)
option->clo_type = e->ocl_variable_type;
option->clo_variable_dest = e->ocl_variable_dest;
if (NULL != e->ocl_mca_type_name) {
option->clo_mca_param_id =
mca_base_param_find(e->ocl_mca_type_name,
e->ocl_mca_component_name,
e->ocl_mca_param_name);
option->clo_mca_param_env_var =
mca_base_param_environ_variable(e->ocl_mca_type_name,
e->ocl_mca_component_name,
e->ocl_mca_param_name);
}
/* Append the item, serializing thread access */
@ -1076,23 +1079,33 @@ static cmd_line_option_t *find_option(ompi_cmd_line_t *cmd,
static void set_dest(cmd_line_option_t *option, char *sval)
{
int ival = atoi(sval);
char *str;
/* Set MCA param */
/* Set MCA param. We do this in the environment because the MCA
parameter may not have been registered yet -- and if it isn't
registered, we don't really want to register a dummy one
because we don't know what its type and default value should
be. These are solvable problems (e.g., make a re-registration
overwrite everything), but it's far simpler to just leave the
registered table alone and set an environment variable with the
desired value. The environment variable will get picked up
during a normal parameter lookup, and all will be well. */
if (option->clo_mca_param_id >= 0) {
if (NULL != option->clo_mca_param_env_var) {
switch(option->clo_type) {
case OMPI_CMD_LINE_TYPE_STRING:
mca_base_param_set_string(option->clo_mca_param_id, sval);
break;
case OMPI_CMD_LINE_TYPE_INT:
mca_base_param_set_int(option->clo_mca_param_id, ival);
asprintf(&str, "%s=%s", option->clo_mca_param_env_var, sval);
break;
case OMPI_CMD_LINE_TYPE_BOOL:
mca_base_param_set_int(option->clo_mca_param_id, 1);
asprintf(&str, "%s=1", option->clo_mca_param_env_var);
break;
default:
break;
}
if (NULL != str) {
putenv(str);
}
}
/* Set variable */

Просмотреть файл

@ -84,3 +84,61 @@ int orte_proc_info(void)
return ORTE_SUCCESS;
}
/**
 * Release the heap-allocated fields of the global orte_process_info
 * structure.
 *
 * Every pointer is reset to NULL immediately after it is freed so that
 * the structure never holds dangling pointers: calling this function a
 * second time is harmless, and any later NULL test on these fields sees
 * them as unset instead of pointing at freed memory.
 *
 * @retval ORTE_SUCCESS Always.
 */
int orte_proc_info_finalize(void)
{
    if (NULL != orte_process_info.my_name) {
        free(orte_process_info.my_name);
        orte_process_info.my_name = NULL;
    }
    if (NULL != orte_process_info.ns_replica_uri) {
        free(orte_process_info.ns_replica_uri);
        orte_process_info.ns_replica_uri = NULL;
    }
    if (NULL != orte_process_info.gpr_replica_uri) {
        free(orte_process_info.gpr_replica_uri);
        orte_process_info.gpr_replica_uri = NULL;
    }
    if (NULL != orte_process_info.ns_replica) {
        free(orte_process_info.ns_replica);
        orte_process_info.ns_replica = NULL;
    }
    if (NULL != orte_process_info.gpr_replica) {
        free(orte_process_info.gpr_replica);
        orte_process_info.gpr_replica = NULL;
    }
    if (NULL != orte_process_info.tmpdir_base) {
        free(orte_process_info.tmpdir_base);
        orte_process_info.tmpdir_base = NULL;
    }
    if (NULL != orte_process_info.top_session_dir) {
        free(orte_process_info.top_session_dir);
        orte_process_info.top_session_dir = NULL;
    }
    if (NULL != orte_process_info.universe_session_dir) {
        free(orte_process_info.universe_session_dir);
        orte_process_info.universe_session_dir = NULL;
    }
    if (NULL != orte_process_info.job_session_dir) {
        free(orte_process_info.job_session_dir);
        orte_process_info.job_session_dir = NULL;
    }
    if (NULL != orte_process_info.proc_session_dir) {
        free(orte_process_info.proc_session_dir);
        orte_process_info.proc_session_dir = NULL;
    }
    if (NULL != orte_process_info.sock_stdin) {
        free(orte_process_info.sock_stdin);
        orte_process_info.sock_stdin = NULL;
    }
    if (NULL != orte_process_info.sock_stdout) {
        free(orte_process_info.sock_stdout);
        orte_process_info.sock_stdout = NULL;
    }
    if (NULL != orte_process_info.sock_stderr) {
        free(orte_process_info.sock_stderr);
        orte_process_info.sock_stderr = NULL;
    }
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -105,6 +105,7 @@ OMPI_DECLSPEC extern orte_proc_info_t orte_process_info;
OMPI_DECLSPEC int orte_proc_info(void);
OMPI_DECLSPEC int orte_proc_info_finalize(void);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -27,7 +27,7 @@
#include <sys/stat.h>
#include "include/constants.h"
#include "include/orte_constants.h"
#include "util/output.h"
#include "util/sys_info.h"
@ -60,7 +60,7 @@ int orte_sys_info(void)
#endif
if (orte_system_info.init) {
return OMPI_SUCCESS;
return ORTE_SUCCESS;
}
if (0 > uname(&sys_info)) { /* have an error - set utsname values to indicate */
@ -84,7 +84,7 @@ int orte_sys_info(void)
free(orte_system_info.machine);
orte_system_info.machine = NULL;
}
return OMPI_ERROR;
return ORTE_ERROR;
} else {
orte_system_info.sysname = strdup(sys_info.sysname);
if(NULL == orte_system_info.nodename) {
@ -105,6 +105,7 @@ int orte_sys_info(void)
}
sep[1] = '\0';
orte_system_info.path_sep = strdup(sep);
free(path_name);
}
#else
/* we can hardcode windows path separator to be "\" */
@ -131,5 +132,37 @@ int orte_sys_info(void)
/* set the init flag */
orte_system_info.init = true; /* only indicates that we have been through here once - still have to test for NULL values */
return(OMPI_SUCCESS);
return(ORTE_SUCCESS);
}
/**
 * Release the heap-allocated fields of the global orte_system_info
 * structure.
 *
 * Each pointer is reset to NULL after it is freed, and the init flag is
 * cleared.  orte_sys_info() returns early while the init flag is set, so
 * leaving the flag set here would make a later re-initialization keep
 * pointers to memory that has already been freed.
 *
 * @retval ORTE_SUCCESS Always.
 */
int orte_sys_info_finalize(void)
{
    if (NULL != orte_system_info.sysname) {
        free(orte_system_info.sysname);
        orte_system_info.sysname = NULL;
    }
    if (NULL != orte_system_info.nodename) {
        free(orte_system_info.nodename);
        orte_system_info.nodename = NULL;
    }
    if (NULL != orte_system_info.release) {
        free(orte_system_info.release);
        orte_system_info.release = NULL;
    }
    if (NULL != orte_system_info.version) {
        free(orte_system_info.version);
        orte_system_info.version = NULL;
    }
    if (NULL != orte_system_info.machine) {
        free(orte_system_info.machine);
        orte_system_info.machine = NULL;
    }
    if (NULL != orte_system_info.path_sep) {
        free(orte_system_info.path_sep);
        orte_system_info.path_sep = NULL;
    }
    if (NULL != orte_system_info.user) {
        free(orte_system_info.user);
        orte_system_info.user = NULL;
    }
    if (NULL != orte_system_info.enviro) {
        free(orte_system_info.enviro);
        orte_system_info.enviro = NULL;
    }
    if (NULL != orte_system_info.suffix) {
        free(orte_system_info.suffix);
        orte_system_info.suffix = NULL;
    }

    /* allow orte_sys_info() to repopulate the structure if called again */
    orte_system_info.init = false;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -90,6 +90,18 @@ OMPI_DECLSPEC extern orte_sys_info_t orte_system_info;
*/
OMPI_DECLSPEC int orte_sys_info(void);
/*
* \internal
*
* Free any memory held in the system_info structure
*
* Called from \c orte_finalize
*
* @retval ORTE_SUCCESS If all values successfully released
* @retval ORTE_ERROR If any problems occur
*/
OMPI_DECLSPEC int orte_sys_info_finalize(void);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -34,6 +34,7 @@
#include "util/univ_info.h"
orte_universe_t orte_universe_info = {
/* .init = */ false,
/* .path = */ NULL,
/* .name = */ NULL,
/* .host = */ NULL,
@ -51,43 +52,67 @@ int orte_univ_info(void)
{
/*
 * NOTE(review): this span comes from a rendered diff hunk in which removed
 * and added lines appear interleaved without +/- markers, so most of the
 * "universe" parameter registrations below appear twice.  Confirm against
 * the actual univ_info.c before relying on the statement order shown here.
 *
 * Each register/lookup pair registers an MCA parameter under the
 * "universe" type and stores the looked-up value in the matching field of
 * the global orte_universe_info structure.
 */
int id, tmp;
id = mca_base_param_register_string("universe", "path", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.path));
id = mca_base_param_register_string("universe", "name", NULL, NULL, "default-universe");
mca_base_param_lookup_string(id, &(orte_universe_info.name));
id = mca_base_param_register_string("universe", "host", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.host));
/* uid is not set via parameter, but is determined elsewhere */
/* Guard: the block below runs only while init is false; init is set to
 * true at the end of the guarded block. */
if (!orte_universe_info.init) {
id = mca_base_param_register_string("universe", "path", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.path));
id = mca_base_param_register_int("universe", "persistence", NULL, NULL, (int)false);
mca_base_param_lookup_int(id, &tmp);
/* The integer parameter value is mapped onto the boolean field. */
if (tmp) {
orte_universe_info.persistence = true;
} else {
orte_universe_info.persistence = false;
}
id = mca_base_param_register_string("universe", "scope", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.scope));
id = mca_base_param_register_int("universe", "console", NULL, NULL, (int)false);
mca_base_param_lookup_int(id, &tmp);
/* The integer parameter value is mapped onto the boolean field. */
if (tmp) {
orte_universe_info.console = true;
} else {
orte_universe_info.console = false;
}
id = mca_base_param_register_string("universe", "uri", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.seed_uri));
/* console connected is set elsewhere */
id = mca_base_param_register_string("universe", "name", NULL, NULL, "default-universe");
mca_base_param_lookup_string(id, &(orte_universe_info.name));
id = mca_base_param_register_string("universe", "script", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.scriptfile));
id = mca_base_param_register_string("universe", "host", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.host));
/* uid is not set via parameter, but is determined elsewhere */
id = mca_base_param_register_int("universe", "persistence", NULL, NULL, (int)false);
mca_base_param_lookup_int(id, &tmp);
if (tmp) {
orte_universe_info.persistence = true;
} else {
orte_universe_info.persistence = false;
}
id = mca_base_param_register_string("universe", "scope", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.scope));
id = mca_base_param_register_int("universe", "console", NULL, NULL, (int)false);
mca_base_param_lookup_int(id, &tmp);
if (tmp) {
orte_universe_info.console = true;
} else {
orte_universe_info.console = false;
}
id = mca_base_param_register_string("universe", "uri", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.seed_uri));
/* console connected is set elsewhere */
id = mca_base_param_register_string("universe", "script", NULL, NULL, NULL);
mca_base_param_lookup_string(id, &(orte_universe_info.scriptfile));
/* Mark the structure as populated so the guarded block is skipped on
 * subsequent calls. */
orte_universe_info.init = true;
}
return(ORTE_SUCCESS);
}
/**
 * Release every string held in the global orte_universe_info structure.
 *
 * Each pointer is reset to NULL once it has been freed, so a repeated call
 * cannot double-free.  The init flag is cleared too: orte_univ_info() only
 * looks up its parameters while that flag is false, so leaving it set
 * would keep the released pointers in place after a finalize/init cycle.
 *
 * @retval ORTE_SUCCESS Always.
 */
int orte_univ_info_finalize(void)
{
    /* free(NULL) is a no-op, so the pointers need no NULL guards. */
    free(orte_universe_info.path);
    orte_universe_info.path = NULL;

    free(orte_universe_info.name);
    orte_universe_info.name = NULL;

    free(orte_universe_info.host);
    orte_universe_info.host = NULL;

    free(orte_universe_info.uid);
    orte_universe_info.uid = NULL;

    free(orte_universe_info.scope);
    orte_universe_info.scope = NULL;

    free(orte_universe_info.seed_uri);
    orte_universe_info.seed_uri = NULL;

    free(orte_universe_info.scriptfile);
    orte_universe_info.scriptfile = NULL;

    /* The structure no longer holds valid data. */
    orte_universe_info.init = false;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -40,6 +40,7 @@ extern "C" {
* instanced in ompi_rte_init.c */
struct orte_universe_t {
bool init;
char *path;
char *name;
char *host;
@ -68,6 +69,8 @@ OMPI_DECLSPEC extern orte_universe_t orte_universe_info;
*/
OMPI_DECLSPEC int orte_univ_info(void);
OMPI_DECLSPEC int orte_univ_info_finalize(void);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif