
Turns out that it's a really Bad Idea(tm) to call tm_spawn() and then
not keep the resulting tm_event_t, because the back-end TM library
actually caches a bunch of stuff on it for internal processing and
doesn't let go of it until tm_poll().
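
A minimal sketch of the pattern this implies (hypothetical names --
num_nodes, node_ids, local_errno -- no error handling, and not the
literal code in this commit):

    tm_event_t *events = malloc(sizeof(tm_event_t) * num_nodes);
    tm_task_id task_id;
    tm_event_t done;
    int i, local_errno;

    /* one event slot per spawn; TM hangs internal state off each
       event until that event is reaped by tm_poll() */
    for (i = 0; i < num_nodes; ++i) {
        tm_spawn(argc, argv, env, node_ids[i], &task_id, &events[i]);
    }

    for (i = 0; i < num_nodes; ++i) {
        tm_poll(TM_NULL_EVENT, &done, 1, &local_errno);
    }
    free(events);   /* safe only after every event has been polled */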

tm_event_t's are similar to (but slightly different from)
MPI_Requests: you can't do a million MPI_Isend()'s on a single
MPI_Request -- a) you need an array of MPI_Requests to fill, and b)
you need to keep them around until all the requests have completed.
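
The same idea in MPI terms (hypothetical n, buf, dest, and tag):

    MPI_Request *reqs = malloc(n * sizeof(MPI_Request));
    for (int i = 0; i < n; ++i) {
        MPI_Isend(&buf[i], 1, MPI_INT, dest, tag, MPI_COMM_WORLD, &reqs[i]);
    }
    /* the requests must stay live until the operations complete */
    MPI_Waitall(n, reqs, MPI_STATUSES_IGNORE);
    free(reqs);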

This commit was SVN r10820.
This commit is contained in:
Jeff Squyres 2006-07-14 22:04:41 +00:00
parent 2897d2ef9b
commit ffddfc5629


@@ -78,7 +78,8 @@ static int pls_tm_finalize(void);
 static int pls_tm_connect(void);
 static int pls_tm_disconnect(void);
-static int pls_tm_start_proc(char *nodename, int argc, char **argv, char **env);
+static int pls_tm_start_proc(char *nodename, int argc, char **argv,
+                             char **env, tm_event_t *event);
 static int pls_tm_check_path(char *exe, char **env);
 
 /*
@@ -114,6 +115,7 @@ pls_tm_launch(orte_jobid_t jobid)
     bool connected = false;
     int launched = 0, i;
     char *bin_base = NULL, *lib_base = NULL;
+    tm_event_t *events = NULL;
 
     /* Query the list of nodes allocated and mapped to this job.
      * We need the entire mapping for a couple of reasons:
@@ -145,6 +147,13 @@ pls_tm_launch(orte_jobid_t jobid)
         goto cleanup;
     }
 
+    /* Allocate a bunch of TM events to use for tm_spawn()ing */
+    events = malloc(sizeof(tm_event_t) * num_nodes);
+    if (NULL == events) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        goto cleanup;
+    }
+
     /* need integer value for command line parameter */
     asprintf(&jobid_string, "%lu", (unsigned long) jobid);
@@ -227,7 +236,6 @@ pls_tm_launch(orte_jobid_t jobid)
     /* Figure out the basenames for the libdir and bindir.  There is a
        lengthy comment about this in pls_rsh_module.c explaining all
        the rationale for how / why we're doing this. */
-
     lib_base = opal_basename(OPAL_LIBDIR);
     bin_base = opal_basename(OPAL_BINDIR);
@@ -368,7 +376,8 @@ pls_tm_launch(orte_jobid_t jobid)
             }
         }
 
-        rc = pls_tm_start_proc(node->node_name, argc, argv, env);
+        rc = pls_tm_start_proc(node->node_name, argc, argv, env,
+                               events + launched);
         if (ORTE_SUCCESS != rc) {
             opal_output(0, "pls:tm: start_procs returned error %d", rc);
             goto cleanup;
@@ -399,6 +408,9 @@ pls_tm_launch(orte_jobid_t jobid)
     if (connected) {
         pls_tm_disconnect();
     }
+    if (NULL != events) {
+        free(events);
+    }
 
     while (NULL != (m_item = opal_list_remove_first(&mapping))) {
         OBJ_RELEASE(m_item);
@@ -619,18 +631,18 @@ do_tm_resolve(char *hostname, tm_node_id *tnodeid)
 static int
-pls_tm_start_proc(char *nodename, int argc, char **argv, char **env)
+pls_tm_start_proc(char *nodename, int argc, char **argv, char **env,
+                  tm_event_t *event)
 {
     int ret;
     tm_node_id node_id;
     tm_task_id task_id;
-    tm_event_t event;
 
     /* get the tm node id for this node */
     ret = do_tm_resolve(nodename, &node_id);
     if (ORTE_SUCCESS != ret) return ret;
 
-    ret = tm_spawn(argc, argv, env, node_id, &task_id, &event);
+    ret = tm_spawn(argc, argv, env, node_id, &task_id, event);
     if (TM_SUCCESS != ret) return ORTE_ERROR;
 
     return ORTE_SUCCESS;