Turns out that it's a really Bad Idea(tm) to tm_spawn() and then not keep the resulting tm_event_t, because the back-end TM library actually caches a bunch of stuff on it for internal processing and doesn't let go of it until tm_poll(). tm_event_t's are similar to (but slightly different from) MPI_Requests: you can't do a million MPI_Isend()'s on a single MPI_Request -- (a) you need an array of MPI_Requests to fill, and (b) you need to keep them around until all the requests have completed.

This commit was SVN r10820.
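The MPI analogy, as a minimal sketch (illustrative only, not code from this commit; send_to_all and its parameters are made-up names):

    #include <stdlib.h>
    #include <mpi.h>

    /* One request per MPI_Isend(), all kept alive until completion --
       the same discipline the TM fix below applies to tm_event_t's. */
    void send_to_all(int *bufs, int nprocs, MPI_Comm comm)
    {
        MPI_Request *reqs = malloc(sizeof(MPI_Request) * nprocs);
        if (NULL == reqs) return;

        for (int i = 0; i < nprocs; ++i) {
            MPI_Isend(&bufs[i], 1, MPI_INT, i, 0, comm, &reqs[i]);
        }

        /* The requests must stay around until every send completes */
        MPI_Waitall(nprocs, reqs, MPI_STATUSES_IGNORE);
        free(reqs);
    }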
This commit is contained in:
parent 2897d2ef9b
commit ffddfc5629
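The pre-commit code hit exactly that trap: pls_tm_start_proc() kept the event in a stack local and dropped it on return, while the TM library still held internal state keyed on it. In sketch form (reconstructed from the old lines in the diff below):

    tm_event_t event;                 /* stack local -- gone at return */
    ret = tm_spawn(argc, argv, env, node_id, &task_id, &event);
    /* returning here discards 'event' before tm_poll() can retire it */

The fix threads a caller-owned tm_event_t through instead, with pls_tm_launch() holding one event per spawned process: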
@@ -78,7 +78,8 @@ static int pls_tm_finalize(void);
 static int pls_tm_finalize(void);
 static int pls_tm_connect(void);
 static int pls_tm_disconnect(void);
-static int pls_tm_start_proc(char *nodename, int argc, char **argv, char **env);
+static int pls_tm_start_proc(char *nodename, int argc, char **argv,
+                             char **env, tm_event_t *event);
 static int pls_tm_check_path(char *exe, char **env);
 
 /*
@@ -114,6 +115,7 @@ pls_tm_launch(orte_jobid_t jobid)
     bool connected = false;
     int launched = 0, i;
     char *bin_base = NULL, *lib_base = NULL;
+    tm_event_t *events = NULL;
 
     /* Query the list of nodes allocated and mapped to this job.
      * We need the entire mapping for a couple of reasons:
@@ -145,6 +147,13 @@ pls_tm_launch(orte_jobid_t jobid)
         goto cleanup;
     }
 
+    /* Allocate a bunch of TM events to use for tm_spawn()ing */
+    events = malloc(sizeof(tm_event_t) * num_nodes);
+    if (NULL == events) {
+        rc = ORTE_ERR_OUT_OF_RESOURCE;
+        goto cleanup;
+    }
+
     /* need integer value for command line parameter */
     asprintf(&jobid_string, "%lu", (unsigned long) jobid);
 
@@ -227,7 +236,6 @@ pls_tm_launch(orte_jobid_t jobid)
     /* Figure out the basenames for the libdir and bindir.  There is a
        lengthy comment about this in pls_rsh_module.c explaining all
        the rationale for how / why we're doing this. */
-
     lib_base = opal_basename(OPAL_LIBDIR);
     bin_base = opal_basename(OPAL_BINDIR);
 
@@ -368,7 +376,8 @@ pls_tm_launch(orte_jobid_t jobid)
             }
         }
 
-        rc = pls_tm_start_proc(node->node_name, argc, argv, env);
+        rc = pls_tm_start_proc(node->node_name, argc, argv, env,
+                               events + launched);
         if (ORTE_SUCCESS != rc) {
             opal_output(0, "pls:tm: start_procs returned error %d", rc);
             goto cleanup;
@@ -399,6 +408,9 @@ pls_tm_launch(orte_jobid_t jobid)
     if (connected) {
         pls_tm_disconnect();
     }
+    if (NULL != events) {
+        free(events);
+    }
 
     while (NULL != (m_item = opal_list_remove_first(&mapping))) {
         OBJ_RELEASE(m_item);
@@ -619,18 +631,18 @@ do_tm_resolve(char *hostname, tm_node_id *tnodeid)
 
 
 static int
-pls_tm_start_proc(char *nodename, int argc, char **argv, char **env)
+pls_tm_start_proc(char *nodename, int argc, char **argv, char **env,
+                  tm_event_t *event)
 {
     int ret;
     tm_node_id node_id;
     tm_task_id task_id;
-    tm_event_t event;
 
     /* get the tm node id for this node */
     ret = do_tm_resolve(nodename, &node_id);
     if (ORTE_SUCCESS != ret) return ret;
 
-    ret = tm_spawn(argc, argv, env, node_id, &task_id, &event);
+    ret = tm_spawn(argc, argv, env, node_id, &task_id, event);
     if (TM_SUCCESS != ret) return ORTE_ERROR;
 
     return ORTE_SUCCESS;
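For context, the overall lifecycle this change establishes looks roughly like the following. This is a hedged sketch against the PBS/Torque TM API (tm_spawn()/tm_poll() as declared in tm.h); spawn_and_wait and its parameters are illustrative names, and in the real component the polling happens elsewhere:

    #include <stdlib.h>
    #include <tm.h>

    /* Sketch: one tm_event_t (and tm_task_id) per tm_spawn(), all kept
       alive until tm_poll() has retired every event. */
    static int spawn_and_wait(int num_nodes, tm_node_id *node_ids,
                              int ac, char **av, char **envp)
    {
        tm_event_t *events = malloc(sizeof(tm_event_t) * num_nodes);
        tm_task_id *task_ids = malloc(sizeof(tm_task_id) * num_nodes);
        int i, rc, local_errno;

        if (NULL == events || NULL == task_ids) goto error;

        /* TM caches internal state on each event until it is polled */
        for (i = 0; i < num_nodes; ++i) {
            rc = tm_spawn(ac, av, envp, node_ids[i],
                          &task_ids[i], &events[i]);
            if (TM_SUCCESS != rc) goto error;
        }

        /* Only after every event has come back may the arrays be freed */
        for (i = 0; i < num_nodes; ++i) {
            tm_event_t completed;
            rc = tm_poll(TM_NULL_EVENT, &completed, 1 /* block */,
                         &local_errno);
            if (TM_SUCCESS != rc) goto error;
        }

        free(events);
        free(task_ids);
        return 0;

    error:
        free(events);
        free(task_ids);
        return 1;
    }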