Cleanup compile issues - missing updates to some plm components and the slurm ras component
This commit was SVN r31921.
This commit is contained in:
parent 4b0c3dcd29
commit 65a35d92ef
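The hunks below all apply the same conversion: direct tests of the old jdata->controls / node->daemon_launched bitfields become ORTE_FLAG_TEST() calls against the new flag constants (ORTE_JOB_FLAG_RESTART, ORTE_JOB_FLAG_DEBUGGER_DAEMON, ORTE_NODE_FLAG_DAEMON_LAUNCHED). A minimal standalone sketch of that flag-test pattern; the types, macro names, and flag values here are simplified stand-ins, not the real ORTE definitions:

    /* Stand-in sketch: a dedicated flags word tested through a macro,
     * mirroring how ORTE_FLAG_TEST(obj, flag) is used in the diff. */
    #include <stdint.h>
    #include <stdio.h>

    #define MY_JOB_FLAG_RESTART          0x0001   /* hypothetical flag bits */
    #define MY_JOB_FLAG_DEBUGGER_DAEMON  0x0002

    typedef struct {
        uint16_t flags;   /* takes over tests formerly done on a 'controls' bitfield */
    } my_job_t;

    #define MY_FLAG_TEST(obj, flag)  (0 != ((obj)->flags & (flag)))
    #define MY_FLAG_SET(obj, flag)   ((obj)->flags |= (flag))

    int main(void)
    {
        my_job_t jdata = { .flags = 0 };

        MY_FLAG_SET(&jdata, MY_JOB_FLAG_RESTART);

        if (MY_FLAG_TEST(&jdata, MY_JOB_FLAG_RESTART)) {
            puts("restart: skip to the mapping stage");
        } else {
            puts("normal launch path");
        }
        return 0;
    }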
@@ -169,7 +169,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
         opal_argv_append (&env_count, &app->env, "PMI_NO_PREINITIALIZE=1");
     }
 
-    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
+    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
         /* this is a restart situation - skip to the mapping stage */
         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
     } else {
@@ -206,7 +206,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
     /* if we are launching debugger daemons, then just go
      * do it - no new daemons will be launched
      */
-    if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
+    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
         OBJ_RELEASE(state);
@@ -304,7 +304,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
         /* if the daemon already exists on this node, then
          * don't include it
          */
-        if (node->daemon_launched) {
+        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
         }
 
@@ -378,7 +378,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) {
             continue;
         }
-        app_prefix_dir = app->prefix_dir;
+        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
         /* Check for already set cur_prefix_dir -- if different,
            complain */
         if (NULL != app_prefix_dir) {
@@ -398,6 +398,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
                                cur_prefix);
                 }
             }
+            free(app_prefix_dir);
         }
     }
 
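The free(app_prefix_dir) added above suggests that orte_get_attribute() returns an allocated copy for OPAL_STRING attributes, so the caller owns and must free the string. A standalone sketch of that ownership pattern, using a simplified stand-in attribute list and getter rather than the ORTE attribute API:

    /* Stand-in sketch of a string-attribute lookup that hands back a copy. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct attr {
        struct attr *next;
        int          key;
        char        *sval;
    } attr_t;

    enum { ATTR_PREFIX_DIR = 1 };   /* hypothetical key */

    /* returns 1 and fills *out with a strdup'd copy the caller owns */
    static int get_string_attr(attr_t *list, int key, char **out)
    {
        for (attr_t *a = list; NULL != a; a = a->next) {
            if (a->key == key) {
                *out = strdup(a->sval);
                return 1;
            }
        }
        return 0;
    }

    int main(void)
    {
        attr_t prefix = { .next = NULL, .key = ATTR_PREFIX_DIR, .sval = "/opt/openmpi" };
        char *app_prefix_dir = NULL;

        if (get_string_attr(&prefix, ATTR_PREFIX_DIR, &app_prefix_dir)) {
            printf("prefix: %s\n", app_prefix_dir);
            free(app_prefix_dir);   /* caller owns the copy - mirrors the added free() */
        }
        return 0;
    }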
@@ -141,7 +141,7 @@ int plm_lsf_init(void)
  */
 static int plm_lsf_launch_job(orte_job_t *jdata)
 {
-    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
+    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
         /* this is a restart situation - skip to the mapping stage */
         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
     } else {
@@ -235,7 +235,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
         /* if the daemon already exists on this node, then
          * don't include it
          */
-        if (node->daemon_launched) {
+        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
         }
 
@@ -299,7 +299,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
             continue;
         }
-        app_prefix_dir = app->prefix_dir;
+        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
         /* Check for already set cur_prefix_dir -- if different,
            complain */
         if (NULL != app_prefix_dir) {
@@ -319,6 +319,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
                                      "%s plm:lsf: Set prefix:%s",
                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_prefix));
                 }
+                free(app_prefix_dir);
             }
         }
 
@@ -157,7 +157,7 @@ static int plm_slurm_init(void)
  */
 static int plm_slurm_launch_job(orte_job_t *jdata)
 {
-    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
+    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
         /* this is a restart situation - skip to the mapping stage */
         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
     } else {
@@ -198,7 +198,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
     /* if we are launching debugger daemons, then just go
      * do it - no new daemons will be launched
      */
-    if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
+    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
         OBJ_RELEASE(state);
@@ -298,7 +298,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
         /* if the daemon already exists on this node, then
          * don't include it
          */
-        if (node->daemon_launched) {
+        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
         }
 
@@ -375,7 +375,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) {
             continue;
         }
-        app_prefix_dir = app->prefix_dir;
+        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
         /* Check for already set cur_prefix_dir -- if different,
            complain */
         if (NULL != app_prefix_dir) {
@@ -396,6 +396,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                      cur_prefix));
                 }
+                free(app_prefix_dir);
             }
         }
 
@@ -150,7 +150,7 @@ static int plm_tm_init(void)
 
 static int plm_tm_launch_job(orte_job_t *jdata)
 {
-    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
+    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
         /* this is a restart situation - skip to the mapping stage */
         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
     } else {
@@ -188,13 +188,14 @@ static void launch_daemons(int fd, short args, void *cbdata)
     orte_job_t *daemons, *jdata;
     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
     int32_t launchid, *ldptr;
+    char *prefix_dir = NULL;
 
     jdata = state->jdata;
 
     /* if we are launching debugger daemons, then just go
      * do it - no new daemons will be launched
      */
-    if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) {
+    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
         jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
         OBJ_RELEASE(state);
@@ -270,7 +271,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
         }
 
         /* if this daemon already exists, don't launch it! */
-        if (node->daemon_launched) {
+        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
             continue;
         }
 
@@ -330,14 +331,15 @@ static void launch_daemons(int fd, short args, void *cbdata)
        there
      */
     app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
-    if (NULL != app->prefix_dir) {
+    orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING);
+    if (NULL != prefix_dir) {
         char *newenv;
 
         for (i = 0; NULL != env && NULL != env[i]; ++i) {
             /* Reset PATH */
             if (0 == strncmp("PATH=", env[i], 5)) {
                 asprintf(&newenv, "%s/%s:%s",
-                         app->prefix_dir, bin_base, env[i] + 5);
+                         prefix_dir, bin_base, env[i] + 5);
                 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                      "%s plm:tm: resetting PATH: %s",
                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -349,7 +351,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
             /* Reset LD_LIBRARY_PATH */
             else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
                 asprintf(&newenv, "%s/%s:%s",
-                         app->prefix_dir, lib_base, env[i] + 16);
+                         prefix_dir, lib_base, env[i] + 16);
                 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                      "%s plm:tm: resetting LD_LIBRARY_PATH: %s",
                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -358,6 +360,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
                 free(newenv);
             }
         }
+        free(prefix_dir);
     }
 
     /* Iterate through each of the nodes and spin
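In the tm hunks above, the prefix is now fetched once into a local prefix_dir string, used to prepend the bin/lib subdirectories to PATH and LD_LIBRARY_PATH via asprintf(), and freed after the loop. A standalone sketch of the PATH-rewrite step; reset_path() is a hypothetical helper, and the real code stores the rewritten entry back into the environment rather than printing it:

    #define _GNU_SOURCE            /* for asprintf */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* prepend "<prefix_dir>/<bin_base>:" to every PATH= entry in env */
    static void reset_path(char **env, const char *prefix_dir, const char *bin_base)
    {
        char *newenv;

        for (int i = 0; NULL != env && NULL != env[i]; ++i) {
            if (0 == strncmp("PATH=", env[i], 5)) {
                if (asprintf(&newenv, "%s/%s:%s", prefix_dir, bin_base, env[i] + 5) < 0) {
                    return;
                }
                printf("PATH=%s\n", newenv);   /* the real code replaces env[i] instead */
                free(newenv);
            }
        }
    }

    int main(void)
    {
        char *env[] = { "PATH=/usr/bin:/bin", "HOME=/home/user", NULL };

        reset_path(env, "/opt/openmpi", "bin");
        return 0;
    }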
@@ -12,7 +12,7 @@
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2013      Intel, Inc. All rights reserved.
+ * Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -739,6 +739,7 @@ static void recv_data(int fd, short args, void *cbdata)
     orte_app_context_t *app;
     orte_jobid_t jobid;
     orte_job_t *jdata;
+    char **dash_host = NULL;
 
     opal_output_verbose(2, orte_ras_base_framework.framework_output,
                         "%s ras:slurm: dynamic allocation - data recvd",
@@ -802,6 +803,8 @@ static void recv_data(int fd, short args, void *cbdata)
     idx = -1;
     sjob = -1;
     nodelist = NULL;
+    /* release the current dash_host as that contained the *desired* allocation */
+    orte_remove_attribute(&app->attributes, ORTE_APP_DASH_HOST);
     for (i=1; NULL != alloc[i]; i++) {
         if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) {
             orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
@@ -820,9 +823,6 @@ static void recv_data(int fd, short args, void *cbdata)
             opal_pointer_array_set_item(&jtrk->apps, idx, aptrk);
         }
         aptrk->sjob = sjob;
-        /* release the current dash_host as that contained the *desired* allocation */
-        opal_argv_free(app->dash_host);
-        app->dash_host = NULL;
         /* since the nodelist/tpn may contain regular expressions, parse them */
         if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) {
             ORTE_ERROR_LOG(rc);
@@ -835,7 +835,7 @@ static void recv_data(int fd, short args, void *cbdata)
          */
         while (NULL != (item = opal_list_remove_first(&ndtmp))) {
             nd = (orte_node_t*)item;
-            opal_argv_append_nosize(&app->dash_host, nd->name);
+            opal_argv_append_nosize(&dash_host, nd->name);
             /* check for duplicates */
             found = false;
             for (itm = opal_list_get_first(&nds);
@@ -861,6 +861,12 @@ static void recv_data(int fd, short args, void *cbdata)
     /* cleanup */
     opal_argv_free(alloc);
     OBJ_DESTRUCT(&ndtmp);
+    if (NULL != dash_host) {
+        tpn = opal_argv_join(dash_host, ',');
+        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, (void*)tpn, OPAL_STRING);
+        opal_argv_free(dash_host);
+        free(tpn);
+    }
 
     if (opal_list_is_empty(&nds)) {
         /* if we get here, then we were able to contact slurm,
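The recv_data() changes above stop writing discovered nodes into app->dash_host directly; the names are collected in a local argv, joined with commas, and stored as the ORTE_APP_DASH_HOST attribute. A standalone sketch of that collect-and-join step, with simplified stand-ins for the opal_argv_* helpers:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* append a copy of s to the NULL-terminated argv at *argvp */
    static void argv_append(char ***argvp, const char *s)
    {
        size_t n = 0;

        while (NULL != *argvp && NULL != (*argvp)[n]) {
            n++;
        }
        *argvp = realloc(*argvp, (n + 2) * sizeof(char *));
        (*argvp)[n] = strdup(s);
        (*argvp)[n + 1] = NULL;
    }

    /* join the argv entries with sep into a single malloc'd string */
    static char *argv_join(char **argv, char sep)
    {
        size_t len = 1;
        char *out;

        for (size_t i = 0; NULL != argv[i]; i++) {
            len += strlen(argv[i]) + 1;
        }
        out = calloc(len, 1);
        for (size_t i = 0; NULL != argv[i]; i++) {
            if (i > 0) {
                strncat(out, &sep, 1);
            }
            strcat(out, argv[i]);
        }
        return out;
    }

    int main(void)
    {
        const char *discovered[] = { "node001", "node002", "node003", NULL };
        char **dash_host = NULL;
        char *joined;

        for (size_t i = 0; NULL != discovered[i]; i++) {
            argv_append(&dash_host, discovered[i]);
        }
        joined = argv_join(dash_host, ',');
        printf("ORTE_APP_DASH_HOST value: %s\n", joined);   /* node001,node002,node003 */

        for (size_t i = 0; NULL != dash_host[i]; i++) {
            free(dash_host[i]);
        }
        free(dash_host);
        free(joined);
        return 0;
    }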
@@ -908,6 +914,7 @@ static int dyn_allocate(orte_job_t *jdata)
     int i;
     struct timeval tv;
     local_jobtracker_t *jtrk;
+    int64_t i64, *i64ptr;
 
     if (NULL == mca_ras_slurm_component.config_file) {
         opal_output(0, "Cannot perform dynamic allocation as no Slurm configuration file provided");
@@ -956,6 +963,7 @@ static int dyn_allocate(orte_job_t *jdata)
     free(tmp);
 
     /* for each app, add its allocation request info */
+    i64ptr = &i64;
     for (i=0; i < jdata->apps->size; i++) {
         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
             continue;
@@ -969,8 +977,8 @@ static int dyn_allocate(orte_job_t *jdata)
         opal_argv_append_nosize(&cmd, tmp);
         free(tmp);
         /* if we were given a minimum number of nodes, pass it along */
-        if (0 < app->min_number_of_nodes) {
-            asprintf(&tmp, "N=%ld", (long int)app->min_number_of_nodes);
+        if (orte_get_attribute(&app->attributes, ORTE_APP_MIN_NODES, (void**)&i64ptr, OPAL_INT64)) {
+            asprintf(&tmp, "N=%ld", (long int)i64);
             opal_argv_append_nosize(&cmd, tmp);
             free(tmp);
         }
@@ -985,7 +993,7 @@ static int dyn_allocate(orte_job_t *jdata)
             free(tmp);
         }
         /* add the mandatory/optional flag */
-        if (app->mandatory) {
+        if (orte_get_attribute(&app->attributes, ORTE_APP_MANDATORY, NULL, OPAL_BOOL)) {
             opal_argv_append_nosize(&cmd, "flag=mandatory");
         } else {
             opal_argv_append_nosize(&cmd, "flag=optional");
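The dyn_allocate() hunks above read ORTE_APP_MIN_NODES through the i64ptr idiom: i64ptr points at local int64_t storage and &i64ptr is passed to the getter, which, judging from the subsequent use of i64, copies the value into that storage for fixed-size types. A standalone sketch of that idiom with a simplified stand-in getter, not orte_get_attribute():

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    enum { ATTR_MIN_NODES = 1 };   /* hypothetical key */

    typedef struct {
        int     key;
        int64_t ival;
    } attr_t;

    /* copy the stored int64 into the caller-provided storage at **data */
    static int get_int64_attr(const attr_t *attrs, size_t n, int key, void **data)
    {
        for (size_t i = 0; i < n; i++) {
            if (attrs[i].key == key) {
                memcpy(*data, &attrs[i].ival, sizeof(int64_t));
                return 1;
            }
        }
        return 0;
    }

    int main(void)
    {
        attr_t attrs[] = { { ATTR_MIN_NODES, 4 } };
        int64_t i64 = 0, *i64ptr = &i64;

        if (get_int64_attr(attrs, 1, ATTR_MIN_NODES, (void **)&i64ptr)) {
            printf("N=%ld\n", (long)i64);   /* mirrors asprintf("N=%ld", ...) in the diff */
        }
        return 0;
    }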
@@ -1071,13 +1079,17 @@ static char* get_node_list(orte_app_context_t *app)
     int j;
     char **total_host = NULL;
     char *nodes;
+    char **dash_host, *dh;
 
-    if (NULL == app->dash_host) {
+    if (!orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&dh, OPAL_STRING)) {
         return NULL;
     }
-    for (j=0; NULL != app->dash_host[j]; j++) {
-        opal_argv_append_unique_nosize(&total_host, app->dash_host[j], false);
+    dash_host = opal_argv_split(dh, ',');
+    free(dh);
+    for (j=0; NULL != dash_host[j]; j++) {
+        opal_argv_append_unique_nosize(&total_host, dash_host[j], false);
     }
+    opal_argv_free(dash_host);
     if (NULL == total_host) {
         return NULL;
     }
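get_node_list() now reads the -host list from the single comma-separated ORTE_APP_DASH_HOST string, splits it, and de-duplicates the entries into total_host. A standalone sketch of the split-and-unique-append step, with simplified stand-ins for opal_argv_split() and opal_argv_append_unique_nosize():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* append a copy of s to *argvp only if it is not already present */
    static void append_unique(char ***argvp, const char *s)
    {
        size_t n = 0;

        while (NULL != *argvp && NULL != (*argvp)[n]) {
            if (0 == strcmp((*argvp)[n], s)) {
                return;   /* duplicate - skip it */
            }
            n++;
        }
        *argvp = realloc(*argvp, (n + 2) * sizeof(char *));
        (*argvp)[n] = strdup(s);
        (*argvp)[n + 1] = NULL;
    }

    int main(void)
    {
        char dh[] = "node001,node002,node001";   /* value as it might be stored */
        char **total_host = NULL;

        for (char *tok = strtok(dh, ","); NULL != tok; tok = strtok(NULL, ",")) {
            append_unique(&total_host, tok);
        }
        for (size_t j = 0; NULL != total_host[j]; j++) {
            puts(total_host[j]);   /* node001, node002 */
            free(total_host[j]);
        }
        free(total_host);
        return 0;
    }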