1
1

Fix orte-submit so it allows application procs to select the correct ess component. Protect orte_data_server from multiple calls to finalize.

Этот коммит содержится в:
Ralph Castain 2015-09-21 20:31:57 -07:00
родитель ef6cf50687
Коммит f872e99315
5 изменённых файлов: 63 добавлений и 47 удалений

Просмотреть файл

@ -601,6 +601,13 @@ static int rte_init(void)
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP,
ORTE_RML_PERSISTENT, orte_show_help_recv, NULL);
/* setup the data server */
if (ORTE_SUCCESS != (ret = orte_data_server_init())) {
ORTE_ERROR_LOG(ret);
error = "orte_data_server_init";
goto error;
}
if (orte_create_session_dirs) {
/* set the opal_output hnp file location to be in the
* proc-specific session directory. */
@ -814,6 +821,8 @@ static int rte_finalize(void)
/* shutdown the pmix server */
pmix_server_finalize();
(void) mca_base_framework_close(&opal_pmix_base_framework);
/* cleanup our data server */
orte_data_server_finalize();
(void) mca_base_framework_close(&orte_schizo_base_framework);
(void) mca_base_framework_close(&orte_dfs_base_framework);

Просмотреть файл

@ -104,11 +104,17 @@ OBJ_CLASS_INSTANCE(orte_data_req_t,
/* local globals */
static opal_pointer_array_t orte_data_server_store;
static opal_list_t pending;
static bool initialized = false;
int orte_data_server_init(void)
{
int rc;
if (initialized) {
return ORTE_SUCCESS;
}
initialized = true;
OBJ_CONSTRUCT(&orte_data_server_store, opal_pointer_array_t);
if (ORTE_SUCCESS != (rc = opal_pointer_array_init(&orte_data_server_store,
1,
@ -134,6 +140,11 @@ void orte_data_server_finalize(void)
orte_std_cntr_t i;
orte_data_object_t *data;
if (!initialized) {
return;
}
initialized = false;
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_SERVER);
for (i=0; i < orte_data_server_store.size; i++) {

Просмотреть файл

@ -96,7 +96,6 @@ static struct {
bool help;
bool version;
char *report_uri;
char *basename;
char *prefix;
bool run_as_root;
} myglobals;
@ -150,7 +149,7 @@ int main(int argc, char *argv[])
memset(&myglobals, 0, sizeof(myglobals));
/* find our basename (the name of the executable) so that we can
use it in pretty-print error messages */
myglobals.basename = opal_basename(argv[0]);
orte_basename = opal_basename(argv[0]);
opal_cmd_line_create(&cmd_line, cmd_line_init);
mca_base_cmd_line_setup(&cmd_line);
@ -174,7 +173,7 @@ int main(int argc, char *argv[])
OPAL_REPO_REV);
if (NULL != str) {
fprintf(stdout, "%s %s\n\nReport bugs to %s\n",
myglobals.basename, str, PACKAGE_BUGREPORT);
orte_basename, str, PACKAGE_BUGREPORT);
free(str);
}
exit(0);
@ -187,10 +186,10 @@ int main(int argc, char *argv[])
if (0 == geteuid() && !myglobals.run_as_root) {
fprintf(stderr, "--------------------------------------------------------------------------\n");
if (myglobals.help) {
fprintf(stderr, "%s cannot provide the help message when run as root\n", myglobals.basename);
fprintf(stderr, "%s cannot provide the help message when run as root\n", orte_basename);
} else {
/* show_help is not yet available, so print an error manually */
fprintf(stderr, "%s has detected an attempt to run as root.\n", myglobals.basename);
fprintf(stderr, "%s has detected an attempt to run as root.\n", orte_basename);
}
fprintf(stderr, " This is *strongly* discouraged as any mistake (e.g., in defining TMPDIR) or bug can\n");
fprintf(stderr, "result in catastrophic damage to the OS file system, leaving\n");
@ -222,15 +221,15 @@ int main(int argc, char *argv[])
if (myglobals.help) {
char *str, *args = NULL;
char *project_name = NULL;
if (0 == strcmp(myglobals.basename, "mpirun")) {
if (0 == strcmp(orte_basename, "mpirun")) {
project_name = "Open MPI";
} else {
project_name = "OpenRTE";
}
args = opal_cmd_line_get_usage_msg(&cmd_line);
str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
myglobals.basename, project_name, OPAL_VERSION,
myglobals.basename, args,
orte_basename, project_name, OPAL_VERSION,
orte_basename, args,
PACKAGE_BUGREPORT);
if (NULL != str) {
printf("%s", str);
@ -245,6 +244,14 @@ int main(int argc, char *argv[])
/* Setup MCA params */
orte_register_params();
/* save the environment for launch purposes. This MUST be
* done so that we can pass it to any local procs we
* spawn - otherwise, those local procs won't see any
* non-MCA envars that were set in the environment prior to calling
* orterun
*/
orte_launch_environ = opal_argv_copy(environ);
/* Initialize our Open RTE environment */
if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_MASTER))) {
/* cannot call ORTE_ERROR_LOG as it could be the errmgr
@ -273,7 +280,7 @@ int main(int argc, char *argv[])
fp = fopen(ptr, "w");
if (NULL == fp) {
orte_show_help("help-orterun.txt", "orterun:write_file", false,
myglobals.basename, "pid", ptr);
orte_basename, "pid", ptr);
exit(0);
}
fprintf(fp, "%s\n", uri);
@ -282,7 +289,7 @@ int main(int argc, char *argv[])
fp = fopen(myglobals.report_uri, "w");
if (NULL == fp) {
orte_show_help("help-orterun.txt", "orterun:write_file", false,
myglobals.basename, "pid", myglobals.report_uri);
orte_basename, "pid", myglobals.report_uri);
exit(0);
}
fprintf(fp, "%s\n", uri);
@ -296,13 +303,13 @@ int main(int argc, char *argv[])
/* get the daemon job object - was created by ess/hnp component */
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
orte_show_help("help-orterun.txt", "bad-job-object", true,
myglobals.basename);
orte_basename);
exit(0);
}
/* also should have created a daemon "app" */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
orte_show_help("help-orterun.txt", "bad-app-object", true,
myglobals.basename);
orte_basename);
exit(0);
}
@ -326,7 +333,7 @@ int main(int argc, char *argv[])
}
if (0 != strcmp(param, value)) {
orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
true, myglobals.basename, value, param);
true, orte_basename, value, param);
/* let the global-level prefix take precedence since we
* know that one is being used
*/
@ -352,7 +359,7 @@ int main(int argc, char *argv[])
param_len--;
if (0 == param_len) {
orte_show_help("help-orterun.txt", "orterun:empty-prefix",
true, myglobals.basename, myglobals.basename);
true, orte_basename, orte_basename);
return ORTE_ERR_FATAL;
}
}
@ -368,7 +375,7 @@ int main(int argc, char *argv[])
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
if(1 < j) {
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
true, myglobals.basename, NULL);
true, orte_basename, NULL);
return ORTE_ERR_FATAL;
} else {
value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
@ -378,7 +385,7 @@ int main(int argc, char *argv[])
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
true, myglobals.basename, NULL);
true, orte_basename, NULL);
return ORTE_ERR_FATAL;
} else {
value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);

Просмотреть файл

@ -127,7 +127,6 @@ static struct {
char *path;
bool enable_recovery;
char *personality;
char *basename;
char *prefix;
bool terminate;
bool nolocal;
@ -333,7 +332,7 @@ static void spawn_recv(int status, orte_process_name_t* sender,
int main(int argc, char *argv[])
{
int rc, i;
int rc;
opal_cmd_line_t cmd_line;
char *param;
orte_job_t *jdata=NULL;
@ -344,7 +343,7 @@ int main(int argc, char *argv[])
memset(&myglobals, 0, sizeof(myglobals));
/* find our basename (the name of the executable) so that we can
use it in pretty-print error messages */
myglobals.basename = opal_basename(argv[0]);
orte_basename = opal_basename(argv[0]);
opal_cmd_line_create(&cmd_line, cmd_line_init);
@ -369,7 +368,7 @@ int main(int argc, char *argv[])
OPAL_REPO_REV);
if (NULL != str) {
fprintf(stdout, "%s %s\n\nReport bugs to %s\n",
myglobals.basename, str, PACKAGE_BUGREPORT);
orte_basename, str, PACKAGE_BUGREPORT);
free(str);
}
exit(0);
@ -382,10 +381,10 @@ int main(int argc, char *argv[])
if (0 == geteuid() && !myglobals.run_as_root) {
fprintf(stderr, "--------------------------------------------------------------------------\n");
if (myglobals.help) {
fprintf(stderr, "%s cannot provide the help message when run as root\n", myglobals.basename);
fprintf(stderr, "%s cannot provide the help message when run as root\n", orte_basename);
} else {
/* show_help is not yet available, so print an error manually */
fprintf(stderr, "%s has detected an attempt to run as root.\n", myglobals.basename);
fprintf(stderr, "%s has detected an attempt to run as root.\n", orte_basename);
}
fprintf(stderr, " This is *strongly* discouraged as any mistake (e.g., in defining TMPDIR) or bug can\n");
fprintf(stderr, "result in catastrophic damage to the OS file system, leaving\n");
@ -427,7 +426,7 @@ int main(int argc, char *argv[])
char *str, *args = NULL;
char *project_name = NULL;
opal_output(0, "GETTING HELP");
if (0 == strcmp(myglobals.basename, "mpirun")) {
if (0 == strcmp(orte_basename, "mpirun")) {
project_name = "Open MPI";
} else {
project_name = "OpenRTE";
@ -435,8 +434,8 @@ int main(int argc, char *argv[])
args = opal_cmd_line_get_usage_msg(&cmd_line);
opal_output(0, "CMD LINE %s", args);
str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
myglobals.basename, project_name, OPAL_VERSION,
myglobals.basename, args,
orte_basename, project_name, OPAL_VERSION,
orte_basename, args,
PACKAGE_BUGREPORT);
if (NULL != str) {
printf("%s", str);
@ -529,11 +528,9 @@ int main(int argc, char *argv[])
*/
opal_finalize();
for (i=0; NULL != environ[i]; i++) {
if (0 == strncmp(environ[i], "OMPI", 4)) {
fprintf(stderr, "%s\n", environ[i]);
}
}
/* clear the ess param from the environment so our children
* don't pick it up */
opal_unsetenv("OMPI_MCA_ess", &environ);
/* set the info in our contact table */
orte_rml.set_contact_info(orte_process_info.my_hnp_uri);
@ -659,7 +656,7 @@ int main(int argc, char *argv[])
/* This should never happen -- this case should be caught in
create_app(), but let's just double check... */
orte_show_help("help-orterun.txt", "orterun:nothing-to-do",
true, myglobals.basename);
true, orte_basename);
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
@ -765,7 +762,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
fp = fopen(myglobals.report_pid, "w");
if (NULL == fp) {
orte_show_help("help-orterun.txt", "orterun:write_file", false,
myglobals.basename, "pid", myglobals.report_pid);
orte_basename, "pid", myglobals.report_pid);
exit(0);
}
fprintf(fp, "%d\n", (int)getpid());
@ -1009,7 +1006,7 @@ static int create_app(int argc, char* argv[],
if (0 == count) {
orte_show_help("help-orterun.txt", "orterun:executable-not-specified",
true, myglobals.basename, myglobals.basename);
true, orte_basename, orte_basename);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
@ -1099,7 +1096,7 @@ static int create_app(int argc, char* argv[],
}
if (0 != strcmp(param, value)) {
orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
true, myglobals.basename, value, param);
true, orte_basename, value, param);
/* let the global-level prefix take precedence since we
* know that one is being used
*/
@ -1125,7 +1122,7 @@ static int create_app(int argc, char* argv[],
param_len--;
if (0 == param_len) {
orte_show_help("help-orterun.txt", "orterun:empty-prefix",
true, myglobals.basename, myglobals.basename);
true, orte_basename, orte_basename);
free(param);
return ORTE_ERR_FATAL;
}
@ -1143,7 +1140,7 @@ static int create_app(int argc, char* argv[],
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
if(1 < j) {
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
true, myglobals.basename, NULL);
true, orte_basename, NULL);
return ORTE_ERR_FATAL;
} else {
value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
@ -1153,7 +1150,7 @@ static int create_app(int argc, char* argv[],
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
true, myglobals.basename, NULL);
true, orte_basename, NULL);
return ORTE_ERR_FATAL;
} else {
value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
@ -1177,7 +1174,7 @@ static int create_app(int argc, char* argv[],
/* check for bozo error */
if (0 > myglobals.num_procs) {
orte_show_help("help-orterun.txt", "orterun:negative-nprocs",
true, myglobals.basename, app->argv[0],
true, orte_basename, app->argv[0],
myglobals.num_procs, NULL);
return ORTE_ERR_FATAL;
}
@ -1216,7 +1213,7 @@ static int create_app(int argc, char* argv[],
app->app = strdup(app->argv[0]);
if (NULL == app->app) {
orte_show_help("help-orterun.txt", "orterun:call-failed",
true, myglobals.basename, "library", "strdup returned NULL", errno);
true, orte_basename, "library", "strdup returned NULL", errno);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}

Просмотреть файл

@ -105,7 +105,6 @@
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_data_server.h"
#include "orte/runtime/orte_locks.h"
#include "orte/runtime/orte_quit.h"
@ -1011,13 +1010,6 @@ int orterun(int argc, char *argv[])
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
/* setup the data server */
if (ORTE_SUCCESS != (rc = orte_data_server_init())) {
ORTE_ERROR_LOG(rc);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto DONE;
}
/* setup for debugging */
orte_debugger_init_before_spawn(jdata);
orte_state.add_job_state(ORTE_JOB_STATE_READY_FOR_DEBUGGERS,