39a6057fc6
* General TCP cleanup for OPAL / ORTE * Simplifying the OOB by moving much of the logic into the RML * Allowing the OOB RML component to do routing of messages * Adding a component framework for handling routing tables * Moving the xcast functionality from the OOB base to its own framework Includes merge from tmp/bwb-oob-rml-merge revisions: r15506, r15507, r15508, r15510, r15511, r15512, r15513 This commit was SVN r15528. The following SVN revisions from the original message are invalid or inconsistent and therefore were not cross-referenced: r15506 r15507 r15508 r15510 r15511 r15512 r15513
573 строки
19 KiB
C
573 строки
19 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
*
|
|
* Setup command line options for the Open MPI Run Time Environment
|
|
*/
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/orte_constants.h"
|
|
|
|
#include <string.h>
|
|
#ifdef HAVE_SYS_TIME_H
|
|
#include <sys/time.h>
|
|
#endif
|
|
#include <sys/types.h>
|
|
#ifdef HAVE_DIRENT_H
|
|
#include <dirent.h>
|
|
#endif /* HAVE_DIRENT_H */
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/os_path.h"
|
|
#include "opal/util/os_dirpath.h"
|
|
|
|
#include "orte/util/univ_info.h"
|
|
#include "orte/util/sys_info.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/util/session_dir.h"
|
|
#include "orte/util/universe_setup_file_io.h"
|
|
#include "orte/runtime/params.h"
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/ns/ns.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/runtime/runtime.h"
|
|
|
|
/*
 * Time value passed to orte_rml.ping() when probing a universe's seed
 * in orte_universe_check_connect(): 2 seconds, 0 microseconds.
 * NOTE(review): presumably the maximum time to wait for the ping reply --
 * confirm against the RML ping implementation.
 */
static struct timeval ompi_rte_ping_wait = {2, 0};
|
|
|
|
/**
 * Examine one entry of the per-user session directory.
 *
 * Builds "<base_dir>/<entry_name>/universe-setup.txt" and tries to read it
 * into a freshly created orte_universe_t.  On success the universe is
 * appended to universe_list; on failure the entry is optionally reported
 * and/or its directory removed.
 *
 * @param universe_list        list that receives readable universes
 * @param base_dir             session directory that contains the entry
 * @param entry_name           name of the directory entry being examined
 * @param report_broken_files  print a message (and record the error in
 *                             *exit_status) when the setup file is unreadable
 * @param remove_broken_files  destroy the entry's directory when the setup
 *                             file is unreadable (used by orte-clean)
 * @param exit_status          in/out: updated with the failure code when a
 *                             failure is to be reported
 */
static void orte_universe_search_entry(opal_list_t *universe_list,
                                       char *base_dir,
                                       char *entry_name,
                                       bool report_broken_files,
                                       bool remove_broken_files,
                                       int *exit_status)
{
    char *setup_file = NULL;
    orte_universe_t *univ = NULL;
    int ret;

    setup_file = opal_os_path(false, base_dir, entry_name,
                              "universe-setup.txt", NULL);
    if (NULL == setup_file) {
        *exit_status = ORTE_ERROR;
        return;
    }

    univ = OBJ_NEW(orte_universe_t);
    if (NULL == univ) {
        *exit_status = ORTE_ERROR;
        free(setup_file);
        return;
    }

    ret = orte_read_universe_setup_file(setup_file, univ);
    if (ORTE_SUCCESS == ret) {
        /*
         * NOTE(review): the object already holds the reference from
         * OBJ_NEW; this additional retain is kept because callers outside
         * this file may rely on it, but it looks like it leaves one
         * reference that is never dropped -- confirm.
         */
        OBJ_RETAIN(univ);
        opal_list_append(universe_list, &(univ->super));
    } else {
        if (report_broken_files) {
            printf("universe_search: Unable to read the file (%s)\n", setup_file);
            *exit_status = ret;
        }

        /*
         * See if we want to remove any cases with broken
         * universe-setup.txt files.  If so, print out a message and
         * remove the directory.  This is used by the orte-clean routine.
         */
        if (remove_broken_files) {
            char *univ_directory = opal_os_path(false, base_dir, entry_name, NULL);
            if (NULL != univ_directory) {
                printf("universe_search: Removing defunct directory (%s)\n", univ_directory);
                opal_os_dirpath_destroy(univ_directory, true, NULL);
                free(univ_directory);
            }
        }
        OBJ_RELEASE(univ);
    }

    /* The path is only needed for this entry; release it every time. */
    free(setup_file);
}

/**
 * Scan the per-user session directory for universes.
 *
 * Every sub-directory whose universe-setup.txt can be read is appended to
 * universe_list as an orte_universe_t.
 *
 * @param universe_list        list that receives the universes found
 * @param report_broken_files  report entries whose setup file is unreadable
 * @param remove_broken_files  remove entries whose setup file is unreadable
 *
 * @return ORTE_SUCCESS when universe_list is non-empty on exit; otherwise
 *         the last recorded error (or ORTE_SUCCESS if none was recorded).
 */
int orte_universe_search(opal_list_t *universe_list, bool report_broken_files, bool remove_broken_files)
{
    int ret, exit_status = ORTE_SUCCESS;
#ifndef __WINDOWS__
    DIR *cur_dirp = NULL;
    struct dirent *dir_entry;
#else
    HANDLE hFind = INVALID_HANDLE_VALUE;
    WIN32_FIND_DATA file_data;
    char *pattern = NULL;
#endif  /* __WINDOWS__ */
    char *fulldirpath = NULL;
    char *prefix = NULL;
    char *frontend = NULL;
    char *frontend_abs = NULL;

    /*
     * Get the session directory
     */
    if( ORTE_SUCCESS != (ret = orte_session_dir_get_name(&fulldirpath,
                                                         &prefix,
                                                         &frontend,
                                                         orte_system_info.user,
                                                         orte_system_info.nodename,
                                                         NULL, /* batch ID -- Not used */
                                                         NULL, /* Universe Name -- NONE */
                                                         NULL, /* jobid */
                                                         NULL  /* vpid */
                                                         ) ) ) {
        exit_status = ret;
        goto cleanup;
    }

    /* Absolute path of the directory that holds one sub-directory per universe */
    frontend_abs = opal_os_path(false, prefix, frontend, NULL);
    if( NULL == frontend_abs ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

#if !defined(__WINDOWS__)
    /*
     * Check to make sure we have access to this directory
     */
    if( ORTE_SUCCESS != (ret = opal_os_dirpath_access(frontend_abs, 0) )) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Open up the base directory so we can get a listing
     */
    if( NULL == (cur_dirp = opendir(frontend_abs)) ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * For each directory/universe
     */
    while( NULL != (dir_entry = readdir(cur_dirp)) ) {
        /*
         * Skip non-universe directories: ".", ".." and every other
         * entry whose name begins with a dot (this is what the previous
         * one-character prefix comparison amounted to).
         */
        if( '.' == dir_entry->d_name[0] ) {
            continue;
        }

        orte_universe_search_entry(universe_list, frontend_abs,
                                   dir_entry->d_name,
                                   report_broken_files, remove_broken_files,
                                   &exit_status);
    }
#else
    /*
     * Open up the base directory so we can get a listing.
     *
     * On Windows if we want to parse the content of a directory the filename
     * should end with the "*". Otherwise we will only open the directory
     * structure (and not the content).  The wildcard pattern is kept apart
     * from frontend_abs so that per-entry paths are built from the real
     * directory name.
     */
    pattern = opal_os_path(false, prefix, frontend, "*", NULL);
    if( NULL == pattern ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    hFind = FindFirstFile( pattern, &file_data );
    if( INVALID_HANDLE_VALUE == hFind ) {
        exit_status = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    /*
     * For each directory/universe
     */
    do {
        /*
         * Skip non-universe directories: ".", ".." and every other
         * entry whose name begins with a dot.
         */
        if( '.' == file_data.cFileName[0] ) {
            continue;
        }

        orte_universe_search_entry(universe_list, frontend_abs,
                                   file_data.cFileName,
                                   report_broken_files, remove_broken_files,
                                   &exit_status);
    } while( 0 != FindNextFile( hFind, &file_data ) );
#endif  /* !defined(__WINDOWS__) */

 cleanup:
#ifndef __WINDOWS__
    if( NULL != cur_dirp ) {
        closedir(cur_dirp);
    }
#else
    /* Only close a handle that FindFirstFile actually returned */
    if( INVALID_HANDLE_VALUE != hFind ) {
        FindClose(hFind);
    }
    free(pattern);
#endif  /* __WINDOWS__ */
    free(fulldirpath);
    free(prefix);
    free(frontend);
    free(frontend_abs);

    return (opal_list_is_empty(universe_list) ? exit_status : ORTE_SUCCESS);
}
|
|
|
|
/**
 * Check whether we are allowed to connect to a universe and whether its
 * seed responds.
 *
 * @param uni  universe description; its persistence, scope and seed_uri
 *             fields are read
 *
 * @return ORTE_SUCCESS                    contact info was stored and the
 *                                         ping succeeded
 *         ORTE_ERR_NO_CONNECTION_ALLOWED  we are not a console and the
 *                                         universe is not persistent or its
 *                                         scope starts with "exclusive"
 *         ORTE_ERR_CONNECTION_FAILED      the ping failed
 *         otherwise the error returned by orte_rml.set_contact_info()
 */
static int orte_universe_check_connect(orte_universe_t *uni)
{
    int status;

    /*
     * A console may always try to connect.  Anybody else is turned away
     * when the target universe is not persistent or does not allow
     * connections.
     *
     * TODO (carried over): also need to check "local" and that we did not
     * specify the exact matching universe name.
     */
    if (!orte_universe_info.console &&
        (!uni->persistence ||
         0 == strncmp(uni->scope, "exclusive", strlen("exclusive")))) {
        if (orte_debug_flag) {
            opal_output(0, "connect_uni: connection not allowed");
        }
        /* NOTE: THIS IS NOT AN ERROR - DON'T ERROR_LOG IT */
        return ORTE_ERR_NO_CONNECTION_ALLOWED;
    }

    if (orte_debug_flag) {
        opal_output(0, "connect_uni: contact info to set: %s", uni->seed_uri);
    }

    /* insert the universe contact info into the RML hash tables */
    status = orte_rml.set_contact_info(uni->seed_uri);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* ping to verify it's alive */
    status = orte_rml.ping(uni->seed_uri, &ompi_rte_ping_wait);
    if (ORTE_SUCCESS == status) {
        return ORTE_SUCCESS;
    }

    /* a dead seed is only worth logging when debugging */
    if (orte_debug_flag) {
        ORTE_ERROR_LOG(status);
    }
    return ORTE_ERR_CONNECTION_FAILED;
}
|
|
|
|
|
|
/**
 * Determine whether a universe exists that this process can join.
 *
 * With the default universe name (no name given by the user) every universe
 * found in the local session directory tree is tried in turn and the first
 * one that accepts a connection is copied into univ.  With a user-supplied
 * name only that universe's setup file is read and checked.
 *
 * @param univ  filled in with the description of the universe to join
 *
 * @return ORTE_SUCCESS if a universe was found and answered,
 *         ORTE_ERR_NOT_FOUND if none was found, or the error reported by
 *         the search / setup-file read / connection check.
 */
int orte_universe_exists(orte_universe_t *univ)
{
    char *contact_file;
    opal_list_t universes;
    opal_list_item_t *item;
    orte_universe_t *uniptr;
    int ret;

    /* if the user didn't provide a name for our universe, then we have to check
     * for other universe names we could join. It is virtually impossible for
     * another universe to have our exact default universe name as they would
     * have to have the same PID - and that would be bad in so many ways!
     */
    if (orte_universe_info.default_name) {
        /* if we just have the default name - i.e., no name was specified -
         * then get a list of all universes known on the local system. All
         * we can do here is just loop through the session directory tree
         * for universes - we have no better discovery mechanism at this time
         */
        OBJ_CONSTRUCT(&universes, opal_list_t);
        if (ORTE_SUCCESS != (ret = orte_universe_search(&universes, false, false))) {
            /* if nothing was found, that's okay - report anything else */
            if (ORTE_ERR_NOT_FOUND != ret) {
                ORTE_ERROR_LOG(ret);
            }
            /* the search only fails with an empty list; still tear it down */
            OBJ_DESTRUCT(&universes);
            return ret;
        }

        /* we have no real criteria for picking one over the other, so
         * we just loop through the returned objects and pick the first
         * one that will support connection.  The loop always drains the
         * whole list so that every item reference we were handed is
         * released; once a universe has been accepted the remaining
         * items are released without being probed.  An empty list falls
         * straight through with ORTE_ERR_NOT_FOUND.
         */
        ret = ORTE_ERR_NOT_FOUND;
        while (NULL != (item = opal_list_remove_first(&universes))) {
            uniptr = (orte_universe_t*)item;
            if (ORTE_SUCCESS != ret &&
                ORTE_SUCCESS == orte_universe_check_connect(uniptr)) {
                /* deep-copy: the list item is released below */
                univ->name = (NULL != uniptr->name) ? strdup(uniptr->name) : NULL;
                univ->host = (NULL != uniptr->host) ? strdup(uniptr->host) : NULL;
                univ->uid = (NULL != uniptr->uid) ? strdup(uniptr->uid) : NULL;
                univ->persistence = uniptr->persistence;
                univ->scope = (NULL != uniptr->scope) ? strdup(uniptr->scope) : NULL;
                univ->seed_uri = (NULL != uniptr->seed_uri) ? strdup(uniptr->seed_uri) : NULL;
                univ->console_connected = uniptr->console_connected;
                ret = ORTE_SUCCESS;
            }
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&universes);

        /* ORTE_ERR_NOT_FOUND here means we did not succeed in connecting
         * to anyone - report that situation
         */
        return ret;
    }

    /* if the user did provide a name, then see if we can join it */

    /* check to see if local universe session directory already exists */
    if (ORTE_SUCCESS != orte_session_dir(false,
                                         orte_process_info.tmpdir_base,
                                         orte_system_info.user,
                                         orte_system_info.nodename,
                                         NULL,
                                         orte_universe_info.name,
                                         NULL,
                                         NULL)) { /* not found */
        /* NOTE: NOT FINDING THE DIRECTORY IS NOT AN ERROR - DON'T ERROR_LOG IT */
        return ORTE_ERR_NOT_FOUND;
    }

    /* build the path of the universe setup file. if present, read it in. */
    if (NULL == (contact_file = opal_os_path(false, orte_process_info.universe_session_dir,
                                             "universe-setup.txt", NULL))) {
        /* NOTE: NOT FINDING THE FILE IS NOT AN ERROR - DON'T ERROR_LOG IT */
        return ORTE_ERR_NOT_FOUND;
    }

    if (ORTE_SUCCESS != (ret = orte_read_universe_setup_file(contact_file, univ))) {
        /* NOTE: THIS IS NOT AN ERROR - DON'T ERROR_LOG IT */
        free(contact_file);
        return ret;
    }

    /* don't need this string any more - free it */
    free(contact_file);

    if (orte_debug_flag) {
        opal_output(0, "connect_uni: contact info read");
    }

    return orte_universe_check_connect(univ);
}
|
|
|
|
void
|
|
orte_universe_clean_directories(char *my_universe, int verbose) {
|
|
char *session_dir = NULL;
|
|
#if !defined(__WINDOWS__)
|
|
DIR *cur_dirp = NULL;
|
|
struct dirent * dir_entry;
|
|
#else
|
|
HANDLE hFind = INVALID_HANDLE_VALUE;
|
|
WIN32_FIND_DATA file_data;
|
|
#endif /* __WINDOWS__ */
|
|
char *fulldirpath = NULL;
|
|
char *prefix = NULL;
|
|
char *frontend = NULL;
|
|
|
|
/*
|
|
* Compute the full pathname to the session directory.
|
|
*/
|
|
if (ORTE_SUCCESS != orte_session_dir_get_name(&fulldirpath,
|
|
&prefix,
|
|
&frontend,
|
|
orte_system_info.user,
|
|
orte_system_info.nodename,
|
|
NULL, /* batch ID -- Not used */
|
|
NULL, /* Universe Name -- NONE */
|
|
NULL, /* jobid */
|
|
NULL /* vpid */
|
|
)) {
|
|
goto cleanup;
|
|
}
|
|
|
|
#if !defined(__WINDOWS__)
|
|
session_dir = opal_os_path(false, prefix, frontend, NULL);
|
|
|
|
/*
|
|
* Free up the various strings as these are allocated within
|
|
* the previous function.
|
|
*/
|
|
if (NULL != fulldirpath) {
|
|
free(fulldirpath);
|
|
fulldirpath = NULL;
|
|
}
|
|
if (NULL != prefix) {
|
|
free(prefix);
|
|
prefix = NULL;
|
|
}
|
|
if (NULL != frontend) {
|
|
free(frontend);
|
|
frontend = NULL;
|
|
}
|
|
|
|
/*
|
|
* Check to make sure we have access to this directory
|
|
*/
|
|
if (ORTE_SUCCESS != opal_os_dirpath_access(session_dir, 0)) {
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Open up the base directory so we can get a listing
|
|
*/
|
|
if (NULL == (cur_dirp = opendir(session_dir))) {
|
|
goto cleanup;
|
|
}
|
|
/*
|
|
* For each directory/universe
|
|
*/
|
|
while (NULL != (dir_entry = readdir(cur_dirp))) {
|
|
|
|
/*
|
|
* Skip non-universe directories
|
|
*/
|
|
if (0 == strncmp(dir_entry->d_name, ".", strlen(".")) ||
|
|
0 == strncmp(dir_entry->d_name, ".", strlen(".."))) {
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* Skip my own universe. Let normal cleanup take care of that.
|
|
*/
|
|
if ((0 == strcmp(dir_entry->d_name, my_universe)) &&
|
|
(strlen(dir_entry->d_name) == strlen(my_universe))) {
|
|
if (verbose) {
|
|
opal_output(0, "orte-clean: skipping ourselves, name=%s\n",
|
|
orte_universe_info.name);
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != orte_session_dir_get_name(&fulldirpath,
|
|
&prefix,
|
|
&frontend,
|
|
orte_system_info.user,
|
|
orte_system_info.nodename,
|
|
NULL, /* batch ID -- Not used */
|
|
dir_entry->d_name,
|
|
NULL, /* jobid */
|
|
NULL /* vpid */
|
|
)) {
|
|
continue;
|
|
}
|
|
|
|
if (verbose) {
|
|
opal_output(0, "orte-clean: removing directory %s\n", fulldirpath);
|
|
}
|
|
opal_os_dirpath_destroy(fulldirpath, true, NULL);
|
|
|
|
/*
|
|
* The orte_session_dir_get_name handles the freeing of the
|
|
* fulldirpath each time it is called. The prefix gets reused.
|
|
* So, there is no need to free them on each call.
|
|
*/
|
|
if (NULL != frontend) {
|
|
free(frontend);
|
|
}
|
|
}
|
|
#else
|
|
/*
|
|
* Open up the base directory so we can get a listing.
|
|
*
|
|
* On Windows if we want to parse the content of a directory the filename
|
|
* should end with the "*". Otherwise we will only open the directory
|
|
* structure (and not the content).
|
|
*/
|
|
frontend = opal_os_path(false, prefix, frontend, "*", NULL);
|
|
hFind = FindFirstFile (frontend, &file_data);
|
|
if (INVALID_HANDLE_VALUE == hFind) {
|
|
goto cleanup;
|
|
}
|
|
|
|
do {
|
|
/*
|
|
* Skip non-universe directories
|
|
*/
|
|
/* Skip . and .. */
|
|
if ((0 == strcmp(file_data.cFileName, ".")) ||
|
|
(0 == strcmp(file_data.cFileName, "..")) ) {
|
|
continue;
|
|
}
|
|
|
|
if ((0 == strcmp(file_data.cFileName, my_universe)) &&
|
|
(strlen(file_data.cFileName) == strlen(my_universe))) {
|
|
if (verbose) {
|
|
opal_output(0, "orte-clean: skipping ourseleves, name=%s\n",
|
|
orte_universe_info.name);
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != orte_session_dir_get_name(&fulldirpath,
|
|
&prefix,
|
|
&frontend,
|
|
orte_system_info.user,
|
|
orte_system_info.nodename,
|
|
NULL, /* batch ID -- Not used */
|
|
file_data.cFileName,
|
|
NULL, /* jobid */
|
|
NULL /* vpid */
|
|
)) {
|
|
continue;
|
|
}
|
|
|
|
if (verbose) {
|
|
opal_output(0, "orte-clean: removing directory %s\n", fulldirpath);
|
|
}
|
|
opal_os_dirpath_destroy(fulldirpath, true, NULL);
|
|
|
|
/*
|
|
* The orte_session_dir_get_name handles the freeing of the
|
|
* fulldirpath each time it is called. The prefix gets reused.
|
|
* So, there is no need to free them on each call.
|
|
*/
|
|
if (NULL != frontend)
|
|
free(frontend);
|
|
|
|
} while (0 != FindNextFile(hFind, &file_data));
|
|
#endif /* !defined(__WINDOWS__) */
|
|
|
|
#if !defined(__WINDOWS__)
|
|
if (NULL != cur_dirp) {
|
|
closedir(cur_dirp);
|
|
}
|
|
#else
|
|
FindClose(hFind);
|
|
#endif /* __WINDOWS__ */
|
|
|
|
if(NULL != fulldirpath) {
|
|
free(fulldirpath);
|
|
}
|
|
if(NULL != prefix) {
|
|
free(prefix);
|
|
}
|
|
|
|
/*
|
|
* If the session directory is empty, then remove that too
|
|
*/
|
|
opal_os_dirpath_destroy(session_dir, false, NULL);
|
|
free(session_dir);
|
|
cleanup:
|
|
return;
|
|
}
|