1
1
openmpi/orte/runtime/orte_universe_exists.c
Brian Barrett 39a6057fc6 A number of improvements / changes to the RML/OOB layers:
* General TCP cleanup for OPAL / ORTE
  * Simplifying the OOB by moving much of the logic into the RML
  * Allowing the OOB RML component to do routing of messages
  * Adding a component framework for handling routing tables
  * Moving the xcast functionality from the OOB base to its own framework

Includes merge from tmp/bwb-oob-rml-merge revisions:

    r15506, r15507, r15508, r15510, r15511, r15512, r15513

This commit was SVN r15528.

The following SVN revisions from the original message are invalid or
inconsistent and therefore were not cross-referenced:
  r15506
  r15507
  r15508
  r15510
  r15511
  r15512
  r15513
2007-07-20 01:34:02 +00:00

573 строки
19 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Discovery of existing ORTE universes on the local node, connection checks
* against them, and cleanup of stale universe session directories.
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <string.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <sys/types.h>
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#include "opal/util/output.h"
#include "opal/util/os_path.h"
#include "opal/util/os_dirpath.h"
#include "orte/util/univ_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"
#include "orte/runtime/params.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/runtime.h"
/* Timeout handed to orte_rml.ping() when checking that a universe's seed is
 * alive: 2 seconds, 0 microseconds. */
static struct timeval ompi_rte_ping_wait = {2, 0};
/**
 * Examine a single entry of the session directory.
 *
 * Builds "<session_root>/<entry_name>/universe-setup.txt" and tries to read
 * it.  On success the universe is appended to universe_list.  On failure the
 * entry is optionally reported and/or its directory removed, depending on
 * the two flags.
 *
 * @param universe_list        List that receives readable universes.
 * @param session_root         Session directory that contains the entry.
 * @param entry_name           Name of the directory entry to examine.
 * @param report_broken_files  Print a message and record the read error in
 *                             *exit_status when the setup file is unreadable.
 * @param remove_broken_files  Remove the entry's directory when the setup
 *                             file is unreadable (used by orte-clean).
 * @param exit_status          In/out: overwritten only when an error has to
 *                             be recorded; otherwise left untouched.
 */
static void orte_universe_search_entry(opal_list_t *universe_list,
                                       const char *session_root,
                                       const char *entry_name,
                                       bool report_broken_files,
                                       bool remove_broken_files,
                                       int *exit_status)
{
    char *setup_file;
    orte_universe_t *univ;
    int ret;

    setup_file = opal_os_path(false, session_root, entry_name,
                              "universe-setup.txt", NULL);
    if (NULL == setup_file) {
        *exit_status = ORTE_ERROR;
        return;
    }

    univ = OBJ_NEW(orte_universe_t);
    if (NULL == univ) {
        *exit_status = ORTE_ERROR;
        free(setup_file);
        return;
    }

    ret = orte_read_universe_setup_file(setup_file, univ);
    if (ORTE_SUCCESS == ret) {
        /* NOTE(review): OBJ_NEW already hands us one reference, so this
         * retain leaves the object with two.  It is kept because callers
         * outside this file may depend on it -- confirm and drop if not. */
        OBJ_RETAIN(univ);
        opal_list_append(universe_list, &(univ->super));
    } else {
        if (report_broken_files) {
            printf("universe_search: Unable to read the file (%s)\n", setup_file);
            *exit_status = ret;
        }

        /*
         * See if we want to remove any cases with broken
         * universe-setup.txt files.  If so, print out a message and
         * remove the directory.  This is used by the orte-clean routine.
         */
        if (remove_broken_files) {
            char *univ_directory = opal_os_path(false, session_root,
                                                entry_name, NULL);
            if (NULL != univ_directory) {
                printf("universe_search: Removing defunct directory (%s)\n", univ_directory);
                opal_os_dirpath_destroy(univ_directory, true, NULL);
                free(univ_directory);
            }
        }

        OBJ_RELEASE(univ);
    }

    /* the path is rebuilt for every entry, so release it for every entry */
    free(setup_file);
}

/**
 * Collect every universe that has a readable setup file in this user's
 * session directory on the local node.
 *
 * @param universe_list        List to which each readable universe
 *                             (orte_universe_t) is appended.
 * @param report_broken_files  Report entries whose setup file cannot be read
 *                             and remember the read error as the status.
 * @param remove_broken_files  Remove the directory of every entry whose
 *                             setup file cannot be read.
 *
 * @return ORTE_SUCCESS whenever universe_list is non-empty on return.
 *         Otherwise the recorded status: the error from computing or
 *         accessing the session directory, ORTE_ERROR if it cannot be
 *         opened, ORTE_ERR_NOT_FOUND (Windows) if the listing cannot be
 *         started, the last reported read error, or ORTE_SUCCESS if nothing
 *         went wrong but nothing was found either.
 */
int orte_universe_search(opal_list_t *universe_list, bool report_broken_files, bool remove_broken_files)
{
    int ret, exit_status = ORTE_SUCCESS;
#ifndef __WINDOWS__
    DIR *cur_dirp = NULL;
    struct dirent *dir_entry;
#else
    HANDLE hFind = INVALID_HANDLE_VALUE;
    WIN32_FIND_DATA file_data;
    char *search_pattern = NULL;
#endif  /* __WINDOWS__ */
    char *fulldirpath = NULL;
    char *prefix = NULL;
    char *frontend = NULL;
    char *frontend_abs = NULL;

    /*
     * Get the session directory
     */
    if( ORTE_SUCCESS != (ret = orte_session_dir_get_name(&fulldirpath,
                                                         &prefix,
                                                         &frontend,
                                                         orte_system_info.user,
                                                         orte_system_info.nodename,
                                                         NULL, /* batch ID -- Not used */
                                                         NULL, /* Universe Name -- NONE */
                                                         NULL, /* jobid */
                                                         NULL  /* vpid */
                                                         ) ) ) {
        exit_status = ret;
        goto cleanup;
    }

    /* directory that holds one sub-directory per universe */
    frontend_abs = opal_os_path(false, prefix, frontend, NULL);
    if( NULL == frontend_abs ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

#if !defined(__WINDOWS__)
    /*
     * Check to make sure we have access to this directory
     */
    if( ORTE_SUCCESS != (ret = opal_os_dirpath_access(frontend_abs, 0) )) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Open up the base directory so we can get a listing
     */
    if( NULL == (cur_dirp = opendir(frontend_abs)) ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * For each directory/universe
     */
    while( NULL != (dir_entry = readdir(cur_dirp)) ) {
        /*
         * Skip "." and ".." -- and, as this code always has, every other
         * entry whose name starts with a dot.
         */
        if( '.' == dir_entry->d_name[0] ) {
            continue;
        }

        orte_universe_search_entry(universe_list, frontend_abs,
                                   dir_entry->d_name,
                                   report_broken_files, remove_broken_files,
                                   &exit_status);
    }
#else
    /*
     * Open up the base directory so we can get a listing.
     *
     * On Windows if we want to parse the content of a directory the filename
     * should end with the "*". Otherwise we will only open the directory
     * structure (and not the content).  The pattern is kept separate from
     * frontend_abs so that per-entry paths are not built on top of the "*".
     */
    search_pattern = opal_os_path(false, prefix, frontend, "*", NULL);
    if( NULL == search_pattern ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    hFind = FindFirstFile( search_pattern, &file_data );
    if( INVALID_HANDLE_VALUE == hFind ) {
        exit_status = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    /*
     * For each directory/universe
     */
    do {
        /*
         * Skip "." and ".." -- and, as this code always has, every other
         * entry whose name starts with a dot.
         */
        if( '.' == file_data.cFileName[0] ) {
            continue;
        }

        orte_universe_search_entry(universe_list, frontend_abs,
                                   file_data.cFileName,
                                   report_broken_files, remove_broken_files,
                                   &exit_status);
    } while( 0 != FindNextFile( hFind, &file_data ) );
#endif  /* !defined(__WINDOWS__) */

 cleanup:
#ifndef __WINDOWS__
    if( NULL != cur_dirp ) {
        closedir(cur_dirp);
    }
#else
    /* only close a handle that was actually opened */
    if( INVALID_HANDLE_VALUE != hFind ) {
        FindClose(hFind);
    }
    free(search_pattern);
#endif  /* __WINDOWS__ */

    free(fulldirpath);
    free(prefix);
    free(frontend);
    free(frontend_abs);

    return (opal_list_is_empty(universe_list) ? exit_status : ORTE_SUCCESS);
}
/**
 * Decide whether this process may join the given universe and, if it may,
 * verify that the universe's seed answers.
 *
 * @param uni  Universe to test; its persistence, scope and seed_uri fields
 *             are read.
 *
 * @retval ORTE_SUCCESS                    contact info was registered and
 *                                         the ping was answered.
 * @retval ORTE_ERR_NO_CONNECTION_ALLOWED  we are not a console and the
 *                                         universe is either not persistent
 *                                         or has a scope that starts with
 *                                         "exclusive".
 * @retval ORTE_ERR_CONNECTION_FAILED      the ping did not succeed.
 * @return Any error reported by orte_rml.set_contact_info().
 */
static int orte_universe_check_connect(orte_universe_t *uni)
{
    int status;

    /*
     * Policy check, skipped when we are attaching a console: the target
     * must be persistent and must not be marked exclusive.
     *
     * Still to be handled: "local" scope, and the case where the caller
     * named exactly this universe.
     */
    if (!orte_universe_info.console &&
        (!uni->persistence ||
         0 == strncmp(uni->scope, "exclusive", strlen("exclusive")))) {
        if (orte_debug_flag) {
            opal_output(0, "connect_uni: connection not allowed");
        }
        /* a refusal is an expected outcome, so it is deliberately not
         * sent through ORTE_ERROR_LOG */
        return ORTE_ERR_NO_CONNECTION_ALLOWED;
    }

    if (orte_debug_flag) {
        opal_output(0, "connect_uni: contact info to set: %s", uni->seed_uri);
    }

    /* hand the seed's contact info to the RML */
    status = orte_rml.set_contact_info(uni->seed_uri);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return(status);
    }

    /* make sure somebody is actually listening at that address */
    status = orte_rml.ping(uni->seed_uri, &ompi_rte_ping_wait);
    if (ORTE_SUCCESS == status) {
        return ORTE_SUCCESS;
    }

    /* an unreachable seed is only worth logging when debugging */
    if (orte_debug_flag) {
        ORTE_ERROR_LOG(status);
    }
    return ORTE_ERR_CONNECTION_FAILED;
}
/**
 * Look for an existing universe this process can join.
 *
 * When no universe name was supplied (orte_universe_info.default_name is
 * set) every universe found by orte_universe_search() is tried in turn and
 * the description of the first one that accepts a connection is copied into
 * univ.  When a name was supplied, that universe's session directory and
 * universe-setup.txt file are looked up, the file is read into univ and a
 * connection check is made.
 *
 * @param univ  Receives the description of the universe that was found.
 *              In the default-name case its string fields are overwritten
 *              with freshly strdup'ed copies.
 *
 * @retval ORTE_SUCCESS        a universe was found and answered the check.
 * @retval ORTE_ERR_NOT_FOUND  no joinable universe, session directory or
 *                             setup file path was found.
 * @return Otherwise the error returned by orte_universe_search(),
 *         orte_read_universe_setup_file() or orte_universe_check_connect().
 */
int orte_universe_exists(orte_universe_t *univ)
{
    char *contact_file;
    opal_list_t universes;
    opal_list_item_t *item;
    orte_universe_t *uniptr;
    int ret;

    /* if the user didn't provide a name for our universe, then we have to check
     * for other universe names we could join. It is virtually impossible for
     * another universe to have our exact default universe name as they would
     * have to have the same PID - and that would be bad in so many ways!
     */
    if (orte_universe_info.default_name) {
        int found = 0;

        /* if we just have the default name - i.e., no name was specified -
         * then get a list of all universes known on the local system. All
         * we can do here is just loop through the session directory tree
         * for universes - we have no better discovery mechanism at this time
         */
        OBJ_CONSTRUCT(&universes, opal_list_t);

        if (ORTE_SUCCESS != (ret = orte_universe_search(&universes, false, false))) {
            /* if nothing was found, that's okay - report anything else */
            if (ORTE_ERR_NOT_FOUND != ret) {
                ORTE_ERROR_LOG(ret);
            }
            OBJ_DESTRUCT(&universes);
            return ret;
        }

        /* we have no real criteria for picking one over the other, so
         * we just loop through the returned objects and pick the first
         * one that will support connection.  The loop always runs to the
         * end of the list so that every object we were handed is released;
         * once a universe has been chosen no further connection attempts
         * are made.  An empty list simply falls through to "not found".
         */
        while (NULL != (item = opal_list_remove_first(&universes))) {
            uniptr = (orte_universe_t*)item;

            if (!found && ORTE_SUCCESS == orte_universe_check_connect(uniptr)) {
                /* deep-copy: uniptr is released below */
                univ->name = strdup(uniptr->name);
                univ->host = strdup(uniptr->host);
                univ->uid = strdup(uniptr->uid);
                univ->persistence = uniptr->persistence;
                univ->scope = strdup(uniptr->scope);
                univ->seed_uri = strdup(uniptr->seed_uri);
                univ->console_connected = uniptr->console_connected;
                found = 1;
            }

            /* drop the reference we took over by removing it from the list */
            OBJ_RELEASE(item);
        }

        OBJ_DESTRUCT(&universes);

        /* if nobody accepted us, report that situation */
        return found ? ORTE_SUCCESS : ORTE_ERR_NOT_FOUND;
    }

    /* if the user did provide a name, then see if we can join it */

    /* check to see if local universe session directory already exists */
    if (ORTE_SUCCESS != orte_session_dir(false,
                                         orte_process_info.tmpdir_base,
                                         orte_system_info.user,
                                         orte_system_info.nodename,
                                         NULL,
                                         orte_universe_info.name,
                                         NULL,
                                         NULL)) { /* not found */
        /* NOTE: NOT FINDING THE DIRECTORY IS NOT AN ERROR - DON'T ERROR_LOG IT */
        return ORTE_ERR_NOT_FOUND;
    }

    /* build the path of the "universe-setup.txt" file inside that directory */
    if (NULL == (contact_file = opal_os_path(false, orte_process_info.universe_session_dir,
                                             "universe-setup.txt", NULL))) {
        /* NOTE: NOT FINDING THE FILE IS NOT AN ERROR - DON'T ERROR_LOG IT */
        return ORTE_ERR_NOT_FOUND;
    }

    /* read it in, if present */
    ret = orte_read_universe_setup_file(contact_file, univ);

    /* don't need this string any more - free it */
    free(contact_file);

    if (ORTE_SUCCESS != ret) {
        /* NOTE: THIS IS NOT AN ERROR - DON'T ERROR_LOG IT */
        return ret;
    }

    if (orte_debug_flag) {
        opal_output(0, "connect_uni: contact info read");
    }

    return orte_universe_check_connect(univ);
}
/**
 * Remove the session directory that belongs to one universe, unless it is
 * the caller's own.
 *
 * @param entry_name   Name of the entry found in the session directory; it
 *                     is passed to orte_session_dir_get_name() as the
 *                     universe name.
 * @param my_universe  Name of the caller's own universe, which is left for
 *                     normal cleanup.  May be NULL, in which case nothing is
 *                     treated as "our own".
 * @param verbose      Non-zero to log what is skipped and what is removed.
 * @param prefix       In/out prefix string handed to
 *                     orte_session_dir_get_name() on every call; it is owned
 *                     and eventually freed by the caller.
 */
static void orte_universe_clean_entry(char *entry_name,
                                      const char *my_universe,
                                      int verbose,
                                      char **prefix)
{
    char *fulldirpath = NULL;
    char *frontend = NULL;

    /*
     * Skip my own universe. Let normal cleanup take care of that.
     */
    if (NULL != my_universe && 0 == strcmp(entry_name, my_universe)) {
        if (verbose) {
            opal_output(0, "orte-clean: skipping ourselves, name=%s\n",
                        orte_universe_info.name);
        }
        return;
    }

    if (ORTE_SUCCESS == orte_session_dir_get_name(&fulldirpath,
                                                  prefix,
                                                  &frontend,
                                                  orte_system_info.user,
                                                  orte_system_info.nodename,
                                                  NULL, /* batch ID -- Not used */
                                                  entry_name,
                                                  NULL, /* jobid */
                                                  NULL  /* vpid */
                                                  )) {
        if (verbose) {
            opal_output(0, "orte-clean: removing directory %s\n", fulldirpath);
        }
        opal_os_dirpath_destroy(fulldirpath, true, NULL);
    }

    /*
     * Both strings start out NULL on every call and are released here, so
     * no stale or already-freed pointer is ever handed back to
     * orte_session_dir_get_name().
     */
    free(fulldirpath);
    free(frontend);
}

/**
 * Remove the session directories of every universe found in this user's
 * session directory on the local node, except the caller's own, and then
 * remove the session directory itself if that left it empty.
 *
 * Failures are not reported: if the session directory cannot be determined,
 * accessed or listed the function returns without removing anything.
 *
 * @param my_universe  Name of the caller's own universe; the entry with
 *                     exactly this name is skipped.  May be NULL.
 * @param verbose      Non-zero to log each skipped or removed directory.
 */
void
orte_universe_clean_directories(char *my_universe, int verbose) {
    char *session_dir = NULL;
#if !defined(__WINDOWS__)
    DIR *cur_dirp = NULL;
    struct dirent *dir_entry;
#else
    HANDLE hFind = INVALID_HANDLE_VALUE;
    WIN32_FIND_DATA file_data;
    char *search_pattern = NULL;
#endif  /* __WINDOWS__ */
    char *fulldirpath = NULL;
    char *prefix = NULL;
    char *frontend = NULL;

    /*
     * Compute the full pathname to the session directory.
     */
    if (ORTE_SUCCESS != orte_session_dir_get_name(&fulldirpath,
                                                  &prefix,
                                                  &frontend,
                                                  orte_system_info.user,
                                                  orte_system_info.nodename,
                                                  NULL, /* batch ID -- Not used */
                                                  NULL, /* Universe Name -- NONE */
                                                  NULL, /* jobid */
                                                  NULL  /* vpid */
                                                  )) {
        goto cleanup;
    }

    session_dir = opal_os_path(false, prefix, frontend, NULL);
#if defined(__WINDOWS__)
    /*
     * On Windows if we want to parse the content of a directory the filename
     * should end with the "*". Otherwise we will only open the directory
     * structure (and not the content).  The pattern gets its own variable so
     * that neither frontend nor session_dir is clobbered by it.
     */
    search_pattern = opal_os_path(false, prefix, frontend, "*", NULL);
#endif  /* __WINDOWS__ */

    /*
     * Free up the various strings as these are allocated within
     * the previous function.
     */
    free(fulldirpath);
    fulldirpath = NULL;
    free(prefix);
    prefix = NULL;
    free(frontend);
    frontend = NULL;

    if (NULL == session_dir) {
        goto cleanup;
    }

#if !defined(__WINDOWS__)
    /*
     * Check to make sure we have access to this directory
     */
    if (ORTE_SUCCESS != opal_os_dirpath_access(session_dir, 0)) {
        goto cleanup;
    }

    /*
     * Open up the base directory so we can get a listing
     */
    if (NULL == (cur_dirp = opendir(session_dir))) {
        goto cleanup;
    }

    /*
     * For each directory/universe
     */
    while (NULL != (dir_entry = readdir(cur_dirp))) {
        /*
         * Skip "." and ".." -- and, as this code always has, every other
         * entry whose name starts with a dot.
         */
        if ('.' == dir_entry->d_name[0]) {
            continue;
        }

        orte_universe_clean_entry(dir_entry->d_name, my_universe, verbose, &prefix);
    }

    closedir(cur_dirp);
#else
    if (NULL == search_pattern) {
        goto cleanup;
    }

    /*
     * Open up the base directory so we can get a listing.
     */
    hFind = FindFirstFile(search_pattern, &file_data);
    if (INVALID_HANDLE_VALUE == hFind) {
        goto cleanup;
    }

    do {
        /* Skip . and .. */
        if ((0 == strcmp(file_data.cFileName, ".")) ||
            (0 == strcmp(file_data.cFileName, ".."))) {
            continue;
        }

        orte_universe_clean_entry(file_data.cFileName, my_universe, verbose, &prefix);
    } while (0 != FindNextFile(hFind, &file_data));

    FindClose(hFind);
#endif  /* !defined(__WINDOWS__) */

    /*
     * If the session directory is empty, then remove that too
     */
    opal_os_dirpath_destroy(session_dir, false, NULL);

 cleanup:
    /* every exit path comes through here, so nothing allocated above leaks */
#if defined(__WINDOWS__)
    free(search_pattern);
#endif  /* __WINDOWS__ */
    free(fulldirpath);
    free(prefix);
    free(frontend);
    free(session_dir);
    return;
}