1
1
openmpi/orte/runtime/orte_universe_exists.c

220 строки
6.5 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Setup command line options for the Open MPI Run Time Environment
*/
#include "orte_config.h"
#include <string.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
Brining over the session directory and universe changes from the tmp/jjhursey-ft-cr branch. In this commit we change the way universe names are created. Before we by default first created "default-universe" then if there was a conflict we created "default-universe-PID" where PID is the PID of the HNP. Now we create "default-universe-PID" all the time (when a default universe name is used). This makes it much easier when trying to find a HNP from an outside app (e.g. orte-ps, orteconsole, ...) This also adds a "search" function to find all of the universes on the machine. This is useful in many contexts when trying to find a persistent daemon or when trying to connect to a HNP. This commit also makes orte_universe_t an opal_object_t, which is something that needed to happen, and only effected the SDS in one of it's base functions. I was asked to bring this over to aid in fixing orteconsole and orteprobe. Due to the change of orte_universe_t to an object orteprobe may need to be updated to reflect this change. Since orteprobe needs to be looked at anyway I'll leave this to Ralph to take care of. *Note*: These changes do not depend upon any of the FT work (but the FT work does depend upon them). These were brought over to help in fixing some of the ORTE tool set that require the functionality layed out in this patch. Testing: Ran the 'ibm' tests before and after this change, and all was as well as before the change. If anyone notices additional irregularities in the system let me know. But none are expected. This commit was SVN r10550.
2006-06-29 01:03:31 +04:00
#include <sys/types.h>
#include <dirent.h>
#include <libgen.h>
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "orte/util/univ_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/proc_info.h"
#include "opal/util/os_path.h"
#include "opal/util/os_dirpath.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/runtime.h"
static struct timeval ompi_rte_ping_wait = {2, 0};
Brining over the session directory and universe changes from the tmp/jjhursey-ft-cr branch. In this commit we change the way universe names are created. Before we by default first created "default-universe" then if there was a conflict we created "default-universe-PID" where PID is the PID of the HNP. Now we create "default-universe-PID" all the time (when a default universe name is used). This makes it much easier when trying to find a HNP from an outside app (e.g. orte-ps, orteconsole, ...) This also adds a "search" function to find all of the universes on the machine. This is useful in many contexts when trying to find a persistent daemon or when trying to connect to a HNP. This commit also makes orte_universe_t an opal_object_t, which is something that needed to happen, and only effected the SDS in one of it's base functions. I was asked to bring this over to aid in fixing orteconsole and orteprobe. Due to the change of orte_universe_t to an object orteprobe may need to be updated to reflect this change. Since orteprobe needs to be looked at anyway I'll leave this to Ralph to take care of. *Note*: These changes do not depend upon any of the FT work (but the FT work does depend upon them). These were brought over to help in fixing some of the ORTE tool set that require the functionality layed out in this patch. Testing: Ran the 'ibm' tests before and after this change, and all was as well as before the change. If anyone notices additional irregularities in the system let me know. But none are expected. This commit was SVN r10550.
2006-06-29 01:03:31 +04:00
int orte_universe_search(opal_list_t *universe_list) {
int ret, exit_status = ORTE_SUCCESS;
DIR *cur_dirp = NULL;
struct dirent * dir_entry;
char *univ_setup_filename = NULL;
char *fulldirpath = NULL;
char *prefix = NULL;
char *frontend = NULL;
/*
* Get the session directory
*/
if( ORTE_SUCCESS != (ret = orte_session_dir_get_name(&fulldirpath,
&prefix,
&frontend,
orte_system_info.user,
orte_system_info.nodename,
NULL, /* batch ID -- Not used */
strdup("dummy"), /* Universe Name -- appened below */
NULL, /* jobid */
NULL /* vpid */
) ) ) {
exit_status = ret;
goto cleanup;
}
/* Strip off dummy the universe name */
fulldirpath = dirname(fulldirpath);
/*
* Check to make sure we have access to this directory
*/
if( ORTE_SUCCESS != (ret = opal_os_dirpath_access(fulldirpath, 0) )) {
Brining over the session directory and universe changes from the tmp/jjhursey-ft-cr branch. In this commit we change the way universe names are created. Before we by default first created "default-universe" then if there was a conflict we created "default-universe-PID" where PID is the PID of the HNP. Now we create "default-universe-PID" all the time (when a default universe name is used). This makes it much easier when trying to find a HNP from an outside app (e.g. orte-ps, orteconsole, ...) This also adds a "search" function to find all of the universes on the machine. This is useful in many contexts when trying to find a persistent daemon or when trying to connect to a HNP. This commit also makes orte_universe_t an opal_object_t, which is something that needed to happen, and only effected the SDS in one of it's base functions. I was asked to bring this over to aid in fixing orteconsole and orteprobe. Due to the change of orte_universe_t to an object orteprobe may need to be updated to reflect this change. Since orteprobe needs to be looked at anyway I'll leave this to Ralph to take care of. *Note*: These changes do not depend upon any of the FT work (but the FT work does depend upon them). These were brought over to help in fixing some of the ORTE tool set that require the functionality layed out in this patch. Testing: Ran the 'ibm' tests before and after this change, and all was as well as before the change. If anyone notices additional irregularities in the system let me know. But none are expected. This commit was SVN r10550.
2006-06-29 01:03:31 +04:00
exit_status = ret;
goto cleanup;
}
/*
* Open up the base directory so we can get a listing
*/
if( NULL == (cur_dirp = opendir(fulldirpath)) ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
/*
* For each directory/universe
*/
while( NULL != (dir_entry = readdir(cur_dirp))) {
orte_universe_t *univ = NULL;
char * tmp_str = NULL;
/*
* Skip non-universe directories
*/
if( 0 == strncmp(dir_entry->d_name, ".", strlen(".")) ||
0 == strncmp(dir_entry->d_name, ".", strlen("..")) ) {
continue;
}
/*
* Read the setup file
*/
tmp_str = strdup(dir_entry->d_name);
asprintf(&univ_setup_filename, "%s/%s/%s",
fulldirpath,
tmp_str,
"universe-setup.txt");
univ = OBJ_NEW(orte_universe_t);
OBJ_RETAIN(univ);
if(ORTE_SUCCESS != (ret = orte_read_universe_setup_file(univ_setup_filename, univ) ) ){
printf("orte_ps: Unable to read the file (%s)\n", univ_setup_filename);
exit_status = ret;
goto cleanup;
}
opal_list_append(universe_list, &(univ->super));
if( NULL != tmp_str)
free(tmp_str);
}
cleanup:
if( NULL != cur_dirp )
closedir(cur_dirp);
if( NULL != univ_setup_filename)
free(univ_setup_filename);
if( NULL != fulldirpath)
free(fulldirpath);
if( NULL != prefix)
free(prefix);
if( NULL != frontend)
free(frontend);
return exit_status;
}
int orte_universe_exists(orte_universe_t *univ)
{
char *contact_file;
int ret;
/* check to see if local universe session directory already exists */
if (ORTE_SUCCESS != orte_session_dir(false,
orte_process_info.tmpdir_base,
orte_system_info.user,
orte_system_info.nodename,
NULL,
orte_universe_info.name,
NULL,
NULL)) { /* not found */
/* NOTE: NOT FINDING THE DIRECTORY IS NOT AN ERROR - DON'T ERROR_LOG IT */
return ORTE_ERR_NOT_FOUND;
}
/* check for "contact-info" file. if present, read it in. */
if (NULL == (contact_file = opal_os_path(false, orte_process_info.universe_session_dir,
"universe-setup.txt", NULL))) {
/* NOTE: NOT FINDING THE FILE IS NOT AN ERROR - DON'T ERROR_LOG IT */
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS != (ret = orte_read_universe_setup_file(contact_file, univ))) {
/* NOTE: THIS IS NOT AN ERROR - DON'T ERROR_LOG IT */
free(contact_file);
return ret;
}
/* don't need this string any more - free it */
free(contact_file);
if (orte_debug_flag) {
opal_output(0, "connect_uni: contact info read");
}
if (!orte_universe_info.console) { /* if we aren't trying to connect a console */
if (!univ->persistence || /* if the target universe is not persistent... */
(0 == strncmp(univ->scope, "exclusive", strlen("exclusive")))) { /* ...or no connection allowed */
/* also need to check "local" and that we did not specify the exact
* matching universe name
*/
if (orte_debug_flag) {
opal_output(0, "connect_uni: connection not allowed");
}
/* NOTE: THIS IS NOT AN ERROR - DON'T ERROR_LOG IT */
return ORTE_ERR_NO_CONNECTION_ALLOWED;
}
}
if (orte_debug_flag) {
opal_output(0, "connect_uni: contact info to set: %s", univ->seed_uri);
}
/* if persistent, ping to verify it's alive */
if (ORTE_SUCCESS != orte_rml.ping(univ->seed_uri, &ompi_rte_ping_wait)) {
if (orte_debug_flag) {
ORTE_ERROR_LOG(ORTE_ERR_CONNECTION_FAILED);
}
return ORTE_ERR_CONNECTION_FAILED;
}
return ORTE_SUCCESS;
}