
We currently save the hostname of a proc when we create the ompi_proc_t for it. This was originally done because the only method we had for discovering the host of a proc was to include that info in the modex, and we had to therefore store it somewhere proc-local. Obviously, this carried a memory penalty for storing all those strings, and so we added a "cutoff" parameter so that we wouldn't collect hostnames above a certain number of procs. Unfortunately, this still results in an 8-byte/proc memory cost as we have a char* pointer in the opal_proc_t that is contained in the ompi_proc_t so that we can store the hostname of the other procs if we fall below the cutoff. At scale, this can consume a fair amount of memory. With the switch to relying on PMIx, there is no longer a need to cache the proc hostnames. Using the "optional" feature of PMIx_Get, we restrict the retrieval to be purely proc-local - i.e., we retrieve the info either via shared memory or from within the proc-internal hash storage (depending upon the active PMIx components). Thus, the retrieval of a hostname is purely a local operation involving no communication. All RM's are required to provide a complete hostname map of all procs at startup. Thus, we have full access to all hostnames without including them in a modex or having to cache them on each proc. This allows us to remove the char* pointer from the opal_proc_t, saving us 8-bytes/proc. Unfortunately, PMIx_Get does not currently support the return of a static pointer to memory. Thus, even though PMIx has the hostname in its memory, it can only return a malloc'd version of it. I have therefore ensured that the return from opal_get_proc_hostname is consistently malloc'd and free'd wherever used. This shouldn't be a burden as the hostname is only used in one of two circumstances: (a) in an error message, or (b) in a verbose output for debugging purposes. Thus, there should be no performance penalty associated with the malloc/free requirement.
PMIx will eventually be returning static pointers, and so we can eventually simplify this method and return a "const char*" - but as noted, this really isn't an issue even today. Signed-off-by: Ralph Castain <rhc@pmix.org>
211 строки
6.7 KiB
C
211 строки
6.7 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2013 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2013 Inria. All rights reserved.
|
|
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
|
* Copyright (c) 2014-2017 Research Organization for Information Science
|
|
* and Technology (RIST). All rights reserved.
|
|
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "opal_config.h"
|
|
|
|
#include "proc.h"
|
|
#include "opal/util/proc.h"
|
|
#include "opal/util/arch.h"
|
|
#include "opal/util/string_copy.h"
|
|
#include "opal/mca/pmix/pmix-internal.h"
|
|
|
|
opal_process_name_t opal_name_wildcard = {OPAL_JOBID_WILDCARD, OPAL_VPID_WILDCARD};
|
|
opal_process_name_t opal_name_invalid = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID};
|
|
|
|
opal_process_info_t opal_process_info = {
|
|
.nativelaunch = false,
|
|
.nodename = NULL,
|
|
.top_session_dir = NULL,
|
|
.job_session_dir = NULL,
|
|
.proc_session_dir = NULL,
|
|
.num_local_peers = 0, /* there is nobody else but me */
|
|
.my_local_rank = 0, /* I'm the only process around here */
|
|
.cpuset = NULL,
|
|
};
|
|
|
|
static opal_proc_t opal_local_proc = {
|
|
{ .opal_list_next = NULL,
|
|
.opal_list_prev = NULL},
|
|
.proc_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
|
|
.proc_arch = 0,
|
|
.proc_flags = 0,
|
|
.proc_convertor = NULL
|
|
};
|
|
static opal_proc_t* opal_proc_my_name = &opal_local_proc;
|
|
|
|
/**
 * Constructor for opal_proc_t objects.
 *
 * Gives the new proc an invalid name, clears its flags and convertor,
 * and records the local architecture as its architecture.
 *
 * @param proc  object being constructed
 */
static void opal_proc_construct(opal_proc_t *proc)
{
    proc->proc_name      = *OPAL_NAME_INVALID;
    proc->proc_flags     = 0;
    proc->proc_convertor = NULL;
    proc->proc_arch      = opal_local_arch;
}
|
|
|
|
/**
 * Destructor for opal_proc_t objects.
 *
 * Resets the name to the invalid name and clears the flags and the
 * convertor pointer. The convertor itself is not released here.
 *
 * @param proc  object being destroyed
 */
static void opal_proc_destruct(opal_proc_t *proc)
{
    proc->proc_convertor = NULL;
    proc->proc_name      = *OPAL_NAME_INVALID;
    proc->proc_flags     = 0;
}
|
|
|
|
OBJ_CLASS_INSTANCE(opal_proc_t, opal_list_item_t,
|
|
opal_proc_construct, opal_proc_destruct);
|
|
|
|
OBJ_CLASS_INSTANCE(opal_namelist_t, opal_list_item_t,
|
|
NULL, NULL);
|
|
|
|
static int
|
|
opal_compare_opal_procs(const opal_process_name_t p1,
|
|
const opal_process_name_t p2)
|
|
{
|
|
if( p1.jobid < p2.jobid ) {
|
|
return -1;
|
|
}
|
|
if( p1.jobid > p2.jobid ) {
|
|
return 1;
|
|
}
|
|
if( p1.vpid < p2.vpid ) {
|
|
return -1;
|
|
}
|
|
if( p1.vpid > p2.vpid ) {
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Comparison hook for process names; initialized to the default
 * jobid-then-vpid ordering implemented by opal_compare_opal_procs. */
opal_compare_proc_fct_t opal_compare_proc = opal_compare_opal_procs;
|
|
|
|
opal_proc_t* opal_proc_local_get(void)
|
|
{
|
|
return opal_proc_my_name;
|
|
}
|
|
|
|
/**
 * Install a new proc as the local process.
 *
 * The new proc (when non-NULL) is retained, and the previously
 * installed proc is released unless it is the static placeholder.
 * Passing NULL switches back to the static placeholder.
 *
 * @param proc  proc to install, or NULL to restore the placeholder
 * @return always OPAL_SUCCESS
 */
int opal_proc_local_set(opal_proc_t* proc)
{
    /* nothing to do when the requested proc is already installed */
    if (proc == opal_proc_my_name) {
        return OPAL_SUCCESS;
    }

    /* take our reference on the newcomer before dropping the old one */
    if (NULL != proc) {
        OBJ_RETAIN(proc);
    }

    /* the static placeholder is never reference counted */
    if (&opal_local_proc != opal_proc_my_name) {
        OBJ_RELEASE(opal_proc_my_name);
    }

    opal_proc_my_name = (NULL != proc) ? proc : &opal_local_proc;

    return OPAL_SUCCESS;
}
|
|
|
|
/* this function is used to temporarily set the local
|
|
* name while OPAL and upper layers are initializing,
|
|
* thus allowing debug messages to be more easily
|
|
* understood */
|
|
void opal_proc_set_name(opal_process_name_t *name)
|
|
{
|
|
/* to protect alignment, copy the name across */
|
|
memcpy(&opal_local_proc.proc_name, name, sizeof(opal_process_name_t));
|
|
}
|
|
|
|
/**
|
|
* The following functions are surrogates for the RTE functionality, and are not supposed
|
|
* to be called. Instead, the corresponding function pointer should be set by the upper layer
|
|
* before the call to opal_init, to make them point to the correct accessors based on the
|
|
* underlying RTE.
|
|
*/
|
|
static char*
|
|
opal_process_name_print_should_never_be_called(const opal_process_name_t procname)
|
|
{
|
|
return "My Name is Nobody";
|
|
}
|
|
|
|
static char*
|
|
opal_vpid_print_should_never_be_called(const opal_vpid_t unused)
|
|
{
|
|
return "My VPID";
|
|
}
|
|
|
|
static char*
|
|
opal_jobid_print_should_never_be_called(const opal_jobid_t unused)
|
|
{
|
|
return "My JOBID";
|
|
}
|
|
|
|
/**
 * Placeholder string-to-process-name conversion.
 *
 * @param name         ignored; never written
 * @param name_string  ignored
 * @return always OPAL_ERR_NOT_SUPPORTED
 */
static int
opal_convert_string_to_process_name_should_never_be_called(opal_process_name_t *name,
                                                           const char* name_string)
{
    (void)name;
    (void)name_string;
    return OPAL_ERR_NOT_SUPPORTED;
}
|
|
|
|
static int opal_convert_process_name_to_string_should_never_be_called(char** name_string,
|
|
const opal_process_name_t *name)
|
|
{
|
|
return OPAL_ERR_NOT_SUPPORTED;
|
|
}
|
|
|
|
/**
 * Placeholder jobid formatter: copies a fixed string into the caller's
 * buffer via opal_string_copy, whatever the jobid.
 *
 * @param name_string  destination buffer
 * @param size         size passed through to opal_string_copy
 * @param jobid        ignored
 * @return always OPAL_SUCCESS
 */
static int
opal_snprintf_jobid_should_never_be_called(char* name_string, size_t size, opal_jobid_t jobid)
{
    (void)jobid;
    (void)opal_string_copy(name_string, "My JOBID", size);

    return OPAL_SUCCESS;
}
|
|
|
|
/**
 * Placeholder string-to-jobid conversion.
 *
 * @param jobid         ignored; never written
 * @param jobid_string  ignored
 * @return always OPAL_ERR_NOT_SUPPORTED
 */
static int
opal_convert_string_to_jobid_should_never_be_called(opal_jobid_t *jobid, const char *jobid_string)
{
    (void)jobid;
    (void)jobid_string;
    return OPAL_ERR_NOT_SUPPORTED;
}
|
|
|
|
/**
 * Placeholder name-to-proc lookup.
 *
 * @param name  ignored
 * @return always NULL
 */
static struct opal_proc_t *
opal_proc_for_name_should_never_be_called (opal_process_name_t name)
{
    (void)name;
    return NULL;
}
|
|
|
|
char* (*opal_process_name_print)(const opal_process_name_t) = opal_process_name_print_should_never_be_called;
|
|
char* (*opal_vpid_print)(const opal_vpid_t) = opal_vpid_print_should_never_be_called;
|
|
char* (*opal_jobid_print)(const opal_jobid_t) = opal_jobid_print_should_never_be_called;
|
|
int (*opal_convert_string_to_process_name)(opal_process_name_t *name, const char* name_string) = opal_convert_string_to_process_name_should_never_be_called;
|
|
int (*opal_convert_process_name_to_string)(char** name_string, const opal_process_name_t *name) = opal_convert_process_name_to_string_should_never_be_called;
|
|
int (*opal_snprintf_jobid)(char* name_string, size_t size, opal_jobid_t jobid) = opal_snprintf_jobid_should_never_be_called;
|
|
int (*opal_convert_string_to_jobid)(opal_jobid_t *jobid, const char *jobid_string) = opal_convert_string_to_jobid_should_never_be_called;
|
|
struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name) = opal_proc_for_name_should_never_be_called;
|
|
|
|
char* opal_get_proc_hostname(const opal_proc_t *proc)
|
|
{
|
|
int ret;
|
|
char *hostname;
|
|
|
|
/* if the proc is NULL, then we can't know */
|
|
if (NULL == proc) {
|
|
return strdup("unknown");
|
|
}
|
|
|
|
/* if it is my own hostname we are after, then just hand back
|
|
* the value in opal_process_info */
|
|
if (proc == opal_proc_my_name) {
|
|
return strdup(opal_process_info.nodename);
|
|
}
|
|
|
|
/* if we don't already have it, then try to get it */
|
|
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->proc_name,
|
|
(char**)&hostname, PMIX_STRING);
|
|
if (OPAL_SUCCESS != ret) {
|
|
return strdup("unknown"); // return something so the caller doesn't segfault
|
|
}
|
|
|
|
/* user is not allowed to release the data */
|
|
return hostname;
|
|
}
|