33ab928e1b
We currently save the hostname of a proc when we create the ompi_proc_t for it. This was originally done because the only method we had for discovering the host of a proc was to include that info in the modex, and we had to therefore store it somewhere proc-local. Obviously, this carried a memory penalty for storing all those strings, and so we added a "cutoff" parameter so that we wouldn't collect hostnames above a certain number of procs. Unfortunately, this still results in an 8-byte/proc memory cost as we have a char* pointer in the opal_proc_t that is contained in the ompi_proc_t so that we can store the hostname of the other procs if we fall below the cutoff. At scale, this can consume a fair amount of memory. With the switch to relying on PMIx, there is no longer a need to cache the proc hostnames. Using the "optional" feature of PMIx_Get, we restrict the retrieval to be purely proc-local - i.e., we retrieve the info either via shared memory or from within the proc-internal hash storage (depending upon the active PMIx components). Thus, the retrieval of a hostname is purely a local operation involving no communication. All RMs are required to provide a complete hostname map of all procs at startup. Thus, we have full access to all hostnames without including them in a modex or having to cache them on each proc. This allows us to remove the char* pointer from the opal_proc_t, saving us 8 bytes/proc. Unfortunately, PMIx_Get does not currently support the return of a static pointer to memory. Thus, even though PMIx has the hostname in its memory, it can only return a malloc'd version of it. I have therefore ensured that the return from opal_get_proc_hostname is consistently malloc'd and free'd wherever used. This shouldn't be a burden as the hostname is only used in one of two circumstances: (a) in an error message, or (b) in a verbose output for debugging purposes. Thus, there should be no performance penalty associated with the malloc/free requirement.
PMIx will eventually be returning static pointers, and so we can eventually simplify this method and return a "const char*" - but as noted, this really isn't an issue even today. Signed-off-by: Ralph Castain <rhc@pmix.org>
102 строки
4.4 KiB
C
102 строки
4.4 KiB
C
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007      Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2012      Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BTL_BASE_ERROR_H
#define MCA_BTL_BASE_ERROR_H

#include "opal_config.h"

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#include "opal/util/proc.h"

OPAL_DECLSPEC extern int mca_btl_base_verbose;
|
|
|
|
OPAL_DECLSPEC extern int mca_btl_base_err(const char*, ...) __opal_attribute_format__(__printf__, 1, 2);
|
|
OPAL_DECLSPEC extern int mca_btl_base_out(const char*, ...) __opal_attribute_format__(__printf__, 1, 2);
|
|
|
|
/*
 * Emit one message through mca_btl_base_out(): a
 * "[nodename]procname[file:line:function] " prefix, then the caller's
 * formatted text, then a newline (three separate calls).
 *
 * args must be a complete, parenthesized printf-style argument list, e.g.
 *     BTL_OUTPUT(("sent %d bytes", n));
 * because it is pasted directly after the function name.
 *
 * The expansion already ends in ';'.  Writing another ';' after the
 * invocation therefore yields an extra empty statement, so wrap the
 * invocation in braces when it is the body of an `if` that has an `else`.
 */
#define BTL_OUTPUT(args)                                     \
do {                                                         \
    mca_btl_base_out("[%s]%s[%s:%d:%s] ",                    \
                     opal_process_info.nodename,             \
                     OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),     \
                     __FILE__, __LINE__, __func__);          \
    mca_btl_base_out args;                                   \
    mca_btl_base_out("\n");                                  \
} while(0);

/*
 * Emit one message through mca_btl_base_err(): a
 * "[nodename]procname[file:line:function] " prefix, then the caller's
 * formatted text, then a newline (three separate calls).
 *
 * args must be a complete, parenthesized printf-style argument list, e.g.
 *     BTL_ERROR(("connect failed: %d", rc));
 * because it is pasted directly after the function name.
 *
 * The expansion already ends in ';'.  Writing another ';' after the
 * invocation therefore yields an extra empty statement, so wrap the
 * invocation in braces when it is the body of an `if` that has an `else`.
 */
#define BTL_ERROR(args)                                      \
do {                                                         \
    mca_btl_base_err("[%s]%s[%s:%d:%s] ",                    \
                     opal_process_info.nodename,             \
                     OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),     \
                     __FILE__, __LINE__, __func__);          \
    mca_btl_base_err args;                                   \
    mca_btl_base_err("\n");                                  \
} while(0);

/*
 * Emit one error message about a peer through mca_btl_base_err():
 * "procname[file:line:function] from <local nodename> ", then -- only when
 * proc is non-NULL -- "to: <peer hostname> ", then the caller's formatted
 * text, then a newline.
 *
 * proc  peer handle passed to opal_get_proc_hostname(); may be NULL.  It is
 *       evaluated twice when non-NULL, so pass a side-effect-free expression.
 * args  a complete, parenthesized printf-style argument list, e.g.
 *           BTL_PEER_ERROR(peer, ("send failed: %d", rc));
 *
 * The hostname string returned by opal_get_proc_hostname() is released with
 * free() here.  It is declared inside the `if` body so that it is out of
 * scope when args is expanded: a caller variable that happens to share its
 * name can never be shadowed by (and read from) the already-freed copy.
 * Nothing visible in this header rules out a NULL return, so NULL is
 * printed as "unknown" rather than handed to %s.
 *
 * The expansion already ends in ';'.  Writing another ';' after the
 * invocation therefore yields an extra empty statement, so wrap the
 * invocation in braces when it is the body of an `if` that has an `else`.
 */
#define BTL_PEER_ERROR(proc, args)                                   \
do {                                                                 \
    mca_btl_base_err("%s[%s:%d:%s] from %s ",                        \
                     OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),             \
                     __FILE__, __LINE__, __func__,                   \
                     opal_process_info.nodename);                    \
    if (proc) {                                                      \
        char *btl_peer_errhost_ = opal_get_proc_hostname(proc);      \
        mca_btl_base_err("to: %s ",                                  \
                         (NULL != btl_peer_errhost_) ?               \
                             btl_peer_errhost_ : "unknown");         \
        free(btl_peer_errhost_);                                     \
    }                                                                \
    mca_btl_base_err args;                                           \
    mca_btl_base_err("\n");                                          \
} while(0);

/*
 * Debug-only variant of BTL_ERROR.
 *
 * When OPAL_ENABLE_DEBUG is non-zero, the macro emits through
 * mca_btl_base_err() -- prefix "[nodename]procname[file:line:function] ",
 * the caller's formatted text, then a newline -- but only while
 * mca_btl_base_verbose is greater than zero.  Otherwise the macro expands
 * to nothing and args is never evaluated.
 *
 * args must be a complete, parenthesized printf-style argument list, e.g.
 *     BTL_VERBOSE(("posting %d receives", n));
 *
 * In the debug variant the expansion already ends in ';', so writing
 * another ';' after the invocation yields an extra empty statement; wrap
 * the invocation in braces when it is the body of an `if` that has an
 * `else`.
 */
#if OPAL_ENABLE_DEBUG
#define BTL_VERBOSE(args)                                        \
do {                                                             \
    if (mca_btl_base_verbose > 0) {                              \
        mca_btl_base_err("[%s]%s[%s:%d:%s] ",                    \
                         opal_process_info.nodename,             \
                         OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),     \
                         __FILE__, __LINE__, __func__);          \
        mca_btl_base_err args;                                   \
        mca_btl_base_err("\n");                                  \
    }                                                            \
} while(0);
#else
#define BTL_VERBOSE(args)
#endif

#endif
|
|
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
OPAL_DECLSPEC extern void mca_btl_base_error_no_nics(const char* transport,
|
|
const char* nic_name);
|
|
|
|
END_C_DECLS
|