3dbd4d9be7
1. taking advantage of the fact that we no longer create the launch message via a GPR trigger. In earlier times, we had the GPR create the launch message based on a subscription. In that mode of operation, we could not guarantee the order in which the data was stored in the message - hence, we had no choice but to parse the message in a loop that checked each value against a list of possible "keys" until the corresponding value was found. Now, however, we construct the message "by hand", so we know precisely what data is in each location in the message. Thus, we no longer need to send the character string "keys" for each data value any more. This represents a rather large savings in the message size - to give you an example, we typically would use a 30-char "key" for a 2-byte data value. As you can see, the overhead can become very large. 2. sending node-specific data only once. Again, because we used to construct the message via subscriptions that were done on a per-proc basis, the data for each node (e.g., the daemon's name, whether or not the node was oversubscribed) would be included in the data for each proc. Thus, the node-specific data was repeated for every proc. Now that we construct the message "by hand", there is no reason to do this any more. Instead, we can insert the data for a specific node only once, and then provide the per-proc data for that node. We therefore not only save all that extra data in the message, but we also only need to parse the per-node data once. The savings become significant at scale. 
Here is a comparison between the revised trunk and the trunk prior to this commit (all data was taken on odin, using openib, 64 nodes, unity message routing, tested with application consisting of mpi_init/mpi_barrier/mpi_finalize, all execution times given in seconds, all launch message sizes in bytes): Per-node scaling, taken at 1ppn: #nodes original trunk revised trunk time size time size 1 0.10 819 0.09 564 2 0.14 1070 0.14 677 3 0.15 1321 0.14 790 4 0.15 1572 0.15 903 8 0.17 2576 0.20 1355 16 0.25 4584 0.21 2259 32 0.28 8600 0.27 4067 64 0.50 16632 0.39 7683 Per-proc scaling, taken at 64 nodes ppn original trunk revised trunk time size time size 1 0.50 16669 0.40 7720 2 0.55 32733 0.54 11048 3 0.87 48797 0.81 14376 4 1.0 64861 0.85 17704 Condensing those numbers, it appears we gained: per-node message size: 251 bytes/node -> 113 bytes/node per-proc message size: 251 bytes/proc -> 52 bytes/proc per-job message size: 568 bytes/job -> 399 bytes/job (job-specific data such as jobid, override oversubscribe flag, total #procs in job, total slots allocated) The fact that the two pre-commit trunk numbers are the same confirms the fact that each proc was containing the node data as well. It isn't quite the 10x message reduction I had hoped to get, but it is significant and gives much better scaling. Note that the timing info was, as usual, pretty chaotic - the numbers cited here were typical across several runs taken after the initial one to avoid NFS file positioning influences. Also note that this commit removes the orte_process_info.vpid_start field and the handful of places that passed that useless value. By definition, all jobs start at vpid=0, so all we were doing is passing "0" around. In fact, many places simply hardwired it to "0" anyway rather than deal with it. This commit was SVN r16428.
119 строки
4.5 KiB
C
119 строки
4.5 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
/** @file
 *
 * Populates global structure with process-specific information.
 */
|
|
|
|
#ifndef _ORTE_PROC_INFO_H_
|
|
#define _ORTE_PROC_INFO_H_
|
|
|
|
#include "orte_config.h"
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif
|
|
#include "orte/mca/ns/ns_types.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/**
|
|
* Process information structure
|
|
*
|
|
* The orte_proc_info() function fills the pid field and obtains the
|
|
* process name, storing that information in the global structure. The
|
|
* structure also holds path names to the universe, job, and process
|
|
* session directories, and to the stdin, stdout, and stderr temp
|
|
* files - however, these are all initialized elsewhere.
|
|
*/
|
|
struct orte_proc_info_t {
|
|
orte_process_name_t *my_name; /**< My official process name */
|
|
orte_process_name_t my_daemon; /**< Name of my local daemon */
|
|
orte_std_cntr_t app_num; /**< our index into the app_context array */
|
|
orte_std_cntr_t universe_size; /**< the size of the universe we are in */
|
|
bool singleton; /**< I am a singleton */
|
|
orte_std_cntr_t num_procs; /**< number of processes in this job */
|
|
orte_vpid_t local_rank; /**< local rank on this node */
|
|
orte_std_cntr_t num_local_procs; /**< total number of procs on this node */
|
|
pid_t pid; /**< Local process ID for this process */
|
|
bool seed; /**< Indicate whether or not this is seed daemon */
|
|
bool daemon; /**< Indicate whether or not I am a daemon */
|
|
char *ns_replica_uri; /**< contact info for name services replica */
|
|
char *gpr_replica_uri; /**< contact info for registry replica */
|
|
orte_process_name_t *ns_replica; /**< Name of my name server replica (NULL=>me) */
|
|
orte_process_name_t *gpr_replica; /**< Name of my registry replica (NULL=>me) */
|
|
char *tmpdir_base; /**< Base directory of the session dir tree */
|
|
char *top_session_dir; /**< Top-most directory of the session tree */
|
|
char *universe_session_dir; /**< Location of universe temp dir.
|
|
* The session directory has the form
|
|
* <prefix><openmpi-sessions-user><universe>, where the prefix
|
|
* can either be provided by the user via the
|
|
* --tmpdir command-line flag, the use of one of several
|
|
* environmental variables, or else a default location.
|
|
*/
|
|
|
|
char *job_session_dir; /**< Session directory for job */
|
|
|
|
char *proc_session_dir; /**< Session directory for the process */
|
|
|
|
char *sock_stdin; /**< Path name to temp file for stdin. */
|
|
char *sock_stdout; /**< Path name to temp file for stdout. */
|
|
char *sock_stderr; /**< Path name to temp file for stderr. */
|
|
};
|
|
typedef struct orte_proc_info_t orte_proc_info_t;
|
|
|
|
|
|
/**
|
|
*
|
|
* Global process info descriptor. Initialized to almost no
|
|
* meaningful information - data is provided by calling \c
|
|
* orte_rte_init() (which calls \c orte_proc_info() to fill in the
|
|
* structure).
|
|
*
|
|
* The exception to this rule is the \c orte_process_info.seed field,
|
|
* which will be initialized to \c false, but should be set to \c true
|
|
* before calling \c orte_rte_info() if the caller is a seed daemon.
|
|
*/
|
|
ORTE_DECLSPEC extern orte_proc_info_t orte_process_info;
|
|
|
|
|
|
/**
|
|
* \internal
|
|
*
|
|
* Global structure to store a wide range of information about the
|
|
* process. orte_proc_info populates a global variable with
|
|
* information about the process being executing. This function should
|
|
* be called only once, from orte_rte_init().
|
|
*
|
|
* @param None.
|
|
*
|
|
* @retval ORTE_SUCCESS Successfully initialized the various fields.
|
|
* @retval OMPI_ERROR Failed to initialize one or more fields.
|
|
*/
|
|
|
|
ORTE_DECLSPEC int orte_proc_info(void);
|
|
|
|
/**
 * Finalization routine declared alongside orte_proc_info().
 *
 * NOTE(review): presumably releases whatever orte_proc_info() stored in
 * the global process info structure and returns an ORTE status code; the
 * implementation is not visible from this header, so confirm before
 * relying on either assumption.
 */
ORTE_DECLSPEC int orte_proc_info_finalize(void);
|
|
|
|
END_C_DECLS
|
|
|
|
#endif
|