* Add performance code requested by LANL, per ticket #128. It must be
explicitly enabled at run time by setting the MCA parameter io_romio_enable_parallel_optimizations to a non-zero value. When enabled, it turns on some Panasas flags (e.g. the panfs_concurrent_write hint) if the user did not set them explicitly (either on or off), and it handles strided collective writes somewhat more efficiently. This commit was SVN r10516.
This commit is contained in:
Parent: b6663c64c7
Commit: 970d858f30
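
For context (not part of this commit): the optimizations are normally switched on at launch time, for example with mpirun -mca io_romio_enable_parallel_optimizations 1, or left off by keeping the parameter at its default of 0. The sketch below is only an illustration with a hypothetical file name; it shows how an application could confirm that the setting took effect by reading back the ompi_enable_parallel_optimizations info key that ADIOI_GEN_SetInfo attaches in this change, assuming the ROMIO layer reports internally set hints through MPI_File_get_info.

    /* Hypothetical verification sketch, not part of this commit. */
    #include <stdio.h>
    #include <string.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        MPI_File fh;
        MPI_Info info_used;
        char value[MPI_MAX_INFO_VAL + 1];
        int flag;

        MPI_Init(&argc, &argv);
        /* "testfile" is a placeholder path */
        MPI_File_open(MPI_COMM_WORLD, "testfile",
                      MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);

        /* Read back the hints the io/romio component attached to the file. */
        MPI_File_get_info(fh, &info_used);
        MPI_Info_get(info_used, "ompi_enable_parallel_optimizations",
                     MPI_MAX_INFO_VAL, value, &flag);
        if (flag && strcmp(value, "true") == 0) {
            printf("parallel optimizations are enabled for this file\n");
        }

        MPI_Info_free(&info_used);
        MPI_File_close(&fh);
        MPI_Finalize();
        return 0;
    }
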
@@ -23,6 +23,8 @@ AUTOMAKE_OPTIONS = foreign dist-bzip2
# is in the build tree.
AM_CPPFLAGS = \
        -DOMPI_BUILDING=1 \
        -I$(top_ompi_builddir)/ompi/include \
        -I$(top_srcdir)/include \
        -I$(top_srcdir)/adio/include
        -I$(top_ompi_builddir) \
        -I$(top_ompi_builddir)/opal/include \
        -I$(top_ompi_builddir)/ompi/include
# -I$(top_srcdir)/include \
# -I$(top_srcdir)/adio/include

@@ -8,6 +8,7 @@

#include "ad_panfs.h"
#include <pan_fs_client_cw_mode.h>
#include "opal/mca/base/mca_base_param.h"

void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
@@ -22,10 +23,15 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
    unsigned long int layout_total_num_comps = 0;
    pan_fs_client_layout_visit_t layout_visit_policy = PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN;
    int gen_error_code;
    int ompi_parallel_opts = 0;

    *error_code = MPI_SUCCESS;

    if (fd->info == MPI_INFO_NULL) {
        mca_base_param_lookup_int(mca_base_param_find("io", "romio",
                                                      "enable_parallel_optimizations"),
                                  &ompi_parallel_opts);

        /* This must be part of the open call. can set striping parameters
         * if necessary.
         */
@@ -47,6 +53,12 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Info_set(fd->info, "panfs_concurrent_write", value);
    } else if (ompi_parallel_opts != 0) {
        /* ------------------------------------------------------------------------ */
        /* OMPI: User hints supplied, but not panfs_concurrent_write:                */
        /* OMPI: Make panfs_concurrent_write == 1 the default                        */
        /* ------------------------------------------------------------------------ */
        MPI_Info_set(fd->info, "panfs_concurrent_write", "1");
    }

    MPI_Info_get(users_info, "panfs_layout_type", MPI_MAX_INFO_VAL,
@@ -129,6 +141,12 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)

        ADIOI_Free(value);

    } else if (ompi_parallel_opts) {
        /* ------------------------------------------------------------------------ */
        /* OMPI: No user hints supplied.                                             */
        /* OMPI: Make panfs_concurrent_write == 1 the default                        */
        /* ------------------------------------------------------------------------ */
        MPI_Info_set(fd->info, "panfs_concurrent_write", "1");
    }
}

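Also for context (not part of this commit): when the application itself supplies the panfs_concurrent_write hint, the branches above respect the user's value instead of defaulting it to 1. A minimal user-side sketch, with a hypothetical helper name and file path handling:

    /* Hypothetical sketch, not from this commit: pin panfs_concurrent_write
     * to 0 so the PanFS hint code does not substitute its default of "1". */
    #include <mpi.h>

    void open_without_concurrent_write(const char *path, MPI_File *fh)
    {
        MPI_Info info;

        MPI_Info_create(&info);
        MPI_Info_set(info, "panfs_concurrent_write", "0");  /* user's choice wins */

        MPI_File_open(MPI_COMM_WORLD, (char *) path,
                      MPI_MODE_CREATE | MPI_MODE_WRONLY, info, fh);

        MPI_Info_free(&info);  /* the file keeps its own copy of the hints */
    }
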
@@ -5,6 +5,9 @@
 * See COPYRIGHT notice in top-level directory.
 */

#include "ompi_config.h"
#include "opal/mca/base/mca_base_param.h"

#include "adio.h"
#include "adio_extern.h"

@@ -20,6 +23,11 @@ void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
    char *value;
    int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0, len;
    static char myname[] = "ADIOI_GEN_SETINFO";
    int ompi_parallel_opts = 0;

    mca_base_param_lookup_int(mca_base_param_find("io", "romio",
                                                  "enable_parallel_optimizations"),
                              &ompi_parallel_opts);

    if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
    info = fd->info;
@@ -37,6 +45,10 @@ void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
     * previously initialized
     */
    if (!fd->hints->initialized) {
        if (ompi_parallel_opts != 0) {
            MPI_Info_set(info, "ompi_enable_parallel_optimizations", "true");
        }

        /* buffer size for collective I/O */
        MPI_Info_set(info, "cb_buffer_size", ADIOI_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_CB_BUFFER_SIZE_DFLT);
@@ -276,7 +288,13 @@ void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
                MPI_Info_set(info, "cb_nodes", value);
                fd->hints->cb_nodes = intval;
            }
        }

    } else {
        /* OMPI: allow run-time override of cb_nodes during collective calls */
        if (ompi_parallel_opts != 0) {
            MPI_Info_set(info, "ompi_cb_nodes_runtime_override", "true");
        }
    }

    MPI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
                 value, &flag);

@@ -11,6 +11,8 @@
#include "mpe.h"
#endif

#include "limits.h"

/* prototypes of functions used for collective writes only. */
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
                                 datatype, int nprocs, int myrank, ADIOI_Access
@@ -75,7 +77,16 @@ void ADIOI_GEN_WriteStridedColl(ADIO_File fd, void *buf, int count,
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
        *fd_end = NULL, *end_offsets = NULL;
    int *buf_idx = NULL, *len_list = NULL;
    char *value;
    int info_flag, ompi_parallel_opts = 0;
    unsigned long long min_pe_request = ULONG_MAX;
    unsigned long long max_pe_request = 0;
    unsigned long long min_rd_request = ULONG_MAX;
    unsigned long long max_rd_request = 0;

    MPI_Info_get(fd->info, "ompi_enable_parallel_optimizations", MPI_MAX_INFO_VAL, value,
                 &info_flag);
    if (info_flag) ompi_parallel_opts = 1;

#ifdef PROFILE
    MPE_Log_event(13, 0, "start computation");
@@ -105,14 +116,77 @@ void ADIOI_GEN_WriteStridedColl(ADIO_File fd, void *buf, int count,
    /* each process communicates its start and end offsets to other
       processes. The result is an array each of start and end offsets stored
       in order of process rank. */

    st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
    end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

    MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                  ADIO_OFFSET, fd->comm);
    MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                  ADIO_OFFSET, fd->comm);
    if (ompi_parallel_opts) {
        /* OMPI: reduce the collective calls from 2 to 1, to improve scaling */
        ADIO_Offset *stend_offsets, min_rd_st_offset, max_rd_end_offset, total_rd_size;
        ADIO_Offset my_offsets[2];
        int nprocs_for_creq;

        stend_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
        my_offsets[0] = start_offset;
        my_offsets[1] = end_offset;

        MPI_Allgather(my_offsets, 2, ADIO_OFFSET, stend_offsets, 2, ADIO_OFFSET, fd->comm);
        min_rd_st_offset = stend_offsets[0];
        max_rd_end_offset = stend_offsets[1];
        for (i=0; i<nprocs; i++)
        {
            st_offsets [i] = stend_offsets[i*2 ];
            end_offsets[i] = stend_offsets[i*2+1];
            min_rd_st_offset = ADIOI_MIN(st_offsets [i], min_rd_st_offset);
            max_rd_end_offset = ADIOI_MAX(end_offsets[i], max_rd_end_offset);
            min_pe_request = ADIOI_MIN((ADIO_Offset) min_pe_request, end_offsets[i]-st_offsets[i]+1);
            max_pe_request = ADIOI_MAX((ADIO_Offset) max_pe_request, end_offsets[i]-st_offsets[i]+1);
        }
        min_rd_request = ADIOI_MIN((ADIO_Offset) min_rd_request, max_rd_end_offset-min_rd_st_offset+1);
        max_rd_request = ADIOI_MAX((ADIO_Offset) max_rd_request, max_rd_end_offset-min_rd_st_offset+1);
        ADIOI_Free(stend_offsets);

        MPI_Info_get(fd->info, "ompi_cb_nodes_runtime_override", MPI_MAX_INFO_VAL, value,
                     &info_flag);
        if (info_flag) {
            /* ------------------------------------------------------------------ */
            /* OMPI: swh@lanl.gov (Steve Hodson):                                  */
            /* If the user has not specified cb_nodes, calculate it as follows:    */
            /* 1) nprocs_for_coll depends initially on the collective request      */
            /*    size. For larger requests the denominator is directly            */
            /*    proportional to the number of times the collective buffer is     */
            /*    reused per request.                                              */
            /* 2) nprocs_for_coll is limited to 1/4 the number of processes.       */
            /* 3) nprocs_for_coll is at least 1/32 the number of processes.        */
            /* 4) nprocs_for_coll is limited to the range 1-32. Need at least 1,   */
            /*    but don't exceed the expected number of disks in use at a time.  */
            /* 5) nprocs_for_coll is rounded down to an even value (workaround).   */
            /* 6) nprocs_for_coll is at least 2 for more than 15 processes,        */
            /*    regardless of how small the collective request is.               */
            /* Caveat:                                                             */
            /*    The preceding recipe was arrived at empirically for the          */
            /*    Panasas file system on Flash. Applicability to other file        */
            /*    systems needs to be demonstrated.                                */
            /* Caution: Care must be taken below to make sure that nprocs_for_coll */
            /*    NEVER exceeds the default aggregator configuration list built    */
            /*    once in open: ADIOI_cb_config_list_parse. Since nprocs_for_coll  */
            /*    is usually less than this number, only a subset of the           */
            /*    previously allocated aggregators will be used.                   */
            /* ------------------------------------------------------------------ */
            total_rd_size = max_rd_end_offset - min_rd_st_offset + 1;
            nprocs_for_creq = (int)(total_rd_size / (8 * 1024 * 1024));
            nprocs_for_coll = ADIOI_MIN(nprocs_for_creq, nprocs/4);
            nprocs_for_coll = ADIOI_MAX(nprocs_for_coll, nprocs/32);
            nprocs_for_coll = ADIOI_MAX(nprocs_for_coll, 1);
            nprocs_for_coll = ADIOI_MIN(nprocs_for_coll, 32);
            if (nprocs_for_coll > 1 && nprocs_for_coll%2) nprocs_for_coll--;
            if (nprocs > 15) nprocs_for_coll = ADIOI_MAX(nprocs_for_coll, 2);
        }
    } else {
        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                      ADIO_OFFSET, fd->comm);
    }

    /* are the accesses of different processes interleaved? */
    for (i=1; i<nprocs; i++)

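To make the aggregator recipe above concrete, a worked example with hypothetical numbers (not taken from the commit): with nprocs = 256 and an aggregate collective request of 512 MiB, nprocs_for_creq = 512 MiB / 8 MiB = 64; the cap of nprocs/4 = 64 and the hard limit of 32 then give nprocs_for_coll = 32, which is already even and above the minimum of 2, so 32 aggregators are used. A small 16 MiB request on the same 256 processes instead yields nprocs_for_creq = 2, is raised to the floor of nprocs/32 = 8, and ends up with 8 aggregators.
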
@@ -45,8 +45,6 @@ if test -f "$srcdir/../../../../../config/mca_configure.ac"; then

    top_ompi_srcdir='$(top_srcdir)/../../../../..'
    top_ompi_builddir='$(top_builddir)/../../../../..'
    OMPI_CFLAGS='-I$(top_srcdir)/include -I$(top_srcdir) -I$(top_builddir) -I$(top_srcdir)/opal -I$(top_builddir)/opal -I$(top_srcdir)/orte -I$(top_builddir)/orte -I$(top_srcdir)/ompi -I$(top_builddir)/ompi'" $CPPFLAGS"
    found_ompi_headers=1
    AC_MSG_RESULT([in Open MPI source tree -- good])
    AC_SUBST(top_ompi_srcdir)
    AC_SUBST(top_ompi_builddir)

@@ -141,6 +141,11 @@ static int open_component(void)
                           "Delete priority of the io romio component",
                           false, false, 10, NULL);

    mca_base_param_reg_int(&mca_io_romio_component.io_version,
                           "enable_parallel_optimizations",
                           "Enable set of Open MPI-added options to improve collective file i/o performance",
                           false, false, 0, NULL);

    /* Create the mutex */
    OBJ_CONSTRUCT(&mca_io_romio_mutex, opal_mutex_t);