/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2013 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008-2015 University of Houston. All rights reserved.
 * Copyright (c) 2011-2015 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2012-2013 Inria.  All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "ompi/runtime/params.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/topo/topo.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/info/info.h"
#include "ompi/request/request.h"

#include <math.h>
#include <unistd.h>

#ifdef HAVE_SYS_STATFS_H
#include <sys/statfs.h> /* or <sys/vfs.h> */
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#include "io_ompio.h"

mca_io_ompio_print_queue *coll_write_time=NULL;
mca_io_ompio_print_queue *coll_read_time=NULL;


static int mca_io_ompio_create_groups(mca_io_ompio_file_t *fh,
                                      size_t bytes_per_proc);


static int mca_io_ompio_prepare_to_group(mca_io_ompio_file_t *fh,
                                         OMPI_MPI_OFFSET_TYPE **start_offsets_lens,
                                         OMPI_MPI_OFFSET_TYPE **end_offsets,
                                         OMPI_MPI_OFFSET_TYPE **aggr_bytes_per_group,
                                         OMPI_MPI_OFFSET_TYPE *bytes_per_group,
					 int **decision_list,
                                         size_t bytes_per_proc,
                                         int *is_aggregator,
                                         int *ompio_grouping_flag);

static int mca_io_ompio_retain_initial_groups(mca_io_ompio_file_t *fh);


static int mca_io_ompio_split_initial_groups(mca_io_ompio_file_t *fh,
		                             OMPI_MPI_OFFSET_TYPE *start_offsets_lens,
					     OMPI_MPI_OFFSET_TYPE *end_offsets,
					     OMPI_MPI_OFFSET_TYPE bytes_per_group);


static int mca_io_ompio_split_a_group(mca_io_ompio_file_t *fh,
                                      OMPI_MPI_OFFSET_TYPE *start_offsets_lens,
                                      OMPI_MPI_OFFSET_TYPE *end_offsets,
                                      int size_new_group,
                                      OMPI_MPI_OFFSET_TYPE *max_cci,
                                      OMPI_MPI_OFFSET_TYPE *min_cci,
                                      int *num_groups,
                                      int *size_smallest_group);


static int mca_io_ompio_finalize_split(mca_io_ompio_file_t *fh,
                                       int size_new_group,
                                       int size_last_group);

static int mca_io_ompio_merge_initial_groups(mca_io_ompio_file_t *fh,
		                             OMPI_MPI_OFFSET_TYPE *aggr_bytes_per_group,
					     int *decision_list,
					     int is_aggregator);

static int mca_io_ompio_merge_groups(mca_io_ompio_file_t *fh,
                                     int *merge_aggrs,
                                     int num_merge_aggrs);


int ompi_io_ompio_set_file_defaults (mca_io_ompio_file_t *fh)
{

   if (NULL != fh) {
        ompi_datatype_t *types[2];
        int blocklen[2] = {1, 1};
        OPAL_PTRDIFF_TYPE d[2], base;
        int i;

        fh->f_io_array = NULL;
        fh->f_perm = OMPIO_PERM_NULL;
        fh->f_flags = 0;
        fh->f_bytes_per_agg = mca_io_ompio_bytes_per_agg;
        fh->f_datarep = strdup ("native");

        fh->f_offset = 0;
        fh->f_disp = 0;
        fh->f_position_in_file_view = 0;
        fh->f_index_in_file_view = 0;
        fh->f_total_bytes = 0;

        fh->f_init_procs_per_group = -1;
        fh->f_init_procs_in_group = NULL;

	fh->f_procs_per_group = -1;
        fh->f_procs_in_group = NULL;

        fh->f_init_num_aggrs = -1;
        fh->f_init_aggr_list = NULL;


        /* Default file View */
        fh->f_iov_type = MPI_DATATYPE_NULL;
        fh->f_stripe_size = mca_io_ompio_bytes_per_agg;
	/*Decoded iovec of the file-view*/
	fh->f_decoded_iov = NULL;
        fh->f_etype = NULL;
        fh->f_filetype = NULL;
        fh->f_orig_filetype = NULL;

	mca_io_ompio_set_view_internal(fh,
				       0,
				       &ompi_mpi_byte.dt,
				       &ompi_mpi_byte.dt,
                                       "native",
				       fh->f_info);


	/*Create a derived datatype for the created iovec */
	types[0] = &ompi_mpi_long.dt;
        types[1] = &ompi_mpi_long.dt;

        d[0] = (OPAL_PTRDIFF_TYPE) fh->f_decoded_iov;
        d[1] = (OPAL_PTRDIFF_TYPE) &fh->f_decoded_iov[0].iov_len;

        base = d[0];
        for (i=0 ; i<2 ; i++) {
            d[i] -= base;
        }

        ompi_datatype_create_struct (2,
                                     blocklen,
                                     d,
                                     types,
                                     &fh->f_iov_type);
        ompi_datatype_commit (&fh->f_iov_type);

        return OMPI_SUCCESS;
    }
    else {
        return OMPI_ERROR;
    }
}

int ompi_io_ompio_generate_current_file_view (struct mca_io_ompio_file_t *fh,
                                              size_t max_data,
                                              struct iovec **f_iov,
                                              int *iov_count)
{

    struct iovec *iov = NULL;
    size_t bytes_to_write;
    size_t sum_previous_counts = 0;
    int j, k;
    int block = 1;

   /* allocate an initial iovec, will grow if needed */
    iov = (struct iovec *) calloc
        (OMPIO_IOVEC_INITIAL_SIZE, sizeof (struct iovec));
    if (NULL == iov) {
        opal_output(1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    sum_previous_counts = fh->f_position_in_file_view;
    j = fh->f_index_in_file_view;
    bytes_to_write = max_data;
    k = 0;

    while (bytes_to_write) {
        OPAL_PTRDIFF_TYPE disp;
        /* reallocate if needed */
        if (OMPIO_IOVEC_INITIAL_SIZE*block <= k) {
            block ++;
            iov = (struct iovec *)realloc
                (iov, OMPIO_IOVEC_INITIAL_SIZE *block *sizeof(struct iovec));
            if (NULL == iov) {
                opal_output(1, "OUT OF MEMORY\n");
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }

        if (fh->f_decoded_iov[j].iov_len -
            (fh->f_total_bytes - sum_previous_counts) <= 0) {
            sum_previous_counts += fh->f_decoded_iov[j].iov_len;
            j = j + 1;
            if (j == (int)fh->f_iov_count) {
                j = 0;
                sum_previous_counts = 0;
                fh->f_offset += fh->f_view_extent;
                fh->f_position_in_file_view = sum_previous_counts;
                fh->f_index_in_file_view = j;
                fh->f_total_bytes = 0;
            }
        }

        disp = (OPAL_PTRDIFF_TYPE)(fh->f_decoded_iov[j].iov_base) +
            (fh->f_total_bytes - sum_previous_counts);
        iov[k].iov_base = (IOVBASE_TYPE *)(intptr_t)(disp + fh->f_offset);

        if ((fh->f_decoded_iov[j].iov_len -
             (fh->f_total_bytes - sum_previous_counts))
            >= bytes_to_write) {
            iov[k].iov_len = bytes_to_write;
        }
        else {
            iov[k].iov_len =  fh->f_decoded_iov[j].iov_len -
                (fh->f_total_bytes - sum_previous_counts);
        }

        fh->f_total_bytes += iov[k].iov_len;
        bytes_to_write -= iov[k].iov_len;
        k = k + 1;
    }
    fh->f_position_in_file_view = sum_previous_counts;
    fh->f_index_in_file_view = j;
    *iov_count = k;
    *f_iov = iov;

    if (mca_io_ompio_record_offset_info){

	int tot_entries=0, *recvcounts=NULL, *displs=NULL;
	mca_io_ompio_offlen_array_t *per_process=NULL;
	mca_io_ompio_offlen_array_t  *all_process=NULL;
	int *sorted=NULL, *column_list=NULL, *values=NULL;
	int *row_index=NULL, i=0, l=0, m=0;
	int column_index=0, r_index=0;
	int blocklen[3] = {1, 1, 1};
	OPAL_PTRDIFF_TYPE d[3], base;
	ompi_datatype_t *types[3];
	ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
	int **adj_matrix=NULL;
	FILE *fp;


        recvcounts = (int *) malloc (fh->f_size * sizeof(int));
        if (NULL == recvcounts){
            return OMPI_ERR_OUT_OF_RESOURCE;
	}
        displs = (int *) malloc (fh->f_size * sizeof(int));
        if (NULL == displs){
            free(recvcounts);
            return OMPI_ERR_OUT_OF_RESOURCE;
	}

        fh->f_comm->c_coll.coll_gather (&k,
                                        1,
                                        MPI_INT,
                                        recvcounts,
                                        1,
					MPI_INT,
                                        OMPIO_ROOT,
                                        fh->f_comm,
                                        fh->f_comm->c_coll.coll_gather_module);

        per_process = (mca_io_ompio_offlen_array_t *)
	    malloc (k * sizeof(mca_io_ompio_offlen_array_t));
	if (NULL == per_process){
            opal_output(1,"Error while allocating per process!\n");
            free(recvcounts);
            free(displs);
            return  OMPI_ERR_OUT_OF_RESOURCE;
        }
        for (i=0;i<k;i++){
            per_process[i].offset =
                (OMPI_MPI_OFFSET_TYPE)(intptr_t)iov[i].iov_base;
            per_process[i].length =
                (MPI_Aint)iov[i].iov_len;
            per_process[i].process_id = fh->f_rank;
        }

	types[0] = &ompi_mpi_long.dt;
        types[1] = &ompi_mpi_long.dt;
        types[2] = &ompi_mpi_int.dt;

	d[0] = (OPAL_PTRDIFF_TYPE)&per_process[0];
        d[1] = (OPAL_PTRDIFF_TYPE)&per_process[0].length;
        d[2] = (OPAL_PTRDIFF_TYPE)&per_process[0].process_id;
        base = d[0];
        for (i=0;i<3;i++){
            d[i] -= base;
        }
        ompi_datatype_create_struct (3,
                                     blocklen,
                                     d,
                                     types,
                                     &io_array_type);
        ompi_datatype_commit (&io_array_type);

	if (OMPIO_ROOT == fh->f_rank){
            tot_entries = recvcounts[0];
            displs[0] = 0;
            for(i=1;i<fh->f_size;i++){
                displs[i] = displs[i-1] + recvcounts[i-1];
                tot_entries += recvcounts[i];
            }
            all_process = (mca_io_ompio_offlen_array_t *)
                malloc (tot_entries * sizeof(mca_io_ompio_offlen_array_t));
            if (NULL == all_process){
                opal_output(1,"Error while allocating per process!\n");
                free(per_process);
                free(recvcounts);
                free(displs);
                return  OMPI_ERR_OUT_OF_RESOURCE;
            }

            sorted = (int *) malloc
                (tot_entries * sizeof(int));
            if (NULL == sorted){
                opal_output(1,"Error while allocating per process!\n");
                free(all_process);
                free(per_process);
                free(recvcounts);
                free(displs);
                return  OMPI_ERR_OUT_OF_RESOURCE;
            }

            adj_matrix = (int **) malloc (fh->f_size *
                                          sizeof(int *));
            if (NULL == adj_matrix) {
                opal_output(1,"Error while allocating per process!\n");
                free(sorted);
                free(all_process);
                free(per_process);
                free(recvcounts);
                free(displs);
                return  OMPI_ERR_OUT_OF_RESOURCE;
            }
            for (i=0;i<fh->f_size;i++){
                adj_matrix[i] = (int *) malloc (fh->f_size *
                                                sizeof (int ));
                if (NULL == adj_matrix[i]) {
                    for (j=0; j<i; j++) {
                        free(adj_matrix[j]);
                    }
                    free(adj_matrix);
                    free(sorted);
                    free(all_process);
                    free(per_process);
                    free(recvcounts);
                    free(displs);
                    return  OMPI_ERR_OUT_OF_RESOURCE;
                 }
            }

            for (i=0;i<fh->f_size;i++){
                for (j=0;j<fh->f_size;j++){
                    adj_matrix[i][j] = 0;
                }
            }
	}
	fh->f_comm->c_coll.coll_gatherv (per_process,
					 k,
					 io_array_type,
					 all_process,
					 recvcounts,
					 displs,
					 io_array_type,
					 OMPIO_ROOT,
					 fh->f_comm,
					 fh->f_comm->c_coll.coll_gatherv_module);

	ompi_datatype_destroy(&io_array_type);

	if (OMPIO_ROOT == fh->f_rank){

	    ompi_io_ompio_sort_offlen(all_process,
				      tot_entries,
				      sorted);

	    for (i=0;i<tot_entries-1;i++){
		j = all_process[sorted[i]].process_id;
		l = all_process[sorted[i+1]].process_id;
		adj_matrix[j][l] += 1;
		adj_matrix[l][j] += 1;
	    }

	    /*Compress sparse matrix based on CRS to write to file */
	    m = 0;
	    for (i=0; i<fh->f_size; i++){
		for (j=0; j<fh->f_size; j++){
		    if (adj_matrix[i][j] > 0){
			m++;
		    }
		}
	    }
	    fp = fopen("fileview_info.out", "w+");
            if ( NULL == fp ) {
                for (i=0; i<fh->f_size; i++) {
                    free(adj_matrix[i]);
                }
                free(adj_matrix);
                free(sorted);
                free(all_process);
                free(per_process);
                free(recvcounts);
                free(displs);
		return MPI_ERR_OTHER;
            }
	    fprintf(fp,"FILEVIEW\n");
	    column_list = (int *) malloc ( m * sizeof(int));
	    if (NULL == column_list){
		opal_output(1,"Error while allocating column list\n");
                fclose(fp);
                for (i=0; i<fh->f_size; i++) {
                    free(adj_matrix[i]);
                }
                free(adj_matrix);
                free(sorted);
                free(all_process);
                free(per_process);
                free(recvcounts);
                free(displs);
		return OMPI_ERR_OUT_OF_RESOURCE;
	    }
	    values = (int *) malloc ( m * sizeof(int));
	    if (NULL == values){
		opal_output(1,"Error while allocating values list\n");
                fclose(fp);
                for (i=0; i<fh->f_size; i++) {
                    free(adj_matrix[i]);
                }
                free(adj_matrix);
                free(column_list);
                free(sorted);
                free(all_process);
                free(per_process);
                free(recvcounts);
                free(displs);
		return OMPI_ERR_OUT_OF_RESOURCE;
	    }

	    row_index = (int *) malloc ((fh->f_size + 1) *
					sizeof(int));
	    if (NULL == row_index){
		opal_output(1,"Error while allocating row_index list\n");
                fclose(fp);
                for (i=0; i<fh->f_size; i++) {
                    free(adj_matrix[i]);
                }
                free(adj_matrix);
                free(values);
                free(column_list);
                free(sorted);
                free(all_process);
                free(per_process);
                free(recvcounts);
                free(displs);
		return OMPI_ERR_OUT_OF_RESOURCE;
	    }
	    fprintf(fp,"%d %d\n", m, fh->f_size+1);
	    column_index = 0;
	    r_index = 1;
	    row_index[0] = r_index;
	    for (i=0; i<fh->f_size; i++){
		for (j=0; j<fh->f_size; j++){
		    if (adj_matrix[i][j] > 0){
			values[column_index]= adj_matrix[i][j];
			column_list[column_index]= j;
			fprintf(fp,"%d ", column_list[column_index]);
			column_index++;
			r_index++;
		    }

		}
		row_index[i+1]= r_index;
	    }

	    fprintf(fp,"\n");
	    for (i=0; i<m;i++){
		fprintf(fp, "%d ", values[i]);
	    }
	    fprintf(fp, "\n");
	    for (i=0; i< (fh->f_size + 1); i++){
		fprintf(fp, "%d ", row_index[i]);
	    }
	    fprintf(fp, "\n");
	    fclose(fp);

	    if (NULL != recvcounts){
		free(recvcounts);
		recvcounts = NULL;
	    }
	    if (NULL != displs){
		free(displs);
		displs = NULL;
	    }
	    if (NULL != sorted){
		free(sorted);
		sorted = NULL;
	    }
	    if (NULL != per_process){
		free(per_process);
		per_process = NULL;
	    }
	    if (NULL != all_process){
		free(all_process);
		all_process = NULL;
	    }
	    free(column_list);
	    free(values);
	    if (NULL != row_index){
		free(row_index);
		row_index = NULL;
	    }
	    if (NULL != adj_matrix){
		for (i=0;i<fh->f_size;i++){
		    free(adj_matrix[i]);
		}
		free(adj_matrix);
		adj_matrix = NULL;
	    }
	}
    }
    return OMPI_SUCCESS;
}

int ompi_io_ompio_set_explicit_offset (mca_io_ompio_file_t *fh,
                                       OMPI_MPI_OFFSET_TYPE offset)
{
    int i = 0;
    int k = 0;

    if ( fh->f_view_size  > 0 ) {
	/* starting offset of the current copy of the filew view */
	fh->f_offset = (fh->f_view_extent *
			((offset*fh->f_etype_size) / fh->f_view_size)) + fh->f_disp;


	/* number of bytes used within the current copy of the file view */
	fh->f_total_bytes = (offset*fh->f_etype_size) % fh->f_view_size;
	i = fh->f_total_bytes;


	/* Initialize the block id and the starting offset of the current block
	   within the current copy of the file view to zero */
	fh->f_index_in_file_view = 0;
	fh->f_position_in_file_view = 0;

	/* determine block id that the offset is located in and
	   the starting offset of that block */
	k = fh->f_decoded_iov[fh->f_index_in_file_view].iov_len;
	while (i >= k) {
	    fh->f_position_in_file_view = k;
	    fh->f_index_in_file_view++;
	    k += fh->f_decoded_iov[fh->f_index_in_file_view].iov_len;
	}
    }

    return OMPI_SUCCESS;
}

int ompi_io_ompio_decode_datatype (struct mca_io_ompio_file_t *fh,
                                   ompi_datatype_t *datatype,
                                   int count,
                                   const void *buf,
                                   size_t *max_data,
                                   struct iovec **iov,
                                   uint32_t *iovec_count)
{



    opal_convertor_t convertor;
    size_t remaining_length = 0;
    uint32_t i;
    uint32_t temp_count;
    struct iovec *temp_iov=NULL;
    size_t temp_data;


    opal_convertor_clone (fh->f_convertor, &convertor, 0);

    if (OMPI_SUCCESS != opal_convertor_prepare_for_send (&convertor,
                                                         &(datatype->super),
                                                         count,
                                                         buf)) {
        opal_output (1, "Cannot attach the datatype to a convertor\n");
        return OMPI_ERROR;
    }

    if ( 0 == datatype->super.size ) {
	*max_data = 0;
	*iovec_count = 0;
	*iov = NULL;
	return OMPI_SUCCESS;
    }

    remaining_length = count * datatype->super.size;

    temp_count = OMPIO_IOVEC_INITIAL_SIZE;
    temp_iov = (struct iovec*)malloc(temp_count * sizeof(struct iovec));
    if (NULL == temp_iov) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    while (0 == opal_convertor_raw(&convertor,
				   temp_iov,
                                   &temp_count,
                                   &temp_data)) {
#if 0
        printf ("%d: New raw extraction (iovec_count = %d, max_data = %lu)\n",
                fh->f_rank,temp_count, (unsigned long)temp_data);
        for (i = 0; i < temp_count; i++) {
            printf ("%d: \t{%p, %lu}\n",fh->f_rank,
		    temp_iov[i].iov_base,
		    (unsigned long)temp_iov[i].iov_len);
        }
#endif

        *iovec_count = *iovec_count + temp_count;
        *max_data = *max_data + temp_data;
        *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec));
        if (NULL == *iov) {
            opal_output(1, "OUT OF MEMORY\n");
            free(temp_iov);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        for (i=0 ; i<temp_count ; i++) {
            (*iov)[i+(*iovec_count-temp_count)].iov_base = temp_iov[i].iov_base;
            (*iov)[i+(*iovec_count-temp_count)].iov_len = temp_iov[i].iov_len;
        }

        remaining_length -= temp_data;
        temp_count = OMPIO_IOVEC_INITIAL_SIZE;
    }
#if 0
    printf ("%d: LAST raw extraction (iovec_count = %d, max_data = %d)\n",
            fh->f_rank,temp_count, temp_data);
    for (i = 0; i < temp_count; i++) {
        printf ("%d: \t offset[%d]: %ld; length[%d]: %ld\n", fh->f_rank,i,temp_iov[i].iov_base, i,temp_iov[i].iov_len);
    }
#endif
    *iovec_count = *iovec_count + temp_count;
    *max_data = *max_data + temp_data;
    if ( temp_count > 0 ) {
	*iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec));
	if (NULL == *iov) {
	    opal_output(1, "OUT OF MEMORY\n");
            free(temp_iov);
	    return OMPI_ERR_OUT_OF_RESOURCE;
	}
    }
    for (i=0 ; i<temp_count ; i++) {
        (*iov)[i+(*iovec_count-temp_count)].iov_base = temp_iov[i].iov_base;
        (*iov)[i+(*iovec_count-temp_count)].iov_len = temp_iov[i].iov_len;
    }

    remaining_length -= temp_data;

#if 0
    if (0 == fh->f_rank) {
        printf ("%d Entries: \n",*iovec_count);
        for (i=0 ; i<*iovec_count ; i++) {
            printf ("\t{%p, %d}\n",
                    (*iov)[i].iov_base,
                    (*iov)[i].iov_len);
        }
    }
#endif
    if (remaining_length != 0) {
        printf( "Not all raw description was been extracted (%lu bytes missing)\n",
                (unsigned long) remaining_length );
    }

    free (temp_iov);

    return OMPI_SUCCESS;
}

int ompi_io_ompio_sort (mca_io_ompio_io_array_t *io_array,
                        int num_entries,
                        int *sorted)
{
    int i = 0;
    int j = 0;
    int left = 0;
    int right = 0;
    int largest = 0;
    int heap_size = num_entries - 1;
    int temp = 0;
    unsigned char done = 0;
    int* temp_arr = NULL;

    temp_arr = (int*)malloc(num_entries*sizeof(int));
    if (NULL == temp_arr) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    temp_arr[0] = 0;
    for (i = 1; i < num_entries; ++i) {
        temp_arr[i] = i;
    }
    /* num_entries can be a large no. so NO RECURSION */
    for (i = num_entries/2-1 ; i>=0 ; i--) {
        done = 0;
        j = i;
        largest = j;

        while (!done) {
            left = j*2+1;
            right = j*2+2;
            if ((left <= heap_size) &&
                (io_array[temp_arr[left]].offset > io_array[temp_arr[j]].offset)) {
                largest = left;
            }
            else {
                largest = j;
            }
            if ((right <= heap_size) &&
                (io_array[temp_arr[right]].offset >
                 io_array[temp_arr[largest]].offset)) {
                largest = right;
            }
            if (largest != j) {
                temp = temp_arr[largest];
                temp_arr[largest] = temp_arr[j];
                temp_arr[j] = temp;
                j = largest;
            }
            else {
                done = 1;
            }
        }
    }

    for (i = num_entries-1; i >=1; --i) {
        temp = temp_arr[0];
        temp_arr[0] = temp_arr[i];
        temp_arr[i] = temp;
        heap_size--;
        done = 0;
        j = 0;
        largest = j;

        while (!done) {
            left =  j*2+1;
            right = j*2+2;

            if ((left <= heap_size) &&
                (io_array[temp_arr[left]].offset >
                 io_array[temp_arr[j]].offset)) {
                largest = left;
            }
            else {
                largest = j;
            }
            if ((right <= heap_size) &&
                (io_array[temp_arr[right]].offset >
                 io_array[temp_arr[largest]].offset)) {
                largest = right;
            }
            if (largest != j) {
                temp = temp_arr[largest];
                temp_arr[largest] = temp_arr[j];
                temp_arr[j] = temp;
                j = largest;
            }
            else {
                done = 1;
            }
        }
        sorted[i] = temp_arr[i];
    }
    sorted[0] = temp_arr[0];

    if (NULL != temp_arr) {
        free(temp_arr);
        temp_arr = NULL;
    }
    return OMPI_SUCCESS;
}

int ompi_io_ompio_sort_iovec (struct iovec *iov,
                              int num_entries,
                              int *sorted)
{
    int i = 0;
    int j = 0;
    int left = 0;
    int right = 0;
    int largest = 0;
    int heap_size = num_entries - 1;
    int temp = 0;
    unsigned char done = 0;
    int* temp_arr = NULL;

    if (0 == num_entries) {
        return OMPI_SUCCESS;
    }

    temp_arr = (int*)malloc(num_entries*sizeof(int));
    if (NULL == temp_arr) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    temp_arr[0] = 0;
    for (i = 1; i < num_entries; ++i) {
        temp_arr[i] = i;
    }
    /* num_entries can be a large no. so NO RECURSION */
    for (i = num_entries/2-1 ; i>=0 ; i--) {
        done = 0;
        j = i;
        largest = j;

        while (!done) {
            left = j*2+1;
            right = j*2+2;
            if ((left <= heap_size) &&
                (iov[temp_arr[left]].iov_base > iov[temp_arr[j]].iov_base)) {
                largest = left;
            }
            else {
                largest = j;
            }
            if ((right <= heap_size) &&
                (iov[temp_arr[right]].iov_base >
                 iov[temp_arr[largest]].iov_base)) {
                largest = right;
            }
            if (largest != j) {
                temp = temp_arr[largest];
                temp_arr[largest] = temp_arr[j];
                temp_arr[j] = temp;
                j = largest;
            }
            else {
                done = 1;
            }
        }
    }

    for (i = num_entries-1; i >=1; --i) {
        temp = temp_arr[0];
        temp_arr[0] = temp_arr[i];
        temp_arr[i] = temp;
        heap_size--;
        done = 0;
        j = 0;
        largest = j;

        while (!done) {
            left =  j*2+1;
            right = j*2+2;

            if ((left <= heap_size) &&
                (iov[temp_arr[left]].iov_base >
                 iov[temp_arr[j]].iov_base)) {
                largest = left;
            }
            else {
                largest = j;
            }
            if ((right <= heap_size) &&
                (iov[temp_arr[right]].iov_base >
                 iov[temp_arr[largest]].iov_base)) {
                largest = right;
            }
            if (largest != j) {
                temp = temp_arr[largest];
                temp_arr[largest] = temp_arr[j];
                temp_arr[j] = temp;
                j = largest;
            }
            else {
                done = 1;
            }
        }
        sorted[i] = temp_arr[i];
    }
    sorted[0] = temp_arr[0];

    if (NULL != temp_arr) {
        free(temp_arr);
        temp_arr = NULL;
    }
    return OMPI_SUCCESS;
}

int ompi_io_ompio_sort_offlen (mca_io_ompio_offlen_array_t *io_array,
                               int num_entries,
                               int *sorted){

    int i = 0;
    int j = 0;
    int left = 0;
    int right = 0;
    int largest = 0;
    int heap_size = num_entries - 1;
    int temp = 0;
    unsigned char done = 0;
    int* temp_arr = NULL;

    temp_arr = (int*)malloc(num_entries*sizeof(int));
    if (NULL == temp_arr) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    temp_arr[0] = 0;
    for (i = 1; i < num_entries; ++i) {
        temp_arr[i] = i;
    }
    /* num_entries can be a large no. so NO RECURSION */
    for (i = num_entries/2-1 ; i>=0 ; i--) {
        done = 0;
        j = i;
        largest = j;

        while (!done) {
	    left = j*2+1;
            right = j*2+2;
            if ((left <= heap_size) &&
                (io_array[temp_arr[left]].offset > io_array[temp_arr[j]].offset)) {
                largest = left;
            }
            else {
                largest = j;
            }
            if ((right <= heap_size) &&
                (io_array[temp_arr[right]].offset >
                 io_array[temp_arr[largest]].offset)) {
                largest = right;
            }
            if (largest != j) {
                temp = temp_arr[largest];
                temp_arr[largest] = temp_arr[j];
                temp_arr[j] = temp;
                j = largest;
            }
            else {
                done = 1;
            }
        }
    }

    for (i = num_entries-1; i >=1; --i) {
        temp = temp_arr[0];
        temp_arr[0] = temp_arr[i];
        temp_arr[i] = temp;
        heap_size--;
        done = 0;
        j = 0;
        largest = j;

        while (!done) {
            left =  j*2+1;
            right = j*2+2;

            if ((left <= heap_size) &&
                (io_array[temp_arr[left]].offset >
                 io_array[temp_arr[j]].offset)) {
		largest = left;
            }
            else {
                largest = j;
            }
            if ((right <= heap_size) &&
                (io_array[temp_arr[right]].offset >
                 io_array[temp_arr[largest]].offset)) {
                largest = right;
            }
            if (largest != j) {
                temp = temp_arr[largest];
                temp_arr[largest] = temp_arr[j];
                temp_arr[j] = temp;
                j = largest;
            }
            else {
                done = 1;
            }
        }
        sorted[i] = temp_arr[i];
    }
    sorted[0] = temp_arr[0];

    if (NULL != temp_arr) {
        free(temp_arr);
        temp_arr = NULL;
    }
    return OMPI_SUCCESS;
}

int ompi_io_ompio_set_aggregator_props (struct mca_io_ompio_file_t *fh,
                                        int num_aggregators,
                                        size_t bytes_per_proc)
{
    int j,procs_per_group = 0;

    /*If only one process used, no need to do aggregator selection!*/
    if (fh->f_size == 1){
	num_aggregators = 1;
    }

    fh->f_flags |= OMPIO_AGGREGATOR_IS_SET;

    if (-1 == num_aggregators) {
        if ( SIMPLE == mca_io_ompio_grouping_option ||
            NO_REFINEMENT == mca_io_ompio_grouping_option ) {
            fh->f_aggregator_index = 0;
            fh->f_final_num_aggrs  = fh->f_init_num_aggrs;
            fh->f_procs_per_group  = fh->f_init_procs_per_group;

            fh->f_procs_in_group = (int*)malloc (fh->f_procs_per_group * sizeof(int));
            if (NULL == fh->f_procs_in_group) {
                opal_output (1, "OUT OF MEMORY\n");
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                fh->f_procs_in_group[j] = fh->f_init_procs_in_group[j];
            }
        }
        else {
            mca_io_ompio_create_groups(fh,bytes_per_proc);
        }
        return OMPI_SUCCESS;
    }

   //Forced number of aggregators
    /* calculate the offset at which each group of processes will start */
    procs_per_group = ceil ((float)fh->f_size/num_aggregators);

    /* calculate the number of processes in the local group */
    if (fh->f_size/procs_per_group != fh->f_rank/procs_per_group) {
        fh->f_procs_per_group = procs_per_group;
    }
    else {
        fh->f_procs_per_group = fh->f_size%procs_per_group;
    }

    fh->f_procs_in_group = (int*)malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == fh->f_procs_in_group) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (j=0 ; j<fh->f_procs_per_group ; j++) {
        fh->f_procs_in_group[j] = (fh->f_rank/procs_per_group) * procs_per_group + j;
    }

    fh->f_aggregator_index = 0;
    fh->f_final_num_aggrs  = num_aggregators;

    return OMPI_SUCCESS;
 }




int ompi_io_ompio_break_file_view (mca_io_ompio_file_t *fh,
                                   struct iovec *iov,
                                   int count,
                                   int stripe_count,
                                   size_t stripe_size,
                                   struct iovec **broken_iov,
                                   int *broken_count)
{



    struct iovec *temp_iov = NULL;
    int i = 0;
    int k = 0;
    int block = 1;
    int broken = 0;
    size_t remaining = 0;
    size_t temp = 0;
    OPAL_PTRDIFF_TYPE current_offset = 0;


    /* allocate an initial iovec, will grow if needed */
    temp_iov = (struct iovec *) malloc
        (count * sizeof (struct iovec));
    if (NULL == temp_iov) {
        opal_output(1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    while (i < count) {
        if (count*block <= k) {
            block ++;
            temp_iov = (struct iovec *)realloc
                (temp_iov, count * block *sizeof(struct iovec));
            if (NULL == temp_iov) {
                opal_output(1, "OUT OF MEMORY\n");
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
        if (0 == broken) {
            temp = (OPAL_PTRDIFF_TYPE)(iov[i].iov_base)%stripe_size;
            if ((stripe_size-temp) >= iov[i].iov_len) {
                temp_iov[k].iov_base = iov[i].iov_base;
                temp_iov[k].iov_len = iov[i].iov_len;
                i++;
                k++;
            }
            else {
                temp_iov[k].iov_base = iov[i].iov_base;
                temp_iov[k].iov_len = stripe_size-temp;
                current_offset = (OPAL_PTRDIFF_TYPE)(temp_iov[k].iov_base) +
                    temp_iov[k].iov_len;
                remaining = iov[i].iov_len - temp_iov[k].iov_len;
                k++;
                broken ++;
            }
            continue;
        }
        temp = current_offset%stripe_size;
        if ((stripe_size-temp) >= remaining) {
            temp_iov[k].iov_base = (IOVBASE_TYPE *)current_offset;
            temp_iov[k].iov_len = remaining;
            i++;
            k++;
            broken = 0;
            current_offset = 0;
            remaining = 0;
        }
        else {
            temp_iov[k].iov_base = (IOVBASE_TYPE *)current_offset;
            temp_iov[k].iov_len = stripe_size-temp;
            current_offset += temp_iov[k].iov_len;
            remaining -= temp_iov[k].iov_len;
            k++;
            broken ++;
        }
    }
    *broken_iov = temp_iov;
    *broken_count = k;

    return 1;
}
int ompi_io_ompio_distribute_file_view (mca_io_ompio_file_t *fh,
                                        struct iovec *broken_iov,
                                        int broken_count,
                                        int num_aggregators,
                                        size_t stripe_size,
                                        int **fview_count,
                                        struct iovec **iov,
                                        int *count)
{


    int *num_entries;
    int *broken_index;
    int temp = 0;
    int *fview_cnt = NULL;
    int global_fview_count = 0;
    int i = 0;
    int *displs = NULL;
    int rc = OMPI_SUCCESS;
    struct iovec *global_fview = NULL;
    struct iovec **broken = NULL;
    MPI_Request *req=NULL, *sendreq=NULL;


    num_entries = (int *) malloc (sizeof (int) * num_aggregators);
    if (NULL == num_entries) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    broken_index = (int *) malloc (sizeof (int) * num_aggregators);
    if (NULL == broken_index) {
        opal_output (1, "OUT OF MEMORY\n");
        free(num_entries);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    memset (num_entries, 0x0, num_aggregators * sizeof (int));
    memset (broken_index, 0x0, num_aggregators * sizeof (int));

    /* calculate how many entries in the broken iovec belong to each aggregator */
    for (i=0 ; i<broken_count ; i++) {
        temp = (int)((OPAL_PTRDIFF_TYPE)broken_iov[i].iov_base/stripe_size) %
            num_aggregators;
        num_entries [temp] ++;
    }

    if (0 == fh->f_rank%fh->f_aggregator_index) {
        fview_cnt = (int *) malloc (sizeof (int) * fh->f_size);
        if (NULL == fview_cnt) {
            opal_output (1, "OUT OF MEMORY\n");
            free(num_entries);
            free(broken_index);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        req = (MPI_Request *)malloc (fh->f_size * sizeof(MPI_Request));
        if (NULL == req) {
            free(num_entries);
            free(broken_index);
            free(fview_cnt);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    sendreq = (MPI_Request *)malloc (num_aggregators * sizeof(MPI_Request));
    if (NULL == sendreq) {
        free(num_entries);
        free(broken_index);
        if (0 == fh->f_rank%fh->f_aggregator_index) {
            free(fview_cnt);
            free(req);
        }
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* gather at each aggregator how many entires from the broken file view it
       expects from each process */
    if (0 == fh->f_rank%fh->f_aggregator_index) {
        for (i=0; i<fh->f_size ; i++) {
            rc = MCA_PML_CALL(irecv(&fview_cnt[i],
                                    1,
                                    MPI_INT,
                                    i,
                                    OMPIO_TAG_GATHER,
                                    fh->f_comm,
                                    &req[i]));
            if (OMPI_SUCCESS != rc) {
                goto exit;
            }
        }
    }

    for (i=0 ; i<num_aggregators ; i++) {
        rc = MCA_PML_CALL(isend(&num_entries[i],
                                1,
                                MPI_INT,
                                i*fh->f_aggregator_index,
                                OMPIO_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD,
                                fh->f_comm,
                                &sendreq[i]));
        if (OMPI_SUCCESS != rc) {
            goto exit;
        }
    }

    if (0 == fh->f_rank%fh->f_aggregator_index) {
        rc = ompi_request_wait_all (fh->f_size, req, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != rc) {
            goto exit;
        }
    }
    rc = ompi_request_wait_all (num_aggregators, sendreq, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        goto exit;
    }

    /*
    for (i=0 ; i<num_aggregators ; i++) {
        fh->f_comm->c_coll.coll_gather (&num_entries[i],
                                        1,
                                        MPI_INT,
                                        fview_cnt,
                                        1,
                                        MPI_INT,
                                        i*fh->f_aggregator_index,
                                        fh->f_comm,
                                        fh->f_comm->c_coll.coll_gather_module);
    }
    */

    if (0 == fh->f_rank%fh->f_aggregator_index) {
        displs = (int*) malloc (fh->f_size * sizeof (int));
        if (NULL == displs) {
            opal_output (1, "OUT OF MEMORY\n");
            free(fview_cnt);
            free(num_entries);
            free(broken_index);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        displs[0] = 0;
        global_fview_count = fview_cnt[0];
        for (i=1 ; i<fh->f_size ; i++) {
            global_fview_count += fview_cnt[i];
            displs[i] = displs[i-1] + fview_cnt[i-1];
        }

        if (global_fview_count) {
            global_fview = (struct iovec*)malloc (global_fview_count *
                                                  sizeof(struct iovec));
            if (NULL == global_fview) {
                opal_output (1, "OUT OF MEMORY\n");
                free(num_entries);
                free(broken_index);
                free(fview_cnt);
                free(displs);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
    }

    broken = (struct iovec**)malloc (num_aggregators * sizeof(struct iovec *));
    if (NULL == broken) {
        opal_output (1, "OUT OF MEMORY\n");
        free(num_entries);
        free(broken_index);
        if (0 == fh->f_rank%fh->f_aggregator_index) {
            free(global_fview);
            free(displs);
            free(fview_cnt);
        }
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (i=0 ; i<num_aggregators ; i++) {
        broken[i] = NULL;
        if (0 != num_entries[i]) {
            broken[i] = (struct iovec*) malloc (num_entries[i] *
                                                sizeof (struct iovec));
            if (NULL == broken[i]) {
                int j;
                opal_output (1, "OUT OF MEMORY\n");
                free(num_entries);
                free(broken_index);
                for (j=0; j<i; j++) {
                    free(broken[j]);
                }
                if (0 == fh->f_rank%fh->f_aggregator_index) {
                    free(global_fview);
                    free(displs);
                    free(fview_cnt);
                }
                free(broken);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
    }

    for (i=0 ; i<broken_count ; i++) {
        temp = (int)((OPAL_PTRDIFF_TYPE)broken_iov[i].iov_base/stripe_size) %
            num_aggregators;
        broken[temp][broken_index[temp]].iov_base = broken_iov[i].iov_base;
        broken[temp][broken_index[temp]].iov_len = broken_iov[i].iov_len;
        broken_index[temp] ++;
    }
    /*
    for (i=0 ; i<num_aggregators ; i++) {
        int j;
        for (j=0 ; j<num_entries[i] ; j++) {
            printf("%d->%d: OFFSET: %d   LENGTH: %d\n",
                   fh->f_rank,
                   i,
                   broken[i][j].iov_base,
                   broken[i][j].iov_len);
        }
    }
    sleep(1);
    */

    if (0 == fh->f_rank%fh->f_aggregator_index) {
        ptrdiff_t lb, extent;
        rc = ompi_datatype_get_extent(fh->f_iov_type, &lb, &extent);
        if (OMPI_SUCCESS != rc) {
                goto exit;
        }
        for (i=0; i<fh->f_size ; i++) {
            if (fview_cnt[i]) {
                char *ptmp;
                ptmp = ((char *) global_fview) + (extent * displs[i]);
                rc = MCA_PML_CALL(irecv(ptmp,
                                        fview_cnt[i],
                                        fh->f_iov_type,
                                        i,
                                        OMPIO_TAG_GATHERV,
                                        fh->f_comm,
                                        &req[i]));
                if (OMPI_SUCCESS != rc) {
                    goto exit;
                }
            }
        }
    }

    for (i=0 ; i<num_aggregators ; i++) {
        if (num_entries[i]) {
            rc = MCA_PML_CALL(isend(broken[i],
                                    num_entries[i],
                                    fh->f_iov_type,
                                    i*fh->f_aggregator_index,
                                    OMPIO_TAG_GATHERV,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    fh->f_comm,
                                    &sendreq[i]));
            if (OMPI_SUCCESS != rc) {
                goto exit;
            }
        }
    }

    if (0 == fh->f_rank%fh->f_aggregator_index) {
        for (i=0; i<fh->f_size ; i++) {
            if (fview_cnt[i]) {
                rc = ompi_request_wait (&req[i], MPI_STATUS_IGNORE);
                if (OMPI_SUCCESS != rc) {
                    goto exit;
                }
            }
        }
    }

    for (i=0; i<num_aggregators ; i++) {
        if (num_entries[i]) {
            rc = ompi_request_wait (&sendreq[i], MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != rc) {
                goto exit;
            }
        }
    }

    /*
    for (i=0 ; i<num_aggregators ; i++) {
        fh->f_comm->c_coll.coll_gatherv (broken[i],
                                         num_entries[i],
                                         fh->f_iov_type,
                                         global_fview,
                                         fview_cnt,
                                         displs,
                                         fh->f_iov_type,
                                         i*fh->f_aggregator_index,
                                         fh->f_comm,
                                         fh->f_comm->c_coll.coll_gatherv_module);
    }
    */
    /*
    for (i=0 ; i<global_fview_count ; i++) {
        printf("%d: OFFSET: %d   LENGTH: %d\n",
               fh->f_rank,
               global_fview[i].iov_base,
               global_fview[i].iov_len);
    }
    */
 exit:

    if (NULL != broken) {
        for (i=0 ; i<num_aggregators ; i++) {
            if (NULL != broken[i]) {
                free (broken[i]);
            }
        }
        free (broken);
    }
    if (NULL != req) {
        free (req);
    }
    if (NULL != sendreq) {
        free (sendreq);
    }
    free (num_entries);
    free (broken_index);
    if (NULL != displs) {
        free (displs);
    }

    *fview_count = fview_cnt;
    *iov = global_fview;
    *count = global_fview_count;

    return rc;
}

int ompi_io_ompio_gather_data (mca_io_ompio_file_t *fh,
                               void *send_buf,
                               size_t total_bytes_sent,
                               int *bytes_sent,
                               struct iovec *broken_iovec,
                               int broken_index,
                               size_t partial,
                               void *global_buf,
                               int *bytes_per_process,
                               int *displs,
                               int num_aggregators,
                               size_t stripe_size)
{
    void **sbuf = NULL;
    size_t bytes_remaining;
    size_t *temp_position;
    size_t part;
    int current;
    int temp = 0;
    int i = 0;
    int rc = OMPI_SUCCESS;
    MPI_Request *req=NULL, *sendreq=NULL;


    current = broken_index;
    part = partial;

    sbuf = (void**) malloc (num_aggregators * sizeof(void *));
    if (NULL == sbuf) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    temp_position = (size_t *) malloc (num_aggregators * sizeof(size_t));
    if (NULL == temp_position) {
        opal_output (1, "OUT OF MEMORY\n");
        free(sbuf);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    memset (temp_position, 0x0, num_aggregators * sizeof (size_t));

    for (i=0 ; i<num_aggregators ; i++) {
        sbuf[i] = NULL;
        if (0 != bytes_sent[i]) {
            sbuf[i] = (void *) malloc (bytes_sent[i]);
            if (NULL == sbuf[i]) {
                opal_output (1, "OUT OF MEMORY\n");
                free(sbuf);
                free(temp_position);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
    }

    bytes_remaining = total_bytes_sent;

    while (bytes_remaining) {
        temp = (int)((OPAL_PTRDIFF_TYPE)broken_iovec[current].iov_base/stripe_size)
            % num_aggregators;

        if (part) {
            if (bytes_remaining > part) {
                memcpy ((IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)sbuf[temp]+
                                         temp_position[temp]),
                        (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)send_buf +
                                         (total_bytes_sent-bytes_remaining)),
                        part);
                bytes_remaining -= part;
                temp_position[temp] += part;
                part = 0;
                current ++;
            }
            else  {
                memcpy ((IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)sbuf[temp]+
                                         temp_position[temp]),
                        (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)send_buf +
                                         (total_bytes_sent-bytes_remaining)),
                        bytes_remaining);
                break;
            }
        }
        else {
            if (bytes_remaining > broken_iovec[current].iov_len) {
                memcpy ((IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)sbuf[temp]+
                                         temp_position[temp]),
                        (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)send_buf +
                                         (total_bytes_sent-bytes_remaining)),
                        broken_iovec[current].iov_len);
                bytes_remaining -= broken_iovec[current].iov_len;
                temp_position[temp] += broken_iovec[current].iov_len;
                current ++;
            }
            else {
                memcpy ((IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)sbuf[temp]+
                                         temp_position[temp]),
                        (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)send_buf +
                                         (total_bytes_sent-bytes_remaining)),
                        bytes_remaining);
                break;
            }
        }
    }

    sendreq = (MPI_Request *)malloc (num_aggregators * sizeof(MPI_Request));
    if (NULL == sendreq) {
        free(sbuf);
        free(temp_position);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (0 == fh->f_rank%fh->f_aggregator_index) {
        req = (MPI_Request *)malloc (fh->f_size * sizeof(MPI_Request));
        if (NULL == req) {
            free(sbuf);
            free(temp_position);
            free(sendreq);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        for (i=0; i<fh->f_size ; i++) {
            if (bytes_per_process[i]) {
                rc = MCA_PML_CALL(irecv((char *)global_buf + displs[i],
                                        bytes_per_process[i],
                                        MPI_BYTE,
                                        i,
                                        OMPIO_TAG_GATHERV,
                                        fh->f_comm,
                                        &req[i]));
                if (OMPI_SUCCESS != rc) {
                    goto exit;
                }
            }
        }
    }

    for (i=0 ; i<num_aggregators ; i++) {
        if (bytes_sent[i]) {
            rc = MCA_PML_CALL(isend(sbuf[i],
                                    bytes_sent[i],
                                    MPI_BYTE,
                                    i*fh->f_aggregator_index,
                                    OMPIO_TAG_GATHERV,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    fh->f_comm,
                                    &sendreq[i]));
            if (OMPI_SUCCESS != rc) {
                goto exit;
            }
        }
    }

    if (0 == fh->f_rank%fh->f_aggregator_index) {
        for (i=0; i<fh->f_size ; i++) {
            if (bytes_per_process[i]) {
                rc = ompi_request_wait (&req[i], MPI_STATUS_IGNORE);
                if (OMPI_SUCCESS != rc) {
                    goto exit;
                }
            }
        }
    }
    for (i=0; i<num_aggregators ; i++) {
        if (bytes_sent[i]) {
            rc = ompi_request_wait (&sendreq[i], MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != rc) {
                goto exit;
            }
        }
    }
    /*
    for (i=0 ; i<num_aggregators ; i++) {
        fh->f_comm->c_coll.coll_gatherv (sbuf[i],
                                         bytes_sent[i],
                                         MPI_BYTE,
                                         global_buf,
                                         bytes_per_process,
                                         displs,
                                         MPI_BYTE,
                                         i*fh->f_aggregator_index,
                                         fh->f_comm,
                                         fh->f_comm->c_coll.coll_gatherv_module);
    }
    */

 exit:
    for (i=0 ; i<num_aggregators ; i++) {
        if (NULL != sbuf[i]) {
            free (sbuf[i]);
        }
    }
    free (sbuf);
    if (NULL != req) {
        free (req);
    }
    if (NULL != sendreq) {
        free (sendreq);
    }
    free (temp_position);

    return rc;
}

int ompi_io_ompio_scatter_data (mca_io_ompio_file_t *fh,
                                void *receive_buf,
                                size_t total_bytes_recv,
                                int *bytes_received,
                                struct iovec *broken_iovec,
                                int broken_index,
                                size_t partial,
                                void *global_buf,
                                int *bytes_per_process,
                                int *displs,
                                int num_aggregators,
                                size_t stripe_size)
{
    void **rbuf = NULL;
    size_t bytes_remaining;
    size_t *temp_position = NULL;
    size_t part;
    int current;
    int temp = 0;
    int i = 0;
    int rc = OMPI_SUCCESS;
    MPI_Request *req=NULL, *recvreq=NULL;


    current = broken_index;
    part = partial;

    rbuf = (void**) malloc (num_aggregators * sizeof(void *));
    if (NULL == rbuf) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    temp_position = (size_t *) malloc (num_aggregators * sizeof(size_t));
    if (NULL == temp_position) {
        opal_output (1, "OUT OF MEMORY\n");
        free(rbuf);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    memset (temp_position, 0x0, num_aggregators * sizeof (size_t));

    for (i=0 ; i<num_aggregators ; i++) {
        rbuf[i] = NULL;
        if (0 != bytes_received[i]) {
            rbuf[i] = (void *) malloc (bytes_received[i]);
            if (NULL == rbuf[i]) {
                int j;
                opal_output (1, "OUT OF MEMORY\n");
                free(temp_position);
                for (j=0; j<i; j++) {
                    free(rbuf[j]);
                }
                free(rbuf);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
    }

    recvreq = (MPI_Request *)malloc (num_aggregators * sizeof(MPI_Request));
    if (NULL == recvreq) {
        free(temp_position);
        for (i=0; i<num_aggregators; i++) {
            free(rbuf[i]);
        }
        free(rbuf);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for (i=0 ; i<num_aggregators ; i++) {
        if (bytes_received[i]) {
            rc = MCA_PML_CALL(irecv(rbuf[i],
                                    bytes_received[i],
                                    MPI_BYTE,
                                    i*fh->f_aggregator_index,
                                    OMPIO_TAG_SCATTERV,
                                    fh->f_comm,
                                    &recvreq[i]));
            if (OMPI_SUCCESS != rc) {
                goto exit;
            }
        }
    }

    if (0 == fh->f_rank%fh->f_aggregator_index) {
        req = (MPI_Request *)malloc (fh->f_size * sizeof(MPI_Request));
        if (NULL == req) {
            free(temp_position);
            for (i=0; i<num_aggregators; i++) {
                free(rbuf[i]);
            }
            free(rbuf);
            free(recvreq);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        for (i=0; i<fh->f_size ; i++) {
            if (bytes_per_process[i]) {
                rc = MCA_PML_CALL(isend((char *)global_buf + displs[i],
                                        bytes_per_process[i],
                                        MPI_BYTE,
                                        i,
                                        OMPIO_TAG_SCATTERV,
                                        MCA_PML_BASE_SEND_STANDARD,
                                        fh->f_comm,
                                        &req[i]));
                if (OMPI_SUCCESS != rc) {
                    goto exit;
                }
            }
        }
    }

    for (i=0; i<num_aggregators ; i++) {
        if (bytes_received[i]) {
            rc = ompi_request_wait (&recvreq[i], MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != rc) {
                goto exit;
            }
        }
    }
    if (0 == fh->f_rank%fh->f_aggregator_index) {
        for (i=0; i<fh->f_size ; i++) {
            if (bytes_per_process[i]) {
                rc = ompi_request_wait (&req[i], MPI_STATUS_IGNORE);
                if (OMPI_SUCCESS != rc) {
                    goto exit;
                }
            }
        }
    }
    /*
    for (i=0 ; i<num_aggregators ; i++) {
        fh->f_comm->c_coll.coll_scatterv (global_buf,
                                          bytes_per_process,
                                          displs,
                                          MPI_BYTE,
                                          rbuf[i],
                                          bytes_received[i],
                                          MPI_BYTE,
                                          i*fh->f_aggregator_index,
                                          fh->f_comm,
                                          fh->f_comm->c_coll.coll_scatterv_module);
    }
    */
    bytes_remaining = total_bytes_recv;

    while (bytes_remaining) {
        temp = (int)((OPAL_PTRDIFF_TYPE)broken_iovec[current].iov_base/stripe_size)
            % num_aggregators;

        if (part) {
            if (bytes_remaining > part) {
                memcpy ((IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)receive_buf +
                                         (total_bytes_recv-bytes_remaining)),
                        (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)rbuf[temp]+
                                         temp_position[temp]),
                        part);
                bytes_remaining -= part;
                temp_position[temp] += part;
                part = 0;
                current ++;
            }
            else  {
                memcpy ((IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)receive_buf +
                                         (total_bytes_recv-bytes_remaining)),
                        (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)rbuf[temp]+
                                         temp_position[temp]),
                        bytes_remaining);
                break;
            }
        }
        else {
            if (bytes_remaining > broken_iovec[current].iov_len) {
                memcpy ((IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)receive_buf +
                                         (total_bytes_recv-bytes_remaining)),
                        (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)rbuf[temp]+
                                         temp_position[temp]),
                        broken_iovec[current].iov_len);
                bytes_remaining -= broken_iovec[current].iov_len;
                temp_position[temp] += broken_iovec[current].iov_len;
                current ++;
            }
            else {
                memcpy ((IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)receive_buf +
                                         (total_bytes_recv-bytes_remaining)),
                        (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE)rbuf[temp]+
                                         temp_position[temp]),
                        bytes_remaining);
                break;
            }
        }
    }

 exit:
    for (i=0 ; i<num_aggregators ; i++) {
        if (NULL != rbuf[i]) {
            free (rbuf[i]);
            rbuf[i] = NULL;
        }
    }
    if (NULL != req) {
        free (req);
    }
    if (NULL != recvreq) {
        free (recvreq);
    }
    if (NULL != rbuf) {
        free (rbuf);
        rbuf = NULL;
    }
    if (NULL != temp_position) {
        free (temp_position);
        temp_position = NULL;
    }

    return rc;
}

void mca_io_ompio_get_num_aggregators ( int *num_aggregators)
{
    *num_aggregators = mca_io_ompio_num_aggregators;
    return;
}

void mca_io_ompio_get_bytes_per_agg ( int *bytes_per_agg)
{
    *bytes_per_agg = mca_io_ompio_bytes_per_agg;
    return;
}

/* Print queue related function implementations */
int ompi_io_ompio_set_print_queue (mca_io_ompio_print_queue **q,
				   int queue_type){

    int  ret = OMPI_SUCCESS;

    switch(queue_type) {

    case WRITE_PRINT_QUEUE:
	*q = coll_write_time;
	break;
    case READ_PRINT_QUEUE:
	*q = coll_read_time;
	break;
    }

    if (NULL == q){
	ret = OMPI_ERROR;
    }
    return ret;

}


int ompi_io_ompio_initialize_print_queue(mca_io_ompio_print_queue *q){

    int ret = OMPI_SUCCESS;
    q->first = 0;
    q->last = QUEUESIZE - 1;
    q->count = 0;
    return ret;
}
int ompi_io_ompio_register_print_entry (int queue_type,
					mca_io_ompio_print_entry x){

    int ret = OMPI_SUCCESS;
    mca_io_ompio_print_queue *q=NULL;

    ret = ompi_io_ompio_set_print_queue(&q, queue_type);

    if (ret != OMPI_ERROR){
	if (q->count >= QUEUESIZE){
	    return OMPI_ERROR;
	}
	else{
	    q->last = (q->last + 1) % QUEUESIZE;
	    q->entry[q->last] = x;
	    q->count = q->count + 1;
	}
    }
    return ret;
}

int  ompi_io_ompio_unregister_print_entry (int queue_type,
					   mca_io_ompio_print_entry *x){

    int ret = OMPI_SUCCESS;
    mca_io_ompio_print_queue *q=NULL;
    ret = ompi_io_ompio_set_print_queue(&q, queue_type);
    if (ret != OMPI_ERROR){
	if (q->count <= 0){
	    return OMPI_ERROR;
	}
	else{
	    *x = q->entry[q->first];
	    q->first = (q->first+1) % QUEUESIZE;
	    q->count = q->count - 1;
	}
    }
    return OMPI_SUCCESS;
}

int ompi_io_ompio_empty_print_queue(int queue_type){

    int ret = OMPI_SUCCESS;
    mca_io_ompio_print_queue *q=NULL;
    ret =  ompi_io_ompio_set_print_queue(&q, queue_type);

    assert (ret != OMPI_ERROR);
    (void)ret;  // silence compiler warning
    if (q->count == 0)
	    return 1;
    else
	return 0;


}

int ompi_io_ompio_full_print_queue(int queue_type){


    int ret = OMPI_SUCCESS;
    mca_io_ompio_print_queue *q=NULL;
    ret =  ompi_io_ompio_set_print_queue(&q, queue_type);

    assert ( ret != OMPI_ERROR);
    (void)ret;  // silence compiler warning
    if (q->count < QUEUESIZE)
	    return 0;
    else
	return 1;

}


int ompi_io_ompio_print_time_info(int queue_type,
				  char *name,
				  mca_io_ompio_file_t *fh){

    int i = 0, j=0, nprocs_for_coll = 0, ret = OMPI_SUCCESS, count = 0;
    double *time_details = NULL, *final_sum = NULL;
    double *final_max = NULL, *final_min = NULL;
    double *final_time_details=NULL;
    mca_io_ompio_print_queue *q=NULL;

    ret =  ompi_io_ompio_set_print_queue(&q, queue_type);

    assert (ret != OMPI_ERROR);
    nprocs_for_coll = q->entry[0].nprocs_for_coll;
    time_details = (double *) malloc (4*sizeof(double));
    if ( NULL == time_details){
	ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;

    }

    if (!fh->f_rank){

	final_min = (double *) malloc (3*sizeof(double));
	if ( NULL == final_min){
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	final_max = (double *) malloc (3*sizeof(double));
	if ( NULL == final_max){
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;

	}

	final_sum = (double *) malloc (3*sizeof(double));
	if ( NULL == final_sum){
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	final_time_details =
	    (double *)malloc
	    (fh->f_size * 4 * sizeof(double));
	if (NULL == final_time_details){
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	count = 4 * fh->f_size;
	for(i=0;i<count;i++){
	    final_time_details[i] = 0.0;
	}


    }

    for (i = 0; i < 4; i++){
	time_details[i] = 0.0;
    }

    if (q->count > 0){
	for (i=0; i < q->count; i++){
	    for (j=0;j<3;j++){
		if (!fh->f_rank){
		    final_min[j] = 100000.0;
		    final_max[j] = 0.0;
		    final_sum[j] = 0.0;
		}
		time_details[j] += q->entry[i].time[j];
	    }
	    time_details[3]  = q->entry[i].aggregator;
	}
    }

    fh->f_comm->c_coll.coll_gather(time_details,
				   4,
				   MPI_DOUBLE,
				   final_time_details,
				   4,
				   MPI_DOUBLE,
				   0,
				   fh->f_comm,
				   fh->f_comm->c_coll.coll_gather_module);



    if (!fh->f_rank){

	for (i=0;i<count;i+=4){
	    if (final_time_details[i+3] == 1){
		final_sum[0] += final_time_details[i];
		final_sum[1] += final_time_details[i+1];
		final_sum[2] += final_time_details[i+2];

		if ( final_time_details[i] < final_min[0])
		    final_min[0] = final_time_details[i];
		if ( final_time_details[i+1] < final_min[1])
		    final_min[1] = final_time_details[i+1];
		if ( final_time_details[i+2] < final_min[2])
		    final_min[2] = final_time_details[i+2];



		if ( final_time_details[i] > final_max[0])
		    final_max[0] = final_time_details[i];
		if ( final_time_details[i+1] > final_max[1])
		    final_max[1] = final_time_details[i+1];
		if ( final_time_details[i+2] > final_max[2])
		    final_max[2] = final_time_details[i+2];

	    }
	}

	printf ("\n# MAX-%s AVG-%s MIN-%s MAX-COMM AVG-COMM MIN-COMM",
		name, name, name);
	printf (" MAX-EXCH AVG-EXCH MIN-EXCH\n");
	printf (" %f %f %f %f %f %f %f %f %f\n\n",
		final_max[0], final_sum[0]/nprocs_for_coll, final_min[0],
		final_max[1], final_sum[1]/nprocs_for_coll, final_min[1],
		final_max[2], final_sum[2]/nprocs_for_coll, final_min[2]);

    }

 exit:
    if ( NULL != final_max){
	free(final_max);
	final_max = NULL;
    }
    if (NULL != final_min){
	free(final_min);
	final_min = NULL;
    }
    if (NULL != final_sum){
	free(final_sum);
	final_sum = NULL;
    }
    if (NULL != time_details){
	free(time_details);
	time_details = NULL;
    }

    return ret;
}

int mca_io_ompio_create_groups(mca_io_ompio_file_t *fh,
		               size_t bytes_per_proc)
{

    int is_aggregator = 0;
    int final_aggr = 0;
    int final_num_aggrs = 0;
    int ompio_grouping_flag = 0;

    int *decision_list = NULL;

    OMPI_MPI_OFFSET_TYPE *start_offsets_lens = NULL;
    OMPI_MPI_OFFSET_TYPE *end_offsets = NULL;
    OMPI_MPI_OFFSET_TYPE bytes_per_group = 0;
    OMPI_MPI_OFFSET_TYPE *aggr_bytes_per_group = NULL;

    mca_io_ompio_prepare_to_group(fh,
	                          &start_offsets_lens,
	                          &end_offsets,
	                          &aggr_bytes_per_group,
				  &bytes_per_group,
	                          &decision_list,
                                  bytes_per_proc,
	                          &is_aggregator,
	                          &ompio_grouping_flag);

    switch(ompio_grouping_flag){

        case OMPIO_SPLIT:
            mca_io_ompio_split_initial_groups(fh,
                                              start_offsets_lens,
	       			              end_offsets,
					      bytes_per_group);
        break;

        case OMPIO_MERGE:
            mca_io_ompio_merge_initial_groups(fh,
		                              aggr_bytes_per_group,
					      decision_list,
				              is_aggregator);
        break;

        case  OMPIO_RETAIN:

            mca_io_ompio_retain_initial_groups(fh);

        break;


    }

    //Set aggregator index
    fh->f_aggregator_index = 0;

    //Calculate final number of aggregators
    if(fh->f_rank == fh->f_procs_in_group[fh->f_aggregator_index]){
	   final_aggr = 1;
    }
    fh->f_comm->c_coll.coll_allreduce (&final_aggr,
		                       &final_num_aggrs,
			               1,
			               MPI_INT,
			               MPI_SUM,
			               fh->f_comm,
			               fh->f_comm->c_coll.coll_allreduce_module);

    //Set final number of aggregators in file handle
    fh->f_final_num_aggrs = final_num_aggrs;

    //Print final number of aggregators if required

    /*if(fh->f_rank == 0){
        printf("Rank %d : has final_num_aggrs = %d\n",fh->f_rank,final_num_aggrs);
    }*/

    //Print final grouping
    /*if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)  {
        for (j=0 ; j<fh->f_procs_per_group; j++) {
            printf ("%d: Proc %d: %d\n", fh->f_rank, j, fh->f_procs_in_group[j]);
        }

	printf("\n\n");
    }

   */
    if (NULL != start_offsets_lens) {
        free (start_offsets_lens);
        start_offsets_lens =  NULL;
    }
    if (NULL != end_offsets) {
        free (end_offsets);
        end_offsets =  NULL;
    }
    if(NULL != aggr_bytes_per_group){
      free(aggr_bytes_per_group);
      aggr_bytes_per_group = NULL;
    }
    if( NULL != decision_list){
      free(decision_list);
      decision_list = NULL;
    }


   return OMPI_SUCCESS;
}

int mca_io_ompio_merge_initial_groups(mca_io_ompio_file_t *fh,
		                      OMPI_MPI_OFFSET_TYPE *aggr_bytes_per_group,
				      int *decision_list,
	                              int is_aggregator){

    OMPI_MPI_OFFSET_TYPE sum_bytes = 0;

    MPI_Request *sendreq = NULL;

    int start = 0;
    int end = 0;
    int i = 0;
    int j = 0;
    int r  = 0;

    int merge_pair_flag = 4;
    int first_merge_flag = 4;

    int *merge_aggrs = NULL;

    int is_new_aggregator= 0;


    if(is_aggregator){
        i = 0;
	sum_bytes = 0;
        //go through the decision list
	//Find the aggregators that could merge

	while(i < fh->f_init_num_aggrs){
	    while(1){
	        if( i >= fh->f_init_num_aggrs){
	            break;
	        }
	        else if((decision_list[i] == OMPIO_MERGE) &&
	                (sum_bytes <= mca_io_ompio_bytes_per_agg)){
	            sum_bytes = sum_bytes + aggr_bytes_per_group[i];
	            decision_list[i] = merge_pair_flag;
	            i++;
	        }
	        else if((decision_list[i] == OMPIO_MERGE) &&
	                (sum_bytes >= mca_io_ompio_bytes_per_agg)){
	           if(decision_list[i+1] == OMPIO_MERGE){
	               merge_pair_flag++;
	               decision_list[i] = merge_pair_flag;
	               sum_bytes = aggr_bytes_per_group[i];
	               i++;
	           }
	           else{
	               decision_list[i] = merge_pair_flag;
	               i++;
	           }
	        }
	        else{
	            i++;
	            if(decision_list[i] == OMPIO_MERGE)
	               merge_pair_flag++;
	            sum_bytes = 0;
	            break;
	        }
	    }
        }

        //Now go through the new edited decision list and
	//make lists of aggregators to merge and number
	//of groups to me merged.
	i = 0;
	j = 0;

	while(i < fh->f_init_num_aggrs){
	   if(decision_list[i] >= first_merge_flag){
	       start = i;
	       while((decision_list[i] >= first_merge_flag) &&
		      (i < fh->f_init_num_aggrs-1)){
	           if(decision_list[i+1] == decision_list[i]){
	               i++;
	           }
	           else{
	               break;
	           }
	           end = i;
	       }
	       merge_aggrs = (int *)malloc((end - start + 1) * sizeof(int));
	       if (NULL == merge_aggrs) {
		  opal_output (1, "OUT OF MEMORY\n");
		  return OMPI_ERR_OUT_OF_RESOURCE;
	       }
	       j = 0;
	       for( j = 0 ; j < end - start + 1; j++){
	           merge_aggrs[j] = fh->f_init_aggr_list[start+j];
	       }
               if(fh->f_rank == merge_aggrs[0])
	          is_new_aggregator = 1;

	       for( j = 0 ; j < end-start+1 ;j++){
	          if(fh->f_rank == merge_aggrs[j]){
	              mca_io_ompio_merge_groups(fh,
		                                merge_aggrs,
			                        end-start+1);
		  }
	       }
               if(NULL != merge_aggrs){
	           free(merge_aggrs);
                   merge_aggrs = NULL;
               }

	   }
           i++;
        }

    }//end old aggregators

    //New aggregators communicate new grouping info to the groups
    if(is_new_aggregator){
       sendreq = (MPI_Request *)malloc ( 2 *fh->f_procs_per_group * sizeof(MPI_Request));
       if (NULL == sendreq) {
          return OMPI_ERR_OUT_OF_RESOURCE;
       }
       //Communicate grouping info
       for( j = 0 ; j < fh->f_procs_per_group; j++){
	   if (fh->f_procs_in_group[j] == fh->f_rank ) {
	       continue;
	   }
           //new aggregator sends new procs_per_group to all its members
	   MCA_PML_CALL(isend(&fh->f_procs_per_group,
                              1,
                              MPI_INT,
                              fh->f_procs_in_group[j],
                              OMPIO_PROCS_PER_GROUP_TAG,
                              MCA_PML_BASE_SEND_STANDARD,
                              fh->f_comm,
                              &sendreq[r++]));
	   //new aggregator sends distribution of process to all its new members
	   MCA_PML_CALL(isend(fh->f_procs_in_group,
			      fh->f_procs_per_group,
			      MPI_INT,
			      fh->f_procs_in_group[j],
			      OMPIO_PROCS_IN_GROUP_TAG,
			      MCA_PML_BASE_SEND_STANDARD,
			      fh->f_comm,
			      &sendreq[r++]));

       }
    }
    else {
	//All non aggregators
	//All processes receive initial process distribution from aggregators
	MCA_PML_CALL(recv(&fh->f_procs_per_group,
			  1,
			  MPI_INT,
			  MPI_ANY_SOURCE,
			  OMPIO_PROCS_PER_GROUP_TAG,
			  fh->f_comm,
			  MPI_STATUS_IGNORE));

	fh->f_procs_in_group = (int*)malloc (fh->f_procs_per_group * sizeof(int));
	if (NULL == fh->f_procs_in_group) {
	    opal_output (1, "OUT OF MEMORY\n");
	    return OMPI_ERR_OUT_OF_RESOURCE;
	}

	MCA_PML_CALL(recv(fh->f_procs_in_group,
			  fh->f_procs_per_group,
			  MPI_INT,
			  MPI_ANY_SOURCE,
			  OMPIO_PROCS_IN_GROUP_TAG,
			  fh->f_comm,
			  MPI_STATUS_IGNORE));
    }

    if(is_new_aggregator) {
	ompi_request_wait_all (r, sendreq, MPI_STATUSES_IGNORE);
	free (sendreq);
    }

    return OMPI_SUCCESS;
}

int mca_io_ompio_split_initial_groups(mca_io_ompio_file_t *fh,
		                      OMPI_MPI_OFFSET_TYPE *start_offsets_lens,
				      OMPI_MPI_OFFSET_TYPE *end_offsets,
				      OMPI_MPI_OFFSET_TYPE bytes_per_group){


    int size_new_group = 0;
    int size_old_group = 0;
    int size_last_group = 0;
    int size_smallest_group = 0;
    int num_groups = 0;

    OMPI_MPI_OFFSET_TYPE max_cci = 0;
    OMPI_MPI_OFFSET_TYPE min_cci = 0;

    size_new_group = ceil ((float)mca_io_ompio_bytes_per_agg * fh->f_init_procs_per_group/ bytes_per_group);
    size_old_group = fh->f_init_procs_per_group;

    mca_io_ompio_split_a_group(fh,
                               start_offsets_lens,
                               end_offsets,
                               size_new_group,
                               &max_cci,
                               &min_cci,
                               &num_groups,
                               &size_smallest_group);

    switch(mca_io_ompio_grouping_option){
        case DATA_VOLUME:
            //Just use size as returned by split group
            size_last_group = size_smallest_group;
	break;

	case UNIFORM_DISTRIBUTION:
	    if(size_smallest_group <= OMPIO_UNIFORM_DIST_THRESHOLD * size_new_group){
	        //uneven split need to call split again
	        if( size_old_group % num_groups == 0 ){
	           //most even distribution possible
	           size_new_group = size_old_group / num_groups;
	           size_last_group = size_new_group;
	        }
	        else{
	            //merge the last small group with the previous group
	            size_last_group = size_new_group + size_smallest_group;
	        }
	    }
	    else{
	         //Considered uniform
	         size_last_group = size_smallest_group;
	    }
	break;

	case CONTIGUITY:

   	    while(1){
		 if((max_cci < OMPIO_CONTG_THRESHOLD) &&
		    (size_new_group < size_old_group)){

		    size_new_group = floor( (float) (size_new_group + size_old_group ) / 2 );
  	            mca_io_ompio_split_a_group(fh,
		                               start_offsets_lens,
		                               end_offsets,
		                               size_new_group,
		                               &max_cci,
		                               &min_cci,
		                               &num_groups,
		                               &size_smallest_group);
                 }
                 else{
                     break;
                 }
            }
	    size_last_group = size_smallest_group;
	break;

	case OPTIMIZE_GROUPING:
            //This case is a combination of Data volume, contiguity and uniform distribution
	    while(1){
	         if((max_cci < OMPIO_CONTG_THRESHOLD) &&
	            (size_new_group < size_old_group)){  //can be a better condition
                 //monitor the previous iteration
		 //break if it has not changed.
	      	     size_new_group = ceil( (float) (size_new_group + size_old_group ) / 2 );
		     mca_io_ompio_split_a_group(fh,
		                                start_offsets_lens,
		                                end_offsets,
		                                size_new_group,
		                                &max_cci,
		                                &min_cci,
		                                &num_groups,
		                                &size_smallest_group);
		 }
		 else{
		     break;
		 }
	    }

	   if(size_smallest_group <= OMPIO_UNIFORM_DIST_THRESHOLD * size_new_group){
	       //uneven split need to call split again
	       if( size_old_group % num_groups == 0 ){
	           //most even distribution possible
	           size_new_group = size_old_group / num_groups;
		   size_last_group = size_new_group;
	       }
	       else{
	            //merge the last small group with the previous group
	            size_last_group = size_new_group + size_smallest_group;
	       }
	   }
	   else{
	       //Considered uniform
	       size_last_group = size_smallest_group;
	   }

	break;
    }

    mca_io_ompio_finalize_split(fh,
                                size_new_group,
	                        size_last_group);


    return OMPI_SUCCESS;
}


int mca_io_ompio_retain_initial_groups(mca_io_ompio_file_t *fh){

    int i = 0;

    fh->f_procs_per_group = fh->f_init_procs_per_group;
    fh->f_procs_in_group = (int*)malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == fh->f_procs_in_group) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for( i = 0 ; i < fh->f_procs_per_group; i++){
        fh->f_procs_in_group[i] = fh->f_init_procs_in_group[i];
    }


    return OMPI_SUCCESS;
}

int mca_io_ompio_merge_groups(mca_io_ompio_file_t *fh,
		              int *merge_aggrs,
			      int num_merge_aggrs)
{
    int i = 0;
    int *sizes_old_group;

    int *displs;



    sizes_old_group = (int*)malloc(num_merge_aggrs * sizeof(int));
    if (NULL == sizes_old_group) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }


    displs = (int*)malloc(num_merge_aggrs * sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        free(sizes_old_group);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }


    //merge_aggrs[0] is considered the new aggregator
    //New aggregator collects group sizes of the groups to be merged
    ompi_io_ompio_allgather_array (&fh->f_init_procs_per_group,
     		                   1,
				   MPI_INT,
				   sizes_old_group,
				   1,
				   MPI_INT,
				   0,
				   merge_aggrs,
				   num_merge_aggrs,
				   fh->f_comm);

    fh->f_procs_per_group = 0;


    for( i = 0; i < num_merge_aggrs; i++){
        fh->f_procs_per_group = fh->f_procs_per_group + sizes_old_group[i];
    }

    displs[0] = 0;
    for(i = 1; i < num_merge_aggrs; i++){
	  displs[i] = displs[i-1] + sizes_old_group[i-1];
    }

    fh->f_procs_in_group = (int*)malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == fh->f_procs_in_group) {
        opal_output (1, "OUT OF MEMORY\n");
        free(sizes_old_group);
        free(displs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    //New aggregator also collects the grouping distribution
    //This is the actual merge
    //use allgatherv array
    ompi_io_ompio_allgatherv_array (fh->f_init_procs_in_group,
                                   fh->f_init_procs_per_group,
                                   MPI_INT,
                                   fh->f_procs_in_group,
                                   sizes_old_group,
				   displs,
                                   MPI_INT,
                                   0,
                                   merge_aggrs,
                                   num_merge_aggrs,
                                   fh->f_comm);


    free(displs);
    free (sizes_old_group);

    return OMPI_SUCCESS;

}



int mca_io_ompio_split_a_group(mca_io_ompio_file_t *fh,
     		             OMPI_MPI_OFFSET_TYPE *start_offsets_lens,
		             OMPI_MPI_OFFSET_TYPE *end_offsets,
		             int size_new_group,
		             OMPI_MPI_OFFSET_TYPE *max_cci,
		             OMPI_MPI_OFFSET_TYPE *min_cci,
		             int *num_groups,
		             int *size_smallest_group)
{

    OMPI_MPI_OFFSET_TYPE *cci = NULL;
    *num_groups = fh->f_init_procs_per_group / size_new_group;
    *size_smallest_group = size_new_group;
    int i = 0;
    int k = 0;
    int flag = 0; //all groups same size
    int size = 0;

    if( fh->f_init_procs_per_group % size_new_group != 0 ){
        *num_groups = *num_groups + 1;
	*size_smallest_group = fh->f_init_procs_per_group % size_new_group;
	flag = 1;
    }

    cci = (OMPI_MPI_OFFSET_TYPE*)malloc(*num_groups * sizeof( OMPI_MPI_OFFSET_TYPE ));
    if (NULL == cci) {
        opal_output(1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    //check contiguity within new groups
    size = size_new_group;
    for( i = 0; i < *num_groups; i++){
         cci[i] = start_offsets_lens[3*size_new_group*i  + 1];
         //if it is the last group check if it is the smallest group
	 if( (i == *num_groups-1) && flag == 1){
             size = *size_smallest_group;
	 }
	 for( k = 0; k < size-1; k++){
	     if( end_offsets[size_new_group* i + k] == start_offsets_lens[3*size_new_group*i + 3*(k+1)] ){
	         cci[i] += start_offsets_lens[3*size_new_group*i + 3*(k + 1) + 1];
	     }
       	 }
     }

     //get min and max cci
     *min_cci = cci[0];
     *max_cci = cci[0];
     for( i = 1 ; i < *num_groups; i++){
         if(cci[i] > *max_cci){
	     *max_cci = cci[i];
	 }
	 else if(cci[i] < *min_cci){
	     *min_cci = cci[i];
	 }
     }
     //if cci is not needed anymore
     if (NULL != cci) {
        free (cci);
	cci =  NULL;
     }
     return OMPI_SUCCESS;
}

int mca_io_ompio_finalize_split(mca_io_ompio_file_t *fh,
                                  int size_new_group,
                                  int size_last_group)
{
   //based on new group and last group finalize f_procs_per_group and f_procs_in_group

    int i = 0;
    int j = 0;
    int k = 0;

    for( i = 0; i < fh->f_init_procs_per_group ; i++){

        if( fh->f_rank == fh->f_init_procs_in_group[i]){
             if( i >= fh->f_init_procs_per_group - size_last_group ){
	         fh->f_procs_per_group = size_last_group;
	     }
             else{
	         fh->f_procs_per_group = size_new_group;
	     }
        }
    }


    fh->f_procs_in_group = (int*)malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == fh->f_procs_in_group) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for( i = 0; i < fh->f_init_procs_per_group ; i++){
        if( fh->f_rank == fh->f_init_procs_in_group[i]){
            if( i >= fh->f_init_procs_per_group - size_last_group ){
	       //distribution of last group
	       for( j = 0; j < fh->f_procs_per_group; j++){
	           fh->f_procs_in_group[j] = fh->f_init_procs_in_group[fh->f_init_procs_per_group - size_last_group + j];
	       }
	    }
	    else{
	         //distribute all other groups
		 for( j = 0 ; j < fh->f_init_procs_per_group; j = j + size_new_group){
	             if(i >= j && i < j+size_new_group  ){
                         for( k = 0; k < fh->f_procs_per_group ; k++){
	                    fh->f_procs_in_group[k] = fh->f_init_procs_in_group[j+k];
			 }
		     }
		 }
	    }

        }
    }

    return OMPI_SUCCESS;
}

int mca_io_ompio_prepare_to_group(mca_io_ompio_file_t *fh,
		                  OMPI_MPI_OFFSET_TYPE **start_offsets_lens,
				  OMPI_MPI_OFFSET_TYPE **end_offsets, // need it?
				  OMPI_MPI_OFFSET_TYPE **aggr_bytes_per_group,
				  OMPI_MPI_OFFSET_TYPE *bytes_per_group,
                                  int **decision_list,
		                  size_t bytes_per_proc,
				  int *is_aggregator,
				  int *ompio_grouping_flag)
{

    OMPI_MPI_OFFSET_TYPE start_offset_len[3] = {0};
    OMPI_MPI_OFFSET_TYPE *aggr_bytes_per_group_tmp = NULL;
    OMPI_MPI_OFFSET_TYPE *start_offsets_lens_tmp = NULL;
    OMPI_MPI_OFFSET_TYPE *end_offsets_tmp = NULL;
    int *decision_list_tmp = NULL;

    int i = 0;
    int j = 0;
    int k = 0;
    int merge_count = 0;
    int split_count = 0; //not req?
    int retain_as_is_count = 0; //not req?


    //Store start offset and length in an array //also add bytes per process
    if(NULL == fh->f_decoded_iov){
         start_offset_len[0] = 0;
         start_offset_len[1] = 0;
    }
    else{
         start_offset_len[0] = (OMPI_MPI_OFFSET_TYPE) fh->f_decoded_iov[0].iov_base;
         start_offset_len[1] = fh->f_decoded_iov[0].iov_len;
    }
    start_offset_len[2] = bytes_per_proc;
    start_offsets_lens_tmp = (OMPI_MPI_OFFSET_TYPE* )malloc (3 * fh->f_init_procs_per_group * sizeof(OMPI_MPI_OFFSET_TYPE));
    if (NULL == start_offsets_lens_tmp) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    end_offsets_tmp = (OMPI_MPI_OFFSET_TYPE* )malloc (fh->f_init_procs_per_group * sizeof(OMPI_MPI_OFFSET_TYPE));
    if (NULL == end_offsets_tmp) {
        opal_output (1, "OUT OF MEMORY\n");
        free(start_offsets_lens_tmp);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    //Gather start offsets across processes in a group on aggregator
    ompi_io_ompio_allgather_array (start_offset_len,
                                   3,
                                   OMPI_OFFSET_DATATYPE,
                                   start_offsets_lens_tmp,
                                   3,
                                   OMPI_OFFSET_DATATYPE,
                                   0,
                                   fh->f_init_procs_in_group,
                                   fh->f_init_procs_per_group,
                                   fh->f_comm);
    for( k = 0 ; k < fh->f_init_procs_per_group; k++){
        end_offsets_tmp[k] = start_offsets_lens_tmp[3*k] + start_offsets_lens_tmp[3*k+1];
    }
    //Every process has the total bytes written in its group
    for(j = 0; j < fh->f_init_procs_per_group; j++){
        *bytes_per_group = *bytes_per_group + start_offsets_lens_tmp[3*j+2];
    }

    *start_offsets_lens = &start_offsets_lens_tmp[0];
    *end_offsets = &end_offsets_tmp[0];


    for( j = 0 ; j < fh->f_init_num_aggrs ; j++){
        if(fh->f_rank == fh->f_init_aggr_list[j])
           *is_aggregator = 1;
    }
    //Decide groups going in for a merge or a split
    //Merge only if the groups are consecutive
    if(*is_aggregator == 1){
       aggr_bytes_per_group_tmp = (OMPI_MPI_OFFSET_TYPE*)malloc (fh->f_init_num_aggrs * sizeof(OMPI_MPI_OFFSET_TYPE));
       if (NULL == aggr_bytes_per_group_tmp) {
          opal_output (1, "OUT OF MEMORY\n");
          return OMPI_ERR_OUT_OF_RESOURCE;
       }
    decision_list_tmp = (int* )malloc (fh->f_init_num_aggrs * sizeof(int));
    if (NULL == decision_list_tmp) {
        opal_output (1, "OUT OF MEMORY\n");
        free(aggr_bytes_per_group_tmp);
        free(start_offsets_lens_tmp);
        free(end_offsets_tmp);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    //Communicate bytes per group between all aggregators
    ompi_io_ompio_allgather_array (bytes_per_group,
                                   1,
                                   OMPI_OFFSET_DATATYPE,
                                   aggr_bytes_per_group_tmp,
                                   1,
                                   OMPI_OFFSET_DATATYPE,
                                   0,
                                   fh->f_init_aggr_list,
                                   fh->f_init_num_aggrs,
                                   fh->f_comm);

    for( i = 0; i < fh->f_init_num_aggrs; i++){
       if((size_t)(aggr_bytes_per_group_tmp[i])>
          (size_t)mca_io_ompio_bytes_per_agg){
          decision_list_tmp[i] = OMPIO_SPLIT;
          split_count++;
       }
       else if((size_t)(aggr_bytes_per_group_tmp[i])<
               (size_t)mca_io_ompio_bytes_per_agg){
            decision_list_tmp[i] = OMPIO_MERGE;
            merge_count++;
       }
       else{
	   decision_list_tmp[i] = OMPIO_RETAIN;
	   retain_as_is_count++;
	   }
    }

    *aggr_bytes_per_group = &aggr_bytes_per_group_tmp[0];
    //Go through the decision list to see if non consecutive
    //processes intend to merge, if yes retain original grouping
    for( i = 0; i < fh->f_init_num_aggrs ; i++){
        if(decision_list_tmp[i] == OMPIO_MERGE){
	    if( (i == 0) &&
	        (decision_list_tmp[i+1] != OMPIO_MERGE)){ //first group
		    decision_list_tmp[i] = OMPIO_RETAIN;
            }
	    else if( (i == fh->f_init_num_aggrs-1) &&
	             (decision_list_tmp[i-1] != OMPIO_MERGE)){

	        decision_list_tmp[i] = OMPIO_RETAIN;
	    }
	    else if(!((decision_list_tmp[i-1] == OMPIO_MERGE) ||
                      (decision_list_tmp[i+1] == OMPIO_MERGE))){

		 decision_list_tmp[i] = OMPIO_RETAIN;
	    }
        }
    }

    //Set the flag as per the decision list
    for( i = 0 ; i < fh->f_init_num_aggrs; i++){
        if((decision_list_tmp[i] == OMPIO_MERGE)&&
	   (fh->f_rank == fh->f_init_aggr_list[i]))
           *ompio_grouping_flag = OMPIO_MERGE;

       	if((decision_list_tmp[i] == OMPIO_SPLIT)&&
	   (fh->f_rank == fh->f_init_aggr_list[i]))
           *ompio_grouping_flag = OMPIO_SPLIT;

	if((decision_list_tmp[i] == OMPIO_RETAIN)&&
	   (fh->f_rank == fh->f_init_aggr_list[i]))
           *ompio_grouping_flag = OMPIO_RETAIN;
    }

    //print decision list of aggregators
    /*printf("RANK%d  : Printing decsion list   : \n",fh->f_rank);
    for( i = 0; i < fh->f_init_num_aggrs; i++){
        if(decision_list_tmp[i] == OMPIO_MERGE)
            printf("MERGE,");
        else if(decision_list_tmp[i] == OMPIO_SPLIT)
            printf("SPLIT, ");
	else if(decision_list_tmp[i] == OMPIO_RETAIN)
	    printf("RETAIN, " );
    }
    printf("\n\n");
   */
   *decision_list = &decision_list_tmp[0];
  }
    //Communicate flag to all group members
    ompi_io_ompio_bcast_array (ompio_grouping_flag,
                               1,
                               MPI_INT,
                               0,
                               fh->f_init_procs_in_group,
                               fh->f_init_procs_per_group,
                               fh->f_comm);



    return OMPI_SUCCESS;
}