openmpi/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.c

/*
 *
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory.  All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies.  All rights reserved.
 * Copyright (c) 2012      Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2014      Intel, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif

#include "ompi/proc/proc.h"
#include "ompi/patterns/comm/coll_ops.h"

#include "opal/dss/dss.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/align.h"

#include "bcol_basesmuma.h"


#define SM_BACKING_FILE_NAME_MAX_LEN 256

struct file_info_t {
    uint32_t vpid;
    uint32_t jobid;
    uint64_t file_size;
    uint64_t size_ctl_structure;
    uint64_t data_seg_alignment;
    char file_name[SM_BACKING_FILE_NAME_MAX_LEN];
};

/* need to allocate space for the peer */
static void bcol_basesmuma_smcm_proc_item_t_construct
(bcol_basesmuma_smcm_proc_item_t * item) {
}

/* need to free the space for the peer */
static void bcol_basesmuma_smcm_proc_item_t_destruct
    (bcol_basesmuma_smcm_proc_item_t * item) {
}

OBJ_CLASS_INSTANCE(bcol_basesmuma_smcm_proc_item_t,
        opal_list_item_t,
        bcol_basesmuma_smcm_proc_item_t_construct,
        bcol_basesmuma_smcm_proc_item_t_destruct);


bcol_basesmuma_smcm_mmap_t* bcol_basesmuma_smcm_create_mmap(int fd, size_t size, char *file_name,
        size_t size_ctl_structure,
        size_t data_seg_alignment)
{
    bcol_basesmuma_smcm_mmap_t *map;
    bcol_basesmuma_smcm_file_header_t *seg;
    unsigned char *addr = NULL;

    /* map the file and initialize segment state */
    seg = (bcol_basesmuma_smcm_file_header_t*)
        mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if((void*)-1 == seg) {
        return NULL;
    }

    /* set up the map object */
    map = (bcol_basesmuma_smcm_mmap_t* )malloc(sizeof(bcol_basesmuma_smcm_mmap_t));
    assert(map);

    strncpy(map->map_path, file_name, OPAL_PATH_MAX);
    /* the first entry in the file is the control structure. The first
       entry in the control structure is an mca_common_sm_file_header_t
       element */
    map->map_seg = seg;

    addr = ((unsigned char *)seg) + size_ctl_structure;
    /* If we have a data segment (i.e., if 0 != data_seg_alignment),
       then make it the first aligned address after the control
       structure. */
    if (0 != data_seg_alignment) {
        addr = OPAL_ALIGN_PTR(addr,  data_seg_alignment, unsigned char*);

        /* is addr past end of file ? */
        if((unsigned char*)seg + size < addr) {
            opal_output(0, "bcol_basesmuma_smcm_mmap_init: "
                    "memory region too small len %lu  addr %p\n",
                    (unsigned long)size, addr);
            return NULL;
        }
    }
    map->data_addr = addr;
    map->map_addr = (unsigned char *)seg;
    map->map_size = size;

    return map;
}


/* smcm_allgather_connection:
   This function is called when a shared memory subgroup wants to establish shared memory "connections" amongs
   a group of processes.

   This function DOES NOT create any shared memory backing files, it only mmaps already existing files. Shared
   memory files are created by the shared memory registration function
   -----------------------------------------------------------------------------------------------------------
   Input params:

   - sbgp module   The subgrouping module contains the list of ranks to wire up.

   - peer_list      An opal list containing a list of bcol_basesmuma_smcm_proc_item_t types. This
   contains a list of peers whose shared memory files I have already mapped.
   Upon completion of the allgather exchange with all members of the group and depending on the
   value of "map_all", my peers' shared memory files are mapped into my local virtual memory
   space, with all pertinent information being stored in an bcol_basesmuma_smcm_proc_item_t which is
   subsequently appended onto the "peer_list".

   - comm           The ompi_communicator_t communicator.

   - input          A data struct that caches the information about my shared memory file.

   - map_all        Bool that determines whether or not to go ahead and map the files from all of the peers
   defined in the sbgp-ing module. If map_all == true, then go ahead and mmap all of the files
   obtained in the exchange and append the information to the "peer_list". If map_all == false
   then make a check and only mmap those peers' files whose vpid/jobid/filename combination do
   not already exist in the "peer_list". Once mapping is completed, append this peer's information
   to the "peer_list".
   -----------------------------------------------------------------------------------------------------------
 *
 */


int bcol_basesmuma_smcm_allgather_connection(
        mca_bcol_basesmuma_module_t *sm_bcol_module,
        mca_sbgp_base_module_t *module,
        opal_list_t *peer_list,
        bcol_basesmuma_smcm_proc_item_t ***back_files,
        ompi_communicator_t *comm,
        bcol_basesmuma_smcm_file_t input,
        char *base_fname,
        bool map_all)
{

	/* define local variables */

	int rc, i, cnt, index_in_group, fd = -1, n_files_mapped;
	size_t len, len_other;
	uint32_t rem_vpid, rem_jobid;
	uint64_t rem_size, rem_size_ctl_struct, rem_data_seg_align;
	char *rem_fname,*cpy_ret;
	ptrdiff_t mem_offset;
	ompi_proc_t *proc_temp, *my_id;
	bcol_basesmuma_smcm_proc_item_t *temp;
	bcol_basesmuma_smcm_proc_item_t *item_ptr = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t);
	bcol_basesmuma_smcm_proc_item_t **backing_files;
        struct file_info_t local_file;
        struct file_info_t *all_files=NULL;

	backing_files= (bcol_basesmuma_smcm_proc_item_t **)
        malloc(sizeof(bcol_basesmuma_smcm_proc_item_t *)*module->group_size);
	if( !backing_files ) {
            rc=OMPI_ERR_OUT_OF_RESOURCE;
            goto Error;
    }
        *back_files=backing_files;

        /* check to see if we have already mapped all the files, if we have
         * just need to fill in backing_files array, and we are done
         */
        n_files_mapped=0;
        for (i = 0; i < module->group_size; i++) {
            /* get the proc info */
            proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]);
            rem_jobid = proc_temp->proc_name.jobid;
            rem_vpid = proc_temp->proc_name.vpid;
            index_in_group=i;

            for (item_ptr = (bcol_basesmuma_smcm_proc_item_t*) opal_list_get_first(peer_list);
                    item_ptr != (bcol_basesmuma_smcm_proc_item_t*) opal_list_get_end(peer_list);
                    item_ptr = (bcol_basesmuma_smcm_proc_item_t*) opal_list_get_next((opal_list_item_t *)item_ptr)) {

                /* if the vpid/jobid/filename combination already exists in the list,
                   then do not map this peer's file --- because you already have */
                if (rem_vpid == item_ptr->peer.vpid && rem_jobid == item_ptr->peer.jobid
                        && (strstr(item_ptr->sm_file.file_name,base_fname)) ){

                    /* record file data */
                    /* RLG - note - is this correct ? */
                    /*sm_bcol_module->ctl_backing_files_info[index_in_group]=(bcol_basesmuma_smcm_proc_item_t *)item_ptr; */
                    backing_files[index_in_group]=(bcol_basesmuma_smcm_proc_item_t *)item_ptr;
                    n_files_mapped++;
                    /* found it - no need to continue looking */
                    break;
                }
            }
        }
        /* check to see if we are done - our own files are not in this list*/
        if (n_files_mapped == (module->group_size-1) ) {
            return OMPI_SUCCESS;
        }


	/* Phase One:
	   gather a list of processes that will participate in the allgather - I'm
	   preparing this list from the sbgp-ing module that was passed into the function */

        my_id = ompi_proc_local();

        /* fill in local file information */
        local_file.vpid=my_id->proc_name.vpid;
        local_file.jobid=my_id->proc_name.jobid;
        local_file.file_size=input.size;
        local_file.size_ctl_structure=input.size_ctl_structure;
        local_file.data_seg_alignment=input.data_seg_alignment;

        len=strlen(input.file_name);
        if( len > SM_BACKING_FILE_NAME_MAX_LEN-1 ) {
            fprintf(stderr," backing file name too long:  %s len :: %d \n",
                    input.file_name,(int) len);
		rc = OMPI_ERROR;
		goto Error;
        }
        strcpy(&(local_file.file_name[0]),input.file_name);
        local_file.file_name[len]='\0';

        /* will exchange this data type as a string of characters -
         * this routine is first called before MPI_init() completes
         * and before error handling is setup, so can't use the
         * MPI data types to send this data */
        len=sizeof(struct file_info_t);
        all_files=(struct file_info_t *)malloc(
                module->group_size*len);
        if( !all_files ) {
            rc=OMPI_ERR_OUT_OF_RESOURCE;
            goto Error;
        }

        /* initialize the destination array */
        bzero(all_files,sizeof(struct file_info_t)*
                sm_bcol_module->super.sbgp_partner_module->group_size);

        /* exchange data */
        rc=comm_allgather_pml(&local_file,all_files,sizeof(struct file_info_t),
                MPI_CHAR,
                sm_bcol_module->super.sbgp_partner_module->my_index,
                sm_bcol_module->super.sbgp_partner_module->group_size,
                sm_bcol_module->super.sbgp_partner_module->group_list,
                sm_bcol_module->super.sbgp_partner_module->group_comm);
        if( OMPI_SUCCESS != rc ) {
            fprintf(stderr," failed in comm_allgather_pml.  Error code: %d \n",
                    rc);
            fflush(stderr);
            goto Error;
        }

	/* Phase four:
	   loop through the receive buffer, unpack the data recieved from remote peers */

        for (i = 0; i < module->group_size; i++) {

            index_in_group=i;
            rem_vpid=all_files[i].vpid;
            rem_jobid=all_files[i].jobid;
            rem_fname=&(all_files[i].file_name[0]);
            rem_size=all_files[i].file_size;
            rem_size_ctl_struct=all_files[i].size_ctl_structure;
            rem_data_seg_align=all_files[i].data_seg_alignment;

            /* if this is me, release resources and continue */
            len=strlen(input.file_name);
            len_other=strlen(rem_fname);
            if( (rem_vpid == my_id->proc_name.vpid) &&
                    (rem_jobid == my_id->proc_name.jobid) &&
                    ( len == len_other ) &&
                    (strncmp(input.file_name,rem_fname, len) == 0 )
              ) {

                /*free(rem_fname); */
                continue;
            }

            temp = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t);

            temp->peer.vpid = rem_vpid;
            temp->peer.jobid = rem_jobid;
            temp->sm_file.file_name = (char *) malloc(len_other+1);
            if( !temp->sm_file.file_name) {
                rc = OMPI_ERR_OUT_OF_RESOURCE;
                goto Error;
            }
            cpy_ret=strncpy(temp->sm_file.file_name,&(all_files[i].file_name[0]),
                    len_other);
            if( !cpy_ret ) {
                rc = OMPI_ERROR;
                goto Error;
            }
            temp->sm_file.file_name[len_other]='\0';

            temp->sm_file.size = (size_t) rem_size;
            temp->sm_file.mpool_size = (size_t) rem_size;
            temp->sm_file.size_ctl_structure = (size_t) rem_size_ctl_struct;
            temp->sm_file.data_seg_alignment = (size_t) rem_data_seg_align;

            /* Phase Five:
               If map_all == true, then  we map every peer's file
               else we check to see if I have already mapped this
               vpid/jobid/filename combination and if I have, then
               I do not mmap this peer's file.
             *
             */
            if(map_all) {
                fd = -1;
                fd = open(temp->sm_file.file_name,O_RDWR,0600);
                if(0 > fd) {
                    fprintf(stderr,"SMCM Allgather failed to open sm backing file \n");
                    fflush(stderr);
                    rc = OMPI_ERROR;
                    goto Error;
                } else {

                    /* map the file */
                    temp->sm_mmap = bcol_basesmuma_smcm_create_mmap(fd,temp->sm_file.size,
                            temp->sm_file.file_name,
                            temp->sm_file.size_ctl_structure,
                            getpagesize());
                    if (NULL == temp->sm_mmap) {
                        fprintf(stderr,"mmapping failed to map remote peer's file\n");
                        fflush(stderr);
                        rc = OMPI_ERROR;
                        goto Error;
                    }

                    /* compute memory offset */
                    mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr -
                        (ptrdiff_t) temp->sm_mmap->map_seg;
                    temp->sm_mmap->map_seg->seg_offset = mem_offset;
                    temp->sm_mmap->map_seg->seg_size = temp->sm_file.size - mem_offset;
                    /* more stuff to follow */
                }

                /* append this peer's info, including shared memory map addr, onto the
                   peer_list */

                /* record file data */
                sm_bcol_module->ctl_backing_files_info[index_in_group]=(bcol_basesmuma_smcm_proc_item_t *)temp;
                backing_files[index_in_group]=(bcol_basesmuma_smcm_proc_item_t *) temp;

                opal_list_append(peer_list, (opal_list_item_t*) temp);

            } else {
                /* check to see if I have already mapped this peer's file */
                cnt = 0;
                for (item_ptr = (bcol_basesmuma_smcm_proc_item_t*) opal_list_get_first(peer_list);
                        item_ptr != (bcol_basesmuma_smcm_proc_item_t*) opal_list_get_end(peer_list);
                        item_ptr = (bcol_basesmuma_smcm_proc_item_t*) opal_list_get_next((opal_list_item_t *)item_ptr)) {

                    /* if the vpid/jobid/filename combination already exists in the list,
                       then do not map this peer's file --- because you already have */
                    if (rem_vpid == item_ptr->peer.vpid && rem_jobid == item_ptr->peer.jobid
                            && (0 == strcmp(rem_fname,item_ptr->sm_file.file_name)) ){
                        cnt++;

                        /* record file data */
                        sm_bcol_module->ctl_backing_files_info[index_in_group]=(bcol_basesmuma_smcm_proc_item_t *)item_ptr;
                        backing_files[index_in_group]=(bcol_basesmuma_smcm_proc_item_t *)item_ptr;
                        /* found it - no need to continue looking */
                        break;
                    }
                }

                if (cnt == 0) {

                    /* then we haven't mmapped this file, so let's do it now. watch out josh,
                       the ordering of procs may sting you! Do we need to pack in a rank? */

                    fd = -1;
                    fd = open(temp->sm_file.file_name,O_RDWR,0600);
                    if (0 > fd) {
                        fprintf(stderr,"file open failed %s \n",
                                temp->sm_file.file_name);
                        fflush(stderr);
                        rc = OMPI_ERROR;
                        goto Error;
                    } else {

                        /* map this file */

                        temp->sm_mmap = bcol_basesmuma_smcm_create_mmap(fd,temp->sm_file.size,
                                temp->sm_file.file_name,
                                temp->sm_file.size_ctl_structure,
                                getpagesize());
                        if (NULL == temp->sm_mmap) {
                            fprintf(stderr,"mmapping failed to map remote peer's file\n");
                            fflush(stderr);
                            rc = OMPI_ERROR;
                            goto Error;
                        }

                        /* initialize the segment */
                        mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr -
                            (ptrdiff_t) temp->sm_mmap->map_seg;

                    }

                    /* record file data */
                    backing_files[index_in_group]=(bcol_basesmuma_smcm_proc_item_t *) temp;
                    /*sm_bcol_module->ctl_backing_files_info[index_in_group]=(bcol_basesmuma_smcm_proc_item_t *)temp;*/
                    opal_list_append(peer_list, (opal_list_item_t*) temp);

                } else {
                    /* free the allocated memory - we are not going to put this on any list */
                    OBJ_RELEASE(temp);
                }
            }

        }

	/* clean-up */
        if( NULL != all_files ) {
            free(all_files);
            all_files=NULL;
        }

	return OMPI_SUCCESS;


Error:

	/* error clean-up and return */
        if( NULL != all_files ) {
            free(all_files);
            all_files=NULL;
        }

	return rc;
}


OBJ_CLASS_INSTANCE(
        bcol_basesmuma_smcm_mmap_t,
        opal_list_item_t,
        NULL,
        NULL
        );


/*
 * mmap the specified file as a shared file.  No information exchange with other
 * processes takes place within this routine.
 * This function assumes that the memory has already been allocated, and only the
 * mmap needs to be done.
 */
bcol_basesmuma_smcm_mmap_t *bcol_basesmuma_smcm_mem_reg(void *in_ptr,
		size_t length,
		size_t alignment,
		char* file_name)
{
	/* local variables */
	int fd = -1;
	bcol_basesmuma_smcm_mmap_t *map = NULL;
        /* if pointer is not allocated - return error.  We have no clue how the user will allocate or
         *   free this memory.
         */

	/* open the shared memory backing file */

	fd = open(file_name, O_CREAT|O_RDWR,0600);
	if (fd < 0) {
		opal_output(0, "basesmuma shared memory allocation open failed with errno: %d\n",
					errno);
	} else if (0 != ftruncate(fd,length)) {
		opal_output(0, "basesmuma shared memory allocation ftruncate failed with errno: %d\n",
					errno);
	} else {

		map = bcol_basesmuma_smcm_reg_mmap(in_ptr, fd, length, alignment, file_name);
		if (NULL == map) {
			return NULL;
		}
	}

	/* takes us to the top of the control structure */

	return map;

}

bcol_basesmuma_smcm_mmap_t * bcol_basesmuma_smcm_reg_mmap(void *in_ptr,
		int fd,
		size_t length,
		size_t alignment,
		char *file_name)
{

	/* local variables */
	bcol_basesmuma_smcm_mmap_t *map;
	bcol_basesmuma_smcm_file_header_t *seg;
	unsigned char* myaddr = NULL;

	/* map the file and initialize the segment state */
	seg = (bcol_basesmuma_smcm_file_header_t *)
            mmap(in_ptr, length, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, fd, 0);
	if((void*)-1 == seg) {
		return NULL;
	}

	/* set up the map object */

	/*map = OBJ_NEW(mca_common_sm_mmap_t); */
        map=(bcol_basesmuma_smcm_mmap_t *)malloc(sizeof(bcol_basesmuma_smcm_mmap_t));
        assert(map);
	strncpy(map->map_path, file_name, OPAL_PATH_MAX);

	/* the first entry in the file is the control structure. the first entry
	 in the control structure is an mca_common_sm_file_header_t element */
	map->map_seg = seg;

	myaddr = (unsigned char *) seg;
	/* if we have a data segment (i.e. if 0 != data_seg_alignement) */

	if ( 0 != alignment) {
		myaddr = OPAL_ALIGN_PTR(myaddr, alignment, unsigned char*);

		/* is addr past the end of the file? */
		if ((unsigned char *) seg+length < myaddr) {
			opal_output(0, "mca_bcol_basesmuma_sm_alloc_mmap: memory region too small len %lu add %p\n",
						(unsigned long) length, myaddr);
			return NULL;
		}

	}


	map->data_addr = (unsigned char*) myaddr;
	map->map_addr = (unsigned char*) seg;
	map->map_size = length;

	return map;

}