1
1

sharedfp/individual: defer error when unable to open datafile

This commit changes the behavior of the individual sharedfp component. If
the component cannot create either the datafile or the metadatafile during File_open,
it no longer raises an error at that point. This allows applications that do not use shared
file pointer operations to continue execution without any issue.

If, however, the user subsequently calls MPI_File_write_shared or a similar operation, an error
will be raised at that point.

Fixes issue #7429

Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
Этот коммит содержится в:
Edgar Gabriel 2020-02-21 12:13:39 -06:00
родитель e9a54e8e0e
Коммит df6e3e503a
2 изменённых файлов: 50 добавлений и 23 удалений

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013-2018 University of Houston. All rights reserved.
* Copyright (c) 2013-2019 University of Houston. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
@ -92,11 +92,18 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm,
MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_DELETE_ON_CLOSE,
&(MPI_INFO_NULL->super), datafilehandle, false);
if ( OMPI_SUCCESS != err) {
opal_output(0, "mca_sharedfp_individual_file_open: Error during datafile file open\n");
opal_output(ompi_sharedfp_base_framework.framework_output,
"mca_sharedfp_individual_file_open: Error during datafile file open. Continuing anyway. \n");
free (sh);
free (datafilename);
free (datafilehandle);
return err;
// We reset the error code here to OMPI_SUCCESS since the individual component can act as
// a dummy component, in case no sharedfp operations are used by the code. Invoking any write/read
// operations will however lead to an error, since the sharedfp_data pointer will be NULL.
sh = NULL;
err = OMPI_SUCCESS;
goto exit;
}
/*----------------------------------------------------------*/
@ -113,9 +120,13 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm,
if ( NULL == metadatafilename ) {
free (sh);
free (datafilename);
mca_common_ompio_file_close ( datafilehandle);
free (datafilehandle);
opal_output(0, "mca_sharedfp_individual_file_open: Error during memory allocation\n");
return OMPI_ERR_OUT_OF_RESOURCE;
sh=NULL;
err = OMPI_ERR_OUT_OF_RESOURCE;
goto exit;
}
snprintf ( metadatafilename, len, "%s%s%d", filename, ".metadata.",fh->f_rank);
@ -123,22 +134,34 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm,
if ( NULL == metadatafilehandle ) {
free (sh);
free (datafilename);
mca_common_ompio_file_close ( datafilehandle);
free (datafilehandle);
free (metadatafilename);
opal_output(0, "mca_sharedfp_individual_file_open: Error during memory allocation\n");
return OMPI_ERR_OUT_OF_RESOURCE;
sh = NULL;
err = OMPI_ERR_OUT_OF_RESOURCE;
goto exit;
}
err = mca_common_ompio_file_open ( MPI_COMM_SELF,metadatafilename,
MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_DELETE_ON_CLOSE,
&(MPI_INFO_NULL->super), metadatafilehandle, false);
if ( OMPI_SUCCESS != err) {
opal_output(0, "mca_sharedfp_individual_file_open: Error during metadatafile file open\n");
opal_output(ompi_sharedfp_base_framework.framework_output,
"mca_sharedfp_individual_file_open: Error during metadatafile file open. Continuing anyway. \n");
free (sh);
free (datafilename);
mca_common_ompio_file_close ( datafilehandle);
free (datafilehandle);
free (metadatafilename);
free (metadatafilehandle);
return err;
// We reset the error code here to OMPI_SUCCESS since the individual component can act as
// a dummy component, in case no sharedfp operations are used by the code. Invoking any write/read
// operations will however lead to an error, since the sharedfp_data pointer will be NULL.
sh = NULL;
err = OMPI_SUCCESS;
goto exit;
}
/*save the datafilehandle and metadatahandle in the sharedfp individual module data structure*/
@ -150,6 +173,8 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm,
headnode->metadatafilename = metadatafilename;
}
exit:
/*save the sharedfp individual module data structure in the ompio filehandle structure*/
fh->f_sharedfp_data = sh;

Просмотреть файл

@ -54,24 +54,26 @@ int mca_sharedfp_individual_write (ompio_file_t *fh,
/*Retrieve data structure for shared file pointer operations*/
sh = fh->f_sharedfp_data;
headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data;
if (headnode) {
/*Insert metadata record into a queue*/
mca_sharedfp_individual_insert_metadata(OMPI_FILE_WRITE_SHARED, totalbytes, sh);
/*Write the data into individual file*/
ret = mca_common_ompio_file_write_at ( headnode->datafilehandle,
headnode->datafile_offset,
buf, count, datatype, status);
if ( OMPI_SUCCESS != ret ) {
opal_output(0,"mca_sharedfp_individual_write: Error while writing the datafile \n");
return -1;
}
/* Update the datafileoffset*/
headnode->datafile_offset = headnode->datafile_offset + totalbytes;
if ( NULL == headnode) {
opal_output (0, "sharedfp_individual_write_ordered: headnode is NULL but file is open\n");
return OMPI_ERROR;
}
/*Insert metadata record into a queue*/
mca_sharedfp_individual_insert_metadata(OMPI_FILE_WRITE_SHARED, totalbytes, sh);
/*Write the data into individual file*/
ret = mca_common_ompio_file_write_at ( headnode->datafilehandle,
headnode->datafile_offset,
buf, count, datatype, status);
if ( OMPI_SUCCESS != ret ) {
opal_output(0,"mca_sharedfp_individual_write: Error while writing the datafile \n");
return -1;
}
/* Update the datafileoffset*/
headnode->datafile_offset = headnode->datafile_offset + totalbytes;
return ret;
}