From 6e9cbe397f29f75be6f3cba8ac67c9f5d97db351 Mon Sep 17 00:00:00 2001 From: Rob Latham Date: Wed, 3 Jun 2015 11:37:59 -0500 Subject: [PATCH] hint processing should not open files move opening of files from hint processing and into open routines. This is MPICH commit 92f1c69f0de8 and 22a77dceda11 see https://trac.mpich.org/projects/mpich/ticket/2261 Ref: https://github.com/open-mpi/ompi/issues/158 Signed-off-by: Pavan Balaji --- .../romio/adio/ad_lustre/ad_lustre_hints.c | 86 +++-------- .../romio/adio/ad_lustre/ad_lustre_open.c | 143 +++++++++++++----- 2 files changed, 123 insertions(+), 106 deletions(-) diff --git a/ompi/mca/io/romio314/romio/adio/ad_lustre/ad_lustre_hints.c b/ompi/mca/io/romio314/romio/adio/ad_lustre/ad_lustre_hints.c index 362d24f1fa..00aebd1a57 100644 --- a/ompi/mca/io/romio314/romio/adio/ad_lustre/ad_lustre_hints.c +++ b/ompi/mca/io/romio314/romio/adio/ad_lustre/ad_lustre_hints.c @@ -20,8 +20,7 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) char *value; int flag; ADIO_Offset stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1; - struct lov_user_md lum = { 0 }; - int err, myrank, fd_sys, perm, amode, old_mask; + int err, myrank; static char myname[] = "ADIOI_LUSTRE_SETINFO"; value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); @@ -47,18 +46,25 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) /* striping information */ ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag); - if (flag) + if (flag) { + ADIOI_Info_set(fd->info, "striping_unit", value); str_unit=atoll(value); + } ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, value, &flag); - if (flag) + if (flag) { + ADIOI_Info_set(fd->info, "striping_factor", value); str_factor=atoll(value); + } ADIOI_Info_get(users_info, "romio_lustre_start_iodevice", MPI_MAX_INFO_VAL, value, &flag); - if (flag) + if (flag) { + ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value); start_iodev=atoll(value); + } + /* direct read and write */ ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, @@ -84,72 +90,20 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) } MPI_Bcast(stripe_val, 3, MPI_OFFSET, 0, fd->comm); + /* do not open file in hint processing. Open file in open routines, + * where we can better deal with EXCL flag . Continue to check the + * "all processors set a value" condition holds. */ if (stripe_val[0] != str_factor || stripe_val[1] != str_unit || stripe_val[2] != start_iodev) { - FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys" - "-striping_factor:striping_unit:start_iodevice " - "need to be identical across all processes\n"); - MPI_Abort(MPI_COMM_WORLD, 1); - } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { - /* if user has specified striping info, process 0 tries to set it */ - if (!myrank) { - if (fd->perm == ADIO_PERM_NULL) { - old_mask = umask(022); - umask(old_mask); - perm = old_mask ^ 0666; - } - else perm = fd->perm; - - amode = 0; - if (fd->access_mode & ADIO_CREATE) - amode = amode | O_CREAT; - if (fd->access_mode & ADIO_RDONLY) - amode = amode | O_RDONLY; - if (fd->access_mode & ADIO_WRONLY) - amode = amode | O_WRONLY; - if (fd->access_mode & ADIO_RDWR) - amode = amode | O_RDWR; - if (fd->access_mode & ADIO_EXCL) - amode = amode | O_EXCL; - - /* we need to create file so ensure this is set */ - amode = amode | O_LOV_DELAY_CREATE | O_CREAT; - - fd_sys = open(fd->filename, amode, perm); - if (fd_sys == -1) { - if (errno != EEXIST) - fprintf(stderr, - "Failure to open file %s %d %d\n",strerror(errno), amode, perm); - } else { - lum.lmm_magic = LOV_USER_MAGIC; - lum.lmm_pattern = 0; - lum.lmm_stripe_size = str_unit; - /* crude check for overflow of lustre internal datatypes. - * Silently cap to large value if user provides a value - * larger than lustre supports */ - if (lum.lmm_stripe_size != str_unit) { - lum.lmm_stripe_size = UINT_MAX; - } - lum.lmm_stripe_count = str_factor; - if ( lum.lmm_stripe_count != str_factor) { - lum.lmm_stripe_count = USHRT_MAX; - } - lum.lmm_stripe_offset = start_iodev; - if (lum.lmm_stripe_offset != start_iodev) { - lum.lmm_stripe_offset = USHRT_MAX; - } - - err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); - if (err == -1 && errno != EEXIST) { - fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno)); - } - close(fd_sys); - } - } /* End of striping parameters validation */ + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME("ADIOI_LUSTRE_SetInfo", + "str_factor or str_unit or start_iodev", + error_code); + ADIOI_Free(value); + return; } - MPI_Barrier(fd->comm); } + /* get other hint */ if (users_info != MPI_INFO_NULL) { /* CO: IO Clients/OST, diff --git a/ompi/mca/io/romio314/romio/adio/ad_lustre/ad_lustre_open.c b/ompi/mca/io/romio314/romio/adio/ad_lustre/ad_lustre_open.c index 716e462fa8..75f57af05b 100644 --- a/ompi/mca/io/romio314/romio/adio/ad_lustre/ad_lustre_open.c +++ b/ompi/mca/io/romio314/romio/adio/ad_lustre/ad_lustre_open.c @@ -18,14 +18,17 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) { int perm, old_mask, amode, amode_direct; - int lumlen; + int lumlen, myrank, flag, set_layout=0, err; struct lov_user_md *lum = NULL; char *value; + ADIO_Offset str_factor = -1, str_unit=0, start_iodev=-1; #if defined(MPICH) || !defined(PRINT_ERR_MSG) static char myname[] = "ADIOI_LUSTRE_OPEN"; #endif + MPI_Comm_rank(fd->comm, &myrank); + if (fd->perm == ADIO_PERM_NULL) { old_mask = umask(022); umask(old_mask); @@ -47,46 +50,103 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) amode_direct = amode | O_DIRECT; - fd->fd_sys = open(fd->filename, amode|O_CREAT, perm); - - if (fd->fd_sys != -1) { - int err; - - /* get file striping information and set it in info */ - /* odd malloc here because lov_user_md contains some fixed data and - * then a list of 'lmm_objects' representing stripe */ - lumlen = sizeof(struct lov_user_md) + - MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data); - /* furthermore, Pascal Deveze reports that, even though we pass a - * "GETSTRIPE" (read) flag to the ioctl, if some of the values of this - * struct are uninitialzed, the call can give an error. calloc in case - * there are other members that must be initialized and in case - * lov_user_md struct changes in future */ - lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen); - lum->lmm_magic = LOV_USER_MAGIC; - err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum); - if (!err) { - value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); - - fd->hints->striping_unit = lum->lmm_stripe_size; - sprintf(value, "%d", lum->lmm_stripe_size); - ADIOI_Info_set(fd->info, "striping_unit", value); - - fd->hints->striping_factor = lum->lmm_stripe_count; - sprintf(value, "%d", lum->lmm_stripe_count); - ADIOI_Info_set(fd->info, "striping_factor", value); - - fd->hints->fs_hints.lustre.start_iodevice = lum->lmm_stripe_offset; - sprintf(value, "%d", lum->lmm_stripe_offset); - ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value); - - ADIOI_Free(value); - } - ADIOI_Free(lum); - - if (fd->access_mode & ADIO_APPEND) - fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); } + /* odd length here because lov_user_md contains some fixed data and + * then a list of 'lmm_objects' representing stripe */ + lumlen = sizeof(struct lov_user_md) + + MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data); + lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen); + + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); + /* we already validated in LUSTRE_SetInfo that these are going to be the same */ + if (fd->info != MPI_INFO_NULL) { + /* striping information */ + ADIOI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, + value, &flag); + if (flag) + str_unit=atoll(value); + + ADIOI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, + value, &flag); + if (flag) + str_factor=atoll(value); + + ADIOI_Info_get(fd->info, "romio_lustre_start_iodevice", + MPI_MAX_INFO_VAL, value, &flag); + if (flag) + start_iodev=atoll(value); + } + if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) + set_layout = 1; + + /* if hints were set, we need to delay creation of any lustre objects. + * However, if we open the file with O_LOV_DELAY_CREATE and don't call the + * follow-up ioctl, subsequent writes will fail */ + if (myrank == 0 && set_layout) + amode = amode | O_LOV_DELAY_CREATE; + + fd->fd_sys = open(fd->filename, amode, perm); + if (fd->fd_sys == -1) goto fn_exit; + + /* we can only set these hints on new files */ + /* It was strange and buggy to open the file in the hint path. Instead, + * we'll apply the file tunings at open time */ + if ((amode & O_CREAT) && set_layout ) { + /* if user has specified striping info, process 0 tries to set it */ + if (!myrank) { + lum->lmm_magic = LOV_USER_MAGIC; + lum->lmm_pattern = 0; + /* crude check for overflow of lustre internal datatypes. + * Silently cap to large value if user provides a value + * larger than lustre supports */ + if (str_unit > UINT_MAX) + lum->lmm_stripe_size = UINT_MAX; + else + lum->lmm_stripe_size = str_unit; + + if (str_factor > USHRT_MAX) + lum->lmm_stripe_count = USHRT_MAX; + else + lum->lmm_stripe_count = str_factor; + + if (start_iodev > USHRT_MAX) + lum->lmm_stripe_offset = USHRT_MAX; + else + lum->lmm_stripe_offset = start_iodev; + err = ioctl(fd->fd_sys, LL_IOC_LOV_SETSTRIPE, lum); + if (err == -1 && errno != EEXIST) { + fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno)); + /* not a fatal error, but user might care to know */ + } + } /* End of striping parameters validation */ + } + + /* Pascal Deveze reports that, even though we pass a + * "GETSTRIPE" (read) flag to the ioctl, if some of the values of this + * struct are uninitialzed, the call can give an error. zero it out in case + * there are other members that must be initialized and in case + * lov_user_md struct changes in future */ + memset(lum, 0, lumlen); + lum->lmm_magic = LOV_USER_MAGIC; + err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum); + if (!err) { + + fd->hints->striping_unit = lum->lmm_stripe_size; + sprintf(value, "%d", lum->lmm_stripe_size); + ADIOI_Info_set(fd->info, "striping_unit", value); + + fd->hints->striping_factor = lum->lmm_stripe_count; + sprintf(value, "%d", lum->lmm_stripe_count); + ADIOI_Info_set(fd->info, "striping_factor", value); + + fd->hints->fs_hints.lustre.start_iodevice = lum->lmm_stripe_offset; + sprintf(value, "%d", lum->lmm_stripe_offset); + ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value); + + } + + if (fd->access_mode & ADIO_APPEND) + fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); @@ -101,6 +161,9 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) fd->direct_write = fd->direct_read = 0; } } +fn_exit: + ADIOI_Free(lum); + ADIOI_Free(value); /* --BEGIN ERROR HANDLING-- */ if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&