1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ 2 /* 3 * 4 * Copyright (C) 2007 UChicago/Argonne LLC 5 * See COPYRIGHT notice in top-level directory. 6 */ 7 8 #include "adio.h" 9 10 /* Generic version of a "collective open". Assumes a "real" underlying 11 * file system (meaning no wonky consistency semantics like NFS). 12 * 13 * optimization: by having just one process create a file, close it, 14 * then have all N processes open it, we can possibly avoid contention 15 * for write locks on a directory for some file systems. 16 * 17 * Happy side-effect: exclusive create (error if file already exists) 18 * just falls out 19 * 20 * Note: this is not a "scalable open" (c.f. "The impact of file systems 21 * on MPI-IO scalability"). 22 */ 23 24 void ADIOI_GEN_OpenColl(ADIO_File fd, int rank, 25 int access_mode, int *error_code) 26 { 27 int orig_amode_excl, orig_amode_wronly; 28 MPI_Comm tmp_comm; 29 30 orig_amode_excl = access_mode; 31 32 if (access_mode & ADIO_CREATE ){ 33 if(rank == fd->hints->ranklist[0]) { 34 /* remove delete_on_close flag if set */ 35 if (access_mode & ADIO_DELETE_ON_CLOSE) 36 fd->access_mode = access_mode ^ ADIO_DELETE_ON_CLOSE; 37 else 38 fd->access_mode = access_mode; 39 40 tmp_comm = fd->comm; 41 fd->comm = MPI_COMM_SELF; 42 (*(fd->fns->ADIOI_xxx_Open))(fd, error_code); 43 fd->comm = tmp_comm; 44 MPI_Bcast(error_code, 1, MPI_INT, \ 45 fd->hints->ranklist[0], fd->comm); 46 /* if no error, close the file and reopen normally below */ 47 if (*error_code == MPI_SUCCESS) 48 (*(fd->fns->ADIOI_xxx_Close))(fd, error_code); 49 50 fd->access_mode = access_mode; /* back to original */ 51 } 52 else MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm); 53 54 if (*error_code != MPI_SUCCESS) { 55 return; 56 } 57 else { 58 /* turn off CREAT (and EXCL if set) for real multi-processor open */ 59 access_mode ^= ADIO_CREATE; 60 if (access_mode & ADIO_EXCL) 61 access_mode ^= ADIO_EXCL; 62 } 63 } 64 fd->blksize = 1024*1024*4; /* this large default value should be good for 65 most file systems. any ROMIO driver is free 66 to stat the file and find an optimial value */ 67 68 /* if we are doing deferred open, non-aggregators should return now */ 69 if (fd->hints->deferred_open ) { 70 if (!(fd->is_agg)) { 71 /* we might have turned off EXCL for the aggregators. 72 * restore access_mode that non-aggregators get the right 73 * value from get_amode */ 74 fd->access_mode = orig_amode_excl; 75 /* In file-system specific open, a driver might collect some 76 * information via stat(). Deferred open means not every process 77 * participates in fs-specific open, but they all participate in 78 * this open call. Broadcast a bit of information in case 79 * lower-level file system driver (e.g. 'bluegene') collected it 80 * (not all do)*/ 81 MPI_Bcast(&(fd->blksize), 1, MPI_LONG, fd->hints->ranklist[0], fd->comm); 82 *error_code = MPI_SUCCESS; 83 ADIOI_Assert(fd->blksize > 0); 84 return; 85 } 86 } 87 88 /* For writing with data sieving, a read-modify-write is needed. If 89 the file is opened for write_only, the read will fail. Therefore, 90 if write_only, open the file as read_write, but record it as write_only 91 in fd, so that get_amode returns the right answer. */ 92 93 /* observation from David Knaak: file systems that do not support data 94 * sieving do not need to change the mode */ 95 96 orig_amode_wronly = access_mode; 97 if ( (access_mode & ADIO_WRONLY) && 98 ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) ) { 99 access_mode = access_mode ^ ADIO_WRONLY; 100 access_mode = access_mode | ADIO_RDWR; 101 } 102 fd->access_mode = access_mode; 103 104 (*(fd->fns->ADIOI_xxx_Open))(fd, error_code); 105 106 /* if error, may be it was due to the change in amode above. 107 therefore, reopen with access mode provided by the user.*/ 108 fd->access_mode = orig_amode_wronly; 109 if (*error_code != MPI_SUCCESS) 110 (*(fd->fns->ADIOI_xxx_Open))(fd, error_code); 111 112 /* if we turned off EXCL earlier, then we should turn it back on */ 113 if (fd->access_mode != orig_amode_excl) fd->access_mode = orig_amode_excl; 114 115 /* broadcast a bit of information (blocksize for now) to all proceses in 116 * communicator, not just those who participated in open */ 117 MPI_Bcast(&(fd->blksize), 1, MPI_LONG, fd->hints->ranklist[0], fd->comm); 118 /* file domain code will get terribly confused in a hard-to-debug way if 119 * gpfs blocksize not sensible */ 120 ADIOI_Assert( fd->blksize > 0); 121 /* for deferred open: this process has opened the file (because if we are 122 * not an aggregaor and we are doing deferred open, we returned earlier)*/ 123 fd->is_open = 1; 124 125 } 126 127 /* 128 * vim: ts=8 sts=4 sw=4 noexpandtab 129 */