1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 /*
3 *
4 * Copyright (C) 2007 UChicago/Argonne LLC
5 * See COPYRIGHT notice in top-level directory.
6 */
7
8 #include "adio.h"
9
10 /* Generic version of a "collective open". Assumes a "real" underlying
11 * file system (meaning no wonky consistency semantics like NFS).
12 *
13 * optimization: by having just one process create a file, close it,
14 * then have all N processes open it, we can possibly avoid contention
15 * for write locks on a directory for some file systems.
16 *
17 * Happy side-effect: exclusive create (error if file already exists)
18 * just falls out
19 *
20 * Note: this is not a "scalable open" (c.f. "The impact of file systems
21 * on MPI-IO scalability").
22 */
23
24 void ADIOI_GEN_OpenColl(ADIO_File fd, int rank,
25 int access_mode, int *error_code)
26 {
27 int orig_amode_excl, orig_amode_wronly;
28 MPI_Comm tmp_comm;
29
30 orig_amode_excl = access_mode;
31
32 if (access_mode & ADIO_CREATE ){
33 if(rank == fd->hints->ranklist[0]) {
34 /* remove delete_on_close flag if set */
35 if (access_mode & ADIO_DELETE_ON_CLOSE)
36 fd->access_mode = access_mode ^ ADIO_DELETE_ON_CLOSE;
37 else
38 fd->access_mode = access_mode;
39
40 tmp_comm = fd->comm;
41 fd->comm = MPI_COMM_SELF;
42 (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);
43 fd->comm = tmp_comm;
44 MPI_Bcast(error_code, 1, MPI_INT, \
45 fd->hints->ranklist[0], fd->comm);
46 /* if no error, close the file and reopen normally below */
47 if (*error_code == MPI_SUCCESS)
48 (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
49
50 fd->access_mode = access_mode; /* back to original */
51 }
52 else MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
53
54 if (*error_code != MPI_SUCCESS) {
55 return;
56 }
57 else {
58 /* turn off CREAT (and EXCL if set) for real multi-processor open */
59 access_mode ^= ADIO_CREATE;
60 if (access_mode & ADIO_EXCL)
61 access_mode ^= ADIO_EXCL;
62 }
63 }
64 fd->blksize = 1024*1024*4; /* this large default value should be good for
65 most file systems. any ROMIO driver is free
66 to stat the file and find an optimial value */
67
68 /* if we are doing deferred open, non-aggregators should return now */
69 if (fd->hints->deferred_open ) {
70 if (!(fd->is_agg)) {
71 /* we might have turned off EXCL for the aggregators.
72 * restore access_mode that non-aggregators get the right
73 * value from get_amode */
74 fd->access_mode = orig_amode_excl;
75 /* In file-system specific open, a driver might collect some
76 * information via stat(). Deferred open means not every process
77 * participates in fs-specific open, but they all participate in
78 * this open call. Broadcast a bit of information in case
79 * lower-level file system driver (e.g. 'bluegene') collected it
80 * (not all do)*/
81 MPI_Bcast(&(fd->blksize), 1, MPI_LONG, fd->hints->ranklist[0], fd->comm);
82 *error_code = MPI_SUCCESS;
83 ADIOI_Assert(fd->blksize > 0);
84 return;
85 }
86 }
87
88 /* For writing with data sieving, a read-modify-write is needed. If
89 the file is opened for write_only, the read will fail. Therefore,
90 if write_only, open the file as read_write, but record it as write_only
91 in fd, so that get_amode returns the right answer. */
92
93 /* observation from David Knaak: file systems that do not support data
94 * sieving do not need to change the mode */
95
96 orig_amode_wronly = access_mode;
97 if ( (access_mode & ADIO_WRONLY) &&
98 ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) ) {
99 access_mode = access_mode ^ ADIO_WRONLY;
100 access_mode = access_mode | ADIO_RDWR;
101 }
102 fd->access_mode = access_mode;
103
104 (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);
105
106 /* if error, may be it was due to the change in amode above.
107 therefore, reopen with access mode provided by the user.*/
108 fd->access_mode = orig_amode_wronly;
109 if (*error_code != MPI_SUCCESS)
110 (*(fd->fns->ADIOI_xxx_Open))(fd, error_code);
111
112 /* if we turned off EXCL earlier, then we should turn it back on */
113 if (fd->access_mode != orig_amode_excl) fd->access_mode = orig_amode_excl;
114
115 /* broadcast a bit of information (blocksize for now) to all proceses in
116 * communicator, not just those who participated in open */
117 MPI_Bcast(&(fd->blksize), 1, MPI_LONG, fd->hints->ranklist[0], fd->comm);
118 /* file domain code will get terribly confused in a hard-to-debug way if
119 * gpfs blocksize not sensible */
120 ADIOI_Assert( fd->blksize > 0);
121 /* for deferred open: this process has opened the file (because if we are
122 * not an aggregaor and we are doing deferred open, we returned earlier)*/
123 fd->is_open = 1;
124
125 }
126
127 /*
128 * vim: ts=8 sts=4 sw=4 noexpandtab
129 */