1 /* ---------------------------------------------------------------- */ 2 /* (C)Copyright IBM Corp. 2007, 2008 */ 3 /* ---------------------------------------------------------------- */ 4 /** 5 * \file ad_gpfs_tuning.c 6 * \brief Defines ad_gpfs performance tuning 7 */ 8 9 /* -*- Mode: C; c-basic-offset:4 ; -*- */ 10 /* 11 * Copyright (C) 2008 University of Chicago. 12 * See COPYRIGHT notice in top-level directory. 13 */ 14 15 /*--------------------------------------------------------------------- 16 * ad_gpfs_tuning.c 17 * 18 * defines global variables and functions for performance tuning and 19 * functional debugging. 20 *---------------------------------------------------------------------*/ 21 22 #include "ad_gpfs_tuning.h" 23 #include "mpi.h" 24 25 #if !defined(PVFS2_SUPER_MAGIC) 26 #define PVFS2_SUPER_MAGIC (0x20030528) 27 #endif 28 29 30 int gpfsmpio_timing; 31 int gpfsmpio_timing2; 32 int gpfsmpio_timing_cw_level; 33 int gpfsmpio_comm; 34 int gpfsmpio_tunegather; 35 int gpfsmpio_tuneblocking; 36 long bglocklessmpio_f_type; 37 int gpfsmpio_bg_nagg_pset; 38 int gpfsmpio_pthreadio; 39 int gpfsmpio_p2pcontig; 40 int gpfsmpio_write_aggmethod; 41 int gpfsmpio_read_aggmethod; 42 int gpfsmpio_balancecontig; 43 int gpfsmpio_devnullio; 44 int gpfsmpio_bridgeringagg; 45 int gpfsmpio_onesided_no_rmw; 46 int gpfsmpio_onesided_always_rmw; 47 int gpfsmpio_onesided_inform_rmw; 48 49 double gpfsmpio_prof_cw [GPFSMPIO_CIO_LAST+1]; 50 double gpfsmpio_prof_cr [GPFSMPIO_CIO_LAST+1]; 51 52 /* set internal variables for tuning environment variables */ 53 /** \page mpiio_vars MPIIO Configuration 54 \section env_sec Environment Variables 55 * - GPFSMPIO_COMM - Define how data is exchanged on collective 56 * reads and writes. Possible values: 57 * - 0 - Use MPI_Alltoallv. 58 * - 1 - Use MPI_Isend/MPI_Irecv. 59 * - Default is 0. 60 * 61 * - GPFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls. 62 * Possible values: 63 * - 0 - Do not collect/report timing. 64 * - 1 - Collect/report timing. 65 * - Default is 0. 66 * 67 * - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated 68 * for aggregator collective i/o. Possible values: 69 * - 0 - Use two MPI_Allgather's to collect starting and ending offsets. 70 * - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets. 71 * - Default is 1. 72 * 73 * - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are 74 * calculated (block size). Possible values: 75 * - 0 - Evenly calculate file domains across aggregators. Also use 76 * MPI_Isend/MPI_Irecv to exchange domain information. 77 * - 1 - Align file domains with the underlying file system's block size. Also use 78 * MPI_Alltoallv to exchange domain information. 79 * - Default is 1. 80 * 81 * - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run 82 * the ad_bglockless driver. NOTE: Using romio prefixes (such as 83 * "bg:" or "bglockless:") on a file name will override this environment 84 * variable. Possible values: 85 * - 0xnnnnnnnn - Any valid file system type (or "magic number") from 86 * statfs() field f_type. 87 * - The default is 0x20030528 (PVFS2_SUPER_MAGIC) 88 * 89 * - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each 90 * compute group (compute nodes + i/o nodes). Possible values: 91 * - any integer 92 * - Default is 8 93 * 94 * - GPFSMPIO_PTHREADIO - Enables a very simple form of asyncronous io where a 95 * pthread is spawned to do the posix writes while the main thread does the 96 * data aggregation - useful for large files where multiple rounds are 97 * required (more that the cb_buffer_size of data per aggregator). User 98 * must ensure there is hw resource available for the thread to run. I 99 * am sure there is a better way to do this involving comm threads - this is 100 * just a start. NOTE: For some reason the stats collected when this is 101 * enabled misses some of the data so the data sizes are off a bit - this is 102 * a statistical issue only, the data is still accurately written out 103 * 104 * - GPFSMPIO_P2PCONTIG - Does simple point-to-point communication between the 105 * aggregator and the procs that feed it. Performance could be enhanced by a 106 * one-sided put algorithm. Current implementation allows only 1 round of 107 * data. Useful/allowed only when: 108 * 1.) The datatype is contiguous. 109 * 2.) The offsets are increasing in rank-order. 110 * 3.) There are no gaps between the offsets. 111 * 4.) No single rank has a data size which spans multiple file domains. 112 * 113 * - GPFSMPIO_WRITE_AGGMETHOD/GPFSMPIO_READ_AGGMETHOD - Replaces the two-phase 114 * collective IO aggregation 115 * with a one-sided algorithm, significantly reducing communication and 116 * memory overhead. Fully 117 * supports all datasets and datatypes, the only caveat is that any holes in the data 118 * when writing to a pre-existing file are ignored -- there is no read-modify-write 119 * support to maintain the correctness of regions of pre-existing data so every byte 120 * must be explicitly written to maintain correctness. Users must beware of middle-ware 121 * libraries like PNETCDF which may count on read-modify-write functionality for certain 122 * features (like fill values). Possible values: 123 * - 0 - Normal two-phase collective IO is used. 124 * - 1 - A separate one-sided MPI_Put or MPI_Get is used for each contigous chunk of data 125 * for a compute to write to or read from the collective buffer on the aggregator. 126 * - 2 - An MPI derived datatype is created using all the contigous chunks and just one 127 * call to MPI_Put or MPI_Get is done with the derived datatype. On Blue Gene /Q 128 * optimal performance for this is achieved when paired with PAMID_TYPED_ONESIDED=1. 129 * - Default is 0 130 * 131 * - GPFSMPIO_ONESIDED_NO_RMW - For one-sided aggregation (GPFSMPIO_WRITE_AGGMETHOD = 1 or 2) 132 * disable the detection of holes in the data when writing to a pre-existing 133 * file requiring a read-modify-write, thereby avoiding the communication 134 * overhead for this detection. 135 * - 0 (hole detection enabled) or 1 (hole detection disabled) 136 * - Default is 0 137 * 138 * - GPFSMPIO_ONESIDED_INFORM_RMW - For one-sided aggregation 139 * (GPFSMPIO_AGGMETHOD = 1 or 2) generate an informational message informing 140 * the user whether holes exist in the data when writing to a pre-existing 141 * file requiring a read-modify-write, thereby educating the user to set 142 * GPFSMPIO_ONESIDED_NO_RMW=1 on a future run to avoid the communication 143 * overhead for this detection. 144 * - 0 (disabled) or 1 (enabled) 145 * - Default is 0 146 * 147 * - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ. File domain blocks are assigned 148 * to aggregators in a breadth-first fashion relative to the ions - additionally, 149 * file domains on the aggregators sharing the same bridgeset and ion have contiguous 150 * offsets. The breadth-first assignment improves performance in the case of 151 * a relatively small file of size less than the gpfs block size multiplied 152 * by the number of ions. Files: ad_gpfs_aggrs.c ad_bg_aggrs.c. Possible Values 153 * - 0 - assign file domain blocks in the traditional manner 154 * - 1 - if there are variable sized file domain blocks, spread them out 155 * (balance) across bridge nodes 156 * 157 * - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file 158 * system. When experimenting with different two-phase I/O strategies, it's 159 * helpful to remove the highly variable file system from the experiment. 160 * - 0 (disabled) or 1 (enabled) 161 * - Default is 0 162 * 163 * - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ. Aggregator placement 164 * optimization whch forms a 5-d ring around the bridge node starting at 165 * GPFSMPIO_BRIDGERINGAGG hops away. Experimental performance results 166 * suggest best value is 1 and only in conjunction with GPFSMPIO_P2PCONTIG 167 * and GPFSMPIO_BALANCECONTIG. The number of aggregators selected is still 168 * GPFSMPIO_NAGG_PSET however the bridge node itself is NOT selected. 169 * 170 */ 171 172 void ad_gpfs_get_env_vars() { 173 char *x, *dummy; 174 175 gpfsmpio_comm = 0; 176 x = getenv( "GPFSMPIO_COMM" ); 177 if (x) gpfsmpio_comm = atoi(x); 178 gpfsmpio_timing = 0; 179 x = getenv( "GPFSMPIO_TIMING" ); 180 if (x) gpfsmpio_timing = atoi(x); 181 gpfsmpio_tunegather = 1; 182 x = getenv( "GPFSMPIO_TUNEGATHER" ); 183 if (x) gpfsmpio_tunegather = atoi(x); 184 gpfsmpio_tuneblocking = 1; 185 x = getenv( "GPFSMPIO_TUNEBLOCKING" ); 186 if (x) gpfsmpio_tuneblocking = atoi(x); 187 bglocklessmpio_f_type = PVFS2_SUPER_MAGIC; 188 x = getenv( "BGLOCKLESSMPIO_F_TYPE" ); 189 if (x) bglocklessmpio_f_type = strtol(x,&dummy,0); 190 DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n", 191 bglocklessmpio_f_type,bglocklessmpio_f_type); 192 /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(), 193 * when we know a bit more about what "largest possible value" and 194 * "smallest possible value" should be */ 195 gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT; 196 x = getenv("GPFSMPIO_NAGG_PSET"); 197 if (x) gpfsmpio_bg_nagg_pset = atoi(x); 198 199 gpfsmpio_pthreadio = 0; 200 x = getenv( "GPFSMPIO_PTHREADIO" ); 201 if (x) gpfsmpio_pthreadio = atoi(x); 202 203 gpfsmpio_p2pcontig = 0; 204 x = getenv( "GPFSMPIO_P2PCONTIG" ); 205 if (x) gpfsmpio_p2pcontig = atoi(x); 206 207 gpfsmpio_write_aggmethod = 0; 208 x = getenv( "GPFSMPIO_WRITE_AGGMETHOD" ); 209 if (x) gpfsmpio_write_aggmethod = atoi(x); 210 211 gpfsmpio_read_aggmethod = 0; 212 x = getenv( "GPFSMPIO_READ_AGGMETHOD" ); 213 if (x) gpfsmpio_read_aggmethod = atoi(x); 214 215 gpfsmpio_balancecontig = 0; 216 x = getenv( "GPFSMPIO_BALANCECONTIG" ); 217 if (x) gpfsmpio_balancecontig = atoi(x); 218 219 gpfsmpio_devnullio = 0; 220 x = getenv( "GPFSMPIO_DEVNULLIO" ); 221 if (x) gpfsmpio_devnullio = atoi(x); 222 223 gpfsmpio_bridgeringagg = 0; 224 x = getenv( "GPFSMPIO_BRIDGERINGAGG" ); 225 if (x) gpfsmpio_bridgeringagg = atoi(x); 226 227 gpfsmpio_onesided_no_rmw = 0; 228 x = getenv( "GPFSMPIO_ONESIDED_NO_RMW" ); 229 if (x) gpfsmpio_onesided_no_rmw = atoi(x); 230 231 gpfsmpio_onesided_always_rmw = 0; 232 x = getenv( "GPFSMPIO_ONESIDED_ALWAYS_RMW" ); 233 if (x) gpfsmpio_onesided_always_rmw = atoi(x); 234 if (gpfsmpio_onesided_always_rmw) 235 gpfsmpio_onesided_no_rmw = 1; 236 237 gpfsmpio_onesided_inform_rmw = 0; 238 x = getenv( "GPFSMPIO_ONESIDED_INFORM_RMW" ); 239 if (x) gpfsmpio_onesided_inform_rmw = atoi(x); 240 } 241 242 /* report timing breakdown for MPI I/O collective call */ 243 void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs ) 244 { 245 int i; 246 247 if (gpfsmpio_timing) { 248 /* Timing across the whole communicator is a little bit interesting, 249 * but what is *more* interesting is if we single out the aggregators 250 * themselves. non-aggregators spend a lot of time in "exchange" not 251 * exchanging data, but blocked because they are waiting for 252 * aggregators to finish writing. If we focus on just the aggregator 253 * processes we will get a more clear picture about the data exchange 254 * vs. i/o time breakdown */ 255 256 /* if deferred open enabled, we could use the aggregator communicator */ 257 MPI_Comm agg_comm; 258 int nr_aggs, agg_rank; 259 MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm); 260 if(agg_comm != MPI_COMM_NULL) { 261 MPI_Comm_size(agg_comm, &nr_aggs); 262 MPI_Comm_rank(agg_comm, &agg_rank); 263 } 264 265 double *gpfsmpio_prof_org = gpfsmpio_prof_cr; 266 if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw; 267 268 double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ]; 269 double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ]; 270 271 if( agg_comm != MPI_COMM_NULL) { 272 MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm); 273 MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm); 274 } 275 if (agg_comm != MPI_COMM_NULL && agg_rank == 0) { 276 277 for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs; 278 279 gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] = 280 gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs / 281 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ]; 282 gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] = 283 gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs / 284 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW ]; 285 286 gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] = 287 gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs / 288 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ]; 289 290 fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") ); 291 fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs)); 292 fprintf(stderr,"SEEK-avg: %10.3f , ", 293 gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ] ); 294 fprintf(stderr,"SEEK-max: %10.3f , ", 295 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ] ); 296 fprintf(stderr,"LOCAL-avg: %10.3f , ", 297 gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ] ); 298 fprintf(stderr,"GATHER-max: %10.3f , ", 299 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ] ); 300 fprintf(stderr,"PATTERN-avg: %10.3f , ", 301 gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ] ); 302 fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ", 303 gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ] ); 304 fprintf(stderr,"MYREQ-avg: %10.3f , ", 305 gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ] ); 306 fprintf(stderr,"OTHERREQ-max: %10.3f , ", 307 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ] ); 308 fprintf(stderr,"EXCHANGE-max: %10.3f , ", 309 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ] ); 310 fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ", 311 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH] ); 312 fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ", 313 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP] ); 314 fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ", 315 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET] ); 316 fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ", 317 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT] ); 318 fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ", 319 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE] ); 320 fprintf(stderr,"POSIX-TIME-avg: %10.3f , ", 321 gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ] ); 322 fprintf(stderr,"POSIX-TIME-max: %10.3f , ", 323 gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ] ); 324 fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ", 325 gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ] ); 326 fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ", 327 gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] ); 328 fprintf(stderr,"POSIX-BW-avg: %10.3f , ", 329 gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] ); 330 fprintf(stderr,"MPI-BW-avg: %10.3f , ", 331 gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] ); 332 fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n ", 333 gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] ); 334 } 335 if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm); 336 } 337 338 }