root/ompi/mca/io/romio321/romio/adio/ad_gpfs/ad_gpfs_tuning.c


DEFINITIONS

This source file includes the following definitions:
  1. ad_gpfs_get_env_vars
  2. ad_gpfs_timing_crw_report

/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                               */
/* ---------------------------------------------------------------- */
/**
 * \file ad_gpfs_tuning.c
 * \brief Defines ad_gpfs performance tuning
 */

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *   Copyright (C) 2008 University of Chicago.
 *   See COPYRIGHT notice in top-level directory.
 */

/*---------------------------------------------------------------------
 * ad_gpfs_tuning.c
 *
 * defines global variables and functions for performance tuning and
 * functional debugging.
 *---------------------------------------------------------------------*/

#include "ad_gpfs_tuning.h"
#include "mpi.h"
/* stdio.h and stdlib.h are most likely pulled in via ad_gpfs_tuning.h, but
 * this file uses fprintf/stderr and getenv/atoi/strtol directly, so include
 * them explicitly. */
#include <stdio.h>
#include <stdlib.h>

#if !defined(PVFS2_SUPER_MAGIC)
  #define PVFS2_SUPER_MAGIC (0x20030528)
#endif

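/* Tunable globals; their values are assigned from environment variables in
 * ad_gpfs_get_env_vars() below. */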
int     gpfsmpio_timing;
int     gpfsmpio_timing2;
int     gpfsmpio_timing_cw_level;
int     gpfsmpio_comm;
int     gpfsmpio_tunegather;
int     gpfsmpio_tuneblocking;
long    bglocklessmpio_f_type;
int     gpfsmpio_bg_nagg_pset;
int     gpfsmpio_pthreadio;
int     gpfsmpio_p2pcontig;
int     gpfsmpio_write_aggmethod;
int     gpfsmpio_read_aggmethod;
int     gpfsmpio_balancecontig;
int     gpfsmpio_devnullio;
int     gpfsmpio_bridgeringagg;
int     gpfsmpio_onesided_no_rmw;
int     gpfsmpio_onesided_always_rmw;
int     gpfsmpio_onesided_inform_rmw;

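/* Per-process profiling buckets for collective writes (cw) and collective
 * reads (cr), indexed by the GPFSMPIO_CIO_* values and reduced across the
 * aggregators in ad_gpfs_timing_crw_report() below. */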
double  gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
double  gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];

/* set internal variables from tuning environment variables */
/** \page mpiio_vars MPIIO Configuration
  \section env_sec Environment Variables
 * - GPFSMPIO_COMM - Define how data is exchanged on collective
 *   reads and writes.  Possible values:
 *   - 0 - Use MPI_Alltoallv.
 *   - 1 - Use MPI_Isend/MPI_Irecv.
 *   - Default is 0.
 *
 * - GPFSMPIO_TIMING - Collect timing breakdown for MPI I/O collective calls.
 *   Possible values:
 *   - 0 - Do not collect/report timing.
 *   - 1 - Collect/report timing.
 *   - Default is 0.
 *
 * - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
 *   for aggregator collective i/o.  Possible values:
 *   - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
 *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
 *   - Default is 1.
 *
 * - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
 *   calculated (block size).  Possible values:
 *   - 0 - Evenly calculate file domains across aggregators.  Also use
 *   MPI_Isend/MPI_Irecv to exchange domain information.
 *   - 1 - Align file domains with the underlying file system's block size.  Also use
 *   MPI_Alltoallv to exchange domain information.
 *   - Default is 1.
 *
 * - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
 *   the ad_bglockless driver.   NOTE: Using romio prefixes (such as
 *   "bg:" or "bglockless:") on a file name will override this environment
 *   variable.  Possible values:
 *   - 0xnnnnnnnn - Any valid file system type (or "magic number") from
 *                  statfs() field f_type.
 *   - The default is 0x20030528 (PVFS2_SUPER_MAGIC).
 *
 * - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
 *   compute group (compute nodes + i/o nodes).  Possible values:
 *   - any integer
 *   - Default is 8
 *
 * - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous I/O where a
 *   pthread is spawned to do the POSIX writes while the main thread does the
 *   data aggregation - useful for large files where multiple rounds are
 *   required (more than the cb_buffer_size of data per aggregator).  The user
 *   must ensure there are hardware resources available for the thread to run.
 *   I am sure there is a better way to do this involving comm threads - this is
 *   just a start.  NOTE: For some reason the stats collected when this is
 *   enabled miss some of the data, so the data sizes are off a bit - this is
 *   a statistical issue only; the data is still accurately written out.
 *
 * - GPFSMPIO_P2PCONTIG - Does simple point-to-point communication between the
 *   aggregator and the procs that feed it.  Performance could be enhanced by a
 *   one-sided put algorithm.  Current implementation allows only 1 round of
 *   data.  Useful/allowed only when:
 * 1.) The datatype is contiguous.
 * 2.) The offsets are increasing in rank-order.
 * 3.) There are no gaps between the offsets.
 * 4.) No single rank has a data size which spans multiple file domains.
 *
 * - GPFSMPIO_WRITE_AGGMETHOD/GPFSMPIO_READ_AGGMETHOD - Replaces the two-phase
 *   collective IO aggregation with a one-sided algorithm, significantly
 *   reducing communication and memory overhead.  Fully supports all datasets
 *   and datatypes; the only caveat is that any holes in the data when writing
 *   to a pre-existing file are ignored -- there is no read-modify-write support
 *   to maintain the correctness of regions of pre-existing data, so every byte
 *   must be explicitly written to maintain correctness.  Users must beware of
 *   middleware libraries like PNETCDF which may count on read-modify-write
 *   functionality for certain features (like fill values).  Possible values:
 *   - 0 - Normal two-phase collective IO is used.
 *   - 1 - A separate one-sided MPI_Put or MPI_Get is used for each contiguous
 *         chunk of data for a compute rank to write to or read from the
 *         collective buffer on the aggregator.
 *   - 2 - An MPI derived datatype is created using all the contiguous chunks
 *         and just one call to MPI_Put or MPI_Get is done with the derived
 *         datatype.  On Blue Gene/Q optimal performance for this is achieved
 *         when paired with PAMID_TYPED_ONESIDED=1.
 *   - Default is 0
 *
 * - GPFSMPIO_ONESIDED_NO_RMW - For one-sided aggregation (GPFSMPIO_WRITE_AGGMETHOD = 1 or 2)
 *   disable the detection of holes in the data when writing to a pre-existing
 *   file requiring a read-modify-write, thereby avoiding the communication
 *   overhead for this detection.
 *   - 0 (hole detection enabled) or 1 (hole detection disabled)
 *   - Default is 0
 *
 * - GPFSMPIO_ONESIDED_INFORM_RMW - For one-sided aggregation
 *   (GPFSMPIO_AGGMETHOD = 1 or 2) generate an informational message informing
 *   the user whether holes exist in the data when writing to a pre-existing
 *   file requiring a read-modify-write, thereby educating the user to set
 *   GPFSMPIO_ONESIDED_NO_RMW=1 on a future run to avoid the communication
 *   overhead for this detection.
 *   - 0 (disabled) or 1 (enabled)
 *   - Default is 0
 *
 * - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ.  File domain blocks are assigned
 *   to aggregators in a breadth-first fashion relative to the ions - additionally,
 *   file domains on the aggregators sharing the same bridgeset and ion have contiguous
 *   offsets.  The breadth-first assignment improves performance in the case of
 *   a relatively small file of size less than the gpfs block size multiplied
 *   by the number of ions.  Files: ad_gpfs_aggrs.c ad_bg_aggrs.c.  Possible values:
 *   - 0 - assign file domain blocks in the traditional manner
 *   - 1 - if there are variable sized file domain blocks, spread them out
 *         (balance) across bridge nodes
 *
 * - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
 *   system.  When experimenting with different two-phase I/O strategies, it's
 *   helpful to remove the highly variable file system from the experiment.
 *   - 0 (disabled) or 1 (enabled)
 *   - Default is 0
 *
 * - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ.  Aggregator placement
 *   optimization which forms a 5-d ring around the bridge node starting at
 *   GPFSMPIO_BRIDGERINGAGG hops away.  Experimental performance results
 *   suggest the best value is 1, and only in conjunction with GPFSMPIO_P2PCONTIG
 *   and GPFSMPIO_BALANCECONTIG.  The number of aggregators selected is still
 *   GPFSMPIO_NAGG_PSET; however, the bridge node itself is NOT selected.
 *
 */
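/* Usage sketch (illustrative only; the application name and chosen values are
 * hypothetical, not taken from this file): the variables above are read by
 * ad_gpfs_get_env_vars() below, so they must be exported before that code
 * runs, typically in the job script:
 *
 *   export GPFSMPIO_TUNEGATHER=1        # MPI_Allreduce(MPI_MAX) offset exchange
 *   export GPFSMPIO_TIMING=1            # report the collective I/O timing breakdown
 *   export GPFSMPIO_WRITE_AGGMETHOD=2   # one-sided, derived-datatype write aggregation
 *   mpirun -np 512 ./my_io_app
 */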

void ad_gpfs_get_env_vars() {
    char *x, *dummy;

    gpfsmpio_comm   = 0;
    x = getenv( "GPFSMPIO_COMM"         );
    if (x) gpfsmpio_comm         = atoi(x);
    gpfsmpio_timing = 0;
    x = getenv( "GPFSMPIO_TIMING"       );
    if (x) gpfsmpio_timing       = atoi(x);
    gpfsmpio_tunegather = 1;
    x = getenv( "GPFSMPIO_TUNEGATHER"   );
    if (x) gpfsmpio_tunegather   = atoi(x);
    gpfsmpio_tuneblocking = 1;
    x = getenv( "GPFSMPIO_TUNEBLOCKING" );
    if (x) gpfsmpio_tuneblocking = atoi(x);
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
    x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
    if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
            bglocklessmpio_f_type,bglocklessmpio_f_type);
    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
     * when we know a bit more about what "largest possible value" and
     * "smallest possible value" should be */
    gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
    x = getenv("GPFSMPIO_NAGG_PSET");
    if (x) gpfsmpio_bg_nagg_pset = atoi(x);

    gpfsmpio_pthreadio = 0;
    x = getenv( "GPFSMPIO_PTHREADIO" );
    if (x) gpfsmpio_pthreadio = atoi(x);

    gpfsmpio_p2pcontig = 0;
    x = getenv( "GPFSMPIO_P2PCONTIG" );
    if (x) gpfsmpio_p2pcontig = atoi(x);

    gpfsmpio_write_aggmethod = 0;
    x = getenv( "GPFSMPIO_WRITE_AGGMETHOD" );
    if (x) gpfsmpio_write_aggmethod = atoi(x);

    gpfsmpio_read_aggmethod = 0;
    x = getenv( "GPFSMPIO_READ_AGGMETHOD" );
    if (x) gpfsmpio_read_aggmethod = atoi(x);

    gpfsmpio_balancecontig = 0;
    x = getenv( "GPFSMPIO_BALANCECONTIG" );
    if (x) gpfsmpio_balancecontig = atoi(x);

    gpfsmpio_devnullio = 0;
    x = getenv( "GPFSMPIO_DEVNULLIO" );
    if (x) gpfsmpio_devnullio = atoi(x);

    gpfsmpio_bridgeringagg = 0;
    x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
    if (x) gpfsmpio_bridgeringagg = atoi(x);

    gpfsmpio_onesided_no_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_NO_RMW" );
    if (x) gpfsmpio_onesided_no_rmw = atoi(x);

    gpfsmpio_onesided_always_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_ALWAYS_RMW" );
    if (x) gpfsmpio_onesided_always_rmw = atoi(x);
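    /* Always doing the read-modify-write presumably makes hole detection
     * unnecessary, so disable it as well (see GPFSMPIO_ONESIDED_NO_RMW). */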
    if (gpfsmpio_onesided_always_rmw)
      gpfsmpio_onesided_no_rmw = 1;

    gpfsmpio_onesided_inform_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_INFORM_RMW" );
    if (x) gpfsmpio_onesided_inform_rmw = atoi(x);
}

/* report timing breakdown for MPI I/O collective call */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    int i;

    if (gpfsmpio_timing) {
        /* Timing across the whole communicator is a little bit interesting,
         * but what is *more* interesting is if we single out the aggregators
         * themselves.  Non-aggregators spend a lot of time in "exchange" not
         * exchanging data, but blocked because they are waiting for
         * aggregators to finish writing.  If we focus on just the aggregator
         * processes we will get a clearer picture of the data exchange
         * vs. i/o time breakdown. */

        /* if deferred open is enabled, we could use the aggregator communicator */
        MPI_Comm agg_comm;
        int nr_aggs, agg_rank;
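        /* split off an aggregator-only communicator; non-aggregators pass
         * MPI_UNDEFINED and therefore get MPI_COMM_NULL back */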
        MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
        if(agg_comm != MPI_COMM_NULL) {
            MPI_Comm_size(agg_comm, &nr_aggs);
            MPI_Comm_rank(agg_comm, &agg_rank);
        }

        double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
        if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw;

        double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ];
        double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ];

        if( agg_comm != MPI_COMM_NULL) {
            MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
            MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
        }
        if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {

            for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs;

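            /* Aggregate bandwidth: multiplying the average data size by
             * nr_aggs restores the total moved by all aggregators, then divide
             * by the slowest aggregator's elapsed time. */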
            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW  ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW  ];
            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW  ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW  ];

            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ];

            fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
            fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs));
            fprintf(stderr,"SEEK-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ]     );
            fprintf(stderr,"SEEK-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ]     );
            fprintf(stderr,"LOCAL-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ]    );
            fprintf(stderr,"GATHER-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ]   );
            fprintf(stderr,"PATTERN-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ]   );
            fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ]  );
            fprintf(stderr,"MYREQ-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ]    );
            fprintf(stderr,"OTHERREQ-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ]   );
            fprintf(stderr,"EXCHANGE-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ]    );
            fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH]  );
            fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP]  );
            fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET]  );
            fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT]  );
            fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE]  );
            fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ]  );
            fprintf(stderr,"POSIX-TIME-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ]  );
            fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ]  );
            fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] );
            fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ]  );
            fprintf(stderr,"MPI-BW-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ]  );
            fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] );
        }
        if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
    }

}
