/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008                                */
/* ---------------------------------------------------------------- */
/**
 * \file ad_gpfs_tuning.h
 * \brief Global variables and macros for performance tuning and
 *        functional debugging of the GPFS ADIO driver.
 */

/*---------------------------------------------------------------------
 * ad_gpfs_tuning.h
 *
 * declares global variables and macros for performance tuning and
 * functional debugging.
 *---------------------------------------------------------------------*/

#ifndef AD_GPFS_TUNING_H_
#define AD_GPFS_TUNING_H_

#include "adio.h"


/*-----------------------------------------
 * Global variables for the control of
 * 1. timing
 * 2. selection of specific optimizations
 * 3. global flags for certain optimizations
 *-----------------------------------------*/

/* timing fields */
enum {
    GPFSMPIO_CIO_DATA_SIZE = 0,
    GPFSMPIO_CIO_T_SEEK,
    GPFSMPIO_CIO_T_LCOMP,           /* time for ADIOI_Calc_my_off_len(), local */
    GPFSMPIO_CIO_T_GATHER,          /* time for previous MPI_Allgather, now Allreduce */
    GPFSMPIO_CIO_T_PATANA,          /* time for a quick test of whether access is contiguous, local */
    GPFSMPIO_CIO_T_FD_PART,         /* time for file domain partitioning, local */
    GPFSMPIO_CIO_T_MYREQ,           /* time for ADIOI_Calc_my_req(), local */
    GPFSMPIO_CIO_T_OTHREQ,          /* time for ADIOI_Calc_others_req(), short Alltoall */
    GPFSMPIO_CIO_T_DEXCH,           /* time for I/O data exchange */
    /* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
    GPFSMPIO_CIO_T_DEXCH_RECV_EXCH, /* time for each process to exchange receive
                                       size info with everyone else */
    GPFSMPIO_CIO_T_DEXCH_SETUP,     /* time for setup portion of I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_NET,       /* time for network portion of I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_SORT,      /* time to sort requests in I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_SIEVE,     /* time for read portion of RMW in two phase */
    GPFSMPIO_CIO_T_POSI_RW,
    GPFSMPIO_CIO_B_POSI_RW,
    GPFSMPIO_CIO_T_MPIO_RW,         /* time for ADIOI_WriteContig() */
    GPFSMPIO_CIO_B_MPIO_RW,
    GPFSMPIO_CIO_T_MPIO_CRW,        /* time for ADIOI_GPFS_WriteStridedColl() */
    GPFSMPIO_CIO_B_MPIO_CRW,
    GPFSMPIO_CIO_LAST
};

/* +1 because GPFSMPIO_CIO_LAST is actually used to say "zero this counter" */
extern double gpfsmpio_prof_cw[GPFSMPIO_CIO_LAST + 1];
extern double gpfsmpio_prof_cr[GPFSMPIO_CIO_LAST + 1];

/* correspond to environment variables that select optimizations and the timing level */
extern int gpfsmpio_timing;
extern int gpfsmpio_timing_cw_level;
extern int gpfsmpio_comm;
extern int gpfsmpio_tunegather;
extern int gpfsmpio_tuneblocking;
extern long bglocklessmpio_f_type;
extern int gpfsmpio_pthreadio;
extern int gpfsmpio_p2pcontig;
extern int gpfsmpio_write_aggmethod;
extern int gpfsmpio_read_aggmethod;
extern int gpfsmpio_balancecontig;
extern int gpfsmpio_devnullio;
extern int gpfsmpio_bridgeringagg;
extern int gpfsmpio_onesided_no_rmw;
extern int gpfsmpio_onesided_always_rmw;
extern int gpfsmpio_onesided_inform_rmw;

/* The default is, well, kind of complicated.  Blue Gene/L and /P had "psets":
 * one I/O node and all the compute nodes wired to it.  On Blue Gene/Q that
 * relationship is a lot more fluid.  There are still I/O nodes, and compute
 * nodes are assigned to an I/O node, but there are two routes to the I/O
 * node, via compute nodes designated as "bridge nodes".  In this code, what
 * we used to call a "pset" is actually "the compute nodes associated with,
 * and including, a bridge node".  So "nAgg" is roughly "the number of
 * aggregators per bridge", but look closely at ADIOI_BG_persInfo_init() for
 * the details. */

#define ADIOI_BG_NAGG_PSET_DFLT 16

extern int gpfsmpio_bg_nagg_pset;


/* set internal variables from the tuning environment variables */
void ad_gpfs_get_env_vars(void);

/* report the timing breakdown for an MPI I/O collective call */
void ad_gpfs_timing_crw_report(int rw, ADIO_File fd, int myrank, int nprocs);

/* note:
 *   T   := timing;
 *   CIO := collective I/O
 */
#define GPFSMPIO_T_CIO_RESET( RW ) \
    { \
        int _i; \
        for ( _i = 0; _i < GPFSMPIO_CIO_LAST; _i++ ) \
            gpfsmpio_prof_c##RW [ _i ] = 0; \
    }

#define GPFSMPIO_T_CIO_REPORT( RW, FD, MYRANK, NPROCS ) \
    ad_gpfs_timing_crw_report ( RW, FD, MYRANK, NPROCS );

#define GPFSMPIO_T_CIO_SET_GET( RW, ISSET, ISGET, VAR1, VAR2 ) \
    { \
        double temp = MPI_Wtime(); \
        if ( ISSET ) gpfsmpio_prof_c##RW [ VAR1 ] = temp; \
        if ( ISGET ) gpfsmpio_prof_c##RW [ VAR2 ] = temp - gpfsmpio_prof_c##RW [ VAR2 ]; \
    }

#endif /* AD_GPFS_TUNING_H_ */
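
/*
 * Usage sketch (illustration only, not part of this header's interface):
 * how the timing macros above might bracket one phase of a collective
 * write.  The function example_collective_write() and its control flow are
 * hypothetical; the real call sites live in ADIOI_GPFS_WriteStridedColl()
 * and related routines.  The RW argument of RESET and SET_GET is a token
 * ("w" or "r") that the ##RW paste turns into gpfsmpio_prof_cw or
 * gpfsmpio_prof_cr, while REPORT takes the integer rw flag expected by
 * ad_gpfs_timing_crw_report().
 *
 *   void example_collective_write(ADIO_File fd, int myrank, int nprocs)
 *   {
 *       if (gpfsmpio_timing) {
 *           GPFSMPIO_T_CIO_RESET( w )    // zero the write-side counters
 *           // ISSET=1: store the current MPI_Wtime() in slot T_MYREQ;
 *           // GPFSMPIO_CIO_LAST serves as an unused dummy slot
 *           GPFSMPIO_T_CIO_SET_GET( w, 1, 0, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_LAST )
 *       }
 *
 *       // ... ADIOI_Calc_my_req() work would happen here ...
 *
 *       if (gpfsmpio_timing) {
 *           // ISGET=1: replace the stored start time with the elapsed time
 *           GPFSMPIO_T_CIO_SET_GET( w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MYREQ )
 *           GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs )   // rw flag: 1 = write path in this sketch
 *       }
 *   }
 */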