/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008                                */
/* ---------------------------------------------------------------- */
/**
 * \file ad_gpfs_tuning.h
 * \brief Declarations of global variables and macros for performance tuning
 *        and functional debugging of the GPFS ADIO driver.
 */


/*---------------------------------------------------------------------
 * ad_gpfs_tuning.h
 *
 * declares global variables and macros for performance tuning and
 * functional debugging.
 *---------------------------------------------------------------------*/

#ifndef AD_GPFS_TUNING_H_
#define AD_GPFS_TUNING_H_

#include "adio.h"


/*-----------------------------------------
 * Global variables for the control of
 * 1. timing
 * 2. selection of specific optimizations
 * 3. global flags for certain optimizations
 *-----------------------------------------*/

/* timing fields */
enum {
    GPFSMPIO_CIO_DATA_SIZE = 0,
    GPFSMPIO_CIO_T_SEEK,
    GPFSMPIO_CIO_T_LCOMP,       /* time for ADIOI_Calc_my_off_len(), local */
    GPFSMPIO_CIO_T_GATHER,      /* time for previous MPI_Allgather, now Allreduce */
    GPFSMPIO_CIO_T_PATANA,      /* time for a quick test if access is contiguous or not, local */
    GPFSMPIO_CIO_T_FD_PART,     /* time for file domain partitioning, local */
    GPFSMPIO_CIO_T_MYREQ,       /* time for ADIOI_Calc_my_req(), local */
    GPFSMPIO_CIO_T_OTHREQ,      /* time for ADIOI_Calc_others_req(), short Alltoall */
    GPFSMPIO_CIO_T_DEXCH,       /* time for I/O data exchange */
    /* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
    GPFSMPIO_CIO_T_DEXCH_RECV_EXCH, /* time for each process to exchange receive
                                       size info with everyone else */
    GPFSMPIO_CIO_T_DEXCH_SETUP, /* time for setup portion of I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_NET,   /* time for network portion of I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_SORT,  /* time to sort requests in I/O data exchange */
    GPFSMPIO_CIO_T_DEXCH_SIEVE, /* time for read portion of RMW in two phase */
    GPFSMPIO_CIO_T_POSI_RW,     /* time spent in the underlying POSIX read/write calls */
    GPFSMPIO_CIO_B_POSI_RW,     /* bytes moved by the underlying POSIX read/write calls */
    GPFSMPIO_CIO_T_MPIO_RW,     /* time for ADIOI_WriteContig() */
    GPFSMPIO_CIO_B_MPIO_RW,     /* bytes moved by ADIOI_WriteContig() */
    GPFSMPIO_CIO_T_MPIO_CRW,    /* time for ADIOI_GPFS_WriteStridedColl() */
    GPFSMPIO_CIO_B_MPIO_CRW,    /* bytes moved by ADIOI_GPFS_WriteStridedColl() */
    GPFSMPIO_CIO_LAST
};

/* +1 because GPFSMPIO_CIO_LAST is actually used to say "zero this counter" */
extern double gpfsmpio_prof_cw[GPFSMPIO_CIO_LAST + 1];
extern double gpfsmpio_prof_cr[GPFSMPIO_CIO_LAST + 1];
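
/*
 * Illustrative sketch (not part of this header): the two arrays are indexed by
 * the enum above, one for collective writes (cw) and one for collective reads
 * (cr).  A write-side call site might accumulate into them roughly like this,
 * gated by gpfsmpio_timing (declared below); the names "len" and "t_start"
 * are hypothetical:
 *
 *     if (gpfsmpio_timing) {
 *         gpfsmpio_prof_cw[GPFSMPIO_CIO_DATA_SIZE] += len;
 *         gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH]   += MPI_Wtime() - t_start;
 *     }
 */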

/* these correspond to environment variables that select optimizations and the timing level */
extern int gpfsmpio_timing;
extern int gpfsmpio_timing_cw_level;
extern int gpfsmpio_comm;
extern int gpfsmpio_tunegather;
extern int gpfsmpio_tuneblocking;
extern long bglocklessmpio_f_type;
extern int gpfsmpio_pthreadio;
extern int gpfsmpio_p2pcontig;
extern int gpfsmpio_write_aggmethod;
extern int gpfsmpio_read_aggmethod;
extern int gpfsmpio_balancecontig;
extern int gpfsmpio_devnullio;
extern int gpfsmpio_bridgeringagg;
extern int gpfsmpio_onesided_no_rmw;
extern int gpfsmpio_onesided_always_rmw;
extern int gpfsmpio_onesided_inform_rmw;
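
/*
 * Illustrative sketch (hypothetical control flow; only the variable name comes
 * from this header): each of these globals is typically tested once per
 * collective call to choose a code path, e.g. gpfsmpio_tunegather selecting
 * between the MPI_Allgather- and MPI_Allreduce-based offset exchange noted at
 * GPFSMPIO_CIO_T_GATHER above:
 *
 *     if (gpfsmpio_tunegather) {
 *         // single MPI_Allreduce to collect the per-process offset info
 *     } else {
 *         // original MPI_Allgather-based exchange
 *     }
 */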

/* Default is, well, kind of complicated.  Blue Gene /L and /P had "psets": one
 * I/O node and all compute nodes wired to it.  On Blue Gene /Q that
 * relationship is a lot more fluid.  There are still I/O nodes, and compute
 * nodes are assigned to an I/O node, but there are two routes to the I/O node,
 * via compute nodes designated as "bridge nodes".  In this code, what we used
 * to call a "pset" is actually "compute nodes associated with and including a
 * bridge node".  So, "nAgg" is roughly "number of aggregators per bridge", but
 * look closely at ADIOI_BG_persInfo_init() for the details */

#define ADIOI_BG_NAGG_PSET_DFLT 16

extern int gpfsmpio_bg_nagg_pset;
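
/*
 * Hypothetical sketch of how the default might be applied (the real logic is
 * in ADIOI_BG_persInfo_init(), referenced above):
 *
 *     int naggs = gpfsmpio_bg_nagg_pset;
 *     if (naggs <= 0)
 *         naggs = ADIOI_BG_NAGG_PSET_DFLT;   // 16 aggregators per bridge-node pset
 */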


/* set internal tuning variables from the corresponding environment variables */
void ad_gpfs_get_env_vars(void);
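
/*
 * Rough sketch of what ad_gpfs_get_env_vars() does (hedged: the environment
 * variable name shown is an assumption; see the implementation in
 * ad_gpfs_tuning.c for the authoritative list):
 *
 *     void ad_gpfs_get_env_vars(void)
 *     {
 *         char *x;
 *         gpfsmpio_timing = 0;                  // default first
 *         x = getenv("GPFSMPIO_TIMING");        // assumed variable name
 *         if (x) gpfsmpio_timing = atoi(x);
 *         // ... likewise for the other gpfsmpio_* knobs declared above ...
 *     }
 */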

/* report timing breakdown for MPI I/O collective call */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );

/* note:
 *   T   := timing;
 *   CIO := collective I/O
 */
#define GPFSMPIO_T_CIO_RESET( RW ) \
{ \
    int _i; \
    for ( _i = 0; _i < GPFSMPIO_CIO_LAST; _i++ ) \
        gpfsmpio_prof_c##RW [ _i ] = 0; \
}

#define GPFSMPIO_T_CIO_REPORT( RW, FD, MYRANK, NPROCS ) \
    ad_gpfs_timing_crw_report ( RW, FD, MYRANK, NPROCS );

#define GPFSMPIO_T_CIO_SET_GET( RW, ISSET, ISGET, VAR1, VAR2 ) \
{ \
    double temp = MPI_Wtime(); \
    if ( ISSET ) gpfsmpio_prof_c##RW [ VAR1 ] = temp; \
    if ( ISGET ) gpfsmpio_prof_c##RW [ VAR2 ] = temp - gpfsmpio_prof_c##RW [ VAR2 ]; \
}
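
/*
 * Illustrative usage sketch (the phase boundaries shown are hypothetical; the
 * real call sites are in the collective read/write paths such as
 * ADIOI_GPFS_WriteStridedColl()).  RW is the literal token w or r, which the
 * ## paste turns into gpfsmpio_prof_cw or gpfsmpio_prof_cr:
 *
 *     GPFSMPIO_T_CIO_RESET( w )
 *     GPFSMPIO_T_CIO_SET_GET( w, 1, 0, GPFSMPIO_CIO_T_MYREQ, 0 )   // start MYREQ timer
 *     // ... ADIOI_Calc_my_req() ...
 *     GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_MYREQ )
 *         // one call: stop MYREQ (its slot now holds elapsed time) and start DEXCH
 *     // ... two-phase I/O data exchange ...
 *     GPFSMPIO_T_CIO_SET_GET( w, 0, 1, 0, GPFSMPIO_CIO_T_DEXCH )   // stop DEXCH timer
 *
 * GPFSMPIO_T_CIO_REPORT() then prints the breakdown; note that it passes its
 * RW argument straight through as the int rw parameter of
 * ad_gpfs_timing_crw_report() rather than pasting it, so it takes an integer
 * read/write selector.
 */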

#endif /* AD_GPFS_TUNING_H_ */