This source file includes following definitions.
- ADIOI_GPFS_Calc_aggregator
- ADIOI_GPFS_Calc_file_domains
- ADIOI_GPFS_Calc_my_req
- ADIOI_GPFS_Calc_others_req
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 #include "adio.h"
17 #include "adio_cb_config_list.h"
18 #include "ad_gpfs.h"
19 #include "ad_gpfs_aggrs.h"
20
21 #ifdef AGGREGATION_PROFILE
22 #include "mpe.h"
23 #endif
24
25
26 #ifdef USE_DBG_LOGGING
27 #define AGG_DEBUG 1
28 #endif
29
30 #ifndef TRACE_ERR
31 # define TRACE_ERR(format...)
32 #endif
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97 int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
98 ADIO_Offset off,
99 ADIO_Offset min_off,
100 ADIO_Offset *len,
101 ADIO_Offset fd_size,
102 ADIO_Offset *fd_start,
103 ADIO_Offset *fd_end)
104 {
105 int rank_index, rank;
106 ADIO_Offset avail_bytes;
107 TRACE_ERR("Entering ADIOI_GPFS_Calc_aggregator\n");
108
109 ADIOI_Assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
110
111
112 int ub = fd->hints->cb_nodes;
113 int lb = 0;
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129 rank_index = fd->hints->cb_nodes / 2;
130 while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
131 if ( off > fd_end [rank_index] ) {
132 lb = rank_index;
133 rank_index = (rank_index + ub) / 2;
134 }
135 else
136 if ( off < fd_start[rank_index] ) {
137 ub = rank_index;
138 rank_index = (rank_index + lb) / 2;
139 }
140 }
141
142
143
144 if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
145 FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
146 rank_index,fd->hints->cb_nodes,fd_size,off);
147 MPI_Abort(MPI_COMM_WORLD, 1);
148 }
149
150
151
152
153
154
155
156
157
158
159
160 avail_bytes = fd_end[rank_index] + 1 - off;
161 if (avail_bytes < *len && avail_bytes > 0) {
162
163
164 *len = avail_bytes;
165 }
166
167
168
169 rank = fd->hints->ranklist[rank_index];
170 TRACE_ERR("Leaving ADIOI_GPFS_Calc_aggregator\n");
171
172 return rank;
173 }
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189 void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
190 ADIO_Offset *st_offsets,
191 ADIO_Offset *end_offsets,
192 int nprocs,
193 int nprocs_for_coll,
194 ADIO_Offset *min_st_offset_ptr,
195 ADIO_Offset **fd_start_ptr,
196 ADIO_Offset **fd_end_ptr,
197 ADIO_Offset *fd_size_ptr,
198 void *fs_ptr)
199 {
200 ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
201 int i, aggr;
202 TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");
203 blksize_t blksize;
204
205 #ifdef AGGREGATION_PROFILE
206 MPE_Log_event (5004, 0, NULL);
207 #endif
208
209 # if AGG_DEBUG
210 static char myname[] = "ADIOI_GPFS_Calc_file_domains";
211 DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
212 myname,__LINE__,nprocs_for_coll);
213 # endif
214 if (fd->blksize <= 0)
215
216 fd->blksize = 1048576;
217 blksize = fd->blksize;
218
219 # if AGG_DEBUG
220 DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
221 # endif
222
223 min_st_offset = st_offsets [0];
224 max_end_offset = end_offsets[0];
225 for (i=1; i<nprocs; i++) {
226 min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
227 max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
228 }
229
230
231
232
233
234
235
236 ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1;
237 ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
238 ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
239 ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
240 ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;
241
242 int naggs = nprocs_for_coll;
243
244
245
246
247
248
249
250
251
252
253
254
255 fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
256 *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
257 *fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
258 fd_start = *fd_start_ptr;
259 fd_end = *fd_end_ptr;
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286 ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
287 ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
288 ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
289 ADIO_Offset naggs_small = naggs - naggs_large;
290
291 #ifdef BGQPLATFORM
292 if (gpfsmpio_balancecontig == 1) {
293
294
295
296
297
298
299 for (i=0; i<naggs; i++)
300 fd_size[i] = nb_cn_small * blksize;
301
302
303
304
305
306
307 int *bridgelistoffset =
308 (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
309
310
311
312 int *tmpbridgelistnum =
313 (int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
314
315 int j;
316 for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
317 int k, bridgerankoffset = 0;
318 for (k=0;k<j;k++) {
319 bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
320 }
321 bridgelistoffset[j] = bridgerankoffset;
322 }
323
324 for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
325 tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
326 int bridgeiter = 0;
327
328
329
330
331
332
333 for (j=0;j<naggs_large;j++) {
334 int foundbridge = 0;
335 int numbridgelistpasses = 0;
336 while (!foundbridge) {
337 if (tmpbridgelistnum[bridgeiter] > 0) {
338 foundbridge = 1;
339
340
341
342
343
344 int currentbridgelistnum =
345 (fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-
346 tmpbridgelistnum[bridgeiter]);
347 int currentfdsizeindex = bridgelistoffset[bridgeiter] +
348 currentbridgelistnum;
349 fd_size[currentfdsizeindex] = (nb_cn_small+1) * blksize;
350 tmpbridgelistnum[bridgeiter]--;
351 }
352 if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1)) {
353
354
355 ADIOI_Assert(numbridgelistpasses == 0);
356 numbridgelistpasses++;
357 bridgeiter = 0;
358 }
359 else
360 bridgeiter++;
361 }
362 }
363 ADIOI_Free(tmpbridgelistnum);
364 ADIOI_Free(bridgelistoffset);
365
366 } else {
367
368
369 for (i=0; i<naggs; i++) {
370 if (i < naggs_large) {
371 fd_size[i] = (nb_cn_small+1) * blksize;
372 } else {
373 fd_size[i] = nb_cn_small * blksize;
374 }
375 }
376 }
377 #ifdef balancecontigtrace
378 int myrank;
379 MPI_Comm_rank(fd->comm,&myrank);
380 if (myrank == 0) {
381 fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
382 for (i=0; i<naggs; i++) {
383 fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
384 }
385 }
386 #endif
387
388 #else
389 for (i=0; i<naggs; i++) {
390 if (i < naggs_large) {
391 fd_size[i] = (nb_cn_small+1) * blksize;
392 } else {
393 fd_size[i] = nb_cn_small * blksize;
394 }
395 }
396
397 #endif
398
399
400 # if AGG_DEBUG
401 DBG_FPRINTF(stderr,"%s(%d): "
402 "gpfs_ub %llu, "
403 "gpfs_lb %llu, "
404 "gpfs_ub_rdoff %llu, "
405 "gpfs_lb_rdoff %llu, "
406 "fd_gpfs_range %llu, "
407 "n_gpfs_blk %llu, "
408 "nb_cn_small %llu, "
409 "naggs_large %llu, "
410 "naggs_small %llu, "
411 "\n",
412 myname,__LINE__,
413 gpfs_ub ,
414 gpfs_lb ,
415 gpfs_ub_rdoff,
416 gpfs_lb_rdoff,
417 fd_gpfs_range,
418 n_gpfs_blk ,
419 nb_cn_small ,
420 naggs_large ,
421 naggs_small
422 );
423 # endif
424
425 fd_size[0] -= gpfs_lb_rdoff;
426 fd_size[naggs-1] -= gpfs_ub_rdoff;
427
428
429 ADIO_Offset offset = min_st_offset;
430 for (aggr=0; aggr<naggs; aggr++) {
431 fd_start[aggr] = offset;
432 fd_end [aggr] = offset + fd_size[aggr] - 1;
433 offset += fd_size[aggr];
434 }
435
436 *fd_size_ptr = fd_size[0];
437 *min_st_offset_ptr = min_st_offset;
438
439 #ifdef AGGREGATION_PROFILE
440 MPE_Log_event (5005, 0, NULL);
441 #endif
442 ADIOI_Free (fd_size);
443 TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
444 }
445
446
447
448
449
450
451
452
453
454 void ADIOI_GPFS_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
455 int contig_access_count, ADIO_Offset
456 min_st_offset, ADIO_Offset *fd_start,
457 ADIO_Offset *fd_end, ADIO_Offset fd_size,
458 int nprocs,
459 int *count_my_req_procs_ptr,
460 int **count_my_req_per_proc_ptr,
461 ADIOI_Access **my_req_ptr,
462 int **buf_idx_ptr)
463
464
465 {
466 int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
467 int i, l, proc;
468 ADIO_Offset fd_len, rem_len, curr_idx, off;
469 ADIOI_Access *my_req;
470 TRACE_ERR("Entering ADIOI_GPFS_Calc_my_req\n");
471
472 #ifdef AGGREGATION_PROFILE
473 MPE_Log_event (5024, 0, NULL);
474 #endif
475 *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
476 count_my_req_per_proc = *count_my_req_per_proc_ptr;
477
478
479
480
481
482 buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
483
484
485
486
487
488
489 for (i=0; i < nprocs; i++) buf_idx[i] = -1;
490
491
492
493
494 for (i=0; i < contig_access_count; i++) {
495
496
497 if (len_list[i] == 0)
498 continue;
499 off = offset_list[i];
500 fd_len = len_list[i];
501
502
503
504
505
506
507 proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
508 fd_start, fd_end);
509 count_my_req_per_proc[proc]++;
510
511
512
513
514
515 rem_len = len_list[i] - fd_len;
516
517 while (rem_len > 0) {
518 off += fd_len;
519 fd_len = rem_len;
520 proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
521 fd_size, fd_start, fd_end);
522
523 count_my_req_per_proc[proc]++;
524 rem_len -= fd_len;
525 }
526 }
527
528
529
530 *my_req_ptr = (ADIOI_Access *)
531 ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
532 my_req = *my_req_ptr;
533
534 count_my_req_procs = 0;
535 for (i=0; i < nprocs; i++) {
536 if (count_my_req_per_proc[i]) {
537 my_req[i].offsets = (ADIO_Offset *)
538 ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
539 my_req[i].lens =
540 ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
541 count_my_req_procs++;
542 }
543 my_req[i].count = 0;
544
545 }
546
547
548 curr_idx = 0;
549 for (i=0; i<contig_access_count; i++) {
550
551
552 if (len_list[i] == 0)
553 continue;
554 off = offset_list[i];
555 fd_len = len_list[i];
556 proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
557 fd_start, fd_end);
558
559
560 if (buf_idx[proc] == -1)
561 {
562 ADIOI_Assert(curr_idx == (int) curr_idx);
563 buf_idx[proc] = (int) curr_idx;
564 }
565
566 l = my_req[proc].count;
567 curr_idx += fd_len;
568
569 rem_len = len_list[i] - fd_len;
570
571
572
573
574
575
576 my_req[proc].offsets[l] = off;
577 my_req[proc].lens[l] = fd_len;
578 my_req[proc].count++;
579
580 while (rem_len > 0) {
581 off += fd_len;
582 fd_len = rem_len;
583 proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
584 fd_size, fd_start, fd_end);
585
586 if (buf_idx[proc] == -1)
587 {
588 ADIOI_Assert(curr_idx == (int) curr_idx);
589 buf_idx[proc] = (int) curr_idx;
590 }
591
592 l = my_req[proc].count;
593 curr_idx += fd_len;
594 rem_len -= fd_len;
595
596 my_req[proc].offsets[l] = off;
597 my_req[proc].lens[l] = fd_len;
598 my_req[proc].count++;
599 }
600 }
601
602
603
604 #ifdef AGG_DEBUG
605 for (i=0; i<nprocs; i++) {
606 if (count_my_req_per_proc[i] > 0) {
607 DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
608 my_req[i].count);
609 for (l=0; l < my_req[i].count; l++) {
610 DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %lld\n", l,
611 my_req[i].offsets[l], l, my_req[i].lens[l]);
612 }
613 }
614 DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
615 }
616 #endif
617
618 *count_my_req_procs_ptr = count_my_req_procs;
619 *buf_idx_ptr = buf_idx;
620 #ifdef AGGREGATION_PROFILE
621 MPE_Log_event (5025, 0, NULL);
622 #endif
623 TRACE_ERR("Leaving ADIOI_GPFS_Calc_my_req\n");
624 }
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643 void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
644 int *count_my_req_per_proc,
645 ADIOI_Access *my_req,
646 int nprocs, int myrank,
647 int *count_others_req_procs_ptr,
648 ADIOI_Access **others_req_ptr)
649 {
650 TRACE_ERR("Entering ADIOI_GPFS_Calc_others_req\n");
651
652
653
654
655
656
657
658
659 int *count_others_req_per_proc, count_others_req_procs;
660 int i;
661 ADIOI_Access *others_req;
662
663
664 int *scounts, *sdispls, *rcounts, *rdispls;
665
666
667 #ifdef AGGREGATION_PROFILE
668 MPE_Log_event (5026, 0, NULL);
669 #endif
670
671
672
673
674
675 count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
676
677
678 MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
679 count_others_req_per_proc, 1, MPI_INT, fd->comm);
680
681
682
683
684
685
686
687 *others_req_ptr = (ADIOI_Access *)
688 ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
689 others_req = *others_req_ptr;
690
691 scounts = ADIOI_Malloc(nprocs*sizeof(int));
692 sdispls = ADIOI_Malloc(nprocs*sizeof(int));
693 rcounts = ADIOI_Malloc(nprocs*sizeof(int));
694 rdispls = ADIOI_Malloc(nprocs*sizeof(int));
695
696
697
698
699
700
701 count_others_req_procs = 0;
702 for (i=0; i<nprocs; i++) {
703 if (count_others_req_per_proc[i])
704 {
705 others_req[i].count = count_others_req_per_proc[i];
706
707 others_req[i].offsets = (ADIO_Offset *)
708 ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
709 others_req[i].lens =
710 ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
711
712 others_req[i].mem_ptrs = (MPI_Aint *)
713 ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint));
714
715 count_others_req_procs++;
716 }
717 else
718 {
719 others_req[i].count = 0;
720 others_req[i].offsets = NULL;
721 others_req[i].lens = NULL;
722 }
723 }
724
725
726
727
728
729
730
731
732
733
734
735
736 int scount_total = 0;
737 int rcount_total = 0;
738 for (i=0; i<nprocs; i++)
739 {
740
741 scounts[i] = count_my_req_per_proc[i];
742 sdispls[i] = scount_total;
743 scount_total += scounts[i];
744
745
746 rcounts[i] = count_others_req_per_proc[i];
747 rdispls[i] = rcount_total;
748 rcount_total += rcounts[i];
749 }
750
751 void *sbuf_copy_of_req_info;
752 void *rbuf_copy_of_req_info;
753
754 sbuf_copy_of_req_info = (ADIO_Offset *) ADIOI_Malloc(scount_total * sizeof(ADIO_Offset));
755 rbuf_copy_of_req_info = (ADIO_Offset *) ADIOI_Malloc(rcount_total * sizeof(ADIO_Offset));
756 for (i=0; i<nprocs; i++)
757 {
758
759
760
761 memcpy(sbuf_copy_of_req_info + sdispls[i] * sizeof(ADIO_Offset),
762 my_req[i].offsets,
763 scounts[i] * sizeof(ADIO_Offset));
764 }
765
766
767 MPI_Alltoallv(sbuf_copy_of_req_info,
768 scounts, sdispls, ADIO_OFFSET,
769 rbuf_copy_of_req_info,
770 rcounts, rdispls, ADIO_OFFSET,
771 fd->comm);
772 for (i=0; i<nprocs; i++)
773 {
774 memcpy(others_req[i].offsets,
775 rbuf_copy_of_req_info + rdispls[i] * sizeof(ADIO_Offset),
776 rcounts[i] * sizeof(ADIO_Offset));
777 }
778
779
780
781
782
783 for (i=0; i<nprocs; i++)
784 {
785 memcpy(sbuf_copy_of_req_info + sdispls[i] * sizeof(ADIO_Offset),
786 my_req[i].lens,
787 scounts[i] * sizeof(ADIO_Offset));
788 }
789
790
791 MPI_Alltoallv(sbuf_copy_of_req_info,
792 scounts, sdispls, ADIO_OFFSET,
793 rbuf_copy_of_req_info,
794 rcounts, rdispls, ADIO_OFFSET,
795 fd->comm);
796 for (i=0; i<nprocs; i++)
797 {
798 memcpy(others_req[i].lens,
799 rbuf_copy_of_req_info + rdispls[i] * sizeof(ADIO_Offset),
800 rcounts[i] * sizeof(ADIO_Offset));
801 }
802
803
804 ADIOI_Free(sbuf_copy_of_req_info);
805 ADIOI_Free(rbuf_copy_of_req_info);
806 ADIOI_Free(count_others_req_per_proc);
807 ADIOI_Free (scounts);
808 ADIOI_Free (sdispls);
809 ADIOI_Free (rcounts);
810 ADIOI_Free (rdispls);
811
812 *count_others_req_procs_ptr = count_others_req_procs;
813 #ifdef AGGREGATION_PROFILE
814 MPE_Log_event (5027, 0, NULL);
815 #endif
816 TRACE_ERR("Leaving ADIOI_GPFS_Calc_others_req\n");
817 }