This source file includes the following definitions.
- ADIOI_BG_gen_agg_ranklist
- intsort
- ADIOI_BG_compute_agg_ranklist_serial_do
- ADIOI_BG_compute_agg_ranklist_serial
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 #include "adio.h"
22 #include "adio_cb_config_list.h"
23 #include "../ad_gpfs.h"
24 #include "ad_bg_pset.h"
25 #include "ad_bg_aggrs.h"
26 #ifdef AGGREGATION_PROFILE
27 #include "mpe.h"
28 #endif
29
30
31 #ifdef USE_DBG_LOGGING
32 #define AGG_DEBUG 1
33 #endif
34
35 #ifndef TRACE_ERR
36 # define TRACE_ERR(format...)
37 #endif
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71 static void
72 ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
73 const ADIOI_BG_ConfInfo_t *confInfo,
74 ADIOI_BG_ProcInfo_t *all_procInfo);
75
76
77
78
79
80
81
82
83
84 int
85 ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
86 {
87 int r, s;
88 ADIOI_BG_ProcInfo_t *procInfo, *all_procInfo;
89 ADIOI_BG_ConfInfo_t *confInfo;
90 TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");
91
92 MPI_Comm_size( fd->comm, &s );
93 MPI_Comm_rank( fd->comm, &r );
94
95
96 confInfo = ADIOI_BG_ConfInfo_new ();
97 procInfo = ADIOI_BG_ProcInfo_new ();
98 ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);
99
100
101
102 all_procInfo = ADIOI_BG_ProcInfo_new_n (s);
103
104 MPI_Gather( (void *)procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
105 (void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
106 0,
107 fd->comm );
108
109
110 if (r == 0) {
111 ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo);
112
113 }
114 ADIOI_BG_ProcInfo_free (all_procInfo);
115
116
117
118 ADIOI_cb_bcast_rank_map(fd);
119 if (gpfsmpio_balancecontig == 1) {
120
121
122 if (r != 0) {
123 fd->hints->fs_hints.bg.bridgelist =
124 ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
125 if (fd->hints->fs_hints.bg.bridgelist == NULL) {
126
127 }
128 }
129 MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0,
130 fd->comm);
131
132 if (r != 0) {
133 fd->hints->fs_hints.bg.bridgelistnum =
134 ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
135 if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
136
137 }
138 }
139 MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes,
140 MPI_INT, 0, fd->comm);
141
142 MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0,
143 fd->comm);
144
145 }
146
147
148 ADIOI_BG_persInfo_free( confInfo, procInfo );
149 TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
150 return 0;
151 }
152
153
154
155
156
157
158
159
160
161
162
163
/*
 * Sort record pairing an index (`rank`) with the bridge value (`bridge`)
 * it is keyed on.  Arrays of these are ordered by `bridge` with intsort().
 */
typedef struct
{
    int rank;
    int bridge;
} sortstruct;

/*
 * Per-bridge tally used while handing out aggregators: the bridge's rank and
 * how many aggregators have been assigned to it so far.
 */
typedef struct
{
    int bridgeRank;
    int numAggsAssigned;
} bridgeAggAssignment;

/*
 * qsort() comparator ordering sortstruct elements by ascending `bridge`.
 *
 * Returns a negative, zero or positive value when p1's bridge is less than,
 * equal to or greater than p2's.  The result is built from two comparisons
 * rather than a subtraction so it cannot overflow for any pair of int keys.
 */
static int intsort(const void *p1, const void *p2)
{
    const sortstruct *lhs = p1;
    const sortstruct *rhs = p2;

    return (lhs->bridge > rhs->bridge) - (lhs->bridge < rhs->bridge);
}
183
184 static int
185 ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
186 ADIOI_BG_ProcInfo_t *all_procInfo,
187 int *tmp_ranklist)
188 {
189 TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
190
191 int i, j;
192 int aggTotal;
193 int *aggList;
194
195 if (gpfsmpio_bridgeringagg > 0) {
196
197 int numAggs = confInfo->aggRatio * confInfo->ioMinSize ;
198
199 if(numAggs == 1)
200 aggTotal = 1;
201 else
202 aggTotal = confInfo->numBridgeRanks * numAggs;
203
204 aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
205 if(aggTotal == 1) {
206
207 sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
208 for(i=0; i < confInfo->nProcs; i++)
209 {
210 bridgelist[i].bridge = all_procInfo[i].bridgeRank;
211 bridgelist[i].rank = i;
212 TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
213 }
214
215
216 qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
217
218 aggList[0] = bridgelist[0].bridge;
219 ADIOI_Free(bridgelist);
220
221 }
222 else {
223
224 int currentAggListSize = 0;
225 int numBridgesWithAggAssignments = 0;
226 bridgeAggAssignment *aggAssignments = (bridgeAggAssignment *)ADIOI_Malloc(confInfo->numBridgeRanks * sizeof(bridgeAggAssignment));
227
228 int partitionSize = all_procInfo[0].numNodesInPartition;
229 int *nodesAssigned = (int *)ADIOI_Malloc(partitionSize * sizeof(int));
230 for (i=0;i<partitionSize;i++)
231 nodesAssigned[i] = 0;
232
233 int currentNumHops = gpfsmpio_bridgeringagg;
234 int allAggsAssigned = 0;
235
236
237
238
239 while (!allAggsAssigned) {
240
241 int startingCurrentAggListSize = currentAggListSize;
242 int numIterForHopsWithNoAggs = 0;
243 for (i=0;i<confInfo->nProcs;i++) {
244 if (all_procInfo[i].manhattanDistanceToBridge == currentNumHops) {
245 if (nodesAssigned[all_procInfo[i].nodeRank] == 0) {
246 int foundBridge = 0;
247 for (j=0;(j<numBridgesWithAggAssignments && !foundBridge);j++) {
248 if (aggAssignments[j].bridgeRank == all_procInfo[i].bridgeRank) {
249 foundBridge = 1;
250 if (aggAssignments[j].numAggsAssigned < numAggs) {
251 aggAssignments[j].numAggsAssigned++;
252 nodesAssigned[all_procInfo[i].nodeRank] = 1;
253 aggList[currentAggListSize] = all_procInfo[i].rank;
254 currentAggListSize++;
255 #ifdef bridgeringaggtrace
256 printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
257 #endif
258 }
259 }
260 }
261 if (!foundBridge) {
262 aggAssignments[numBridgesWithAggAssignments].bridgeRank = all_procInfo[i].bridgeRank;
263 aggAssignments[numBridgesWithAggAssignments].numAggsAssigned = 1;
264 numBridgesWithAggAssignments++;
265 nodesAssigned[all_procInfo[i].nodeRank] = 1;
266 aggList[currentAggListSize] = all_procInfo[i].rank;
267 currentAggListSize++;
268 #ifdef bridgeringaggtrace
269 printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
270 #endif
271 }
272 }
273 }
274 }
275
276 if (numBridgesWithAggAssignments == confInfo->numBridgeRanks) {
277 allAggsAssigned = 1;
278 for (i=0;(i<numBridgesWithAggAssignments && allAggsAssigned);i++) {
279 if (aggAssignments[i].numAggsAssigned < numAggs)
280 allAggsAssigned = 0;
281 }
282 }
283 currentNumHops++;
284
285
286
287
288
289
290 if (currentNumHops > 16)
291 currentNumHops = 0;
292
293
294
295 if (startingCurrentAggListSize == currentAggListSize)
296 numIterForHopsWithNoAggs++;
297 else
298 numIterForHopsWithNoAggs = 0;
299 ADIOI_Assert(numIterForHopsWithNoAggs <= 3);
300 }
301
302 ADIOI_Free(aggAssignments);
303 ADIOI_Free(nodesAssigned);
304
305 }
306
307 memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
308 }
309
310 else {
311
312 int distance, numAggs;
313
314
315
316
317 sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
318 for(i=0; i < confInfo->nProcs; i++)
319 {
320 bridgelist[i].bridge = all_procInfo[i].bridgeRank;
321 bridgelist[i].rank = i;
322 TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
323 }
324
325
326 qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
327
328
329
330
331 numAggs = confInfo->aggRatio * confInfo->ioMinSize ;
332 if(numAggs == 1)
333 aggTotal = 1;
334 else
335
336
337 aggTotal = confInfo->numBridgeRanks * (numAggs+1);
338
339 if(aggTotal>confInfo->nProcs) aggTotal=confInfo->nProcs;
340
341 TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d/%d numAggs: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks, confInfo->ioMinSize, confInfo->ioMaxSize , numAggs, aggTotal);
342 aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
343
344
345
346
347 if(aggTotal == 1)
348 aggList[0] = bridgelist[0].bridge;
349 else
350 {
351 int lastBridge = bridgelist[confInfo->nProcs-1].bridge;
352 int nextBridge = 0, nextAggr = confInfo->numBridgeRanks;
353 int psetSize = 0;
354 int procIndex;
355 for(procIndex=confInfo->nProcs-1; procIndex>=0; procIndex--)
356 {
357 TRACE_ERR("bridgelist[%d].bridge %u/rank %u\n",procIndex, bridgelist[procIndex].bridge, bridgelist[procIndex].rank);
358 if(lastBridge == bridgelist[procIndex].bridge)
359 {
360 psetSize++;
361 if(procIndex) continue;
362 else procIndex--;
363 }
364
365
366
367
368
369
370
371
372
373
374
375
376
377 aggList[nextBridge]=lastBridge;
378 distance = psetSize/numAggs;
379 TRACE_ERR("nextBridge %u is bridge %u, distance %u, size %u\n",nextBridge, aggList[nextBridge],distance,psetSize);
380 if(numAggs>1)
381 {
382 for(j = 0; j < numAggs; j++)
383 {
384 ADIOI_Assert(nextAggr<aggTotal);
385 aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
386 TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
387 if(aggList[nextAggr]==lastBridge)
388 {
389 aggList[nextAggr] = bridgelist[procIndex+psetSize].rank;
390 TRACE_ERR("replacement agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+psetSize,aggList[nextAggr]);
391 }
392 nextAggr++;
393 }
394 }
395 if(procIndex<0) break;
396 lastBridge = bridgelist[procIndex].bridge;
397 psetSize = 1;
398 nextBridge++;
399 }
400 }
401
402 TRACE_ERR("memcpy(tmp_ranklist, aggList, (numAggs(%u)*confInfo->numBridgeRanks(%u)+numAggs(%u)) (%u) %u*sizeof(int))\n",numAggs,confInfo->numBridgeRanks,numAggs,(numAggs*confInfo->numBridgeRanks+numAggs),aggTotal);
403 memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
404 for(i=0;i<aggTotal;i++)
405 {
406 TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
407 }
408
409
410 ADIOI_Free (bridgelist);
411
412 TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
413 }
414
415 ADIOI_Free (aggList);
416 return aggTotal;
417
418 }
419
420
421
422
423 static void
424 ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
425 const ADIOI_BG_ConfInfo_t *confInfo,
426 ADIOI_BG_ProcInfo_t *all_procInfo)
427 {
428 TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
429 int i;
430 int naggs;
431 int size;
432 int *tmp_ranklist;
433
434
435 tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
436
437 # if AGG_DEBUG
438 for (i=0; i<confInfo->nProcs; i++) {
439 DBG_FPRINTF(stderr, "\trank = %6d\n", all_procInfo[i].rank );
440 }
441 # endif
442
443 naggs=
444 ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, tmp_ranklist);
445
446 # define VERIFY 1
447 # if VERIFY
448 DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, ratio: %.4f; naggs = %d\n",
449 confInfo->ioMinSize ,
450 confInfo->ioMaxSize ,
451 confInfo->nAggrs ,
452 confInfo->numBridgeRanks ,
453 confInfo->nProcs ,
454 confInfo->ioMaxSize ,
455 confInfo->aggRatio ,
456 naggs );
457 # endif
458 MPI_Comm_size( fd->comm, &size );
459
460
461
462
463 for(i=0;i<naggs;i++)
464 {
465 if(tmp_ranklist[i] > size)
466 {
467 TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
468 i, tmp_ranklist[i], fd->comm);
469 tmp_ranklist[i] = 0;
470 }
471 }
472
473 # if AGG_DEBUG
474 for (i=0; i<naggs; i++) {
475 DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
476 }
477 # endif
478 if (gpfsmpio_balancecontig == 1) {
479
480
481
482
483
484
485 int *interleavedbridgeranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
486
487 int *bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
488
489
490
491 int *bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
492
493 int *ionlist = (int *) ADIOI_Malloc (naggs * sizeof(int));
494
495 int numbridges = 0;
496
497 for (i=0;i<naggs;i++)
498 bridgelistnum[i] = 0;
499
500
501
502 int *summarybridgeminionaggrank = (int *) ADIOI_Malloc (naggs * sizeof(int));
503 for (i=0;i<naggs;i++)
504 summarybridgeminionaggrank[i] = -1;
505
506
507
508
509 for (i=0;i<naggs;i++) {
510 int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
511 int aggionid = all_procInfo[tmp_ranklist[i]].ionID;
512 int foundrank = 0;
513 int summaryranklistbridgeindex = 0;
514 int j;
515 for (j=0;(j<numbridges && !foundrank);j++) {
516 if (bridgelist[j] == aggbridgerank) {
517 foundrank = 1;
518 summaryranklistbridgeindex = j;
519 }
520 else
521 summaryranklistbridgeindex++;
522 }
523 if (!foundrank) {
524 bridgelist[summaryranklistbridgeindex] = aggbridgerank;
525 ionlist[summaryranklistbridgeindex] = aggionid;
526
527 if (summarybridgeminionaggrank[summaryranklistbridgeindex] == -1)
528 summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
529 else if (summarybridgeminionaggrank[summaryranklistbridgeindex] > aggbridgerank)
530 summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
531 numbridges++;
532 }
533
534 bridgelistnum[summaryranklistbridgeindex]++;
535 }
536
537
538
539 for (i=0;i<numbridges;i++) {
540 int aggIonId = ionlist[i];
541 int j;
542 for (j=0;j<numbridges;j++) {
543 if (ionlist[j] == aggIonId) {
544 if (summarybridgeminionaggrank[j] < summarybridgeminionaggrank[i])
545 summarybridgeminionaggrank[i] = summarybridgeminionaggrank[j];
546 }
547 }
548 }
549
550
551 int x;
552 for (x=0;x<numbridges;x++) {
553 for (i=0;i<(numbridges-1);i++) {
554 if (summarybridgeminionaggrank[i] > summarybridgeminionaggrank[i+1]) {
555 int tmpminionaggrank = summarybridgeminionaggrank[i];
556 summarybridgeminionaggrank[i] = summarybridgeminionaggrank[i+1];
557 summarybridgeminionaggrank[i+1] = tmpminionaggrank;
558 int tmpionid = ionlist[i];
559 ionlist[i] = ionlist[i+1];
560 ionlist[i+1] = tmpionid;
561 int tmpbridgerank = bridgelist[i];
562 bridgelist[i] = bridgelist[i+1];
563 bridgelist[i+1] = tmpbridgerank;
564 int tmpbridgeranknum = bridgelistnum[i];
565 bridgelistnum[i] = bridgelistnum[i+1];
566 bridgelistnum[i+1] = tmpbridgeranknum;
567 }
568 }
569 }
570
571
572 int startSortIndex = -1;
573 int endSortIndex = -1;
574 int currentBridgeIndex = 0;
575
576 while (currentBridgeIndex < numbridges) {
577 int currentIonId = ionlist[currentBridgeIndex];
578 startSortIndex = currentBridgeIndex;
579 while (ionlist[currentBridgeIndex] == currentIonId)
580 currentBridgeIndex++;
581 endSortIndex = currentBridgeIndex-1;
582 for (x=startSortIndex;x<=endSortIndex;x++) {
583 for (i=startSortIndex;i<endSortIndex;i++) {
584 if (bridgelist[i] > bridgelist[i+1]) {
585 int tmpbridgerank = bridgelist[i];
586 bridgelist[i] = bridgelist[i+1];
587 bridgelist[i+1] = tmpbridgerank;
588 int tmpbridgeranknum = bridgelistnum[i];
589 bridgelistnum[i] = bridgelistnum[i+1];
590 bridgelistnum[i+1] = tmpbridgeranknum;
591 }
592 }
593 }
594 }
595
596
597
598
599 int currentrankoffset = 0;
600 for (i=0;i<numbridges;i++) {
601 int *thisBridgeAggList = (int *) ADIOI_Malloc (naggs * sizeof(int));
602 int numAggsForThisBridge = 0;
603
604 int k;
605 for (k=0;k<naggs;k++) {
606 int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
607 if (aggbridgerank == bridgelist[i]) {
608 thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
609 numAggsForThisBridge++;
610 }
611 }
612
613
614 for (x=0;x<numAggsForThisBridge;x++) {
615 int n;
616 for (n=0;n<(numAggsForThisBridge-1);n++) {
617 if (thisBridgeAggList[n] > thisBridgeAggList[n+1]) {
618 int tmpthisBridgeAggList = thisBridgeAggList[n];
619 thisBridgeAggList[n] = thisBridgeAggList[n+1];
620 thisBridgeAggList[n+1] = tmpthisBridgeAggList;
621 }
622 }
623 }
624 int n;
625 for (n=0;n<numAggsForThisBridge;n++) {
626 interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
627 currentrankoffset++;
628 }
629 ADIOI_Free(thisBridgeAggList);
630 }
631
632 #ifdef balancecontigtrace
633 fprintf(stderr,"Interleaved aggregator list:\n");
634 for (i=0;i<naggs;i++) {
635 fprintf(stderr,"Agg: %d Agg rank: %d with bridge rank %d and ion ID %d\n",i,interleavedbridgeranklist[i],all_procInfo[interleavedbridgeranklist[i]].bridgeRank,all_procInfo[interleavedbridgeranklist[i]].ionID);
636 }
637 fprintf(stderr,"Bridges list:\n");
638 for (i=0;i<numbridges;i++) {
639 fprintf(stderr,"bridge %d ion min rank %d rank %d number of aggs %d ion id %d\n",i,summarybridgeminionaggrank[i],bridgelist[i],bridgelistnum[i],ionlist[i]);
640 }
641
642 #endif
643
644 if(fd->hints->ranklist != NULL)
645 ADIOI_Free (fd->hints->ranklist);
646 if(fd->hints->fs_hints.bg.bridgelist != NULL)
647 ADIOI_Free (fd->hints->fs_hints.bg.bridgelist);
648 if(fd->hints->fs_hints.bg.bridgelistnum != NULL)
649 ADIOI_Free (fd->hints->fs_hints.bg.bridgelistnum);
650
651 fd->hints->cb_nodes = naggs;
652 fd->hints->fs_hints.bg.numbridges = numbridges;
653 fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
654 memcpy( fd->hints->ranklist, interleavedbridgeranklist, naggs*sizeof(int) );
655
656 fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
657 memcpy( fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs*sizeof(int) );
658
659 fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
660 memcpy( fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs*sizeof(int) );
661
662 ADIOI_Free(summarybridgeminionaggrank);
663 ADIOI_Free( tmp_ranklist );
664 ADIOI_Free( bridgelistnum );
665 ADIOI_Free( bridgelist );
666 ADIOI_Free( interleavedbridgeranklist );
667 ADIOI_Free(ionlist);
668
669 } else {
670
671
672 if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
673
674 fd->hints->cb_nodes = naggs;
675 fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
676 memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
677
678 ADIOI_Free( tmp_ranklist );
679 }
680 TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
681 return;
682 }