This source file includes the following definitions:
- check_oversubscribing
- dump_int_array
- dump_double_array
- mca_topo_treematch_dist_graph_create
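
For context, this routine is the backend of MPI_Dist_graph_create when the treematch topo component is active and the caller asks for reordering. The following caller sketch is illustrative only (it is not part of this file); the ring pattern and the idea of forcing the component with an MCA option such as "mpirun --mca topo treematch" are assumptions made for the example.

#include <mpi.h>
#include <stdio.h>

/* Minimal caller sketch (hypothetical): each rank declares one edge to its
 * right neighbour and requests reordering, which is what routes the call
 * into mca_topo_treematch_dist_graph_create() when the treematch component
 * is selected. */
int main(int argc, char *argv[])
{
    MPI_Comm newcomm;
    int rank, size, newrank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int src    = rank;               /* this rank is the source of one edge */
    int degree = 1;                  /* exactly one outgoing edge           */
    int dest   = (rank + 1) % size;  /* towards the right neighbour         */
    int weight = 1;

    MPI_Dist_graph_create(MPI_COMM_WORLD, 1, &src, &degree, &dest, &weight,
                          MPI_INFO_NULL, 1 /* reorder */, &newcomm);

    MPI_Comm_rank(newcomm, &newrank);
    printf("old rank %d -> new rank %d\n", rank, newrank);

    MPI_Comm_free(&newcomm);
    MPI_Finalize();
    return 0;
}
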
#include "ompi_config.h"

#include "opal/constants.h"
#include "opal/mca/hwloc/base/base.h"

#include "ompi/mca/topo/treematch/topo_treematch.h"
#include "ompi/mca/topo/treematch/treematch/treematch.h"
#include "ompi/mca/topo/treematch/treematch/tm_mapping.h"
#include "ompi/mca/topo/base/base.h"

#include "ompi/communicator/communicator.h"
#include "ompi/info/info.h"

#include "ompi/mca/pml/pml.h"

#include "opal/mca/pmix/pmix.h"
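
/*
 * Check whether any node is oversubscribed: each node leader
 * (local_procs[0]) reports 1 when it sees fewer hardware objects than local
 * processes, and the reports are summed over comm_old so that every rank
 * gets the same answer (0 means no oversubscription; an Open MPI error code
 * is returned if the allreduce fails).
 */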
static int check_oversubscribing(int rank,
                                 int num_nodes,
                                 int num_objs_in_node,
                                 int num_procs_in_node,
                                 int *nodes_roots,
                                 int *local_procs,
                                 ompi_communicator_t *comm_old)
{
    int oversubscribed = 0, local_oversub = 0, err;

    if (rank == local_procs[0])
        if (num_objs_in_node < num_procs_in_node)
            local_oversub = 1;

    if (OMPI_SUCCESS != (err = comm_old->c_coll->coll_allreduce(&local_oversub, &oversubscribed, 1, MPI_INT,
                                                                MPI_SUM, comm_old, comm_old->c_coll->coll_allreduce_module)))
        return err;

    return oversubscribed;
}

#ifdef __DEBUG__
static void dump_int_array( int level, int output_id, char* prolog, char* line_prolog, int* array, size_t length )
{
    size_t i;

    if( -1 == output_id ) return;

    opal_output_verbose(level, output_id, "%s : ", prolog);
    for(i = 0; i < length ; i++)
        opal_output_verbose(level, output_id, "%s [%lu:%i] ", line_prolog, i, array[i]);
    opal_output_verbose(level, output_id, "\n");
}

static void dump_double_array( int level, int output_id, char* prolog, char* line_prolog, double* array, size_t length )
{
    size_t i;

    if( -1 == output_id ) return;
    opal_output_verbose(level, output_id, "%s : ", prolog);
    for(i = 0; i < length ; i++)
        opal_output_verbose(level, output_id, "%s [%lu:%lf] ", line_prolog, i, array[i]);
    opal_output_verbose(level, output_id, "\n");
}
#endif /* __DEBUG__ */
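
/*
 * Backend of MPI_Dist_graph_create for the treematch topo module: distribute
 * the graph, and when reordering is allowed compute a locality-aware rank
 * permutation with the TreeMatch library, either centralized on rank 0 or
 * partially distributed over the node leaders.
 */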
int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
                                         ompi_communicator_t *comm_old,
                                         int n, const int nodes[],
                                         const int degrees[], const int targets[],
                                         const int weights[],
                                         struct opal_info_t *info, int reorder,
                                         ompi_communicator_t **newcomm)
{
    int err;

    if (OMPI_SUCCESS != (err = mca_topo_base_dist_graph_distribute(topo_module, comm_old, n, nodes,
                                                                   degrees, targets, weights,
                                                                   &(topo_module->mtc.dist_graph))))
        return err;

    if(!reorder) {
 fallback:
        if( OMPI_SUCCESS == (err = ompi_comm_create(comm_old,
                                                    comm_old->c_local_group,
                                                    newcomm))){
            (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
            (*newcomm)->c_topo = topo_module;
            (*newcomm)->c_topo->reorder = reorder;
        }
        return err;
    }

    mca_topo_base_comm_dist_graph_2_2_0_t *topo = NULL;
    ompi_proc_t *proc = NULL;
    MPI_Request *reqs = NULL;
    hwloc_cpuset_t set = NULL;
    hwloc_obj_t object, root_obj;
    hwloc_obj_t *tracker = NULL;
    double *local_pattern = NULL;
    int *vpids, *colors = NULL;
    int *lindex_to_grank = NULL;
    int *nodes_roots = NULL, *k = NULL;
    int *localrank_to_objnum = NULL;
    int depth = 0, effective_depth = 0, obj_rank = -1;
    int num_objs_in_node = 0, num_pus_in_node = 0;
    int numlevels = 0, num_nodes = 0, num_procs_in_node = 0;
    int rank, size, newrank = -1, hwloc_err, i, j, idx;
    int oversubscribing_objs = 0, oversubscribed_pus = 0;
    uint32_t val, *pval;

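    /* Load the hwloc topology of this node; without it (or its root object)
     * we cannot reorder, so fall back to a plain topology-carrying
     * communicator. */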
    if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
        goto fallback;
    }
    root_obj = hwloc_get_root_obj(opal_hwloc_topology);
    if (NULL == root_obj) goto fallback;

    topo = topo_module->mtc.dist_graph;
    rank = ompi_comm_rank(comm_old);
    size = ompi_comm_size(comm_old);

    OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                         "Process rank is : %i\n", rank));

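    /*
     * Identify, for every peer, which node it lives on (its "color", taken
     * from the PMIx node id in the modex) and build the list of processes
     * that share this node with us.
     */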
    vpids = (int *)malloc(size * sizeof(int));
    colors = (int *)malloc(size * sizeof(int));
    for(i = 0 ; i < size ; i++) {
        proc = ompi_group_peer_lookup(comm_old->c_local_group, i);
        if (( i == rank ) ||
            (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)))
            vpids[num_procs_in_node++] = i;

        pval = &val;
        OPAL_MODEX_RECV_VALUE(err, OPAL_PMIX_NODEID, &(proc->super.proc_name), &pval, OPAL_UINT32);
        if( OPAL_SUCCESS != err ) {
            opal_output(0, "Unable to extract peer %s nodeid from the modex.\n",
                        OMPI_NAME_PRINT(&(proc->super.proc_name)));
            colors[i] = -1;
            continue;
        }
        colors[i] = (int)val;
    }
    lindex_to_grank = (int *)malloc(num_procs_in_node * sizeof(int));
    memcpy(lindex_to_grank, vpids, num_procs_in_node * sizeof(int));
    memcpy(vpids, colors, size * sizeof(int));

#ifdef __DEBUG__
    if ( 0 == rank ) {
        dump_int_array(10, ompi_topo_base_framework.framework_output,
                       "lindex_to_grank : ", "", lindex_to_grank, num_procs_in_node);
        dump_int_array(10, ompi_topo_base_framework.framework_output,
                       "Vpids : ", "", colors, size);
    }
#endif

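    /* Count the number of distinct nodes by de-duplicating the color array
     * (vpids holds a scratch copy of colors at this point). */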
    for(i = 0; i < size ; i++) {
        if ( -1 == vpids[i] ) continue;
        num_nodes++;
        for(j = i+1; j < size; j++)
            if( vpids[i] == vpids[j] )
                vpids[j] = -1;
    }
    if( 0 == num_nodes ) {
        free(vpids);
        free(colors);
        goto fallback;
    }

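    /* Rank 0 remembers the first global rank hosted on each node: these
     * "node roots" later provide the per-node topology information. */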
    if(0 == rank) {
        nodes_roots = (int *)calloc(num_nodes, sizeof(int));
        for(i = idx = 0; i < size; i++)
            if( vpids[i] != -1 )
                nodes_roots[idx++] = i;
        OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                             "num nodes is %i\n", num_nodes));
#ifdef __DEBUG__
        dump_int_array(10, ompi_topo_base_framework.framework_output,
                       "Root nodes are :\n", "root ", nodes_roots, num_nodes);
#endif
    }
    free(vpids);

    set = hwloc_bitmap_alloc_full();
    hwloc_get_cpubind(opal_hwloc_topology, set, 0);
    num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU);

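    /*
     * Determine the reference hardware objects on this node.  If our binding
     * covers the whole machine we are effectively unbound and cores are used
     * as placement targets; otherwise the smallest object covering our
     * current binding defines the effective depth.
     */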
    if(hwloc_bitmap_isincluded(root_obj->cpuset, set)) {
        if (0 == rank)
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 ">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n"));

        depth = hwloc_get_type_or_above_depth(opal_hwloc_topology, HWLOC_OBJ_CORE);
        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, depth);
    } else {
        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, set);
        obj_rank = object->logical_index;
        effective_depth = object->depth;
        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
    }
    if( (0 == num_objs_in_node) || (0 == num_pus_in_node) ) {
        free(colors);
        goto fallback;
    }

    oversubscribing_objs = check_oversubscribing(rank, num_nodes,
                                                 num_objs_in_node, num_procs_in_node,
                                                 nodes_roots, lindex_to_grank, comm_old);

    if(oversubscribing_objs) {
        if(hwloc_bitmap_isincluded(root_obj->cpuset, set)) {
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "Oversubscribing OBJ/CORES resources => Trying to use PUs \n"));

            oversubscribed_pus = check_oversubscribing(rank, num_nodes,
                                                       num_pus_in_node, num_procs_in_node,
                                                       nodes_roots, lindex_to_grank, comm_old);

            if (!oversubscribed_pus) {
                obj_rank = ompi_process_info.my_local_rank % num_pus_in_node;
                effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
                num_objs_in_node = num_pus_in_node;
                OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                     "Process %i not bound : binding on PU#%i \n", rank, obj_rank));
            }
        } else {
            oversubscribed_pus = check_oversubscribing(rank, num_nodes,
                                                       num_objs_in_node, num_procs_in_node,
                                                       nodes_roots, lindex_to_grank, comm_old);
        }
    }

    if( !oversubscribing_objs && !oversubscribed_pus ) {
        if( hwloc_bitmap_isincluded(root_obj->cpuset, set) ) {
            obj_rank = ompi_process_info.my_local_rank % num_objs_in_node;
            effective_depth = depth;
            object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, obj_rank);
            if( NULL == object) {
                free(colors);
                hwloc_bitmap_free(set);
                goto fallback;
            }

            hwloc_bitmap_copy(set, object->cpuset);
            hwloc_bitmap_singlify(set);
            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology, set, 0);
            if( -1 == hwloc_err) {
                OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                     "Process %i failed to bind on OBJ#%i \n", rank, obj_rank));
            } else
                OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                     "Process %i not bound : binding on OBJ#%i \n", rank, obj_rank));
        } else {
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "Process %i bound on OBJ #%i \n"
                                 "=====> Num obj in node : %i | num pus in node : %i\n",
                                 rank, obj_rank,
                                 num_objs_in_node, num_pus_in_node));
        }
    } else {
        OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                             "Oversubscribing PUs resources => Rank Reordering Impossible \n"));
        free(colors);
        hwloc_bitmap_free(set);
        goto fallback;
    }

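    /*
     * The node leader collects the shape of the local hardware hierarchy
     * (one tracker entry per depth whose object count changes) and gathers
     * the object index chosen by every local process; the other processes
     * just send their own object index to the leader.
     */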
    reqs = (MPI_Request *)calloc(num_procs_in_node-1, sizeof(MPI_Request));
    if( rank == lindex_to_grank[0] ) {
        int array_size = effective_depth + 1;
        int *myhierarchy = (int *)calloc(array_size, sizeof(int));

        numlevels = 1;
        myhierarchy[0] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, 0);
        for (i = 1; i < array_size ; i++) {
            myhierarchy[i] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, i);
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "hierarchy[%i] = %i\n", i, myhierarchy[i]));
            if ((myhierarchy[i] != 0) && (myhierarchy[i] != myhierarchy[i-1]))
                numlevels++;
        }

        tracker = (hwloc_obj_t *)calloc(numlevels, sizeof(hwloc_obj_t));
        for(idx = 0, i = 1; i < array_size; i++) {
            if(myhierarchy[i] != myhierarchy[i-1])
                tracker[idx++] = hwloc_get_obj_by_depth(opal_hwloc_topology, i-1, 0);
        }
        tracker[idx] = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, 0);
        free(myhierarchy);

        OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                             ">>>>>>>>>>>>>>>>>>>>> Effective depth is : %i (total depth %i)| num_levels %i\n",
                             effective_depth, hwloc_topology_get_depth(opal_hwloc_topology), numlevels));
        for(i = 0 ; i < numlevels ; i++) {
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "tracker[%i] : arity %i | depth %i\n",
                                 i, tracker[i]->arity, tracker[i]->depth));
        }

        localrank_to_objnum = (int *)calloc(num_procs_in_node, sizeof(int));
        localrank_to_objnum[0] = obj_rank;

        for(i = 1; i < num_procs_in_node; i++) {
            if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(&localrank_to_objnum[i], 1, MPI_INT,
                                                           lindex_to_grank[i], -111, comm_old, &reqs[i-1])))) {
                free(reqs); reqs = NULL;
                goto release_and_return;
            }
        }
        if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_procs_in_node-1,
                                                          reqs, MPI_STATUSES_IGNORE))) {
            free(reqs); reqs = NULL;
            goto release_and_return;
        }
    } else {
        if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&obj_rank, 1, MPI_INT, lindex_to_grank[0],
                                                     -111, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
            free(reqs); reqs = NULL;
            goto release_and_return;
        }
    }
    free(reqs); reqs = NULL;

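    /*
     * Two reordering strategies are available.  With reorder_mode == 0 the
     * whole size x size communication pattern is gathered on rank 0, which
     * runs TreeMatch on an aggregated view of all nodes (centralized
     * reordering).  Otherwise each node leader reorders only its local
     * processes (partially distributed reordering, handled in the else
     * branch further below).
     */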
    if (0 == mca_topo_treematch_component.reorder_mode) {
        int *k = NULL;
        int *obj_mapping = NULL;
        int num_objs_total = 0;

        if( 0 == rank ) {
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "========== Centralized Reordering ========= \n"));
            local_pattern = (double *)calloc(size*size, sizeof(double));
        } else {
            local_pattern = (double *)calloc(size, sizeof(double));
        }
        if( true == topo->weighted ) {
            for(i = 0; i < topo->indegree ; i++)
                local_pattern[topo->in[i]] += topo->inw[i];
            for(i = 0; i < topo->outdegree ; i++)
                local_pattern[topo->out[i]] += topo->outw[i];
        }
        err = comm_old->c_coll->coll_gather( (0 == rank ? MPI_IN_PLACE : local_pattern), size, MPI_DOUBLE,
                                             local_pattern, size, MPI_DOUBLE,
                                             0, comm_old, comm_old->c_coll->coll_gather_module);
        if (OMPI_SUCCESS != err) {
            goto release_and_return;
        }

        if( rank == lindex_to_grank[0] ) {
            tm_topology_t *tm_topology = NULL;
            int *obj_to_rank_in_comm = NULL;
            int *hierarchies = NULL;
            int min;

            obj_to_rank_in_comm = (int *)malloc(num_objs_in_node*sizeof(int));
            for(i = 0 ; i < num_objs_in_node ; i++) {
                obj_to_rank_in_comm[i] = -1;
                object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, i);
                for( j = 0; j < num_procs_in_node ; j++ )
                    if(localrank_to_objnum[j] == (int)(object->logical_index)) {
                        obj_to_rank_in_comm[i] = lindex_to_grank[j];
                        break;
                    }
            }

            if ( 0 == rank ) {
                if ( num_nodes > 1 ) {
                    int *objs_per_node = NULL, displ;

                    objs_per_node = (int *)calloc(num_nodes, sizeof(int));
                    reqs = (MPI_Request *)calloc(num_nodes-1, sizeof(MPI_Request));
                    objs_per_node[0] = num_objs_in_node;
                    for(i = 1; i < num_nodes ; i++)
                        if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(objs_per_node + i, 1, MPI_INT,
                                                                       nodes_roots[i], -112, comm_old, &reqs[i-1])))) {
                            free(obj_to_rank_in_comm);
                            free(objs_per_node);
                            goto release_and_return;
                        }

                    if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
                                                                      reqs, MPI_STATUSES_IGNORE))) {
                        free(objs_per_node);
                        goto release_and_return;
                    }

                    for(i = 0; i < num_nodes; i++)
                        num_objs_total += objs_per_node[i];
                    obj_mapping = (int *)malloc(num_objs_total*sizeof(int));
                    for(i = 0; i < num_objs_total; i++)
                        obj_mapping[i] = -1;

                    memcpy(obj_mapping, obj_to_rank_in_comm, objs_per_node[0]*sizeof(int));
                    displ = objs_per_node[0];
                    for(i = 1; i < num_nodes ; i++) {
                        if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(obj_mapping + displ, objs_per_node[i], MPI_INT,
                                                                       nodes_roots[i], -113, comm_old, &reqs[i-1])))) {
                            free(obj_to_rank_in_comm);
                            free(objs_per_node);
                            free(obj_mapping);
                            goto release_and_return;
                        }
                        displ += objs_per_node[i];
                    }
                    if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
                                                                      reqs, MPI_STATUSES_IGNORE))) {
                        free(obj_to_rank_in_comm);
                        free(objs_per_node);
                        free(obj_mapping);
                        goto release_and_return;
                    }
                    free(objs_per_node);
                } else {
                    num_objs_total = num_objs_in_node;
                    obj_mapping = (int *)calloc(num_objs_total, sizeof(int));
                    memcpy(obj_mapping, obj_to_rank_in_comm, num_objs_total*sizeof(int));
                }
#ifdef __DEBUG__
                dump_int_array(10, ompi_topo_base_framework.framework_output,
                               "Obj mapping : ", "", obj_mapping, num_objs_total );
#endif
            } else {
                if ( num_nodes > 1 ) {
                    if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&num_objs_in_node, 1, MPI_INT,
                                                                 0, -112, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
                        free(obj_to_rank_in_comm);
                        goto release_and_return;
                    }
                    if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(obj_to_rank_in_comm, num_objs_in_node, MPI_INT,
                                                                 0, -113, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
                        free(obj_to_rank_in_comm);
                        goto release_and_return;
                    }
                }
            }
            free(obj_to_rank_in_comm);

            assert(numlevels < TM_MAX_LEVELS);
            if( 0 == rank ) {
                hierarchies = (int *)malloc(num_nodes*(TM_MAX_LEVELS+1)*sizeof(int));
            } else {
                hierarchies = (int *)malloc((TM_MAX_LEVELS+1)*sizeof(int));
            }

            hierarchies[0] = numlevels;
            for(i = 0 ; i < hierarchies[0]; i++)
                hierarchies[i+1] = tracker[i]->arity;
            for(; i < (TM_MAX_LEVELS+1); i++)
                hierarchies[i] = 0;

            if ( num_nodes > 1 ) {
                if( rank != 0 ) {
                    if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(hierarchies, (TM_MAX_LEVELS+1), MPI_INT, 0,
                                                                 -114, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
                        free(hierarchies);
                        goto release_and_return;
                    }
                } else {
                    for(i = 1; i < num_nodes ; i++)
                        if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(hierarchies+i*(TM_MAX_LEVELS+1), (TM_MAX_LEVELS+1), MPI_INT,
                                                                       nodes_roots[i], -114, comm_old, &reqs[i-1])))) {
                            free(obj_mapping);
                            free(hierarchies);
                            goto release_and_return;
                        }
                    if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
                                                                      reqs, MPI_STATUSES_IGNORE))) {
                        free(obj_mapping);
                        free(hierarchies);
                        goto release_and_return;
                    }
                    free(reqs); reqs = NULL;
                }
            }

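            /*
             * Rank 0 now owns everything TreeMatch needs: the per-node
             * hierarchies, the object-to-rank mapping and the gathered
             * communication matrix.  Build the aggregated tm_topology,
             * symmetrize the pattern and compute the permutation k.
             */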
            if ( 0 == rank ) {
                tm_tree_t *comm_tree = NULL;
                tm_solution_t *sol = NULL;
                tm_affinity_mat_t *aff_mat = NULL;
                double **comm_pattern = NULL;

#ifdef __DEBUG__
                dump_int_array(10, ompi_topo_base_framework.framework_output,
                               "hierarchies : ", "", hierarchies, num_nodes*(TM_MAX_LEVELS+1));
#endif
                tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
                tm_topology->nb_levels = hierarchies[0];

                for(i = 1 ; i < num_nodes ; i++)
                    if (hierarchies[i*(TM_MAX_LEVELS+1)] < tm_topology->nb_levels)
                        tm_topology->nb_levels = hierarchies[i*(TM_MAX_LEVELS+1)];

                for(i = 0; i < num_nodes ; i++) {
                    int *base_ptr = hierarchies + i*(TM_MAX_LEVELS+1);
                    int suppl = *base_ptr - tm_topology->nb_levels;
                    for(j = 1 ; j <= suppl ; j++)
                        *(base_ptr + tm_topology->nb_levels) *= *(base_ptr + tm_topology->nb_levels + j);
                }
                if( num_nodes > 1) {
                    tm_topology->nb_levels += 1;
                    tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
                    tm_topology->arity[0] = num_nodes;
                    for(i = 1; i < tm_topology->nb_levels; i++) {
                        min = hierarchies[i];
                        for(j = 1; j < num_nodes ; j++)
                            if( hierarchies[j*(TM_MAX_LEVELS+1) + i] < min)
                                min = hierarchies[j*(TM_MAX_LEVELS+1) + i];
                        tm_topology->arity[i] = min;
                    }
                } else {
                    tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
                    for(i = 0; i < tm_topology->nb_levels; i++)
                        tm_topology->arity[i] = hierarchies[i+1];
                }
                free(hierarchies);

                for(i = 0; i < tm_topology->nb_levels; i++) {
                    OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                         "topo_arity[%i] = %i\n", i, tm_topology->arity[i]));
                }

                tm_topology->nb_nodes = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));
                tm_topology->nb_nodes[0] = 1;
                for(i = 1 ; i < tm_topology->nb_levels; i++)
                    tm_topology->nb_nodes[i] = tm_topology->nb_nodes[i-1] * tm_topology->arity[i-1];

#ifdef __DEBUG__
                assert(num_objs_total == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);
#endif

                tm_topology->node_id = (int *)malloc(num_objs_total*sizeof(int));
                tm_topology->node_rank = (int *)malloc(num_objs_total*sizeof(int));
                for( i = 0 ; i < num_objs_total ; i++ )
                    tm_topology->node_id[i] = tm_topology->node_rank[i] = -1;

                for( i = 0 ; i < num_objs_total ; i++ ) {
                    tm_topology->node_id[i] = obj_mapping[i];
                    if (obj_mapping[i] != -1)
                        tm_topology->node_rank[obj_mapping[i]] = i;
                }

                tm_topology->cost = (double*)calloc(tm_topology->nb_levels, sizeof(double));
                tm_topology->nb_proc_units = num_objs_total;

                tm_topology->nb_constraints = 0;
                for(i = 0; i < tm_topology->nb_proc_units ; i++)
                    if (obj_mapping[i] != -1)
                        tm_topology->nb_constraints++;
                tm_topology->constraints = (int *)calloc(tm_topology->nb_constraints, sizeof(int));
                for(idx = 0, i = 0; i < tm_topology->nb_proc_units ; i++)
                    if (obj_mapping[i] != -1)
                        tm_topology->constraints[idx++] = obj_mapping[i];
#ifdef __DEBUG__
                assert(idx == tm_topology->nb_constraints);
#endif
                tm_topology->oversub_fact = 1;

#ifdef __DEBUG__
                tm_display_topology(tm_topology);
#endif
                comm_pattern = (double **)malloc(size*sizeof(double *));
                for(i = 0 ; i < size ; i++)
                    comm_pattern[i] = local_pattern + i * size;

                for( i = 0; i < size ; i++ )
                    for( j = i; j < size ; j++ ) {
                        comm_pattern[i][j] = (comm_pattern[i][j] + comm_pattern[j][i]) / 2;
                        comm_pattern[j][i] = comm_pattern[i][j];
                    }
#ifdef __DEBUG__
                opal_output_verbose(10, ompi_topo_base_framework.framework_output,
                                    "==== COMM PATTERN ====\n");
                for( i = 0 ; i < size ; i++) {
                    dump_double_array(10, ompi_topo_base_framework.framework_output,
                                      "", "", comm_pattern[i], size);
                }
#endif

                aff_mat = tm_build_affinity_mat(comm_pattern, size);
                comm_tree = tm_build_tree_from_topology(tm_topology, aff_mat, NULL, NULL);
                sol = tm_compute_mapping(tm_topology, comm_tree);

                k = (int *)calloc(sol->k_length, sizeof(int));
                for(idx = 0 ; idx < (int)sol->k_length ; idx++)
                    k[idx] = sol->k[idx][0];
#ifdef __DEBUG__
                opal_output_verbose(10, ompi_topo_base_framework.framework_output,
                                    "====> nb levels : %i\n", tm_topology->nb_levels);
                dump_int_array(10, ompi_topo_base_framework.framework_output,
                               "Rank permutation sigma/k : ", "", k, num_objs_total);
                assert(size == (int)sol->sigma_length);
                dump_int_array(10, ompi_topo_base_framework.framework_output,
                               "Matching : ", "", sol->sigma, sol->sigma_length);
#endif
                free(obj_mapping);
                free(comm_pattern);
                free(aff_mat->sum_row);
                free(aff_mat);
                tm_free_solution(sol);
                tm_free_tree(comm_tree);
                tm_free_topology(tm_topology);
            }
        }

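        /* Every process receives its new rank from rank 0 and the
         * communicator is split with a single color so that only the rank
         * order changes. */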
        if (OMPI_SUCCESS != (err = comm_old->c_coll->coll_scatter(k, 1, MPI_INT,
                                                                  &newrank, 1, MPI_INT,
                                                                  0, comm_old,
                                                                  comm_old->c_coll->coll_scatter_module))) {
            if (NULL != k) free(k);
            goto release_and_return;
        }

        if ( 0 == rank )
            free(k);

        if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, 0, newrank, newcomm, false))) {
            goto release_and_return;
        }

        (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
        (*newcomm)->c_topo = topo_module;
        (*newcomm)->c_topo->reorder = reorder;

    } else {
        int *grank_to_lrank = NULL, *lrank_to_grank = NULL, *marked = NULL;
        int node_position = 0, offset = 0, pos = 0;
        ompi_communicator_t *localcomm = NULL;

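        /*
         * Partially distributed reordering: split comm_old per node, gather
         * the node-local communication pattern on the node leader, let
         * TreeMatch reorder the local ranks only, and finally recombine the
         * per-node permutations into a global rank order.
         */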
        if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, colors[rank], rank,
                                                   &localcomm, false))) {
            goto release_and_return;
        }

        lrank_to_grank = (int *)calloc(num_procs_in_node, sizeof(int));
        if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_allgather(&rank, 1, MPI_INT,
                                                                     lrank_to_grank, 1, MPI_INT,
                                                                     localcomm, localcomm->c_coll->coll_allgather_module))) {
            free(lrank_to_grank);
            ompi_comm_free(&localcomm);
            goto release_and_return;
        }

        grank_to_lrank = (int *)malloc(size * sizeof(int));
        for(i = 0 ; i < size ; grank_to_lrank[i++] = -1);
        for(i = 0 ; i < num_procs_in_node ; i++)
            grank_to_lrank[lrank_to_grank[i]] = i;

        if (rank == lindex_to_grank[0]) {
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "========== Partially Distributed Reordering ========= \n"));
            local_pattern = (double *)calloc(num_procs_in_node * num_procs_in_node, sizeof(double));
        } else {
            local_pattern = (double *)calloc(num_procs_in_node, sizeof(double));
        }

        if( true == topo->weighted ) {
            for(i = 0; i < topo->indegree; i++)
                if (grank_to_lrank[topo->in[i]] != -1)
                    local_pattern[grank_to_lrank[topo->in[i]]] += topo->inw[i];
            for(i = 0; i < topo->outdegree; i++)
                if (grank_to_lrank[topo->out[i]] != -1)
                    local_pattern[grank_to_lrank[topo->out[i]]] += topo->outw[i];
        }
        if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_gather((rank == lindex_to_grank[0] ? MPI_IN_PLACE : local_pattern),
                                                                  num_procs_in_node, MPI_DOUBLE,
                                                                  local_pattern, num_procs_in_node, MPI_DOUBLE,
                                                                  0, localcomm, localcomm->c_coll->coll_gather_module))) {
            free(lrank_to_grank);
            ompi_comm_free(&localcomm);
            free(grank_to_lrank);
            goto release_and_return;
        }

        if (rank == lindex_to_grank[0]) {
            tm_topology_t *tm_topology = NULL;
            tm_tree_t *comm_tree = NULL;
            tm_solution_t *sol = NULL;
            tm_affinity_mat_t *aff_mat = NULL;
            double **comm_pattern = NULL;
            int *obj_to_rank_in_lcomm = NULL;

            comm_pattern = (double **)malloc(num_procs_in_node*sizeof(double *));
            for( i = 0; i < num_procs_in_node; i++ ) {
                comm_pattern[i] = local_pattern + i * num_procs_in_node;
            }

            for( i = 0; i < num_procs_in_node ; i++ )
                for( j = i; j < num_procs_in_node ; j++ ) {
                    comm_pattern[i][j] = (comm_pattern[i][j] + comm_pattern[j][i]) / 2;
                    comm_pattern[j][i] = comm_pattern[i][j];
                }

#ifdef __DEBUG__
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "========== COMM PATTERN ============= \n"));
            for(i = 0 ; i < num_procs_in_node ; i++){
                opal_output_verbose(10, ompi_topo_base_framework.framework_output, " %i : ", i);
                dump_double_array(10, ompi_topo_base_framework.framework_output,
                                  "", "", comm_pattern[i], num_procs_in_node);
            }
            opal_output_verbose(10, ompi_topo_base_framework.framework_output,
                                "======================= \n");
#endif

            tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
            tm_topology->nb_levels = numlevels;
            tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
            tm_topology->nb_nodes = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));

            for(i = 0 ; i < tm_topology->nb_levels ; i++){
                int nb_objs = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, tracker[i]->depth);
                tm_topology->nb_nodes[i] = nb_objs;
                tm_topology->arity[i] = tracker[i]->arity;
            }

#ifdef __DEBUG__
            assert(num_objs_in_node == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);
#endif

            obj_to_rank_in_lcomm = (int *)malloc(num_objs_in_node*sizeof(int));
            for(i = 0 ; i < num_objs_in_node ; i++) {
                obj_to_rank_in_lcomm[i] = -1;
                object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, i);
                for( j = 0; j < num_procs_in_node ; j++ )
                    if(localrank_to_objnum[j] == (int)(object->logical_index)) {
                        obj_to_rank_in_lcomm[i] = j;
                        break;
                    }
            }

            tm_topology->node_id = (int *)malloc(num_objs_in_node*sizeof(int));
            tm_topology->node_rank = (int *)malloc(num_objs_in_node*sizeof(int));
            for(i = 0 ; i < num_objs_in_node; i++)
                tm_topology->node_id[i] = tm_topology->node_rank[i] = -1;

            for( i = 0 ; i < num_objs_in_node ; i++ ) {
                tm_topology->node_id[i] = obj_to_rank_in_lcomm[i];
                if( obj_to_rank_in_lcomm[i] != -1)
                    tm_topology->node_rank[obj_to_rank_in_lcomm[i]] = i;
            }

            tm_topology->cost = (double*)calloc(tm_topology->nb_levels, sizeof(double));

            tm_topology->nb_proc_units = num_objs_in_node;
            tm_topology->nb_constraints = 0;

            for(i = 0; i < num_objs_in_node ; i++)
                if (obj_to_rank_in_lcomm[i] != -1)
                    tm_topology->nb_constraints++;

            tm_topology->constraints = (int *)calloc(tm_topology->nb_constraints, sizeof(int));
            for(idx = 0, i = 0; i < num_objs_in_node ; i++)
                if (obj_to_rank_in_lcomm[i] != -1)
                    tm_topology->constraints[idx++] = obj_to_rank_in_lcomm[i];

            tm_topology->oversub_fact = 1;

#ifdef __DEBUG__
            assert(num_objs_in_node == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "Levels in topo : %i | num procs in node : %i\n",
                                 tm_topology->nb_levels, num_procs_in_node));
            for(i = 0; i < tm_topology->nb_levels ; i++) {
                OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                     "Nb objs for level %i : %lu | arity %i\n ",
                                     i, tm_topology->nb_nodes[i], tm_topology->arity[i]));
            }
            dump_int_array(10, ompi_topo_base_framework.framework_output,
                           "", "Obj id ", tm_topology->node_id, tm_topology->nb_nodes[tm_topology->nb_levels-1]);
            tm_display_topology(tm_topology);
#endif

            aff_mat = tm_build_affinity_mat(comm_pattern, num_procs_in_node);
            comm_tree = tm_build_tree_from_topology(tm_topology, aff_mat, NULL, NULL);
            sol = tm_compute_mapping(tm_topology, comm_tree);

            assert((int)sol->k_length == num_objs_in_node);

            k = (int *)calloc(sol->k_length, sizeof(int));
            for(idx = 0 ; idx < (int)sol->k_length ; idx++)
                k[idx] = sol->k[idx][0];

#ifdef __DEBUG__
            OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
                                 "====> nb levels : %i\n", tm_topology->nb_levels));
            dump_int_array(10, ompi_topo_base_framework.framework_output,
                           "Rank permutation sigma/k : ", "", k, num_procs_in_node);
            assert(num_procs_in_node == (int)sol->sigma_length);
            dump_int_array(10, ompi_topo_base_framework.framework_output,
                           "Matching : ", "", sol->sigma, sol->sigma_length);
#endif
            free(obj_to_rank_in_lcomm);
            free(aff_mat->sum_row);
            free(aff_mat);
            free(comm_pattern);
            tm_free_solution(sol);
            tm_free_tree(comm_tree);
            tm_free_topology(tm_topology);
        }

        if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_scatter(k, 1, MPI_INT,
                                                                   &newrank, 1, MPI_INT,
                                                                   0, localcomm,
                                                                   localcomm->c_coll->coll_scatter_module))) {
            if (NULL != k) free(k);
            ompi_comm_free(&localcomm);
            free(lrank_to_grank);
            free(grank_to_lrank);
            goto release_and_return;
        }

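        /*
         * newrank is only unique within this node: shift it by the number of
         * processes living on the nodes that appear before ours in the
         * colors array before performing the global split.
         */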
        marked = (int *)malloc((num_nodes-1)*sizeof(int));
        for(idx = 0 ; idx < num_nodes - 1 ; idx++)
            marked[idx] = -1;

        while( (node_position != rank) && (colors[node_position] != colors[rank])) {
            for(idx = 0; idx < pos; idx++)
                if( marked[idx] == colors[node_position] )
                    goto next_iter;

            for(; idx < size; idx++)
                if(colors[idx] == colors[node_position])
                    offset++;
            marked[pos++] = colors[node_position];
 next_iter:
            node_position++;
        }
        newrank += offset;
        free(marked);

        if (rank == lindex_to_grank[0])
            free(k);

        if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, 0, newrank, newcomm, false))) {
            ompi_comm_free(&localcomm);
            free(lrank_to_grank);
            free(grank_to_lrank);
            goto release_and_return;
        }

        (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
        (*newcomm)->c_topo = topo_module;
        (*newcomm)->c_topo->reorder = reorder;

        free(grank_to_lrank);
        free(lrank_to_grank);
    }

 release_and_return:
    if (NULL != reqs) free(reqs);
    if (NULL != tracker) free(tracker);
    if (NULL != local_pattern) free(local_pattern);
    free(colors);
    if (NULL != lindex_to_grank) free(lindex_to_grank);
    if (NULL != nodes_roots) free(nodes_roots);
    if (NULL != localrank_to_objnum) free(localrank_to_objnum);
    if( NULL != set) hwloc_bitmap_free(set);

    if( OMPI_SUCCESS != err ) goto fallback;
    return OMPI_SUCCESS;
}