This source file includes the following definitions.
- init
- finalize
- delete_route
- update_route
- get_route
- route_lost
- route_is_defined
- set_lifeline
- binomial_tree
- update_routing_plan
- get_routing_list
- num_routes
- binomial_ft_event
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 #include "orte_config.h"
17 #include "orte/constants.h"
18
19 #include <stddef.h>
20
21 #include "opal/dss/dss.h"
22 #include "opal/class/opal_pointer_array.h"
23 #include "opal/class/opal_bitmap.h"
24 #include "opal/util/bit_ops.h"
25 #include "opal/util/output.h"
26
27 #include "orte/mca/errmgr/errmgr.h"
28 #include "orte/mca/ess/ess.h"
29 #include "orte/mca/rml/rml.h"
30 #include "orte/mca/rml/rml_types.h"
31 #include "orte/util/name_fns.h"
32 #include "orte/runtime/orte_globals.h"
33 #include "orte/runtime/orte_wait.h"
34 #include "orte/runtime/runtime.h"
35 #include "orte/runtime/data_type_support/orte_dt_support.h"
36
37 #include "orte/mca/rml/base/rml_contact.h"
38
39 #include "orte/mca/routed/base/base.h"
40 #include "routed_binomial.h"
41
/*
 * Forward declarations of the module entry points. All of them are
 * file-local; they are referenced from the orte_routed_binomial_module
 * function table.
 */
static int init(void);
static int finalize(void);
static int delete_route(orte_process_name_t *proc);
static int update_route(orte_process_name_t *target,
                        orte_process_name_t *route);
static orte_process_name_t get_route(orte_process_name_t *target);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static void update_routing_plan(void);
static void get_routing_list(opal_list_t *coll);
static int set_lifeline(orte_process_name_t *proc);
static size_t num_routes(void);

/* Only declared when OPAL_ENABLE_FT_CR is 1. */
#if OPAL_ENABLE_FT_CR == 1
static int binomial_ft_event(int state);
#endif
58
59 orte_routed_module_t orte_routed_binomial_module = {
60 .initialize = init,
61 .finalize = finalize,
62 .delete_route = delete_route,
63 .update_route = update_route,
64 .get_route = get_route,
65 .route_lost = route_lost,
66 .route_is_defined = route_is_defined,
67 .set_lifeline = set_lifeline,
68 .update_routing_plan = update_routing_plan,
69 .get_routing_list = get_routing_list,
70 .num_routes = num_routes,
71 #if OPAL_ENABLE_FT_CR == 1
72 .ft_event = binomial_ft_event
73 #else
74 NULL
75 #endif
76 };
77
78
/* Process whose loss route_lost() reports as fatal; NULL when none is set. */
static orte_process_name_t *lifeline=NULL;
/* Backing storage for a lifeline installed through set_lifeline(). */
static orte_process_name_t local_lifeline;
/* Number of direct children appended to my_children by binomial_tree(). */
static int num_children;
/* List of orte_routed_tree_t entries, one per direct child of this process. */
static opal_list_t my_children;
/* Cleared by update_route() once the HNP is to be reached via another
 * process; consulted by get_route() when routing to the HNP. */
static bool hnp_direct=true;
84
85 static int init(void)
86 {
87 lifeline = NULL;
88
89 if (ORTE_PROC_IS_DAEMON) {
90
91 if (orte_static_ports) {
92 lifeline = ORTE_PROC_MY_PARENT;
93 } else {
94
95 lifeline = ORTE_PROC_MY_HNP;
96 }
97 ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
98 } else if (ORTE_PROC_IS_APP) {
99
100
101 if (NULL == orte_process_info.my_daemon_uri) {
102 return ORTE_ERR_TAKE_NEXT_OPTION;
103 }
104
105 lifeline = ORTE_PROC_MY_DAEMON;
106 orte_routing_is_enabled = true;
107 }
108
109
110 OBJ_CONSTRUCT(&my_children, opal_list_t);
111 num_children = 0;
112
113 return ORTE_SUCCESS;
114 }
115
116 static int finalize(void)
117 {
118 opal_list_item_t *item;
119
120 lifeline = NULL;
121
122
123 while (NULL != (item = opal_list_remove_first(&my_children))) {
124 OBJ_RELEASE(item);
125 }
126 OBJ_DESTRUCT(&my_children);
127 num_children = 0;
128
129 return ORTE_SUCCESS;
130 }
131
132 static int delete_route(orte_process_name_t *proc)
133 {
134 if (proc->jobid == ORTE_JOBID_INVALID ||
135 proc->vpid == ORTE_VPID_INVALID) {
136 return ORTE_ERR_BAD_PARAM;
137 }
138
139
140
141
142 if (!ORTE_PROC_IS_HNP && !ORTE_PROC_IS_DAEMON &&
143 !ORTE_PROC_IS_TOOL) {
144 return ORTE_SUCCESS;
145 }
146
147 OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
148 "%s routed_binomial_delete_route for %s",
149 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
150 ORTE_NAME_PRINT(proc)));
151
152
153
154
155
156
157
158 return ORTE_SUCCESS;
159 }
160
161 static int update_route(orte_process_name_t *target,
162 orte_process_name_t *route)
163 {
164 if (target->jobid == ORTE_JOBID_INVALID ||
165 target->vpid == ORTE_VPID_INVALID) {
166 return ORTE_ERR_BAD_PARAM;
167 }
168
169
170
171
172 if (ORTE_PROC_IS_APP) {
173 return ORTE_SUCCESS;
174 }
175
176 OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
177 "%s routed_binomial_update: %s --> %s",
178 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
179 ORTE_NAME_PRINT(target),
180 ORTE_NAME_PRINT(route)));
181
182
183
184
185
186
187 if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
188 OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
189 hnp_direct = false;
190 return ORTE_SUCCESS;
191 }
192
193 return ORTE_SUCCESS;
194 }
195
196
197 static orte_process_name_t get_route(orte_process_name_t *target)
198 {
199 orte_process_name_t *ret, daemon;
200 opal_list_item_t *item;
201 orte_routed_tree_t *child;
202
203 if (!orte_routing_is_enabled) {
204 ret = target;
205 goto found;
206 }
207
208
209 daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
210 daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;
211
212 if (target->jobid == ORTE_JOBID_INVALID ||
213 target->vpid == ORTE_VPID_INVALID) {
214 ret = ORTE_NAME_INVALID;
215 goto found;
216 }
217
218
219 if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
220 ret = target;
221 goto found;
222 }
223
224
225 if (ORTE_PROC_IS_APP) {
226 ret = ORTE_PROC_MY_DAEMON;
227 goto found;
228 }
229
230
231
232
233 if (ORTE_PROC_IS_TOOL) {
234 if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
235 ret = target;
236 goto found;
237 } else {
238 ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
239 ret = &daemon;
240 goto found;
241 }
242 }
243
244
245 if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
246 if (!hnp_direct || orte_static_ports) {
247 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
248 "%s routing to the HNP through my parent %s",
249 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
250 ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
251 ret = ORTE_PROC_MY_PARENT;
252 goto found;
253 } else {
254 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
255 "%s routing direct to the HNP",
256 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
257 ret = ORTE_PROC_MY_HNP;
258 goto found;
259 }
260 }
261
262
263 daemon.jobid = ORTE_PROC_MY_NAME->jobid;
264
265 if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
266
267 ret = ORTE_NAME_INVALID;
268 goto found;
269 }
270
271
272 if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
273 ret = target;
274 goto found;
275 }
276
277
278 for (item = opal_list_get_first(&my_children);
279 item != opal_list_get_end(&my_children);
280 item = opal_list_get_next(item)) {
281 child = (orte_routed_tree_t*)item;
282 if (child->vpid == daemon.vpid) {
283
284 ret = &daemon;
285 goto found;
286 }
287
288 if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
289
290 daemon.vpid = child->vpid;
291
292 ret = &daemon;
293 goto found;
294 }
295 }
296
297
298
299
300 daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
301
302 ret = &daemon;
303
304 found:
305 OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
306 "%s routed_binomial_get(%s) --> %s",
307 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
308 ORTE_NAME_PRINT(target),
309 ORTE_NAME_PRINT(ret)));
310
311 return *ret;
312 }
313
314 static int route_lost(const orte_process_name_t *route)
315 {
316 opal_list_item_t *item;
317 orte_routed_tree_t *child;
318
319 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
320 "%s route to %s lost",
321 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
322 ORTE_NAME_PRINT(route)));
323
324
325
326
327
328
329 if (!orte_finalizing &&
330 NULL != lifeline &&
331 OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
332 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
333 "%s routed:binomial: Connection to lifeline %s lost",
334 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
335 ORTE_NAME_PRINT(lifeline)));
336 return ORTE_ERR_FATAL;
337 }
338
339
340
341
342 if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) &&
343 route->jobid == ORTE_PROC_MY_NAME->jobid) {
344 for (item = opal_list_get_first(&my_children);
345 item != opal_list_get_end(&my_children);
346 item = opal_list_get_next(item)) {
347 child = (orte_routed_tree_t*)item;
348 if (child->vpid == route->vpid) {
349 OPAL_OUTPUT_VERBOSE((4, orte_routed_base_framework.framework_output,
350 "%s routed_binomial: removing route to child daemon %s",
351 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
352 ORTE_NAME_PRINT(route)));
353 opal_list_remove_item(&my_children, item);
354 OBJ_RELEASE(item);
355 return ORTE_SUCCESS;
356 }
357 }
358 }
359
360
361 return ORTE_SUCCESS;
362 }
363
364
365 static bool route_is_defined(const orte_process_name_t *target)
366 {
367
368 if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) {
369 return false;
370 }
371
372 return true;
373 }
374
375 static int set_lifeline(orte_process_name_t *proc)
376 {
377
378
379
380 local_lifeline.jobid = proc->jobid;
381 local_lifeline.vpid = proc->vpid;
382 lifeline = &local_lifeline;
383
384 return ORTE_SUCCESS;
385 }
386
387 static int binomial_tree(int rank, int parent, int me, int num_procs,
388 int *nchildren, opal_list_t *childrn,
389 opal_bitmap_t *relatives, bool mine)
390 {
391 int i, bitmap, peer, hibit, mask, found;
392 orte_routed_tree_t *child;
393 opal_bitmap_t *relations;
394
395 OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
396 "%s routed:binomial rank %d parent %d me %d num_procs %d",
397 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
398 rank, parent, me, num_procs));
399
400
401 if (me == rank) {
402 bitmap = opal_cube_dim(num_procs);
403
404 hibit = opal_hibit(rank, bitmap);
405 --bitmap;
406
407 for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
408 peer = rank | mask;
409 if (peer < num_procs) {
410 child = OBJ_NEW(orte_routed_tree_t);
411 child->vpid = peer;
412 OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
413 "%s routed:binomial %d found child %s",
414 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
415 rank,
416 ORTE_VPID_PRINT(child->vpid)));
417
418 if (mine) {
419
420 opal_list_append(childrn, &child->super);
421 (*nchildren)++;
422
423 opal_bitmap_init(&child->relatives, num_procs);
424
425
426 relations = &child->relatives;
427 } else {
428
429 opal_bitmap_set_bit(relatives, peer);
430
431 relations = relatives;
432 }
433
434 binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false);
435 }
436 }
437 return parent;
438 }
439
440
441 OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
442 "%s routed:binomial find children of rank %d",
443 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank));
444 bitmap = opal_cube_dim(num_procs);
445
446 hibit = opal_hibit(rank, bitmap);
447 --bitmap;
448
449 for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
450 peer = rank | mask;
451 OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
452 "%s routed:binomial find children checking peer %d",
453 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer));
454 if (peer < num_procs) {
455 OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
456 "%s routed:binomial find children computing tree",
457 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
458
459 if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine))) {
460 OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
461 "%s routed:binomial find children returning found value %d",
462 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), found));
463 return found;
464 }
465 }
466 }
467 return -1;
468 }
469
470 static void update_routing_plan(void)
471 {
472 orte_routed_tree_t *child;
473 int j;
474 opal_list_item_t *item;
475
476
477
478
479 if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
480 return;
481 }
482
483
484 while (NULL != (item = opal_list_remove_first(&my_children))) {
485 OBJ_RELEASE(item);
486 }
487 num_children = 0;
488
489
490
491
492 ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid,
493 orte_process_info.max_procs,
494 &num_children, &my_children, NULL, true);
495
496 if (0 < opal_output_get_verbosity(orte_routed_base_framework.framework_output)) {
497 opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children);
498 for (item = opal_list_get_first(&my_children);
499 item != opal_list_get_end(&my_children);
500 item = opal_list_get_next(item)) {
501 child = (orte_routed_tree_t*)item;
502 opal_output(0, "%s: \tchild %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->vpid);
503 for (j=0; j < (int)orte_process_info.max_procs; j++) {
504 if (opal_bitmap_is_set_bit(&child->relatives, j)) {
505 opal_output(0, "%s: \t\trelation %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
506 }
507 }
508 }
509 }
510 }
511
512 static void get_routing_list(opal_list_t *coll)
513 {
514
515
516
517
518 if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
519 return;
520 }
521
522 orte_routed_base_xcast_routing(coll, &my_children);
523 }
524
525 static size_t num_routes(void)
526 {
527 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
528 "%s num routes %d",
529 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
530 (int)opal_list_get_size(&my_children)));
531 return opal_list_get_size(&my_children);
532 }
533
#if OPAL_ENABLE_FT_CR == 1
/*
 * Fault-tolerance event hook.
 *
 * This module has no work to do for any state (OPAL_CRS_CHECKPOINT,
 * OPAL_CRS_CONTINUE, OPAL_CRS_TERM or anything else), so the event is
 * acknowledged without action.
 *
 * Always returns ORTE_SUCCESS.
 */
static int binomial_ft_event(int state)
{
    /* intentionally unused: every state is a no-op for this module */
    (void)state;

    return ORTE_SUCCESS;
}
#endif
556