This source file includes the following definitions.
- init
- finalize
- delete_route
- update_route
- get_route
- route_lost
- route_is_defined
- set_lifeline
- radix_tree
- update_routing_plan
- get_routing_list
- num_routes
- radix_ft_event
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 #include "orte_config.h"
18 #include "orte/constants.h"
19
20 #include <stddef.h>
21
22 #include "opal/dss/dss.h"
23 #include "opal/class/opal_hash_table.h"
24 #include "opal/class/opal_bitmap.h"
25 #include "opal/runtime/opal_progress.h"
26 #include "opal/util/output.h"
27
28 #include "orte/mca/errmgr/errmgr.h"
29 #include "orte/mca/ess/ess.h"
30 #include "orte/mca/rml/rml.h"
31 #include "orte/mca/rml/rml_types.h"
32 #include "orte/util/name_fns.h"
33 #include "orte/runtime/orte_globals.h"
34 #include "orte/runtime/orte_wait.h"
35 #include "orte/runtime/runtime.h"
36 #include "orte/runtime/data_type_support/orte_dt_support.h"
37
38 #include "orte/mca/rml/base/rml_contact.h"
39
40 #include "orte/mca/routed/base/base.h"
41 #include "routed_radix.h"
42
43
44 static int init(void);
45 static int finalize(void);
46 static int delete_route(orte_process_name_t *proc);
47 static int update_route(orte_process_name_t *target,
48 orte_process_name_t *route);
49 static orte_process_name_t get_route(orte_process_name_t *target);
50 static int route_lost(const orte_process_name_t *route);
51 static bool route_is_defined(const orte_process_name_t *target);
52 static void update_routing_plan(void);
53 static void get_routing_list(opal_list_t *coll);
54 static int set_lifeline(orte_process_name_t *proc);
55 static size_t num_routes(void);
56
57 #if OPAL_ENABLE_FT_CR == 1
58 static int radix_ft_event(int state);
59 #endif
60
61 orte_routed_module_t orte_routed_radix_module = {
62 .initialize = init,
63 .finalize = finalize,
64 .delete_route = delete_route,
65 .update_route = update_route,
66 .get_route = get_route,
67 .route_lost = route_lost,
68 .route_is_defined = route_is_defined,
69 .set_lifeline = set_lifeline,
70 .update_routing_plan = update_routing_plan,
71 .get_routing_list = get_routing_list,
72 .num_routes = num_routes,
73 #if OPAL_ENABLE_FT_CR == 1
74 .ft_event = radix_ft_event
75 #else
76 NULL
77 #endif
78 };
79
80
81 static orte_process_name_t *lifeline=NULL;
82 static orte_process_name_t local_lifeline;
83 static int num_children;
84 static opal_list_t my_children;
85 static bool hnp_direct=true;
86
87 static int init(void)
88 {
89 lifeline = NULL;
90
91 if (ORTE_PROC_IS_DAEMON) {
92
93 if (orte_static_ports) {
94 lifeline = ORTE_PROC_MY_PARENT;
95 } else {
96
97 lifeline = ORTE_PROC_MY_HNP;
98 }
99 ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
100 } else if (ORTE_PROC_IS_APP) {
101
102
103 if (NULL == orte_process_info.my_daemon_uri) {
104 return ORTE_ERR_TAKE_NEXT_OPTION;
105 }
106
107 lifeline = ORTE_PROC_MY_DAEMON;
108 orte_routing_is_enabled = true;
109 }
110
111
112 OBJ_CONSTRUCT(&my_children, opal_list_t);
113 num_children = 0;
114
115 return ORTE_SUCCESS;
116 }
117
118 static int finalize(void)
119 {
120 opal_list_item_t *item;
121
122 lifeline = NULL;
123
124
125 while (NULL != (item = opal_list_remove_first(&my_children))) {
126 OBJ_RELEASE(item);
127 }
128 OBJ_DESTRUCT(&my_children);
129 num_children = 0;
130
131 return ORTE_SUCCESS;
132 }
133
134 static int delete_route(orte_process_name_t *proc)
135 {
136 if (proc->jobid == ORTE_JOBID_INVALID ||
137 proc->vpid == ORTE_VPID_INVALID) {
138 return ORTE_ERR_BAD_PARAM;
139 }
140
141
142
143
144 if (!ORTE_PROC_IS_HNP && !ORTE_PROC_IS_DAEMON &&
145 !ORTE_PROC_IS_TOOL) {
146 return ORTE_SUCCESS;
147 }
148
149 OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
150 "%s routed_radix_delete_route for %s",
151 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
152 ORTE_NAME_PRINT(proc)));
153
154
155
156
157
158
159
160 return ORTE_SUCCESS;
161 }
162
163 static int update_route(orte_process_name_t *target,
164 orte_process_name_t *route)
165 {
166 if (target->jobid == ORTE_JOBID_INVALID ||
167 target->vpid == ORTE_VPID_INVALID) {
168 return ORTE_ERR_BAD_PARAM;
169 }
170
171
172
173
174 if (ORTE_PROC_IS_APP) {
175 return ORTE_SUCCESS;
176 }
177
178 OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
179 "%s routed_radix_update: %s --> %s",
180 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
181 ORTE_NAME_PRINT(target),
182 ORTE_NAME_PRINT(route)));
183
184
185
186
187
188
189 if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
190 OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
191 hnp_direct = false;
192 return ORTE_SUCCESS;
193 }
194
195 return ORTE_SUCCESS;
196 }
197
198
199 static orte_process_name_t get_route(orte_process_name_t *target)
200 {
201 orte_process_name_t *ret, daemon;
202 opal_list_item_t *item;
203 orte_routed_tree_t *child;
204
205 if (!orte_routing_is_enabled) {
206 ret = target;
207 goto found;
208 }
209
210
211 daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
212 daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;
213
214 if (target->jobid == ORTE_JOBID_INVALID ||
215 target->vpid == ORTE_VPID_INVALID) {
216 ret = ORTE_NAME_INVALID;
217 goto found;
218 }
219
220
221 if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
222 ret = target;
223 goto found;
224 }
225
226
227
228
229 if (ORTE_PROC_IS_TOOL) {
230 if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
231 ret = target;
232 goto found;
233 } else {
234 ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
235 ret = &daemon;
236 goto found;
237 }
238 }
239
240
241
242
243
244 if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
245 if (!hnp_direct || orte_static_ports) {
246 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
247 "%s routing to the HNP through my parent %s",
248 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
249 ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
250 ret = ORTE_PROC_MY_PARENT;
251 goto found;
252 } else {
253 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
254 "%s routing direct to the HNP",
255 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
256 ret = ORTE_PROC_MY_HNP;
257 goto found;
258 }
259 }
260
261
262 if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_PARENT, target)) {
263 ret = ORTE_PROC_MY_PARENT;
264 goto found;
265 }
266
267
268
269 if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
270 ret = target;
271 goto found;
272 }
273
274 daemon.jobid = ORTE_PROC_MY_NAME->jobid;
275
276 if (ORTE_PROC_MY_NAME->jobid == target->jobid) {
277
278 daemon.vpid = target->vpid;
279 } else {
280 if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
281 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
282 ret = ORTE_NAME_INVALID;
283 goto found;
284 }
285 }
286
287
288 if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
289 ret = target;
290 goto found;
291 } else {
292
293 for (item = opal_list_get_first(&my_children);
294 item != opal_list_get_end(&my_children);
295 item = opal_list_get_next(item)) {
296 child = (orte_routed_tree_t*)item;
297 if (child->vpid == daemon.vpid) {
298
299 ret = &daemon;
300 goto found;
301 }
302
303 if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
304
305 daemon.vpid = child->vpid;
306 ret = &daemon;
307 goto found;
308 }
309 }
310 }
311
312
313
314
315 daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
316
317 ret = &daemon;
318
319 found:
320 OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
321 "%s routed_radix_get(%s) --> %s",
322 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
323 ORTE_NAME_PRINT(target),
324 ORTE_NAME_PRINT(ret)));
325
326 return *ret;
327 }
328
329 static int route_lost(const orte_process_name_t *route)
330 {
331 opal_list_item_t *item;
332 orte_routed_tree_t *child;
333
334 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
335 "%s route to %s lost",
336 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
337 ORTE_NAME_PRINT(route)));
338
339
340
341
342
343
344 if (!orte_finalizing &&
345 NULL != lifeline &&
346 OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
347 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
348 "%s routed:radix: Connection to lifeline %s lost",
349 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
350 ORTE_NAME_PRINT(lifeline)));
351 return ORTE_ERR_FATAL;
352 }
353
354
355
356
357 if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) &&
358 route->jobid == ORTE_PROC_MY_NAME->jobid) {
359 for (item = opal_list_get_first(&my_children);
360 item != opal_list_get_end(&my_children);
361 item = opal_list_get_next(item)) {
362 child = (orte_routed_tree_t*)item;
363 if (child->vpid == route->vpid) {
364 opal_list_remove_item(&my_children, item);
365 OBJ_RELEASE(item);
366 return ORTE_SUCCESS;
367 }
368 }
369 }
370
371
372 return ORTE_SUCCESS;
373 }
374
375 static bool route_is_defined(const orte_process_name_t *target)
376 {
377
378 if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) {
379 return false;
380 }
381
382 return true;
383 }
384
385 static int set_lifeline(orte_process_name_t *proc)
386 {
387
388
389
390 local_lifeline.jobid = proc->jobid;
391 local_lifeline.vpid = proc->vpid;
392 lifeline = &local_lifeline;
393
394 return ORTE_SUCCESS;
395 }
396
397 static void radix_tree(int rank, int *num_children,
398 opal_list_t *children, opal_bitmap_t *relatives)
399 {
400 int i, peer, Sum, NInLevel;
401 orte_routed_tree_t *child;
402 opal_bitmap_t *relations;
403
404
405 Sum=1;
406 NInLevel=1;
407
408 while ( Sum < (rank+1) ) {
409 NInLevel *= mca_routed_radix_component.radix;
410 Sum += NInLevel;
411 }
412
413
414 peer = rank + NInLevel;
415 for (i = 0; i < mca_routed_radix_component.radix; i++) {
416 if (peer < (int)orte_process_info.num_procs) {
417 child = OBJ_NEW(orte_routed_tree_t);
418 child->vpid = peer;
419 if (NULL != children) {
420
421 opal_list_append(children, &child->super);
422 (*num_children)++;
423
424 opal_bitmap_init(&child->relatives, orte_process_info.num_procs);
425
426 relations = &child->relatives;
427 } else {
428
429 if (OPAL_SUCCESS != opal_bitmap_set_bit(relatives, peer)) {
430 opal_output(0, "%s Error: could not set relations bit!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
431 }
432
433 relations = relatives;
434 OBJ_RELEASE(child);
435 }
436
437 radix_tree(peer, NULL, NULL, relations);
438 }
439 peer += NInLevel;
440 }
441 }
442
443 static void update_routing_plan(void)
444 {
445 orte_routed_tree_t *child;
446 int j;
447 opal_list_item_t *item;
448 int Level,Sum,NInLevel,Ii;
449 int NInPrevLevel;
450
451
452
453
454 if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
455 return;
456 }
457
458
459 while (NULL != (item = opal_list_remove_first(&my_children))) {
460 OBJ_RELEASE(item);
461 }
462 num_children = 0;
463
464
465 Ii = ORTE_PROC_MY_NAME->vpid;
466 Level=0;
467 Sum=1;
468 NInLevel=1;
469
470 while ( Sum < (Ii+1) ) {
471 Level++;
472 NInLevel *= mca_routed_radix_component.radix;
473 Sum += NInLevel;
474 }
475 Sum -= NInLevel;
476
477 NInPrevLevel = NInLevel/mca_routed_radix_component.radix;
478
479 if( 0 == Ii ) {
480 ORTE_PROC_MY_PARENT->vpid = -1;
481 } else {
482 ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel;
483 ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel);
484 }
485
486
487
488
489 radix_tree(Ii, &num_children, &my_children, NULL);
490
491 if (0 < opal_output_get_verbosity(orte_routed_base_framework.framework_output)) {
492 opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children);
493 for (item = opal_list_get_first(&my_children);
494 item != opal_list_get_end(&my_children);
495 item = opal_list_get_next(item)) {
496 child = (orte_routed_tree_t*)item;
497 opal_output(0, "%s: \tchild %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->vpid);
498 for (j=0; j < (int)orte_process_info.num_procs; j++) {
499 if (opal_bitmap_is_set_bit(&child->relatives, j)) {
500 opal_output(0, "%s: \t\trelation %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
501 }
502 }
503 }
504 }
505 }
506
507 static void get_routing_list(opal_list_t *coll)
508 {
509
510
511
512 if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
513 return;
514 }
515
516 orte_routed_base_xcast_routing(coll, &my_children);
517 }
518
519 static size_t num_routes(void)
520 {
521 return opal_list_get_size(&my_children);
522 }
523
524 #if OPAL_ENABLE_FT_CR == 1
525 static int radix_ft_event(int state)
526 {
527 int ret, exit_status = ORTE_SUCCESS;
528
529
530 if(OPAL_CRS_CHECKPOINT == state) {
531 }
532
533 else if (OPAL_CRS_CONTINUE == state ) {
534 }
535 else if (OPAL_CRS_TERM == state ) {
536
537 }
538 else {
539
540 }
541
542 cleanup:
543 return exit_status;
544 }
545 #endif