This source file includes the following definitions.
- sample
1
2
3
4
5
6
7
8
9
10
11
12
13
14 #include "orte_config.h"
15 #include "orte/constants.h"
16 #include "orte/types.h"
17
18 #include <errno.h>
19 #ifdef HAVE_UNISTD_H
20 #include <unistd.h>
21 #endif
22 #ifdef HAVE_STRING_H
23 #include <string.h>
24 #endif
25 #include <stdio.h>
26 #ifdef HAVE_SIGNAL_H
27 #include <signal.h>
28 #endif
29
30 #include "opal_stdint.h"
31 #include "opal/util/alfg.h"
32 #include "opal/util/output.h"
33
34 #include "orte/util/error_strings.h"
35 #include "orte/util/name_fns.h"
36 #include "orte/mca/errmgr/errmgr.h"
37 #include "orte/runtime/orte_globals.h"
38
39 #include "orte/mca/sensor/base/base.h"
40 #include "orte/mca/sensor/base/sensor_private.h"
41 #include "sensor_ft_tester.h"
42
43
44 static void sample(void);
45
46
47 orte_sensor_base_module_t orte_sensor_ft_tester_module = {
48 NULL,
49 NULL,
50 NULL,
51 NULL,
52 sample,
53 NULL
54 };
55
56 static void sample(void)
57 {
58 float prob;
59 orte_proc_t *child;
60 int i;
61
62 OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
63 "%s sample:ft_tester considering killing something",
64 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
65
66
67 if (ORTE_PROC_IS_DAEMON &&
68 0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
69 OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
70 "%s sample:ft_tester considering killing me!",
71 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
72
73 prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
74 if (prob < mca_sensor_ft_tester_component.daemon_fail_prob) {
75
76 OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
77 "%s sample:ft_tester committing suicide",
78 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
79 orte_errmgr.abort(1, NULL);
80 return;
81 }
82 }
83
84 if (0 < mca_sensor_ft_tester_component.fail_prob) {
85
86 for (i=0; i < orte_local_children->size; i++) {
87 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
88 continue;
89 }
90 if (!child->alive || 0 == child->pid ||
91 ORTE_PROC_STATE_UNTERMINATED < child->state) {
92 OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
93 "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
94 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
95 ORTE_NAME_PRINT(&child->name),
96 child->alive ? "TRUE" : "FALSE",
97 (unsigned long)child->pid, orte_proc_state_to_str(child->state)));
98 continue;
99 }
100
101 prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
102 OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
103 "%s sample:ft_tester child: %s dice: %f prob %f",
104 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
105 ORTE_NAME_PRINT(&child->name),
106 prob, mca_sensor_ft_tester_component.fail_prob));
107 if (prob < mca_sensor_ft_tester_component.fail_prob) {
108
109 OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
110 "%s sample:ft_tester killing %s",
111 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
112 ORTE_NAME_PRINT(&child->name)));
113 kill(child->pid, SIGTERM);
114
115 if (!mca_sensor_ft_tester_component.multi_fail) {
116 break;
117 }
118 }
119 }
120 }
121 }