This source file includes the following definitions.
- failed_cmd
- orte_plm_base_orted_exit
- orte_plm_base_orted_terminate_job
- orte_plm_base_orted_kill_local_procs
- orte_plm_base_orted_signal_local_procs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 #include "orte_config.h"
24 #include "orte/constants.h"
25
26 #include <string.h>
27 #ifdef HAVE_SYS_TIME_H
28 #include <sys/time.h>
29 #endif
30
31
32 #include "opal/dss/dss.h"
33 #include "opal/mca/event/event.h"
34
35 #include "orte/mca/odls/odls_types.h"
36 #include "orte/mca/grpcomm/base/base.h"
37 #include "orte/mca/errmgr/errmgr.h"
38 #include "orte/mca/ess/ess.h"
39 #include "orte/mca/rml/rml.h"
40 #include "orte/mca/rml/rml_types.h"
41 #include "orte/runtime/orte_globals.h"
42 #include "orte/runtime/orte_wait.h"
43 #include "orte/util/name_fns.h"
44 #include "orte/util/proc_info.h"
45 #include "orte/mca/state/state.h"
46 #include "orte/runtime/orte_wait.h"
47 #include "orte/orted/orted.h"
48
49 #include "orte/mca/plm/base/base.h"
50 #include "orte/mca/plm/base/plm_private.h"
51
52 #if 0
53 static void failed_cmd(int fd, short event, void *cbdata)
54 {
55 orte_timer_t *tm = (orte_timer_t*)cbdata;
56
57
58
59
60 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
61 "%s plm:base:orted_cmd command timed out",
62 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
63 OBJ_RELEASE(tm);
64
65
66
67 }
68 #endif
69
70 int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
71 {
72 int rc;
73 opal_buffer_t *cmd;
74 orte_daemon_cmd_flag_t cmmnd;
75 orte_grpcomm_signature_t *sig;
76
77 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
78 "%s plm:base:orted_cmd sending orted_exit commands",
79 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
80
81
82 orte_orteds_term_ordered = true;
83 cmmnd = command;
84
85
86
87
88
89
90 if (orte_abnormal_term_ordered ||
91 orte_never_launched ||
92 !orte_routing_is_enabled) {
93 cmmnd = ORTE_DAEMON_HALT_VM_CMD;
94 }
95
96
97 cmd = OBJ_NEW(opal_buffer_t);
98
99 if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &cmmnd, 1, ORTE_DAEMON_CMD))) {
100 ORTE_ERROR_LOG(rc);
101 OBJ_RELEASE(cmd);
102 return rc;
103 }
104
105 sig = OBJ_NEW(orte_grpcomm_signature_t);
106 sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
107 sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
108 sig->signature[0].vpid = ORTE_VPID_WILDCARD;
109 if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, cmd))) {
110 ORTE_ERROR_LOG(rc);
111 }
112 OBJ_RELEASE(cmd);
113 OBJ_RELEASE(sig);
114
115 #if 0
116
117
118
119 if (orte_abnormal_term_ordered) {
120 ORTE_DETECT_TIMEOUT(orte_process_info.num_procs, 100, 3, failed_cmd, NULL);
121 }
122 #endif
123 return rc;
124 }
125
126
127 int orte_plm_base_orted_terminate_job(orte_jobid_t jobid)
128 {
129 opal_pointer_array_t procs;
130 orte_proc_t proc;
131 int rc;
132
133 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
134 "%s plm:base:orted_terminate job %s",
135 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
136 ORTE_JOBID_PRINT(jobid)));
137
138 OBJ_CONSTRUCT(&procs, opal_pointer_array_t);
139 opal_pointer_array_init(&procs, 1, 1, 1);
140 OBJ_CONSTRUCT(&proc, orte_proc_t);
141 proc.name.jobid = jobid;
142 proc.name.vpid = ORTE_VPID_WILDCARD;
143 opal_pointer_array_add(&procs, &proc);
144 if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) {
145 ORTE_ERROR_LOG(rc);
146 }
147 OBJ_DESTRUCT(&procs);
148 OBJ_DESTRUCT(&proc);
149 return rc;
150 }
151
152 int orte_plm_base_orted_kill_local_procs(opal_pointer_array_t *procs)
153 {
154 int rc;
155 opal_buffer_t *cmd;
156 orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS;
157 int v;
158 orte_proc_t *proc;
159 orte_grpcomm_signature_t *sig;
160
161 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
162 "%s plm:base:orted_cmd sending kill_local_procs cmds",
163 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
164
165 cmd = OBJ_NEW(opal_buffer_t);
166
167 if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
168 ORTE_ERROR_LOG(rc);
169 OBJ_RELEASE(cmd);
170 return rc;
171 }
172
173
174 if (NULL != procs) {
175 for (v=0; v < procs->size; v++) {
176 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procs, v))) {
177 continue;
178 }
179 if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &(proc->name), 1, ORTE_NAME))) {
180 ORTE_ERROR_LOG(rc);
181 OBJ_RELEASE(cmd);
182 return rc;
183 }
184 }
185 }
186
187 sig = OBJ_NEW(orte_grpcomm_signature_t);
188 sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
189 sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
190 sig->signature[0].vpid = ORTE_VPID_WILDCARD;
191 if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, cmd))) {
192 ORTE_ERROR_LOG(rc);
193 }
194 OBJ_RELEASE(cmd);
195 OBJ_RELEASE(sig);
196
197
198 return rc;
199 }
200
201
202 int orte_plm_base_orted_signal_local_procs(orte_jobid_t job, int32_t signal)
203 {
204 int rc;
205 opal_buffer_t cmd;
206 orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS;
207 orte_grpcomm_signature_t *sig;
208
209 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
210 "%s plm:base:orted_cmd sending signal_local_procs cmds",
211 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
212
213 OBJ_CONSTRUCT(&cmd, opal_buffer_t);
214
215
216 if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
217 ORTE_ERROR_LOG(rc);
218 OBJ_DESTRUCT(&cmd);
219 return rc;
220 }
221
222
223 if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &job, 1, ORTE_JOBID))) {
224 ORTE_ERROR_LOG(rc);
225 OBJ_DESTRUCT(&cmd);
226 return rc;
227 }
228
229
230 if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &signal, 1, OPAL_INT32))) {
231 ORTE_ERROR_LOG(rc);
232 OBJ_DESTRUCT(&cmd);
233 return rc;
234 }
235
236
237 sig = OBJ_NEW(orte_grpcomm_signature_t);
238 sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
239 sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
240 sig->signature[0].vpid = ORTE_VPID_WILDCARD;
241 if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, &cmd))) {
242 ORTE_ERROR_LOG(rc);
243 }
244 OBJ_DESTRUCT(&cmd);
245 OBJ_RELEASE(sig);
246
247
248 return ORTE_SUCCESS;
249 }