This source file includes the following definitions.
- plm_lsf_init
- plm_lsf_launch_job
- launch_daemons
- plm_lsf_terminate_orteds
- plm_lsf_signal_job
- plm_lsf_finalize
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32 #include "orte_config.h"
33 #include "orte/constants.h"
34 #include "orte/types.h"
35
36 #include <sys/types.h>
37 #ifdef HAVE_UNISTD_H
38 #include <unistd.h>
39 #endif
40 #include <signal.h>
41 #include <stdlib.h>
42 #ifdef HAVE_SYS_TYPES_H
43 #include <sys/types.h>
44 #endif
45 #ifdef HAVE_SYS_TIME_H
46 #include <sys/time.h>
47 #endif
48 #ifdef HAVE_SYS_STAT_H
49 #include <sys/stat.h>
50 #endif
51 #ifdef HAVE_FCNTL_H
52 #include <fcntl.h>
53 #endif
54
55 #define SR1_PJOBS
56 #include <lsf/lsbatch.h>
57
58 #include "opal/mca/base/base.h"
59 #include "opal/mca/installdirs/installdirs.h"
60 #include "opal/util/argv.h"
61 #include "opal/util/output.h"
62 #include "opal/util/opal_environ.h"
63
64 #include "orte/util/show_help.h"
65 #include "orte/runtime/orte_globals.h"
66 #include "orte/runtime/orte_wait.h"
67 #include "orte/mca/errmgr/errmgr.h"
68 #include "orte/mca/rmaps/rmaps.h"
69 #include "orte/mca/state/state.h"
70 #include "orte/util/threads.h"
71
72 #include "orte/mca/plm/plm.h"
73 #include "orte/mca/plm/base/base.h"
74 #include "orte/mca/plm/base/plm_private.h"
75 #include "plm_lsf.h"
76
77
78
79
80
81 static int plm_lsf_init(void);
82 static int plm_lsf_launch_job(orte_job_t *jdata);
83 static int plm_lsf_terminate_orteds(void);
84 static int plm_lsf_signal_job(orte_jobid_t jobid, int32_t signal);
85 static int plm_lsf_finalize(void);
86
87
88
89
90
/*
 * Function table exported by the LSF launcher.
 *
 * The initializer is positional: the order of the entries must match the
 * member order of orte_plm_base_module_t, which is declared outside this
 * file (presumably in orte/mca/plm/plm.h -- TODO confirm). Entries that
 * start with plm_lsf_ are implemented in this file; the orte_plm_base_*
 * entries come from the shared PLM base code.
 */
orte_plm_base_module_t orte_plm_lsf_module = {
    plm_lsf_init,
    orte_plm_base_set_hnp_name,
    plm_lsf_launch_job,
    NULL,   /* no handler supplied for the fourth slot */
    orte_plm_base_orted_terminate_job,
    plm_lsf_terminate_orteds,
    orte_plm_base_orted_kill_local_procs,
    plm_lsf_signal_job,
    plm_lsf_finalize
};
102
103 static void launch_daemons(int fd, short args, void *cbdata);
104
105
106
107
108 int plm_lsf_init(void)
109 {
110 int rc;
111
112 if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
113 ORTE_ERROR_LOG(rc);
114 }
115
116 if (orte_do_not_launch) {
117
118 orte_plm_globals.daemon_nodes_assigned_at_launch = true;
119 } else {
120
121
122
123
124
125
126 orte_plm_globals.daemon_nodes_assigned_at_launch = false;
127 }
128
129
130 if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
131 launch_daemons, ORTE_SYS_PRI))) {
132 ORTE_ERROR_LOG(rc);
133 return rc;
134 }
135
136 return rc;
137 }
138
139
140
141
142
143 static int plm_lsf_launch_job(orte_job_t *jdata)
144 {
145 if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
146
147 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
148 } else {
149
150 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT);
151 }
152 return ORTE_SUCCESS;
153 }
154
155 static void launch_daemons(int fd, short args, void *cbdata)
156 {
157 orte_job_map_t *map;
158 size_t num_nodes;
159 char *param;
160 char **argv = NULL;
161 int argc;
162 int rc;
163 char** env = NULL;
164 char **nodelist_argv;
165 int nodelist_argc;
166 char *vpid_string;
167 int i;
168 char *cur_prefix;
169 int proc_vpid_index = 0;
170 bool failed_launch = true;
171 orte_app_context_t *app;
172 orte_node_t *node;
173 orte_std_cntr_t nnode;
174 orte_job_t *daemons;
175 orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
176 orte_job_t *jdata;
177
178 ORTE_ACQUIRE_OBJECT(state);
179 jdata = state->jdata;
180
181
182 daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
183 if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
184 ORTE_ERROR_LOG(rc);
185 goto cleanup;
186 }
187
188
189
190
191
192 if (orte_do_not_launch) {
193
194
195
196
197 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
198 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
199 OBJ_RELEASE(state);
200 return;
201 }
202
203 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
204 "%s plm:lsf: launching vm",
205 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
206
207
208
209 if (NULL == (map = daemons->map)) {
210 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
211 rc = ORTE_ERR_NOT_FOUND;
212 goto cleanup;
213 }
214
215 num_nodes = map->num_new_daemons;
216 if (0 == num_nodes) {
217
218
219
220
221 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
222 "%s plm:lsf: no new daemons to launch",
223 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
224 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
225 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
226 OBJ_RELEASE(state);
227 return;
228 }
229
230
231 nodelist_argv = NULL;
232 nodelist_argc = 0;
233
234 for (nnode=0; nnode < map->nodes->size; nnode++) {
235 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
236 continue;
237 }
238
239
240
241 if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
242 continue;
243 }
244
245
246
247
248 opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
249 }
250
251
252
253
254 argv = NULL;
255 argc = 0;
256
257
258
259
260
261
262 orte_plm_base_setup_orted_cmd(&argc, &argv);
263
264
265
266 orte_plm_base_orted_append_basic_args(&argc, &argv,
267 "lsf",
268 &proc_vpid_index);
269
270
271
272
273 rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
274 if (ORTE_SUCCESS != rc) {
275 opal_output(0, "plm_lsf: unable to get daemon vpid as string");
276 goto cleanup;
277 }
278 free(argv[proc_vpid_index]);
279 argv[proc_vpid_index] = strdup(vpid_string);
280 free(vpid_string);
281
282
283 mca_base_cmd_line_wrap_args(argv);
284
285 if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
286 param = opal_argv_join(argv, ' ');
287 if (NULL != param) {
288 opal_output(0, "plm:lsf: final top-level argv:");
289 opal_output(0, "plm:lsf: %s", param);
290 free(param);
291 }
292 }
293
294
295
296
297
298
299
300 cur_prefix = NULL;
301 for (i=0; i < jdata->apps->size; i++) {
302 char *app_prefix_dir=NULL;
303 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
304 continue;
305 }
306 if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING) &&
307 NULL != app_prefix_dir) {
308
309
310 if (NULL != cur_prefix &&
311 0 != strcmp (cur_prefix, app_prefix_dir)) {
312 orte_show_help("help-plm-lsf.txt", "multiple-prefixes",
313 true, cur_prefix, app_prefix_dir);
314 rc = ORTE_ERR_FAILED_TO_START;
315 goto cleanup;
316 }
317
318
319
320 if (NULL == cur_prefix) {
321 cur_prefix = strdup(app_prefix_dir);
322 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
323 "%s plm:lsf: Set prefix:%s",
324 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_prefix));
325 }
326 free(app_prefix_dir);
327 }
328 }
329
330
331 env = opal_argv_copy(orte_launch_environ);
332
333
334
335
336
337
338 orte_wait_disable();
339
340
341
342
343
344
345
346 if ( (rc = lsb_launch(nodelist_argv, argv, LSF_DJOB_REPLACE_ENV | LSF_DJOB_NOWAIT, env)) < 0) {
347 ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
348 char *flattened_nodelist = NULL;
349 flattened_nodelist = opal_argv_join(nodelist_argv, '\n');
350 orte_show_help("help-plm-lsf.txt", "lsb_launch-failed",
351 true, rc, lsberrno, lsb_sysmsg(),
352 opal_argv_count(nodelist_argv), flattened_nodelist);
353 free(flattened_nodelist);
354 rc = ORTE_ERR_FAILED_TO_START;
355 orte_wait_enable();
356 goto cleanup;
357 }
358 orte_wait_enable();
359
360
361 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
362 daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
363
364
365 failed_launch = false;
366
367 cleanup:
368 if (NULL != argv) {
369 opal_argv_free(argv);
370 }
371 if (NULL != env) {
372 opal_argv_free(env);
373 }
374
375
376 OBJ_RELEASE(state);
377
378
379 if (failed_launch) {
380 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
381 }
382 }
383
384
385
386
387
388 static int plm_lsf_terminate_orteds(void)
389 {
390 int rc;
391
392 if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
393 ORTE_ERROR_LOG(rc);
394 }
395
396 return rc;
397 }
398
399
400
401
402
403 static int plm_lsf_signal_job(orte_jobid_t jobid, int32_t signal)
404 {
405 int rc;
406
407
408 if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
409 ORTE_ERROR_LOG(rc);
410 }
411 return rc;
412 }
413
414
415 static int plm_lsf_finalize(void)
416 {
417 int rc;
418
419
420 if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
421 ORTE_ERROR_LOG(rc);
422 }
423
424 return ORTE_SUCCESS;
425 }