This source file includes the following definitions:
- plm_alps_init
- plm_alps_launch_job
- launch_daemons
- plm_alps_terminate_orteds
- plm_alps_signal_job
- plm_alps_finalize
- alps_wait_cb
- plm_alps_start_proc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32 #include "orte_config.h"
33 #include "orte/constants.h"
34 #include "orte/types.h"
35
36 #include <sys/types.h>
37 #ifdef HAVE_UNISTD_H
38 #include <unistd.h>
39 #endif
40 #include <signal.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #ifdef HAVE_SYS_TYPES_H
44 #include <sys/types.h>
45 #endif
46 #ifdef HAVE_SYS_TIME_H
47 #include <sys/time.h>
48 #endif
49 #ifdef HAVE_SYS_STAT_H
50 #include <sys/stat.h>
51 #endif
52 #ifdef HAVE_FCNTL_H
53 #include <fcntl.h>
54 #endif
55
56 #include "opal/mca/base/base.h"
57 #include "opal/mca/installdirs/installdirs.h"
58 #include "opal/util/argv.h"
59 #include "opal/util/output.h"
60 #include "opal/util/opal_environ.h"
61 #include "opal/util/path.h"
62 #include "opal/util/basename.h"
63
64 #include "orte/runtime/orte_globals.h"
65 #include "orte/util/name_fns.h"
66 #include "orte/util/show_help.h"
67 #include "orte/util/threads.h"
68 #include "orte/runtime/orte_wait.h"
69 #include "orte/mca/errmgr/errmgr.h"
70 #include "orte/mca/rmaps/rmaps.h"
71 #include "orte/mca/state/state.h"
72
73 #include "orte/mca/plm/plm.h"
74 #include "orte/mca/plm/base/base.h"
75 #include "orte/mca/plm/base/plm_private.h"
76 #include "plm_alps.h"
77
78
79
80
81
82 static int plm_alps_init(void);
83 static int plm_alps_launch_job(orte_job_t *jdata);
84 static int plm_alps_terminate_orteds(void);
85 static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal);
86 static int plm_alps_finalize(void);
87
88 static int plm_alps_start_proc(int argc, char **argv, char **env,
89 char *prefix);
90
91
92
93
94
/*
 * Function table exported by the ALPS PLM component.
 *
 * The initializer is positional: entries are matched to the members of
 * orte_plm_base_module_t by order, so the order here must not be changed.
 * Entries named plm_alps_* are implemented in this file; entries named
 * orte_plm_base_* are shared implementations defined elsewhere.  One slot
 * is left NULL, i.e. this module provides no function for it.
 * NOTE(review): the meaning of each slot comes from the declaration of
 * orte_plm_base_module_t, which is not visible here - confirm against
 * orte/mca/plm/plm.h before reordering or filling the NULL slot.
 */
95 orte_plm_base_module_t orte_plm_alps_module = {
96 plm_alps_init,
97 orte_plm_base_set_hnp_name,
98 plm_alps_launch_job,
99 NULL,
100 orte_plm_base_orted_terminate_job,
101 plm_alps_terminate_orteds,
102 orte_plm_base_orted_kill_local_procs,
103 plm_alps_signal_job,
104 plm_alps_finalize
105 };
106
107
108
109
110 static orte_proc_t *alpsrun = NULL;
111 static bool failed_launch;
112 static void launch_daemons(int fd, short args, void *cbdata);
113
114
115
116
117
118 static int plm_alps_init(void)
119 {
120 int rc;
121
122 if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
123 ORTE_ERROR_LOG(rc);
124 return rc;
125 }
126
127 if (orte_do_not_launch) {
128
129 orte_plm_globals.daemon_nodes_assigned_at_launch = true;
130 } else {
131
132
133
134
135
136
137 orte_plm_globals.daemon_nodes_assigned_at_launch = false;
138 }
139
140
141 if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
142 launch_daemons, ORTE_SYS_PRI))) {
143 ORTE_ERROR_LOG(rc);
144 return rc;
145 }
146
147 return rc;
148 }
149
150
151
152
153
154
155 static int plm_alps_launch_job(orte_job_t *jdata)
156 {
157
158 if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
159
160 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
161 } else {
162
163 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT);
164 }
165 return ORTE_SUCCESS;
166 }
167
168 static void launch_daemons(int fd, short args, void *cbdata)
169 {
170 orte_job_map_t *map;
171 char *jobid_string = NULL;
172 char *param;
173 char **argv = NULL;
174 int argc;
175 int rc;
176 char *tmp;
177 char** env = NULL;
178 char *nodelist_flat;
179 char **nodelist_argv;
180 int nodelist_argc;
181 char *vpid_string;
182 char **custom_strings;
183 int num_args, i;
184 char *cur_prefix;
185 int proc_vpid_index;
186 orte_app_context_t *app;
187 orte_node_t *node;
188 orte_std_cntr_t nnode;
189 orte_job_t *daemons;
190 orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
191 char *ltmp;
192
193 ORTE_ACQUIRE_OBJECT(state);
194
195
196
197
198 if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
199 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
200 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
201 OBJ_RELEASE(state);
202 return;
203 }
204
205
206 daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
207 if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
208 ORTE_ERROR_LOG(rc);
209 goto cleanup;
210 }
211
212
213
214
215
216 if (orte_do_not_launch) {
217
218
219
220
221 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
222 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
223 OBJ_RELEASE(state);
224 return;
225 }
226
227
228 if (NULL == (map = daemons->map)) {
229 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
230 rc = ORTE_ERR_NOT_FOUND;
231 goto cleanup;
232 }
233
234 if (0 == map->num_new_daemons) {
235
236
237
238
239 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
240 "%s plm:alps: no new daemons to launch",
241 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
242 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
243 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
244 OBJ_RELEASE(state);
245 return;
246 }
247
248
249 orte_util_convert_jobid_to_string(&jobid_string, daemons->jobid);
250
251
252
253
254 argv = NULL;
255 argc = 0;
256
257
258
259
260
261
262 opal_argv_append(&argc, &argv, mca_plm_alps_component.aprun_cmd);
263
264
265 if ( NULL != mca_plm_alps_component.custom_args ) {
266 custom_strings = opal_argv_split(mca_plm_alps_component.custom_args, ' ');
267 num_args = opal_argv_count(custom_strings);
268 for (i = 0; i < num_args; ++i) {
269 opal_argv_append(&argc, &argv, custom_strings[i]);
270 }
271 opal_argv_free(custom_strings);
272 }
273
274
275 opal_argv_append(&argc, &argv, "-n");
276 opal_asprintf(&tmp, "%lu", (unsigned long) map->num_new_daemons);
277 opal_argv_append(&argc, &argv, tmp);
278 free(tmp);
279 opal_argv_append(&argc, &argv, "-N");
280 opal_argv_append(&argc, &argv, "1");
281 opal_argv_append(&argc, &argv, "-cc");
282 opal_argv_append(&argc, &argv, "none");
283
284
285
286
287
288
289
290 opal_argv_append(&argc, &argv, "-e");
291 opal_argv_append(&argc, &argv, "PMI_NO_PREINITIALIZE=1");
292 opal_argv_append(&argc, &argv, "-e");
293 opal_argv_append(&argc, &argv, "PMI_NO_FORK=1");
294 opal_argv_append(&argc, &argv, "-e");
295 opal_argv_append(&argc, &argv, "OMPI_NO_USE_CRAY_PMI=1");
296
297
298
299
300 if ((map->num_new_daemons < orte_num_allocated_nodes) || (orte_num_allocated_nodes == 0)) {
301
302 nodelist_argv = NULL;
303 nodelist_argc = 0;
304
305 for (nnode=0; nnode < map->nodes->size; nnode++) {
306 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
307 continue;
308 }
309
310
311
312
313 if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
314 continue;
315 }
316
317
318
319
320 opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
321 }
322 if (0 == opal_argv_count(nodelist_argv)) {
323 orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true);
324 rc = ORTE_ERR_FAILED_TO_START;
325 goto cleanup;
326 }
327 nodelist_flat = opal_argv_join(nodelist_argv, ',');
328 opal_argv_free(nodelist_argv);
329
330 opal_argv_append(&argc, &argv, "-L");
331 opal_argv_append(&argc, &argv, nodelist_flat);
332 free(nodelist_flat);
333 }
334
335
336
337
338
339
340
341 orte_plm_base_setup_orted_cmd(&argc, &argv);
342
343
344 orte_plm_base_orted_append_basic_args(&argc, &argv,
345 NULL,
346 &proc_vpid_index);
347
348
349
350
351 rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
352 if (ORTE_SUCCESS != rc) {
353 opal_output(0, "plm_alps: unable to create process name");
354 goto cleanup;
355 }
356
357 free(argv[proc_vpid_index]);
358 argv[proc_vpid_index] = strdup(vpid_string);
359 free(vpid_string);
360
361 if (mca_plm_alps_component.debug) {
362 param = opal_argv_join(argv, ' ');
363 if (NULL != param) {
364 opal_output(0, "plm:alps: final top-level argv:");
365 opal_output(0, "plm:alps: %s", param);
366 free(param);
367 }
368 }
369
370
371
372
373
374
375
376 cur_prefix = NULL;
377 for (i=0; i < state->jdata->apps->size; i++) {
378 char *app_prefix_dir = NULL;
379 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) {
380 continue;
381 }
382 orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
383
384
385 if (NULL != app_prefix_dir) {
386 if (NULL != cur_prefix &&
387 0 != strcmp (cur_prefix, app_prefix_dir)) {
388 orte_show_help("help-plm-alps.txt", "multiple-prefixes",
389 true, cur_prefix, app_prefix_dir);
390 goto cleanup;
391 }
392
393
394
395 if (NULL == cur_prefix) {
396 cur_prefix = strdup(app_prefix_dir);
397 if (mca_plm_alps_component.debug) {
398 opal_output (0, "plm:alps: Set prefix:%s",
399 cur_prefix);
400 }
401 }
402 free(app_prefix_dir);
403 }
404 }
405
406
407 mca_base_cmd_line_wrap_args(argv);
408
409
410 env = opal_argv_copy(orte_launch_environ);
411
412 if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
413 param = opal_argv_join(argv, ' ');
414 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
415 "%s plm:alps: final top-level argv:\n\t%s",
416 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
417 (NULL == param) ? "NULL" : param));
418 if (NULL != param) free(param);
419 }
420
421
422 if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
423 ORTE_ERROR_LOG(rc);
424 goto cleanup;
425 }
426
427
428 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
429 daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
430
431
432 failed_launch = false;
433
434 cleanup:
435 if (NULL != argv) {
436 opal_argv_free(argv);
437 }
438 if (NULL != env) {
439 opal_argv_free(env);
440 }
441
442 if(NULL != jobid_string) {
443 free(jobid_string);
444 }
445
446
447 OBJ_RELEASE(state);
448
449
450 if (failed_launch) {
451 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
452 }
453 }
454
455
456
457
458
459
460 static int plm_alps_terminate_orteds(void)
461 {
462 int rc;
463 orte_job_t *jdata;
464
465 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
466 "%s plm:alps: terminating orteds",
467 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
468
469
470
471
472
473
474 if (NULL != alpsrun) {
475 orte_wait_cb_cancel(alpsrun);
476 }
477
478
479 if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
480 ORTE_ERROR_LOG(rc);
481 }
482
483 jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
484 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
485
486 OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
487 "%s plm:alps: terminated orteds",
488 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
489 return rc;
490 }
491
492
493
494
495
496 static int plm_alps_signal_job(orte_jobid_t jobid, int32_t signal)
497 {
498 if (NULL != alpsrun && 0 != alpsrun->pid) {
499 kill(alpsrun->pid, (int)signal);
500 }
501 return ORTE_SUCCESS;
502 }
503
504
505 static int plm_alps_finalize(void)
506 {
507 int rc;
508
509 if (NULL != alpsrun) {
510 OBJ_RELEASE(alpsrun);
511 }
512
513
514 if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
515 ORTE_ERROR_LOG(rc);
516 }
517
518 return ORTE_SUCCESS;
519 }
520
521
522 static void alps_wait_cb(int sd, short args, void *cbdata) {
523 orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
524 orte_proc_t *proc = t2->child;
525 orte_job_t *jdata;
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542 jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
543
544 if (0 != proc->exit_code) {
545 if (failed_launch) {
546
547
548
549 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
550 } else {
551
552
553
554 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
555 }
556 }
557 OBJ_RELEASE(t2);
558 }
559
560
561 static int plm_alps_start_proc(int argc, char **argv, char **env,
562 char *prefix)
563 {
564 int fd;
565 pid_t alps_pid;
566 char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
567
568 if (NULL == exec_argv) {
569 return ORTE_ERR_NOT_FOUND;
570 }
571
572 alps_pid = fork();
573 if (-1 == alps_pid) {
574 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
575 return ORTE_ERR_SYS_LIMITS_CHILDREN;
576 }
577
578 alpsrun = OBJ_NEW(orte_proc_t);
579 alpsrun->pid = alps_pid;
580
581 ORTE_FLAG_SET(alpsrun, ORTE_PROC_FLAG_ALIVE);
582
583 orte_wait_cb(alpsrun, alps_wait_cb, orte_event_base, NULL);
584
585 if (0 == alps_pid) {
586 char *bin_base = NULL, *lib_base = NULL;
587
588
589
590
591
592
593 lib_base = opal_basename(opal_install_dirs.libdir);
594 bin_base = opal_basename(opal_install_dirs.bindir);
595
596
597
598 if (NULL != prefix) {
599 char *oldenv, *newenv;
600
601
602 oldenv = getenv("PATH");
603 if (NULL != oldenv) {
604 opal_asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
605 } else {
606 opal_asprintf(&newenv, "%s/%s", prefix, bin_base);
607 }
608 opal_setenv("PATH", newenv, true, &env);
609 if (mca_plm_alps_component.debug) {
610 opal_output(0, "plm:alps: reset PATH: %s", newenv);
611 }
612 free(newenv);
613
614
615 oldenv = getenv("LD_LIBRARY_PATH");
616 if (NULL != oldenv) {
617 opal_asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
618 } else {
619 opal_asprintf(&newenv, "%s/%s", prefix, lib_base);
620 }
621 opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
622 if (mca_plm_alps_component.debug) {
623 opal_output(0, "plm:alps: reset LD_LIBRARY_PATH: %s",
624 newenv);
625 }
626 free(newenv);
627 }
628
629 fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
630 if(fd > 0) {
631 dup2(fd, 0);
632 }
633
634
635
636 if (0 == mca_plm_alps_component.debug && !orte_debug_daemons_flag) {
637 if (fd >= 0) {
638 if (fd != 1) {
639 dup2(fd,1);
640 }
641 if (fd != 2) {
642 dup2(fd,2);
643 }
644 }
645 }
646
647 if (fd > 2) {
648 close(fd);
649 }
650
651
652
653
654 setpgid(0, 0);
655
656
657 execve(exec_argv, argv, env);
658
659 opal_output(0, "plm:alps:start_proc: exec failed");
660
661
662 exit(1);
663 } else {
664
665
666
667 setpgid(alps_pid, alps_pid);
668
669 free(exec_argv);
670 }
671
672 return ORTE_SUCCESS;
673 }