This source file includes the following definitions.
- plm_slurm_init
- plm_slurm_launch_job
- launch_daemons
- plm_slurm_terminate_orteds
- plm_slurm_signal_job
- plm_slurm_finalize
- srun_wait_cb
- plm_slurm_start_proc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 #include "orte_config.h"
30 #include "orte/runtime/orte_globals.h"
31
32 #include <string.h>
33 #include <sys/types.h>
34 #ifdef HAVE_UNISTD_H
35 #include <unistd.h>
36 #endif
37 #include <signal.h>
38 #include <stdlib.h>
39 #ifdef HAVE_SYS_TYPES_H
40 #include <sys/types.h>
41 #endif
42 #ifdef HAVE_SYS_TIME_H
43 #include <sys/time.h>
44 #endif
45 #ifdef HAVE_SYS_STAT_H
46 #include <sys/stat.h>
47 #endif
48 #ifdef HAVE_FCNTL_H
49 #include <fcntl.h>
50 #endif
51
52 #include "opal/mca/base/base.h"
53 #include "opal/mca/installdirs/installdirs.h"
54 #include "opal/util/argv.h"
55 #include "opal/util/output.h"
56 #include "opal/util/opal_environ.h"
57 #include "opal/util/path.h"
58 #include "opal/util/basename.h"
59
60 #include "orte/constants.h"
61 #include "orte/types.h"
62 #include "orte/util/show_help.h"
63 #include "orte/util/name_fns.h"
64 #include "orte/util/threads.h"
65 #include "orte/runtime/orte_globals.h"
66 #include "orte/runtime/orte_wait.h"
67 #include "orte/runtime/orte_quit.h"
68 #include "orte/mca/errmgr/errmgr.h"
69 #include "orte/mca/rmaps/base/base.h"
70 #include "orte/mca/state/state.h"
71
72 #include "orte/orted/orted.h"
73
74 #include "orte/mca/plm/plm.h"
75 #include "orte/mca/plm/base/plm_private.h"
76 #include "plm_slurm.h"
77
78
79
80
81
/*
 * Forward declarations of the module entry points implemented in this file.
 * They are referenced by the orte_plm_slurm_module function table.
 */
static int plm_slurm_init(void);
static int plm_slurm_launch_job(orte_job_t *jdata);
static int plm_slurm_terminate_orteds(void);
static int plm_slurm_signal_job(orte_jobid_t jobid, int32_t signal);
static int plm_slurm_finalize(void);

/* Local helper: forks a child and execs the command described by argv/env. */
static int plm_slurm_start_proc(int argc, char **argv, char **env,
                                char *prefix);
90
91
92
93
94
/*
 * Function table for the SLURM PLM module.
 *
 * The initializers are positional, so their order must match the member
 * order of orte_plm_base_module_1_0_0_t, which is declared outside this
 * file.  Entries prefixed with plm_slurm_ are implemented below; the
 * orte_plm_base_ entries are shared implementations defined elsewhere.
 */
orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
    plm_slurm_init,
    orte_plm_base_set_hnp_name,
    plm_slurm_launch_job,
    NULL,   /* this slot is left unset by this module */
    orte_plm_base_orted_terminate_job,
    plm_slurm_terminate_orteds,
    orte_plm_base_orted_kill_local_procs,
    plm_slurm_signal_job,
    plm_slurm_finalize
};
106
107
108
109
/* pid of the first srun forked by plm_slurm_start_proc(); srun_wait_cb()
 * compares exiting children against it */
static pid_t primary_srun_pid = 0;
/* true once primary_srun_pid has been recorded; also consulted by
 * plm_slurm_terminate_orteds() */
static bool primary_pid_set = false;
/* job-state callback registered by plm_slurm_init() for
 * ORTE_JOB_STATE_LAUNCH_DAEMONS */
static void launch_daemons(int fd, short args, void *cbdata);
113
114
115
116
117 static int plm_slurm_init(void)
118 {
119 int rc;
120
121 if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
122 ORTE_ERROR_LOG(rc);
123 return rc;
124 }
125
126
127
128
129
130 if (orte_do_not_launch) {
131 orte_plm_globals.daemon_nodes_assigned_at_launch = true;
132 } else {
133
134
135
136
137
138
139 orte_plm_globals.daemon_nodes_assigned_at_launch = false;
140 }
141
142
143 if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
144 launch_daemons, ORTE_SYS_PRI))) {
145 ORTE_ERROR_LOG(rc);
146 return rc;
147 }
148
149 return rc;
150 }
151
152
153
154
155
156 static int plm_slurm_launch_job(orte_job_t *jdata)
157 {
158 if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
159
160 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
161 } else {
162
163 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT);
164 }
165 return ORTE_SUCCESS;
166 }
167
168 static void launch_daemons(int fd, short args, void *cbdata)
169 {
170 orte_app_context_t *app;
171 orte_node_t *node;
172 orte_std_cntr_t n;
173 orte_job_map_t *map;
174 char *jobid_string = NULL;
175 char *param;
176 char **argv = NULL;
177 int argc;
178 int rc;
179 char *tmp;
180 char** env = NULL;
181 char *nodelist_flat;
182 char **nodelist_argv;
183 char *name_string;
184 char **custom_strings;
185 int num_args, i;
186 char *cur_prefix;
187 int proc_vpid_index;
188 bool failed_launch=true;
189 orte_job_t *daemons;
190 orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
191
192 ORTE_ACQUIRE_OBJECT(state);
193
194 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
195 "%s plm:slurm: LAUNCH DAEMONS CALLED",
196 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
197
198
199
200
201 if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
202 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
203 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
204 OBJ_RELEASE(state);
205 return;
206 }
207
208
209 daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
210 if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
211 ORTE_ERROR_LOG(rc);
212 goto cleanup;
213 }
214
215
216
217
218
219 if (orte_do_not_launch) {
220
221
222
223
224 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
225 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
226 OBJ_RELEASE(state);
227 return;
228 }
229
230
231 if (NULL == (map = daemons->map)) {
232 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
233 rc = ORTE_ERR_NOT_FOUND;
234 goto cleanup;
235 }
236
237 if (0 == map->num_new_daemons) {
238
239
240
241
242 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
243 "%s plm:slurm: no new daemons to launch",
244 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
245 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
246 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
247 OBJ_RELEASE(state);
248 return;
249 }
250
251
252 opal_asprintf(&jobid_string, "%lu", (unsigned long) daemons->jobid);
253
254
255
256
257 argv = NULL;
258 argc = 0;
259
260
261
262
263
264
265 opal_argv_append(&argc, &argv, "srun");
266
267
268 opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
269
270 if (!orte_enable_recovery) {
271
272 opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
273 }
274
275 #if SLURM_CRAY_ENV
276
277
278
279
280
281
282 opal_setenv("PMI_NO_PREINITIALIZE", "1", false, &orte_launch_environ);
283 opal_setenv("PMI_NO_FORK", "1", false, &orte_launch_environ);
284 opal_setenv("OMPI_NO_USE_CRAY_PMI", "1", false, &orte_launch_environ);
285 #endif
286
287
288 if ( NULL != mca_plm_slurm_component.custom_args ) {
289 custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
290 num_args = opal_argv_count(custom_strings);
291 for (i = 0; i < num_args; ++i) {
292 opal_argv_append(&argc, &argv, custom_strings[i]);
293 }
294 opal_argv_free(custom_strings);
295 }
296
297
298 nodelist_argv = NULL;
299
300 for (n=0; n < map->nodes->size; n++ ) {
301 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
302 continue;
303 }
304
305
306
307 if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
308 continue;
309 }
310
311
312
313
314 opal_argv_append_nosize(&nodelist_argv, node->name);
315 }
316 if (0 == opal_argv_count(nodelist_argv)) {
317 orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true);
318 rc = ORTE_ERR_FAILED_TO_START;
319 goto cleanup;
320 }
321 nodelist_flat = opal_argv_join(nodelist_argv, ',');
322 opal_argv_free(nodelist_argv);
323
324
325
326
327 if (map->num_new_daemons < orte_num_allocated_nodes) {
328 opal_asprintf(&tmp, "--nodes=%lu", (unsigned long)map->num_new_daemons);
329 opal_argv_append(&argc, &argv, tmp);
330 free(tmp);
331
332 opal_asprintf(&tmp, "--nodelist=%s", nodelist_flat);
333 opal_argv_append(&argc, &argv, tmp);
334 free(tmp);
335 }
336
337
338 opal_asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
339 opal_argv_append(&argc, &argv, tmp);
340 free(tmp);
341
342 OPAL_OUTPUT_VERBOSE((2, orte_plm_base_framework.framework_output,
343 "%s plm:slurm: launching on nodes %s",
344 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
345 free(nodelist_flat);
346
347
348
349
350
351
352 orte_plm_base_setup_orted_cmd(&argc, &argv);
353
354
355 orte_plm_base_orted_append_basic_args(&argc, &argv,
356 "slurm", &proc_vpid_index);
357
358
359
360
361 rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
362 if (ORTE_SUCCESS != rc) {
363 opal_output(0, "plm_slurm: unable to get daemon vpid as string");
364 goto cleanup;
365 }
366
367 free(argv[proc_vpid_index]);
368 argv[proc_vpid_index] = strdup(name_string);
369 free(name_string);
370
371
372
373
374
375
376
377 cur_prefix = NULL;
378 for (n=0; n < state->jdata->apps->size; n++) {
379 char * app_prefix_dir;
380 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) {
381 continue;
382 }
383 app_prefix_dir = NULL;
384 orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
385
386
387 if (NULL != app_prefix_dir) {
388 if (NULL != cur_prefix &&
389 0 != strcmp (cur_prefix, app_prefix_dir)) {
390 orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
391 true, cur_prefix, app_prefix_dir);
392 goto cleanup;
393 }
394
395
396
397
398 if (NULL == cur_prefix) {
399 cur_prefix = strdup(app_prefix_dir);
400 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
401 "%s plm:slurm: Set prefix:%s",
402 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
403 cur_prefix));
404 }
405 free(app_prefix_dir);
406 }
407 }
408
409
410 mca_base_cmd_line_wrap_args(argv);
411
412
413 env = opal_argv_copy(orte_launch_environ);
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430 opal_setenv("SLURM_CPU_BIND", "none", true, &env);
431
432 if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
433 param = opal_argv_join(argv, ' ');
434 opal_output(orte_plm_base_framework.framework_output,
435 "%s plm:slurm: final top-level argv:\n\t%s",
436 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
437 (NULL == param) ? "NULL" : param);
438 if (NULL != param) free(param);
439 }
440
441
442 if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
443 ORTE_ERROR_LOG(rc);
444 goto cleanup;
445 }
446
447
448 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
449 daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
450
451
452 failed_launch = false;
453
454 cleanup:
455 if (NULL != argv) {
456 opal_argv_free(argv);
457 }
458 if (NULL != env) {
459 opal_argv_free(env);
460 }
461
462 if(NULL != jobid_string) {
463 free(jobid_string);
464 }
465
466
467 OBJ_RELEASE(state);
468
469
470 if (failed_launch) {
471 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
472 }
473 }
474
475
476
477
478
479 static int plm_slurm_terminate_orteds(void)
480 {
481 int rc=ORTE_SUCCESS;
482 orte_job_t *jdata;
483
484
485
486
487
488
489 if (primary_pid_set) {
490 if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
491 ORTE_ERROR_LOG(rc);
492 }
493 } else {
494 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
495 "%s plm:slurm: primary daemons complete!",
496 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
497 jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
498
499 jdata->num_terminated = jdata->num_procs;
500 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
501 }
502
503 return rc;
504 }
505
506
507
508
509
510 static int plm_slurm_signal_job(orte_jobid_t jobid, int32_t signal)
511 {
512 int rc = ORTE_SUCCESS;
513
514
515 if (ORTE_SUCCESS != (rc = orte_plm_base_orted_signal_local_procs(jobid, signal))) {
516 ORTE_ERROR_LOG(rc);
517 }
518
519 return rc;
520 }
521
522
523 static int plm_slurm_finalize(void)
524 {
525 int rc;
526
527
528 if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
529 ORTE_ERROR_LOG(rc);
530 }
531
532 return ORTE_SUCCESS;
533 }
534
535
536 static void srun_wait_cb(int sd, short fd, void *cbdata){
537 orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
538 orte_proc_t *proc = t2->child;
539 orte_job_t *jdata;
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561 jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
562
563
564
565
566 if (0 != proc->exit_code) {
567
568
569
570 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
571 "%s plm:slurm: srun returned non-zero exit status (%d) from launching the per-node daemon",
572 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
573 proc->exit_code));
574 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
575 } else {
576
577 if (primary_srun_pid == proc->pid) {
578
579
580
581 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
582 "%s plm:slurm: primary daemons complete!",
583 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
584
585 jdata->num_terminated = jdata->num_procs;
586 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
587 }
588 }
589
590
591 OBJ_RELEASE(t2);
592 }
593
594
595 static int plm_slurm_start_proc(int argc, char **argv, char **env,
596 char *prefix)
597 {
598 int fd;
599 int srun_pid;
600 char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
601 orte_proc_t *dummy;
602
603 if (NULL == exec_argv) {
604 orte_show_help("help-plm-slurm.txt", "no-srun", true);
605 return ORTE_ERR_SILENT;
606 }
607
608 srun_pid = fork();
609 if (-1 == srun_pid) {
610 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
611 free(exec_argv);
612 return ORTE_ERR_SYS_LIMITS_CHILDREN;
613 }
614
615
616
617 if (0 < srun_pid && !primary_pid_set) {
618 primary_srun_pid = srun_pid;
619 primary_pid_set = true;
620 }
621
622
623 dummy = OBJ_NEW(orte_proc_t);
624 dummy->pid = srun_pid;
625
626 ORTE_FLAG_SET(dummy, ORTE_PROC_FLAG_ALIVE);
627
628 orte_wait_cb(dummy, srun_wait_cb, orte_event_base, NULL);
629
630 if (0 == srun_pid) {
631 char *bin_base = NULL, *lib_base = NULL;
632
633
634
635
636
637
638 lib_base = opal_basename(opal_install_dirs.libdir);
639 bin_base = opal_basename(opal_install_dirs.bindir);
640
641
642
643 if (NULL != prefix) {
644 char *oldenv, *newenv;
645
646
647 oldenv = getenv("PATH");
648 if (NULL != oldenv) {
649 opal_asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
650 } else {
651 opal_asprintf(&newenv, "%s/%s", prefix, bin_base);
652 }
653 opal_setenv("PATH", newenv, true, &env);
654 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
655 "%s plm:slurm: reset PATH: %s",
656 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
657 newenv));
658 free(newenv);
659
660
661 oldenv = getenv("LD_LIBRARY_PATH");
662 if (NULL != oldenv) {
663 opal_asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
664 } else {
665 opal_asprintf(&newenv, "%s/%s", prefix, lib_base);
666 }
667 opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
668 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
669 "%s plm:slurm: reset LD_LIBRARY_PATH: %s",
670 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
671 newenv));
672 free(newenv);
673 }
674
675 fd = open("/dev/null", O_CREAT|O_RDWR|O_TRUNC, 0666);
676 if (fd >= 0) {
677 dup2(fd, 0);
678
679
680
681
682 if (0 > opal_output_get_verbosity(orte_plm_base_framework.framework_output) &&
683 !orte_debug_daemons_flag && !orte_leave_session_attached) {
684 dup2(fd,1);
685 dup2(fd,2);
686 }
687
688
689 if (fd > 2) {
690 close(fd);
691 }
692 }
693
694
695
696
697 setpgid(0, 0);
698
699 execve(exec_argv, argv, env);
700
701 opal_output(0, "plm:slurm:start_proc: exec failed");
702
703
704 exit(1);
705 } else {
706
707
708
709 setpgid(srun_pid, srun_pid);
710
711 free(exec_argv);
712 }
713
714 return ORTE_SUCCESS;
715 }