This source file includes the following definitions.
- odls_default_kill_local
- orte_odls_default_kill_local_procs
- set_handler_default
- write_help_msg
- send_error_show_help
- close_open_file_descriptors
- do_child
- do_parent
- odls_default_fork_local_proc
- orte_odls_default_launch_local_procs
- send_signal
- orte_odls_default_signal_local_procs
- orte_odls_default_restart_proc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71 #include "orte_config.h"
72 #include "orte/constants.h"
73 #include "orte/types.h"
74
75 #include <string.h>
76 #include <stdlib.h>
77 #ifdef HAVE_UNISTD_H
78 #include <unistd.h>
79 #endif
80 #include <errno.h>
81 #ifdef HAVE_SYS_TYPES_H
82 #include <sys/types.h>
83 #endif
84 #ifdef HAVE_SYS_WAIT_H
85 #include <sys/wait.h>
86 #endif
87 #include <signal.h>
88 #ifdef HAVE_FCNTL_H
89 #include <fcntl.h>
90 #endif
91 #ifdef HAVE_SYS_TIME_H
92 #include <sys/time.h>
93 #endif
94 #ifdef HAVE_SYS_PARAM_H
95 #include <sys/param.h>
96 #endif
97 #ifdef HAVE_NETDB_H
98 #include <netdb.h>
99 #endif
100 #include <stdlib.h>
101 #ifdef HAVE_SYS_STAT_H
102 #include <sys/stat.h>
103 #endif
104 #include <stdarg.h>
105 #ifdef HAVE_SYS_SELECT_H
106 #include <sys/select.h>
107 #endif
108 #ifdef HAVE_DIRENT_H
109 #include <dirent.h>
110 #endif
111 #include <ctype.h>
112
113 #include "opal/mca/hwloc/hwloc-internal.h"
114 #include "opal/mca/hwloc/base/base.h"
115 #include "opal/class/opal_pointer_array.h"
116 #include "opal/util/opal_environ.h"
117 #include "opal/util/show_help.h"
118 #include "opal/util/sys_limits.h"
119 #include "opal/util/fd.h"
120
121 #include "orte/util/show_help.h"
122 #include "orte/runtime/orte_wait.h"
123 #include "orte/runtime/orte_globals.h"
124 #include "orte/mca/errmgr/errmgr.h"
125 #include "orte/mca/ess/ess.h"
126 #include "orte/mca/iof/base/iof_base_setup.h"
127 #include "orte/mca/plm/plm.h"
128 #include "orte/mca/rtc/rtc.h"
129 #include "orte/util/name_fns.h"
130 #include "orte/util/threads.h"
131
132 #include "orte/mca/odls/base/base.h"
133 #include "orte/mca/odls/base/odls_private.h"
134 #include "orte/mca/odls/default/odls_default.h"
135 #include "orte/orted/pmix/pmix_server.h"
136
137
138
139
140 static int orte_odls_default_launch_local_procs(opal_buffer_t *data);
141 static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs);
142 static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
143 static int orte_odls_default_restart_proc(orte_proc_t *child);
144
145
146
147
148
149 static void send_error_show_help(int fd, int exit_status,
150 const char *file, const char *topic, ...)
151 __opal_attribute_noreturn__;
152
153 static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
154 __opal_attribute_noreturn__;
155
156
157
158
159
160 orte_odls_base_module_t orte_odls_default_module = {
161 .get_add_procs_data = orte_odls_base_default_get_add_procs_data,
162 .launch_local_procs = orte_odls_default_launch_local_procs,
163 .kill_local_procs = orte_odls_default_kill_local_procs,
164 .signal_local_procs = orte_odls_default_signal_local_procs,
165 .restart_proc = orte_odls_default_restart_proc
166 };
167
168
169
170 static int odls_default_kill_local(pid_t pid, int signum)
171 {
172 pid_t pgrp;
173
174 #if HAVE_SETPGID
175 pgrp = getpgid(pid);
176 if (-1 != pgrp) {
177
178
179
180
181
182
183
184 pid = -pgrp;
185 }
186 #endif
187
188 if (0 != kill(pid, signum)) {
189 if (ESRCH != errno) {
190 OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
191 "%s odls:default:SENT KILL %d TO PID %d GOT ERRNO %d",
192 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, errno));
193 return errno;
194 }
195 }
196 OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
197 "%s odls:default:SENT KILL %d TO PID %d SUCCESS",
198 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid));
199 return 0;
200 }
201
202 int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs)
203 {
204 int rc;
205
206 if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
207 odls_default_kill_local))) {
208 ORTE_ERROR_LOG(rc);
209 return rc;
210 }
211 return ORTE_SUCCESS;
212 }
213
214
/**
 * Reset the disposition of one signal to the system default.
 *
 * @param sig  Signal number whose handler is replaced with SIG_DFL.
 *
 * The result of sigaction() is not examined.
 */
static void set_handler_default(int sig)
{
    struct sigaction dfl_action;

    /* No signals blocked during handling, no special flags. */
    sigemptyset(&dfl_action.sa_mask);
    dfl_action.sa_flags = 0;
    dfl_action.sa_handler = SIG_DFL;

    sigaction(sig, &dfl_action, NULL);
}
225
226
227
228
229
230 static int write_help_msg(int fd, orte_odls_pipe_err_msg_t *msg, const char *file,
231 const char *topic, va_list ap)
232 {
233 int ret;
234 char *str;
235
236 if (NULL == file || NULL == topic) {
237 return OPAL_ERR_BAD_PARAM;
238 }
239
240 str = opal_show_help_vstring(file, topic, true, ap);
241
242 msg->file_str_len = (int) strlen(file);
243 if (msg->file_str_len > ORTE_ODLS_MAX_FILE_LEN) {
244 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
245 return ORTE_ERR_BAD_PARAM;
246 }
247 msg->topic_str_len = (int) strlen(topic);
248 if (msg->topic_str_len > ORTE_ODLS_MAX_TOPIC_LEN) {
249 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
250 return ORTE_ERR_BAD_PARAM;
251 }
252 msg->msg_str_len = (int) strlen(str);
253
254
255 if (OPAL_SUCCESS != (ret = opal_fd_write(fd, sizeof(*msg), msg))) {
256 goto out;
257 }
258 if (msg->file_str_len > 0 &&
259 OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->file_str_len, file))) {
260 goto out;
261 }
262 if (msg->topic_str_len > 0 &&
263 OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->topic_str_len, topic))) {
264 goto out;
265 }
266 if (msg->msg_str_len > 0 &&
267 OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->msg_str_len, str))) {
268 goto out;
269 }
270
271 out:
272 free(str);
273 return ret;
274 }
275
276
277
278
279 static void send_error_show_help(int fd, int exit_status,
280 const char *file, const char *topic, ...)
281 {
282 va_list ap;
283 orte_odls_pipe_err_msg_t msg;
284
285 msg.fatal = true;
286 msg.exit_status = exit_status;
287
288
289 va_start(ap, topic);
290 write_help_msg(fd, &msg, file, topic, ap);
291 va_end(ap);
292
293 exit(exit_status);
294 }
295
296
297
298 static int close_open_file_descriptors(int write_fd,
299 orte_iof_base_io_conf_t opts) {
300 DIR *dir = opendir("/proc/self/fd");
301 if (NULL == dir) {
302 return ORTE_ERR_FILE_OPEN_FAILURE;
303 }
304 struct dirent *files;
305
306
307
308 int dir_scan_fd = dirfd(dir);
309 if(dir_scan_fd < 0 ) {
310 return ORTE_ERR_FILE_OPEN_FAILURE;
311 }
312
313
314 while (NULL != (files = readdir(dir))) {
315 if (!isdigit(files->d_name[0])) {
316 continue;
317 }
318 int fd = strtol(files->d_name, NULL, 10);
319 if (errno == EINVAL || errno == ERANGE) {
320 closedir(dir);
321 return ORTE_ERR_TYPE_MISMATCH;
322 }
323 if (fd >=3 &&
324 #if OPAL_PMIX_V1
325 fd != opts.p_internal[1] &&
326 #endif
327 fd != write_fd &&
328 fd != dir_scan_fd) {
329 close(fd);
330 }
331 }
332 closedir(dir);
333 return ORTE_SUCCESS;
334 }
335
336 static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
337 {
338 int i;
339 sigset_t sigs;
340 long fd, fdmax = sysconf(_SC_OPEN_MAX);
341 char dir[MAXPATHLEN];
342
343 #if HAVE_SETPGID
344
345
346 setpgid(0, 0);
347 #endif
348
349
350 opal_fd_set_cloexec(write_fd);
351
352 if (NULL != cd->child) {
353
354
355
356
357
358
359
360
361
362
363
364
365
366 if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
367 if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
368 ORTE_ERROR_LOG(i);
369 send_error_show_help(write_fd, 1,
370 "help-orte-odls-default.txt",
371 "iof setup failed",
372 orte_process_info.nodename, cd->app->app);
373
374 }
375 }
376
377
378 orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
379
380 } else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
381
382 int fdnull;
383 for (i=0; i < 3; i++) {
384 fdnull = open("/dev/null", O_RDONLY, 0);
385 if (fdnull > i && i != write_fd) {
386 dup2(fdnull, i);
387 }
388 close(fdnull);
389 }
390 #if OPAL_PMIX_V1
391 fdnull = open("/dev/null", O_RDONLY, 0);
392 if (fdnull > cd->opts.p_internal[1]) {
393 dup2(fdnull, cd->opts.p_internal[1]);
394 }
395 close(fdnull);
396 #endif
397 }
398
399
400
401
402 if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
403
404 for(fd=3; fd<fdmax; fd++) {
405 if (
406 #if OPAL_PMIX_V1
407 fd != cd->opts.p_internal[1] &&
408 #endif
409 fd != write_fd) {
410 close(fd);
411 }
412 }
413 }
414
415 if (cd->argv == NULL) {
416 cd->argv = malloc(sizeof(char*)*2);
417 cd->argv[0] = strdup(cd->app->app);
418 cd->argv[1] = NULL;
419 }
420
421
422
423
424
425
426
427
428 set_handler_default(SIGTERM);
429 set_handler_default(SIGINT);
430 set_handler_default(SIGHUP);
431 set_handler_default(SIGPIPE);
432 set_handler_default(SIGCHLD);
433
434
435
436
437
438 sigprocmask(0, 0, &sigs);
439 sigprocmask(SIG_UNBLOCK, &sigs, 0);
440
441
442 if (NULL != cd->wdir) {
443 if (0 != chdir(cd->wdir)) {
444 send_error_show_help(write_fd, 1,
445 "help-orterun.txt",
446 "orterun:wdir-not-found",
447 "orted",
448 cd->wdir,
449 orte_process_info.nodename,
450 (NULL == cd->child) ? 0 : cd->child->app_rank);
451
452 }
453 }
454
455
456 execve(cd->cmd, cd->argv, cd->env);
457 getcwd(dir, sizeof(dir));
458 send_error_show_help(write_fd, 1,
459 "help-orte-odls-default.txt", "execve error",
460 orte_process_info.nodename, dir, cd->app->app, strerror(errno));
461
462 }
463
464
465 static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
466 {
467 int rc;
468 orte_odls_pipe_err_msg_t msg;
469 char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
470
471 if (cd->opts.connect_stdin) {
472 close(cd->opts.p_stdin[0]);
473 }
474 close(cd->opts.p_stdout[1]);
475 if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
476 close(cd->opts.p_stderr[1]);
477 }
478 #if OPAL_PMIX_V1
479 close(cd->opts.p_internal[1]);
480 #endif
481
482
483 while (1) {
484 rc = opal_fd_read(read_fd, sizeof(msg), &msg);
485
486
487 if (OPAL_ERR_TIMEOUT == rc) {
488 break;
489 }
490
491
492 if (OPAL_SUCCESS != rc) {
493 ORTE_ERROR_LOG(rc);
494 close(read_fd);
495
496 if (NULL != cd->child) {
497 cd->child->state = ORTE_PROC_STATE_UNDEF;
498 }
499 return rc;
500 }
501
502
503 if (NULL != cd->child) {
504 if (msg.fatal) {
505 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
506 } else {
507 ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
508 }
509 }
510
511
512 if (msg.file_str_len > 0) {
513 rc = opal_fd_read(read_fd, msg.file_str_len, file);
514 if (OPAL_SUCCESS != rc) {
515 orte_show_help("help-orte-odls-default.txt", "syscall fail",
516 true,
517 orte_process_info.nodename, cd->app->app,
518 "opal_fd_read", __FILE__, __LINE__);
519 if (NULL != cd->child) {
520 cd->child->state = ORTE_PROC_STATE_UNDEF;
521 }
522 return rc;
523 }
524 file[msg.file_str_len] = '\0';
525 }
526 if (msg.topic_str_len > 0) {
527 rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
528 if (OPAL_SUCCESS != rc) {
529 orte_show_help("help-orte-odls-default.txt", "syscall fail",
530 true,
531 orte_process_info.nodename, cd->app->app,
532 "opal_fd_read", __FILE__, __LINE__);
533 if (NULL != cd->child) {
534 cd->child->state = ORTE_PROC_STATE_UNDEF;
535 }
536 return rc;
537 }
538 topic[msg.topic_str_len] = '\0';
539 }
540 if (msg.msg_str_len > 0) {
541 str = calloc(1, msg.msg_str_len + 1);
542 if (NULL == str) {
543 orte_show_help("help-orte-odls-default.txt", "syscall fail",
544 true,
545 orte_process_info.nodename, cd->app->app,
546 "opal_fd_read", __FILE__, __LINE__);
547 if (NULL != cd->child) {
548 cd->child->state = ORTE_PROC_STATE_UNDEF;
549 }
550 return rc;
551 }
552 rc = opal_fd_read(read_fd, msg.msg_str_len, str);
553 }
554
555
556
557 if (msg.msg_str_len > 0) {
558 orte_show_help_norender(file, topic, false, str);
559 free(str);
560 str = NULL;
561 }
562
563
564
565
566
567
568 if (msg.fatal) {
569 if (NULL != cd->child) {
570 cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
571 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
572 }
573 close(read_fd);
574 return ORTE_ERR_FAILED_TO_START;
575 }
576 }
577
578
579
580
581 if (NULL != cd->child) {
582 cd->child->state = ORTE_PROC_STATE_RUNNING;
583 ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
584 }
585 close(read_fd);
586
587 return ORTE_SUCCESS;
588 }
589
590
591
592
593
594 static int odls_default_fork_local_proc(void *cdptr)
595 {
596 orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
597 int p[2];
598 pid_t pid;
599 orte_proc_t *child = cd->child;
600
601
602
603
604
605
606
607
608
609 if (pipe(p) < 0) {
610 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
611 if (NULL != child) {
612 child->state = ORTE_PROC_STATE_FAILED_TO_START;
613 child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
614 }
615 return ORTE_ERR_SYS_LIMITS_PIPES;
616 }
617
618
619 pid = fork();
620 if (NULL != child) {
621 child->pid = pid;
622 }
623
624 if (pid < 0) {
625 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
626 if (NULL != child) {
627 child->state = ORTE_PROC_STATE_FAILED_TO_START;
628 child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
629 }
630 return ORTE_ERR_SYS_LIMITS_CHILDREN;
631 }
632
633 if (pid == 0) {
634 close(p[0]);
635 do_child(cd, p[1]);
636
637 }
638
639 close(p[1]);
640 return do_parent(cd, p[0]);
641 }
642
643
644
645
646
647
648 int orte_odls_default_launch_local_procs(opal_buffer_t *data)
649 {
650 int rc;
651 orte_jobid_t job;
652
653
654 if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
655 OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
656 "%s odls:default:launch:local failed to construct child list on error %s",
657 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
658 return rc;
659 }
660
661
662 ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_default_fork_local_proc);
663
664 return ORTE_SUCCESS;
665 }
666
667
668
669
670
671
672 static int send_signal(pid_t pd, int signal)
673 {
674 int rc = ORTE_SUCCESS;
675 pid_t pid;
676
677 if (orte_odls_globals.signal_direct_children_only) {
678 pid = pd;
679 } else {
680 #if HAVE_SETPGID
681
682
683 pid = -pd;
684 #else
685 pid = pd;
686 #endif
687 }
688
689 OPAL_OUTPUT_VERBOSE((1, orte_odls_base_framework.framework_output,
690 "%s sending signal %d to pid %ld",
691 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
692 signal, (long)pid));
693
694 if (kill(pid, signal) != 0) {
695 switch(errno) {
696 case EINVAL:
697 rc = ORTE_ERR_BAD_PARAM;
698 break;
699 case ESRCH:
700
701
702
703
704
705 break;
706 case EPERM:
707 rc = ORTE_ERR_PERM;
708 break;
709 default:
710 rc = ORTE_ERROR;
711 }
712 }
713
714 return rc;
715 }
716
717 static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal)
718 {
719 int rc;
720
721 if (ORTE_SUCCESS != (rc = orte_odls_base_default_signal_local_procs(proc, signal, send_signal))) {
722 ORTE_ERROR_LOG(rc);
723 return rc;
724 }
725 return ORTE_SUCCESS;
726 }
727
728 static int orte_odls_default_restart_proc(orte_proc_t *child)
729 {
730 int rc;
731
732
733 if (ORTE_SUCCESS != (rc = orte_odls_base_default_restart_proc(child, odls_default_fork_local_proc))) {
734 OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
735 "%s odls:default:restart_proc failed to launch on error %s",
736 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
737 }
738 return rc;
739 }