This source file includes the following definitions.
- odls_pspawn_kill_local
- orte_odls_pspawn_kill_local_procs
- close_open_file_descriptors
- odls_pspawn_fork_local_proc
- orte_odls_pspawn_launch_local_procs
- send_signal
- orte_odls_pspawn_signal_local_procs
- orte_odls_pspawn_restart_proc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71 #include "orte_config.h"
72 #include "orte/constants.h"
73 #include "orte/types.h"
74
75 #include <string.h>
76 #include <stdlib.h>
77 #ifdef HAVE_UNISTD_H
78 #include <unistd.h>
79 #endif
80 #include <errno.h>
81 #ifdef HAVE_SYS_TYPES_H
82 #include <sys/types.h>
83 #endif
84 #ifdef HAVE_SYS_WAIT_H
85 #include <sys/wait.h>
86 #endif
87 #include <signal.h>
88 #ifdef HAVE_FCNTL_H
89 #include <fcntl.h>
90 #endif
91 #ifdef HAVE_SYS_TIME_H
92 #include <sys/time.h>
93 #endif
94 #ifdef HAVE_SYS_PARAM_H
95 #include <sys/param.h>
96 #endif
97 #ifdef HAVE_NETDB_H
98 #include <netdb.h>
99 #endif
100 #include <stdlib.h>
101 #ifdef HAVE_SYS_STAT_H
102 #include <sys/stat.h>
103 #endif
104 #include <stdarg.h>
105 #ifdef HAVE_SYS_SELECT_H
106 #include <sys/select.h>
107 #endif
108 #ifdef HAVE_DIRENT_H
109 #include <dirent.h>
110 #endif
111 #include <ctype.h>
112 #ifdef HAVE_UTIL_H
113 #include <util.h>
114 #endif
115 #ifdef HAVE_PTY_H
116 #include <pty.h>
117 #endif
118 #ifdef HAVE_FCNTL_H
119 #include <fcntl.h>
120 #endif
121 #ifdef HAVE_TERMIOS_H
122 #include <termios.h>
123 # ifdef HAVE_TERMIO_H
124 # include <termio.h>
125 # endif
126 #endif
127 #ifdef HAVE_LIBUTIL_H
128 #include <libutil.h>
129 #endif
130
131 #include <spawn.h>
132
133 #include "opal/mca/hwloc/hwloc-internal.h"
134 #include "opal/mca/hwloc/base/base.h"
135 #include "opal/class/opal_pointer_array.h"
136 #include "opal/util/opal_environ.h"
137 #include "opal/util/show_help.h"
138 #include "opal/util/sys_limits.h"
139 #include "opal/util/fd.h"
140
141 #include "orte/util/show_help.h"
142 #include "orte/runtime/orte_wait.h"
143 #include "orte/runtime/orte_globals.h"
144 #include "orte/mca/errmgr/errmgr.h"
145 #include "orte/mca/ess/ess.h"
146 #include "orte/mca/iof/base/iof_base_setup.h"
147 #include "orte/mca/plm/plm.h"
148 #include "orte/mca/rtc/rtc.h"
149 #include "orte/util/name_fns.h"
150 #include "orte/util/threads.h"
151
152 #include "orte/mca/odls/base/base.h"
153 #include "orte/mca/odls/base/odls_private.h"
154 #include "orte/mca/odls/pspawn/odls_pspawn.h"
155 #include "orte/orted/pmix/pmix_server.h"
156
157
158
159
160 static int orte_odls_pspawn_launch_local_procs(opal_buffer_t *data);
161 static int orte_odls_pspawn_kill_local_procs(opal_pointer_array_t *procs);
162 static int orte_odls_pspawn_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
163 static int orte_odls_pspawn_restart_proc(orte_proc_t *child);
164
165
166
167
168
169 orte_odls_base_module_t orte_odls_pspawn_module = {
170 .get_add_procs_data = orte_odls_base_default_get_add_procs_data,
171 .launch_local_procs = orte_odls_pspawn_launch_local_procs,
172 .kill_local_procs = orte_odls_pspawn_kill_local_procs,
173 .signal_local_procs = orte_odls_pspawn_signal_local_procs,
174 .restart_proc = orte_odls_pspawn_restart_proc
175 };
176
177
178
179 static int odls_pspawn_kill_local(pid_t pid, int signum)
180 {
181 pid_t pgrp;
182
183 #if HAVE_SETPGID
184 pgrp = getpgid(pid);
185 if (-1 != pgrp) {
186
187
188
189
190
191
192
193 pid = -pgrp;
194 }
195 #endif
196
197 if (0 != kill(pid, signum)) {
198 if (ESRCH != errno) {
199 OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
200 "%s odls:pspawn:SENT KILL %d TO PID %d GOT ERRNO %d",
201 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, errno));
202 return errno;
203 }
204 }
205 OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
206 "%s odls:pspawn:SENT KILL %d TO PID %d SUCCESS",
207 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid));
208 return 0;
209 }
210
211 int orte_odls_pspawn_kill_local_procs(opal_pointer_array_t *procs)
212 {
213 int rc;
214
215 if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
216 odls_pspawn_kill_local))) {
217 ORTE_ERROR_LOG(rc);
218 return rc;
219 }
220 return ORTE_SUCCESS;
221 }
222
223
224
225
226
227 static int close_open_file_descriptors(posix_spawn_file_actions_t *factions)
228 {
229 DIR *dir = opendir("/proc/self/fd");
230 if (NULL == dir) {
231 return ORTE_ERR_FILE_OPEN_FAILURE;
232 }
233 struct dirent *files;
234
235
236
237 int dir_scan_fd = dirfd(dir);
238 if(dir_scan_fd < 0 ) {
239 return ORTE_ERR_FILE_OPEN_FAILURE;
240 }
241
242 while (NULL != (files = readdir(dir))) {
243 if (!isdigit(files->d_name[0])) {
244 continue;
245 }
246 int fd = strtol(files->d_name, NULL, 10);
247 if (errno == EINVAL || errno == ERANGE) {
248 closedir(dir);
249 return ORTE_ERR_TYPE_MISMATCH;
250 }
251 if (fd >=3 && fd != dir_scan_fd) {
252 posix_spawn_file_actions_addclose(factions, fd);
253 }
254 }
255 closedir(dir);
256 return ORTE_SUCCESS;
257 }
258
259
260
261
262 static int odls_pspawn_fork_local_proc(void *cdptr)
263 {
264 orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
265 pid_t pid;
266 orte_proc_t *child = cd->child;
267 posix_spawn_file_actions_t factions;
268 posix_spawnattr_t attrs;
269 sigset_t sigs;
270 int rc;
271 orte_iof_base_io_conf_t *opts = &cd->opts;
272
273 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
274
275
276 rc = posix_spawnattr_init(&attrs);
277 if (0 != rc) {
278 child->state = ORTE_PROC_STATE_FAILED_TO_START;
279 child->exit_code = 1;
280 return ORTE_ERROR;
281 }
282
283 sigprocmask(0, 0, &sigs);
284 sigprocmask(SIG_UNBLOCK, &sigs, 0);
285 posix_spawnattr_setsigmask(&attrs, &sigs);
286
287
288 rc = posix_spawn_file_actions_init(&factions);
289 if (0 != rc) {
290 posix_spawnattr_destroy(&attrs);
291 child->state = ORTE_PROC_STATE_FAILED_TO_START;
292 child->exit_code = 1;
293 return ORTE_ERROR;
294 }
295 if (ORTE_SUCCESS != close_open_file_descriptors(&factions)) {
296 posix_spawn_file_actions_destroy(&factions);
297 posix_spawnattr_destroy(&attrs);
298 child->state = ORTE_PROC_STATE_FAILED_TO_START;
299 child->exit_code = 1;
300 return ORTE_ERROR;
301 }
302
303 if (opts->connect_stdin) {
304 posix_spawn_file_actions_addclose(&factions, opts->p_stdin[1]);
305 }
306 posix_spawn_file_actions_addclose(&factions, opts->p_stdout[0]);
307 if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
308 posix_spawn_file_actions_addclose(&factions, opts->p_stderr[0]);
309 }
310
311 if (opts->usepty) {
312
313 struct termios term_attrs;
314 if (tcgetattr(opts->p_stdout[1], &term_attrs) < 0) {
315 return ORTE_ERR_PIPE_SETUP_FAILURE;
316 }
317 term_attrs.c_lflag &= ~ (ECHO | ECHOE | ECHOK |
318 ECHOCTL | ECHOKE | ECHONL);
319 term_attrs.c_iflag &= ~ (ICRNL | INLCR | ISTRIP | INPCK | IXON);
320 term_attrs.c_oflag &= ~ (
321 #ifdef OCRNL
322
323
324 OCRNL |
325 #endif
326 ONLCR);
327 if (tcsetattr(opts->p_stdout[1], TCSANOW, &term_attrs) == -1) {
328 return ORTE_ERR_PIPE_SETUP_FAILURE;
329 }
330 posix_spawn_file_actions_adddup2(&factions, fileno(stdout), opts->p_stdout[1]);
331 if (orte_iof_base.redirect_app_stderr_to_stdout) {
332 posix_spawn_file_actions_adddup2(&factions, fileno(stderr), opts->p_stdout[1]);
333 }
334 } else {
335 if (opts->p_stdout[1] != fileno(stdout)) {
336 posix_spawn_file_actions_adddup2(&factions, fileno(stdout), opts->p_stdout[1]);
337 }
338 if (orte_iof_base.redirect_app_stderr_to_stdout) {
339 posix_spawn_file_actions_adddup2(&factions, fileno(stderr), opts->p_stdout[1]);
340 }
341 }
342 if (opts->connect_stdin) {
343 if (opts->p_stdin[0] != fileno(stdin)) {
344 posix_spawn_file_actions_adddup2(&factions, fileno(stdin), opts->p_stdin[0]);
345 }
346 }
347 if (opts->p_stderr[1] != fileno(stderr) && !orte_iof_base.redirect_app_stderr_to_stdout) {
348 posix_spawn_file_actions_adddup2(&factions, fileno(stderr), opts->p_stderr[1]);
349 }
350
351
352 rc = posix_spawn(&pid, cd->app->app, &factions, &attrs, cd->argv, cd->env);
353 posix_spawn_file_actions_destroy(&factions);
354 posix_spawnattr_destroy(&attrs);
355
356
357 if (cd->opts.connect_stdin) {
358 close(cd->opts.p_stdin[0]);
359 }
360 close(cd->opts.p_stdout[1]);
361 if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
362 close(cd->opts.p_stderr[1]);
363 }
364
365 if (rc < 0) {
366 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
367 child->state = ORTE_PROC_STATE_FAILED_TO_START;
368 child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
369 return ORTE_ERR_SYS_LIMITS_CHILDREN;
370 }
371
372 cd->child->state = ORTE_PROC_STATE_RUNNING;
373 cd->child->pid = pid;
374 ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
375 return ORTE_SUCCESS;
376 }
377
378
379
380
381
382
383 int orte_odls_pspawn_launch_local_procs(opal_buffer_t *data)
384 {
385 int rc;
386 orte_jobid_t job;
387
388
389 if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
390 OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
391 "%s odls:pspawn:launch:local failed to construct child list on error %s",
392 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
393 return rc;
394 }
395
396
397 ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_pspawn_fork_local_proc);
398
399 return ORTE_SUCCESS;
400 }
401
402
403
404
405
406
407 static int send_signal(pid_t pd, int signal)
408 {
409 int rc = ORTE_SUCCESS;
410 pid_t pid;
411
412 if (orte_odls_globals.signal_direct_children_only) {
413 pid = pd;
414 } else {
415 #if HAVE_SETPGID
416
417
418 pid = -pd;
419 #else
420 pid = pd;
421 #endif
422 }
423
424 OPAL_OUTPUT_VERBOSE((1, orte_odls_base_framework.framework_output,
425 "%s sending signal %d to pid %ld",
426 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
427 signal, (long)pid));
428
429 if (kill(pid, signal) != 0) {
430 switch(errno) {
431 case EINVAL:
432 rc = ORTE_ERR_BAD_PARAM;
433 break;
434 case ESRCH:
435
436
437
438
439
440 break;
441 case EPERM:
442 rc = ORTE_ERR_PERM;
443 break;
444 default:
445 rc = ORTE_ERROR;
446 }
447 }
448
449 return rc;
450 }
451
452 static int orte_odls_pspawn_signal_local_procs(const orte_process_name_t *proc, int32_t signal)
453 {
454 int rc;
455
456 if (ORTE_SUCCESS != (rc = orte_odls_base_default_signal_local_procs(proc, signal, send_signal))) {
457 ORTE_ERROR_LOG(rc);
458 return rc;
459 }
460 return ORTE_SUCCESS;
461 }
462
463 static int orte_odls_pspawn_restart_proc(orte_proc_t *child)
464 {
465 int rc;
466
467
468 if (ORTE_SUCCESS != (rc = orte_odls_base_default_restart_proc(child, odls_pspawn_fork_local_proc))) {
469 OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
470 "%s odls:pspawn:restart_proc failed to launch on error %s",
471 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
472 }
473 return rc;
474 }