This source file includes the following definitions.
- caddy_const
- caddy_dest
- rsh_init
- rsh_wait_daemon
- setup_launch
- ssh_child
- remote_spawn
- rsh_launch
- process_launch_list
- launch_daemons
- rsh_terminate_orteds
- rsh_finalize
- set_handler_default
- find_shell
- launch_agent_setup
- rsh_probe
- setup_shell
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33 #include "orte_config.h"
34 #include "orte/constants.h"
35
36 #include <stdlib.h>
37 #ifdef HAVE_UNISTD_H
38 #include <unistd.h>
39 #endif
40 #include <errno.h>
41 #include <string.h>
42 #ifdef HAVE_STRINGS_H
43 #include <strings.h>
44 #endif
45 #ifdef HAVE_SYS_SELECT_H
46 #include <sys/select.h>
47 #endif
48 #ifdef HAVE_SYS_TIME_H
49 #include <sys/time.h>
50 #endif
51 #include <time.h>
52 #ifdef HAVE_SYS_TYPES_H
53 #include <sys/types.h>
54 #endif
55 #ifdef HAVE_SYS_STAT_H
56 #include <sys/stat.h>
57 #endif
58 #ifdef HAVE_SYS_WAIT_H
59 #include <sys/wait.h>
60 #endif
61 #include <fcntl.h>
62 #include <signal.h>
63 #ifdef HAVE_PWD_H
64 #include <pwd.h>
65 #endif
66
67 #include "opal/mca/installdirs/installdirs.h"
68 #include "opal/util/output.h"
69 #include "opal/mca/base/base.h"
70 #include "opal/mca/event/event.h"
71 #include "opal/util/argv.h"
72 #include "opal/util/opal_environ.h"
73 #include "opal/util/basename.h"
74 #include "opal/util/path.h"
75 #include "opal/class/opal_pointer_array.h"
76
77 #include "orte/util/show_help.h"
78 #include "orte/runtime/orte_wait.h"
79 #include "orte/runtime/orte_globals.h"
80 #include "orte/util/name_fns.h"
81 #include "orte/util/proc_info.h"
82 #include "orte/util/threads.h"
83
84 #include "orte/mca/rml/rml.h"
85 #include "orte/mca/rml/rml_types.h"
86 #include "orte/mca/ess/ess.h"
87 #include "orte/mca/ess/base/base.h"
88 #include "orte/mca/errmgr/errmgr.h"
89 #include "orte/mca/grpcomm/base/base.h"
90 #include "orte/mca/oob/base/base.h"
91 #include "orte/mca/rmaps/rmaps.h"
92 #include "orte/mca/routed/routed.h"
93 #include "orte/mca/rml/base/rml_contact.h"
94 #include "orte/mca/state/state.h"
95
96 #include "orte/mca/plm/plm.h"
97 #include "orte/mca/plm/base/base.h"
98 #include "orte/mca/plm/base/plm_private.h"
99 #include "orte/mca/plm/rsh/plm_rsh.h"
100
/*
 * Module entry points that are implemented further down in this file.
 */
static int rsh_init(void);
static int rsh_launch(orte_job_t *jdata);
static int remote_spawn(void);
static int rsh_terminate_orteds(void);
static int rsh_finalize(void);

/*
 * Function table exported for the rsh PLM module.
 *
 * The initializer is positional, so the entries must stay in the member
 * order of orte_plm_base_module_t (declared outside this file).  Entries
 * named orte_plm_base_* are implementations defined elsewhere; the
 * remaining entries are the static functions declared just above.
 */
orte_plm_base_module_t orte_plm_rsh_module = {
    rsh_init,
    orte_plm_base_set_hnp_name,
    rsh_launch,
    remote_spawn,
    orte_plm_base_orted_terminate_job,
    rsh_terminate_orteds,
    orte_plm_base_orted_kill_local_procs,
    orte_plm_base_orted_signal_local_procs,
    rsh_finalize
};
118
119 typedef struct {
120 opal_list_item_t super;
121 int argc;
122 char **argv;
123 orte_proc_t *daemon;
124 } orte_plm_rsh_caddy_t;
125 static void caddy_const(orte_plm_rsh_caddy_t *ptr)
126 {
127 ptr->argv = NULL;
128 ptr->daemon = NULL;
129 }
130 static void caddy_dest(orte_plm_rsh_caddy_t *ptr)
131 {
132 if (NULL != ptr->argv) {
133 opal_argv_free(ptr->argv);
134 }
135 if (NULL != ptr->daemon) {
136 OBJ_RELEASE(ptr->daemon);
137 }
138 }
139 OBJ_CLASS_INSTANCE(orte_plm_rsh_caddy_t,
140 opal_list_item_t,
141 caddy_const, caddy_dest);
142
/*
 * Shells this module distinguishes.  The numeric values double as
 * indices into orte_plm_rsh_shell_name[] below.
 */
typedef enum {
    ORTE_PLM_RSH_SHELL_BASH = 0,
    ORTE_PLM_RSH_SHELL_ZSH,
    ORTE_PLM_RSH_SHELL_TCSH,
    ORTE_PLM_RSH_SHELL_CSH,
    ORTE_PLM_RSH_SHELL_KSH,
    ORTE_PLM_RSH_SHELL_SH,
    ORTE_PLM_RSH_SHELL_UNKNOWN
} orte_plm_rsh_shell_t;

/*
 * Printable / matchable name for every orte_plm_rsh_shell_t value.
 * Each entry is tied to its enumerator so the table cannot drift out of
 * step with the enum; the resulting order and size are unchanged.
 */
static const char *orte_plm_rsh_shell_name[ORTE_PLM_RSH_SHELL_UNKNOWN + 1] = {
    [ORTE_PLM_RSH_SHELL_BASH]    = "bash",
    [ORTE_PLM_RSH_SHELL_ZSH]     = "zsh",
    [ORTE_PLM_RSH_SHELL_TCSH]    = "tcsh",
    [ORTE_PLM_RSH_SHELL_CSH]     = "csh",
    [ORTE_PLM_RSH_SHELL_KSH]     = "ksh",
    [ORTE_PLM_RSH_SHELL_SH]      = "sh",
    [ORTE_PLM_RSH_SHELL_UNKNOWN] = "unknown"
};
163
164
165
166
/*
 * Forward declarations of the file-local helpers.
 */
static void set_handler_default(int sig);
static orte_plm_rsh_shell_t find_shell(char *shell);
static int launch_agent_setup(const char *agent, char *path);
static void ssh_child(int argc, char **argv) __opal_attribute_noreturn__;
static int rsh_probe(char *nodename,
                     orte_plm_rsh_shell_t *shell);
static int setup_shell(orte_plm_rsh_shell_t *rshell,
                       orte_plm_rsh_shell_t *lshell,
                       char *nodename, int *argc, char ***argv);
static void launch_daemons(int fd, short args, void *cbdata);
static void process_launch_list(int fd, short args, void *cbdata);

/*
 * File-scope launch state.
 */
/* incremented per forked child in process_launch_list(),
 * decremented in rsh_wait_daemon() */
static int num_in_progress=0;
/* caddies queued by launch_daemons()/remote_spawn() */
static opal_list_t launch_list;
/* event whose handler is process_launch_list() (set up in rsh_init()) */
static opal_event_t launch_event;
/* launch agent executable; assigned in launch_agent_setup(),
 * exec'd in ssh_child() */
static char *rsh_agent_path=NULL;
/* launch agent argv; assigned in launch_agent_setup(), copied as the
 * start of the command template in setup_launch() */
static char **rsh_agent_argv=NULL;
185
186
187
188
189 static int rsh_init(void)
190 {
191 char *tmp;
192 int rc;
193
194
195 if (mca_plm_rsh_component.using_qrsh) {
196
197 opal_asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
198 if (ORTE_SUCCESS != (rc = launch_agent_setup("qrsh", tmp))) {
199 ORTE_ERROR_LOG(rc);
200 free(tmp);
201 return rc;
202 }
203 free(tmp);
204
205 opal_argv_append_nosize(&rsh_agent_argv, "-inherit");
206
207
208 opal_argv_append_nosize(&rsh_agent_argv, "-nostdin");
209 opal_argv_append_nosize(&rsh_agent_argv, "-V");
210 if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
211 opal_argv_append_nosize(&rsh_agent_argv, "-verbose");
212 tmp = opal_argv_join(rsh_agent_argv, ' ');
213 opal_output_verbose(1, orte_plm_base_framework.framework_output,
214 "%s plm:rsh: using \"%s\" for launching\n",
215 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
216 free(tmp);
217 }
218 } else if(mca_plm_rsh_component.using_llspawn) {
219
220 if (ORTE_SUCCESS != (rc = launch_agent_setup("llspawn", NULL))) {
221 ORTE_ERROR_LOG(rc);
222 return rc;
223 }
224 opal_output_verbose(1, orte_plm_base_framework.framework_output,
225 "%s plm:rsh: using \"%s\" for launching\n",
226 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
227 rsh_agent_path);
228 } else {
229
230 if (ORTE_SUCCESS != (rc = launch_agent_setup(mca_plm_rsh_component.agent, NULL))) {
231 ORTE_ERROR_LOG(rc);
232 return rc;
233 }
234 }
235
236
237 if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
238 launch_daemons, ORTE_SYS_PRI))) {
239 ORTE_ERROR_LOG(rc);
240 return rc;
241 }
242
243
244 OBJ_CONSTRUCT(&launch_list, opal_list_t);
245 opal_event_set(orte_event_base, &launch_event, -1, 0, process_launch_list, NULL);
246 opal_event_set_priority(&launch_event, ORTE_SYS_PRI);
247
248
249 if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
250 ORTE_ERROR_LOG(rc);
251 }
252
253
254 orte_plm_globals.daemon_nodes_assigned_at_launch = true;
255
256 return rc;
257 }
258
259
260
261
262 static void rsh_wait_daemon(int sd, short flags, void *cbdata)
263 {
264 orte_job_t *jdata;
265 orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
266 orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)t2->cbdata;
267 orte_proc_t *daemon = caddy->daemon;
268
269 if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
270
271
272
273 OBJ_RELEASE(caddy);
274 OBJ_RELEASE(t2);
275 return;
276 }
277
278 if (!WIFEXITED(daemon->exit_code) ||
279 WEXITSTATUS(daemon->exit_code) != 0) {
280
281
282
283 if (!ORTE_PROC_IS_HNP) {
284 opal_buffer_t *buf;
285 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
286 "%s daemon %d failed with status %d",
287 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
288 (int)daemon->name.vpid, WEXITSTATUS(daemon->exit_code)));
289 buf = OBJ_NEW(opal_buffer_t);
290 opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID);
291 opal_dss.pack(buf, &daemon->exit_code, 1, OPAL_INT);
292 orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
293 ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
294 orte_rml_send_callback, NULL);
295
296 daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
297 } else {
298 jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
299
300 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
301 "%s daemon %d failed with status %d",
302 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
303 (int)daemon->name.vpid, WEXITSTATUS(daemon->exit_code)));
304
305 ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(daemon->exit_code));
306
307 daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
308
309 jdata->num_terminated++;
310
311
312
313 orte_routed.route_lost(&daemon->name);
314
315 ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
316 }
317 }
318
319
320 --num_in_progress;
321 if (num_in_progress < mca_plm_rsh_component.num_concurrent) {
322
323 opal_event_active(&launch_event, EV_WRITE, 1);
324 }
325
326 OBJ_RELEASE(t2);
327 }
328
329 static int setup_launch(int *argcptr, char ***argvptr,
330 char *nodename,
331 int *node_name_index1,
332 int *proc_vpid_index, char *prefix_dir)
333 {
334 int argc;
335 char **argv;
336 char *param, *value;
337 orte_plm_rsh_shell_t remote_shell, local_shell;
338 int orted_argc;
339 char **orted_argv;
340 char *orted_cmd, *orted_prefix, *final_cmd;
341 int orted_index;
342 int rc;
343 int i, j;
344 bool found;
345 char *lib_base=NULL, *bin_base=NULL;
346 char *opal_prefix = getenv("OPAL_PREFIX");
347 char* full_orted_cmd = NULL;
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379 argv = opal_argv_copy(rsh_agent_argv);
380 argc = opal_argv_count(argv);
381
382 if (NULL != mca_plm_rsh_component.ssh_args) {
383 char **ssh_argv;
384 ssh_argv = opal_argv_split(mca_plm_rsh_component.ssh_args, ' ');
385 for (i=0; NULL != ssh_argv[i]; i++) {
386 opal_argv_append(&argc, &argv, ssh_argv[i]);
387 }
388 opal_argv_free(ssh_argv);
389 }
390 *node_name_index1 = argc;
391 opal_argv_append(&argc, &argv, "<template>");
392
393
394 if (ORTE_SUCCESS != (rc = setup_shell(&remote_shell, &local_shell,
395 nodename, &argc, &argv))) {
396 ORTE_ERROR_LOG(rc);
397 return rc;
398 }
399
400
401
402
403
404
405
406
407
408 orted_argc = 0;
409 orted_argv = NULL;
410 orted_index = orte_plm_base_setup_orted_cmd(&orted_argc, &orted_argv);
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433 if (0 == orted_index) {
434
435
436
437
438
439 orted_cmd = opal_argv_join(orted_argv, ' ');
440 orted_prefix = NULL;
441 } else {
442
443
444
445 orted_prefix = opal_argv_join_range(orted_argv, 0, orted_index, ' ');
446 orted_cmd = opal_argv_join_range(orted_argv, orted_index, opal_argv_count(orted_argv), ' ');
447 }
448 opal_argv_free(orted_argv);
449
450
451 param = opal_basename(opal_install_dirs.libdir);
452 if (NULL != mca_plm_rsh_component.pass_libpath) {
453 if (NULL != prefix_dir) {
454 opal_asprintf(&lib_base, "%s:%s/%s", mca_plm_rsh_component.pass_libpath, prefix_dir, param);
455 } else {
456 opal_asprintf(&lib_base, "%s:%s", mca_plm_rsh_component.pass_libpath, param);
457 }
458 } else if (NULL != prefix_dir) {
459 opal_asprintf(&lib_base, "%s/%s", prefix_dir, param);
460 }
461 free(param);
462
463
464
465
466 if (NULL != prefix_dir) {
467
468
469
470
471
472 value = opal_basename(opal_install_dirs.bindir);
473 opal_asprintf(&bin_base, "%s/%s", prefix_dir, value);
474 free(value);
475
476 if (NULL != orted_cmd) {
477 if (0 == strcmp(orted_cmd, "orted")) {
478
479 opal_asprintf(&full_orted_cmd, "%s/%s", bin_base, orted_cmd);
480 } else {
481
482 full_orted_cmd = strdup(orted_cmd);
483 }
484 free(orted_cmd);
485 }
486 } else {
487 full_orted_cmd = orted_cmd;
488 }
489
490 if (NULL != lib_base || NULL != bin_base) {
491 if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
492 ORTE_PLM_RSH_SHELL_KSH == remote_shell ||
493 ORTE_PLM_RSH_SHELL_ZSH == remote_shell ||
494 ORTE_PLM_RSH_SHELL_BASH == remote_shell) {
495
496
497
498
499 opal_asprintf (&final_cmd,
500 "%s%s%s PATH=%s%s$PATH ; export PATH ; "
501 "LD_LIBRARY_PATH=%s%s$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; "
502 "DYLD_LIBRARY_PATH=%s%s$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ; "
503 "%s %s",
504 (opal_prefix != NULL ? "OPAL_PREFIX=" : " "),
505 (opal_prefix != NULL ? opal_prefix : " "),
506 (opal_prefix != NULL ? " ; export OPAL_PREFIX;" : " "),
507 (NULL != bin_base ? bin_base : " "),
508 (NULL != bin_base ? ":" : " "),
509 (NULL != lib_base ? lib_base : " "),
510 (NULL != lib_base ? ":" : " "),
511 (NULL != lib_base ? lib_base : " "),
512 (NULL != lib_base ? ":" : " "),
513 (orted_prefix != NULL ? orted_prefix : " "),
514 (full_orted_cmd != NULL ? full_orted_cmd : " "));
515 } else if (ORTE_PLM_RSH_SHELL_TCSH == remote_shell ||
516 ORTE_PLM_RSH_SHELL_CSH == remote_shell) {
517
518
519
520
521
522
523
524
525
526
527
528
529 opal_asprintf (&final_cmd,
530 "%s%s%s set path = ( %s $path ) ; "
531 "if ( $?LD_LIBRARY_PATH == 1 ) "
532 "set OMPI_have_llp ; "
533 "if ( $?LD_LIBRARY_PATH == 0 ) "
534 "setenv LD_LIBRARY_PATH %s ; "
535 "if ( $?OMPI_have_llp == 1 ) "
536 "setenv LD_LIBRARY_PATH %s%s$LD_LIBRARY_PATH ; "
537 "if ( $?DYLD_LIBRARY_PATH == 1 ) "
538 "set OMPI_have_dllp ; "
539 "if ( $?DYLD_LIBRARY_PATH == 0 ) "
540 "setenv DYLD_LIBRARY_PATH %s ; "
541 "if ( $?OMPI_have_dllp == 1 ) "
542 "setenv DYLD_LIBRARY_PATH %s%s$DYLD_LIBRARY_PATH ; "
543 "%s %s",
544 (opal_prefix != NULL ? "setenv OPAL_PREFIX " : " "),
545 (opal_prefix != NULL ? opal_prefix : " "),
546 (opal_prefix != NULL ? " ;" : " "),
547 (NULL != bin_base ? bin_base : " "),
548 (NULL != lib_base ? lib_base : " "),
549 (NULL != lib_base ? lib_base : " "),
550 (NULL != lib_base ? ":" : " "),
551 (NULL != lib_base ? lib_base : " "),
552 (NULL != lib_base ? lib_base : " "),
553 (NULL != lib_base ? ":" : " "),
554 (orted_prefix != NULL ? orted_prefix : " "),
555 (full_orted_cmd != NULL ? full_orted_cmd : " "));
556 } else {
557 orte_show_help("help-plm-rsh.txt", "cannot-resolve-shell-with-prefix", true,
558 (NULL == opal_prefix) ? "NULL" : opal_prefix,
559 prefix_dir);
560 if (NULL != bin_base) {
561 free(bin_base);
562 }
563 if (NULL != lib_base) {
564 free(lib_base);
565 }
566 if (NULL != orted_prefix) free(orted_prefix);
567 if (NULL != full_orted_cmd) free(full_orted_cmd);
568 return ORTE_ERR_SILENT;
569 }
570 if (NULL != bin_base) {
571 free(bin_base);
572 }
573 if (NULL != lib_base) {
574 free(lib_base);
575 }
576 if( NULL != full_orted_cmd ) {
577 free(full_orted_cmd);
578 }
579 } else {
580
581 opal_asprintf(&final_cmd, "%s %s",
582 (orted_prefix != NULL ? orted_prefix : ""),
583 (full_orted_cmd != NULL ? full_orted_cmd : ""));
584 if (NULL != full_orted_cmd) {
585 free(full_orted_cmd);
586 }
587 }
588
589 opal_argv_append(&argc, &argv, final_cmd);
590 free(final_cmd);
591 if (NULL != orted_prefix) free(orted_prefix);
592
593
594
595
596 if (mca_plm_rsh_component.no_tree_spawn &&
597 !orte_debug_flag &&
598 !orte_debug_daemons_flag &&
599 !orte_debug_daemons_file_flag &&
600 !orte_leave_session_attached &&
601
602
603 ((!mca_plm_rsh_component.using_qrsh) ||
604 (mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) &&
605 ((!mca_plm_rsh_component.using_llspawn) ||
606 (mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) {
607 }
608
609
610
611
612
613 orte_plm_base_orted_append_basic_args(&argc, &argv,
614 "env",
615 proc_vpid_index);
616
617
618 opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
619 opal_argv_append(&argc, &argv, "plm");
620 opal_argv_append(&argc, &argv, "rsh");
621
622
623
624 if (!mca_plm_rsh_component.no_tree_spawn) {
625 opal_argv_append(&argc, &argv, "--tree-spawn");
626 orte_oob_base_get_addr(¶m);
627 opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
628 opal_argv_append(&argc, &argv, "orte_parent_uri");
629 opal_argv_append(&argc, &argv, param);
630 free(param);
631 }
632
633
634 if (mca_plm_rsh_component.pass_environ_mca_params) {
635
636
637
638 for (i = 0; NULL != environ[i]; ++i) {
639 if (0 == strncmp(OPAL_MCA_PREFIX"mca_base_env_list", environ[i],
640 strlen(OPAL_MCA_PREFIX"mca_base_env_list"))) {
641
642 continue;
643 }
644 if (0 == strncmp(OPAL_MCA_PREFIX, environ[i], 9)) {
645
646
647
648
649
650
651 param = strdup(&environ[i][9]);
652 value = strchr(param, '=');
653 *value = '\0';
654 value++;
655 found = false;
656
657 for (j=0; NULL != argv[j]; j++) {
658 if (0 == strcmp(param, argv[j])) {
659 found = true;
660 break;
661 }
662 }
663 if (!found) {
664
665 opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
666 opal_argv_append(&argc, &argv, param);
667 opal_argv_append(&argc, &argv, value);
668 }
669 free(param);
670 }
671 }
672 }
673
674
675 mca_base_cmd_line_wrap_args(argv);
676
677 value = opal_argv_join(argv, ' ');
678 if (sysconf(_SC_ARG_MAX) < (int)strlen(value)) {
679 orte_show_help("help-plm-rsh.txt", "cmd-line-too-long",
680 true, strlen(value), sysconf(_SC_ARG_MAX));
681 free(value);
682 return ORTE_ERR_SILENT;
683 }
684 free(value);
685
686 if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
687 ORTE_PLM_RSH_SHELL_KSH == remote_shell) {
688 opal_argv_append(&argc, &argv, ")");
689 }
690
691 if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
692 param = opal_argv_join(argv, ' ');
693 opal_output(orte_plm_base_framework.framework_output,
694 "%s plm:rsh: final template argv:\n\t%s",
695 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
696 (NULL == param) ? "NULL" : param);
697 if (NULL != param) free(param);
698 }
699
700
701 *argcptr = argc;
702 *argvptr = argv;
703 return ORTE_SUCCESS;
704 }
705
706
707 static void ssh_child(int argc, char **argv)
708 {
709 char** env;
710 char* var;
711 long fd, fdmax = sysconf(_SC_OPEN_MAX);
712 char *exec_path;
713 char **exec_argv;
714 int fdin;
715 sigset_t sigs;
716
717
718 env = opal_argv_copy(orte_launch_environ);
719
720
721
722
723
724
725
726
727
728
729
730
731 exec_argv = argv;
732 exec_path = strdup(rsh_agent_path);
733
734
735 fdin = open("/dev/null", O_RDWR);
736 dup2(fdin, 0);
737 close(fdin);
738
739
740 for(fd=3; fd<fdmax; fd++)
741 close(fd);
742
743
744
745
746
747
748
749
750 set_handler_default(SIGTERM);
751 set_handler_default(SIGINT);
752 set_handler_default(SIGHUP);
753 set_handler_default(SIGPIPE);
754 set_handler_default(SIGCHLD);
755
756
757
758
759
760
761
762
763 sigprocmask(0, 0, &sigs);
764 sigprocmask(SIG_UNBLOCK, &sigs, 0);
765
766
767 var = opal_argv_join(argv, ' ');
768 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
769 "%s plm:rsh: executing: (%s) [%s]",
770 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
771 exec_path, (NULL == var) ? "NULL" : var));
772 if (NULL != var) free(var);
773
774 execve(exec_path, exec_argv, env);
775 opal_output(0, "plm:rsh: execv of %s failed with errno=%s(%d)\n",
776 exec_path, strerror(errno), errno);
777 exit(-1);
778 }
779
780
781
782
783 static int remote_spawn(void)
784 {
785 int node_name_index1;
786 int proc_vpid_index;
787 char **argv = NULL;
788 char *prefix, *hostname, *var;
789 int argc;
790 int rc=ORTE_SUCCESS;
791 bool failed_launch = true;
792 orte_process_name_t target;
793 orte_plm_rsh_caddy_t *caddy;
794 orte_job_t *daemons;
795 opal_list_t coll;
796 orte_namelist_t *child;
797
798 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
799 "%s plm:rsh: remote spawn called",
800 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
801
802
803 target.vpid = ORTE_PROC_MY_NAME->vpid;
804
805
806
807
808 if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) {
809 prefix = strdup(opal_install_dirs.prefix);
810 } else {
811 prefix = NULL;
812 }
813
814
815 OBJ_CONSTRUCT(&coll, opal_list_t);
816 orte_routed.get_routing_list(&coll);
817
818
819 if (0 == opal_list_get_size(&coll)) {
820 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
821 "%s plm:rsh: remote spawn - have no children!",
822 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
823 failed_launch = false;
824 rc = ORTE_SUCCESS;
825 OBJ_DESTRUCT(&coll);
826 goto cleanup;
827 }
828
829
830 if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv,
831 orte_process_info.nodename, &node_name_index1,
832 &proc_vpid_index, prefix))) {
833 ORTE_ERROR_LOG(rc);
834 OBJ_DESTRUCT(&coll);
835 goto cleanup;
836 }
837
838
839 if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
840 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
841 rc = ORTE_ERR_NOT_FOUND;
842 OBJ_DESTRUCT(&coll);
843 goto cleanup;
844 }
845
846 target.jobid = ORTE_PROC_MY_NAME->jobid;
847 OPAL_LIST_FOREACH(child, &coll, orte_namelist_t) {
848 target.vpid = child->name.vpid;
849
850
851 if (NULL == (hostname = orte_get_proc_hostname(&target))) {
852 opal_output(0, "%s unable to get hostname for daemon %s",
853 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(child->name.vpid));
854 rc = ORTE_ERR_NOT_FOUND;
855 OBJ_DESTRUCT(&coll);
856 goto cleanup;
857 }
858
859 free(argv[node_name_index1]);
860 argv[node_name_index1] = strdup(hostname);
861
862
863 rc = orte_util_convert_vpid_to_string(&var, target.vpid);
864 if (ORTE_SUCCESS != rc) {
865 opal_output(0, "orte_plm_rsh: unable to get daemon vpid as string");
866 exit(-1);
867 }
868 free(argv[proc_vpid_index]);
869 argv[proc_vpid_index] = strdup(var);
870 free(var);
871
872
873 caddy = OBJ_NEW(orte_plm_rsh_caddy_t);
874 caddy->argc = argc;
875 caddy->argv = opal_argv_copy(argv);
876
877
878
879 caddy->daemon = OBJ_NEW(orte_proc_t);
880 caddy->daemon->name.jobid = ORTE_PROC_MY_NAME->jobid;
881 caddy->daemon->name.vpid = target.vpid;
882 opal_list_append(&launch_list, &caddy->super);
883 }
884 OPAL_LIST_DESTRUCT(&coll);
885
886
887
888 mca_plm_rsh_component.no_tree_spawn = true;
889
890
891 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
892 "%s plm:rsh: activating launch event",
893 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
894 opal_event_active(&launch_event, EV_WRITE, 1);
895
896
897 failed_launch = false;
898
899 cleanup:
900 if (NULL != argv) {
901 opal_argv_free(argv);
902 }
903
904
905 if (failed_launch) {
906
907 opal_buffer_t *buf;
908 buf = OBJ_NEW(opal_buffer_t);
909 opal_dss.pack(buf, &target.vpid, 1, ORTE_VPID);
910 opal_dss.pack(buf, &rc, 1, OPAL_INT);
911 orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
912 ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
913 orte_rml_send_callback, NULL);
914 }
915
916 return rc;
917 }
918
919
920
921
922
923
924 static int rsh_launch(orte_job_t *jdata)
925 {
926 if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
927
928 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
929 } else {
930
931 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT);
932 }
933 return ORTE_SUCCESS;
934 }
935
936 static void process_launch_list(int fd, short args, void *cbdata)
937 {
938 opal_list_item_t *item;
939 pid_t pid;
940 orte_plm_rsh_caddy_t *caddy;
941
942 ORTE_ACQUIRE_OBJECT(caddy);
943
944 while (num_in_progress < mca_plm_rsh_component.num_concurrent) {
945 item = opal_list_remove_first(&launch_list);
946 if (NULL == item) {
947
948 break;
949 }
950 caddy = (orte_plm_rsh_caddy_t*)item;
951
952 ORTE_FLAG_SET(caddy->daemon, ORTE_PROC_FLAG_ALIVE);
953 orte_wait_cb(caddy->daemon, rsh_wait_daemon, orte_event_base, (void*)caddy);
954
955
956 pid = fork();
957 if (pid < 0) {
958 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
959 orte_wait_cb_cancel(caddy->daemon);
960 continue;
961 }
962
963
964 if (pid == 0) {
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982 #if HAVE_SETPGID
983 if( 0 != setpgid(0, 0) ) {
984 opal_output(0, "plm:rsh: Error: setpgid(0,0) failed in child with errno=%s(%d)\n",
985 strerror(errno), errno);
986 exit(-1);
987 }
988 #endif
989
990
991 ssh_child(caddy->argc, caddy->argv);
992 } else {
993
994
995 #if HAVE_SETPGID
996 if( 0 != setpgid(pid, pid) ) {
997 opal_output(0, "plm:rsh: Warning: setpgid(%ld,%ld) failed in parent with errno=%s(%d)\n",
998 (long)pid, (long)pid, strerror(errno), errno);
999
1000
1001 }
1002 #endif
1003
1004
1005 caddy->daemon->state = ORTE_PROC_STATE_RUNNING;
1006
1007 caddy->daemon->pid = pid;
1008
1009 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1010 "%s plm:rsh: recording launch of daemon %s",
1011 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1012 ORTE_NAME_PRINT(&(caddy->daemon->name))));
1013 num_in_progress++;
1014 }
1015 }
1016 }
1017
1018 static void launch_daemons(int fd, short args, void *cbdata)
1019 {
1020 orte_job_map_t *map = NULL;
1021 int node_name_index1;
1022 int proc_vpid_index;
1023 char **argv = NULL;
1024 char *prefix_dir=NULL, *var;
1025 int argc;
1026 int rc;
1027 orte_app_context_t *app;
1028 orte_node_t *node, *nd;
1029 orte_std_cntr_t nnode;
1030 orte_job_t *daemons;
1031 orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
1032 orte_plm_rsh_caddy_t *caddy;
1033 opal_list_t coll;
1034 char *username;
1035 int port, *portptr;
1036 orte_namelist_t *child;
1037
1038 ORTE_ACQUIRE_OBJECT(state);
1039
1040
1041
1042
1043 if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
1044 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
1045 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1046 OBJ_RELEASE(state);
1047 return;
1048 }
1049
1050
1051 daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1052 if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
1053 ORTE_ERROR_LOG(rc);
1054 goto cleanup;
1055 }
1056
1057
1058
1059
1060
1061 if (orte_do_not_launch) {
1062
1063
1064
1065
1066 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
1067 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1068 OBJ_RELEASE(state);
1069 return;
1070 }
1071
1072
1073 if (NULL == (map = daemons->map)) {
1074 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1075 rc = ORTE_ERR_NOT_FOUND;
1076 goto cleanup;
1077 }
1078
1079 if (0 == map->num_new_daemons) {
1080
1081
1082
1083
1084 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
1085 ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1086 OBJ_RELEASE(state);
1087 return;
1088 }
1089
1090 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1091 "%s plm:rsh: launching vm",
1092 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1093
1094 if ((0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output) ||
1095 orte_leave_session_attached) &&
1096 mca_plm_rsh_component.num_concurrent < map->num_new_daemons) {
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110 orte_show_help("help-plm-rsh.txt", "deadlock-params",
1111 true, mca_plm_rsh_component.num_concurrent, map->num_new_daemons);
1112 ORTE_ERROR_LOG(ORTE_ERR_FATAL);
1113 OBJ_RELEASE(state);
1114 rc = ORTE_ERR_SILENT;
1115 goto cleanup;
1116 }
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135 app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, 0);
1136 if (!orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING)) {
1137
1138
1139
1140 if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) {
1141 prefix_dir = strdup(opal_install_dirs.prefix);
1142 }
1143 }
1144
1145
1146
1147 node = NULL;
1148 for (nnode = 0; nnode < map->nodes->size; nnode++) {
1149 if (NULL != (nd = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
1150 node = nd;
1151
1152
1153
1154
1155 if (0 != strcmp(node->name, orte_process_info.nodename)) {
1156 break;
1157 }
1158 }
1159 }
1160 if (NULL == node) {
1161
1162
1163 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1164 rc = ORTE_ERR_NOT_FOUND;
1165 goto cleanup;
1166 }
1167
1168
1169 if (!mca_plm_rsh_component.no_tree_spawn) {
1170 orte_job_t *jdatorted;
1171
1172
1173 if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
1174 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1175 rc = ORTE_ERR_NOT_FOUND;
1176 goto cleanup;
1177 }
1178
1179
1180 OBJ_CONSTRUCT(&coll, opal_list_t);
1181 orte_routed.get_routing_list(&coll);
1182 }
1183
1184
1185 if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
1186 &proc_vpid_index, prefix_dir))) {
1187 ORTE_ERROR_LOG(rc);
1188 goto cleanup;
1189 }
1190
1191
1192
1193
1194 for (nnode=0; nnode < map->nodes->size; nnode++) {
1195 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
1196 continue;
1197 }
1198
1199
1200 if (!mca_plm_rsh_component.no_tree_spawn) {
1201 OPAL_LIST_FOREACH(child, &coll, orte_namelist_t) {
1202 if (child->name.vpid == node->daemon->name.vpid) {
1203 goto launch;
1204 }
1205 }
1206
1207 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1208 "%s plm:rsh:launch daemon %s not a child of mine",
1209 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1210 ORTE_VPID_PRINT(node->daemon->name.vpid)));
1211 continue;
1212 }
1213
1214 launch:
1215
1216 if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
1217 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1218 "%s plm:rsh:launch daemon already exists on node %s",
1219 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1220 node->name));
1221 continue;
1222 }
1223
1224
1225
1226
1227 if (NULL == node->daemon) {
1228 ORTE_ERROR_LOG(ORTE_ERR_FATAL);
1229 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1230 "%s plm:rsh:launch daemon failed to be defined on node %s",
1231 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1232 node->name));
1233 continue;
1234 }
1235
1236
1237 free(argv[node_name_index1]);
1238 username = NULL;
1239 if (orte_get_attribute(&node->attributes, ORTE_NODE_USERNAME, (void**)&username, OPAL_STRING)) {
1240 opal_asprintf (&argv[node_name_index1], "%s@%s",
1241 username, node->name);
1242 free(username);
1243 } else {
1244 argv[node_name_index1] = strdup(node->name);
1245 }
1246
1247
1248 rc = orte_util_convert_vpid_to_string(&var, node->daemon->name.vpid);
1249 if (ORTE_SUCCESS != rc) {
1250 opal_output(0, "orte_plm_rsh: unable to get daemon vpid as string");
1251 exit(-1);
1252 }
1253 free(argv[proc_vpid_index]);
1254 argv[proc_vpid_index] = strdup(var);
1255 free(var);
1256
1257 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1258 "%s plm:rsh: adding node %s to launch list",
1259 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1260 node->name));
1261
1262
1263 caddy = OBJ_NEW(orte_plm_rsh_caddy_t);
1264 caddy->argc = argc;
1265 caddy->argv = opal_argv_copy(argv);
1266
1267 portptr = &port;
1268 if (orte_get_attribute(&node->attributes, ORTE_NODE_PORT, (void**)&portptr, OPAL_INT)) {
1269 char portname[16];
1270
1271 opal_argv_insert_element(&caddy->argv, node_name_index1+1, "-p");
1272 snprintf (portname, 15, "%d", port);
1273 opal_argv_insert_element(&caddy->argv, node_name_index1+2, portname);
1274 }
1275 caddy->daemon = node->daemon;
1276 OBJ_RETAIN(caddy->daemon);
1277 opal_list_append(&launch_list, &caddy->super);
1278 }
1279
1280
1281
1282 mca_plm_rsh_component.no_tree_spawn = true;
1283
1284
1285 state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
1286
1287
1288 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1289 "%s plm:rsh: activating launch event",
1290 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1291 ORTE_POST_OBJECT(state);
1292 opal_event_active(&launch_event, EV_WRITE, 1);
1293
1294
1295
1296
1297 OBJ_RELEASE(state);
1298 opal_argv_free(argv);
1299 return;
1300
1301 cleanup:
1302 OBJ_RELEASE(state);
1303 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
1304 }
1305
1306
1307
1308
1309 static int rsh_terminate_orteds(void)
1310 {
1311 int rc;
1312
1313 if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
1314 ORTE_ERROR_LOG(rc);
1315 }
1316
1317 return rc;
1318 }
1319
1320 static int rsh_finalize(void)
1321 {
1322 int rc, i;
1323 orte_job_t *jdata;
1324 orte_proc_t *proc;
1325 pid_t ret;
1326
1327
1328 opal_event_del(&launch_event);
1329 OPAL_LIST_DESTRUCT(&launch_list);
1330
1331
1332 if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
1333 ORTE_ERROR_LOG(rc);
1334 }
1335
1336 if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) && orte_abnormal_term_ordered) {
1337
1338 if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
1339 return rc;
1340 }
1341 for (i=0; i < jdata->procs->size; i++) {
1342 if (NULL == (proc = opal_pointer_array_get_item(jdata->procs, i))) {
1343 continue;
1344 }
1345 if (0 < proc->pid) {
1346
1347 ret = waitpid(proc->pid, &proc->exit_code, WNOHANG);
1348 if (-1 == ret && ECHILD == errno) {
1349
1350
1351 continue;
1352 }
1353 if (ret == proc->pid) {
1354
1355 continue;
1356 }
1357
1358 kill(proc->pid, SIGKILL);
1359 }
1360 }
1361 }
1362 free(mca_plm_rsh_component.agent_path);
1363 free(rsh_agent_path);
1364 opal_argv_free(mca_plm_rsh_component.agent_argv);
1365 opal_argv_free(rsh_agent_argv);
1366
1367 return rc;
1368 }
1369
1370
/**
 * Reset the disposition of a signal to SIG_DFL.
 *
 * Installs SIG_DFL with an empty signal mask and no flags.  The return
 * value of sigaction() is not examined.
 *
 * @param sig  Signal number whose handler is reset.
 */
static void set_handler_default(int sig)
{
    struct sigaction dfl;

    sigemptyset(&dfl.sa_mask);
    dfl.sa_flags = 0;
    dfl.sa_handler = SIG_DFL;

    sigaction(sig, &dfl, NULL);
}
1381
1382
1383 static orte_plm_rsh_shell_t find_shell(char *shell)
1384 {
1385 int i = 0;
1386 char *sh_name = NULL;
1387
1388 if( (NULL == shell) || (strlen(shell) == 1) ) {
1389
1390 return ORTE_PLM_RSH_SHELL_UNKNOWN;
1391 }
1392
1393 sh_name = rindex(shell, '/');
1394 if( NULL == sh_name ) {
1395
1396 return ORTE_PLM_RSH_SHELL_UNKNOWN;
1397 }
1398
1399
1400 ++sh_name;
1401 for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name) /
1402 sizeof(orte_plm_rsh_shell_name[0])); ++i) {
1403 if (NULL != strstr(sh_name, orte_plm_rsh_shell_name[i])) {
1404 return (orte_plm_rsh_shell_t)i;
1405 }
1406 }
1407
1408
1409 return ORTE_PLM_RSH_SHELL_UNKNOWN;
1410 }
1411
1412 static int launch_agent_setup(const char *agent, char *path)
1413 {
1414 char *bname;
1415 int i;
1416
1417
1418 if (NULL == mca_plm_rsh_component.agent && NULL == agent) {
1419 return ORTE_ERR_NOT_FOUND;
1420 }
1421
1422
1423 OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1424 "%s plm:rsh_setup on agent %s path %s",
1425 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1426 (NULL == agent) ? mca_plm_rsh_component.agent : agent,
1427 (NULL == path) ? "NULL" : path));
1428 rsh_agent_argv = orte_plm_rsh_search(agent, path);
1429
1430 if (0 == opal_argv_count(rsh_agent_argv)) {
1431
1432 return ORTE_ERR_NOT_FOUND;
1433 }
1434
1435
1436 rsh_agent_path = opal_path_findv(rsh_agent_argv[0], X_OK, environ, path);
1437
1438 if (NULL == rsh_agent_path) {
1439
1440 opal_argv_free(rsh_agent_argv);
1441 return ORTE_ERR_NOT_FOUND;
1442 }
1443
1444 bname = opal_basename(rsh_agent_argv[0]);
1445 if (NULL != bname && 0 == strcmp(bname, "ssh")) {
1446
1447 if (NULL != orte_xterm) {
1448 opal_argv_append_unique_nosize(&rsh_agent_argv, "-X", false);
1449 } else if (0 >= opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
1450
1451
1452
1453
1454 for (i = 1; NULL != rsh_agent_argv[i]; ++i) {
1455 if (0 == strcasecmp("-x", rsh_agent_argv[i])) {
1456 break;
1457 }
1458 }
1459 if (NULL == rsh_agent_argv[i]) {
1460 opal_argv_append_nosize(&rsh_agent_argv, "-x");
1461 }
1462 }
1463 }
1464 if (NULL != bname) {
1465 free(bname);
1466 }
1467
1468
1469 return ORTE_SUCCESS;
1470 }
1471
1472
1473
1474
1475 static int rsh_probe(char *nodename,
1476 orte_plm_rsh_shell_t *shell)
1477 {
1478 char ** argv;
1479 int argc, rc = ORTE_SUCCESS, i;
1480 int fd[2];
1481 pid_t pid;
1482 char outbuf[4096];
1483
1484 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1485 "%s plm:rsh: going to check SHELL variable on node %s",
1486 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1487 nodename));
1488
1489 *shell = ORTE_PLM_RSH_SHELL_UNKNOWN;
1490 if (pipe(fd)) {
1491 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1492 "%s plm:rsh: pipe failed with errno=%d",
1493 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1494 errno));
1495 return ORTE_ERR_IN_ERRNO;
1496 }
1497 if ((pid = fork()) < 0) {
1498 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1499 "%s plm:rsh: fork failed with errno=%d",
1500 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1501 errno));
1502 return ORTE_ERR_IN_ERRNO;
1503 }
1504 else if (pid == 0) {
1505 if (dup2(fd[1], 1) < 0) {
1506 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1507 "%s plm:rsh: dup2 failed with errno=%d",
1508 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1509 errno));
1510 exit(01);
1511 }
1512
1513 argv = opal_argv_copy(mca_plm_rsh_component.agent_argv);
1514 argc = opal_argv_count(mca_plm_rsh_component.agent_argv);
1515 opal_argv_append(&argc, &argv, nodename);
1516 opal_argv_append(&argc, &argv, "echo $SHELL");
1517
1518 execvp(argv[0], argv);
1519 exit(errno);
1520 }
1521 if (close(fd[1])) {
1522 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1523 "%s plm:rsh: close failed with errno=%d",
1524 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1525 errno));
1526 return ORTE_ERR_IN_ERRNO;
1527 }
1528
1529 {
1530 ssize_t ret = 1;
1531 char* ptr = outbuf;
1532 size_t outbufsize = sizeof(outbuf);
1533
1534 do {
1535 ret = read (fd[0], ptr, outbufsize-1);
1536 if (ret < 0) {
1537 if (errno == EINTR)
1538 continue;
1539 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1540 "%s plm:rsh: Unable to detect the remote shell (error %s)",
1541 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1542 strerror(errno)));
1543 rc = ORTE_ERR_IN_ERRNO;
1544 break;
1545 }
1546 if( outbufsize > 1 ) {
1547 outbufsize -= ret;
1548 ptr += ret;
1549 }
1550 } while( 0 != ret );
1551 *ptr = '\0';
1552 }
1553 close(fd[0]);
1554
1555 if( outbuf[0] != '\0' ) {
1556 char *sh_name = rindex(outbuf, '/');
1557 if( NULL != sh_name ) {
1558 sh_name++;
1559
1560 for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name)/
1561 sizeof(orte_plm_rsh_shell_name[0])); i++) {
1562 if ( NULL != strstr(sh_name, orte_plm_rsh_shell_name[i]) ) {
1563 *shell = (orte_plm_rsh_shell_t)i;
1564 break;
1565 }
1566 }
1567 }
1568 }
1569
1570 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1571 "%s plm:rsh: node %s has SHELL: %s",
1572 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1573 nodename,
1574 (ORTE_PLM_RSH_SHELL_UNKNOWN == *shell) ? "UNHANDLED" : (char*)orte_plm_rsh_shell_name[*shell]));
1575
1576 return rc;
1577 }
1578
1579 static int setup_shell(orte_plm_rsh_shell_t *rshell,
1580 orte_plm_rsh_shell_t *lshell,
1581 char *nodename, int *argc, char ***argv)
1582 {
1583 orte_plm_rsh_shell_t remote_shell, local_shell;
1584 char *param;
1585 int rc;
1586
1587
1588 local_shell = ORTE_PLM_RSH_SHELL_UNKNOWN;
1589
1590 #if OPAL_ENABLE_GETPWUID
1591 {
1592 struct passwd *p;
1593
1594 p = getpwuid(getuid());
1595 if( NULL != p ) {
1596 param = p->pw_shell;
1597 local_shell = find_shell(p->pw_shell);
1598 }
1599 }
1600 #endif
1601
1602
1603
1604
1605 if (ORTE_PLM_RSH_SHELL_UNKNOWN == local_shell &&
1606 NULL != (param = getenv("SHELL"))) {
1607 local_shell = find_shell(param);
1608 }
1609
1610 if (ORTE_PLM_RSH_SHELL_UNKNOWN == local_shell) {
1611 opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n",
1612 (NULL != param) ? param : "unknown");
1613 local_shell = ORTE_PLM_RSH_SHELL_BASH;
1614 }
1615
1616 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1617 "%s plm:rsh: local shell: %d (%s)",
1618 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1619 local_shell, orte_plm_rsh_shell_name[local_shell]));
1620
1621
1622 if (mca_plm_rsh_component.assume_same_shell) {
1623 remote_shell = local_shell;
1624 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1625 "%s plm:rsh: assuming same remote shell as local shell",
1626 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1627 } else {
1628 rc = rsh_probe(nodename, &remote_shell);
1629
1630 if (ORTE_SUCCESS != rc) {
1631 ORTE_ERROR_LOG(rc);
1632 return rc;
1633 }
1634
1635 if (ORTE_PLM_RSH_SHELL_UNKNOWN == remote_shell) {
1636 opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n");
1637 remote_shell = ORTE_PLM_RSH_SHELL_BASH;
1638 }
1639 }
1640
1641 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1642 "%s plm:rsh: remote shell: %d (%s)",
1643 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1644 remote_shell, orte_plm_rsh_shell_name[remote_shell]));
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654 if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
1655 ORTE_PLM_RSH_SHELL_KSH == remote_shell) {
1656 int i;
1657 char **tmp;
1658 tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' ');
1659 if (NULL == tmp) {
1660 return ORTE_ERR_OUT_OF_RESOURCE;
1661 }
1662 for (i = 0; NULL != tmp[i]; ++i) {
1663 opal_argv_append(argc, argv, tmp[i]);
1664 }
1665 opal_argv_free(tmp);
1666 }
1667
1668
1669 *rshell = remote_shell;
1670 *lshell = local_shell;
1671
1672 return ORTE_SUCCESS;
1673 }