This source file includes the following definitions.
- MPIR_Breakpoint
- tcon
- tdes
- orte_submit_init
- print_help
- orte_submit_finalize
- orte_submit_cancel
- orte_submit_halt
- orte_submit_job
- init_globals
- parse_globals
- parse_locals
- create_app
- set_classpath_jar_file
- parse_appfile
- launch_recv
- complete_recv
- orte_debugger_init_before_spawn
- _send_notification
- orte_debugger_dump
- setup_debugger_job
- orte_debugger_init_after_spawn
- process
- open_fifo
- attach_debugger
- build_debugger_args
- run_debugger
- orte_debugger_detached
- stack_trace_recv
- stack_trace_timeout
- orte_timeout_wakeup
- profile_timeout
- profile_recv
- orte_profile_wakeup
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 #include "orte_config.h"
29 #include "orte/constants.h"
30
31 #include <string.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #ifdef HAVE_STRINGS_H
35 #include <strings.h>
36 #endif
37 #ifdef HAVE_UNISTD_H
38 #include <unistd.h>
39 #endif
40 #ifdef HAVE_SYS_PARAM_H
41 #include <sys/param.h>
42 #endif
43 #include <errno.h>
44 #include <signal.h>
45 #include <ctype.h>
46 #ifdef HAVE_SYS_TYPES_H
47 #include <sys/types.h>
48 #endif
49 #ifdef HAVE_SYS_WAIT_H
50 #include <sys/wait.h>
51 #endif
52 #ifdef HAVE_SYS_TIME_H
53 #include <sys/time.h>
54 #endif
55 #include <fcntl.h>
56 #ifdef HAVE_SYS_STAT_H
57 #include <sys/stat.h>
58 #endif
59 #include <poll.h>
60
61 #include "opal/dss/dss.h"
62 #include "opal/mca/event/event.h"
63 #include "opal/mca/installdirs/installdirs.h"
64 #include "opal/mca/hwloc/base/base.h"
65 #include "opal/mca/base/base.h"
66 #include "opal/mca/pmix/pmix.h"
67 #include "opal/util/argv.h"
68 #include "opal/util/output.h"
69 #include "opal/util/basename.h"
70 #include "opal/util/cmd_line.h"
71 #include "opal/util/opal_environ.h"
72 #include "opal/util/opal_getcwd.h"
73 #include "opal/util/show_help.h"
74 #include "opal/util/fd.h"
75 #include "opal/util/string_copy.h"
76 #include "opal/sys/atomic.h"
77 #if OPAL_ENABLE_FT_CR == 1
78 #include "opal/runtime/opal_cr.h"
79 #endif
80
81 #include "opal/version.h"
82 #include "opal/runtime/opal.h"
83 #include "opal/runtime/opal_info_support.h"
84 #include "opal/util/os_path.h"
85 #include "opal/util/path.h"
86 #include "opal/class/opal_pointer_array.h"
87 #include "opal/dss/dss.h"
88
89 #include "orte/mca/odls/odls_types.h"
90 #include "orte/mca/plm/plm.h"
91 #include "orte/mca/rmaps/rmaps_types.h"
92 #include "orte/mca/rmaps/base/base.h"
93
94 #include "orte/mca/errmgr/errmgr.h"
95 #include "orte/mca/grpcomm/grpcomm.h"
96 #include "orte/mca/oob/base/base.h"
97 #include "orte/mca/plm/base/plm_private.h"
98 #include "orte/mca/rml/rml.h"
99 #include "orte/mca/rml/base/rml_contact.h"
100 #include "orte/mca/routed/routed.h"
101 #include "orte/mca/schizo/base/base.h"
102 #include "orte/mca/state/state.h"
103
104 #include "orte/runtime/runtime.h"
105 #include "orte/runtime/orte_globals.h"
106 #include "orte/runtime/orte_wait.h"
107 #include "orte/runtime/orte_quit.h"
108 #include "orte/util/pre_condition_transports.h"
109 #include "orte/util/show_help.h"
110
111 #include "orted_submit.h"
112
113
114
115
/*
 * File-scope state shared by the submit routines.
 */

/* Parsed command line options; zeroed at the start of orte_submit_init(). */
orte_cmd_options_t orte_cmd_options = {0};
/* Command line object: created in orte_submit_init() and released in
 * orte_submit_finalize(). */
opal_cmd_line_t *orte_cmd_line = NULL;

/* NOTE(review): the next two are not referenced in the visible part of this
 * file - their use has to be confirmed against the remainder. */
static char **global_mca_env = NULL;
static orte_std_cntr_t total_num_apps = 0;
/* Seeded from the build-time ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT setting;
 * orte_submit_init() reads it and sets it to true once a prefix was handled. */
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
/* Array of trackr_t objects, one per submitted job, indexed by tracker index. */
static opal_pointer_array_t tool_jobs;
/* Scratch value for timer durations; written by orte_submit_job() for both
 * the job timeout and the memory-profile timer. */
static int timeout_seconds;
/* Timer armed by orte_submit_job() when OMPI_MEMPROFILE is set. */
static orte_timer_t *orte_memprofile_timeout;

/* Debugger attach FIFO state; torn down in orte_submit_finalize(). */
int orte_debugger_attach_fd = -1;
bool orte_debugger_fifo_active=false;
opal_event_t *orte_debugger_attach=NULL;
129
130
131
132
/*
 * Forward declarations of the file-local helpers and receive/timer
 * callbacks that are defined further down in this file.
 */
static int create_app(int argc, char* argv[],
                      orte_job_t *jdata,
                      orte_app_context_t **app,
                      bool *made_app, char ***app_env);
static int init_globals(void);
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line);
static int parse_locals(orte_job_t *jdata, int argc, char* argv[]);
static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile);
static int parse_appfile(orte_job_t *jdata, char *filename, char ***env);
/* timer callbacks armed by orte_submit_job() */
static void orte_timeout_wakeup(int sd, short args, void *cbdata);
static void orte_profile_wakeup(int sd, short args, void *cbdata);
/* RML receive callbacks registered by orte_submit_init()/orte_submit_job() */
static void profile_recv(int status, orte_process_name_t* sender,
                         opal_buffer_t *buffer, orte_rml_tag_t tag,
                         void* cbdata);
static void launch_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t *buffer,
                        orte_rml_tag_t tag, void *cbdata);
static void complete_recv(int status, orte_process_name_t* sender,
                          opal_buffer_t *buffer,
                          orte_rml_tag_t tag, void *cbdata);
/* debugger support helpers */
static void attach_debugger(int fd, short event, void *arg);
static void build_debugger_args(orte_app_context_t *debugger);
static void open_fifo (void);
static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
                         int argc, char *argv[], int num_procs);
static void print_help(void);
159
160
/*
 * MPIR_* symbols.
 * NOTE(review): the names follow the MPIR debugger-interface convention, so
 * these are presumably inspected/modified by an attached debugger - confirm
 * against the MPIR specification before changing names, types or sizes.
 */
/* Sizes of the fixed path and argument buffers defined below. */
#define MPIR_MAX_PATH_LENGTH 512
#define MPIR_MAX_ARG_LENGTH 1024
struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;
volatile int MPIR_being_debugged = 0;
volatile int MPIR_debug_state = 0;
int MPIR_i_am_starter = 0;
int MPIR_partial_attach_ok = 1;
char MPIR_executable_path[MPIR_MAX_PATH_LENGTH] = {0};
char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH] = {0};
volatile int MPIR_forward_output = 0;
volatile int MPIR_forward_comm = 0;
/* Path of the attach FIFO; unlinked in orte_submit_finalize(). */
char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH] = {0};
int MPIR_force_to_main = 0;
/* Called by orte_submit_job() right before the job is spawned. */
static void orte_debugger_init_before_spawn(orte_job_t *jdata);

/* Declared with the "optnone" attribute macro - presumably so the compiler
 * does not optimize the (empty) function away; confirm. */
ORTE_DECLSPEC void* __opal_attribute_optnone__ MPIR_Breakpoint(void);
178
179
180
181
/*
 * Intentionally empty function that always returns NULL.
 * NOTE(review): by MPIR convention this is presumably the symbol on which a
 * debugger places its breakpoint - confirm before altering it.
 */
void *MPIR_Breakpoint(void)
{
    return NULL;
}
186
187
188 typedef struct {
189 opal_object_t super;
190 orte_job_t *jdata;
191 int index;
192 orte_submit_cbfunc_t launch_cb;
193 void *launch_cbdata;
194 orte_submit_cbfunc_t complete_cb;
195 void *complete_cbdata;
196 } trackr_t;
197 static void tcon(trackr_t *p)
198 {
199 p->jdata = NULL;
200 p->launch_cb = NULL;
201 p->launch_cbdata = NULL;
202 p->complete_cb = NULL;
203 p->complete_cbdata = NULL;
204 }
205 static void tdes(trackr_t *p)
206 {
207 if (NULL != p->jdata) {
208 OBJ_RELEASE(p->jdata);
209 }
210 }
211 static OBJ_CLASS_INSTANCE(trackr_t,
212 opal_object_t,
213 tcon, tdes);
214
215 int orte_submit_init(int argc, char *argv[],
216 opal_cmd_line_init_t *opts)
217 {
218 int rc, i;
219 char *param;
220
221
222 memset(&orte_cmd_options, 0, sizeof(orte_cmd_options));
223
224
225
226 orte_basename = opal_basename(argv[0]);
227
228
229 for (i=0; NULL != argv[i]; i++) {
230 if (':' == argv[i][0] ||
231 NULL == argv[i+1] || NULL == argv[i+2]) {
232 break;
233 }
234 if (0 == strncmp(argv[i], "-"OPAL_MCA_CMD_LINE_ID, strlen("-"OPAL_MCA_CMD_LINE_ID)) ||
235 0 == strncmp(argv[i], "--"OPAL_MCA_CMD_LINE_ID, strlen("--"OPAL_MCA_CMD_LINE_ID)) ||
236 0 == strncmp(argv[i], "-g"OPAL_MCA_CMD_LINE_ID, strlen("-g"OPAL_MCA_CMD_LINE_ID)) ||
237 0 == strncmp(argv[i], "--g"OPAL_MCA_CMD_LINE_ID, strlen("--g"OPAL_MCA_CMD_LINE_ID))) {
238 (void) mca_base_var_env_name (argv[i+1], ¶m);
239 opal_setenv(param, argv[i+2], true, &environ);
240 free(param);
241 } else if (0 == strcmp(argv[i], "-am") ||
242 0 == strcmp(argv[i], "--am")) {
243 (void)mca_base_var_env_name("mca_base_param_file_prefix", ¶m);
244 opal_setenv(param, argv[i+1], true, &environ);
245 free(param);
246 } else if (0 == strcmp(argv[i], "-tune") ||
247 0 == strcmp(argv[i], "--tune")) {
248 (void)mca_base_var_env_name("mca_base_envar_file_prefix", ¶m);
249 opal_setenv(param, argv[i+1], true, &environ);
250 free(param);
251 }
252 }
253
254
255 if (OPAL_SUCCESS != (rc = opal_init_util(&argc, &argv))) {
256 return rc;
257 }
258
259 if (ORTE_SUCCESS != (rc = mca_base_framework_open(&orte_schizo_base_framework, 0))) {
260 ORTE_ERROR_LOG(rc);
261 return rc;
262 }
263 if (ORTE_SUCCESS != (rc = orte_schizo_base_select())) {
264 ORTE_ERROR_LOG(rc);
265 return rc;
266 }
267
268 OBJ_CONSTRUCT(&tool_jobs, opal_pointer_array_t);
269 opal_pointer_array_init(&tool_jobs, 256, INT_MAX, 128);
270
271
272
273 orte_cmd_line = OBJ_NEW(opal_cmd_line_t);
274
275
276 if (NULL != opts) {
277 if (OPAL_SUCCESS != (rc = opal_cmd_line_add(orte_cmd_line, opts))) {
278 return rc;
279 }
280 }
281
282
283 if (OPAL_SUCCESS != (rc = orte_schizo.define_cli(orte_cmd_line))) {
284 return rc;
285 }
286
287
288 mca_base_cmd_line_setup(orte_cmd_line);
289
290
291 if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(orte_cmd_line,
292 true, false, argc, argv)) ) {
293 if (OPAL_ERR_SILENT != rc) {
294 fprintf(stderr, "%s: command line error (%s)\n", argv[0],
295 opal_strerror(rc));
296 }
297 return rc;
298 }
299
300
301
302
303 if (orte_cmd_options.version) {
304 char *str, *project_name = NULL;
305 if (0 == strcmp(orte_basename, "mpirun")) {
306 project_name = "Open MPI";
307 } else {
308 project_name = "OpenRTE";
309 }
310 str = opal_info_make_version_str("all",
311 OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
312 OPAL_RELEASE_VERSION,
313 OPAL_GREEK_VERSION,
314 OPAL_REPO_REV);
315 if (NULL != str) {
316 fprintf(stdout, "%s (%s) %s\n\nReport bugs to %s\n",
317 orte_basename, project_name, str, PACKAGE_BUGREPORT);
318 free(str);
319 }
320 exit(0);
321 }
322
323
324
325
326
327 if (0 == geteuid() && !orte_cmd_options.run_as_root) {
328
329 char *r1, *r2;
330 if (NULL != (r1 = getenv("OMPI_ALLOW_RUN_AS_ROOT")) &&
331 NULL != (r2 = getenv("OMPI_ALLOW_RUN_AS_ROOT_CONFIRM"))) {
332 if (0 == strcmp(r1, "1") && 0 == strcmp(r2, "1")) {
333 goto moveon;
334 }
335 }
336
337 fprintf(stderr, "--------------------------------------------------------------------------\n");
338 if (orte_cmd_options.help) {
339 fprintf(stderr, "%s cannot provide the help message when run as root.\n\n", orte_basename);
340 } else {
341 fprintf(stderr, "%s has detected an attempt to run as root.\n\n", orte_basename);
342 }
343
344 fprintf(stderr, "Running as root is *strongly* discouraged as any mistake (e.g., in\n");
345 fprintf(stderr, "defining TMPDIR) or bug can result in catastrophic damage to the OS\n");
346 fprintf(stderr, "file system, leaving your system in an unusable state.\n\n");
347
348 fprintf(stderr, "We strongly suggest that you run %s as a non-root user.\n\n", orte_basename);
349
350 fprintf(stderr, "You can override this protection by adding the --allow-run-as-root option\n");
351 fprintf(stderr, "to the cmd line or by setting two environment variables in the following way:\n");
352 fprintf(stderr, "the variable OMPI_ALLOW_RUN_AS_ROOT=1 to indicate the desire to override this\n");
353 fprintf(stderr, "protection, and OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 to confirm the choice and\n");
354 fprintf(stderr, "add one more layer of certainty that you want to do so.\n");
355 fprintf(stderr, "We reiterate our advice against doing so - please proceed at your own risk.\n");
356 fprintf(stderr, "--------------------------------------------------------------------------\n");
357 exit(1);
358 }
359
360 moveon:
361
362 rc = mca_base_cmd_line_process_args(orte_cmd_line, &environ, &environ);
363 if (ORTE_SUCCESS != rc) {
364 return rc;
365 }
366
367
368 if (OPAL_SUCCESS != (rc = opal_init(&argc, &argv))) {
369 return rc;
370 }
371
372
373 if (NULL != orte_cmd_options.help) {
374 print_help();
375
376
377 exit(0);
378 }
379
380
381 if (ORTE_PROC_TYPE_NONE == orte_process_info.proc_type) {
382
383
384 if (NULL == orte_cmd_options.hnp) {
385 orte_process_info.proc_type = ORTE_PROC_HNP;
386 } else {
387 orte_process_info.proc_type = ORTE_PROC_TOOL;
388 }
389 }
390 if (ORTE_PROC_IS_TOOL) {
391 if (0 == strncasecmp(orte_cmd_options.hnp, "file", strlen("file"))) {
392 char input[1024], *filename;
393 FILE *fp;
394
395
396 filename = strchr(orte_cmd_options.hnp, ':');
397 if (NULL == filename) {
398
399 orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_options.hnp);
400 exit(1);
401 }
402 ++filename;
403
404 if (0 >= strlen(filename)) {
405
406 orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_options.hnp);
407 exit(1);
408 }
409
410
411 fp = fopen(filename, "r");
412 if (NULL == fp) {
413 orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, orte_cmd_options.hnp);
414 exit(1);
415 }
416
417
418 memset(input, 0, 1024);
419 if (NULL == fgets(input, 1024, fp)) {
420
421 fclose(fp);
422 orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, orte_cmd_options.hnp);
423 exit(1);
424 }
425 fclose(fp);
426 input[strlen(input)-1] = '\0';
427
428 opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", input, true, &environ);
429 } else {
430
431 opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", orte_cmd_options.hnp, true, &environ);
432 }
433
434
435 opal_setenv(OPAL_MCA_PREFIX"ess", "tool", true, &environ);
436 } else {
437
438
439
440
441
442
443
444
445
446
447 orte_cmd_options.prefix = NULL;
448 orte_cmd_options.path_to_mpirun = NULL;
449 if (opal_cmd_line_is_taken(orte_cmd_line, "prefix") ||
450 '/' == argv[0][0] || want_prefix_by_default) {
451 size_t param_len;
452 if ('/' == argv[0][0]) {
453 char* tmp_basename = NULL;
454
455
456 orte_cmd_options.path_to_mpirun = opal_dirname(argv[0]);
457
458
459
460
461 tmp_basename = opal_basename(orte_cmd_options.path_to_mpirun);
462 if (0 == strcmp("bin", tmp_basename)) {
463 char* tmp = orte_cmd_options.path_to_mpirun;
464 orte_cmd_options.path_to_mpirun = opal_dirname(tmp);
465 free(tmp);
466 } else {
467 free(orte_cmd_options.path_to_mpirun);
468 orte_cmd_options.path_to_mpirun = NULL;
469 }
470 free(tmp_basename);
471 }
472
473 if (opal_cmd_line_is_taken(orte_cmd_line, "prefix") &&
474 NULL != orte_cmd_options.path_to_mpirun) {
475 char *tmp_basename;
476
477 param = strdup(opal_cmd_line_get_param(orte_cmd_line, "prefix", 0, 0));
478
479 if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) {
480 param[strlen(param)-1] = '\0';
481 }
482 tmp_basename = strdup(orte_cmd_options.path_to_mpirun);
483 if (0 == strcmp(OPAL_PATH_SEP, &(tmp_basename[strlen(tmp_basename)-1]))) {
484 tmp_basename[strlen(tmp_basename)-1] = '\0';
485 }
486 if (0 != strcmp(param, tmp_basename)) {
487 orte_show_help("help-orterun.txt", "orterun:double-prefix",
488 true, orte_basename, orte_basename,
489 param, tmp_basename, orte_basename);
490
491
492
493
494 free(orte_cmd_options.path_to_mpirun);
495 orte_cmd_options.path_to_mpirun = NULL;
496 }
497 free(tmp_basename);
498 } else if (NULL != orte_cmd_options.path_to_mpirun) {
499 param = strdup(orte_cmd_options.path_to_mpirun);
500 } else if (opal_cmd_line_is_taken(orte_cmd_line, "prefix")){
501
502 param = strdup(opal_cmd_line_get_param(orte_cmd_line, "prefix", 0, 0));
503 } else {
504
505 param = strdup(opal_install_dirs.prefix);
506 }
507
508 if (NULL != param) {
509
510 param_len = strlen(param);
511 while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
512 param[param_len-1] = '\0';
513 param_len--;
514 if (0 == param_len) {
515 orte_show_help("help-orterun.txt", "orterun:empty-prefix",
516 true, orte_basename, orte_basename);
517 free(param);
518 return ORTE_ERR_FATAL;
519 }
520 }
521
522 orte_cmd_options.prefix = param;
523 }
524 want_prefix_by_default = true;
525 }
526 }
527
528
529 orte_register_params();
530
531 if (orte_cmd_options.debug) {
532 orte_devel_level_output = true;
533 }
534
535
536
537
538
539
540
541 if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv,
542 orte_process_info.proc_type))) {
543
544
545
546 return rc;
547 }
548
549
550
551 opal_finalize();
552
553 if (ORTE_PROC_IS_TOOL) {
554 opal_value_t val;
555
556 if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) {
557 orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
558 exit(1);
559 }
560
561 OBJ_CONSTRUCT(&val, opal_value_t);
562 val.key = OPAL_PMIX_PROC_URI;
563 val.type = OPAL_STRING;
564 val.data.string = orte_process_info.my_hnp_uri;
565 if (OPAL_SUCCESS != opal_pmix.store_local(ORTE_PROC_MY_HNP, &val)) {
566 val.key = NULL;
567 val.data.string = NULL;
568 OBJ_DESTRUCT(&val);
569 orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
570 orte_finalize();
571 exit(1);
572 }
573 val.key = NULL;
574 val.data.string = NULL;
575 OBJ_DESTRUCT(&val);
576
577
578 if (ORTE_SUCCESS != orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) {
579 orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
580 orte_finalize();
581 exit(1);
582 }
583
584
585 orte_routed.set_lifeline(ORTE_PROC_MY_HNP);
586
587
588 orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFY_COMPLETE,
589 ORTE_RML_PERSISTENT, complete_recv, NULL);
590 orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP,
591 ORTE_RML_PERSISTENT, launch_recv, NULL);
592 } else {
593
594
595
596
597
598
599 orte_launch_environ = opal_argv_copy(environ);
600
601
602 opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
603 opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ);
604 }
605
606 return ORTE_SUCCESS;
607 }
608
609 static void print_help()
610 {
611 char *str = NULL, *args;
612 char *project_name = NULL;
613
614 if (0 == strcmp(orte_basename, "mpirun")) {
615 project_name = "Open MPI";
616 } else {
617 project_name = "OpenRTE";
618 }
619 args = opal_cmd_line_get_usage_msg(orte_cmd_line);
620 str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
621 orte_basename, project_name, OPAL_VERSION,
622 orte_basename, args,
623 PACKAGE_BUGREPORT);
624 if (NULL != str) {
625 printf("%s", str);
626 free(str);
627 }
628 free(args);
629 }
630
631 void orte_submit_finalize(void)
632 {
633 trackr_t *trk;
634 int i, rc;
635
636 for (i=0; i < tool_jobs.size; i++) {
637 if (NULL != (trk = (trackr_t*)opal_pointer_array_get_item(&tool_jobs, i))) {
638 OBJ_RELEASE(trk);
639 }
640 }
641 OBJ_DESTRUCT(&tool_jobs);
642
643
644 if (ORTE_SUCCESS != (rc = mca_base_framework_close(&orte_schizo_base_framework))) {
645 ORTE_ERROR_LOG(rc);
646 return;
647 }
648
649
650 if (OPAL_SUCCESS != (rc = opal_finalize_util())) {
651 return;
652 }
653
654
655 if (NULL != orte_cmd_line) {
656 OBJ_RELEASE(orte_cmd_line);
657 }
658
659
660 if (0 <= orte_debugger_attach_fd) {
661 if (orte_debugger_fifo_active) {
662 opal_event_del(orte_debugger_attach);
663 free(orte_debugger_attach);
664 }
665 close(orte_debugger_attach_fd);
666 unlink(MPIR_attach_fifo);
667 }
668
669 if (NULL != orte_cmd_options.prefix) {
670 free(orte_cmd_options.prefix);
671 }
672 if (NULL != orte_launch_environ) {
673 opal_argv_free(orte_launch_environ);
674 }
675 if (NULL != orte_basename) {
676 free(orte_basename);
677 }
678 }
679
680 int orte_submit_cancel(int index) {
681
682 int rc;
683 trackr_t *trk;
684 opal_buffer_t *req;
685 orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_TERMINATE_JOB_CMD;
686
687
688 if (NULL == (trk = (trackr_t*)opal_pointer_array_get_item(&tool_jobs, index))) {
689 opal_output(0, "TRACKER ID %d RETURNED INDEX TO NULL OBJECT", index);
690 return ORTE_ERROR;
691 }
692
693
694 req = OBJ_NEW(opal_buffer_t);
695 if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &cmd, 1, ORTE_DAEMON_CMD))) {
696 ORTE_ERROR_LOG(rc);
697 return rc;
698 }
699 if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &trk->jdata->jobid, 1, ORTE_JOBID))) {
700 ORTE_ERROR_LOG(rc);
701 return rc;
702 }
703 rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON,
704 orte_rml_send_callback, NULL);
705 if (ORTE_SUCCESS != rc) {
706 ORTE_ERROR_LOG(rc);
707 OBJ_RELEASE(req);
708 return rc;
709 }
710
711 return ORTE_ERR_OP_IN_PROGRESS;
712 }
713
714
715 int orte_submit_halt(void)
716 {
717 int rc;
718 opal_buffer_t *req;
719 orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_HALT_DVM_CMD;
720
721 req = OBJ_NEW(opal_buffer_t);
722 if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &cmd, 1, ORTE_DAEMON_CMD))) {
723 ORTE_ERROR_LOG(rc);
724 return rc;
725 }
726 rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req,
727 ORTE_RML_TAG_DAEMON,
728 orte_rml_send_callback, NULL);
729 if (ORTE_SUCCESS != rc) {
730 ORTE_ERROR_LOG(rc);
731 OBJ_RELEASE(req);
732 return rc;
733 }
734
735 return ORTE_ERR_OP_IN_PROGRESS;
736 }
737
738
739
740
741 int orte_submit_job(char *argv[], int *index,
742 orte_submit_cbfunc_t launch_cb,
743 void *launch_cbdata,
744 orte_submit_cbfunc_t complete_cb,
745 void *complete_cbdata)
746 {
747 opal_buffer_t *req;
748 int rc, n;
749 orte_app_idx_t i;
750 orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_SPAWN_JOB_CMD;
751 char *param;
752 orte_job_t *jdata = NULL, *daemons;
753 orte_app_context_t *app, *dapp;
754 trackr_t *trk;
755 int argc;
756
757
758 if (NULL != getenv("OMPI_UNIVERSE_SIZE")) {
759 fprintf(stderr, "\n\n**********************************************************\n\n");
760 fprintf(stderr, "%s does not support recursive calls\n", orte_basename);
761 fprintf(stderr, "\n**********************************************************\n");
762 return ORTE_ERR_FATAL;
763 }
764
765
766
767 init_globals();
768
769 argc = opal_argv_count(argv);
770
771
772
773 if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(orte_cmd_line, true, false,
774 argc, argv)) ) {
775 if (OPAL_ERR_SILENT != rc) {
776 fprintf(stderr, "%s: command line error (%s)\n", argv[0],
777 opal_strerror(rc));
778 }
779 return rc;
780 }
781
782
783 parse_globals(argc, argv, orte_cmd_line);
784
785
786
787
788
789 jdata = OBJ_NEW(orte_job_t);
790 if (NULL == jdata) {
791
792
793
794 return ORTE_ERR_OUT_OF_RESOURCE;
795 }
796
797 if (NULL != orte_cmd_options.personality) {
798 jdata->personality = opal_argv_split(orte_cmd_options.personality, ',');
799 } else {
800
801 opal_argv_append_nosize(&jdata->personality, "ompi");
802 }
803
804 trk = OBJ_NEW(trackr_t);
805 trk->jdata = jdata;
806 trk->launch_cb = launch_cb;
807 trk->launch_cbdata = launch_cbdata;
808 trk->complete_cb = complete_cb;
809 trk->complete_cbdata = complete_cbdata;
810 trk->index = opal_pointer_array_add(&tool_jobs, trk);
811
812
813
814 orte_set_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, ORTE_ATTR_GLOBAL, &trk->index, OPAL_INT);
815
816
817
818 if (orte_cmd_options.tag_output) {
819 orte_set_attribute(&jdata->attributes, ORTE_JOB_TAG_OUTPUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
820 }
821
822 if (orte_cmd_options.timestamp_output) {
823 orte_set_attribute(&jdata->attributes, ORTE_JOB_TIMESTAMP_OUTPUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
824 }
825
826 if (NULL != orte_cmd_options.output_filename) {
827
828
829
830
831 if (!opal_path_is_absolute(orte_cmd_options.output_filename)) {
832 char cwd[OPAL_PATH_MAX], *path;
833 getcwd(cwd, sizeof(cwd));
834 path = opal_os_path(false, cwd, orte_cmd_options.output_filename, NULL);
835 orte_set_attribute(&jdata->attributes, ORTE_JOB_OUTPUT_TO_FILE, ORTE_ATTR_GLOBAL, path, OPAL_STRING);
836 free(path);
837 } else {
838 orte_set_attribute(&jdata->attributes, ORTE_JOB_OUTPUT_TO_FILE, ORTE_ATTR_GLOBAL, orte_cmd_options.output_filename, OPAL_STRING);
839 }
840 }
841
842 if (orte_cmd_options.merge) {
843 orte_set_attribute(&jdata->attributes, ORTE_JOB_MERGE_STDERR_STDOUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
844 }
845
846
847 if (NULL != orte_cmd_options.stdin_target) {
848 if (0 == strcmp(orte_cmd_options.stdin_target, "all")) {
849 jdata->stdin_target = ORTE_VPID_WILDCARD;
850 } else if (0 == strcmp(orte_cmd_options.stdin_target, "none")) {
851 jdata->stdin_target = ORTE_VPID_INVALID;
852 } else {
853 jdata->stdin_target = strtoul(orte_cmd_options.stdin_target, NULL, 10);
854 }
855 }
856
857
858 if (orte_cmd_options.index_argv) {
859 orte_set_attribute(&jdata->attributes, ORTE_JOB_INDEX_ARGV, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
860 }
861
862
863 parse_locals(jdata, argc, argv);
864
865 if (0 == jdata->num_apps) {
866
867
868 orte_show_help("help-orterun.txt", "orterun:nothing-to-do",
869 true, orte_basename);
870 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
871 return ORTE_ERR_FATAL;
872 }
873
874
875 jdata->map = OBJ_NEW(orte_job_map_t);
876
877 if (NULL != orte_cmd_options.mapping_policy) {
878 if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(jdata, &jdata->map->mapping, NULL, orte_cmd_options.mapping_policy))) {
879 ORTE_ERROR_LOG(rc);
880 return rc;
881 }
882 } else if (orte_cmd_options.pernode) {
883 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_PPR);
884 ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN);
885
886 jdata->map->ppr = strdup("1:node");
887 } else if (0 < orte_cmd_options.npernode) {
888 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_PPR);
889 ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN);
890
891 opal_asprintf(&jdata->map->ppr, "%d:node", orte_cmd_options.npernode);
892 } else if (0 < orte_cmd_options.npersocket) {
893 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_PPR);
894 ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN);
895
896 opal_asprintf(&jdata->map->ppr, "%d:socket", orte_cmd_options.npersocket);
897 }
898
899
900
901 if (0 < orte_cmd_options.cpus_per_proc) {
902 jdata->map->cpus_per_rank = orte_cmd_options.cpus_per_proc;
903 }
904
905 if (NULL != orte_cmd_options.ranking_policy) {
906 if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&jdata->map->ranking,
907 jdata->map->mapping,
908 orte_cmd_options.ranking_policy))) {
909 ORTE_ERROR_LOG(rc);
910 return rc;
911 }
912 }
913
914 if (NULL != orte_cmd_options.binding_policy) {
915 if (ORTE_SUCCESS != (rc = opal_hwloc_base_set_binding_policy(&jdata->map->binding,
916 orte_cmd_options.binding_policy))) {
917 ORTE_ERROR_LOG(rc);
918 return rc;
919 }
920 }
921
922
923 if (orte_cmd_options.nolocal) {
924 ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_USE_LOCAL);
925 }
926 if (orte_cmd_options.no_oversubscribe) {
927 ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
928 }
929 if (orte_cmd_options.oversubscribe) {
930 ORTE_UNSET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
931 ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
932 }
933 if (orte_cmd_options.report_bindings) {
934 orte_set_attribute(&jdata->attributes, ORTE_JOB_REPORT_BINDINGS, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
935 }
936 if (orte_cmd_options.cpu_list) {
937 orte_set_attribute(&jdata->attributes, ORTE_JOB_CPU_LIST, ORTE_ATTR_GLOBAL, orte_cmd_options.cpu_list, OPAL_STRING);
938 }
939
940
941 if (orte_enable_recovery) {
942 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_RECOVERABLE);
943 if (0 == orte_max_restarts) {
944
945 orte_set_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
946 }
947 }
948
949 if (0 < orte_max_restarts) {
950 for (i=0; i < jdata->num_apps; i++) {
951 if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
952 orte_set_attribute(&app->attributes, ORTE_APP_MAX_RESTARTS, ORTE_ATTR_GLOBAL, &orte_max_restarts, OPAL_INT32);
953 }
954 }
955 }
956
957 if (orte_cmd_options.continuous) {
958
959 orte_set_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
960 }
961
962
963 if (NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
964 char *evar;
965 evar = getenv("ORTE_TEST_DEBUGGER_SLEEP");
966 for (n=0; n < (int)jdata->num_apps; n++) {
967 if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
968 opal_setenv("ORTE_TEST_DEBUGGER_ATTACH", "1", true, &app->env);
969 if (NULL != evar) {
970 opal_setenv("ORTE_TEST_DEBUGGER_SLEEP", evar, true, &app->env);
971 }
972 }
973 }
974 }
975
976
977 if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
978 NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
979
980
981 ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT);
982 }
983
984
985
986
987 param = NULL;
988 if (0 < orte_cmd_options.timeout ||
989 NULL != (param = getenv("MPIEXEC_TIMEOUT"))) {
990 if (NULL != param) {
991 timeout_seconds = strtol(param, NULL, 10);
992
993 if (0 < orte_cmd_options.timeout && timeout_seconds != orte_cmd_options.timeout) {
994 orte_show_help("help-orterun.txt", "orterun:timeoutconflict", false,
995 orte_basename, orte_cmd_options.timeout, param);
996 exit(1);
997 }
998 } else {
999 timeout_seconds = orte_cmd_options.timeout;
1000 }
1001 if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) {
1002 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
1003 ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE);
1004
1005 }
1006 orte_mpiexec_timeout->tv.tv_sec = timeout_seconds;
1007 orte_mpiexec_timeout->tv.tv_usec = 0;
1008 opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev,
1009 orte_timeout_wakeup, jdata);
1010 opal_event_set_priority(orte_mpiexec_timeout->ev, ORTE_ERROR_PRI);
1011 opal_event_evtimer_add(orte_mpiexec_timeout->ev, &orte_mpiexec_timeout->tv);
1012 }
1013
1014
1015 if (NULL != (param = getenv("OMPI_MEMPROFILE"))) {
1016 timeout_seconds = strtol(param, NULL, 10);
1017 if (NULL == (orte_memprofile_timeout = OBJ_NEW(orte_timer_t))) {
1018 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
1019 ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE);
1020
1021 }
1022 orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MEMPROFILE,
1023 ORTE_RML_PERSISTENT, profile_recv, NULL);
1024 orte_memprofile_timeout->tv.tv_sec = timeout_seconds;
1025 orte_memprofile_timeout->tv.tv_usec = 0;
1026 opal_event_evtimer_set(orte_event_base, orte_memprofile_timeout->ev,
1027 orte_profile_wakeup, jdata);
1028 opal_event_set_priority(orte_memprofile_timeout->ev, ORTE_ERROR_PRI);
1029 opal_event_evtimer_add(orte_memprofile_timeout->ev, &orte_memprofile_timeout->tv);
1030 }
1031 if (ORTE_PROC_IS_HNP) {
1032
1033 daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1034
1035
1036 if (NULL != orte_cmd_options.report_uri) {
1037 FILE *fp;
1038 char *rml_uri;
1039 orte_oob_base_get_addr(&rml_uri);
1040 if (0 == strcmp(orte_cmd_options.report_uri, "-")) {
1041
1042 printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
1043 } else if (0 == strcmp(orte_cmd_options.report_uri, "+")) {
1044
1045 fprintf(stderr, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
1046 } else {
1047 fp = fopen(orte_cmd_options.report_uri, "w");
1048 if (NULL == fp) {
1049 orte_show_help("help-orterun.txt", "orterun:write_file", false,
1050 orte_basename, "uri", orte_cmd_options.report_uri);
1051 exit(1);
1052 }
1053 fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
1054 fclose(fp);
1055 }
1056 if (NULL != rml_uri) {
1057 free(rml_uri);
1058 }
1059 }
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069 param = NULL;
1070 if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0)) &&
1071 orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)¶m, OPAL_STRING)) {
1072 char *oldenv, *newenv, *lib_base, *bin_base;
1073
1074
1075
1076
1077 if (NULL == (dapp = (orte_app_context_t*)opal_pointer_array_get_item(daemons->apps, 0))) {
1078
1079 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1080 return ORTE_ERR_NOT_FOUND;
1081 }
1082 orte_set_attribute(&dapp->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_LOCAL, param, OPAL_STRING);
1083
1084 lib_base = opal_basename(opal_install_dirs.libdir);
1085 bin_base = opal_basename(opal_install_dirs.bindir);
1086
1087
1088 newenv = opal_os_path( false, param, bin_base, NULL );
1089 oldenv = getenv("PATH");
1090 if (NULL != oldenv) {
1091 char *temp;
1092 opal_asprintf(&temp, "%s:%s", newenv, oldenv );
1093 free( newenv );
1094 newenv = temp;
1095 }
1096 opal_setenv("PATH", newenv, true, &orte_launch_environ);
1097 if (orte_debug_flag) {
1098 opal_output(0, "%s: reset PATH: %s", orte_basename, newenv);
1099 }
1100 free(newenv);
1101 free(bin_base);
1102
1103
1104 newenv = opal_os_path( false, param, lib_base, NULL );
1105 oldenv = getenv("LD_LIBRARY_PATH");
1106 if (NULL != oldenv) {
1107 char* temp;
1108 opal_asprintf(&temp, "%s:%s", newenv, oldenv);
1109 free(newenv);
1110 newenv = temp;
1111 }
1112 opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ);
1113 if (orte_debug_flag) {
1114 opal_output(0, "%s: reset LD_LIBRARY_PATH: %s",
1115 orte_basename, newenv);
1116 }
1117 free(newenv);
1118 free(lib_base);
1119 free(param);
1120 }
1121
1122
1123 orte_debugger_init_before_spawn(jdata);
1124
1125 rc = orte_plm.spawn(jdata);
1126 } else {
1127
1128 orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1129
1130
1131
1132 orte_set_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1133
1134 req = OBJ_NEW(opal_buffer_t);
1135 if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &cmd, 1, ORTE_DAEMON_CMD))) {
1136 ORTE_ERROR_LOG(rc);
1137 return rc;
1138 }
1139 if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &jdata, 1, ORTE_JOB))) {
1140 ORTE_ERROR_LOG(rc);
1141 return rc;
1142 }
1143 if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &trk->index, 1, OPAL_INT))) {
1144 ORTE_ERROR_LOG(rc);
1145 return rc;
1146 }
1147 orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON,
1148 orte_rml_send_callback, NULL);
1149
1150
1151 if (NULL != index) {
1152 *index = trk->index;
1153 }
1154 }
1155
1156 return ORTE_SUCCESS;
1157
1158 }
1159
1160
1161 static int init_globals(void)
1162 {
1163
1164 orte_cmd_options.help = NULL;
1165 orte_cmd_options.version = false;
1166 orte_cmd_options.num_procs = 0;
1167 if (NULL != orte_cmd_options.appfile) {
1168 free(orte_cmd_options.appfile);
1169 orte_cmd_options.appfile = NULL;
1170 }
1171 if (NULL != orte_cmd_options.wdir) {
1172 free(orte_cmd_options.wdir);
1173 orte_cmd_options.wdir = NULL;
1174 }
1175 orte_cmd_options.set_cwd_to_session_dir = false;
1176 if (NULL != orte_cmd_options.path) {
1177 free(orte_cmd_options.path);
1178 orte_cmd_options.path = NULL;
1179 }
1180 if (NULL != orte_cmd_options.hnp) {
1181 free(orte_cmd_options.hnp);
1182 orte_cmd_options.hnp = NULL;
1183 }
1184 if (NULL != orte_cmd_options.stdin_target) {
1185 free(orte_cmd_options.stdin_target);
1186 orte_cmd_options.stdin_target = NULL ;
1187 }
1188 if (NULL != orte_cmd_options.output_filename) {
1189 free(orte_cmd_options.output_filename);
1190 orte_cmd_options.output_filename = NULL ;
1191 }
1192 if (NULL != orte_cmd_options.binding_policy) {
1193 free(orte_cmd_options.binding_policy);
1194 orte_cmd_options.binding_policy = NULL;
1195 }
1196 if (NULL != orte_cmd_options.mapping_policy) {
1197 free(orte_cmd_options.mapping_policy);
1198 orte_cmd_options.mapping_policy = NULL;
1199 }
1200 if (NULL != orte_cmd_options.ranking_policy) {
1201 free(orte_cmd_options.ranking_policy);
1202 orte_cmd_options.ranking_policy = NULL;
1203 }
1204
1205 if (NULL != orte_cmd_options.report_pid) {
1206 free(orte_cmd_options.report_pid);
1207 orte_cmd_options.report_pid = NULL;
1208 }
1209 if (NULL != orte_cmd_options.report_uri) {
1210 free(orte_cmd_options.report_uri);
1211 orte_cmd_options.report_uri = NULL;
1212 }
1213 if (NULL != orte_cmd_options.cpu_list) {
1214 free(orte_cmd_options.cpu_list);
1215 orte_cmd_options.cpu_list= NULL;
1216 }
1217 orte_cmd_options.preload_binaries = false;
1218 if (NULL != orte_cmd_options.preload_files) {
1219 free(orte_cmd_options.preload_files);
1220 orte_cmd_options.preload_files = NULL;
1221 }
1222
1223
1224
1225 return ORTE_SUCCESS;
1226 }
1227
1228
1229 static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
1230 {
1231
1232 if (NULL != orte_cmd_options.report_pid) {
1233 FILE *fp;
1234 if (0 == strcmp(orte_cmd_options.report_pid, "-")) {
1235
1236 printf("%d\n", (int)getpid());
1237 } else if (0 == strcmp(orte_cmd_options.report_pid, "+")) {
1238
1239 fprintf(stderr, "%d\n", (int)getpid());
1240 } else {
1241 fp = fopen(orte_cmd_options.report_pid, "w");
1242 if (NULL == fp) {
1243 orte_show_help("help-orterun.txt", "orterun:write_file", false,
1244 orte_basename, "pid", orte_cmd_options.report_pid);
1245 exit(0);
1246 }
1247 fprintf(fp, "%d\n", (int)getpid());
1248 fclose(fp);
1249 }
1250 }
1251
1252
1253
1254 if (orte_cmd_options.debugger) {
1255 run_debugger(orte_basename, cmd_line, argc, argv, orte_cmd_options.num_procs);
1256 }
1257
1258 return ORTE_SUCCESS;
1259 }
1260
1261
1262 static int parse_locals(orte_job_t *jdata, int argc, char* argv[])
1263 {
1264 int i, rc, app_num;
1265 int temp_argc;
1266 char **temp_argv, **env;
1267 orte_app_context_t *app;
1268 bool made_app;
1269 orte_std_cntr_t j, size1;
1270
1271
1272 temp_argc = 0;
1273 temp_argv = NULL;
1274 opal_argv_append(&temp_argc, &temp_argv, argv[0]);
1275
1276
1277
1278
1279
1280 env = NULL;
1281 for (app_num = 0, i = 1; i < argc; ++i) {
1282 if (0 == strcmp(argv[i], ":")) {
1283
1284 if (opal_argv_count(temp_argv) > 1) {
1285 if (NULL != env) {
1286 opal_argv_free(env);
1287 env = NULL;
1288 }
1289 app = NULL;
1290 rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env);
1291
1292 if (ORTE_SUCCESS != rc) {
1293
1294
1295
1296 exit(1);
1297 }
1298 if (made_app) {
1299 app->idx = app_num;
1300 ++app_num;
1301 opal_pointer_array_add(jdata->apps, app);
1302 ++jdata->num_apps;
1303 }
1304
1305
1306
1307 temp_argc = 0;
1308 temp_argv = NULL;
1309 opal_argv_append(&temp_argc, &temp_argv, argv[0]);
1310 }
1311 } else {
1312 opal_argv_append(&temp_argc, &temp_argv, argv[i]);
1313 }
1314 }
1315
1316 if (opal_argv_count(temp_argv) > 1) {
1317 app = NULL;
1318 rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env);
1319 if (ORTE_SUCCESS != rc) {
1320
1321
1322 exit(1);
1323 }
1324 if (made_app) {
1325 app->idx = app_num;
1326 ++app_num;
1327 opal_pointer_array_add(jdata->apps, app);
1328 ++jdata->num_apps;
1329 }
1330 }
1331 if (NULL != env) {
1332 opal_argv_free(env);
1333 }
1334 opal_argv_free(temp_argv);
1335
1336
1337
1338
1339
1340 if (NULL != global_mca_env) {
1341 size1 = (size_t)opal_pointer_array_get_size(jdata->apps);
1342
1343 for (j = 0; j < size1; ++j) {
1344 app = (orte_app_context_t *)
1345 opal_pointer_array_get_item(jdata->apps, j);
1346 if (NULL != app) {
1347
1348 env = opal_environ_merge(global_mca_env, app->env);
1349 opal_argv_free(app->env);
1350 app->env = env;
1351 }
1352 }
1353 }
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366 env = NULL;
1367 if (NULL != global_mca_env) {
1368 env = global_mca_env;
1369 } else {
1370 if (opal_pointer_array_get_size(jdata->apps) >= 1) {
1371
1372
1373
1374 app = (orte_app_context_t *)
1375 opal_pointer_array_get_item(jdata->apps, 0);
1376 if (NULL != app) {
1377 env = app->env;
1378 for (j = 1; j < opal_pointer_array_get_size(jdata->apps); ++j) {
1379 if (NULL != opal_pointer_array_get_item(jdata->apps, j)) {
1380 env = NULL;
1381 break;
1382 }
1383 }
1384 }
1385 }
1386 }
1387
1388 if (NULL != env) {
1389 size1 = opal_argv_count(env);
1390 for (j = 0; j < size1; ++j) {
1391
1392
1393
1394
1395
1396
1397
1398 char *value, *s = strdup(env[j]);
1399
1400 if (NULL == s) {
1401 return OPAL_ERR_OUT_OF_RESOURCE;
1402 }
1403
1404 value = strchr(s, '=');
1405 if (NULL != value) {
1406 value++;
1407 }
1408 opal_setenv(s, value, true, &environ);
1409 free(s);
1410 }
1411 }
1412
1413
1414
1415 return ORTE_SUCCESS;
1416 }
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440 static int create_app(int argc, char* argv[],
1441 orte_job_t *jdata,
1442 orte_app_context_t **app_ptr,
1443 bool *made_app, char ***app_env)
1444 {
1445 char cwd[OPAL_PATH_MAX];
1446 int i, j, count, rc;
1447 char *param, *value;
1448 orte_app_context_t *app = NULL;
1449 bool found = false;
1450 char *appname = NULL;
1451
1452 *made_app = false;
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464 if (NULL != orte_cmd_options.appfile) {
1465 if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(argc, 0, argv))) {
1466 goto cleanup;
1467 }
1468 }
1469
1470
1471 init_globals();
1472
1473
1474 if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(orte_cmd_line, true, false,
1475 argc, argv)) ) {
1476 if (OPAL_ERR_SILENT != rc) {
1477 fprintf(stderr, "%s: command line error (%s)\n", argv[0],
1478 opal_strerror(rc));
1479 }
1480 return rc;
1481 }
1482
1483
1484 if (NULL != orte_cmd_options.appfile) {
1485 return parse_appfile(jdata, strdup(orte_cmd_options.appfile), app_env);
1486 }
1487
1488
1489 app = OBJ_NEW(orte_app_context_t);
1490 opal_cmd_line_get_tail(orte_cmd_line, &count, &app->argv);
1491
1492
1493 if (0 == count) {
1494 orte_show_help("help-orterun.txt", "orterun:executable-not-specified",
1495 true, orte_basename, orte_basename);
1496 rc = ORTE_ERR_NOT_FOUND;
1497 goto cleanup;
1498 }
1499
1500
1501
1502
1503
1504
1505
1506
1507 if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(argc, count, argv))) {
1508 goto cleanup;
1509 }
1510
1511
1512
1513 app->env = opal_argv_copy(*app_env);
1514 if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orte_cmd_options.path,
1515 orte_cmd_line,
1516 environ, &app->env))) {
1517 goto cleanup;
1518 }
1519
1520
1521
1522
1523 if (NULL != orte_cmd_options.wdir) {
1524
1525 if (opal_path_is_absolute(orte_cmd_options.wdir)) {
1526 app->cwd = strdup(orte_cmd_options.wdir);
1527 } else {
1528
1529 if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
1530 orte_show_help("help-orterun.txt", "orterun:init-failure",
1531 true, "get the cwd", rc);
1532 goto cleanup;
1533 }
1534
1535 app->cwd = opal_os_path(false, cwd, orte_cmd_options.wdir, NULL);
1536 }
1537 orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1538 } else if (orte_cmd_options.set_cwd_to_session_dir) {
1539 orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1540 orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1541 } else {
1542 if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
1543 orte_show_help("help-orterun.txt", "orterun:init-failure",
1544 true, "get the cwd", rc);
1545 goto cleanup;
1546 }
1547 app->cwd = strdup(cwd);
1548 }
1549
1550
1551
1552
1553
1554
1555 if (0 == total_num_apps) {
1556
1557
1558
1559 if (opal_cmd_line_is_taken(orte_cmd_line, "noprefix")) {
1560 want_prefix_by_default = false;
1561 }
1562
1563
1564 if (opal_cmd_line_is_taken(orte_cmd_line, "prefix") || want_prefix_by_default) {
1565 size_t param_len;
1566
1567
1568
1569 if (opal_cmd_line_is_taken(orte_cmd_line, "prefix") &&
1570 NULL != orte_cmd_options.prefix) {
1571
1572 param = strdup(opal_cmd_line_get_param(orte_cmd_line, "prefix", 0, 0));
1573
1574 if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) {
1575 param[strlen(param)-1] = '\0';
1576 }
1577 value = strdup(orte_cmd_options.prefix);
1578 if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) {
1579 value[strlen(value)-1] = '\0';
1580 }
1581 if (0 != strcmp(param, value)) {
1582 orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
1583 true, orte_basename, value, param);
1584
1585
1586
1587 free(param);
1588 param = strdup(orte_cmd_options.prefix);
1589 }
1590 free(value);
1591 } else if (NULL != orte_cmd_options.prefix) {
1592 param = strdup(orte_cmd_options.prefix);
1593 } else if (opal_cmd_line_is_taken(orte_cmd_line, "prefix")){
1594
1595 param = strdup(opal_cmd_line_get_param(orte_cmd_line, "prefix", 0, 0));
1596 } else {
1597
1598 param = strdup(opal_install_dirs.prefix);
1599 }
1600
1601 if (NULL != param) {
1602
1603 param_len = strlen(param);
1604 while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
1605 param[param_len-1] = '\0';
1606 param_len--;
1607 if (0 == param_len) {
1608 orte_show_help("help-orterun.txt", "orterun:empty-prefix",
1609 true, orte_basename, orte_basename);
1610 free(param);
1611 return ORTE_ERR_FATAL;
1612 }
1613 }
1614 orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING);
1615 free(param);
1616 }
1617 }
1618 }
1619
1620
1621 if (NULL != orte_cmd_options.pset) {
1622 orte_set_attribute(&app->attributes, ORTE_APP_PSET_NAME, ORTE_ATTR_GLOBAL,
1623 orte_cmd_options.pset, OPAL_STRING);
1624 }
1625
1626
1627
1628
1629
1630 if (0 < (j = opal_cmd_line_get_ninsts(orte_cmd_line, "hostfile"))) {
1631 if(1 < j) {
1632 orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
1633 true, orte_basename, NULL);
1634 return ORTE_ERR_FATAL;
1635 } else {
1636 value = opal_cmd_line_get_param(orte_cmd_line, "hostfile", 0, 0);
1637 orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_GLOBAL, value, OPAL_STRING);
1638 }
1639 }
1640 if (0 < (j = opal_cmd_line_get_ninsts(orte_cmd_line, "machinefile"))) {
1641 if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
1642 orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
1643 true, orte_basename, NULL);
1644 return ORTE_ERR_FATAL;
1645 } else {
1646 value = opal_cmd_line_get_param(orte_cmd_line, "machinefile", 0, 0);
1647 orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_GLOBAL, value, OPAL_STRING);
1648 }
1649 }
1650
1651
1652 if (0 < (j = opal_cmd_line_get_ninsts(orte_cmd_line, "host"))) {
1653 char **targ=NULL, *tval;
1654 for (i = 0; i < j; ++i) {
1655 value = opal_cmd_line_get_param(orte_cmd_line, "host", i, 0);
1656 opal_argv_append_nosize(&targ, value);
1657 }
1658 tval = opal_argv_join(targ, ',');
1659 orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_GLOBAL, tval, OPAL_STRING);
1660 opal_argv_free(targ);
1661 free(tval);
1662 } else if (NULL != orte_default_dash_host) {
1663 orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL,
1664 orte_default_dash_host, OPAL_STRING);
1665 }
1666
1667
1668 if (0 > orte_cmd_options.num_procs) {
1669 orte_show_help("help-orterun.txt", "orterun:negative-nprocs",
1670 true, orte_basename, app->argv[0],
1671 orte_cmd_options.num_procs, NULL);
1672 return ORTE_ERR_FATAL;
1673 }
1674
1675 app->num_procs = (orte_std_cntr_t)orte_cmd_options.num_procs;
1676 total_num_apps++;
1677
1678
1679
1680
1681
1682
1683 if (NULL == strstr(app->argv[0], "java")) {
1684 if (orte_cmd_options.preload_binaries) {
1685 orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1686 orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1687
1688 orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1689 }
1690 }
1691 if (NULL != orte_cmd_options.preload_files) {
1692 orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_FILES, ORTE_ATTR_GLOBAL,
1693 orte_cmd_options.preload_files, OPAL_STRING);
1694 }
1695
1696
1697
1698
1699
1700
1701 app->app = strdup(app->argv[0]);
1702 if (NULL == app->app) {
1703 orte_show_help("help-orterun.txt", "orterun:call-failed",
1704 true, orte_basename, "library", "strdup returned NULL", errno);
1705 rc = ORTE_ERR_NOT_FOUND;
1706 goto cleanup;
1707 }
1708
1709
1710
1711
1712
1713
1714 appname = opal_basename(app->app);
1715 if (0 == strcmp(appname, "java")) {
1716
1717 found = false;
1718 for (i=1; NULL != app->argv[i]; i++) {
1719 if (NULL != strstr(app->argv[i], "java.library.path")) {
1720 char *dptr;
1721
1722 if (NULL == (dptr = strchr(app->argv[i], '='))) {
1723
1724 rc = ORTE_ERR_BAD_PARAM;
1725 goto cleanup;
1726 }
1727
1728 ++dptr;
1729
1730 found = true;
1731 if (NULL == strstr(app->argv[i], opal_install_dirs.libdir)) {
1732
1733 if (':' == app->argv[i][strlen(app->argv[i]-1)]) {
1734 opal_asprintf(&value, "-Djava.library.path=%s%s", dptr, opal_install_dirs.libdir);
1735 } else {
1736 opal_asprintf(&value, "-Djava.library.path=%s:%s", dptr, opal_install_dirs.libdir);
1737 }
1738 free(app->argv[i]);
1739 app->argv[i] = value;
1740 }
1741 break;
1742 }
1743 }
1744 if (!found) {
1745
1746 opal_asprintf(&value, "-Djava.library.path=%s", opal_install_dirs.libdir);
1747 opal_argv_insert_element(&app->argv, 1, value);
1748 free(value);
1749 }
1750
1751
1752 found = false;
1753 for (i=1; NULL != app->argv[i]; i++) {
1754 if (NULL != strstr(app->argv[i], "cp") ||
1755 NULL != strstr(app->argv[i], "classpath")) {
1756
1757 found = true;
1758
1759 value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL);
1760 if (access(value, F_OK ) != -1) {
1761 set_classpath_jar_file(app, i+1, "mpi.jar");
1762 }
1763 free(value);
1764
1765 value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL);
1766 if (access(value, F_OK ) != -1) {
1767 set_classpath_jar_file(app, i+1, "shmem.jar");
1768 }
1769 free(value);
1770
1771 opal_asprintf(&value, "%s:%s", app->cwd, app->argv[i+1]);
1772 free(app->argv[i+1]);
1773 app->argv[i+1] = value;
1774 break;
1775 }
1776 }
1777 if (!found) {
1778
1779 found = false;
1780 for (i=0; NULL != environ[i]; i++) {
1781 if (0 == strncmp(environ[i], "CLASSPATH", strlen("CLASSPATH"))) {
1782 value = strchr(environ[i], '=');
1783 ++value;
1784 opal_argv_insert_element(&app->argv, 1, value);
1785
1786 value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL);
1787 if (access(value, F_OK ) != -1) {
1788 set_classpath_jar_file(app, 1, "mpi.jar");
1789 }
1790 free(value);
1791
1792 value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL);
1793 if (access(value, F_OK ) != -1) {
1794 set_classpath_jar_file(app, 1, "shmem.jar");
1795 }
1796 free(value);
1797
1798 opal_asprintf(&value, "%s:%s", app->cwd, app->argv[1]);
1799 free(app->argv[1]);
1800 app->argv[1] = value;
1801 opal_argv_insert_element(&app->argv, 1, "-cp");
1802 found = true;
1803 break;
1804 }
1805 }
1806 if (!found) {
1807
1808
1809
1810
1811 char *str, *str2;
1812
1813 str = strdup(app->cwd);
1814
1815 value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL);
1816 if (access(value, F_OK ) != -1) {
1817 opal_asprintf(&str2, "%s:%s", str, value);
1818 free(str);
1819 str = str2;
1820 }
1821 free(value);
1822
1823 value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL);
1824 if (access(value, F_OK ) != -1) {
1825 opal_asprintf(&str2, "%s:%s", str, value);
1826 free(str);
1827 str = str2;
1828 }
1829 free(value);
1830 opal_argv_insert_element(&app->argv, 1, str);
1831 free(str);
1832 opal_argv_insert_element(&app->argv, 1, "-cp");
1833 }
1834 }
1835
1836 for (i=1; i < opal_argv_count(app->argv); i++) {
1837 if (NULL != strstr(app->argv[i], "java.library.path")) {
1838 continue;
1839 } else if (NULL != strstr(app->argv[i], "cp") ||
1840 NULL != strstr(app->argv[i], "classpath")) {
1841
1842 i++;
1843 continue;
1844 }
1845
1846 opal_setenv("OMPI_COMMAND", app->argv[i], true, &app->env);
1847
1848 if ((i+1) < opal_argv_count(app->argv)) {
1849 value = opal_argv_join(&app->argv[i+1], ' ');
1850 opal_setenv("OMPI_ARGV", value, true, &app->env);
1851 free(value);
1852 }
1853 break;
1854 }
1855 } else {
1856
1857 opal_setenv("OMPI_COMMAND", appname, true, &app->env);
1858 if (1 < opal_argv_count(app->argv)) {
1859 value = opal_argv_join(&app->argv[1], ' ');
1860 opal_setenv("OMPI_ARGV", value, true, &app->env);
1861 free(value);
1862 }
1863 }
1864
1865 *app_ptr = app;
1866 app = NULL;
1867 *made_app = true;
1868
1869
1870
1871 cleanup:
1872 if (NULL != app) {
1873 OBJ_RELEASE(app);
1874 }
1875 if (NULL != appname) {
1876 free(appname);
1877 }
1878 return rc;
1879 }
1880
1881 static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile)
1882 {
1883 if (NULL == strstr(app->argv[index], jarfile)) {
1884
1885 char *fmt = ':' == app->argv[index][strlen(app->argv[index]-1)]
1886 ? "%s%s/%s" : "%s:%s/%s";
1887 char *str;
1888 opal_asprintf(&str, fmt, app->argv[index], opal_install_dirs.libdir, jarfile);
1889 free(app->argv[index]);
1890 app->argv[index] = str;
1891 }
1892 }
1893
1894 static int parse_appfile(orte_job_t *jdata, char *filename, char ***env)
1895 {
1896 size_t i, len;
1897 FILE *fp;
1898 char line[BUFSIZ];
1899 int rc, argc, app_num;
1900 char **argv;
1901 orte_app_context_t *app;
1902 bool blank, made_app;
1903 char bogus[] = "bogus ";
1904 char **tmp_env;
1905
1906
1907
1908
1909
1910 if (NULL != orte_cmd_options.appfile) {
1911 free(orte_cmd_options.appfile);
1912 orte_cmd_options.appfile = NULL;
1913 }
1914
1915
1916
1917 fp = fopen(filename, "r");
1918 if (NULL == fp) {
1919 orte_show_help("help-orterun.txt", "orterun:appfile-not-found", true,
1920 filename);
1921 return ORTE_ERR_NOT_FOUND;
1922 }
1923
1924
1925
1926 line[sizeof(line) - 1] = '\0';
1927 app_num = 0;
1928 do {
1929
1930
1931
1932
1933
1934
1935
1936 line[0] = '\0';
1937 strcat(line, bogus);
1938
1939 if (NULL == fgets(line + sizeof(bogus) - 1,
1940 sizeof(line) - sizeof(bogus) - 1, fp)) {
1941 break;
1942 }
1943
1944
1945
1946 len = strlen(line);
1947 if (len > 0 && '\n' == line[len - 1]) {
1948 line[len - 1] = '\0';
1949 if (len > 0) {
1950 --len;
1951 }
1952 }
1953
1954
1955
1956 for (i = 0; i < len; ++i) {
1957 if ('#' == line[i]) {
1958 line[i] = '\0';
1959 break;
1960 } else if (i + 1 < len && '/' == line[i] && '/' == line[i + 1]) {
1961 line[i] = '\0';
1962 break;
1963 }
1964 }
1965
1966
1967
1968 len = strlen(line);
1969 for (blank = true, i = sizeof(bogus); i < len; ++i) {
1970 if (!isspace(line[i])) {
1971 blank = false;
1972 break;
1973 }
1974 }
1975 if (blank) {
1976 continue;
1977 }
1978
1979
1980
1981 argv = opal_argv_split(line, ' ');
1982 argc = opal_argv_count(argv);
1983 if (argc > 0) {
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997 if (NULL != *env) {
1998 tmp_env = opal_argv_copy(*env);
1999 if (NULL == tmp_env) {
2000 fclose(fp);
2001 opal_argv_free(argv);
2002 return ORTE_ERR_OUT_OF_RESOURCE;
2003 }
2004 } else {
2005 tmp_env = NULL;
2006 }
2007
2008 rc = create_app(argc, argv, jdata, &app, &made_app, &tmp_env);
2009 if (ORTE_SUCCESS != rc) {
2010
2011
2012 exit(1);
2013 }
2014 if (NULL != tmp_env) {
2015 opal_argv_free(tmp_env);
2016 }
2017 if (made_app) {
2018 app->idx = app_num;
2019 ++app_num;
2020 opal_pointer_array_add(jdata->apps, app);
2021 ++jdata->num_apps;
2022 }
2023 }
2024 opal_argv_free(argv);
2025 } while (!feof(fp));
2026 fclose(fp);
2027
2028
2029
2030 free(filename);
2031
2032 return ORTE_SUCCESS;
2033 }
2034
2035 static void launch_recv(int status, orte_process_name_t* sender,
2036 opal_buffer_t *buffer,
2037 orte_rml_tag_t tag, void *cbdata)
2038 {
2039 int rc;
2040 int32_t ret;
2041 int32_t cnt;
2042 orte_jobid_t jobid;
2043 orte_app_context_t *app;
2044 orte_proc_t *proc;
2045 orte_node_t *node;
2046 int tool_job_index;
2047 trackr_t *trk;
2048
2049
2050 cnt = 1;
2051 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ret, &cnt, OPAL_INT32))) {
2052 ORTE_ERROR_LOG(rc);
2053 ORTE_UPDATE_EXIT_STATUS(rc);
2054 return;
2055 }
2056
2057 ORTE_UPDATE_EXIT_STATUS(ret);
2058
2059
2060 cnt = 1;
2061 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &cnt, ORTE_JOBID))) {
2062 ORTE_ERROR_LOG(rc);
2063 ORTE_UPDATE_EXIT_STATUS(rc);
2064 return;
2065 }
2066
2067
2068 cnt = 1;
2069 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &tool_job_index, &cnt, OPAL_INT))) {
2070 ORTE_ERROR_LOG(rc);
2071 ORTE_UPDATE_EXIT_STATUS(rc);
2072 return;
2073 }
2074
2075
2076 if (NULL == (trk = (trackr_t*)opal_pointer_array_get_item(&tool_jobs, tool_job_index))) {
2077 opal_output(0, "SPAWN OF TRACKER ID %d RETURNED INDEX TO NULL OBJECT", tool_job_index);
2078 return;
2079 }
2080 trk->jdata->jobid = jobid;
2081
2082 if (ORTE_SUCCESS == ret) {
2083 printf("[ORTE] Task: %d is launched! (Job ID: %s)\n", tool_job_index, ORTE_JOBID_PRINT(jobid));
2084 } else {
2085
2086 cnt = 1;
2087 if (OPAL_SUCCESS == opal_dss.unpack(buffer, &trk->jdata->state, &cnt, ORTE_JOB_STATE_T)) {
2088 cnt = 1;
2089 opal_dss.unpack(buffer, &proc, &cnt, ORTE_PROC);
2090 proc->exit_code = ret;
2091 app = (orte_app_context_t*)opal_pointer_array_get_item(trk->jdata->apps, proc->app_idx);
2092 cnt = 1;
2093 opal_dss.unpack(buffer, &node, &cnt, ORTE_NODE);
2094 orte_print_aborted_job(trk->jdata, app, proc, node);
2095 }
2096 }
2097
2098
2099 if (NULL != trk->launch_cb) {
2100 trk->launch_cb(tool_job_index, trk->jdata, ret, trk->launch_cbdata);
2101 }
2102
2103
2104 if (ORTE_SUCCESS != ret) {
2105 opal_pointer_array_set_item(&tool_jobs, tool_job_index, NULL);
2106 OBJ_RELEASE(trk);
2107 }
2108 }
2109
2110 static void complete_recv(int status, orte_process_name_t* sender,
2111 opal_buffer_t *buffer,
2112 orte_rml_tag_t tag, void *cbdata)
2113 {
2114 int rc, ret;
2115 int32_t cnt;
2116 orte_jobid_t jobid;
2117 orte_app_context_t *app;
2118 orte_proc_t *proc;
2119 orte_node_t *node;
2120 int tool_job_index;
2121 trackr_t *trk;
2122
2123
2124 cnt = 1;
2125 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ret, &cnt, OPAL_INT))) {
2126 ORTE_ERROR_LOG(rc);
2127 ORTE_UPDATE_EXIT_STATUS(rc);
2128 return;
2129 }
2130
2131
2132 cnt = 1;
2133 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &cnt, ORTE_JOBID))) {
2134 ORTE_ERROR_LOG(rc);
2135 ORTE_UPDATE_EXIT_STATUS(rc);
2136 return;
2137 }
2138
2139
2140 cnt = 1;
2141 if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &tool_job_index, &cnt, OPAL_INT))) {
2142 ORTE_ERROR_LOG(rc);
2143 ORTE_UPDATE_EXIT_STATUS(rc);
2144 return;
2145 }
2146
2147
2148 if (NULL == (trk = (trackr_t*)opal_pointer_array_get_item(&tool_jobs, tool_job_index))) {
2149 opal_output(0, "TRACKER ID %d RETURNED INDEX TO NULL OBJECT", tool_job_index);
2150 return;
2151 }
2152
2153 if (ORTE_SUCCESS == ret) {
2154 printf("[ORTE] Task: %d returned: %d (Job ID: %s)\n", tool_job_index, ret, ORTE_JOBID_PRINT(jobid));
2155 } else {
2156
2157 cnt = 1;
2158 opal_dss.unpack(buffer, &trk->jdata->state, &cnt, ORTE_JOB_STATE_T);
2159 cnt = 1;
2160 opal_dss.unpack(buffer, &proc, &cnt, ORTE_PROC);
2161 proc->exit_code = ret;
2162 app = (orte_app_context_t*)opal_pointer_array_get_item(trk->jdata->apps, proc->app_idx);
2163 cnt = 1;
2164 opal_dss.unpack(buffer, &node, &cnt, ORTE_NODE);
2165 orte_print_aborted_job(trk->jdata, app, proc, node);
2166 }
2167
2168
2169 if (NULL != trk && NULL != trk->complete_cb) {
2170 trk->complete_cb(tool_job_index, trk->jdata, ret, trk->complete_cbdata);
2171 }
2172
2173 opal_pointer_array_set_item(&tool_jobs, tool_job_index, NULL);
2174 OBJ_RELEASE(trk);
2175 }
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
/*
 * Print "<expression text> = <value>" for an int-valued expression on
 * stderr.  The macro expands to a single expression with no trailing
 * semicolon, so "DUMP_INT(x);" is one statement and is safe inside an
 * unbraced if/else; the argument is parenthesized where it is evaluated.
 */
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, (X))
/* permission bits used when creating the debugger attach FIFO:
 * owner read/write, group read, others read */
#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)

/* one entry of the process table dumped by orte_debugger_dump() */
struct MPIR_PROCDESC {
    char *host_name;        /* printed as "host" in the dump */
    char *executable_name;  /* printed as "exe" in the dump */
    int pid;                /* printed as "pid" in the dump */
};
2256
2257
2258
2259
2260
2261
2262
2263
2264 static bool mpir_warning_printed = false;
2265
2266 static void orte_debugger_init_before_spawn(orte_job_t *jdata)
2267 {
2268 char *env_name;
2269 orte_app_context_t *app;
2270 int i;
2271 char *attach_fifo;
2272
2273 if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
2274
2275
2276
2277 if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) {
2278 opal_output_verbose(2, orte_debug_output,
2279 "%s Debugger test daemon specified: %s",
2280 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2281 orte_debugger_test_daemon);
2282 goto launchit;
2283 }
2284
2285
2286
2287 if (0 < orte_debugger_check_rate) {
2288 opal_output_verbose(2, orte_debug_output,
2289 "%s Setting debugger attach check rate for %d seconds",
2290 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2291 orte_debugger_check_rate);
2292 ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger, ORTE_SYS_PRI);
2293 } else if (orte_create_session_dirs) {
2294
2295
2296
2297 attach_fifo = opal_os_path(false, orte_process_info.job_session_dir,
2298 "debugger_attach_fifo", NULL);
2299 if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
2300 opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
2301 free(attach_fifo);
2302 return;
2303 }
2304 opal_string_copy(MPIR_attach_fifo, attach_fifo,
2305 MPIR_MAX_PATH_LENGTH);
2306 free(attach_fifo);
2307 open_fifo();
2308 }
2309 return;
2310 }
2311
2312 launchit:
2313 opal_output_verbose(1, orte_debug_output, "Info: Spawned by a debugger");
2314
2315
2316 if (!mpir_warning_printed) {
2317 mpir_warning_printed = true;
2318
2319 if (NULL == getenv("OMPI_MPIR_DO_NOT_WARN")) {
2320 orte_show_help("help-orted.txt", "mpir-debugger-detected", true);
2321 }
2322 }
2323
2324
2325 (void) mca_base_var_env_name ("orte_in_parallel_debugger", &env_name);
2326
2327 for (i=0; i < jdata->apps->size; i++) {
2328 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
2329 continue;
2330 }
2331 opal_setenv(env_name, "1", true, &app->env);
2332 }
2333 free(env_name);
2334
2335
2336 if (orte_create_session_dirs) {
2337
2338
2339
2340 attach_fifo = opal_os_path(false, orte_process_info.job_session_dir,
2341 "debugger_attach_fifo", NULL);
2342 if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
2343 opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
2344 free(attach_fifo);
2345 return;
2346 }
2347 strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1);
2348 free(attach_fifo);
2349 open_fifo();
2350 }
2351 }
2352
2353 static bool mpir_breakpoint_fired = false;
2354
2355 static void _send_notification(int status)
2356 {
2357 opal_buffer_t buf;
2358 orte_grpcomm_signature_t sig;
2359 int rc;
2360 opal_value_t kv, *kvptr;
2361
2362 OBJ_CONSTRUCT(&buf, opal_buffer_t);
2363
2364
2365 if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &status, 1, OPAL_INT))) {
2366 ORTE_ERROR_LOG(rc);
2367 OBJ_DESTRUCT(&buf);
2368 return;
2369 }
2370
2371
2372 if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
2373 ORTE_ERROR_LOG(rc);
2374 OBJ_DESTRUCT(&buf);
2375 return;
2376 }
2377
2378
2379 status = 1;
2380 if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &status, 1, OPAL_INT))) {
2381 ORTE_ERROR_LOG(rc);
2382 OBJ_DESTRUCT(&buf);
2383 return;
2384 }
2385 OBJ_CONSTRUCT(&kv, opal_value_t);
2386 kv.key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
2387 kv.type = OPAL_BOOL;
2388 kv.data.flag = true;
2389 kvptr = &kv;
2390 if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &kvptr, 1, OPAL_VALUE))) {
2391 ORTE_ERROR_LOG(rc);
2392 OBJ_DESTRUCT(&kv);
2393 OBJ_DESTRUCT(&buf);
2394 return;
2395 }
2396 OBJ_DESTRUCT(&kv);
2397
2398
2399 OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
2400 sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
2401 sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
2402 sig.signature[0].vpid = ORTE_VPID_WILDCARD;
2403 sig.sz = 1;
2404
2405 if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, &buf))) {
2406 ORTE_ERROR_LOG(rc);
2407 }
2408 OBJ_DESTRUCT(&sig);
2409 OBJ_DESTRUCT(&buf);
2410 }
2411
2412 static void orte_debugger_dump(void)
2413 {
2414 int i;
2415
2416 DUMP_INT(MPIR_being_debugged);
2417 DUMP_INT(MPIR_debug_state);
2418 DUMP_INT(MPIR_partial_attach_ok);
2419 DUMP_INT(MPIR_i_am_starter);
2420 DUMP_INT(MPIR_forward_output);
2421 DUMP_INT(MPIR_proctable_size);
2422 fprintf(stderr, " MPIR_proctable:\n");
2423 for (i = 0; i < MPIR_proctable_size; i++) {
2424 fprintf(stderr,
2425 " (i, host, exe, pid) = (%d, %s, %s, %d)\n",
2426 i,
2427 MPIR_proctable[i].host_name,
2428 MPIR_proctable[i].executable_name,
2429 MPIR_proctable[i].pid);
2430 }
2431 fprintf(stderr, "MPIR_executable_path: %s\n",
2432 ('\0' == MPIR_executable_path[0]) ?
2433 "NULL" : (char*) MPIR_executable_path);
2434 fprintf(stderr, "MPIR_server_arguments: %s\n",
2435 ('\0' == MPIR_server_arguments[0]) ?
2436 "NULL" : (char*) MPIR_server_arguments);
2437 }
2438
2439 static void setup_debugger_job(orte_jobid_t jobid)
2440 {
2441 orte_job_t *debugger;
2442 orte_app_context_t *app;
2443 int rc;
2444 char cwd[OPAL_PATH_MAX];
2445 bool flag = true;
2446
2447
2448 debugger = OBJ_NEW(orte_job_t);
2449
2450
2451
2452 orte_plm_base_create_jobid(debugger);
2453
2454 opal_argv_append_nosize(&debugger->personality, "orte");
2455
2456 ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_DEBUGGER_DAEMON);
2457
2458 if (!MPIR_forward_output) {
2459 ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_FORWARD_OUTPUT);
2460 }
2461
2462 debugger->stdin_target = ORTE_VPID_INVALID;
2463
2464 opal_hash_table_set_value_uint32(orte_job_data, debugger->jobid, debugger);
2465
2466 app = OBJ_NEW(orte_app_context_t);
2467 if (NULL != orte_debugger_test_daemon) {
2468 app->app = strdup(orte_debugger_test_daemon);
2469 } else {
2470 app->app = strdup((char*)MPIR_executable_path);
2471 }
2472
2473
2474
2475 if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
2476 orte_show_help("help-orterun.txt", "orterun:init-failure",
2477 true, "get the cwd", rc);
2478 return;
2479 }
2480 app->cwd = strdup(cwd);
2481 orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
2482 opal_argv_append_nosize(&app->argv, app->app);
2483 build_debugger_args(app);
2484 opal_pointer_array_add(debugger->apps, app);
2485 debugger->num_apps = 1;
2486
2487 debugger->map = OBJ_NEW(orte_job_map_t);
2488 ORTE_SET_MAPPING_POLICY(debugger->map->mapping, ORTE_MAPPING_PPR);
2489 ORTE_SET_MAPPING_DIRECTIVE(debugger->map->mapping, ORTE_MAPPING_GIVEN);
2490 ORTE_SET_MAPPING_DIRECTIVE(debugger->map->mapping, ORTE_MAPPING_DEBUGGER);
2491
2492 debugger->map->ppr = strdup("1:node");
2493
2494 if (ORTE_SUCCESS != (rc = opal_hwloc_base_set_binding_policy(&debugger->map->binding, "none"))) {
2495 ORTE_ERROR_LOG(rc);
2496 return;
2497 }
2498
2499 rc = orte_plm.spawn(debugger);
2500 if (ORTE_SUCCESS != rc) {
2501 ORTE_ERROR_LOG(rc);
2502 }
2503 }
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514 void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
2515 {
2516 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
2517 orte_job_t *jdata = caddy->jdata;
2518 orte_proc_t *proc;
2519 orte_app_context_t *appctx;
2520 orte_vpid_t i, j;
2521 char **aliases, *aptr;
2522
2523
2524
2525
2526
2527 if (MPIR_proctable || 0 == jdata->num_procs) {
2528
2529
2530 opal_output_verbose(5, orte_debug_output,
2531 "%s: debugger already initialized or zero procs",
2532 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2533
2534 if (MPIR_being_debugged || NULL != orte_debugger_test_daemon ||
2535 NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
2536 OBJ_RELEASE(caddy);
2537
2538 if (!mpir_warning_printed) {
2539 mpir_warning_printed = true;
2540
2541 if (NULL == getenv("OMPI_MPIR_DO_NOT_WARN")) {
2542 orte_show_help("help-orted.txt", "mpir-debugger-detected", true);
2543 }
2544 }
2545 if (!mpir_breakpoint_fired) {
2546
2547 mpir_breakpoint_fired = true;
2548
2549
2550 MPIR_Breakpoint();
2551
2552 opal_output_verbose(5, orte_debug_output,
2553 "%s NOTIFYING DEBUGGER RELEASE",
2554 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2555
2556 _send_notification(OPAL_ERR_DEBUGGER_RELEASE);
2557 }
2558 }
2559 return;
2560 }
2561
2562
2563
2564 opal_output_verbose(5, orte_debug_output,
2565 "%s: Setting up debugger process table for applications",
2566 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2567
2568 MPIR_debug_state = 1;
2569
2570
2571 MPIR_proctable_size = jdata->num_procs;
2572
2573
2574 MPIR_proctable = (struct MPIR_PROCDESC *)malloc(sizeof(struct MPIR_PROCDESC) *
2575 MPIR_proctable_size);
2576 if (MPIR_proctable == NULL) {
2577 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
2578 OBJ_RELEASE(caddy);
2579 return;
2580 }
2581
2582 if (orte_debugger_dump_proctable) {
2583 opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
2584 }
2585
2586
2587 for (j=0; j < jdata->num_procs; j++) {
2588 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
2589 continue;
2590 }
2591
2592
2593
2594 i = proc->name.vpid;
2595 if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
2596 continue;
2597 }
2598
2599
2600 if (orte_retain_aliases) {
2601 aliases = NULL;
2602 aptr = NULL;
2603 if (orte_get_attribute(&proc->node->attributes, ORTE_NODE_ALIAS, (void**)&aptr, OPAL_STRING)) {
2604 aliases = opal_argv_split(aptr, ',');
2605 free(aptr);
2606 if (orte_use_hostname_alias <= opal_argv_count(aliases)) {
2607 MPIR_proctable[i].host_name = strdup(aliases[orte_use_hostname_alias-1]);
2608 }
2609 opal_argv_free(aliases);
2610 }
2611 } else {
2612
2613 MPIR_proctable[i].host_name = strdup(proc->node->name);
2614 }
2615
2616 if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
2617 MPIR_proctable[i].executable_name =
2618 opal_os_path( false, appctx->app, NULL );
2619 } else {
2620 MPIR_proctable[i].executable_name =
2621 opal_os_path( false, appctx->cwd, appctx->app, NULL );
2622 }
2623 MPIR_proctable[i].pid = proc->pid;
2624 if (orte_debugger_dump_proctable) {
2625 opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
2626 ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
2627 MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
2628 }
2629 }
2630
2631 if (0 < opal_output_get_verbosity(orte_debug_output)) {
2632 orte_debugger_dump();
2633 }
2634
2635
2636
2637
2638 if (MPIR_being_debugged || NULL != orte_debugger_test_daemon ||
2639 NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
2640
2641 if (!mpir_warning_printed) {
2642 mpir_warning_printed = true;
2643
2644 if (NULL == getenv("OMPI_MPIR_DO_NOT_WARN")) {
2645 orte_show_help("help-orted.txt", "mpir-debugger-detected", true);
2646 }
2647 }
2648
2649
2650
2651
2652
2653 if ('\0' == MPIR_executable_path[0] && NULL == orte_debugger_test_daemon) {
2654
2655 mpir_breakpoint_fired = true;
2656
2657
2658 MPIR_Breakpoint();
2659
2660 opal_output_verbose(2, orte_debug_output,
2661 "%s NOTIFYING DEBUGGER RELEASE",
2662 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2663
2664 _send_notification(OPAL_ERR_DEBUGGER_RELEASE);
2665 } else if (!orte_debugger_test_attach) {
2666
2667
2668
2669
2670 opal_output_verbose(2, orte_debug_output,
2671 "%s Cospawning debugger daemons %s",
2672 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2673 (NULL == orte_debugger_test_daemon) ?
2674 MPIR_executable_path : orte_debugger_test_daemon);
2675 setup_debugger_job(jdata->jobid);
2676 }
2677
2678 OBJ_RELEASE(caddy);
2679 return;
2680 }
2681
2682
2683 OBJ_RELEASE(caddy);
2684 }
2685
2686
2687
2688
2689
2690
2691 static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line,
2692 int argc, char **argv, char ***new_argv, int num_procs)
2693 {
2694 int ret = ORTE_SUCCESS;
2695 int i, j, count;
2696 char *line = NULL, *tmp = NULL, *full_line = strdup(orig_line);
2697 char **orterun_argv = NULL, **executable_argv = NULL, **line_argv = NULL;
2698 char cwd[OPAL_PATH_MAX];
2699 bool used_num_procs = false;
2700 bool single_app = false;
2701 bool fail_needed_executable = false;
2702
2703 line = full_line;
2704 if (NULL == line) {
2705 ret = ORTE_ERR_OUT_OF_RESOURCE;
2706 goto out;
2707 }
2708
2709
2710
2711 for (i = 0; '\0' != line[i] && isspace(line[i]); ++line) {
2712 continue;
2713 }
2714 for (i = strlen(line) - 2; i > 0 && isspace(line[i]); ++i) {
2715 line[i] = '\0';
2716 }
2717 if (strlen(line) <= 0) {
2718 ret = ORTE_ERROR;
2719 goto out;
2720 }
2721
2722
2723
2724
2725 opal_cmd_line_get_tail(cmd_line, &i, &executable_argv);
2726
2727
2728
2729
2730
2731 orterun_argv = opal_argv_copy(argv);
2732 count = opal_argv_count(orterun_argv);
2733 opal_argv_delete(&count, &orterun_argv, 0, 1);
2734 for (i = 0; NULL != orterun_argv[i]; ++i) {
2735 count = opal_argv_count(orterun_argv);
2736 if (0 == strcmp(orterun_argv[i], "-debug") ||
2737 0 == strcmp(orterun_argv[i], "--debug")) {
2738 opal_argv_delete(&count, &orterun_argv, i, 1);
2739 } else if (0 == strcmp(orterun_argv[i], "-tv") ||
2740 0 == strcmp(orterun_argv[i], "--tv")) {
2741 opal_argv_delete(&count, &orterun_argv, i, 1);
2742 } else if (0 == strcmp(orterun_argv[i], "--debugger") ||
2743 0 == strcmp(orterun_argv[i], "-debugger")) {
2744 opal_argv_delete(&count, &orterun_argv, i, 2);
2745 }
2746 }
2747
2748
2749
2750
2751 *new_argv = NULL;
2752 line_argv = opal_argv_split(line, ' ');
2753 if (NULL == line_argv) {
2754 ret = ORTE_ERR_NOT_FOUND;
2755 goto out;
2756 }
2757 for (i = 0; NULL != line_argv[i]; ++i) {
2758 if (0 == strcmp(line_argv[i], "@mpirun@") ||
2759 0 == strcmp(line_argv[i], "@orterun@")) {
2760 opal_argv_append_nosize(new_argv, argv[0]);
2761 } else if (0 == strcmp(line_argv[i], "@mpirun_args@") ||
2762 0 == strcmp(line_argv[i], "@orterun_args@")) {
2763 for (j = 0; NULL != orterun_argv && NULL != orterun_argv[j]; ++j) {
2764 opal_argv_append_nosize(new_argv, orterun_argv[j]);
2765 }
2766 } else if (0 == strcmp(line_argv[i], "@np@")) {
2767 used_num_procs = true;
2768 opal_asprintf(&tmp, "%d", num_procs);
2769 opal_argv_append_nosize(new_argv, tmp);
2770 free(tmp);
2771 } else if (0 == strcmp(line_argv[i], "@single_app@")) {
2772
2773
2774 single_app = true;
2775 } else if (0 == strcmp(line_argv[i], "@executable@")) {
2776
2777
2778 if (NULL != executable_argv) {
2779 opal_argv_append_nosize(new_argv, executable_argv[0]);
2780 } else {
2781 fail_needed_executable = true;
2782 }
2783 } else if (0 == strcmp(line_argv[i], "@executable_argv@")) {
2784
2785
2786 if (NULL != executable_argv) {
2787 for (j = 1; NULL != executable_argv[j]; ++j) {
2788 opal_argv_append_nosize(new_argv, executable_argv[j]);
2789 }
2790 } else {
2791 fail_needed_executable = true;
2792 }
2793 } else {
2794
2795 opal_argv_append_nosize(new_argv, line_argv[i]);
2796 }
2797 }
2798
2799
2800
2801 getcwd(cwd, OPAL_PATH_MAX);
2802 tmp = opal_path_findv((*new_argv)[0], X_OK, environ, cwd);
2803 if (NULL != tmp) {
2804 free(tmp);
2805
2806
2807
2808 tmp = opal_argv_join(argv, ' ');
2809
2810
2811
2812
2813 if (used_num_procs && 0 == num_procs) {
2814 free(tmp);
2815 tmp = opal_argv_join(orterun_argv, ' ');
2816 orte_show_help("help-orterun.txt", "debugger requires -np",
2817 true, (*new_argv)[0], argv[0], tmp,
2818 (*new_argv)[0]);
2819
2820 }
2821
2822
2823 else if (single_app && NULL != strstr(tmp, " : ")) {
2824 orte_show_help("help-orterun.txt",
2825 "debugger only accepts single app", true,
2826 (*new_argv)[0], (*new_argv)[0]);
2827
2828 }
2829
2830
2831
2832
2833 else if (fail_needed_executable) {
2834 orte_show_help("help-orterun.txt",
2835 "debugger requires executable", true,
2836 (*new_argv)[0], argv[0], (*new_argv)[0], argv[0],
2837 (*new_argv)[0]);
2838
2839 }
2840
2841
2842 else {
2843 goto out;
2844 }
2845 }
2846
2847
2848
2849 opal_argv_free(*new_argv);
2850 *new_argv = NULL;
2851 ret = ORTE_ERR_NOT_FOUND;
2852
2853 out:
2854 if (NULL != orterun_argv) {
2855 opal_argv_free(orterun_argv);
2856 }
2857 if (NULL != executable_argv) {
2858 opal_argv_free(executable_argv);
2859 }
2860 if (NULL != line_argv) {
2861 opal_argv_free(line_argv);
2862 }
2863 if (NULL != tmp) {
2864 free(tmp);
2865 }
2866 if (NULL != full_line) {
2867 free(full_line);
2868 }
2869 return ret;
2870 }
2871
2872 static void open_fifo(void)
2873 {
2874 if (orte_debugger_attach_fd > 0) {
2875 close(orte_debugger_attach_fd);
2876 }
2877
2878 orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
2879 if (orte_debugger_attach_fd < 0) {
2880 opal_output(0, "%s unable to open debugger attach fifo",
2881 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2882 return;
2883 }
2884
2885
2886 if (opal_fd_set_cloexec(orte_debugger_attach_fd) != OPAL_SUCCESS) {
2887 opal_output(0, "%s unable to set debugger attach fifo to CLOEXEC",
2888 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2889 close(orte_debugger_attach_fd);
2890 orte_debugger_attach_fd = -1;
2891 return;
2892 }
2893
2894 if (orte_debugger_test_attach) {
2895 opal_output(0, "%s Monitoring debugger attach fifo %s",
2896 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2897 MPIR_attach_fifo);
2898 } else {
2899 opal_output_verbose(2, orte_debug_output,
2900 "%s Monitoring debugger attach fifo %s",
2901 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2902 MPIR_attach_fifo);
2903 }
2904 orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t));
2905 opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd,
2906 OPAL_EV_READ, attach_debugger, orte_debugger_attach);
2907
2908 orte_debugger_fifo_active = true;
2909 opal_event_add(orte_debugger_attach, 0);
2910 }
2911
2912 static bool did_once = false;
2913
2914 static void attach_debugger(int fd, short event, void *arg)
2915 {
2916 unsigned char fifo_cmd;
2917 int rc;
2918 orte_timer_t *tm;
2919
2920 if (orte_debugger_fifo_active) {
2921 orte_debugger_attach = (opal_event_t*)arg;
2922 orte_debugger_fifo_active = false;
2923
2924 rc = read(orte_debugger_attach_fd, &fifo_cmd, sizeof(fifo_cmd));
2925 if (!rc) {
2926
2927 opal_event_free(orte_debugger_attach);
2928
2929 open_fifo();
2930 return;
2931 }
2932 if (1 != fifo_cmd) {
2933
2934 orte_debugger_fifo_active = true;
2935 opal_event_add(orte_debugger_attach, 0);
2936 return;
2937 }
2938 }
2939
2940 if (!MPIR_being_debugged && !orte_debugger_test_attach) {
2941
2942 if (0 == orte_debugger_check_rate) {
2943 orte_debugger_fifo_active = true;
2944 opal_event_add(orte_debugger_attach, 0);
2945 } else if (!MPIR_being_debugged) {
2946 tm = (orte_timer_t*)arg;
2947
2948 opal_event_evtimer_add(tm->ev, &tm->tv);
2949 }
2950 return;
2951 }
2952
2953 opal_output_verbose(1, orte_debug_output,
2954 "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2955 (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);
2956
2957
2958 if (!mpir_warning_printed) {
2959 mpir_warning_printed = true;
2960
2961 if (NULL == getenv("OMPI_MPIR_DO_NOT_WARN")) {
2962 orte_show_help("help-orted.txt", "mpir-debugger-detected", true);
2963 }
2964 }
2965
2966
2967
2968
2969
2970 if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
2971 opal_output_verbose(2, orte_debug_output,
2972 "%s Spawning debugger daemons %s",
2973 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2974 (NULL == orte_debugger_test_daemon) ?
2975 MPIR_executable_path : orte_debugger_test_daemon);
2976 setup_debugger_job(ORTE_JOBID_WILDCARD);
2977 did_once = true;
2978 }
2979
2980
2981 if (NULL != orte_debugger_test_daemon && did_once) {
2982 return;
2983 }
2984
2985
2986 if (0 == orte_debugger_check_rate) {
2987 orte_debugger_fifo_active = true;
2988 opal_event_add(orte_debugger_attach, 0);
2989 } else if (!MPIR_being_debugged) {
2990 tm = (orte_timer_t*)arg;
2991
2992 opal_event_evtimer_add(tm->ev, &tm->tv);
2993 }
2994 }
2995
2996 static void build_debugger_args(orte_app_context_t *debugger)
2997 {
2998 int i, j;
2999 char mpir_arg[MPIR_MAX_ARG_LENGTH];
3000
3001 if ('\0' != MPIR_server_arguments[0]) {
3002 j=0;
3003 memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH);
3004 for (i=0; i < MPIR_MAX_ARG_LENGTH; i++) {
3005 if (MPIR_server_arguments[i] == '\0') {
3006 if (0 < j) {
3007 opal_argv_append_nosize(&debugger->argv, mpir_arg);
3008 memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH);
3009 j=0;
3010 }
3011 } else {
3012 mpir_arg[j] = MPIR_server_arguments[i];
3013 j++;
3014 }
3015 }
3016 }
3017 }
3018
3019
3020
3021
3022 static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
3023 int argc, char *argv[], int num_procs)
3024 {
3025 int i, id, ret;
3026 char **new_argv = NULL;
3027 const char **tmp = NULL;
3028 char *value, **lines, *env_name;
3029
3030
3031
3032
3033 id = mca_base_var_find("orte", "orte", NULL, "base_user_debugger");
3034 if (id < 0) {
3035 orte_show_help("help-orterun.txt", "debugger-mca-param-not-found",
3036 true);
3037 exit(1);
3038 }
3039
3040 ret = mca_base_var_get_value (id, &tmp, NULL, NULL);
3041 if (OPAL_SUCCESS != ret || NULL == tmp || NULL == tmp[0]) {
3042 orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty",
3043 true);
3044 exit(1);
3045 }
3046
3047
3048
3049 lines = opal_argv_split(tmp[0], ':');
3050 for (i = 0; NULL != lines[i]; ++i) {
3051 if (ORTE_SUCCESS == process(lines[i], basename, cmd_line, argc, argv,
3052 &new_argv, num_procs)) {
3053 break;
3054 }
3055 }
3056
3057
3058
3059 if (NULL == lines[i]) {
3060 orte_show_help("help-orterun.txt", "debugger-not-found", true);
3061 exit(1);
3062 }
3063 opal_argv_free(lines);
3064
3065
3066
3067
3068 memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH);
3069 memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH);
3070
3071
3072
3073
3074
3075 ret = mca_base_var_env_name ("orte_in_parallel_debugger", &env_name);
3076 if (OPAL_SUCCESS == ret && NULL != env_name) {
3077 opal_setenv(env_name, "1", true, &environ);
3078 free(env_name);
3079 }
3080
3081
3082 if (!mpir_warning_printed) {
3083 mpir_warning_printed = true;
3084
3085 if (NULL == getenv("OMPI_MPIR_DO_NOT_WARN")) {
3086 orte_show_help("help-orted.txt", "mpir-debugger-detected", true);
3087 }
3088 }
3089
3090
3091 execvp(new_argv[0], new_argv);
3092 value = opal_argv_join(new_argv, ' ');
3093 orte_show_help("help-orterun.txt", "debugger-exec-failed",
3094 true, basename, value, new_argv[0]);
3095 free(value);
3096 opal_argv_free(new_argv);
3097 exit(1);
3098 }
3099
3100 void orte_debugger_detached(int fd, short event, void *cbdata)
3101 {
3102 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
3103 OBJ_RELEASE(caddy);
3104
3105
3106 mpir_breakpoint_fired = false;
3107 }
3108
3109 static uint32_t ntraces = 0;
3110 static orte_timer_t stack_trace_timer;
3111
3112 static void stack_trace_recv(int status, orte_process_name_t* sender,
3113 opal_buffer_t *buffer, orte_rml_tag_t tag,
3114 void* cbdata)
3115 {
3116 opal_buffer_t *blob;
3117 char *st;
3118 int32_t cnt;
3119 orte_process_name_t name;
3120 char *hostname;
3121 pid_t pid;
3122
3123
3124 cnt = 1;
3125 while (OPAL_SUCCESS == opal_dss.unpack(buffer, &blob, &cnt, OPAL_BUFFER)) {
3126
3127 cnt = 1;
3128 if (OPAL_SUCCESS != opal_dss.unpack(blob, &name, &cnt, ORTE_NAME) ||
3129 OPAL_SUCCESS != opal_dss.unpack(blob, &hostname, &cnt, OPAL_STRING) ||
3130 OPAL_SUCCESS != opal_dss.unpack(blob, &pid, &cnt, OPAL_PID)) {
3131 OBJ_RELEASE(blob);
3132 continue;
3133 }
3134 fprintf(stderr, "STACK TRACE FOR PROC %s (%s, PID %lu)\n", ORTE_NAME_PRINT(&name), hostname, (unsigned long) pid);
3135 free(hostname);
3136
3137 cnt = 1;
3138 while (OPAL_SUCCESS == opal_dss.unpack(blob, &st, &cnt, OPAL_STRING)) {
3139 fprintf(stderr, "\t%s", st);
3140 free(st);
3141 cnt = 1;
3142 }
3143 fprintf(stderr, "\n");
3144 OBJ_RELEASE(blob);
3145 cnt = 1;
3146 }
3147 ++ntraces;
3148 if (orte_process_info.num_procs == ntraces) {
3149 if( orte_stack_trace_wait_timeout > 0 ) {
3150
3151 OBJ_DESTRUCT(&stack_trace_timer);
3152 }
3153
3154 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
3155
3156 orte_abnormal_term_ordered = true;
3157 }
3158 }
3159
3160 static void stack_trace_timeout(int sd, short args, void *cbdata)
3161 {
3162
3163 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
3164
3165 orte_abnormal_term_ordered = true;
3166 }
3167
/*
 * Timer callback that reports a timeout and terminates the run.
 * (Presumably armed for the user-specified job time limit - confirm at
 * the site that registers it.)
 *
 * Steps, in order:
 *  1. show the "orterun:timeout" help message with timeout_seconds and
 *     set the exit status to ETIMEDOUT;
 *  2. if this process is the HNP and ORTE_TEST_HNP_SUICIDE is in the
 *     environment, exit(1) immediately without cleanup;
 *  3. if report_state_on_timeout is set, print every job in
 *     orte_job_data and each of its procs to stderr;
 *  4. if get_stack_traces is set, post a persistent receive for
 *     ORTE_RML_TAG_STACK_TRACE, xcast the GET_STACK_TRACES daemon
 *     command, optionally arm stack_trace_timer, and return - the job is
 *     then terminated from stack_trace_recv() or stack_trace_timeout();
 *  5. otherwise (or if step 4 fails) activate ALL_JOBS_COMPLETE and set
 *     orte_abnormal_term_ordered.
 *
 * @param sd      unused
 * @param args    unused
 * @param cbdata  unused
 */
void orte_timeout_wakeup(int sd, short args, void *cbdata)
{
    orte_job_t *jdata;
    orte_proc_t *proc;
    int i;
    int rc;
    uint32_t key;
    void *nptr;

    /* tell the user that the time limit was hit and record the
     * corresponding exit status */
    orte_show_help("help-orterun.txt", "orterun:timeout",
                   true, timeout_seconds);
    ORTE_UPDATE_EXIT_STATUS(ETIMEDOUT);

    /* test hook: the HNP exits right away, skipping all cleanup */
    if (ORTE_PROC_IS_HNP &&
        NULL != getenv("ORTE_TEST_HNP_SUICIDE")) {
        opal_output(0, "HNP exiting w/o cleanup");
        exit(1);
    }
    if (orte_cmd_options.report_state_on_timeout) {
        /* cycle across all the jobs and report their state */
        rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr);
        while (OPAL_SUCCESS == rc) {
            /* plain fprintf to stderr is used for the whole report */
            fprintf(stderr, "DATA FOR JOB: %s\n", ORTE_JOBID_PRINT(jdata->jobid));
            fprintf(stderr, "\tNum apps: %d\tNum procs: %d\tJobState: %s\tAbort: %s\n",
                    (int)jdata->num_apps, (int)jdata->num_procs,
                    orte_job_state_to_str(jdata->state),
                    (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) ? "True" : "False");
            fprintf(stderr, "\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld\n",
                    (long)jdata->num_launched, (long)jdata->num_reported, (long)jdata->num_terminated);
            fprintf(stderr, "\n\tProcs:\n");
            /* walk the whole pointer array; empty slots are skipped */
            for (i=0; i < jdata->procs->size; i++) {
                if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
                    fprintf(stderr, "\t\tRank: %s\tNode: %s\tPID: %u\tState: %s\tExitCode %d\n",
                            ORTE_VPID_PRINT(proc->name.vpid),
                            (NULL == proc->node) ? "UNKNOWN" : proc->node->name,
                            (unsigned int)proc->pid,
                            orte_proc_state_to_str(proc->state), proc->exit_code);
                }
            }
            fprintf(stderr, "\n");
            rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr);
        }
    }

    /* if stack traces were requested, try to collect them before
     * terminating */
    if (orte_cmd_options.get_stack_traces) {
        orte_daemon_cmd_flag_t command = ORTE_DAEMON_GET_STACK_TRACES;
        opal_buffer_t *buffer;
        orte_grpcomm_signature_t *sig;

        fprintf(stderr, "Waiting for stack traces (this may take a few moments)...\n");

        /* set the recv */
        orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_STACK_TRACE,
                                ORTE_RML_PERSISTENT, stack_trace_recv, NULL);

        /* setup the buffer */
        buffer = OBJ_NEW(opal_buffer_t);
        /* pack the command */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            goto giveup;
        }
        /* address the xcast to every vpid of our own jobid */
        sig = OBJ_NEW(orte_grpcomm_signature_t);
        sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
        sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
        sig->signature[0].vpid = ORTE_VPID_WILDCARD;
        sig->sz = 1;
        if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buffer))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            OBJ_RELEASE(sig);
            goto giveup;
        }
        OBJ_RELEASE(buffer);
        /* maintain accounting */
        OBJ_RELEASE(sig);

        /* termination happens once the traces have arrived; arm a
         * timeout (when configured) in case they never do */
        if( orte_stack_trace_wait_timeout > 0 ) {
            OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
            opal_event_evtimer_set(orte_event_base,
                                   stack_trace_timer.ev, stack_trace_timeout, NULL);
            opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
            stack_trace_timer.tv.tv_sec = orte_stack_trace_wait_timeout;
            opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
        }
        return;
    }
  giveup:
    /* abort the job */
    ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
    /* set the global abnormal exit flag */
    orte_abnormal_term_ordered = true;
}
3269
3270 static int nreports = 0;
3271 static orte_timer_t profile_timer;
3272 static int nchecks = 0;
3273
3274 static void profile_timeout(int sd, short args, void *cbdata)
3275 {
3276
3277 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
3278
3279 orte_abnormal_term_ordered = true;
3280 }
3281
3282
3283 static void profile_recv(int status, orte_process_name_t* sender,
3284 opal_buffer_t *buffer, orte_rml_tag_t tag,
3285 void* cbdata)
3286 {
3287 int32_t cnt;
3288 char *hostname;
3289 float dpss, pss;
3290
3291
3292 cnt = 1;
3293 if (OPAL_SUCCESS != opal_dss.unpack(buffer, &hostname, &cnt, OPAL_STRING)) {
3294 goto done;
3295 }
3296
3297 fprintf(stderr, "Memory profile from host: %s\n", hostname);
3298 free(hostname);
3299
3300
3301 cnt = 1;
3302 if (OPAL_SUCCESS != opal_dss.unpack(buffer, &dpss, &cnt, OPAL_FLOAT)) {
3303 goto done;
3304 }
3305
3306 cnt = 1;
3307 if (OPAL_SUCCESS != opal_dss.unpack(buffer, &pss, &cnt, OPAL_FLOAT)) {
3308 goto done;
3309 }
3310
3311 fprintf(stderr, "\tDaemon: %8.2fM\tProcs: %8.2fM\n", dpss, pss);
3312
3313 done:
3314 --nreports;
3315 if (nreports == 0) {
3316 ++nchecks;
3317
3318 OBJ_DESTRUCT(&profile_timer);
3319
3320 _send_notification(12345);
3321
3322
3323 if (2 > nchecks) {
3324
3325 opal_event_evtimer_set(orte_event_base, orte_memprofile_timeout->ev,
3326 orte_profile_wakeup, NULL);
3327 opal_event_set_priority(orte_memprofile_timeout->ev, ORTE_ERROR_PRI);
3328 opal_event_evtimer_add(orte_memprofile_timeout->ev, &orte_memprofile_timeout->tv);
3329
3330 OBJ_CONSTRUCT(&profile_timer, orte_timer_t);
3331 opal_event_evtimer_set(orte_event_base,
3332 profile_timer.ev, profile_timeout, NULL);
3333 opal_event_set_priority(profile_timer.ev, ORTE_ERROR_PRI);
3334 profile_timer.tv.tv_sec = 30;
3335 opal_event_evtimer_add(profile_timer.ev, &profile_timer.tv);
3336 return;
3337 }
3338 }
3339 }
3340
3341 void orte_profile_wakeup(int sd, short args, void *cbdata)
3342 {
3343 orte_job_t *dmns;
3344 orte_proc_t *dmn;
3345 int i;
3346 int rc;
3347 orte_daemon_cmd_flag_t command = ORTE_DAEMON_GET_MEMPROFILE;
3348 opal_buffer_t *buffer;
3349 orte_process_name_t name;
3350
3351
3352
3353
3354
3355
3356
3357 nreports = 1;
3358
3359
3360 buffer = OBJ_NEW(opal_buffer_t);
3361
3362 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
3363 ORTE_ERROR_LOG(rc);
3364 OBJ_RELEASE(buffer);
3365 goto giveup;
3366 }
3367
3368 dmns = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
3369 if (NULL != (dmn = (orte_proc_t*)opal_pointer_array_get_item(dmns->procs, 1))) {
3370 ++nreports;
3371 }
3372
3373
3374 name.jobid = ORTE_PROC_MY_NAME->jobid;
3375 for (i=0; i < nreports; i++) {
3376 OBJ_RETAIN(buffer);
3377 name.vpid = i;
3378 if (0 > (rc = orte_rml.send_buffer_nb(&name, buffer,
3379 ORTE_RML_TAG_DAEMON,
3380 orte_rml_send_callback, NULL))) {
3381 ORTE_ERROR_LOG(rc);
3382 OBJ_RELEASE(buffer);
3383 }
3384 }
3385 OBJ_RELEASE(buffer);
3386
3387
3388
3389 OBJ_CONSTRUCT(&profile_timer, orte_timer_t);
3390 opal_event_evtimer_set(orte_event_base,
3391 profile_timer.ev, profile_timeout, NULL);
3392 opal_event_set_priority(profile_timer.ev, ORTE_ERROR_PRI);
3393 profile_timer.tv.tv_sec = 30;
3394 opal_event_evtimer_add(profile_timer.ev, &profile_timer.tv);
3395 return;
3396
3397 giveup:
3398
3399 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
3400 }