This source file includes the following definitions.
- setup_sighandler
- orte_ess_base_orted_setup
- orte_ess_base_orted_finalize
- shutdown_signal
- epipe_signal_callback
- signal_forward_callback
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 #include "orte_config.h"
27 #include "orte/constants.h"
28
29 #include <sys/types.h>
30 #include <stdio.h>
31 #ifdef HAVE_FCNTL_H
32 #include <fcntl.h>
33 #endif
34 #ifdef HAVE_UNISTD_H
35 #include <unistd.h>
36 #endif
37
38 #include "opal/dss/dss.h"
39 #include "opal/mca/event/event.h"
40 #include "opal/runtime/opal.h"
41 #include "opal/mca/hwloc/base/base.h"
42 #include "opal/mca/pmix/base/base.h"
43 #include "opal/mca/pstat/base/base.h"
44 #include "opal/util/arch.h"
45 #include "opal/util/opal_environ.h"
46 #include "opal/util/os_path.h"
47 #include "opal/util/proc.h"
48
49 #include "orte/mca/rtc/base/base.h"
50 #include "orte/mca/rml/base/base.h"
51 #include "orte/mca/rml/base/rml_contact.h"
52 #include "orte/mca/routed/base/base.h"
53 #include "orte/mca/routed/routed.h"
54 #include "orte/mca/oob/base/base.h"
55 #include "orte/mca/grpcomm/grpcomm.h"
56 #include "orte/mca/grpcomm/base/base.h"
57 #include "orte/mca/iof/base/base.h"
58 #include "orte/mca/plm/base/base.h"
59 #include "orte/mca/odls/base/base.h"
60 #include "orte/mca/errmgr/errmgr.h"
61 #include "orte/mca/rmaps/base/base.h"
62 #include "orte/mca/filem/base/base.h"
63 #include "orte/util/proc_info.h"
64 #include "orte/util/session_dir.h"
65 #include "orte/util/name_fns.h"
66 #include "orte/util/show_help.h"
67 #include "orte/mca/errmgr/base/base.h"
68 #include "orte/mca/state/base/base.h"
69 #include "orte/mca/state/state.h"
70 #include "orte/runtime/orte_wait.h"
71 #include "orte/runtime/orte_globals.h"
72 #include "orte/runtime/orte_quit.h"
73 #include "orte/orted/pmix/pmix_server.h"
74
75 #include "orte/mca/ess/base/base.h"
76
77
78 static bool plm_in_use=false;
79 static bool signals_set=false;
80 static opal_event_t term_handler;
81 static opal_event_t int_handler;
82 static opal_event_t epipe_handler;
83 static char *log_path = NULL;
84 static void shutdown_signal(int fd, short flags, void *arg);
85 static void epipe_signal_callback(int fd, short flags, void *arg);
86 static void signal_forward_callback(int fd, short event, void *arg);
87 static opal_event_t *forward_signals_events = NULL;
88
89 static void setup_sighandler(int signal, opal_event_t *ev,
90 opal_event_cbfunc_t cbfunc)
91 {
92 opal_event_signal_set(orte_event_base, ev, signal, cbfunc, ev);
93 opal_event_set_priority(ev, ORTE_ERROR_PRI);
94 opal_event_signal_add(ev, NULL);
95 }
96
97
98 int orte_ess_base_orted_setup(void)
99 {
100 int ret = ORTE_ERROR;
101 int fd;
102 char log_file[PATH_MAX];
103 char *jobidstring;
104 char *error = NULL;
105 orte_job_t *jdata;
106 orte_proc_t *proc;
107 orte_app_context_t *app;
108 char *param;
109 hwloc_obj_t obj;
110 unsigned i, j;
111 orte_topology_t *t;
112 orte_ess_base_signal_t *sig;
113 int idx;
114
115
116 orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
117 orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename);
118 orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
119 orte_process_info.super.proc_arch = opal_local_arch;
120 opal_proc_local_set(&orte_process_info.super);
121
122 plm_in_use = false;
123
124
125 setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
126
127
128
129 setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
130 setup_sighandler(SIGINT, &int_handler, shutdown_signal);
131
132 if (0 < (idx = opal_list_get_size(&orte_ess_base_signals))) {
133 forward_signals_events = (opal_event_t*)malloc(sizeof(opal_event_t) * idx);
134 if (NULL == forward_signals_events) {
135 ret = ORTE_ERR_OUT_OF_RESOURCE;
136 error = "unable to malloc";
137 goto error;
138 }
139 idx = 0;
140 OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) {
141 setup_sighandler(sig->signal, forward_signals_events + idx, signal_forward_callback);
142 ++idx;
143 }
144 }
145 signals_set = true;
146
147
148
149 if (NULL == opal_hwloc_topology) {
150 if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
151 error = "topology discovery";
152 goto error;
153 }
154 }
155
156 orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology);
157
158
159
160
161
162 obj = hwloc_get_root_obj(opal_hwloc_topology);
163 for (i=0; i < obj->infos_count; i++) {
164 if (NULL == obj->infos[i].name ||
165 NULL == obj->infos[i].value) {
166 continue;
167 }
168 if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
169 free(obj->infos[i].name);
170 free(obj->infos[i].value);
171
172 for (j=i; j < obj->infos_count-1; j++) {
173 obj->infos[j] = obj->infos[j+1];
174 }
175 obj->infos[obj->infos_count-1].name = NULL;
176 obj->infos[obj->infos_count-1].value = NULL;
177 obj->infos_count--;
178 break;
179 }
180 }
181 if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
182 opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
183 opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
184 }
185
186
187
188
189 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
190 ORTE_ERROR_LOG(ret);
191 error = "opal_pstat_base_open";
192 goto error;
193 }
194 if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
195 ORTE_ERROR_LOG(ret);
196 error = "opal_pstat_base_select";
197 goto error;
198 }
199
200
201 ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
202 ORTE_PROC_MY_HNP->vpid = 0;
203
204
205 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
206 ORTE_ERROR_LOG(ret);
207 error = "orte_state_base_open";
208 goto error;
209 }
210 if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
211 ORTE_ERROR_LOG(ret);
212 error = "orte_state_base_select";
213 goto error;
214 }
215
216 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
217 ORTE_ERROR_LOG(ret);
218 error = "orte_errmgr_base_open";
219 goto error;
220 }
221
222
223
224
225 (void) mca_base_var_env_name("plm", ¶m);
226 plm_in_use = !!(getenv(param));
227 free (param);
228 if (plm_in_use) {
229 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
230 ORTE_ERROR_LOG(ret);
231 error = "orte_plm_base_open";
232 goto error;
233 }
234 if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
235 ORTE_ERROR_LOG(ret);
236 error = "orte_plm_base_select";
237 goto error;
238 }
239 }
240
241 if (orte_create_session_dirs) {
242 OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
243 "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
244 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
245 (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
246 orte_process_info.nodename));
247
248
249
250
251 if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
252 ORTE_ERROR_LOG(ret);
253 error = "orte_session_dir define";
254 goto error;
255 }
256
257
258
259 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
260
261 if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
262 ORTE_ERROR_LOG(ret);
263 error = "orte_session_dir";
264 goto error;
265 }
266
267
268 opal_output_set_output_file_info(orte_process_info.proc_session_dir,
269 "output-", NULL, NULL);
270
271 if (orte_debug_daemons_file_flag) {
272
273
274
275
276 if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
277 ORTE_PROC_MY_NAME->jobid))) {
278 ORTE_ERROR_LOG(ret);
279 error = "convert_jobid";
280 goto error;
281 }
282
283 snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
284 jobidstring, orte_process_info.nodename);
285 log_path = opal_os_path(false, orte_process_info.top_session_dir,
286 log_file, NULL);
287
288 fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
289 if (fd < 0) {
290
291
292
293 fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
294 } else {
295 dup2(fd, STDOUT_FILENO);
296 dup2(fd, STDERR_FILENO);
297 if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
298 close(fd);
299 }
300 }
301 }
302 }
303
304 orte_job_data = OBJ_NEW(opal_hash_table_t);
305 if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) {
306 ORTE_ERROR_LOG(ret);
307 error = "setup job array";
308 goto error;
309 }
310 orte_node_pool = OBJ_NEW(opal_pointer_array_t);
311 if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
312 ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
313 ORTE_GLOBAL_ARRAY_MAX_SIZE,
314 ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
315 ORTE_ERROR_LOG(ret);
316 error = "setup node array";
317 goto error;
318 }
319 orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
320 if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
321 ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
322 ORTE_GLOBAL_ARRAY_MAX_SIZE,
323 ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
324 ORTE_ERROR_LOG(ret);
325 error = "setup node topologies array";
326 goto error;
327 }
328
329
330 jdata = OBJ_NEW(orte_job_t);
331 jdata->jobid = ORTE_PROC_MY_NAME->jobid;
332 opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
333
334 app = OBJ_NEW(orte_app_context_t);
335 opal_pointer_array_set_item(jdata->apps, 0, app);
336 jdata->num_apps++;
337
338
339 proc = OBJ_NEW(orte_proc_t);
340 proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
341 proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
342 proc->pid = orte_process_info.pid;
343 proc->state = ORTE_PROC_STATE_RUNNING;
344 opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
345
346 jdata->num_procs = 1;
347 jdata->state = ORTE_JOB_STATE_RUNNING;
348
349 jdata->num_reported = 1;
350
351
352
353 opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
354 if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
355 ORTE_ERROR_LOG(ret);
356 error = "orte_pmix_base_open";
357 goto error;
358 }
359 if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
360 ORTE_ERROR_LOG(ret);
361 error = "opal_pmix_base_select";
362 goto error;
363 }
364
365 opal_pmix_base_set_evbase(orte_event_base);
366
367
368
369 if (ORTE_SUCCESS != (ret = pmix_server_init())) {
370
371 ret = ORTE_ERR_SILENT;
372 error = "pmix_server_init";
373 goto error;
374 }
375
376
377
378 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
379 ORTE_ERROR_LOG(ret);
380 error = "orte_routed_base_open";
381 goto error;
382 }
383 if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
384 ORTE_ERROR_LOG(ret);
385 error = "orte_routed_base_select";
386 goto error;
387 }
388 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
389 ORTE_ERROR_LOG(ret);
390 error = "orte_oob_base_open";
391 goto error;
392 }
393 if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
394 ORTE_ERROR_LOG(ret);
395 error = "orte_oob_base_select";
396 goto error;
397 }
398 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
399 ORTE_ERROR_LOG(ret);
400 error = "orte_rml_base_open";
401 goto error;
402 }
403 if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
404 ORTE_ERROR_LOG(ret);
405 error = "orte_rml_base_select";
406 goto error;
407 }
408
409
410 pmix_server_start();
411
412 if (NULL != orte_process_info.my_hnp_uri) {
413 opal_value_t val;
414
415
416 if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
417 ORTE_PROC_MY_HNP, NULL))) {
418 ORTE_ERROR_LOG(ret);
419 error = "orte_rml_parse_HNP";
420 goto error;
421 }
422
423
424
425
426 OBJ_CONSTRUCT(&val, opal_value_t);
427 val.key = OPAL_PMIX_PROC_URI;
428 val.type = OPAL_STRING;
429 val.data.string = orte_process_info.my_hnp_uri;
430 if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_HNP, &val))) {
431 ORTE_ERROR_LOG(ret);
432 val.key = NULL;
433 val.data.string = NULL;
434 OBJ_DESTRUCT(&val);
435 error = "store HNP URI";
436 goto error;
437 }
438 val.key = NULL;
439 val.data.string = NULL;
440 OBJ_DESTRUCT(&val);
441 }
442
443
444 if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
445 ORTE_ERROR_LOG(ret);
446 error = "orte_errmgr_base_select";
447 goto error;
448 }
449
450
451
452
453 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
454 ORTE_ERROR_LOG(ret);
455 error = "orte_grpcomm_base_open";
456 goto error;
457 }
458 if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
459 ORTE_ERROR_LOG(ret);
460 error = "orte_grpcomm_base_select";
461 goto error;
462 }
463
464 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
465 ORTE_ERROR_LOG(ret);
466 error = "orte_odls_base_open";
467 goto error;
468 }
469 if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
470 ORTE_ERROR_LOG(ret);
471 error = "orte_odls_base_select";
472 goto error;
473 }
474
475 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) {
476 ORTE_ERROR_LOG(ret);
477 error = "orte_rtc_base_open";
478 goto error;
479 }
480 if (ORTE_SUCCESS != (ret = orte_rtc_base_select())) {
481 ORTE_ERROR_LOG(ret);
482 error = "orte_rtc_base_select";
483 goto error;
484 }
485 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) {
486 ORTE_ERROR_LOG(ret);
487 error = "orte_rmaps_base_open";
488 goto error;
489 }
490 if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) {
491 ORTE_ERROR_LOG(ret);
492 error = "orte_rmaps_base_select";
493 goto error;
494 }
495
496
497
498
499
500 t = OBJ_NEW(orte_topology_t);
501 t->topo = opal_hwloc_topology;
502
503 orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology);
504 t->sig = strdup(orte_topo_signature);
505 opal_pointer_array_add(orte_node_topologies, t);
506 if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
507 opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
508 opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
509 }
510
511
512
513
514
515
516
517
518 if (plm_in_use) {
519 if (ORTE_SUCCESS != (ret = orte_plm.init())) {
520 ORTE_ERROR_LOG(ret);
521 error = "orte_plm_init";
522 goto error;
523 }
524 }
525
526
527 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
528 ORTE_ERROR_LOG(ret);
529 error = "orte_iof_base_open";
530 goto error;
531 }
532 if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
533 ORTE_ERROR_LOG(ret);
534 error = "orte_iof_base_select";
535 goto error;
536 }
537
538 if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
539 ORTE_ERROR_LOG(ret);
540 error = "orte_filem_base_open";
541 goto error;
542 }
543 if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
544 ORTE_ERROR_LOG(ret);
545 error = "orte_filem_base_select";
546 goto error;
547 }
548
549 return ORTE_SUCCESS;
550
551 error:
552 orte_show_help("help-orte-runtime.txt",
553 "orte_init:startup:internal-failure",
554 true, error, ORTE_ERROR_NAME(ret), ret);
555
556 orte_session_dir_finalize(ORTE_PROC_MY_NAME);
557
558 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
559 return ORTE_ERR_SILENT;
560 }
561
562 int orte_ess_base_orted_finalize(void)
563 {
564 orte_ess_base_signal_t *sig;
565 unsigned int i;
566
567 if (signals_set) {
568 opal_event_del(&epipe_handler);
569 opal_event_del(&term_handler);
570 opal_event_del(&int_handler);
571
572 i = 0;
573 OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) {
574 opal_event_signal_del(forward_signals_events + i);
575 ++i;
576 }
577 free (forward_signals_events);
578 forward_signals_events = NULL;
579 signals_set = false;
580 }
581
582
583 if (NULL != log_path) {
584 unlink(log_path);
585 }
586
587 pmix_server_finalize();
588 (void) mca_base_framework_close(&opal_pmix_base_framework);
589
590
591 (void) mca_base_framework_close(&orte_filem_base_framework);
592 (void) mca_base_framework_close(&orte_grpcomm_base_framework);
593 (void) mca_base_framework_close(&orte_iof_base_framework);
594 (void) mca_base_framework_close(&orte_errmgr_base_framework);
595 (void) mca_base_framework_close(&orte_plm_base_framework);
596
597 orte_odls.kill_local_procs(NULL);
598 (void) mca_base_framework_close(&orte_rmaps_base_framework);
599 (void) mca_base_framework_close(&orte_rtc_base_framework);
600 (void) mca_base_framework_close(&orte_odls_base_framework);
601 (void) mca_base_framework_close(&orte_routed_base_framework);
602 (void) mca_base_framework_close(&orte_rml_base_framework);
603 (void) mca_base_framework_close(&orte_oob_base_framework);
604 (void) mca_base_framework_close(&orte_state_base_framework);
605
606 orte_session_dir_finalize(ORTE_PROC_MY_NAME);
607
608 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
609
610 OBJ_RELEASE(orte_job_data);
611 return ORTE_SUCCESS;
612 }
613
614 static void shutdown_signal(int fd, short flags, void *arg)
615 {
616
617
618
619
620 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
621 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
622 }
623
624
625
626
/**
 * Event callback registered for SIGPIPE.
 *
 * Intentionally empty: the signal is received and nothing is done with
 * it.  All arguments are unused.
 */
static void epipe_signal_callback(int fd, short flags, void *arg)
{
    (void) fd;
    (void) flags;
    (void) arg;
}
632
633
634 static void signal_forward_callback(int fd, short event, void *arg)
635 {
636 opal_event_t *signal = (opal_event_t*)arg;
637 int32_t signum, rc;
638 opal_buffer_t *cmd;
639 orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS;
640 orte_jobid_t job = ORTE_JOBID_WILDCARD;
641
642 signum = OPAL_EVENT_SIGNAL(signal);
643 if (!orte_execute_quiet){
644 fprintf(stderr, "%s: Forwarding signal %d to job\n",
645 orte_basename, signum);
646 }
647
648 cmd = OBJ_NEW(opal_buffer_t);
649
650
651 if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
652 ORTE_ERROR_LOG(rc);
653 OBJ_RELEASE(cmd);
654 return;
655 }
656
657
658 if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
659 ORTE_ERROR_LOG(rc);
660 OBJ_RELEASE(cmd);
661 return;
662 }
663
664
665 if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &signum, 1, OPAL_INT32))) {
666 ORTE_ERROR_LOG(rc);
667 OBJ_RELEASE(cmd);
668 return;
669 }
670
671
672 if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, cmd,
673 ORTE_RML_TAG_DAEMON,
674 NULL, NULL))) {
675 ORTE_ERROR_LOG(rc);
676 OBJ_RELEASE(cmd);
677 }
678
679 }