This source file includes the following definitions.
- orte_daemon_recv
- get_orted_comm_cmd_str
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 #include "orte_config.h"
28 #include "orte/constants.h"
29
30 #include <stdio.h>
31 #include <stddef.h>
32 #include <ctype.h>
33 #ifdef HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #ifdef HAVE_NETDB_H
37 #include <netdb.h>
38 #endif
39 #ifdef HAVE_SYS_PARAM_H
40 #include <sys/param.h>
41 #endif
42 #include <fcntl.h>
43 #include <errno.h>
44 #include <signal.h>
45 #include <time.h>
46
47
48 #include "opal/mca/event/event.h"
49 #include "opal/mca/base/base.h"
50 #include "opal/mca/pstat/pstat.h"
51 #include "opal/util/output.h"
52 #include "opal/util/opal_environ.h"
53 #include "opal/util/path.h"
54 #include "opal/runtime/opal.h"
55 #include "opal/runtime/opal_progress.h"
56 #include "opal/dss/dss.h"
57 #include "opal/mca/compress/compress.h"
58
59 #include "orte/util/proc_info.h"
60 #include "orte/util/session_dir.h"
61 #include "orte/util/name_fns.h"
62 #include "orte/util/nidmap.h"
63
64 #include "orte/mca/errmgr/errmgr.h"
65 #include "orte/mca/grpcomm/base/base.h"
66 #include "orte/mca/iof/base/base.h"
67 #include "orte/mca/rml/rml.h"
68 #include "orte/mca/rml/rml_types.h"
69 #include "orte/mca/odls/odls.h"
70 #include "orte/mca/odls/base/base.h"
71 #include "orte/mca/oob/base/base.h"
72 #include "orte/mca/plm/plm.h"
73 #include "orte/mca/plm/base/plm_private.h"
74 #include "orte/mca/rmaps/rmaps_types.h"
75 #include "orte/mca/routed/routed.h"
76 #include "orte/mca/ess/ess.h"
77 #include "orte/mca/state/state.h"
78
79 #include "orte/mca/odls/base/odls_private.h"
80
81 #include "orte/runtime/runtime.h"
82 #include "orte/runtime/orte_globals.h"
83 #include "orte/runtime/orte_wait.h"
84 #include "orte/runtime/orte_quit.h"
85
86 #include "orte/orted/orted.h"
87
88
89
90
/*
 * Forward declaration; the definition is outside this part of the file.
 * orte_daemon_recv() calls this with the unpacked daemon command, prints the
 * result with "%s" in its verbose "Processing Command" message, and then
 * free()s it -- so the caller treats the returned string as heap-allocated
 * and owned by the caller.
 */
91 static char *get_orted_comm_cmd_str(int command);
92
/*
 * Procs that earlier ORTE_DAEMON_ABORT_PROCS_CALLED requests already ordered
 * to terminate.  Starts out NULL; orte_daemon_recv() creates the array the
 * first time it handles that command, compares each newly requested proc
 * name against the entries, and appends only the ones not found, so a proc
 * requested again is reported as "Dup" and not passed to terminate_procs a
 * second time.
 * NOTE(review): nothing in the visible code releases this array or its
 * entries -- confirm whether it is intended to live for the whole process.
 */
93 static opal_pointer_array_t *procs_prev_ordered_to_terminate = NULL;
94
95 void orte_daemon_recv(int status, orte_process_name_t* sender,
96 opal_buffer_t *buffer, orte_rml_tag_t tag,
97 void* cbdata)
98 {
99 orte_daemon_cmd_flag_t command;
100 opal_buffer_t *relay_msg;
101 int ret;
102 orte_std_cntr_t n;
103 int32_t signal;
104 orte_jobid_t job;
105 char *contact_info;
106 opal_buffer_t data, *answer;
107 orte_job_t *jdata;
108 orte_process_name_t proc, proc2;
109 orte_process_name_t *return_addr;
110 int32_t i, num_replies;
111 bool hnp_accounted_for;
112 opal_pointer_array_t procarray;
113 orte_proc_t *proct;
114 char *cmd_str = NULL;
115 opal_pointer_array_t *procs_to_kill = NULL;
116 orte_std_cntr_t num_procs, num_new_procs = 0, p;
117 orte_proc_t *cur_proc = NULL, *prev_proc = NULL;
118 bool found = false;
119 orte_node_t *node;
120 orte_grpcomm_signature_t *sig;
121 FILE *fp;
122 char gscmd[256], path[1035], *pathptr;
123 char string[256], *string_ptr = string;
124 float pss;
125 opal_pstats_t pstat;
126 char *coprocessors;
127 orte_job_map_t *map;
128 int8_t flag;
129 uint8_t *cmpdata, u8;
130 size_t cmplen;
131
132
133 n = 1;
134 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
135 ORTE_ERROR_LOG(ret);
136 return;
137 }
138
139 cmd_str = get_orted_comm_cmd_str(command);
140 OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
141 "%s orted:comm:process_commands() Processing Command: %s",
142 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd_str));
143 free(cmd_str);
144 cmd_str = NULL;
145
146
147 switch(command) {
148
149
150 case ORTE_DAEMON_NULL_CMD:
151 ret = ORTE_SUCCESS;
152 break;
153
154
155 case ORTE_DAEMON_KILL_LOCAL_PROCS:
156 num_replies = 0;
157
158
159 OBJ_CONSTRUCT(&procarray, opal_pointer_array_t);
160 opal_pointer_array_init(&procarray, num_replies, ORTE_GLOBAL_ARRAY_MAX_SIZE, 16);
161
162
163 while (ORTE_SUCCESS == (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) {
164 proct = OBJ_NEW(orte_proc_t);
165 proct->name.jobid = proc.jobid;
166 proct->name.vpid = proc.vpid;
167
168 opal_pointer_array_add(&procarray, proct);
169 num_replies++;
170 }
171 if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
172 ORTE_ERROR_LOG(ret);
173 goto KILL_PROC_CLEANUP;
174 }
175
176 if (0 == num_replies) {
177
178 if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(NULL))) {
179 ORTE_ERROR_LOG(ret);
180 }
181 break;
182 } else {
183
184 if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(&procarray))) {
185 ORTE_ERROR_LOG(ret);
186 }
187 }
188
189
190 KILL_PROC_CLEANUP:
191 for (i=0; i < procarray.size; i++) {
192 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(&procarray, i))) {
193 free(proct);
194 }
195 }
196 OBJ_DESTRUCT(&procarray);
197 break;
198
199
200 case ORTE_DAEMON_SIGNAL_LOCAL_PROCS:
201
202 n = 1;
203 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
204 ORTE_ERROR_LOG(ret);
205 goto CLEANUP;
206 }
207
208
209 jdata = orte_get_job_data_object(job);
210
211
212 n = 1;
213 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &signal, &n, OPAL_INT32))) {
214 ORTE_ERROR_LOG(ret);
215 goto CLEANUP;
216 }
217
218
219 if (SIGTSTP == signal) {
220 if (orte_debug_daemons_flag) {
221 opal_output(0, "%s orted_cmd: converted SIGTSTP to SIGSTOP before delivering",
222 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
223 }
224 signal = SIGSTOP;
225 if (NULL != jdata) {
226 jdata->state |= ORTE_JOB_STATE_SUSPENDED;
227 }
228 } else if (SIGCONT == signal && NULL != jdata) {
229 jdata->state &= ~ORTE_JOB_STATE_SUSPENDED;
230 }
231
232 if (orte_debug_daemons_flag) {
233 opal_output(0, "%s orted_cmd: received signal_local_procs, delivering signal %d",
234 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
235 signal);
236 }
237
238
239 if (ORTE_SUCCESS != (ret = orte_odls.signal_local_procs(NULL, signal))) {
240 ORTE_ERROR_LOG(ret);
241 }
242 break;
243
244
245 case ORTE_DAEMON_PASS_NODE_INFO_CMD:
246 if (orte_debug_daemons_flag) {
247 opal_output(0, "%s orted_cmd: received pass_node_info",
248 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
249 }
250 if (!ORTE_PROC_IS_HNP) {
251 n = 1;
252 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &u8, &n, OPAL_UINT8))) {
253 ORTE_ERROR_LOG(ret);
254 goto CLEANUP;
255 }
256 if (1 == u8) {
257 if (ORTE_SUCCESS != (ret = orte_util_decode_nidmap(buffer))) {
258 ORTE_ERROR_LOG(ret);
259 goto CLEANUP;
260 }
261 }
262 if (ORTE_SUCCESS != (ret = orte_util_parse_node_info(buffer))) {
263 ORTE_ERROR_LOG(ret);
264 goto CLEANUP;
265 }
266 }
267 break;
268
269
270
271 case ORTE_DAEMON_ADD_LOCAL_PROCS:
272 case ORTE_DAEMON_DVM_ADD_PROCS:
273 if (orte_debug_daemons_flag) {
274 opal_output(0, "%s orted_cmd: received add_local_procs",
275 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
276 }
277
278
279 if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(buffer))) {
280 OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
281 "%s orted:comm:add_procs failed to launch on error %s",
282 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret)));
283 }
284 break;
285
286 case ORTE_DAEMON_ABORT_PROCS_CALLED:
287 if (orte_debug_daemons_flag) {
288 opal_output(0, "%s orted_cmd: received abort_procs report",
289 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
290 }
291
292
293 n = 1;
294 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_procs, &n, ORTE_STD_CNTR)) ) {
295 ORTE_ERROR_LOG(ret);
296 goto CLEANUP;
297 }
298
299
300 procs_to_kill = OBJ_NEW(opal_pointer_array_t);
301 opal_pointer_array_init(procs_to_kill, num_procs, INT32_MAX, 2);
302
303
304
305
306 if( NULL == procs_prev_ordered_to_terminate ) {
307 procs_prev_ordered_to_terminate = OBJ_NEW(opal_pointer_array_t);
308 opal_pointer_array_init(procs_prev_ordered_to_terminate, num_procs+1, INT32_MAX, 8);
309 }
310
311 num_new_procs = 0;
312 for( i = 0; i < num_procs; ++i) {
313 cur_proc = OBJ_NEW(orte_proc_t);
314
315 n = 1;
316 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(cur_proc->name), &n, ORTE_NAME)) ) {
317 ORTE_ERROR_LOG(ret);
318 goto CLEANUP;
319 }
320
321
322 found = false;
323 for( p = 0; p < procs_prev_ordered_to_terminate->size; ++p) {
324 if( NULL == (prev_proc = (orte_proc_t*)opal_pointer_array_get_item(procs_prev_ordered_to_terminate, p))) {
325 continue;
326 }
327 if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
328 &cur_proc->name,
329 &prev_proc->name) ) {
330 found = true;
331 break;
332 }
333 }
334
335 OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
336 "%s orted:comm:abort_procs Application %s requests term. of %s (%2d of %2d) %3s.",
337 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
338 ORTE_NAME_PRINT(sender),
339 ORTE_NAME_PRINT(&(cur_proc->name)), i, num_procs,
340 (found ? "Dup" : "New") ));
341
342
343 if( !found ) {
344 opal_pointer_array_add(procs_to_kill, (void*)cur_proc);
345 OBJ_RETAIN(cur_proc);
346 opal_pointer_array_add(procs_prev_ordered_to_terminate, (void*)cur_proc);
347 num_new_procs++;
348 }
349 }
350
351
352
353
354 if( num_new_procs > 0 ) {
355 OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
356 "%s orted:comm:abort_procs Terminating application requested processes (%2d / %2d).",
357 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
358 num_new_procs, num_procs));
359 orte_plm.terminate_procs(procs_to_kill);
360 } else {
361 OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
362 "%s orted:comm:abort_procs No new application processes to terminating from request (%2d / %2d).",
363 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
364 num_new_procs, num_procs));
365 }
366
367 break;
368
369
370 case ORTE_DAEMON_EXIT_CMD:
371 if (orte_debug_daemons_flag) {
372 opal_output(0, "%s orted_cmd: received exit cmd",
373 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
374 }
375 if (orte_do_not_launch) {
376 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
377 return;
378 }
379
380 orte_odls.kill_local_procs(NULL);
381
382 orte_orteds_term_ordered = true;
383
384 if (0 == (ret = orte_routed.num_routes())) {
385 for (i=0; i < orte_local_children->size; i++) {
386 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
387 ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
388
389 if (orte_debug_daemons_flag) {
390 opal_output(0, "%s orted_cmd: exit cmd, but proc %s is alive",
391 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
392 ORTE_NAME_PRINT(&proct->name));
393 }
394 return;
395 }
396 }
397
398 if (orte_debug_daemons_flag) {
399 opal_output(0, "%s orted_cmd: all routes and children gone - exiting",
400 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
401 }
402 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
403 } else if (orte_debug_daemons_flag) {
404 opal_output(0, "%s orted_cmd: exit cmd, %d routes still exist",
405 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ret);
406 }
407 return;
408 break;
409
410
411 case ORTE_DAEMON_HALT_VM_CMD:
412 if (orte_debug_daemons_flag) {
413 opal_output(0, "%s orted_cmd: received halt_vm cmd",
414 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
415 }
416 if (orte_do_not_launch) {
417 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
418 return;
419 }
420
421 orte_odls.kill_local_procs(NULL);
422
423 orte_orteds_term_ordered = true;
424 if (ORTE_PROC_IS_HNP) {
425
426 if (0 == orte_routed.num_routes()) {
427 for (i=0; i < orte_local_children->size; i++) {
428 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
429 ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
430
431 return;
432 }
433 }
434
435 if (orte_debug_daemons_flag) {
436 opal_output(0, "%s orted_cmd: all routes and children gone - exiting",
437 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
438 }
439 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
440 }
441 } else {
442 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
443 }
444 return;
445 break;
446
447
448 case ORTE_DAEMON_HALT_DVM_CMD:
449 if (orte_debug_daemons_flag) {
450 opal_output(0, "%s orted_cmd: received halt_dvm cmd",
451 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
452 }
453
454
455 answer = OBJ_NEW(opal_buffer_t);
456 command = ORTE_DAEMON_HALT_VM_CMD;
457 opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD);
458 sig = OBJ_NEW(orte_grpcomm_signature_t);
459 sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
460 sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
461 sig->signature[0].vpid = ORTE_VPID_WILDCARD;
462 orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, answer);
463 OBJ_RELEASE(answer);
464 OBJ_RELEASE(sig);
465 return;
466 break;
467
468
469 case ORTE_DAEMON_SPAWN_JOB_CMD:
470 if (orte_debug_daemons_flag) {
471 opal_output(0, "%s orted_cmd: received spawn job",
472 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
473 }
474
475 if (ORTE_PROC_IS_HNP) {
476
477 n = 1;
478 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &jdata, &n, ORTE_JOB))) {
479 ORTE_ERROR_LOG(ret);
480 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
481 break;
482 }
483
484 jdata->originator = *sender;
485
486 if (ORTE_SUCCESS != (ret = orte_plm_base_create_jobid(jdata))) {
487 ORTE_ERROR_LOG(ret);
488 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
489 break;
490 }
491
492 opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
493
494
495 {
496 orte_iof_tag_t ioftag;
497 opal_buffer_t *iofbuf;
498 orte_process_name_t source;
499
500 ioftag = ORTE_IOF_EXCLUSIVE | ORTE_IOF_STDOUTALL | ORTE_IOF_PULL;
501 iofbuf = OBJ_NEW(opal_buffer_t);
502
503 if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, &ioftag, 1, ORTE_IOF_TAG))) {
504 ORTE_ERROR_LOG(ret);
505 OBJ_RELEASE(iofbuf);
506 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
507 break;
508 }
509
510 source.jobid = jdata->jobid;
511 source.vpid = ORTE_VPID_WILDCARD;
512 if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, &source, 1, ORTE_NAME))) {
513 ORTE_ERROR_LOG(ret);
514 OBJ_RELEASE(iofbuf);
515 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
516 break;
517 }
518
519 if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, sender, 1, ORTE_NAME))) {
520 ORTE_ERROR_LOG(ret);
521 OBJ_RELEASE(iofbuf);
522 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
523 break;
524 }
525
526 orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, iofbuf, ORTE_RML_TAG_IOF_HNP,
527 orte_rml_send_callback, NULL);
528 }
529 for (i=1; i < orte_node_pool->size; i++) {
530 if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
531 node->state = ORTE_NODE_STATE_ADDED;
532 }
533 }
534
535 if (ORTE_SUCCESS != (ret = orte_plm.spawn(jdata))) {
536 ORTE_ERROR_LOG(ret);
537 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
538 break;
539 }
540 }
541 break;
542
543
544 case ORTE_DAEMON_TERMINATE_JOB_CMD:
545
546
547 n = 1;
548 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
549 ORTE_ERROR_LOG(ret);
550 goto CLEANUP;
551 }
552
553
554 if (NULL == (jdata = orte_get_job_data_object(job))) {
555 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
556 goto CLEANUP;
557 }
558
559
560 if (ORTE_SUCCESS != (ret = orte_set_attribute(&jdata->attributes, ORTE_JOB_CANCELLED,
561 ORTE_ATTR_LOCAL, NULL, OPAL_BOOL))) {
562 ORTE_ERROR_LOG(ret);
563 goto CLEANUP;
564 }
565
566 if (ORTE_SUCCESS != (ret = orte_plm.terminate_job(job))) {
567 ORTE_ERROR_LOG(ret);
568 goto CLEANUP;
569 }
570 break;
571
572
573
574 case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD:
575
576 n = 1;
577 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
578 ORTE_ERROR_LOG(ret);
579 goto CLEANUP;
580 }
581
582
583 if (NULL == (jdata = orte_get_job_data_object(job))) {
584
585
586 goto CLEANUP;
587 }
588
589
590
591
592 if (0 < jdata->num_local_procs) {
593 goto CLEANUP;
594 }
595
596
597
598 if (NULL != jdata->map) {
599 map = (orte_job_map_t*)jdata->map;
600 for (n = 0; n < map->nodes->size; n++) {
601 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
602 continue;
603 }
604 for (i = 0; i < node->procs->size; i++) {
605 if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
606 continue;
607 }
608 if (proct->name.jobid != jdata->jobid) {
609
610 continue;
611 }
612 node->slots_inuse--;
613 node->num_procs--;
614
615 opal_pointer_array_set_item(node->procs, i, NULL);
616
617 OBJ_RELEASE(proct);
618 }
619
620 opal_pointer_array_set_item(map->nodes, n, NULL);
621
622 OBJ_RELEASE(node);
623
624 ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
625 }
626 OBJ_RELEASE(map);
627 jdata->map = NULL;
628 }
629 break;
630
631
632
633 case ORTE_DAEMON_REPORT_TOPOLOGY_CMD:
634 OBJ_CONSTRUCT(&data, opal_buffer_t);
635
636 if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &orte_topo_signature, 1, OPAL_STRING))) {
637 ORTE_ERROR_LOG(ret);
638 OBJ_DESTRUCT(&data);
639 goto CLEANUP;
640 }
641
642 if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
643 ORTE_ERROR_LOG(ret);
644 OBJ_DESTRUCT(&data);
645 goto CLEANUP;
646 }
647
648
649 coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
650 if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &coprocessors, 1, OPAL_STRING))) {
651 ORTE_ERROR_LOG(ret);
652 }
653 if (NULL != coprocessors) {
654 free(coprocessors);
655 }
656
657 coprocessors = opal_hwloc_base_check_on_coprocessor();
658 if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &coprocessors, 1, OPAL_STRING))) {
659 ORTE_ERROR_LOG(ret);
660 }
661 if (NULL!= coprocessors) {
662 free(coprocessors);
663 }
664 answer = OBJ_NEW(opal_buffer_t);
665 if (opal_compress.compress_block((uint8_t*)data.base_ptr, data.bytes_used,
666 &cmpdata, &cmplen)) {
667
668 flag = 1;
669 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &flag, 1, OPAL_INT8))) {
670 ORTE_ERROR_LOG(ret);
671 free(cmpdata);
672 OBJ_DESTRUCT(&data);
673 OBJ_RELEASE(answer);
674 goto CLEANUP;
675 }
676
677 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &cmplen, 1, OPAL_SIZE))) {
678 ORTE_ERROR_LOG(ret);
679 free(cmpdata);
680 OBJ_DESTRUCT(&data);
681 OBJ_RELEASE(answer);
682 goto CLEANUP;
683 }
684
685 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &data.bytes_used, 1, OPAL_SIZE))) {
686 ORTE_ERROR_LOG(ret);
687 free(cmpdata);
688 OBJ_DESTRUCT(&data);
689 OBJ_RELEASE(answer);
690 goto CLEANUP;
691 }
692
693 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, cmpdata, cmplen, OPAL_UINT8))) {
694 ORTE_ERROR_LOG(ret);
695 free(cmpdata);
696 OBJ_DESTRUCT(&data);
697 OBJ_RELEASE(answer);
698 goto CLEANUP;
699 }
700 OBJ_DESTRUCT(&data);
701 free(cmpdata);
702 } else {
703
704 flag = 0;
705 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &flag, 1, OPAL_INT8))) {
706 ORTE_ERROR_LOG(ret);
707 OBJ_DESTRUCT(&data);
708 free(cmpdata);
709 OBJ_RELEASE(answer);
710 goto CLEANUP;
711 }
712
713 opal_dss.copy_payload(answer, &data);
714 OBJ_DESTRUCT(&data);
715 }
716
717 if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOPOLOGY_REPORT,
718 orte_rml_send_callback, NULL))) {
719 ORTE_ERROR_LOG(ret);
720 OBJ_RELEASE(answer);
721 }
722 break;
723
724
725 case ORTE_DAEMON_CONTACT_QUERY_CMD:
726 if (orte_debug_daemons_flag) {
727 opal_output(0, "%s orted_cmd: received contact query",
728 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
729 }
730
731 orte_oob_base_get_addr(&contact_info);
732
733 if (NULL == contact_info) {
734 ORTE_ERROR_LOG(ORTE_ERROR);
735 ret = ORTE_ERROR;
736 goto CLEANUP;
737 }
738
739
740 answer = OBJ_NEW(opal_buffer_t);
741 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &contact_info, 1, OPAL_STRING))) {
742 ORTE_ERROR_LOG(ret);
743 OBJ_RELEASE(answer);
744 goto CLEANUP;
745 }
746
747 if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
748 orte_rml_send_callback, NULL))) {
749 ORTE_ERROR_LOG(ret);
750 OBJ_RELEASE(answer);
751 }
752 break;
753
754
755 case ORTE_DAEMON_REPORT_JOB_INFO_CMD:
756 if (orte_debug_daemons_flag) {
757 opal_output(0, "%s orted_cmd: received job info query",
758 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
759 }
760
761
762
763 if (!ORTE_PROC_IS_HNP) {
764 int32_t zero=0;
765
766 answer = OBJ_NEW(opal_buffer_t);
767 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
768 ORTE_ERROR_LOG(ret);
769 OBJ_RELEASE(answer);
770 goto CLEANUP;
771 }
772 if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
773 orte_rml_send_callback, NULL))) {
774 ORTE_ERROR_LOG(ret);
775 OBJ_RELEASE(answer);
776 }
777 } else {
778
779 int32_t rc, num_jobs;
780 orte_job_t *jobdat;
781
782
783 n = 1;
784 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
785 ORTE_ERROR_LOG(ret);
786 goto CLEANUP;
787 }
788
789
790 answer = OBJ_NEW(opal_buffer_t);
791
792
793 if (ORTE_JOBID_WILDCARD != job) {
794 job = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, job);
795 if (NULL != (jobdat = orte_get_job_data_object(job))) {
796 num_jobs = 1;
797 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
798 ORTE_ERROR_LOG(ret);
799 OBJ_RELEASE(answer);
800 goto CLEANUP;
801 }
802 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) {
803 ORTE_ERROR_LOG(ret);
804 OBJ_RELEASE(answer);
805 goto CLEANUP;
806 }
807 } else {
808
809 num_jobs = 0;
810 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
811 ORTE_ERROR_LOG(ret);
812 OBJ_RELEASE(answer);
813 goto CLEANUP;
814 }
815 }
816 } else {
817 uint32_t u32;
818 void *nptr;
819 num_jobs = opal_hash_table_get_size(orte_job_data);
820
821 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
822 ORTE_ERROR_LOG(ret);
823 OBJ_RELEASE(answer);
824 goto CLEANUP;
825 }
826
827 rc = opal_hash_table_get_first_key_uint32(orte_job_data, &u32, (void **)&jobdat, &nptr);
828 while (OPAL_SUCCESS == rc) {
829 if (NULL != jobdat) {
830
831 if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) {
832 ORTE_ERROR_LOG(ret);
833 OBJ_RELEASE(answer);
834 goto CLEANUP;
835 }
836 ++num_jobs;
837 }
838 rc = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&jobdat, nptr, &nptr);
839 }
840 }
841 if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
842 orte_rml_send_callback, NULL))) {
843 ORTE_ERROR_LOG(ret);
844 OBJ_RELEASE(answer);
845 }
846 }
847 break;
848
849
850 case ORTE_DAEMON_REPORT_NODE_INFO_CMD:
851 if (orte_debug_daemons_flag) {
852 opal_output(0, "%s orted_cmd: received node info query",
853 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
854 }
855
856
857
858 if (!ORTE_PROC_IS_HNP) {
859 int32_t zero=0;
860
861 answer = OBJ_NEW(opal_buffer_t);
862 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
863 ORTE_ERROR_LOG(ret);
864 OBJ_RELEASE(answer);
865 goto CLEANUP;
866 }
867 if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
868 orte_rml_send_callback, NULL))) {
869 ORTE_ERROR_LOG(ret);
870 OBJ_RELEASE(answer);
871 }
872 } else {
873
874 int32_t i, num_nodes;
875 orte_node_t *node;
876 char *nid;
877
878
879 n = 1;
880 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &nid, &n, OPAL_STRING))) {
881 ORTE_ERROR_LOG(ret);
882 goto CLEANUP;
883 }
884
885
886 answer = OBJ_NEW(opal_buffer_t);
887 num_nodes = 0;
888
889
890 if (NULL != nid) {
891
892 for (i=0; i < orte_node_pool->size; i++) {
893 if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
894 continue;
895 }
896 if (0 == strcmp(nid, node->name)) {
897 num_nodes = 1;
898 break;
899 }
900 }
901 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_nodes, 1, OPAL_INT32))) {
902 ORTE_ERROR_LOG(ret);
903 OBJ_RELEASE(answer);
904 goto CLEANUP;
905 }
906 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &node, 1, ORTE_NODE))) {
907 ORTE_ERROR_LOG(ret);
908 OBJ_RELEASE(answer);
909 goto CLEANUP;
910 }
911 } else {
912
913 for (i=0; i < orte_node_pool->size; i++) {
914 if (NULL != opal_pointer_array_get_item(orte_node_pool, i)) {
915 num_nodes++;
916 }
917 }
918
919 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_nodes, 1, OPAL_INT32))) {
920 ORTE_ERROR_LOG(ret);
921 OBJ_RELEASE(answer);
922 goto CLEANUP;
923 }
924
925 for (i=0; i < orte_node_pool->size; i++) {
926 if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
927 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &node, 1, ORTE_NODE))) {
928 ORTE_ERROR_LOG(ret);
929 OBJ_RELEASE(answer);
930 goto CLEANUP;
931 }
932 }
933 }
934 }
935
936 if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
937 orte_rml_send_callback, NULL))) {
938 ORTE_ERROR_LOG(ret);
939 OBJ_RELEASE(answer);
940 }
941 }
942 break;
943
944
945 case ORTE_DAEMON_REPORT_PROC_INFO_CMD:
946 if (orte_debug_daemons_flag) {
947 opal_output(0, "%s orted_cmd: received proc info query",
948 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
949 }
950
951
952
953 if (!ORTE_PROC_IS_HNP) {
954 int32_t zero=0;
955
956 answer = OBJ_NEW(opal_buffer_t);
957 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
958 ORTE_ERROR_LOG(ret);
959 OBJ_RELEASE(answer);
960 goto CLEANUP;
961 }
962 if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
963 orte_rml_send_callback, NULL))) {
964 ORTE_ERROR_LOG(ret);
965 OBJ_RELEASE(answer);
966 }
967 } else {
968
969 orte_job_t *jdata;
970 orte_proc_t *proc;
971 orte_vpid_t vpid;
972 int32_t i, num_procs;
973 char *nid;
974
975
976 answer = OBJ_NEW(opal_buffer_t);
977
978
979 n = 1;
980 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
981 ORTE_ERROR_LOG(ret);
982 goto CLEANUP;
983 }
984
985
986 job = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, job);
987 if (NULL == (jdata = orte_get_job_data_object(job))) {
988 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
989 goto CLEANUP;
990 }
991
992
993 n = 1;
994 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
995 ORTE_ERROR_LOG(ret);
996 goto CLEANUP;
997 }
998
999
1000
1001 if (ORTE_VPID_WILDCARD != vpid) {
1002
1003 for (i=0; i < jdata->procs->size; i++) {
1004 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
1005 continue;
1006 }
1007 if (vpid == proc->name.vpid) {
1008 num_procs = 1;
1009 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, OPAL_INT32))) {
1010 ORTE_ERROR_LOG(ret);
1011 goto CLEANUP;
1012 }
1013 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc, 1, ORTE_PROC))) {
1014 ORTE_ERROR_LOG(ret);
1015 goto CLEANUP;
1016 }
1017
1018
1019
1020
1021 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc->pid, 1, OPAL_PID))) {
1022 ORTE_ERROR_LOG(ret);
1023 goto CLEANUP;
1024 }
1025 if (NULL == proc->node) {
1026 nid = "UNKNOWN";
1027 } else {
1028 nid = proc->node->name;
1029 }
1030 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &nid, 1, OPAL_STRING))) {
1031 ORTE_ERROR_LOG(ret);
1032 goto CLEANUP;
1033 }
1034 break;
1035 }
1036 }
1037 } else {
1038
1039 num_procs = 0;
1040 for (i=0; i < jdata->procs->size; i++) {
1041 if (NULL != opal_pointer_array_get_item(jdata->procs, i)) {
1042 num_procs++;
1043 }
1044 }
1045
1046 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, OPAL_INT32))) {
1047 ORTE_ERROR_LOG(ret);
1048 OBJ_RELEASE(answer);
1049 goto CLEANUP;
1050 }
1051
1052 for (i=0; i < jdata->procs->size; i++) {
1053 if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
1054 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc, 1, ORTE_PROC))) {
1055 ORTE_ERROR_LOG(ret);
1056 OBJ_RELEASE(answer);
1057 goto CLEANUP;
1058 }
1059
1060
1061
1062
1063 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc->pid, 1, OPAL_PID))) {
1064 ORTE_ERROR_LOG(ret);
1065 goto CLEANUP;
1066 }
1067 if (NULL == proc->node) {
1068 nid = "UNKNOWN";
1069 } else {
1070 nid = proc->node->name;
1071 }
1072 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &nid, 1, OPAL_STRING))) {
1073 ORTE_ERROR_LOG(ret);
1074 goto CLEANUP;
1075 }
1076 }
1077 }
1078 }
1079
1080 if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
1081 orte_rml_send_callback, NULL))) {
1082 ORTE_ERROR_LOG(ret);
1083 OBJ_RELEASE(answer);
1084 }
1085 }
1086 break;
1087
1088
1089 case ORTE_DAEMON_HEARTBEAT_CMD:
1090 ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
1091 ret = ORTE_ERR_NOT_IMPLEMENTED;
1092 break;
1093
1094
1095 case ORTE_DAEMON_TOP_CMD:
1096
1097 answer = OBJ_NEW(opal_buffer_t);
1098 num_replies = 0;
1099 hnp_accounted_for = false;
1100
1101 n = 1;
1102 return_addr = NULL;
1103 while (ORTE_SUCCESS == opal_dss.unpack(buffer, &proc, &n, ORTE_NAME)) {
1104
1105
1106
1107 proc.jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, proc.jobid);
1108 if (ORTE_PROC_IS_HNP) {
1109 return_addr = sender;
1110 proc2.jobid = ORTE_PROC_MY_NAME->jobid;
1111
1112
1113
1114
1115 if (ORTE_VPID_WILDCARD == proc.vpid) {
1116
1117 for (proc2.vpid=1; proc2.vpid < orte_process_info.num_procs; proc2.vpid++) {
1118
1119
1120 relay_msg = OBJ_NEW(opal_buffer_t);
1121 command = ORTE_DAEMON_TOP_CMD;
1122 if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &command, 1, ORTE_DAEMON_CMD))) {
1123 ORTE_ERROR_LOG(ret);
1124 OBJ_RELEASE(relay_msg);
1125 goto SEND_TOP_ANSWER;
1126 }
1127 if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) {
1128 ORTE_ERROR_LOG(ret);
1129 OBJ_RELEASE(relay_msg);
1130 goto SEND_TOP_ANSWER;
1131 }
1132 if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, sender, 1, ORTE_NAME))) {
1133 ORTE_ERROR_LOG(ret);
1134 OBJ_RELEASE(relay_msg);
1135 goto SEND_TOP_ANSWER;
1136 }
1137
1138 if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg,
1139 ORTE_RML_TAG_DAEMON,
1140 orte_rml_send_callback, NULL)) {
1141 ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
1142 OBJ_RELEASE(relay_msg);
1143 ret = ORTE_ERR_COMM_FAILURE;
1144 }
1145 num_replies++;
1146 }
1147
1148 if (!hnp_accounted_for) {
1149 hnp_accounted_for = true;
1150 num_replies++;
1151 }
1152
1153 goto GET_TOP;
1154 } else {
1155
1156
1157
1158 if (ORTE_VPID_INVALID == (proc2.vpid = orte_get_proc_daemon_vpid(&proc))) {
1159 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1160 goto SEND_TOP_ANSWER;
1161 }
1162
1163 if (proc2.vpid == ORTE_PROC_MY_NAME->vpid) {
1164 if (!hnp_accounted_for) {
1165 hnp_accounted_for = true;
1166 num_replies++;
1167 }
1168 goto GET_TOP;
1169 }
1170
1171 relay_msg = OBJ_NEW(opal_buffer_t);
1172 command = ORTE_DAEMON_TOP_CMD;
1173 if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &command, 1, ORTE_DAEMON_CMD))) {
1174 ORTE_ERROR_LOG(ret);
1175 OBJ_RELEASE(relay_msg);
1176 goto SEND_TOP_ANSWER;
1177 }
1178 if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) {
1179 ORTE_ERROR_LOG(ret);
1180 OBJ_RELEASE(relay_msg);
1181 goto SEND_TOP_ANSWER;
1182 }
1183 if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, sender, 1, ORTE_NAME))) {
1184 ORTE_ERROR_LOG(ret);
1185 OBJ_RELEASE(relay_msg);
1186 goto SEND_TOP_ANSWER;
1187 }
1188
1189 if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg,
1190 ORTE_RML_TAG_DAEMON,
1191 orte_rml_send_callback, NULL)) {
1192 ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
1193 OBJ_RELEASE(relay_msg);
1194 ret = ORTE_ERR_COMM_FAILURE;
1195 }
1196 }
1197
1198 } else {
1199
1200
1201
1202 n = 1;
1203 if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc2, &n, ORTE_NAME))) {
1204 ORTE_ERROR_LOG(ret);
1205
1206
1207
1208
1209
1210 goto CLEANUP;
1211 }
1212 return_addr = &proc2;
1213 GET_TOP:
1214
1215
1216
1217 if (ORTE_SUCCESS != (ret = orte_odls_base_get_proc_stats(answer, &proc))) {
1218 ORTE_ERROR_LOG(ret);
1219 goto SEND_TOP_ANSWER;
1220 }
1221 }
1222 }
1223 SEND_TOP_ANSWER:
1224
1225 if (ORTE_PROC_IS_HNP) {
1226
1227
1228
1229 time_t mytime;
1230 char *cptr;
1231
1232 relay_msg = OBJ_NEW(opal_buffer_t);
1233 if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &num_replies, 1, OPAL_INT32))) {
1234 ORTE_ERROR_LOG(ret);
1235 }
1236 time(&mytime);
1237 cptr = ctime(&mytime);
1238 cptr[strlen(cptr)-1] = '\0';
1239 if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &cptr, 1, OPAL_STRING))) {
1240 ORTE_ERROR_LOG(ret);
1241 }
1242
1243 opal_dss.copy_payload(relay_msg, answer);
1244 OBJ_RELEASE(answer);
1245 answer = relay_msg;
1246 }
1247
1248 if (NULL == return_addr) {
1249 ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
1250 ret = ORTE_ERR_COMM_FAILURE;
1251 break;
1252 }
1253 if (0 > (ret = orte_rml.send_buffer_nb(return_addr, answer, ORTE_RML_TAG_TOOL,
1254 orte_rml_send_callback, NULL))) {
1255 ORTE_ERROR_LOG(ret);
1256 OBJ_RELEASE(answer);
1257 }
1258 break;
1259
1260 case ORTE_DAEMON_GET_STACK_TRACES:
1261
1262 answer = OBJ_NEW(opal_buffer_t);
1263 pathptr = path;
1264
1265
1266
1267
1268
1269 char *gstack_exec;
1270 gstack_exec = opal_find_absolute_path("gstack");
1271
1272
1273 for (i=0; i < orte_local_children->size; i++) {
1274 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
1275 ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
1276 relay_msg = OBJ_NEW(opal_buffer_t);
1277 if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) ||
1278 OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) ||
1279 OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) {
1280 OBJ_RELEASE(relay_msg);
1281 break;
1282 }
1283
1284
1285
1286 fp = NULL;
1287 if (NULL != gstack_exec) {
1288 (void) snprintf(gscmd, sizeof(gscmd), "%s %lu",
1289 gstack_exec, (unsigned long) proct->pid);
1290 fp = popen(gscmd, "r");
1291 }
1292
1293
1294
1295 if (NULL == gstack_exec || NULL == fp) {
1296 (void) snprintf(string, sizeof(string),
1297 "Failed to %s \"%s\" on %s to obtain stack traces",
1298 (NULL == gstack_exec) ? "find" : "run",
1299 (NULL == gstack_exec) ? "gstack" : gstack_exec,
1300 proct->node->name);
1301 if (OPAL_SUCCESS ==
1302 opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) {
1303 opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER);
1304 }
1305 OBJ_RELEASE(relay_msg);
1306 break;
1307 }
1308
1309 memset(path, 0, sizeof(path));
1310 while (fgets(path, sizeof(path)-1, fp) != NULL) {
1311 if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) {
1312 OBJ_RELEASE(relay_msg);
1313 break;
1314 }
1315 memset(path, 0, sizeof(path));
1316 }
1317
1318 pclose(fp);
1319
1320 if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) {
1321 OBJ_RELEASE(relay_msg);
1322 break;
1323 }
1324 OBJ_RELEASE(relay_msg);
1325 }
1326 }
1327 if (NULL != gstack_exec) {
1328 free(gstack_exec);
1329 }
1330
1331 if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
1332 ORTE_RML_TAG_STACK_TRACE,
1333 orte_rml_send_callback, NULL))) {
1334 ORTE_ERROR_LOG(ret);
1335 OBJ_RELEASE(answer);
1336 }
1337 break;
1338
1339 case ORTE_DAEMON_GET_MEMPROFILE:
1340 answer = OBJ_NEW(opal_buffer_t);
1341
1342 opal_dss.pack(answer, &orte_process_info.nodename, 1, OPAL_STRING);
1343
1344 OBJ_CONSTRUCT(&pstat, opal_pstats_t);
1345 opal_pstat.query(orte_process_info.pid, &pstat, NULL);
1346 opal_dss.pack(answer, &pstat.pss, 1, OPAL_FLOAT);
1347 OBJ_DESTRUCT(&pstat);
1348
1349 pss = 0.0;
1350 num_replies = 0;
1351 for (i=0; i < orte_local_children->size; i++) {
1352 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
1353 ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
1354
1355 OBJ_CONSTRUCT(&pstat, opal_pstats_t);
1356 if (OPAL_SUCCESS == opal_pstat.query(proct->pid, &pstat, NULL)) {
1357 pss += pstat.pss;
1358 ++num_replies;
1359 }
1360 OBJ_DESTRUCT(&pstat);
1361 }
1362 }
1363
1364 if (0 < num_replies) {
1365 pss /= (float)num_replies;
1366 }
1367 opal_dss.pack(answer, &pss, 1, OPAL_FLOAT);
1368
1369 if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
1370 ORTE_RML_TAG_MEMPROFILE,
1371 orte_rml_send_callback, NULL))) {
1372 ORTE_ERROR_LOG(ret);
1373 OBJ_RELEASE(answer);
1374 }
1375 break;
1376
1377 default:
1378 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
1379 }
1380
1381 CLEANUP:
1382 return;
1383 }
1384
1385 static char *get_orted_comm_cmd_str(int command)
1386 {
1387 switch(command) {
1388 case ORTE_DAEMON_CONTACT_QUERY_CMD:
1389 return strdup("ORTE_DAEMON_CONTACT_QUERY_CMD");
1390 case ORTE_DAEMON_KILL_LOCAL_PROCS:
1391 return strdup("ORTE_DAEMON_KILL_LOCAL_PROCS");
1392 case ORTE_DAEMON_SIGNAL_LOCAL_PROCS:
1393 return strdup("ORTE_DAEMON_SIGNAL_LOCAL_PROCS");
1394 case ORTE_DAEMON_ADD_LOCAL_PROCS:
1395 return strdup("ORTE_DAEMON_ADD_LOCAL_PROCS");
1396
1397 case ORTE_DAEMON_HEARTBEAT_CMD:
1398 return strdup("ORTE_DAEMON_HEARTBEAT_CMD");
1399 case ORTE_DAEMON_EXIT_CMD:
1400 return strdup("ORTE_DAEMON_EXIT_CMD");
1401 case ORTE_DAEMON_PROCESS_AND_RELAY_CMD:
1402 return strdup("ORTE_DAEMON_PROCESS_AND_RELAY_CMD");
1403 case ORTE_DAEMON_NULL_CMD:
1404 return strdup("NULL");
1405
1406 case ORTE_DAEMON_REPORT_JOB_INFO_CMD:
1407 return strdup("ORTE_DAEMON_REPORT_JOB_INFO_CMD");
1408 case ORTE_DAEMON_REPORT_NODE_INFO_CMD:
1409 return strdup("ORTE_DAEMON_REPORT_NODE_INFO_CMD");
1410 case ORTE_DAEMON_REPORT_PROC_INFO_CMD:
1411 return strdup("ORTE_DAEMON_REPORT_PROC_INFO_CMD");
1412 case ORTE_DAEMON_SPAWN_JOB_CMD:
1413 return strdup("ORTE_DAEMON_SPAWN_JOB_CMD");
1414 case ORTE_DAEMON_TERMINATE_JOB_CMD:
1415 return strdup("ORTE_DAEMON_TERMINATE_JOB_CMD");
1416
1417 case ORTE_DAEMON_HALT_VM_CMD:
1418 return strdup("ORTE_DAEMON_HALT_VM_CMD");
1419 case ORTE_DAEMON_HALT_DVM_CMD:
1420 return strdup("ORTE_DAEMON_HALT_DVM_CMD");
1421 case ORTE_DAEMON_REPORT_JOB_COMPLETE:
1422 return strdup("ORTE_DAEMON_REPORT_JOB_COMPLETE");
1423
1424 case ORTE_DAEMON_TOP_CMD:
1425 return strdup("ORTE_DAEMON_TOP_CMD");
1426 case ORTE_DAEMON_NAME_REQ_CMD:
1427 return strdup("ORTE_DAEMON_NAME_REQ_CMD");
1428 case ORTE_DAEMON_CHECKIN_CMD:
1429 return strdup("ORTE_DAEMON_CHECKIN_CMD");
1430
1431 case ORTE_TOOL_CHECKIN_CMD:
1432 return strdup("ORTE_TOOL_CHECKIN_CMD");
1433 case ORTE_DAEMON_PROCESS_CMD:
1434 return strdup("ORTE_DAEMON_PROCESS_CMD");
1435 case ORTE_DAEMON_ABORT_PROCS_CALLED:
1436 return strdup("ORTE_DAEMON_ABORT_PROCS_CALLED");
1437
1438 case ORTE_DAEMON_DVM_NIDMAP_CMD:
1439 return strdup("ORTE_DAEMON_DVM_NIDMAP_CMD");
1440 case ORTE_DAEMON_DVM_ADD_PROCS:
1441 return strdup("ORTE_DAEMON_DVM_ADD_PROCS");
1442
1443 case ORTE_DAEMON_GET_STACK_TRACES:
1444 return strdup("ORTE_DAEMON_GET_STACK_TRACES");
1445
1446 case ORTE_DAEMON_GET_MEMPROFILE:
1447 return strdup("ORTE_DAEMON_GET_MEMPROFILE");
1448
1449 case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD:
1450 return strdup("ORTE_DAEMON_DVM_CLEANUP_JOB_CMD");
1451
1452 default:
1453 return strdup("Unknown Command!");
1454 }
1455 }