This source file includes the following definitions.
- orte_odls_base_register
- orte_odls_base_harvest_threads
- orte_odls_base_start_threads
- orte_odls_base_close
- orte_odls_base_open
- launch_local_const
- launch_local_dest
- sccon
- scdes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 #include "orte_config.h"
28 #include "orte/constants.h"
29
30 #include <string.h>
31 #include <signal.h>
32
33 #include "opal/class/opal_ring_buffer.h"
34 #include "orte/mca/mca.h"
35 #include "opal/mca/base/base.h"
36 #include "opal/mca/hwloc/hwloc-internal.h"
37 #include "opal/runtime/opal_progress_threads.h"
38 #include "opal/util/output.h"
39 #include "opal/util/path.h"
40 #include "opal/util/argv.h"
41 #include "opal/util/printf.h"
42
43 #include "orte/mca/errmgr/errmgr.h"
44 #include "orte/mca/ess/ess.h"
45 #include "orte/mca/plm/plm_types.h"
46 #include "orte/runtime/orte_globals.h"
47 #include "orte/util/name_fns.h"
48 #include "orte/util/parse_options.h"
49 #include "orte/util/show_help.h"
50 #include "orte/util/threads.h"
51
52 #include "orte/mca/odls/base/odls_private.h"
53 #include "orte/mca/odls/base/base.h"
54
55
56
57
58
59
60
61
62 #include "orte/mca/odls/base/static-components.h"
63
64
65
66
/*
 * The ODLS module instance, zero-initialized here. Nothing in this file
 * assigns its members -- presumably they are filled in when a component is
 * selected (TODO confirm against the selection code).
 */
orte_odls_base_module_t orte_odls = {0};

/*
 * Framework-wide ODLS state, zero-initialized here. The functions below use
 * it to hold the MCA parameter values, the spawn-thread bookkeeping
 * (ev_threads, ev_bases, next_base), the xterm settings and the lock.
 */
orte_odls_globals_t orte_odls_globals = {0};
73
74 static int orte_odls_base_register(mca_base_register_flag_t flags)
75 {
76 orte_odls_globals.timeout_before_sigkill = 1;
77 (void) mca_base_var_register("orte", "odls", "base", "sigkill_timeout",
78 "Time to wait for a process to die after issuing a kill signal to it",
79 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
80 OPAL_INFO_LVL_9,
81 MCA_BASE_VAR_SCOPE_READONLY,
82 &orte_odls_globals.timeout_before_sigkill);
83
84 orte_odls_globals.max_threads = 4;
85 (void) mca_base_var_register("orte", "odls", "base", "max_threads",
86 "Maximum number of threads to use for spawning local procs",
87 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
88 OPAL_INFO_LVL_9,
89 MCA_BASE_VAR_SCOPE_READONLY,
90 &orte_odls_globals.max_threads);
91
92 orte_odls_globals.num_threads = -1;
93 (void) mca_base_var_register("orte", "odls", "base", "num_threads",
94 "Specific number of threads to use for spawning local procs",
95 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
96 OPAL_INFO_LVL_9,
97 MCA_BASE_VAR_SCOPE_READONLY,
98 &orte_odls_globals.num_threads);
99
100 orte_odls_globals.cutoff = 32;
101 (void) mca_base_var_register("orte", "odls", "base", "cutoff",
102 "Minimum number of local procs before using thread pool for spawn",
103 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
104 OPAL_INFO_LVL_9,
105 MCA_BASE_VAR_SCOPE_READONLY,
106 &orte_odls_globals.cutoff);
107
108 orte_odls_globals.signal_direct_children_only = false;
109 (void) mca_base_var_register("orte", "odls", "base", "signal_direct_children_only",
110 "Whether to restrict signals (e.g., SIGTERM) to direct children, or "
111 "to apply them as well to any children spawned by those processes",
112 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
113 OPAL_INFO_LVL_9,
114 MCA_BASE_VAR_SCOPE_READONLY,
115 &orte_odls_globals.signal_direct_children_only);
116
117 return ORTE_SUCCESS;
118 }
119
120 void orte_odls_base_harvest_threads(void)
121 {
122 int i;
123
124 ORTE_ACQUIRE_THREAD(&orte_odls_globals.lock);
125 if (0 < orte_odls_globals.num_threads) {
126
127 if (NULL != orte_odls_globals.ev_threads) {
128 for (i=0; NULL != orte_odls_globals.ev_threads[i]; i++) {
129 opal_progress_thread_finalize(orte_odls_globals.ev_threads[i]);
130 }
131 }
132 free(orte_odls_globals.ev_bases);
133 orte_odls_globals.ev_bases = (opal_event_base_t**)malloc(sizeof(opal_event_base_t*));
134
135 orte_odls_globals.ev_bases[0] = orte_event_base;
136 orte_odls_globals.num_threads = 0;
137 if (NULL != orte_odls_globals.ev_threads) {
138 opal_argv_free(orte_odls_globals.ev_threads);
139 orte_odls_globals.ev_threads = NULL;
140 }
141 }
142 ORTE_RELEASE_THREAD(&orte_odls_globals.lock);
143 }
144
145 void orte_odls_base_start_threads(orte_job_t *jdata)
146 {
147 int i;
148 char *tmp;
149
150 ORTE_ACQUIRE_THREAD(&orte_odls_globals.lock);
151
152 if (NULL != orte_odls_globals.ev_threads) {
153 ORTE_RELEASE_THREAD(&orte_odls_globals.lock);
154 return;
155 }
156
157
158 orte_odls_globals.ev_threads = NULL;
159 orte_odls_globals.next_base = 0;
160 if (-1 == orte_odls_globals.num_threads) {
161 if ((int)jdata->num_local_procs < orte_odls_globals.cutoff) {
162
163 orte_odls_globals.num_threads = 0;
164 } else {
165
166
167
168 orte_odls_globals.num_threads = jdata->num_local_procs / 8;
169 if (0 == orte_odls_globals.num_threads) {
170 orte_odls_globals.num_threads = 1;
171 } else if (orte_odls_globals.max_threads < orte_odls_globals.num_threads) {
172 orte_odls_globals.num_threads = orte_odls_globals.max_threads;
173 }
174 }
175 }
176 if (0 == orte_odls_globals.num_threads) {
177 orte_odls_globals.ev_bases = (opal_event_base_t**)malloc(sizeof(opal_event_base_t*));
178
179 orte_odls_globals.ev_bases[0] = orte_event_base;
180 } else {
181 orte_odls_globals.ev_bases =
182 (opal_event_base_t**)malloc(orte_odls_globals.num_threads * sizeof(opal_event_base_t*));
183 for (i=0; i < orte_odls_globals.num_threads; i++) {
184 opal_asprintf(&tmp, "ORTE-ODLS-%d", i);
185 orte_odls_globals.ev_bases[i] = opal_progress_thread_init(tmp);
186 opal_argv_append_nosize(&orte_odls_globals.ev_threads, tmp);
187 free(tmp);
188 }
189 }
190 ORTE_RELEASE_THREAD(&orte_odls_globals.lock);
191 }
192
193 static int orte_odls_base_close(void)
194 {
195 int i;
196 orte_proc_t *proc;
197 opal_list_item_t *item;
198
199
200 while (NULL != (item = opal_list_remove_first(&orte_odls_globals.xterm_ranks))) {
201 OBJ_RELEASE(item);
202 }
203 OBJ_DESTRUCT(&orte_odls_globals.xterm_ranks);
204
205
206 for (i=0; i < orte_local_children->size; i++) {
207 if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
208 OBJ_RELEASE(proc);
209 }
210 }
211 OBJ_RELEASE(orte_local_children);
212
213 orte_odls_base_harvest_threads();
214
215 ORTE_DESTRUCT_LOCK(&orte_odls_globals.lock);
216
217 return mca_base_framework_components_close(&orte_odls_base_framework, NULL);
218 }
219
220
221
222
223
224 static int orte_odls_base_open(mca_base_open_flag_t flags)
225 {
226 char **ranks=NULL, *tmp;
227 int rc, i, rank;
228 orte_namelist_t *nm;
229 bool xterm_hold;
230 sigset_t unblock;
231
232 ORTE_CONSTRUCT_LOCK(&orte_odls_globals.lock);
233 orte_odls_globals.lock.active = false;
234
235
236 orte_local_children = OBJ_NEW(opal_pointer_array_t);
237 if (OPAL_SUCCESS != (rc = opal_pointer_array_init(orte_local_children,
238 1,
239 ORTE_GLOBAL_ARRAY_MAX_SIZE,
240 1))) {
241 ORTE_ERROR_LOG(rc);
242 return rc;
243 }
244
245
246 OBJ_CONSTRUCT(&orte_odls_globals.xterm_ranks, opal_list_t);
247 orte_odls_globals.xtermcmd = NULL;
248
249
250 if (0 != sigemptyset(&unblock)) {
251 return ORTE_ERROR;
252 }
253 if (0 != sigaddset(&unblock, SIGCHLD)) {
254 return ORTE_ERROR;
255 }
256 if (0 != sigprocmask(SIG_UNBLOCK, &unblock, NULL)) {
257 return ORTE_ERR_NOT_SUPPORTED;
258 }
259
260
261 if (NULL != orte_xterm) {
262
263 xterm_hold = false;
264 orte_util_parse_range_options(orte_xterm, &ranks);
265 for (i=0; i < opal_argv_count(ranks); i++) {
266 if (0 == strcmp(ranks[i], "BANG")) {
267 xterm_hold = true;
268 continue;
269 }
270 nm = OBJ_NEW(orte_namelist_t);
271 rank = strtol(ranks[i], NULL, 10);
272 if (-1 == rank) {
273
274 nm->name.vpid = ORTE_VPID_WILDCARD;
275 } else if (rank < 0) {
276
277 orte_show_help("help-orte-odls-base.txt",
278 "orte-odls-base:xterm-neg-rank",
279 true, rank);
280 return ORTE_ERROR;
281 } else {
282
283
284
285
286 nm->name.vpid = rank;
287 }
288 opal_list_append(&orte_odls_globals.xterm_ranks, &nm->super);
289 }
290 opal_argv_free(ranks);
291
292 orte_odls_globals.xtermcmd = NULL;
293 tmp = opal_find_absolute_path("xterm");
294 if (NULL == tmp) {
295 return ORTE_ERROR;
296 }
297 opal_argv_append_nosize(&orte_odls_globals.xtermcmd, tmp);
298 free(tmp);
299 opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-T");
300 opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "save");
301 if (xterm_hold) {
302 opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-hold");
303 }
304 opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
305 }
306
307
308 return mca_base_framework_components_open(&orte_odls_base_framework, flags);
309 }
310
/*
 * Declare the ODLS framework for project "orte", wiring in the register,
 * open and close functions defined above together with the static component
 * list and flags of 0. The open/close functions refer to the resulting
 * object as orte_odls_base_framework.
 */
MCA_BASE_FRAMEWORK_DECLARE(orte, odls, "ORTE Daemon Launch Subsystem",
                           orte_odls_base_register, orte_odls_base_open, orte_odls_base_close,
                           mca_odls_base_static_components, 0);
314
315 static void launch_local_const(orte_odls_launch_local_t *ptr)
316 {
317 ptr->ev = opal_event_alloc();
318 ptr->job = ORTE_JOBID_INVALID;
319 ptr->fork_local = NULL;
320 ptr->retries = 0;
321 }
322 static void launch_local_dest(orte_odls_launch_local_t *ptr)
323 {
324 opal_event_free(ptr->ev);
325 }
/* Class instance for orte_odls_launch_local_t: parent class opal_object_t,
 * with launch_local_const / launch_local_dest as constructor / destructor. */
OBJ_CLASS_INSTANCE(orte_odls_launch_local_t,
                   opal_object_t,
                   launch_local_const,
                   launch_local_dest);
330
331 static void sccon(orte_odls_spawn_caddy_t *p)
332 {
333 memset(&p->opts, 0, sizeof(orte_iof_base_io_conf_t));
334 p->cmd = NULL;
335 p->wdir = NULL;
336 p->argv = NULL;
337 p->env = NULL;
338 }
339 static void scdes(orte_odls_spawn_caddy_t *p)
340 {
341 if (NULL != p->cmd) {
342 free(p->cmd);
343 }
344 if (NULL != p->wdir) {
345 free(p->wdir);
346 }
347 if (NULL != p->argv) {
348 opal_argv_free(p->argv);
349 }
350 if (NULL != p->env) {
351 opal_argv_free(p->env);
352 }
353 }
/* Class instance for orte_odls_spawn_caddy_t: parent class opal_object_t,
 * with sccon / scdes as constructor / destructor. */
OBJ_CLASS_INSTANCE(orte_odls_spawn_caddy_t,
                   opal_object_t,
                   sccon, scdes);