This source file includes the following definitions.
- orte_oob_base_send_nb
- orte_oob_base_get_addr
- process_uri
- orte_oob_base_ft_event
1
2
3
4
5
6
7
8
9
10
11
12
13
14 #include "orte_config.h"
15 #include "orte/constants.h"
16
17 #include "opal/util/output.h"
18 #include "opal/mca/pmix/pmix.h"
19 #include "opal/util/argv.h"
20 #include "opal/util/printf.h"
21
22 #include "orte/mca/errmgr/errmgr.h"
23 #include "orte/mca/state/state.h"
24 #include "orte/mca/rml/rml.h"
25 #include "orte/util/threads.h"
26 #include "orte/mca/oob/base/base.h"
27 #if OPAL_ENABLE_FT_CR == 1
28 #include "orte/mca/state/base/base.h"
29 #endif
30
31 static void process_uri(char *uri);
32
33 void orte_oob_base_send_nb(int fd, short args, void *cbdata)
34 {
35 orte_oob_send_t *cd = (orte_oob_send_t*)cbdata;
36 orte_rml_send_t *msg;
37 mca_base_component_list_item_t *cli;
38 orte_oob_base_peer_t *pr;
39 int rc;
40 uint64_t ui64;
41 bool msg_sent;
42 mca_oob_base_component_t *component;
43 bool reachable;
44 char *uri;
45
46 ORTE_ACQUIRE_OBJECT(cd);
47
48
49 msg = cd->msg;
50 OBJ_RELEASE(cd);
51
52 opal_output_verbose(5, orte_oob_base_framework.framework_output,
53 "%s oob:base:send to target %s - attempt %u",
54 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
55 ORTE_NAME_PRINT(&msg->dst), msg->retries);
56
57
58
59
60 if (orte_rml_base.max_retries <= msg->retries) {
61 msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
62 ORTE_RML_SEND_COMPLETE(msg);
63 return;
64 }
65
66
67 memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
68 if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
69 ui64, (void**)&pr) ||
70 NULL == pr) {
71 opal_output_verbose(5, orte_oob_base_framework.framework_output,
72 "%s oob:base:send unknown peer %s",
73 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
74 ORTE_NAME_PRINT(&msg->dst));
75
76
77
78
79
80 OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_PROC_URI, &msg->dst,
81 (char**)&uri, OPAL_STRING);
82 if (OPAL_SUCCESS == rc ) {
83 if (NULL != uri) {
84 process_uri(uri);
85 if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
86 ui64, (void**)&pr) ||
87 NULL == pr) {
88
89 ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
90 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
91 ORTE_RML_SEND_COMPLETE(msg);
92 return;
93 }
94 } else {
95 ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
96 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
97 ORTE_RML_SEND_COMPLETE(msg);
98 return;
99 }
100 } else {
101
102
103
104
105 reachable = false;
106 pr = NULL;
107 OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
108 component = (mca_oob_base_component_t*)cli->cli_component;
109 if (NULL != component->is_reachable) {
110 if (component->is_reachable(&msg->dst)) {
111
112
113
114 if (NULL == pr) {
115 pr = OBJ_NEW(orte_oob_base_peer_t);
116 if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
117 ORTE_ERROR_LOG(rc);
118 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
119 ORTE_RML_SEND_COMPLETE(msg);
120 return;
121 }
122 }
123
124 opal_bitmap_set_bit(&pr->addressable, component->idx);
125
126 reachable = true;
127 }
128 }
129 }
130
131 if (!reachable) {
132
133
134
135 if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
136 ++msg->retries;
137 if (msg->retries < orte_rml_base.max_retries) {
138 ORTE_OOB_SEND(msg);
139 return;
140 }
141 }
142 msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
143 ORTE_RML_SEND_COMPLETE(msg);
144 return;
145 }
146 }
147 }
148
149
150
151 if (NULL != pr->component) {
152
153
154
155 opal_output_verbose(5, orte_oob_base_framework.framework_output,
156 "%s oob:base:send known transport for peer %s",
157 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
158 ORTE_NAME_PRINT(&msg->dst));
159 if (ORTE_SUCCESS == (rc = pr->component->send_nb(msg))) {
160 return;
161 }
162 }
163
164
165
166
167
168
169 msg_sent = false;
170 OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
171 component = (mca_oob_base_component_t*)cli->cli_component;
172
173 if (!component->is_reachable(&msg->dst)) {
174 continue;
175 }
176
177 if (ORTE_SUCCESS == (rc = component->send_nb(msg))) {
178
179 msg_sent = true;
180
181 pr->component = component;
182 break;
183 } else if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
184
185
186
187 ORTE_ERROR_LOG(rc);
188 msg->status = rc;
189 ORTE_RML_SEND_COMPLETE(msg);
190 return;
191 }
192 }
193
194
195
196
197 if (!msg_sent) {
198 opal_output_verbose(5, orte_oob_base_framework.framework_output,
199 "%s oob:base:send no path to target %s",
200 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
201 ORTE_NAME_PRINT(&msg->dst));
202 msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
203 ORTE_RML_SEND_COMPLETE(msg);
204 }
205 }
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223 void orte_oob_base_get_addr(char **uri)
224 {
225 char *turi, *final=NULL, *tmp;
226 size_t len = 0;
227 int rc=ORTE_SUCCESS;
228 bool one_added = false;
229 mca_base_component_list_item_t *cli;
230 mca_oob_base_component_t *component;
231 opal_value_t val;
232
233
234 if (ORTE_SUCCESS != (rc = orte_util_convert_process_name_to_string(&final, ORTE_PROC_MY_NAME))) {
235 ORTE_ERROR_LOG(rc);
236 *uri = NULL;
237 return;
238 }
239 len = strlen(final);
240
241
242
243
244 OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
245 component = (mca_oob_base_component_t*)cli->cli_component;
246
247
248
249 if (NULL == component->get_addr) {
250
251 continue;
252 }
253
254
255
256
257
258 turi = component->get_addr();
259 if (NULL != turi) {
260
261 if (0 < orte_oob_base.max_uri_length &&
262 orte_oob_base.max_uri_length < (int)(len + strlen(turi))) {
263
264 continue;
265 }
266
267 opal_asprintf(&tmp, "%s;%s", final, turi);
268 free(turi);
269 free(final);
270 final = tmp;
271 len = strlen(final);
272
273 one_added = true;
274 }
275 }
276
277 if (!one_added) {
278
279 if (NULL != final) {
280 free(final);
281 final = NULL;
282 }
283 }
284
285 *uri = final;
286
287 OBJ_CONSTRUCT(&val, opal_value_t);
288 val.key = OPAL_PMIX_PROC_URI;
289 val.type = OPAL_STRING;
290 val.data.string = final;
291 if (OPAL_SUCCESS != (rc = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
292 ORTE_ERROR_LOG(rc);
293 }
294 val.key = NULL;
295 val.data.string = NULL;
296 OBJ_DESTRUCT(&val);
297 }
298
299 static void process_uri(char *uri)
300 {
301 orte_process_name_t peer;
302 char *cptr;
303 mca_base_component_list_item_t *cli;
304 mca_oob_base_component_t *component;
305 char **uris=NULL;
306 int rc;
307 uint64_t ui64;
308 orte_oob_base_peer_t *pr;
309
310
311 cptr = strchr(uri, ';');
312 if (NULL == cptr) {
313
314
315
316
317 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
318 return;
319 }
320 *cptr = '\0';
321 cptr++;
322
323
324 orte_util_convert_string_to_process_name(&peer, uri);
325
326
327
328
329 if (peer.jobid == ORTE_PROC_MY_NAME->jobid &&
330 peer.vpid == ORTE_PROC_MY_NAME->vpid) {
331 opal_output_verbose(5, orte_oob_base_framework.framework_output,
332 "%s:set_addr peer %s is me",
333 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
334 ORTE_NAME_PRINT(&peer));
335 return;
336 }
337
338
339 uris = opal_argv_split(cptr, ';');
340
341
342 memcpy(&ui64, (char*)&peer, sizeof(uint64_t));
343 if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
344 ui64, (void**)&pr) ||
345 NULL == pr) {
346 pr = OBJ_NEW(orte_oob_base_peer_t);
347 if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
348 ORTE_ERROR_LOG(rc);
349 opal_argv_free(uris);
350 return;
351 }
352 }
353
354
355
356
357
358
359 rc = ORTE_ERR_UNREACH;
360 OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
361 component = (mca_oob_base_component_t*)cli->cli_component;
362 opal_output_verbose(5, orte_oob_base_framework.framework_output,
363 "%s:set_addr checking if peer %s is reachable via component %s",
364 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
365 ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
366 if (NULL != component->set_addr) {
367 if (ORTE_SUCCESS == component->set_addr(&peer, uris)) {
368
369
370
371 opal_output_verbose(5, orte_oob_base_framework.framework_output,
372 "%s: peer %s is reachable via component %s",
373 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
374 ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
375 opal_bitmap_set_bit(&pr->addressable, component->idx);
376 } else {
377 opal_output_verbose(5, orte_oob_base_framework.framework_output,
378 "%s: peer %s is NOT reachable via component %s",
379 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
380 ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
381 }
382 }
383 }
384 opal_argv_free(uris);
385 }
386
387 #if OPAL_ENABLE_FT_CR == 1
388 void orte_oob_base_ft_event(int sd, short argc, void *cbdata)
389 {
390 int rc;
391 mca_base_component_list_item_t *cli;
392 mca_oob_base_component_t *component;
393 orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
394
395 opal_output_verbose(5, orte_oob_base_framework.framework_output,
396 "%s oob:base:ft_event %s(%d)",
397 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
398 orte_job_state_to_str(state->job_state),
399 state->job_state);
400
401
402
403
404 OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
405 component = (mca_oob_base_component_t*)cli->cli_component;
406 if (NULL == component->ft_event) {
407
408 continue;
409 }
410
411 if (ORTE_SUCCESS != (rc = component->ft_event(state->job_state))) {
412 ORTE_ERROR_LOG(rc);
413 }
414 }
415 OBJ_RELEASE(state);
416 }
417
418 #endif