This source file includes the following definitions.
- parser_ini
- parser_separated_columns
- orte_ras_get_appinfo_path
- orte_ras_alps_allocate
- ras_alps_getline
- orte_ras_alps_read_appinfo_file
- orte_ras_alps_finalize
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 #include "orte_config.h"
25 #include "orte/constants.h"
26
27 #include "opal/mca/installdirs/installdirs.h"
28 #include "opal/util/output.h"
29 #include "orte/util/show_help.h"
30 #include "orte/mca/errmgr/errmgr.h"
31 #include "orte/mca/ras/base/ras_private.h"
32 #include "ras_alps.h"
33
34 #include <unistd.h>
35 #include <string.h>
36 #include <ctype.h>
37 #include <errno.h>
38 #include <fcntl.h>
39 #ifdef HAVE_SYS_STAT_H
40 #include <sys/stat.h>
41 #endif
42
43 #include <alps/apInfo.h>
44
/*
 * Signature shared by the ALPS configuration-file parsers in this file.
 *
 * val_if_found  out: set to NULL, or to a newly allocated "<dir>/appinfo"
 *               string when var_name is found in the stream.
 * fp            already-open configuration file.
 * var_name      name of the setting to look for.
 *
 * Returns an ORTE status code.
 */
typedef int (*parser_fn_t)(char **val_if_found, FILE *fp,
                           const char *var_name);

/* One candidate ALPS configuration file and how to read it. */
typedef struct orte_ras_alps_sysconfig_t {
    /* Path of the configuration file to try. */
    char *path;
    /* Name of the setting, within that file, holding the shared directory. */
    char *var_name;
    /* Parser that understands the syntax of that file. */
    parser_fn_t parse;
} orte_ras_alps_sysconfig_t;
56
57
/* Fill 'nodes' from the ALPS appinfo file; wired into the module struct. */
static int orte_ras_alps_allocate(orte_job_t *jdata, opal_list_t *nodes);

/* Module finalize hook; only emits a verbose message. */
static int orte_ras_alps_finalize(void);

/* Read one line from fp into a newly allocated buffer (NULL at EOF/error). */
static char *ras_alps_getline(FILE *fp);

/* Parse the binary appinfo file and append the nodes of reservation *uMe. */
static int orte_ras_alps_read_appinfo_file(opal_list_t *nodes,
                                           char *filename,
                                           unsigned int *uMe);

/* Locate the appinfo file by probing the known ALPS configuration files. */
static char *orte_ras_get_appinfo_path(void);

/* Parser for NAME="value" style configuration files. */
static int parser_ini(char **val_if_found, FILE *fp, const char *var_name);

/* Parser for whitespace-separated "name value" configuration files. */
static int parser_separated_columns(char **val_if_found, FILE *fp,
                                    const char *var_name);
74
75
/*
 * Candidate ALPS configuration files, probed in array order by
 * orte_ras_get_appinfo_path().  Each row gives the file path, the setting
 * that names the shared directory, and the parser for that file's syntax.
 * The table is terminated by an all-NULL sentinel row.
 */
static const orte_ras_alps_sysconfig_t sysconfigs[] = {
    {"/etc/sysconfig/alps",          "ALPS_SHARED_DIR_PATH", parser_ini},
    {"/etc/alps.conf"     ,          "sharedDir"           , parser_separated_columns},
    {"/etc/opt/cray/alps/alps.conf", "sharedDir"           , parser_separated_columns},
    /* must always be the last element */
    {NULL                 ,          NULL                  , NULL}
};
83
84
/*
 * RAS module descriptor exported by this file.  Only the allocate and
 * finalize entry points are provided; the other two slots are left NULL.
 * NOTE(review): the meaning of the NULL slots is defined by
 * orte_ras_base_module_t, which is declared outside this file - confirm there.
 */
orte_ras_base_module_t orte_ras_alps_module = {
    NULL,
    orte_ras_alps_allocate,
    NULL,
    orte_ras_alps_finalize
};
91
92
93 static int
94 parser_ini(char **val_if_found, FILE *fp, const char *var_name)
95 {
96 char *alps_config_str = NULL;
97
98 opal_output_verbose(1, orte_ras_base_framework.framework_output,
99 "ras:alps:allocate: parser_ini");
100
101
102 if (NULL == val_if_found) {
103 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
104 return ORTE_ERR_BAD_PARAM;
105 }
106
107 *val_if_found = NULL;
108
109 while ((alps_config_str = ras_alps_getline(fp))) {
110 char *cpq;
111 char *cpr;
112
113 cpq = strchr(alps_config_str, '#');
114 cpr = strchr(alps_config_str, '=');
115 if (!cpr ||
116 (cpq && cpq < cpr)) {
117 free(alps_config_str);
118 continue;
119 }
120 for (cpr--;
121 (*cpr == ' ' || *cpr == '\t'); cpr--);
122 for (cpq = alps_config_str;
123 (*cpq == ' ' || *cpq == '\t'); cpq++);
124
125 if (strncmp(cpq, var_name, strlen(var_name))) {
126
127 free(alps_config_str);
128 continue;
129 }
130 if (!(cpq = strchr(cpr, '"'))) {
131 free(alps_config_str);
132 ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
133 return ORTE_ERR_FILE_OPEN_FAILURE;
134 }
135 if (!(cpr = strchr(++cpq, '"'))) {
136 free(alps_config_str);
137 ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
138 return ORTE_ERR_FILE_OPEN_FAILURE;
139 }
140 *cpr = '\0';
141 if (strlen(cpq) + 8 > PATH_MAX) {
142 free(alps_config_str);
143 ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
144 return ORTE_ERR_FILE_OPEN_FAILURE;
145 }
146
147 opal_asprintf(val_if_found, "%s/appinfo", cpq);
148 if (NULL == val_if_found) {
149 free(alps_config_str);
150 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
151 return ORTE_ERR_OUT_OF_RESOURCE;
152 }
153 free(alps_config_str);
154 return ORTE_SUCCESS;
155 }
156
157
158
159 return ORTE_SUCCESS;
160 }
161
162
163 static int
164 parser_separated_columns(char **val_if_found, FILE *fp, const char *var_name)
165 {
166 char *alps_config_str = NULL;
167 int var_len = strlen(var_name);
168 int i;
169
170 opal_output_verbose(1, orte_ras_base_framework.framework_output,
171 "ras:alps:allocate: parser_separated_columns");
172
173
174 if (NULL == val_if_found) {
175 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
176 return ORTE_ERR_BAD_PARAM;
177 }
178
179 *val_if_found = NULL;
180
181 while ((alps_config_str = ras_alps_getline(fp))) {
182 char *cpq = alps_config_str;
183 char *cpr;
184
185
186 while (' ' == *cpq || '\t' == *cpq) {
187 cpq++;
188 }
189
190
191 if ('#' == *cpq || strncmp(cpq, var_name, var_len)) {
192 free(alps_config_str);
193 continue;
194 }
195
196 for (i = 0; i < var_len && '\0' != *cpq; ++i, ++cpq);
197
198 while (' ' == *cpq || '\t' == *cpq) {
199 cpq++;
200 }
201
202 cpr = cpq;
203 while ('\0' != *cpr && (' ' != *cpr || '\t' != *cpr)) {
204 cpr++;
205 }
206 *cpr = '\0';
207
208 if (strlen(cpq) + 8 > PATH_MAX) {
209 free(alps_config_str);
210 ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
211 return ORTE_ERR_FILE_OPEN_FAILURE;
212 }
213
214 opal_asprintf(val_if_found, "%s/appinfo", cpq);
215 if (NULL == val_if_found) {
216 free(alps_config_str);
217 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
218 return ORTE_ERR_OUT_OF_RESOURCE;
219 }
220 free(alps_config_str);
221 return ORTE_SUCCESS;
222 }
223
224
225
226 return ORTE_SUCCESS;
227 }
228
229
230
231
232
233
234
235 static char *
236 orte_ras_get_appinfo_path(void)
237 {
238 int i, rc = ORTE_ERROR;
239 FILE *fp = NULL;
240 char *appinfo_path = NULL;
241
242
243
244
245 for (i = 0; NULL != sysconfigs[i].path; ++i) {
246 opal_output_verbose(1, orte_ras_base_framework.framework_output,
247 "ras:alps:allocate: Trying ALPS configuration "
248 "file: \"%s\"",
249 sysconfigs[i].path);
250 if (NULL == (fp = fopen(sysconfigs[i].path, "r"))) {
251 int err = errno;
252 opal_output_verbose(1, orte_ras_base_framework.framework_output,
253 "ras:alps:allocate: Skipping ALPS "
254 "configuration file: \"%s\" (%s).",
255 sysconfigs[i].path, strerror(err));
256 continue;
257 }
258
259 rc = sysconfigs[i].parse(&appinfo_path, fp, sysconfigs[i].var_name);
260
261 fclose(fp);
262
263 if (ORTE_SUCCESS == rc) {
264
265 if (NULL != appinfo_path) {
266 break;
267 }
268
269 else {
270 continue;
271 }
272 }
273
274 else {
275 opal_output_verbose(1, orte_ras_base_framework.framework_output,
276 "ras:alps:allocate: failure "
277 "(get_appinfo_dir_path = %d)", rc);
278 return NULL;
279 }
280 }
281
282 if (NULL != sysconfigs[i].path) {
283 opal_output_verbose(1, orte_ras_base_framework.framework_output,
284 "ras:alps:allocate: Located ALPS scheduler file: "
285 "\"%s\"", appinfo_path);
286 return appinfo_path;
287 }
288
289 else {
290 opal_output_verbose(1, orte_ras_base_framework.framework_output,
291 "ras:alps:allocate: Could not locate ALPS "
292 "scheduler file.");
293 return NULL;
294 }
295
296
297 return NULL;
298 }
299
300
301
302
303
304 static int
305 orte_ras_alps_allocate(orte_job_t *jdata, opal_list_t *nodes)
306 {
307 int ret;
308 char *appinfo_path = NULL;
309
310 if (0 == orte_ras_alps_res_id) {
311 orte_show_help("help-ras-alps.txt", "alps-env-var-not-found", 1);
312 return ORTE_ERR_NOT_FOUND;
313 }
314 if (NULL == (appinfo_path = orte_ras_get_appinfo_path())) {
315 return ORTE_ERR_NOT_FOUND;
316 }
317
318 if (ORTE_SUCCESS != (ret = orte_ras_alps_read_appinfo_file(
319 nodes,
320 appinfo_path,
321 (unsigned int *)&orte_ras_alps_res_id))) {
322 ORTE_ERROR_LOG(ret);
323 goto cleanup;
324 }
325
326
327 orte_num_allocated_nodes = opal_list_get_size(nodes);
328
329 cleanup:
330
331 if (NULL != appinfo_path) {
332 free(appinfo_path);
333 }
334 if (ORTE_SUCCESS == ret) {
335 opal_output_verbose(1, orte_ras_base_framework.framework_output,
336 "ras:alps:allocate: success");
337 }
338 else {
339 opal_output_verbose(1, orte_ras_base_framework.framework_output,
340 "ras:alps:allocate: failure "
341 "(base_allocate_nodes = %d)", ret);
342 }
343 return ret;
344 }
345
346 #define RAS_BASE_FILE_MAX_LINE_LENGTH (PATH_MAX * 2)
347
348 static char *
349 ras_alps_getline(FILE *fp)
350 {
351 char *ret = NULL, *input = NULL;
352
353 input = (char *)calloc(RAS_BASE_FILE_MAX_LINE_LENGTH + 1, sizeof(char));
354
355 if (NULL == input) {
356 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
357 return NULL;
358 }
359 ret = fgets(input, RAS_BASE_FILE_MAX_LINE_LENGTH, fp);
360 if (NULL != ret) {
361 input[strlen(input) - 1] = '\0';
362 return input;
363 }
364
365 return NULL;
366 }
367
/*
 * Select the per-node placement record type matching the ALPS appinfo
 * format version: placeNodeList_t for versions 1 and 2, placeNodeList_ver3_t
 * otherwise.  The alias is only used by the reader below when
 * ALPS_APPINFO_VERSION is non-zero.
 */
#if ALPS_APPINFO_VERSION > 0 && ALPS_APPINFO_VERSION < 3
typedef placeNodeList_t orte_ras_alps_placeNodeList_t;
#else
typedef placeNodeList_ver3_t orte_ras_alps_placeNodeList_t;
#endif
373
374 static int
375 orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename,
376 unsigned int *uMe)
377 {
378 int iq;
379 int ix;
380 int iFd;
381 int iTrips;
382 int max_appinfo_read_attempts;
383 struct stat ssBuf;
384 size_t szLen;
385 off_t oNow;
386 off_t oInfo=sizeof(appInfoHdr_t);
387 off_t oDet=sizeof(appInfo_t);
388 off_t oSlots;
389 off_t oEntry;
390 int32_t sNodes=0;
391 char *cpBuf;
392 char *hostname;
393 orte_node_t *node = NULL;
394 appInfoHdr_t *apHdr;
395 appInfo_t *apInfo;
396 #if ALPS_APPINFO_VERSION==0
397 placeList_t *apSlots;
398 #else
399 orte_ras_alps_placeNodeList_t *apNodes;
400 #endif
401
402 orte_ras_alps_get_appinfo_attempts(&max_appinfo_read_attempts);
403 oNow=0;
404 iTrips=0;
405 opal_output_verbose(1, orte_ras_base_framework.framework_output,
406 "ras:alps:allocate: begin processing appinfo file");
407
408 while(!oNow) {
409 iTrips++;
410
411 iFd=open( filename, O_RDONLY );
412 if( iFd==-1 ) {
413 opal_output_verbose(1, orte_ras_base_framework.framework_output,
414 "ras:alps:allocate: ALPS information open failure");
415 usleep(iTrips*50000);
416
417
418 if( iTrips <= max_appinfo_read_attempts ) continue;
419 ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
420 return ORTE_ERR_FILE_OPEN_FAILURE;
421 }
422 if( fstat( iFd, &ssBuf )==-1 ) {
423
424 ORTE_ERROR_LOG(ORTE_ERR_NOT_AVAILABLE);
425 return ORTE_ERR_NOT_AVAILABLE;
426 }
427
428 szLen=ssBuf.st_size;
429 cpBuf=malloc(szLen+1);
430 if (NULL == cpBuf) {
431 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
432 return ORTE_ERR_OUT_OF_RESOURCE;
433 }
434
435
436
437 if( (oNow=read( iFd, cpBuf, szLen ))!=(off_t)szLen ) {
438
439
440 opal_output_verbose(1, orte_ras_base_framework.framework_output,
441 "ras:alps:allocate: ALPS information read failure: %ld bytes", (long int)oNow);
442
443 free(cpBuf);
444 close(iFd);
445 oNow=0;
446 usleep(iTrips*50000);
447
448
449 if( iTrips<=max_appinfo_read_attempts ) continue;
450 ORTE_ERROR_LOG(ORTE_ERR_FILE_READ_FAILURE);
451 return ORTE_ERR_FILE_READ_FAILURE;
452 }
453 }
454 close(iFd);
455 opal_output_verbose(1, orte_ras_base_framework.framework_output,
456 "ras:alps:allocate: file %s read", filename);
457
458
459
460 oNow=0;
461 apHdr=(appInfoHdr_t *)cpBuf;
462
463 opal_output_verbose(1, orte_ras_base_framework.framework_output,
464 "ras:alps:allocate: %d entries in file", apHdr->apNum);
465
466
467
468
469
470 for( iq=0; iq<apHdr->apNum; iq++ ) {
471
472
473
474
475
476
477
478 apInfo=(appInfo_t *)(cpBuf+oNow+oInfo);
479
480
481 oSlots=sizeof(cmdDetail_t)*apInfo->numCmds;
482
483 opal_output_verbose(1, orte_ras_base_framework.framework_output,
484 "ras:alps:allocate: read data for resId %u - myId %u",
485 apInfo->resId, *uMe);
486
487
488 #if ALPS_APPINFO_VERSION==0
489
490
491
492
493
494
495 apSlots=(placeList_t *)(cpBuf+oNow+oInfo+oDet+oSlots);
496 oEntry=sizeof(placeList_t)*apInfo->numPlaces;
497
498 oNow+=(oDet+oSlots+oEntry);
499
500 if( apInfo->resId != *uMe ) continue;
501
502
503
504
505
506 for( ix=0; ix<apInfo->numPlaces; ix++ ) {
507
508 opal_output_verbose(5, orte_ras_base_framework.framework_output,
509 "ras:alps:read_appinfo: got NID %d", apSlots[ix].nid);
510
511 opal_asprintf( &hostname, "%d", apSlots[ix].nid );
512 if (NULL == hostname) {
513 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
514 return ORTE_ERR_OUT_OF_RESOURCE;
515 }
516
517
518 if( NULL!=node && !strcmp(node->name, hostname) ) {
519
520 free(hostname);
521 ++node->slots;
522 } else {
523
524 opal_output_verbose(1, orte_ras_base_framework.framework_output,
525 "ras:alps:read_appinfo: added NID %d to list", apSlots[ix].nid);
526
527 node = OBJ_NEW(orte_node_t);
528 node->name = hostname;
529 orte_set_attribute(&node->attributes, ORTE_NODE_LAUNCH_ID, ORTE_ATTR_LOCAL, &apSlots[ix].nid, OPAL_INT32);
530 node->slots_inuse = 0;
531 node->slots_max = 0;
532 node->slots = 1;
533 node->state = ORTE_NODE_STATE_UP;
534
535
536
537
538 opal_list_append(nodes, &node->super);
539 sNodes++;
540 }
541 }
542 #else
543
544
545
546
547 apNodes=(orte_ras_alps_placeNodeList_t *)(cpBuf+oNow+oInfo+oDet+oSlots);
548 oEntry=sizeof(orte_ras_alps_placeNodeList_t)*apInfo->numPlaces;
549
550 oNow+=(oDet+oSlots+oEntry);
551
552 if( apInfo->resId != *uMe ) continue;
553
554 for( ix=0; ix<apInfo->numPlaces; ix++ ) {
555 opal_output_verbose(5, orte_ras_base_framework.framework_output,
556 "ras:alps:read_appinfo(modern): processing NID %d with %d slots",
557 apNodes[ix].nid, apNodes[ix].numPEs);
558 opal_asprintf( &hostname, "%d", apNodes[ix].nid );
559 if (NULL == hostname) {
560 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
561 return ORTE_ERR_OUT_OF_RESOURCE;
562 }
563
564 node = OBJ_NEW(orte_node_t);
565 node->name = hostname;
566 orte_set_attribute(&node->attributes, ORTE_NODE_LAUNCH_ID, ORTE_ATTR_LOCAL, &apNodes[ix].nid, OPAL_INT32);
567 node->slots_inuse = 0;
568 node->slots_max = 0;
569 if (opal_hwloc_use_hwthreads_as_cpus) {
570 node->slots = apNodes[ix].cpuCnt;
571 } else {
572 node->slots = apNodes[ix].numPEs;
573 }
574 node->state = ORTE_NODE_STATE_UP;
575
576
577
578
579 opal_list_append(nodes, &node->super);
580 sNodes++;
581 }
582 #endif
583 break;
584 }
585
586 free(cpBuf);
587
588 return ORTE_SUCCESS;
589 }
590
591
592 static int
593 orte_ras_alps_finalize(void)
594 {
595 opal_output_verbose(1, orte_ras_base_framework.framework_output,
596 "ras:alps:finalize: success (nothing to do)");
597 return ORTE_SUCCESS;
598 }