root/opal/mca/hwloc/hwloc201/hwloc/hwloc/topology-linux.c


DEFINITIONS

This source file includes the following definitions.
  1. hwloc_mbind
  2. hwloc_set_mempolicy
  3. hwloc_get_mempolicy
  4. hwloc_migrate_pages
  5. hwloc_move_pages
  6. hwloc_checkat
  7. hwloc_openat
  8. hwloc_fopenat
  9. hwloc_accessat
  10. hwloc_fstatat
  11. hwloc_opendirat
  12. hwloc_readlinkat
  13. hwloc_open
  14. hwloc_fopen
  15. hwloc_access
  16. hwloc_stat
  17. hwloc_lstat
  18. hwloc_opendir
  19. hwloc_readlink
  20. hwloc_read_path_by_length
  21. hwloc_read_path_as_int
  22. hwloc_read_path_as_uint
  23. hwloc__read_fd
  24. hwloc__read_fd_as_cpumask
  25. hwloc__read_path_as_cpumask
  26. hwloc__alloc_read_path_as_cpumask
  27. hwloc_linux_read_path_as_cpumask
  28. hwloc__read_fd_as_cpulist
  29. hwloc_linux_set_tid_cpubind
  30. hwloc_linux_find_kernel_nr_cpus
  31. hwloc_linux_get_tid_cpubind
  32. hwloc_linux_get_proc_tids
  33. hwloc_linux_foreach_proc_tid
  34. hwloc_linux_foreach_proc_tid_set_cpubind_cb
  35. hwloc_linux_set_pid_cpubind
  36. hwloc_linux_foreach_proc_tid_get_cpubind_cb
  37. hwloc_linux_get_pid_cpubind
  38. hwloc_linux_set_proc_cpubind
  39. hwloc_linux_get_proc_cpubind
  40. hwloc_linux_set_thisproc_cpubind
  41. hwloc_linux_get_thisproc_cpubind
  42. hwloc_linux_set_thisthread_cpubind
  43. hwloc_linux_get_thisthread_cpubind
  44. hwloc_linux_set_thread_cpubind
  45. hwloc_linux_get_thread_cpubind
  46. hwloc_linux_get_tid_last_cpu_location
  47. hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb
  48. hwloc_linux_get_pid_last_cpu_location
  49. hwloc_linux_get_proc_last_cpu_location
  50. hwloc_linux_get_thisproc_last_cpu_location
  51. hwloc_linux_get_thisthread_last_cpu_location
  52. hwloc_linux_membind_policy_from_hwloc
  53. hwloc_linux_membind_mask_from_nodeset
  54. hwloc_linux_membind_mask_to_nodeset
  55. hwloc_linux_set_area_membind
  56. hwloc_linux_alloc_membind
  57. hwloc_linux_set_thisthread_membind
  58. hwloc_linux_find_kernel_max_numnodes
  59. hwloc_linux_membind_policy_to_hwloc
  60. hwloc_linux_mask_is_empty
  61. hwloc_linux_get_thisthread_membind
  62. hwloc_linux_get_area_membind
  63. hwloc_linux_get_area_memlocation
  64. hwloc_linux_get_allowed_resources_hook
  65. hwloc_set_linuxfs_hooks
  66. hwloc_find_linux_cpuset_mntpnt
  67. hwloc_read_linux_cpuset_name
  68. hwloc_admin_disable_set_from_cpuset
  69. hwloc_parse_meminfo_info
  70. hwloc_parse_hugepages_info
  71. hwloc_get_procfs_meminfo_info
  72. hwloc_sysfs_node_meminfo_info
  73. hwloc_parse_nodes_distances
  74. hwloc__get_dmi_id_one_info
  75. hwloc__get_dmi_id_info
  76. hwloc_read_raw
  77. hwloc_read_str
  78. hwloc_read_unit32be
  79. add_device_tree_cpus_node
  80. look_powerpc_device_tree_discover_cache
  81. try__add_cache_from_device_tree_cpu
  82. try_add_cache_from_device_tree_cpu
  83. look_powerpc_device_tree
  84. hwloc_linux_try_handle_knl_hwdata_properties
  85. list_sysfsnode
  86. look_sysfsnode
  87. look_sysfscpu
  88. hwloc_linux_parse_cpuinfo_x86
  89. hwloc_linux_parse_cpuinfo_ia64
  90. hwloc_linux_parse_cpuinfo_arm
  91. hwloc_linux_parse_cpuinfo_ppc
  92. hwloc_linux_parse_cpuinfo_generic
  93. hwloc_linux_parse_cpuinfo
  94. hwloc_linux_free_cpuinfo
  95. look_cpuinfo
  96. hwloc__linux_get_mic_sn
  97. hwloc_gather_system_info
  98. hwloc_linux_try_hardwired_cpuinfo
  99. hwloc_linux__get_allowed_resources
  100. hwloc_linux_fallback_pu_level
  101. hwloc_look_linuxfs
  102. hwloc_linux_backend_get_pci_busid_cpuset
  103. hwloc_linux_backend_disable
  104. hwloc_linux_component_instantiate
  105. hwloc_linuxfs_find_osdev_parent
  106. hwloc_linux_add_os_device
  107. hwloc_linuxfs_block_class_fillinfos
  108. hwloc_linuxfs_lookup_block_class
  109. hwloc_linuxfs_net_class_fillinfos
  110. hwloc_linuxfs_lookup_net_class
  111. hwloc_linuxfs_infiniband_class_fillinfos
  112. hwloc_linuxfs_lookup_infiniband_class
  113. hwloc_linuxfs_mic_class_fillinfos
  114. hwloc_linuxfs_lookup_mic_class
  115. hwloc_linuxfs_lookup_drm_class
  116. hwloc_linuxfs_lookup_dma_class
  117. check_dmi_entry
  118. hwloc__get_firmware_dmi_memory_info_one
  119. hwloc__get_firmware_dmi_memory_info
  120. hwloc_linuxfs_pci_look_pcidevices
  121. hwloc_linuxfs_pci_find_pcislot_obj
  122. hwloc_linuxfs_pci_look_pcislots
  123. hwloc_look_linuxfs_io
  124. hwloc_linuxio_component_instantiate

   1 /*
   2  * Copyright © 2009 CNRS
   3  * Copyright © 2009-2018 Inria.  All rights reserved.
   4  * Copyright © 2009-2013, 2015 Université Bordeaux
   5  * Copyright © 2009-2014 Cisco Systems, Inc.  All rights reserved.
   6  * Copyright © 2015 Intel, Inc.  All rights reserved.
   7  * Copyright © 2010 IBM
   8  * See COPYING in top-level directory.
   9  */
  10 
  11 #include <private/autogen/config.h>
  12 #include <hwloc.h>
  13 #include <hwloc/linux.h>
  14 #include <private/misc.h>
  15 #include <private/private.h>
  16 #include <private/misc.h>
  17 #include <private/debug.h>
  18 
  19 #include <limits.h>
  20 #include <stdio.h>
  21 #include <fcntl.h>
  22 #include <errno.h>
  23 #include <assert.h>
  24 #ifdef HAVE_DIRENT_H
  25 #include <dirent.h>
  26 #endif
  27 #ifdef HAVE_UNISTD_H
  28 #include <unistd.h>
  29 #endif
  30 #ifdef HWLOC_HAVE_LIBUDEV
  31 #include <libudev.h>
  32 #endif
  33 #include <sys/types.h>
  34 #include <sys/stat.h>
  35 #include <sched.h>
  36 #include <pthread.h>
  37 #include <sys/mman.h>
  38 #include <sys/syscall.h>
  39 #include <mntent.h>
  40 
  41 struct hwloc_linux_backend_data_s {
  42   char *root_path; /* NULL if unused */
  43   int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
  44   int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
  45 #ifdef HWLOC_HAVE_LIBUDEV
  46   struct udev *udev; /* Global udev context */
  47 #endif
  48   char *dumped_hwdata_dirname;
  49   enum {
  50     HWLOC_LINUX_ARCH_X86, /* x86 32 or 64bits, including k1om (KNC) */
  51     HWLOC_LINUX_ARCH_IA64,
  52     HWLOC_LINUX_ARCH_ARM,
  53     HWLOC_LINUX_ARCH_POWER,
  54     HWLOC_LINUX_ARCH_UNKNOWN
  55   } arch;
  56   int is_knl;
  57   int is_amd_with_CU;
  58   struct utsname utsname; /* fields contain \0 when unknown */
  59   int fallback_nbprocessors; /* only used in hwloc_linux_fallback_pu_level(), may be <= 0 (error) earlier */
  60   unsigned pagesize;
  61 };
  62 
  63 
  64 
  65 /***************************
  66  * Misc Abstraction layers *
  67  ***************************/
  68 
  69 #include <linux/unistd.h>
  70 
  71 #if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE_SYSCALL)
  72 /* libc doesn't have support for sched_setaffinity, so make the system call
  73  * ourselves: */
  74 #    ifndef __NR_sched_setaffinity
  75 #       ifdef __i386__
  76 #         define __NR_sched_setaffinity 241
  77 #       elif defined(__x86_64__)
  78 #         define __NR_sched_setaffinity 203
  79 #       elif defined(__ia64__)
  80 #         define __NR_sched_setaffinity 1231
  81 #       elif defined(__hppa__)
  82 #         define __NR_sched_setaffinity 211
  83 #       elif defined(__alpha__)
  84 #         define __NR_sched_setaffinity 395
  85 #       elif defined(__s390__)
  86 #         define __NR_sched_setaffinity 239
  87 #       elif defined(__sparc__)
  88 #         define __NR_sched_setaffinity 261
  89 #       elif defined(__m68k__)
  90 #         define __NR_sched_setaffinity 311
  91 #       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
  92 #         define __NR_sched_setaffinity 222
  93 #       elif defined(__arm__)
  94 #         define __NR_sched_setaffinity 241
  95 #       elif defined(__cris__)
  96 #         define __NR_sched_setaffinity 241
  97 /*#       elif defined(__mips__)
  98   #         define __NR_sched_setaffinity TODO (32/64/nabi) */
  99 #       else
 100 #         warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
 101 #         define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
 102 #       endif
 103 #    endif
 104 #    ifndef sched_setaffinity
 105 #      define sched_setaffinity(pid, lg, mask) syscall(__NR_sched_setaffinity, pid, lg, mask)
 106 #    endif
 107 #    ifndef __NR_sched_getaffinity
 108 #       ifdef __i386__
 109 #         define __NR_sched_getaffinity 242
 110 #       elif defined(__x86_64__)
 111 #         define __NR_sched_getaffinity 204
 112 #       elif defined(__ia64__)
 113 #         define __NR_sched_getaffinity 1232
 114 #       elif defined(__hppa__)
 115 #         define __NR_sched_getaffinity 212
 116 #       elif defined(__alpha__)
 117 #         define __NR_sched_getaffinity 396
 118 #       elif defined(__s390__)
 119 #         define __NR_sched_getaffinity 240
 120 #       elif defined(__sparc__)
 121 #         define __NR_sched_getaffinity 260
 122 #       elif defined(__m68k__)
 123 #         define __NR_sched_getaffinity 312
 124 #       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
 125 #         define __NR_sched_getaffinity 223
 126 #       elif defined(__arm__)
 127 #         define __NR_sched_getaffinity 242
 128 #       elif defined(__cris__)
 129 #         define __NR_sched_getaffinity 242
 130 /*#       elif defined(__mips__)
 131   #         define __NR_sched_getaffinity TODO (32/64/nabi) */
 132 #       else
 133 #         warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
 134 #         define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
 135 #       endif
 136 #    endif
 137 #    ifndef sched_getaffinity
 138 #      define sched_getaffinity(pid, lg, mask) (syscall(__NR_sched_getaffinity, pid, lg, mask) < 0 ? -1 : 0)
 139 #    endif
 140 #endif
 141 
 142 /* NUMA syscalls are only wrapped by libnuma, but libnuma devel headers aren't widely installed.
 143  * Just redefine these syscalls ourselves to avoid requiring libnuma devel headers for a few missing wrappers.
 144  * __NR_foo should be defined in headers on all modern platforms.
 145  * Just redefine the basic ones on important platforms when they are not too hard to detect/define.
 146  */
 147 
 148 #ifndef MPOL_DEFAULT
 149 # define MPOL_DEFAULT 0
 150 #endif
 151 #ifndef MPOL_PREFERRED
 152 # define MPOL_PREFERRED 1
 153 #endif
 154 #ifndef MPOL_BIND
 155 # define MPOL_BIND 2
 156 #endif
 157 #ifndef MPOL_INTERLEAVE
 158 # define MPOL_INTERLEAVE 3
 159 #endif
 160 #ifndef MPOL_LOCAL
 161 # define MPOL_LOCAL 4
 162 #endif
 163 #ifndef MPOL_F_ADDR
 164 # define  MPOL_F_ADDR (1<<1)
 165 #endif
 166 #ifndef MPOL_MF_STRICT
 167 # define MPOL_MF_STRICT (1<<0)
 168 #endif
 169 #ifndef MPOL_MF_MOVE
 170 # define MPOL_MF_MOVE (1<<1)
 171 #endif
 172 
 173 #ifndef __NR_mbind
 174 # ifdef __i386__
 175 #  define __NR_mbind 274
 176 # elif defined(__x86_64__)
 177 #  define __NR_mbind 237
 178 # elif defined(__ia64__)
 179 #  define __NR_mbind 1259
 180 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
 181 #  define __NR_mbind 259
 182 # elif defined(__sparc__)
 183 #  define __NR_mbind 353
 184 # endif
 185 #endif
 186 static __hwloc_inline long hwloc_mbind(void *addr __hwloc_attribute_unused,
 187                                        unsigned long len __hwloc_attribute_unused,
 188                                        int mode __hwloc_attribute_unused,
 189                                        const unsigned long *nodemask __hwloc_attribute_unused,
 190                                        unsigned long maxnode __hwloc_attribute_unused,
 191                                        unsigned flags __hwloc_attribute_unused)
 192 {
 193 #if (defined __NR_mbind) && (defined HWLOC_HAVE_SYSCALL)
 194   return syscall(__NR_mbind, (long) addr, len, mode, (long)nodemask, maxnode, flags);
 195 #else
 196   errno = ENOSYS;
 197   return -1;
 198 #endif
 199 }
 200 
 201 #ifndef __NR_set_mempolicy
 202 # ifdef __i386__
 203 #  define __NR_set_mempolicy 276
 204 # elif defined(__x86_64__)
 205 #  define __NR_set_mempolicy 239
 206 # elif defined(__ia64__)
 207 #  define __NR_set_mempolicy 1261
 208 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
 209 #  define __NR_set_mempolicy 261
 210 # elif defined(__sparc__)
 211 #  define __NR_set_mempolicy 305
 212 # endif
 213 #endif
 214 static __hwloc_inline long hwloc_set_mempolicy(int mode __hwloc_attribute_unused,
 215                                                const unsigned long *nodemask __hwloc_attribute_unused,
 216                                                unsigned long maxnode __hwloc_attribute_unused)
 217 {
 218 #if (defined __NR_set_mempolicy) && (defined HWLOC_HAVE_SYSCALL)
 219   return syscall(__NR_set_mempolicy, mode, nodemask, maxnode);
 220 #else
 221   errno = ENOSYS;
 222   return -1;
 223 #endif
 224 }
 225 
 226 #ifndef __NR_get_mempolicy
 227 # ifdef __i386__
 228 #  define __NR_get_mempolicy 275
 229 # elif defined(__x86_64__)
 230 #  define __NR_get_mempolicy 238
 231 # elif defined(__ia64__)
 232 #  define __NR_get_mempolicy 1260
 233 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
 234 #  define __NR_get_mempolicy 260
 235 # elif defined(__sparc__)
 236 #  define __NR_get_mempolicy 304
 237 # endif
 238 #endif
 239 static __hwloc_inline long hwloc_get_mempolicy(int *mode __hwloc_attribute_unused,
 240                                                const unsigned long *nodemask __hwloc_attribute_unused,
 241                                                unsigned long maxnode __hwloc_attribute_unused,
 242                                                void *addr __hwloc_attribute_unused,
 243                                                int flags __hwloc_attribute_unused)
 244 {
 245 #if (defined __NR_get_mempolicy) && (defined HWLOC_HAVE_SYSCALL)
 246   return syscall(__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags);
 247 #else
 248   errno = ENOSYS;
 249   return -1;
 250 #endif
 251 }
 252 
 253 #ifndef __NR_migrate_pages
 254 # ifdef __i386__
 255 #  define __NR_migrate_pages 204
 256 # elif defined(__x86_64__)
 257 #  define __NR_migrate_pages 256
 258 # elif defined(__ia64__)
 259 #  define __NR_migrate_pages 1280
 260 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
 261 #  define __NR_migrate_pages 258
 262 # elif defined(__sparc__)
 263 #  define __NR_migrate_pages 302
 264 # endif
 265 #endif
 266 static __hwloc_inline long hwloc_migrate_pages(int pid __hwloc_attribute_unused,
 267                                                unsigned long maxnode __hwloc_attribute_unused,
 268                                                const unsigned long *oldnodes __hwloc_attribute_unused,
 269                                                const unsigned long *newnodes __hwloc_attribute_unused)
 270 {
 271 #if (defined __NR_migrate_pages) && (defined HWLOC_HAVE_SYSCALL)
 272   return syscall(__NR_migrate_pages, pid, maxnode, oldnodes, newnodes);
 273 #else
 274   errno = ENOSYS;
 275   return -1;
 276 #endif
 277 }
 278 
 279 #ifndef __NR_move_pages
 280 # ifdef __i386__
 281 #  define __NR_move_pages 317
 282 # elif defined(__x86_64__)
 283 #  define __NR_move_pages 279
 284 # elif defined(__ia64__)
 285 #  define __NR_move_pages 1276
 286 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
 287 #  define __NR_move_pages 301
 288 # elif defined(__sparc__)
 289 #  define __NR_move_pages 307
 290 # endif
 291 #endif
 292 static __hwloc_inline long hwloc_move_pages(int pid __hwloc_attribute_unused,
 293                                             unsigned long count __hwloc_attribute_unused,
 294                                             void **pages __hwloc_attribute_unused,
 295                                             const int *nodes __hwloc_attribute_unused,
 296                                             int *status __hwloc_attribute_unused,
 297                                             int flags __hwloc_attribute_unused)
 298 {
 299 #if (defined __NR_move_pages) && (defined HWLOC_HAVE_SYSCALL)
 300   return syscall(__NR_move_pages, pid, count, pages, nodes, status, flags);
 301 #else
 302   errno = ENOSYS;
 303   return -1;
 304 #endif
 305 }
 306 
 307 
 308 /* Added for ntohl() */
 309 #include <arpa/inet.h>
 310 
 311 #ifdef HAVE_OPENAT
 312 /* Use our own filesystem functions if we have openat */
 313 
 314 static const char *
 315 hwloc_checkat(const char *path, int fsroot_fd)
 316 {
 317   const char *relative_path;
 318   if (fsroot_fd < 0) {
 319     errno = EBADF;
 320     return NULL;
 321   }
 322 
 323   /* Skip leading slashes.  */
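       /* (openat() and friends ignore their dirfd argument when given an absolute path,
        * which would bypass the custom fsroot, hence the relative path) */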
 324   for (relative_path = path; *relative_path == '/'; relative_path++);
 325 
 326   return relative_path;
 327 }
 328 
 329 static int
 330 hwloc_openat(const char *path, int fsroot_fd)
 331 {
 332   const char *relative_path;
 333 
 334   relative_path = hwloc_checkat(path, fsroot_fd);
 335   if (!relative_path)
 336     return -1;
 337 
 338   return openat (fsroot_fd, relative_path, O_RDONLY);
 339 }
 340 
 341 static FILE *
 342 hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
 343 {
 344   int fd;
 345 
 346   if (strcmp(mode, "r")) {
 347     errno = ENOTSUP;
 348     return NULL;
 349   }
 350 
 351   fd = hwloc_openat (path, fsroot_fd);
 352   if (fd == -1)
 353     return NULL;
 354 
 355   return fdopen(fd, mode);
 356 }
 357 
 358 static int
 359 hwloc_accessat(const char *path, int mode, int fsroot_fd)
 360 {
 361   const char *relative_path;
 362 
 363   relative_path = hwloc_checkat(path, fsroot_fd);
 364   if (!relative_path)
 365     return -1;
 366 
 367   return faccessat(fsroot_fd, relative_path, mode, 0);
 368 }
 369 
 370 static int
 371 hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
 372 {
 373   const char *relative_path;
 374 
 375   relative_path = hwloc_checkat(path, fsroot_fd);
 376   if (!relative_path)
 377     return -1;
 378 
 379   return fstatat(fsroot_fd, relative_path, st, flags);
 380 }
 381 
 382 static DIR*
 383 hwloc_opendirat(const char *path, int fsroot_fd)
 384 {
 385   int dir_fd;
 386   const char *relative_path;
 387 
 388   relative_path = hwloc_checkat(path, fsroot_fd);
 389   if (!relative_path)
 390     return NULL;
 391 
 392   dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
 393   if (dir_fd < 0)
 394     return NULL;
 395 
 396   return fdopendir(dir_fd);
 397 }
 398 
 399 static int
 400 hwloc_readlinkat(const char *path, char *buf, size_t buflen, int fsroot_fd)
 401 {
 402   const char *relative_path;
 403 
 404   relative_path = hwloc_checkat(path, fsroot_fd);
 405   if (!relative_path)
 406     return -1;
 407 
 408   return readlinkat(fsroot_fd, relative_path, buf, buflen);
 409 }
 410 
 411 #endif /* HAVE_OPENAT */
 412 
 413 /* Static inline version of fopen so that we can use openat if we have
 414    it, but still preserve compiler parameter checking */
 415 static __hwloc_inline int
 416 hwloc_open(const char *p, int d __hwloc_attribute_unused)
 417 {
 418 #ifdef HAVE_OPENAT
 419     return hwloc_openat(p, d);
 420 #else
 421     return open(p, O_RDONLY);
 422 #endif
 423 }
 424 
 425 static __hwloc_inline FILE *
 426 hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
 427 {
 428 #ifdef HAVE_OPENAT
 429     return hwloc_fopenat(p, m, d);
 430 #else
 431     return fopen(p, m);
 432 #endif
 433 }
 434 
 435 /* Static inline version of access so that we can use openat if we have
 436    it, but still preserve compiler parameter checking */
 437 static __hwloc_inline int
 438 hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
 439 {
 440 #ifdef HAVE_OPENAT
 441     return hwloc_accessat(p, m, d);
 442 #else
 443     return access(p, m);
 444 #endif
 445 }
 446 
 447 static __hwloc_inline int
 448 hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
 449 {
 450 #ifdef HAVE_OPENAT
 451     return hwloc_fstatat(p, st, 0, d);
 452 #else
 453     return stat(p, st);
 454 #endif
 455 }
 456 
 457 static __hwloc_inline int
 458 hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
 459 {
 460 #ifdef HAVE_OPENAT
 461     return hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
 462 #else
 463     return lstat(p, st);
 464 #endif
 465 }
 466 
 467 /* Static inline version of opendir so that we can use openat if we have
 468    it, but still preserve compiler parameter checking */
 469 static __hwloc_inline DIR *
 470 hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
 471 {
 472 #ifdef HAVE_OPENAT
 473     return hwloc_opendirat(p, d);
 474 #else
 475     return opendir(p);
 476 #endif
 477 }
 478 
 479 static __hwloc_inline int
 480 hwloc_readlink(const char *p, char *l, size_t ll, int d __hwloc_attribute_unused)
 481 {
 482 #ifdef HAVE_OPENAT
 483   return hwloc_readlinkat(p, l, ll, d);
 484 #else
 485   return readlink(p, l, ll);
 486 #endif
 487 }
 488 
 489 
 490 /*****************************************
 491  ******* Helpers for reading files *******
 492  *****************************************/
 493 
 494 static __hwloc_inline int
 495 hwloc_read_path_by_length(const char *path, char *string, size_t length, int fsroot_fd)
 496 {
 497   int fd, ret;
 498 
 499   fd = hwloc_open(path, fsroot_fd);
 500   if (fd < 0)
 501     return -1;
 502 
 503   ret = read(fd, string, length-1); /* read length-1 to leave room for the ending \0 */
 504   close(fd);
 505 
 506   if (ret <= 0)
 507     return -1;
 508 
 509   string[ret] = 0;
 510 
 511   return 0;
 512 }
 513 
 514 static __hwloc_inline int
 515 hwloc_read_path_as_int(const char *path, int *value, int fsroot_fd)
 516 {
 517   char string[11];
 518   if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
 519     return -1;
 520   *value = atoi(string);
 521   return 0;
 522 }
 523 
 524 static __hwloc_inline int
 525 hwloc_read_path_as_uint(const char *path, unsigned *value, int fsroot_fd)
 526 {
 527   char string[11];
 528   if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
 529     return -1;
 530   *value = (unsigned) strtoul(string, NULL, 10);
 531   return 0;
 532 }
 533 
 534 /* Read everything from fd and save it into a newly allocated buffer
 535  * returned in bufferp. Use sizep as the default buffer size, and return
 536  * the actually-needed size in sizep.
 537  */
 538 static __hwloc_inline int
 539 hwloc__read_fd(int fd, char **bufferp, size_t *sizep)
 540 {
 541   char *buffer;
 542   size_t toread, filesize, totalread;
 543   ssize_t ret;
 544 
 545   toread = filesize = *sizep;
 546 
 547   /* Alloc and read +1 so that a file of exactly filesize (e.g. 2^n) bytes hits EOF without requiring another read */
 548   buffer = malloc(filesize+1);
 549   if (!buffer)
 550     return -1;
 551 
 552   ret = read(fd, buffer, toread+1);
 553   if (ret < 0) {
 554     free(buffer);
 555     return -1;
 556   }
 557 
 558   totalread = (size_t) ret;
 559 
 560   if (totalread < toread + 1)
 561     /* Normal case, a single read got EOF */
 562     goto done;
 563 
 564   /* Unexpected case, must extend the buffer and read again.
 565    * Only occurs on the first invocation, and only if the kernel ever uses multiple pages for a single mask.
 566    */
 567   do {
 568     char *tmp;
 569 
 570     toread = filesize;
 571     filesize *= 2;
 572 
 573     tmp = realloc(buffer, filesize+1);
 574     if (!tmp) {
 575       free(buffer);
 576       return -1;
 577     }
 578     buffer = tmp;
 579 
 580     ret = read(fd, buffer+toread+1, toread);
 581     if (ret < 0) {
 582       free(buffer);
 583       return -1;
 584     }
 585 
 586     totalread += ret;
 587   } while ((size_t) ret == toread);
 588 
 589  done:
 590   buffer[totalread] = '\0';
 591   *bufferp = buffer;
 592   *sizep = filesize;
 593   return 0;
 594 }
 595 
 596 /* kernel cpumaps are composed of an array of 32-bit cpumasks */
 597 #define KERNEL_CPU_MASK_BITS 32
 598 #define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
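     /* A sysfs cpumap file contains comma-separated 32-bit hexadecimal chunks, most-significant
      * chunk first, e.g. "00000001,00000003" describes CPUs #0, #1 and #32. */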
 599 
 600 static __hwloc_inline int
 601 hwloc__read_fd_as_cpumask(int fd, hwloc_bitmap_t set)
 602 {
 603   static size_t _filesize = 0; /* will be dynamically initialized to hwloc_get_pagesize(), and increased later if needed */
 604   size_t filesize;
 605   unsigned long *maps;
 606   unsigned long map;
 607   int nr_maps = 0;
 608   static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
 609                                       * Actually, it may increase multiple times if the first cpumaps that are read start with zeroes.
 610                                       */
 611   int nr_maps_allocated = _nr_maps_allocated;
 612   char *buffer, *tmpbuf;
 613   int i;
 614 
 615   /* Kernel sysfs files are usually at most one page. 4kB may contain 455 32-bit
 616    * masks (each followed by a comma), enough for 14k PUs. So allocate a page by default for now.
 617    *
 618    * If we ever need a larger buffer, we'll realloc() the buffer during the first
 619    * invocation of this function so that others directly allocate the right size
 620    * (all cpumask files have the exact same size).
 621    */
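       /* (each 32-bit chunk costs 9 characters ("xxxxxxxx,"), so 4096/9 = 455 chunks,
        * i.e. 455*32 = 14560 PUs per 4kB page) */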
 622   filesize = _filesize;
 623   if (!filesize)
 624     filesize = hwloc_getpagesize();
 625   if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
 626     return -1;
 627   /* Only update the static value with the final one,
 628    * to avoid sharing intermediate values that we modify,
 629    * in case there's ever multiple concurrent calls.
 630    */
 631   _filesize = filesize;
 632 
 633   maps = malloc(nr_maps_allocated * sizeof(*maps));
 634   if (!maps) {
 635     free(buffer);
 636     return -1;
 637   }
 638 
 639   /* reset to zero first */
 640   hwloc_bitmap_zero(set);
 641 
 642   /* parse the whole mask */
 643   tmpbuf = buffer;
 644   while (sscanf(tmpbuf, "%lx", &map) == 1) {
 645     /* read one kernel cpu mask and the ending comma */
 646     if (nr_maps == nr_maps_allocated) {
 647       unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
 648       if (!tmp) {
 649         free(buffer);
 650         free(maps);
 651         return -1;
 652       }
 653       maps = tmp;
 654       nr_maps_allocated *= 2;
 655     }
 656 
 657     tmpbuf = strchr(tmpbuf, ',');
 658     if (!tmpbuf) {
 659       maps[nr_maps++] = map;
 660       break;
 661     } else
 662       tmpbuf++;
 663 
 664     if (!map && !nr_maps)
 665       /* ignore the first map if it's empty */
 666       continue;
 667 
 668     maps[nr_maps++] = map;
 669   }
 670 
 671   free(buffer);
 672 
 673   /* convert into a set */
 674 #if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
 675   for(i=0; i<nr_maps; i++)
 676     hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
 677 #else
 678   for(i=0; i<(nr_maps+1)/2; i++) {
 679     unsigned long mask;
 680     mask = maps[nr_maps-2*i-1];
 681     if (2*i+1<nr_maps)
 682       mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
 683     hwloc_bitmap_set_ith_ulong(set, i, mask);
 684   }
 685 #endif
 686 
 687   free(maps);
 688 
 689   /* Only update the static value with the final one,
 690    * to avoid sharing intermediate values that we modify,
 691    * in case there's ever multiple concurrent calls.
 692    */
 693   if (nr_maps_allocated > _nr_maps_allocated)
 694     _nr_maps_allocated = nr_maps_allocated;
 695   return 0;
 696 }
 697 
 698 static __hwloc_inline int
 699 hwloc__read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set, int fsroot_fd)
 700 {
 701   int fd, err;
 702   fd = hwloc_open(maskpath, fsroot_fd);
 703   if (fd < 0)
 704     return -1;
 705   err = hwloc__read_fd_as_cpumask(fd, set);
 706   close(fd);
 707   return err;
 708 }
 709 
 710 static __hwloc_inline hwloc_bitmap_t
 711 hwloc__alloc_read_path_as_cpumask(const char *maskpath, int fsroot_fd)
 712 {
 713   hwloc_bitmap_t set;
 714   int err;
 715   set = hwloc_bitmap_alloc();
 716   if (!set)
 717     return NULL;
 718   err = hwloc__read_path_as_cpumask(maskpath, set, fsroot_fd);
 719   if (err < 0) {
 720     hwloc_bitmap_free(set);
 721     return NULL;
 722   } else
 723     return set;
 724 }
 725 
 726 int
 727 hwloc_linux_read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set)
 728 {
 729   int fd, err;
 730   fd = open(maskpath, O_RDONLY);
 731   if (fd < 0)
 732     return -1;
 733   err = hwloc__read_fd_as_cpumask(fd, set);
 734   close(fd);
 735   return err;
 736 }
 737 
 738 /* set must be full on input */
 739 static __hwloc_inline int
 740 hwloc__read_fd_as_cpulist(int fd, hwloc_bitmap_t set)
 741 {
 742   /* Kernel sysfs files are usually at most one page.
 743    * But cpulists can be of very different sizes depending on the fragmentation,
 744    * so don't bother remembering the actual read size between invocations.
 745    * We don't have many invocations anyway.
 746    */
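       /* A cpulist is a comma-separated list of decimal ranges, e.g. "0-3,8-11,14". */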
 747   size_t filesize = hwloc_getpagesize();
 748   char *buffer, *current, *comma, *tmp;
 749   int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
 750 
 751   if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
 752     return -1;
 753 
 754   current = buffer;
 755   prevlast = -1;
 756 
 757   while (1) {
 758     /* save a pointer to the next comma and erase it to simplify things */
 759     comma = strchr(current, ',');
 760     if (comma)
 761       *comma = '\0';
 762 
 763     /* find current enabled-segment bounds */
 764     nextfirst = strtoul(current, &tmp, 0);
 765     if (*tmp == '-')
 766       nextlast = strtoul(tmp+1, NULL, 0);
 767     else
 768       nextlast = nextfirst;
 769     if (prevlast+1 <= nextfirst-1)
 770       hwloc_bitmap_clr_range(set, prevlast+1, nextfirst-1);
 771 
 772     /* switch to next enabled-segment */
 773     prevlast = nextlast;
 774     if (!comma)
 775       break;
 776     current = comma+1;
 777   }
 778 
 779   hwloc_bitmap_clr_range(set, prevlast+1, -1);
 780   free(buffer);
 781   return 0;
 782 }
 783 
 784 
 785 /*****************************
 786  ******* CpuBind Hooks *******
 787  *****************************/
 788 
 789 int
 790 hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
 791 {
 792   /* The resulting binding is always strict */
 793 
 794 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
 795   cpu_set_t *plinux_set;
 796   unsigned cpu;
 797   int last;
 798   size_t setsize;
 799   int err;
 800 
 801   last = hwloc_bitmap_last(hwloc_set);
 802   if (last == -1) {
 803     errno = EINVAL;
 804     return -1;
 805   }
 806 
 807   setsize = CPU_ALLOC_SIZE(last+1);
 808   plinux_set = CPU_ALLOC(last+1);
 809 
 810   CPU_ZERO_S(setsize, plinux_set);
 811   hwloc_bitmap_foreach_begin(cpu, hwloc_set)
 812     CPU_SET_S(cpu, setsize, plinux_set);
 813   hwloc_bitmap_foreach_end();
 814 
 815   err = sched_setaffinity(tid, setsize, plinux_set);
 816 
 817   CPU_FREE(plinux_set);
 818   return err;
 819 #elif defined(HWLOC_HAVE_CPU_SET)
 820   cpu_set_t linux_set;
 821   unsigned cpu;
 822 
 823   CPU_ZERO(&linux_set);
 824   hwloc_bitmap_foreach_begin(cpu, hwloc_set)
 825     CPU_SET(cpu, &linux_set);
 826   hwloc_bitmap_foreach_end();
 827 
 828 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
 829   return sched_setaffinity(tid, &linux_set);
 830 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
 831   return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
 832 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
 833 #elif defined(HWLOC_HAVE_SYSCALL)
 834   unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
 835 
 836 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
 837   return sched_setaffinity(tid, (void*) &mask);
 838 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
 839   return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
 840 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
 841 #else /* !SYSCALL */
 842   errno = ENOSYS;
 843   return -1;
 844 #endif /* !SYSCALL */
 845 }
 846 
 847 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
 848 /*
 849  * On some kernels, sched_getaffinity requires the output size to be larger
 850  * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
 851  * Try sched_getaffinity on ourselves until we find a nr_cpus value that makes
 852  * the kernel happy.
 853  */
 854 static int
 855 hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
 856 {
 857   static int _nr_cpus = -1;
 858   int nr_cpus = _nr_cpus;
 859   int fd;
 860 
 861   if (nr_cpus != -1)
 862     /* already computed */
 863     return nr_cpus;
 864 
 865   if (topology->levels[0][0]->complete_cpuset)
 866     /* start with a nr_cpus that may contain the whole topology */
 867     nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
 868   if (nr_cpus <= 0)
 869     /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
 870     nr_cpus = 1;
 871 
 872   fd = open("/sys/devices/system/cpu/possible", O_RDONLY); /* binding only supported in real fsroot, no need for data->root_fd */
 873   if (fd >= 0) {
 874     hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc_full();
 875     if (hwloc__read_fd_as_cpulist(fd, possible_bitmap) == 0) {
 876       int max_possible = hwloc_bitmap_last(possible_bitmap);
 877       hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);
 878 
 879       if (nr_cpus < max_possible + 1)
 880         nr_cpus = max_possible + 1;
 881     }
 882     close(fd);
 883     hwloc_bitmap_free(possible_bitmap);
 884   }
 885 
 886   while (1) {
 887     cpu_set_t *set = CPU_ALLOC(nr_cpus);
 888     size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
 889     int err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
 890     CPU_FREE(set);
 891     nr_cpus = setsize * 8; /* that's the value that was actually tested */
 892     if (!err)
 893       /* Found it. Only update the static value with the final one,
 894        * to avoid sharing intermediate values that we modify,
 895        * in case there's ever multiple concurrent calls.
 896        */
 897       return _nr_cpus = nr_cpus;
 898     nr_cpus *= 2;
 899   }
 900 }
 901 #endif
 902 
 903 int
 904 hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
 905 {
 906   int err __hwloc_attribute_unused;
 907 
 908 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
 909   cpu_set_t *plinux_set;
 910   unsigned cpu;
 911   int last;
 912   size_t setsize;
 913   int kernel_nr_cpus;
 914 
 915   /* find the kernel nr_cpus so as to use a large enough cpu_set size */
 916   kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
 917   setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
 918   plinux_set = CPU_ALLOC(kernel_nr_cpus);
 919 
 920   err = sched_getaffinity(tid, setsize, plinux_set);
 921 
 922   if (err < 0) {
 923     CPU_FREE(plinux_set);
 924     return -1;
 925   }
 926 
 927   last = -1;
 928   if (topology->levels[0][0]->complete_cpuset)
 929     last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
 930   if (last == -1)
 931     /* fall back to the maximal supported number; the topology isn't ready yet (complete_cpuset is missing or empty) */
 932     last = kernel_nr_cpus-1;
 933 
 934   hwloc_bitmap_zero(hwloc_set);
 935   for(cpu=0; cpu<=(unsigned) last; cpu++)
 936     if (CPU_ISSET_S(cpu, setsize, plinux_set))
 937       hwloc_bitmap_set(hwloc_set, cpu);
 938 
 939   CPU_FREE(plinux_set);
 940 #elif defined(HWLOC_HAVE_CPU_SET)
 941   cpu_set_t linux_set;
 942   unsigned cpu;
 943 
 944 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
 945   err = sched_getaffinity(tid, &linux_set);
 946 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
 947   err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
 948 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
 949   if (err < 0)
 950     return -1;
 951 
 952   hwloc_bitmap_zero(hwloc_set);
 953   for(cpu=0; cpu<CPU_SETSIZE; cpu++)
 954     if (CPU_ISSET(cpu, &linux_set))
 955       hwloc_bitmap_set(hwloc_set, cpu);
 956 #elif defined(HWLOC_HAVE_SYSCALL)
 957   unsigned long mask;
 958 
 959 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
 960   err = sched_getaffinity(tid, (void*) &mask);
 961 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
 962   err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
 963 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
 964   if (err < 0)
 965     return -1;
 966 
 967   hwloc_bitmap_from_ulong(hwloc_set, mask);
 968 #else /* !SYSCALL */
 969   errno = ENOSYS;
 970   return -1;
 971 #endif /* !SYSCALL */
 972 
 973   return 0;
 974 }
 975 
 976 /* Get the array of tids of a process from the task directory in /proc */
 977 static int
 978 hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
 979 {
 980   struct dirent *dirent;
 981   unsigned nr_tids = 0;
 982   unsigned max_tids = 32;
 983   pid_t *tids;
 984   struct stat sb;
 985 
 986   /* take the number of links as a good estimate for the number of tids */
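       /* (each thread appears as a subdirectory of /proc/<pid>/task, so st_nlink is
        * roughly 2 + number-of-threads) */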
 987   if (fstat(dirfd(taskdir), &sb) == 0)
 988     max_tids = sb.st_nlink;
 989 
 990   tids = malloc(max_tids*sizeof(pid_t));
 991   if (!tids) {
 992     errno = ENOMEM;
 993     return -1;
 994   }
 995 
 996   rewinddir(taskdir);
 997 
 998   while ((dirent = readdir(taskdir)) != NULL) {
 999     if (nr_tids == max_tids) {
1000       pid_t *newtids;
1001       max_tids += 8;
1002       newtids = realloc(tids, max_tids*sizeof(pid_t));
1003       if (!newtids) {
1004         free(tids);
1005         errno = ENOMEM;
1006         return -1;
1007       }
1008       tids = newtids;
1009     }
1010     if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
1011       continue;
1012     tids[nr_tids++] = atoi(dirent->d_name);
1013   }
1014 
1015   *nr_tidsp = nr_tids;
1016   *tidsp = tids;
1017   return 0;
1018 }
1019 
1020 /* Per-tid callbacks */
1021 typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);
1022 
1023 static int
1024 hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
1025                              pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
1026                              void *data)
1027 {
1028   char taskdir_path[128];
1029   DIR *taskdir;
1030   pid_t *tids, *newtids;
1031   unsigned i, nr, newnr, failed = 0, failed_errno = 0;
1032   unsigned retrynr = 0;
1033   int err;
1034 
1035   if (pid)
1036     snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
1037   else
1038     snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");
1039 
1040   taskdir = opendir(taskdir_path);
1041   if (!taskdir) {
1042     if (errno == ENOENT)
1043       errno = EINVAL;
1044     err = -1;
1045     goto out;
1046   }
1047 
1048   /* read the current list of threads */
1049   err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
1050   if (err < 0)
1051     goto out_with_dir;
1052 
1053  retry:
1054   /* apply the callback to all threads */
1055   failed=0;
1056   for(i=0; i<nr; i++) {
1057     err = cb(topology, tids[i], data, i);
1058     if (err < 0) {
1059       failed++;
1060       failed_errno = errno;
1061     }
1062   }
1063 
1064   /* re-read the list of threads */
1065   err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
1066   if (err < 0)
1067     goto out_with_tids;
1068   /* retry if the list changed in the meantime, or if we failed for only *some* threads.
1069    * if we're really unlucky, all threads changed but we got the same set of tids; there is no way to detect this.
1070    */
1071   if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
1072     free(tids);
1073     tids = newtids;
1074     nr = newnr;
1075     if (++retrynr > 10) {
1076       /* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
1077       errno = EAGAIN;
1078       err = -1;
1079       goto out_with_tids;
1080     }
1081     goto retry;
1082   } else {
1083     free(newtids);
1084   }
1085 
1086   /* if all threads failed, return the last errno. */
1087   if (failed) {
1088     err = -1;
1089     errno = failed_errno;
1090     goto out_with_tids;
1091   }
1092 
1093   err = 0;
1094  out_with_tids:
1095   free(tids);
1096  out_with_dir:
1097   closedir(taskdir);
1098  out:
1099   return err;
1100 }
1101 
1102 /* Per-tid proc_set_cpubind callback and caller.
1103  * Callback data is a hwloc_bitmap_t. */
1104 static int
1105 hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
1106 {
1107   return hwloc_linux_set_tid_cpubind(topology, tid, (hwloc_bitmap_t) data);
1108 }
1109 
1110 static int
1111 hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1112 {
1113   return hwloc_linux_foreach_proc_tid(topology, pid,
1114                                       hwloc_linux_foreach_proc_tid_set_cpubind_cb,
1115                                       (void*) hwloc_set);
1116 }
1117 
1118 /* Per-tid proc_get_cpubind callback data, callback function and caller */
1119 struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
1120   hwloc_bitmap_t cpuset;
1121   hwloc_bitmap_t tidset;
1122   int flags;
1123 };
1124 
1125 static int
1126 hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
1127 {
1128   struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *data = _data;
1129   hwloc_bitmap_t cpuset = data->cpuset;
1130   hwloc_bitmap_t tidset = data->tidset;
1131   int flags = data->flags;
1132 
1133   if (hwloc_linux_get_tid_cpubind(topology, tid, tidset))
1134     return -1;
1135 
1136   /* reset the cpuset on first iteration */
1137   if (!idx)
1138     hwloc_bitmap_zero(cpuset);
1139 
1140   if (flags & HWLOC_CPUBIND_STRICT) {
1141     /* if STRICT, we want all threads to have the same binding */
1142     if (!idx) {
1143       /* this is the first thread, copy its binding */
1144       hwloc_bitmap_copy(cpuset, tidset);
1145     } else if (!hwloc_bitmap_isequal(cpuset, tidset)) {
1146       /* this is not the first thread, and its binding is different */
1147       errno = EXDEV;
1148       return -1;
1149     }
1150   } else {
1151     /* if not STRICT, just OR all thread bindings */
1152     hwloc_bitmap_or(cpuset, cpuset, tidset);
1153   }
1154   return 0;
1155 }
1156 
1157 static int
1158 hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1159 {
1160   struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s data;
1161   hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
1162   int ret;
1163 
1164   data.cpuset = hwloc_set;
1165   data.tidset = tidset;
1166   data.flags = flags;
1167   ret = hwloc_linux_foreach_proc_tid(topology, pid,
1168                                      hwloc_linux_foreach_proc_tid_get_cpubind_cb,
1169                                      (void*) &data);
1170   hwloc_bitmap_free(tidset);
1171   return ret;
1172 }
1173 
1174 static int
1175 hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
1176 {
1177   if (pid == 0)
1178     pid = topology->pid;
1179   if (flags & HWLOC_CPUBIND_THREAD)
1180     return hwloc_linux_set_tid_cpubind(topology, pid, hwloc_set);
1181   else
1182     return hwloc_linux_set_pid_cpubind(topology, pid, hwloc_set, flags);
1183 }
1184 
1185 static int
1186 hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1187 {
1188   if (pid == 0)
1189     pid = topology->pid;
1190   if (flags & HWLOC_CPUBIND_THREAD)
1191     return hwloc_linux_get_tid_cpubind(topology, pid, hwloc_set);
1192   else
1193     return hwloc_linux_get_pid_cpubind(topology, pid, hwloc_set, flags);
1194 }
1195 
1196 static int
1197 hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
1198 {
1199   return hwloc_linux_set_pid_cpubind(topology, topology->pid, hwloc_set, flags);
1200 }
1201 
1202 static int
1203 hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
1204 {
1205   return hwloc_linux_get_pid_cpubind(topology, topology->pid, hwloc_set, flags);
1206 }
1207 
1208 static int
1209 hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1210 {
1211   if (topology->pid) {
1212     errno = ENOSYS;
1213     return -1;
1214   }
1215   return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
1216 }
1217 
1218 static int
1219 hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1220 {
1221   if (topology->pid) {
1222     errno = ENOSYS;
1223     return -1;
1224   }
1225   return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
1226 }
1227 
1228 #if HAVE_DECL_PTHREAD_SETAFFINITY_NP
1229 #pragma weak pthread_setaffinity_np
1230 #pragma weak pthread_self
1231 
1232 static int
1233 hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1234 {
1235   int err;
1236 
1237   if (topology->pid) {
1238     errno = ENOSYS;
1239     return -1;
1240   }
1241 
1242   if (!pthread_self) {
1243     /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
1244     errno = ENOSYS;
1245     return -1;
1246   }
1247   if (tid == pthread_self())
1248     return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
1249 
1250   if (!pthread_setaffinity_np) {
1251     errno = ENOSYS;
1252     return -1;
1253   }
1254 
1255 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
1256   /* Use a separate block so that we can define specific variable
1257      types here */
1258   {
1259      cpu_set_t *plinux_set;
1260      unsigned cpu;
1261      int last;
1262      size_t setsize;
1263 
1264      last = hwloc_bitmap_last(hwloc_set);
1265      if (last == -1) {
1266        errno = EINVAL;
1267        return -1;
1268      }
1269 
1270      setsize = CPU_ALLOC_SIZE(last+1);
1271      plinux_set = CPU_ALLOC(last+1);
1272 
1273      CPU_ZERO_S(setsize, plinux_set);
1274      hwloc_bitmap_foreach_begin(cpu, hwloc_set)
1275          CPU_SET_S(cpu, setsize, plinux_set);
1276      hwloc_bitmap_foreach_end();
1277 
1278      err = pthread_setaffinity_np(tid, setsize, plinux_set);
1279 
1280      CPU_FREE(plinux_set);
1281   }
1282 #elif defined(HWLOC_HAVE_CPU_SET)
1283   /* Use a separate block so that we can define specific variable
1284      types here */
1285   {
1286      cpu_set_t linux_set;
1287      unsigned cpu;
1288 
1289      CPU_ZERO(&linux_set);
1290      hwloc_bitmap_foreach_begin(cpu, hwloc_set)
1291          CPU_SET(cpu, &linux_set);
1292      hwloc_bitmap_foreach_end();
1293 
1294 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1295      err = pthread_setaffinity_np(tid, &linux_set);
1296 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1297      err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
1298 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1299   }
1300 #else /* CPU_SET */
1301   /* Use a separate block so that we can define specific variable
1302      types here */
1303   {
1304       unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
1305 
1306 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1307       err = pthread_setaffinity_np(tid, (void*) &mask);
1308 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1309       err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
1310 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1311   }
1312 #endif /* CPU_SET */
1313 
1314   if (err) {
1315     errno = err;
1316     return -1;
1317   }
1318   return 0;
1319 }
1320 #endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
1321 
1322 #if HAVE_DECL_PTHREAD_GETAFFINITY_NP
1323 #pragma weak pthread_getaffinity_np
1324 #pragma weak pthread_self
1325 
1326 static int
1327 hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1328 {
1329   int err;
1330 
1331   if (topology->pid) {
1332     errno = ENOSYS;
1333     return -1;
1334   }
1335 
1336   if (!pthread_self) {
1337     /* ?! Application uses get_thread_cpubind, but doesn't link against libpthread ?! */
1338     errno = ENOSYS;
1339     return -1;
1340   }
1341   if (tid == pthread_self())
1342     return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
1343 
1344   if (!pthread_getaffinity_np) {
1345     errno = ENOSYS;
1346     return -1;
1347   }
1348 
1349 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
1350   /* Use a separate block so that we can define specific variable
1351      types here */
1352   {
1353      cpu_set_t *plinux_set;
1354      unsigned cpu;
1355      int last;
1356      size_t setsize;
1357 
1358      last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
1359      assert (last != -1);
1360 
1361      setsize = CPU_ALLOC_SIZE(last+1);
1362      plinux_set = CPU_ALLOC(last+1);
1363 
1364      err = pthread_getaffinity_np(tid, setsize, plinux_set);
1365      if (err) {
1366         CPU_FREE(plinux_set);
1367         errno = err;
1368         return -1;
1369      }
1370 
1371      hwloc_bitmap_zero(hwloc_set);
1372      for(cpu=0; cpu<=(unsigned) last; cpu++)
1373        if (CPU_ISSET_S(cpu, setsize, plinux_set))
1374          hwloc_bitmap_set(hwloc_set, cpu);
1375 
1376      CPU_FREE(plinux_set);
1377   }
1378 #elif defined(HWLOC_HAVE_CPU_SET)
1379   /* Use a separate block so that we can define specific variable
1380      types here */
1381   {
1382      cpu_set_t linux_set;
1383      unsigned cpu;
1384 
1385 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1386      err = pthread_getaffinity_np(tid, &linux_set);
1387 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1388      err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
1389 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1390      if (err) {
1391         errno = err;
1392         return -1;
1393      }
1394 
1395      hwloc_bitmap_zero(hwloc_set);
1396      for(cpu=0; cpu<CPU_SETSIZE; cpu++)
1397        if (CPU_ISSET(cpu, &linux_set))
1398          hwloc_bitmap_set(hwloc_set, cpu);
1399   }
1400 #else /* CPU_SET */
1401   /* Use a separate block so that we can define specific variable
1402      types here */
1403   {
1404       unsigned long mask;
1405 
1406 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1407       err = pthread_getaffinity_np(tid, (void*) &mask);
1408 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1409       err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
1410 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1411       if (err) {
1412         errno = err;
1413         return -1;
1414       }
1415 
1416      hwloc_bitmap_from_ulong(hwloc_set, mask);
1417   }
1418 #endif /* CPU_SET */
1419 
1420   return 0;
1421 }
1422 #endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
1423 
1424 int
1425 hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
1426 {
1427   /* read /proc/pid/stat.
1428    * its second field contains the command name between parentheses,
1429    * and the command itself may contain parentheses,
1430    * so read the whole line and find the last closing parenthesis to find the third field.
1431    */
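       /* A (hypothetical) stat line looks like "1234 (a (weird) name) S 1 1234 ... 3 ...",
        * where the last-used CPU is the 39th field ("processor" in proc(5)). */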
1432   char buf[1024] = "";
1433   char name[64];
1434   char *tmp;
1435   int fd, i, err;
1436 
1437   /* TODO: find a way to use sched_getcpu().
1438    * either compare tid with gettid() in all callbacks.
1439    * or pass gettid() in the callback data.
1440    */
1441 
1442   if (!tid) {
1443 #ifdef SYS_gettid
1444     tid = syscall(SYS_gettid);
1445 #else
1446     errno = ENOSYS;
1447     return -1;
1448 #endif
1449   }
1450 
1451   snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
1452   fd = open(name, O_RDONLY); /* no fsroot for real /proc */
1453   if (fd < 0) {
1454     errno = ENOSYS;
1455     return -1;
1456   }
1457   err = read(fd, buf, sizeof(buf)-1); /* read sizeof(buf)-1 to leave room for the ending \0 */
1458   close(fd);
1459   if (err <= 0) {
1460     errno = ENOSYS;
1461     return -1;
1462   }
1463   buf[err-1] = '\0';
1464 
1465   tmp = strrchr(buf, ')');
1466   if (!tmp) {
1467     errno = ENOSYS;
1468     return -1;
1469   }
1470   /* skip ') ' to find the actual third field */
1471   tmp += 2;
1472 
1473   /* skip 36 fields to reach the 39th one */
1474   for(i=0; i<36; i++) {
1475     tmp = strchr(tmp, ' ');
1476     if (!tmp) {
1477       errno = ENOSYS;
1478       return -1;
1479     }
1480     /* skip the ' ' itself */
1481     tmp++;
1482   }
1483 
1484   /* read the last cpu from the 39th field now */
1485   if (sscanf(tmp, "%d ", &i) != 1) {
1486     errno = ENOSYS;
1487     return -1;
1488   }
1489 
1490   hwloc_bitmap_only(set, i);
1491   return 0;
1492 }
1493 
1494 /* Per-tid proc_get_last_cpu_location callback data, callback function and caller */
1495 struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
1496   hwloc_bitmap_t cpuset;
1497   hwloc_bitmap_t tidset;
1498 };
1499 
1500 static int
1501 hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
1502 {
1503   struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *data = _data;
1504   hwloc_bitmap_t cpuset = data->cpuset;
1505   hwloc_bitmap_t tidset = data->tidset;
1506 
1507   if (hwloc_linux_get_tid_last_cpu_location(topology, tid, tidset))
1508     return -1;
1509 
1510   /* reset the cpuset on first iteration */
1511   if (!idx)
1512     hwloc_bitmap_zero(cpuset);
1513 
1514   hwloc_bitmap_or(cpuset, cpuset, tidset);
1515   return 0;
1516 }
1517 
1518 static int
1519 hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1520 {
1521   struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s data;
1522   hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
1523   int ret;
1524 
1525   data.cpuset = hwloc_set;
1526   data.tidset = tidset;
1527   ret = hwloc_linux_foreach_proc_tid(topology, pid,
1528                                      hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
1529                                      &data);
1530   hwloc_bitmap_free(tidset);
1531   return ret;
1532 }
1533 
1534 static int
1535 hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1536 {
1537   if (pid == 0)
1538     pid = topology->pid;
1539   if (flags & HWLOC_CPUBIND_THREAD)
1540     return hwloc_linux_get_tid_last_cpu_location(topology, pid, hwloc_set);
1541   else
1542     return hwloc_linux_get_pid_last_cpu_location(topology, pid, hwloc_set, flags);
1543 }
1544 
1545 static int
1546 hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
1547 {
1548   return hwloc_linux_get_pid_last_cpu_location(topology, topology->pid, hwloc_set, flags);
1549 }
1550 
1551 static int
1552 hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1553 {
1554   if (topology->pid) {
1555     errno = ENOSYS;
1556     return -1;
1557   }
1558 
1559 #if HAVE_DECL_SCHED_GETCPU
1560   {
1561     int pu = sched_getcpu();
1562     if (pu >= 0) {
1563       hwloc_bitmap_only(hwloc_set, pu);
1564       return 0;
1565     }
1566   }
1567 #endif
1568 
1569   return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
1570 }
1571 
1572 
1573 
1574 /***************************
1575  ****** Membind hooks ******
1576  ***************************/
1577 
1578 static int
1579 hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
1580 {
1581   switch (policy) {
1582   case HWLOC_MEMBIND_DEFAULT:
1583     *linuxpolicy = MPOL_DEFAULT;
1584     break;
1585   case HWLOC_MEMBIND_FIRSTTOUCH:
1586     *linuxpolicy = MPOL_LOCAL;
1587     break;
1588   case HWLOC_MEMBIND_BIND:
1589     if (flags & HWLOC_MEMBIND_STRICT)
1590       *linuxpolicy = MPOL_BIND;
1591     else
1592       *linuxpolicy = MPOL_PREFERRED;
1593     break;
1594   case HWLOC_MEMBIND_INTERLEAVE:
1595     *linuxpolicy = MPOL_INTERLEAVE;
1596     break;
1597   /* TODO: next-touch when (if?) patch applied upstream */
1598   default:
1599     errno = ENOSYS;
1600     return -1;
1601   }
1602   return 0;
1603 }
1604 
1605 static int
1606 hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
1607                                       hwloc_const_nodeset_t nodeset,
1608                                       unsigned *max_os_index_p, unsigned long **linuxmaskp)
1609 {
1610   unsigned max_os_index = 0; /* highest os_index + 1 */
1611   unsigned long *linuxmask;
1612   unsigned i;
1613   hwloc_nodeset_t linux_nodeset = NULL;
1614 
1615   if (hwloc_bitmap_isfull(nodeset)) {
1616     linux_nodeset = hwloc_bitmap_alloc();
1617     hwloc_bitmap_only(linux_nodeset, 0);
1618     nodeset = linux_nodeset;
1619   }
1620 
1621   max_os_index = hwloc_bitmap_last(nodeset);
1622   if (max_os_index == (unsigned) -1)
1623     max_os_index = 0;
1624   /* add 1 to convert the last os_index into a max_os_index,
1625    * and round up to the nearest multiple of BITS_PER_LONG */
1626   max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);
1627 
1628   linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
1629   if (!linuxmask) {
1630     hwloc_bitmap_free(linux_nodeset);
1631     errno = ENOMEM;
1632     return -1;
1633   }
1634 
1635   for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1636     linuxmask[i] = hwloc_bitmap_to_ith_ulong(nodeset, i);
1637 
1638   if (linux_nodeset)
1639     hwloc_bitmap_free(linux_nodeset);
1640 
1641   *max_os_index_p = max_os_index;
1642   *linuxmaskp = linuxmask;
1643   return 0;
1644 }
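
     /* Illustrative sketch, not part of the build: the bitmap<->ulong-array conversion
      * used above, shown on a standalone nodeset. Node 5 becomes bit 5 of the first
      * unsigned long, and the mask length is always rounded up to a multiple of
      * HWLOC_BITS_PER_LONG. */
     #if 0
     static void example_nodeset_mask_roundtrip(void)
     {
       hwloc_bitmap_t nodeset = hwloc_bitmap_alloc();
       unsigned long mask;
       hwloc_bitmap_set(nodeset, 5);
       mask = hwloc_bitmap_to_ith_ulong(nodeset, 0);  /* 1UL << 5 */
       hwloc_bitmap_zero(nodeset);
       hwloc_bitmap_set_ith_ulong(nodeset, 0, mask);  /* back to just node 5 */
       hwloc_bitmap_free(nodeset);
     }
     #endif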
1645 
1646 static void
1647 hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
1648                                     hwloc_nodeset_t nodeset,
1649                                     unsigned max_os_index, const unsigned long *linuxmask)
1650 {
1651   unsigned i;
1652 
1653 #ifdef HWLOC_DEBUG
1654   /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
1655   assert(!(max_os_index%HWLOC_BITS_PER_LONG));
1656 #endif
1657 
1658   hwloc_bitmap_zero(nodeset);
1659   for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1660     hwloc_bitmap_set_ith_ulong(nodeset, i, linuxmask[i]);
1661 }
1662 
1663 static int
1664 hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1665 {
1666   unsigned max_os_index; /* highest os_index + 1 */
1667   unsigned long *linuxmask;
1668   size_t remainder;
1669   int linuxpolicy;
1670   unsigned linuxflags = 0;
1671   int err;
1672 
1673   remainder = (uintptr_t) addr & (hwloc_getpagesize()-1);
1674   addr = (char*) addr - remainder;
1675   len += remainder;
1676 
1677   err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
1678   if (err < 0)
1679     return err;
1680 
1681   if (linuxpolicy == MPOL_DEFAULT) {
1682     /* Some Linux kernels don't like being passed a set */
1683     return hwloc_mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
1684 
1685   } else if (linuxpolicy == MPOL_LOCAL) {
1686     if (!hwloc_bitmap_isequal(nodeset, hwloc_topology_get_complete_nodeset(topology))) {
1687       errno = EXDEV;
1688       return -1;
1689     }
1690     /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
1691     return hwloc_mbind((void *) addr, len, MPOL_PREFERRED, NULL, 0, 0);
1692   }
1693 
1694   err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
1695   if (err < 0)
1696     goto out;
1697 
1698   if (flags & HWLOC_MEMBIND_MIGRATE) {
1699     linuxflags = MPOL_MF_MOVE;
1700     if (flags & HWLOC_MEMBIND_STRICT)
1701       linuxflags |= MPOL_MF_STRICT;
1702   }
1703 
1704   err = hwloc_mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
1705   if (err < 0)
1706     goto out_with_mask;
1707 
1708   free(linuxmask);
1709   return 0;
1710 
1711  out_with_mask:
1712   free(linuxmask);
1713  out:
1714   return -1;
1715 }
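
     /* Illustrative sketch, not part of the build: binding an existing buffer with the
      * public hwloc_set_area_membind(), which lands in hwloc_linux_set_area_membind()
      * above on Linux. The buffer, length and node index are made up for the example. */
     #if 0
     static int example_bind_buffer_to_node0(hwloc_topology_t topology, void *buf, size_t len)
     {
       hwloc_bitmap_t nodeset = hwloc_bitmap_alloc();
       int err;
       hwloc_bitmap_set(nodeset, 0); /* OS index of the target NUMA node */
       err = hwloc_set_area_membind(topology, buf, len, nodeset, HWLOC_MEMBIND_BIND,
                                    HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_MIGRATE);
       hwloc_bitmap_free(nodeset);
       return err;
     }
     #endif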
1716 
1717 static void *
1718 hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1719 {
1720   void *buffer;
1721   int err;
1722 
1723   buffer = hwloc_alloc_mmap(topology, len);
1724   if (!buffer)
1725     return NULL;
1726 
1727   err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
1728   if (err < 0 && (flags & HWLOC_MEMBIND_STRICT)) {
1729     munmap(buffer, len);
1730     return NULL;
1731   }
1732 
1733   return buffer;
1734 }
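
     /* Illustrative sketch, not part of the build: allocating memory already bound to a
      * nodeset with the public hwloc_alloc_membind(), which calls the function above on
      * Linux. With HWLOC_MEMBIND_STRICT, a failed binding returns NULL instead of
      * falling back to unbound memory. */
     #if 0
     static void *example_alloc_on_nodeset(hwloc_topology_t topology,
                                           hwloc_const_bitmap_t nodeset, size_t len)
     {
       void *buf = hwloc_alloc_membind(topology, len, nodeset, HWLOC_MEMBIND_BIND,
                                       HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT);
       /* ... use buf ..., then release it with hwloc_free(topology, buf, len) */
       return buf;
     }
     #endif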
1735 
1736 static int
1737 hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1738 {
1739   unsigned max_os_index; /* highest os_index + 1 */
1740   unsigned long *linuxmask;
1741   int linuxpolicy;
1742   int err;
1743 
1744   err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
1745   if (err < 0)
1746     return err;
1747 
1748   if (linuxpolicy == MPOL_DEFAULT) {
1749     /* Some Linux kernels don't like being passed a set */
1750     return hwloc_set_mempolicy(linuxpolicy, NULL, 0);
1751 
1752   } else if (linuxpolicy == MPOL_LOCAL) {
1753     if (!hwloc_bitmap_isequal(nodeset, hwloc_topology_get_complete_nodeset(topology))) {
1754       errno = EXDEV;
1755       return -1;
1756     }
1757     /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
1758     return hwloc_set_mempolicy(MPOL_PREFERRED, NULL, 0);
1759   }
1760 
1761   err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
1762   if (err < 0)
1763     goto out;
1764 
1765   if (flags & HWLOC_MEMBIND_MIGRATE) {
1766     unsigned long fullmask[max_os_index/HWLOC_BITS_PER_LONG];
1767     memset(fullmask, 0xff, max_os_index/HWLOC_BITS_PER_LONG * sizeof(long)); /* all-ones old-nodes mask so pages are migrated from any node */
1768     err = hwloc_migrate_pages(0, max_os_index+1, fullmask, linuxmask);
1769     if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
1770       goto out_with_mask;
1771   }
1772 
1773   err = hwloc_set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
1774   if (err < 0)
1775     goto out_with_mask;
1776 
1777   free(linuxmask);
1778   return 0;
1779 
1780  out_with_mask:
1781   free(linuxmask);
1782  out:
1783   return -1;
1784 }
1785 
1786 /*
1787  * On some kernels, get_mempolicy requires the output size to be larger
1788  * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
1789  * Try get_mempolicy on ourself until we find a max_os_index value that
1790  * makes the kernel happy.
1791  */
1792 static int
1793 hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
1794 {
1795   static int _max_numnodes = -1, max_numnodes;
1796   int linuxpolicy;
1797 
1798   if (_max_numnodes != -1)
1799     /* already computed */
1800     return _max_numnodes;
1801 
1802   /* start with a single ulong: that's the minimum, and it's enough for most machines */
1803   max_numnodes = HWLOC_BITS_PER_LONG;
1804   while (1) {
1805     unsigned long mask[max_numnodes / HWLOC_BITS_PER_LONG];
1806     int err = hwloc_get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
1807     if (!err || errno != EINVAL)
1808       /* Found it. Only update the static value with the final one,
1809        * to avoid sharing intermediate values that we modify,
1810        * in case there's ever multiple concurrent calls.
1811        */
1812       return _max_numnodes = max_numnodes;
1813     max_numnodes *= 2;
1814   }
1815 }
1816 
1817 static int
1818 hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
1819 {
1820   switch (linuxpolicy) {
1821   case MPOL_DEFAULT:
1822   case MPOL_LOCAL: /* converted from MPOL_PREFERRED + empty nodeset by the caller */
1823     *policy = HWLOC_MEMBIND_FIRSTTOUCH;
1824     return 0;
1825   case MPOL_PREFERRED:
1826   case MPOL_BIND:
1827     *policy = HWLOC_MEMBIND_BIND;
1828     return 0;
1829   case MPOL_INTERLEAVE:
1830     *policy = HWLOC_MEMBIND_INTERLEAVE;
1831     return 0;
1832   default:
1833     errno = EINVAL;
1834     return -1;
1835   }
1836 }
1837 
1838 static int hwloc_linux_mask_is_empty(unsigned max_os_index, unsigned long *linuxmask)
1839 {
1840   unsigned i;
1841   for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1842     if (linuxmask[i])
1843       return 0;
1844   return 1;
1845 }
1846 
1847 static int
1848 hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
1849 {
1850   unsigned max_os_index;
1851   int linuxpolicy;
1852   int err;
1853 
1854   max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
1855 
1856   unsigned long linuxmask[max_os_index/HWLOC_BITS_PER_LONG];
1857 
1858   err = hwloc_get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0);
1859   if (err < 0)
1860     goto out;
1861 
1862   /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
1863   if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
1864     linuxpolicy = MPOL_LOCAL;
1865 
1866   if (linuxpolicy == MPOL_DEFAULT || linuxpolicy == MPOL_LOCAL) {
1867     hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
1868   } else {
1869     hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
1870   }
1871 
1872   err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
1873   if (err < 0)
1874     goto out;
1875 
1876   return 0;
1877 
1878  out:
1879   return -1;
1880 }
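
     /* Illustrative sketch, not part of the build: reading the calling thread's binding
      * back with the public hwloc_get_membind(), which goes through
      * hwloc_linux_get_thisthread_membind() above on Linux. */
     #if 0
     static void example_query_membind(hwloc_topology_t topology)
     {
       hwloc_bitmap_t nodeset = hwloc_bitmap_alloc();
       hwloc_membind_policy_t policy;
       if (!hwloc_get_membind(topology, nodeset, &policy,
                              HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_THREAD)) {
         /* policy is e.g. HWLOC_MEMBIND_FIRSTTOUCH/BIND/INTERLEAVE,
          * and nodeset lists the nodes the policy applies to */
       }
       hwloc_bitmap_free(nodeset);
     }
     #endif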
1881 
1882 static int
1883 hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
1884 {
1885   unsigned max_os_index;
1886   int linuxpolicy = 0, globallinuxpolicy = 0; /* shut up the compiler */
1887   int mixed = 0;
1888   int full = 0;
1889   int first = 1;
1890   int pagesize = hwloc_getpagesize();
1891   char *tmpaddr;
1892   int err;
1893   unsigned i;
1894 
1895   max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
1896 
1897   unsigned long linuxmask[max_os_index/HWLOC_BITS_PER_LONG];
1898   unsigned long globallinuxmask[max_os_index/HWLOC_BITS_PER_LONG];
1899   memset(globallinuxmask, 0, sizeof(globallinuxmask));
1900 
1901   for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
1902       tmpaddr < (char *)addr + len;
1903       tmpaddr += pagesize) {
1904     err = hwloc_get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
1905     if (err < 0)
1906       goto out;
1907 
1908     /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
1909     if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
1910       linuxpolicy = MPOL_LOCAL;
1911 
1912     /* use the first found policy. if we find a different one later, set mixed to 1 */
1913     if (first)
1914       globallinuxpolicy = linuxpolicy;
1915     else if (globallinuxpolicy != linuxpolicy)
1916       mixed = 1;
1917 
1918     /* aggregate masks, and set full to 1 if we ever find DEFAULT or LOCAL */
1919     if (full || linuxpolicy == MPOL_DEFAULT || linuxpolicy == MPOL_LOCAL) {
1920       full = 1;
1921     } else {
1922       for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1923         globallinuxmask[i] |= linuxmask[i];
1924     }
1925 
1926     first = 0;
1927   }
1928 
1929   if (mixed) {
1930     *policy = HWLOC_MEMBIND_MIXED;
1931   } else {
1932     err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
1933     if (err < 0)
1934       goto out;
1935   }
1936 
1937   if (full) {
1938     hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
1939   } else {
1940     hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
1941   }
1942 
1943   return 0;
1944 
1945  out:
1946   return -1;
1947 }
1948 
1949 static int
1950 hwloc_linux_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags __hwloc_attribute_unused)
1951 {
1952   unsigned offset;
1953   unsigned long count;
1954   void **pages;
1955   int *status;
1956   int pagesize = hwloc_getpagesize();
1957   int ret;
1958   unsigned i;
1959 
1960   offset = ((unsigned long) addr) & (pagesize-1);
1961   addr = ((char*) addr) - offset;
1962   len += offset;
1963   count = (len + pagesize-1)/pagesize;
1964   pages = malloc(count*sizeof(*pages));
1965   status = malloc(count*sizeof(*status));
1966   if (!pages || !status) {
1967     ret = -1;
1968     goto out_with_pages;
1969   }
1970 
1971   for(i=0; i<count; i++)
1972     pages[i] = ((char*)addr) + i*pagesize;
1973 
1974   ret = hwloc_move_pages(0, count, pages, NULL, status, 0);
1975   if (ret  < 0)
1976     goto out_with_pages;
1977 
1978   hwloc_bitmap_zero(nodeset);
1979   for(i=0; i<count; i++)
1980     if (status[i] >= 0)
1981       hwloc_bitmap_set(nodeset, status[i]);
1982   ret = 0;
1983 
1984  out_with_pages:
1985   free(pages);
1986   free(status);
1987   return ret;
1988 }
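
     /* Illustrative sketch, not part of the build: asking where a buffer's pages
      * currently sit with the public hwloc_get_area_memlocation(), which maps to the
      * move_pages()-based function above on Linux. */
     #if 0
     static void example_query_memlocation(hwloc_topology_t topology, const void *buf, size_t len)
     {
       hwloc_bitmap_t nodeset = hwloc_bitmap_alloc();
       if (!hwloc_get_area_memlocation(topology, buf, len, nodeset, HWLOC_MEMBIND_BYNODESET)) {
         /* nodeset now contains every NUMA node backing at least one page of buf */
       }
       hwloc_bitmap_free(nodeset);
     }
     #endif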
1989 
1990 static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep);
1991 
1992 static int hwloc_linux_get_allowed_resources_hook(hwloc_topology_t topology)
1993 {
1994   const char *fsroot_path;
1995   char *cpuset_name;
1996   int root_fd = -1;
1997 
1998   fsroot_path = getenv("HWLOC_FSROOT");
1999   if (!fsroot_path)
2000     fsroot_path = "/";
2001 
2002 #ifdef HAVE_OPENAT
2003   root_fd = open(fsroot_path, O_RDONLY | O_DIRECTORY);
2004   if (root_fd < 0)
2005     goto out;
2006 #else
2007   if (strcmp(fsroot_path, "/")) {
2008     errno = ENOSYS;
2009     goto out;
2010   }
2011 #endif
2012 
2013   /* we could also error-out if the current topology doesn't actually match the system,
2014    * at least for PUs and NUMA nodes. But it would increase the overhead of loading XMLs.
2015    *
2016    * Just trust the user when THISSYSTEM=1 is set. It enables hacky
2017    * tests such as restricting random XML or synthetic to the current
2018    * machine (uses the default cgroup).
2019    */
2020 
2021   hwloc_linux__get_allowed_resources(topology, fsroot_path, root_fd, &cpuset_name);
2022   if (cpuset_name) {
2023     hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
2024     free(cpuset_name);
2025   }
2026   if (root_fd != -1)
2027     close(root_fd);
2028 
2029  out:
2030   return -1;
2031 }
2032 
2033 void
2034 hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
2035                         struct hwloc_topology_support *support __hwloc_attribute_unused)
2036 {
2037   hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
2038   hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
2039   hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;
2040   hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
2041   hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;
2042   hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
2043 #if HAVE_DECL_PTHREAD_SETAFFINITY_NP
2044   hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
2045 #endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
2046 #if HAVE_DECL_PTHREAD_GETAFFINITY_NP
2047   hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
2048 #endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
2049   hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
2050   hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
2051   hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
2052   hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
2053   hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
2054   hooks->get_area_membind = hwloc_linux_get_area_membind;
2055   hooks->set_area_membind = hwloc_linux_set_area_membind;
2056   hooks->get_area_memlocation = hwloc_linux_get_area_memlocation;
2057   hooks->alloc_membind = hwloc_linux_alloc_membind;
2058   hooks->alloc = hwloc_alloc_mmap;
2059   hooks->free_membind = hwloc_free_mmap;
2060   support->membind->firsttouch_membind = 1;
2061   support->membind->bind_membind = 1;
2062   support->membind->interleave_membind = 1;
2063   support->membind->migrate_membind = 1;
2064   hooks->get_allowed_resources = hwloc_linux_get_allowed_resources_hook;
2065 }
2066 
2067 
2068 /*******************************************
2069  *** Misc Helpers for Topology Discovery ***
2070  *******************************************/
2071 
2072 /* cpuinfo array */
2073 struct hwloc_linux_cpuinfo_proc {
2074   /* set during hwloc_linux_parse_cpuinfo */
2075   unsigned long Pproc;
2076   /* set during hwloc_linux_parse_cpuinfo, or -1 if unknown */
2077   long Pcore, Ppkg;
2078   /* set later, or -1 if unknown */
2079   long Lcore, Lpkg;
2080 
2081   /* custom info, set during hwloc_linux_parse_cpuinfo */
2082   struct hwloc_info_s *infos;
2083   unsigned infos_count;
2084 };
2085 
2086 static void
2087 hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, const char *root_path)
2088 {
2089   char *mount_path;
2090   struct mntent mntent;
2091   FILE *fd;
2092   int err;
2093   size_t bufsize;
2094 
2095   *cgroup_mntpnt = NULL;
2096   *cpuset_mntpnt = NULL;
2097 
2098   if (root_path) {
2099     /* setmntent() doesn't support openat(), so use the root_path directly */
2100     err = asprintf(&mount_path, "%s/proc/mounts", root_path);
2101     if (err < 0)
2102       return;
2103     fd = setmntent(mount_path, "r");
2104     free(mount_path);
2105   } else {
2106     fd = setmntent("/proc/mounts", "r");
2107   }
2108   if (!fd)
2109     return;
2110 
2111   /* getmntent_r() doesn't actually report an error when the buffer
2112    * is too small. It just silently truncates things. So we can't
2113    * dynamically resize things.
2114    *
2115    * Linux limits the mount type, string, and options to one page each,
2116    * and getmntent() limits the line size to 4kB,
2117    * so use 4*pagesize to stay well above both.
2118    */
2119   bufsize = hwloc_getpagesize()*4;
2120   char buf[bufsize];
2121 
2122   while (getmntent_r(fd, &mntent, buf, bufsize)) {
2123     if (!strcmp(mntent.mnt_type, "cpuset")) {
2124       hwloc_debug("Found cpuset mount point on %s\n", mntent.mnt_dir);
2125       *cpuset_mntpnt = strdup(mntent.mnt_dir);
2126       break;
2127     } else if (!strcmp(mntent.mnt_type, "cgroup")) {
2128       /* found a cgroup mntpnt */
2129       char *opt, *opts = mntent.mnt_opts;
2130       int cpuset_opt = 0;
2131       int noprefix_opt = 0;
2132       /* look at options */
2133       while ((opt = strsep(&opts, ",")) != NULL) {
2134         if (!strcmp(opt, "cpuset"))
2135           cpuset_opt = 1;
2136         else if (!strcmp(opt, "noprefix"))
2137           noprefix_opt = 1;
2138       }
2139       if (!cpuset_opt)
2140         continue;
2141       if (noprefix_opt) {
2142         hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", mntent.mnt_dir);
2143         *cpuset_mntpnt = strdup(mntent.mnt_dir);
2144       } else {
2145         hwloc_debug("Found cgroup/cpuset mount point on %s\n", mntent.mnt_dir);
2146         *cgroup_mntpnt = strdup(mntent.mnt_dir);
2147       }
2148       break;
2149     }
2150   }
2151 
2152   endmntent(fd);
2153 }
2154 
2155 /*
2156  * Linux cpusets may be managed directly or through cgroup.
2157  * If cgroups are used, tasks get a /proc/<pid>/cgroup file which may contain a
2158  * line such as %d:cpuset:<name>. If plain cpusets are used, they get
2159  * /proc/<pid>/cpuset containing <name>.
2160  */
2161 static char *
2162 hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
2163 {
2164 #define CPUSET_NAME_LEN 128
2165   char cpuset_name[CPUSET_NAME_LEN];
2166   FILE *file;
2167   int err;
2168   char *tmp;
2169 
2170   /* check whether a cgroup-cpuset is enabled */
2171   if (!pid)
2172     file = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
2173   else {
2174     char path[] = "/proc/XXXXXXXXXX/cgroup";
2175     snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
2176     file = hwloc_fopen(path, "r", fsroot_fd);
2177   }
2178   if (file) {
2179     /* find a cpuset line */
2180 #define CGROUP_LINE_LEN 256
2181     char line[CGROUP_LINE_LEN];
2182     while (fgets(line, sizeof(line), file)) {
2183       char *end, *colon = strchr(line, ':');
2184       if (!colon)
2185         continue;
2186       if (strncmp(colon, ":cpuset:", 8))
2187         continue;
2188 
2189       /* found a cgroup-cpuset line, return the name */
2190       fclose(file);
2191       end = strchr(colon, '\n');
2192       if (end)
2193         *end = '\0';
2194       hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
2195       return strdup(colon+8);
2196     }
2197     fclose(file);
2198   }
2199 
2200   /* check whether a cpuset is enabled */
2201   if (!pid)
2202     err = hwloc_read_path_by_length("/proc/self/cpuset", cpuset_name, sizeof(cpuset_name), fsroot_fd);
2203   else {
2204     char path[] = "/proc/XXXXXXXXXX/cpuset";
2205     snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
2206     err = hwloc_read_path_by_length(path, cpuset_name, sizeof(cpuset_name), fsroot_fd);
2207   }
2208   if (err < 0) {
2209     /* found nothing */
2210     hwloc_debug("%s", "No cgroup or cpuset found\n");
2211     return NULL;
2212   }
2213 
2214   /* found a cpuset, return the name */
2215   tmp = strchr(cpuset_name, '\n');
2216   if (tmp)
2217     *tmp = '\0';
2218   hwloc_debug("Found cpuset %s\n", cpuset_name);
2219   return strdup(cpuset_name);
2220 }
2221 
2222 /*
2223  * Then, the cpuset description is available from either the cgroup or
2224  * the cpuset filesystem (usually mounted in / or /dev) where there
2225  * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
2226  */
2227 static void
2228 hwloc_admin_disable_set_from_cpuset(int root_fd,
2229                                     const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
2230                                     const char *attr_name,
2231                                     hwloc_bitmap_t admin_enabled_cpus_set)
2232 {
2233 #define CPUSET_FILENAME_LEN 256
2234   char cpuset_filename[CPUSET_FILENAME_LEN];
2235   int fd;
2236   int err;
2237 
2238   if (cgroup_mntpnt) {
2239     /* try to read the cpuset from cgroup */
2240     snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
2241     hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
2242   } else if (cpuset_mntpnt) {
2243     /* try to read the cpuset directly */
2244     snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
2245     hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
2246   }
2247 
2248   fd = hwloc_open(cpuset_filename, root_fd);
2249   if (fd < 0) {
2250     /* found no cpuset description, ignore it */
2251     hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
2252     return;
2253   }
2254 
2255   err = hwloc__read_fd_as_cpulist(fd, admin_enabled_cpus_set);
2256   close(fd);
2257 
2258   if (err < 0)
2259     hwloc_bitmap_fill(admin_enabled_cpus_set);
2260   else
2261     hwloc_debug_bitmap("cpuset includes %s\n", admin_enabled_cpus_set);
2262 }
2263 
2264 static void
2265 hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
2266                          const char *path,
2267                          uint64_t *local_memory,
2268                          uint64_t *meminfo_hugepages_count,
2269                          uint64_t *meminfo_hugepages_size,
2270                          int onlytotal)
2271 {
2272   char *tmp;
2273   char buffer[4096];
2274   unsigned long long number;
2275 
2276   if (hwloc_read_path_by_length(path, buffer, sizeof(buffer), data->root_fd) < 0)
2277     return;
2278 
2279   tmp = strstr(buffer, "MemTotal: "); /* MemTotal: %llu kB */
2280   if (tmp) {
2281     number = strtoull(tmp+10, NULL, 10);
2282     *local_memory = number << 10;
2283 
2284     if (onlytotal)
2285       return;
2286 
2287     tmp = strstr(tmp, "Hugepagesize: "); /* Hugepagesize: %llu */
2288     if (tmp) {
2289       number = strtoull(tmp+14, NULL, 10);
2290       *meminfo_hugepages_size = number << 10;
2291 
2292       tmp = strstr(tmp, "HugePages_Free: "); /* HugePages_Free: %llu */
2293       if (tmp) {
2294         number = strtoull(tmp+16, NULL, 10);
2295         *meminfo_hugepages_count = number;
2296       }
2297     }
2298   }
2299 }
2300 
2301 #define SYSFS_NUMA_NODE_PATH_LEN 128
2302 
2303 static void
2304 hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
2305                            const char *dirpath,
2306                            struct hwloc_numanode_attr_s *memory,
2307                            uint64_t *remaining_local_memory)
2308 {
2309   DIR *dir;
2310   struct dirent *dirent;
2311   unsigned long index_ = 1;
2312   char line[64];
2313   char path[SYSFS_NUMA_NODE_PATH_LEN];
2314 
2315   dir = hwloc_opendir(dirpath, data->root_fd);
2316   if (dir) {
2317     while ((dirent = readdir(dir)) != NULL) {
2318       int err;
2319       if (strncmp(dirent->d_name, "hugepages-", 10))
2320         continue;
2321       memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
2322       err = snprintf(path, sizeof(path), "%s/%s/nr_hugepages", dirpath, dirent->d_name);
2323       if ((size_t) err < sizeof(path)
2324           && !hwloc_read_path_by_length(path, line, sizeof(line), data->root_fd)) {
2325         /* this is the total number of huge pages of that size (for this node or the whole machine) */
2326         memory->page_types[index_].count = strtoull(line, NULL, 0);
2327         *remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
2328         index_++;
2329       }
2330     }
2331     closedir(dir);
2332     memory->page_types_len = index_;
2333   }
2334 }
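
     /* Example of the sysfs layout parsed above (directory names are the usual x86-64
      * hugepage sizes, counts come from the kernel):
      *   <dirpath>/hugepages-2048kB/nr_hugepages     -> page size 2048*1024 bytes
      *   <dirpath>/hugepages-1048576kB/nr_hugepages  -> page size 1GiB
      * The size is taken from the directory name and nr_hugepages gives the number of
      * pages of that size for this node (or for the whole machine). */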
2335 
2336 static void
2337 hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
2338                               struct hwloc_linux_backend_data_s *data,
2339                               struct hwloc_numanode_attr_s *memory)
2340 {
2341   uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
2342   struct stat st;
2343   int has_sysfs_hugepages = 0;
2344   const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
2345   int types = 2;
2346   int err;
2347 
2348   err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
2349   if (!err) {
2350     types = 1 + st.st_nlink-2;
2351     has_sysfs_hugepages = 1;
2352   }
2353 
2354   if (topology->is_thissystem || pagesize_env) {
2355     /* we cannot report any page_type info unless we have the page size.
2356      * we'll take it either from the system if local, or from the debug env variable
2357      */
2358     memory->page_types_len = types;
2359     memory->page_types = calloc(types, sizeof(*memory->page_types));
2360   }
2361 
2362   if (topology->is_thissystem) {
2363     /* Get the page and hugepage sizes from sysconf */
2364 #if HAVE_DECL__SC_LARGE_PAGESIZE
2365     memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
2366 #endif
2367     memory->page_types[0].size = data->pagesize; /* might be overwritten later by /proc/meminfo or sysfs */
2368   }
2369 
2370   hwloc_parse_meminfo_info(data, "/proc/meminfo",
2371                            &memory->local_memory,
2372                            &meminfo_hugepages_count, &meminfo_hugepages_size,
2373                            memory->page_types == NULL);
2374 
2375   if (memory->page_types) {
2376     uint64_t remaining_local_memory = memory->local_memory;
2377     if (has_sysfs_hugepages) {
2378       /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
2379       hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
2380     } else {
2381       /* use what we found in meminfo */
2382       if (meminfo_hugepages_size) {
2383         memory->page_types[1].size = meminfo_hugepages_size;
2384         memory->page_types[1].count = meminfo_hugepages_count;
2385         remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2386       } else {
2387         memory->page_types_len = 1;
2388       }
2389     }
2390 
2391     if (pagesize_env) {
2392       /* We cannot get the pagesize if not thissystem; use the env-given one to exercise the code during make check */
2393       memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
2394       /* If parsing failed, fall back to 4kB */
2395       if (!memory->page_types[0].size)
2396         memory->page_types[0].size = 4096;
2397     }
2398     assert(memory->page_types[0].size); /* from sysconf if local or from the env */
2399     /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
2400      * may be 0 if no hugepage support in the kernel */
2401 
2402     memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2403   }
2404 }
2405 
2406 static void
2407 hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
2408                               struct hwloc_linux_backend_data_s *data,
2409                               const char *syspath, int node,
2410                               struct hwloc_numanode_attr_s *memory)
2411 {
2412   char path[SYSFS_NUMA_NODE_PATH_LEN];
2413   char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
2414   uint64_t meminfo_hugepages_count = 0;
2415   uint64_t meminfo_hugepages_size = 0;
2416   struct stat st;
2417   int has_sysfs_hugepages = 0;
2418   int types = 2;
2419   int err;
2420 
2421   sprintf(path, "%s/node%d/hugepages", syspath, node);
2422   err = hwloc_stat(path, &st, data->root_fd);
2423   if (!err) {
2424     types = 1 + st.st_nlink-2;
2425     has_sysfs_hugepages = 1;
2426   }
2427 
2428   if (topology->is_thissystem) {
2429     memory->page_types_len = types;
2430     memory->page_types = malloc(types*sizeof(*memory->page_types));
2431     memset(memory->page_types, 0, types*sizeof(*memory->page_types));
2432   }
2433 
2434   sprintf(meminfopath, "%s/node%d/meminfo", syspath, node);
2435   hwloc_parse_meminfo_info(data, meminfopath,
2436                            &memory->local_memory,
2437                            &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
2438                            memory->page_types == NULL);
2439 
2440   if (memory->page_types) {
2441     uint64_t remaining_local_memory = memory->local_memory;
2442     if (has_sysfs_hugepages) {
2443       /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
2444       hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
2445     } else {
2446       /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
2447        * hwloc_get_procfs_meminfo_info must have been called earlier */
2448       meminfo_hugepages_size = topology->machine_memory.page_types[1].size;
2449       /* use what we found in meminfo */
2450       if (meminfo_hugepages_size) {
2451         memory->page_types[1].count = meminfo_hugepages_count;
2452         memory->page_types[1].size = meminfo_hugepages_size;
2453         remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2454       } else {
2455         memory->page_types_len = 1;
2456       }
2457     }
2458     /* update what's remaining as normal pages */
2459     memory->page_types[0].size = data->pagesize;
2460     memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2461   }
2462 }
2463 
2464 static int
2465 hwloc_parse_nodes_distances(const char *path, unsigned nbnodes, unsigned *indexes, uint64_t *distances, int fsroot_fd)
2466 {
2467   size_t len = (10+1)*nbnodes;
2468   uint64_t *curdist = distances;
2469   char *string;
2470   unsigned i;
2471 
2472   string = malloc(len); /* space-separated %d */
2473   if (!string)
2474     goto out;
2475 
2476   for(i=0; i<nbnodes; i++) {
2477     unsigned osnode = indexes[i];
2478     char distancepath[SYSFS_NUMA_NODE_PATH_LEN];
2479     char *tmp, *next;
2480     unsigned found;
2481 
2482     /* The Linux nodeX/distance file contains the distances from X to the other nodes (from the ACPI SLIT table or similar);
2483      * store them in row i of the flat array, i.e. slots i*N...i*N+N-1 */
2484     sprintf(distancepath, "%s/node%u/distance", path, osnode);
2485     if (hwloc_read_path_by_length(distancepath, string, len, fsroot_fd) < 0)
2486       goto out_with_string;
2487 
2488     tmp = string;
2489     found = 0;
2490     while (tmp) {
2491       unsigned distance = strtoul(tmp, &next, 0); /* stored as a %d */
2492       if (next == tmp)
2493         break;
2494       *curdist = (uint64_t) distance;
2495       curdist++;
2496       found++;
2497       if (found == nbnodes)
2498         break;
2499       tmp = next+1;
2500     }
2501     if (found != nbnodes)
2502       goto out_with_string;
2503   }
2504 
2505   free(string);
2506   return 0;
2507 
2508  out_with_string:
2509   free(string);
2510  out:
2511   return -1;
2512 }
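
     /* Illustrative example with made-up values: on a 2-node machine,
      *   node0/distance contains "10 21"
      *   node1/distance contains "21 10"
      * and the flat array is filled row by row, so distances[i*nbnodes+j] is the
      * distance from the i-th to the j-th node listed in indexes[]. */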
2513 
2514 static void
2515 hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
2516                            hwloc_obj_t obj,
2517                            char *path, unsigned pathlen,
2518                            const char *dmi_name, const char *hwloc_name)
2519 {
2520   char dmi_line[64];
2521 
2522   strcpy(path+pathlen, dmi_name);
2523   if (hwloc_read_path_by_length(path, dmi_line, sizeof(dmi_line), data->root_fd) < 0)
2524     return;
2525 
2526   if (dmi_line[0] != '\0') {
2527     char *tmp = strchr(dmi_line, '\n');
2528     if (tmp)
2529       *tmp = '\0';
2530     hwloc_debug("found %s '%s'\n", hwloc_name, dmi_line);
2531     hwloc_obj_add_info(obj, hwloc_name, dmi_line);
2532   }
2533 }
2534 
2535 static void
2536 hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
2537 {
2538   char path[128];
2539   unsigned pathlen;
2540   DIR *dir;
2541 
2542   strcpy(path, "/sys/devices/virtual/dmi/id");
2543   dir = hwloc_opendir(path, data->root_fd);
2544   if (dir) {
2545     pathlen = 27;
2546   } else {
2547     strcpy(path, "/sys/class/dmi/id");
2548     dir = hwloc_opendir(path, data->root_fd);
2549     if (dir)
2550       pathlen = 17;
2551     else
2552       return;
2553   }
2554   closedir(dir);
2555 
2556   path[pathlen++] = '/';
2557 
2558   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_name", "DMIProductName");
2559   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_version", "DMIProductVersion");
2560   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_serial", "DMIProductSerial");
2561   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_uuid", "DMIProductUUID");
2562   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_vendor", "DMIBoardVendor");
2563   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_name", "DMIBoardName");
2564   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_version", "DMIBoardVersion");
2565   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_serial", "DMIBoardSerial");
2566   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_asset_tag", "DMIBoardAssetTag");
2567   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_vendor", "DMIChassisVendor");
2568   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_type", "DMIChassisType");
2569   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_version", "DMIChassisVersion");
2570   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_serial", "DMIChassisSerial");
2571   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_asset_tag", "DMIChassisAssetTag");
2572   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_vendor", "DMIBIOSVendor");
2573   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_version", "DMIBIOSVersion");
2574   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_date", "DMIBIOSDate");
2575   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "sys_vendor", "DMISysVendor");
2576 }
2577 
2578 
2579 /***********************************
2580  ****** Device tree Discovery ******
2581  ***********************************/
2582 
2583 /* Reads the entire file; stores the number of bytes read in *bytes_read if bytes_read != NULL.
2584  * The returned pointer can be freed with free().  */
2585 static void *
2586 hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
2587 {
2588   char fname[256];
2589   char *ret = NULL;
2590   struct stat fs;
2591   int file = -1;
2592 
2593   snprintf(fname, sizeof(fname), "%s/%s", p, p1);
2594 
2595   file = hwloc_open(fname, root_fd);
2596   if (-1 == file) {
2597       goto out_no_close;
2598   }
2599   if (fstat(file, &fs)) {
2600     goto out;
2601   }
2602 
2603   ret = (char *) malloc(fs.st_size);
2604   if (NULL != ret) {
2605     ssize_t cb = read(file, ret, fs.st_size);
2606     if (cb == -1) {
2607       free(ret);
2608       ret = NULL;
2609     } else {
2610       if (NULL != bytes_read)
2611         *bytes_read = cb;
2612     }
2613   }
2614 
2615  out:
2616   close(file);
2617  out_no_close:
2618   return ret;
2619 }
2620 
2621 /* Reads the entire file and returns it as a 0-terminated string
2622  * Returned pointer can be freed by using free().  */
2623 static char *
2624 hwloc_read_str(const char *p, const char *p1, int root_fd)
2625 {
2626   size_t cb = 0;
2627   char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
2628   if ((NULL != ret) && (0 < cb) && (0 != ret[cb-1])) {
2629     char *tmp = realloc(ret, cb + 1);
2630     if (!tmp) {
2631       free(ret);
2632       return NULL;
2633     }
2634     ret = tmp;
2635     ret[cb] = 0;
2636   }
2637   return ret;
2638 }
2639 
2640 /* Reads a single 32-bit big-endian value (the file must contain exactly 4 bytes) */
2641 static ssize_t
2642 hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
2643 {
2644   size_t cb = 0;
2645   uint32_t *tmp = hwloc_read_raw(p, p1, &cb, root_fd);
2646   if (sizeof(*buf) != cb) {
2647     errno = EINVAL;
2648     free(tmp); /* tmp is either NULL or contains useless things */
2649     return -1;
2650   }
2651   *buf = htonl(*tmp);
2652   free(tmp);
2653   return sizeof(*buf);
2654 }
2655 
2656 typedef struct {
2657   unsigned int n, allocated;
2658   struct {
2659     hwloc_bitmap_t cpuset;
2660     uint32_t phandle;
2661     uint32_t l2_cache;
2662     char *name;
2663   } *p;
2664 } device_tree_cpus_t;
2665 
2666 static void
2667 add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
2668     uint32_t l2_cache, uint32_t phandle, const char *name)
2669 {
2670   if (cpus->n == cpus->allocated) {
2671     void *tmp;
2672     unsigned allocated;
2673     if (!cpus->allocated)
2674       allocated = 64;
2675     else
2676       allocated = 2 * cpus->allocated;
2677     tmp = realloc(cpus->p, allocated * sizeof(cpus->p[0]));
2678     if (!tmp)
2679       return; /* failed to realloc, ignore this entry */
2680     cpus->p = tmp;
2681     cpus->allocated = allocated;
2682   }
2683   cpus->p[cpus->n].phandle = phandle;
2684   cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
2685   cpus->p[cpus->n].l2_cache = l2_cache;
2686   cpus->p[cpus->n].name = strdup(name);
2687   ++cpus->n;
2688 }
2689 
2690 /* Walks over the cache list in order to detect nested caches and the CPU mask of each */
2691 static int
2692 look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
2693     uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
2694 {
2695   unsigned int i;
2696   int ret = -1;
2697   if ((NULL == level) || (NULL == cpuset) || phandle == (uint32_t) -1)
2698     return ret;
2699   for (i = 0; i < cpus->n; ++i) {
2700     if (phandle != cpus->p[i].l2_cache)
2701       continue;
2702     if (NULL != cpus->p[i].cpuset) {
2703       hwloc_bitmap_or(cpuset, cpuset, cpus->p[i].cpuset);
2704       ret = 0;
2705     } else {
2706       ++(*level);
2707       if (0 == look_powerpc_device_tree_discover_cache(cpus,
2708             cpus->p[i].phandle, level, cpuset))
2709         ret = 0;
2710     }
2711   }
2712   return ret;
2713 }
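
     /* Illustrative example (node names and phandle values are made up):
      *   cpus/cpu@0        device_type="cpu"    l2-cache=<0x10>
      *   cpus/l2-cache@10  device_type="cache"  phandle=<0x10>  l2-cache=<0x20>
      *   cpus/l3-cache@20  device_type="cache"  phandle=<0x20>
      * Starting from a cache node's phandle, the walk above accumulates the cpusets of
      * all cpus whose cache chain reaches that node, bumping *level once per
      * intermediate cache, so the l3-cache above ends up at depth 3. */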
2714 
2715 static void
2716 try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
2717                                     unsigned int level, hwloc_obj_cache_type_t ctype,
2718                                     uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
2719                                     hwloc_bitmap_t cpuset)
2720 {
2721   struct hwloc_obj *c = NULL;
2722   hwloc_obj_type_t otype;
2723 
2724   if (0 == cache_size)
2725     return;
2726 
2727   otype = hwloc_cache_type_by_depth_type(level, ctype);
2728   if (otype == HWLOC_OBJ_TYPE_NONE)
2729     return;
2730   if (!hwloc_filter_check_keep_object_type(topology, otype))
2731     return;
2732 
2733   c = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
2734   c->attr->cache.depth = level;
2735   c->attr->cache.linesize = cache_line_size;
2736   c->attr->cache.size = cache_size;
2737   c->attr->cache.type = ctype;
2738   if (cache_sets == 1)
2739     /* likely wrong, make it unknown */
2740     cache_sets = 0;
2741   if (cache_sets && cache_line_size)
2742     c->attr->cache.associativity = cache_size / (cache_sets * cache_line_size);
2743   else
2744     c->attr->cache.associativity = 0;
2745   c->cpuset = hwloc_bitmap_dup(cpuset);
2746   hwloc_debug_2args_bitmap("cache (%s) depth %u has cpuset %s\n",
2747                            ctype == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (ctype == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
2748                            level, c->cpuset);
2749   hwloc_insert_object_by_cpuset(topology, c);
2750 }
2751 
2752 static void
2753 try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
2754                                    struct hwloc_linux_backend_data_s *data,
2755                                    const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
2756 {
2757   /* d-cache-block-size - ignore */
2758   /* d-cache-line-size - to read, in bytes */
2759   /* d-cache-sets - ignore */
2760   /* d-cache-size - to read, in bytes */
2761   /* i-cache, same for instruction */
2762   /* cache-unified only exists if data and instruction caches are unified */
2763   /* d-tlb-sets - ignore */
2764   /* d-tlb-size - ignore, always 0 on power6 */
2765   /* i-tlb-*, same */
2766   uint32_t d_cache_line_size = 0, d_cache_size = 0, d_cache_sets = 0;
2767   uint32_t i_cache_line_size = 0, i_cache_size = 0, i_cache_sets = 0;
2768   char unified_path[1024];
2769   struct stat statbuf;
2770   int unified;
2771 
2772   snprintf(unified_path, sizeof(unified_path), "%s/cache-unified", cpu);
2773   unified = (hwloc_stat(unified_path, &statbuf, data->root_fd) == 0);
2774 
2775   hwloc_read_unit32be(cpu, "d-cache-line-size", &d_cache_line_size,
2776       data->root_fd);
2777   hwloc_read_unit32be(cpu, "d-cache-size", &d_cache_size,
2778       data->root_fd);
2779   hwloc_read_unit32be(cpu, "d-cache-sets", &d_cache_sets,
2780       data->root_fd);
2781   hwloc_read_unit32be(cpu, "i-cache-line-size", &i_cache_line_size,
2782       data->root_fd);
2783   hwloc_read_unit32be(cpu, "i-cache-size", &i_cache_size,
2784       data->root_fd);
2785   hwloc_read_unit32be(cpu, "i-cache-sets", &i_cache_sets,
2786       data->root_fd);
2787 
2788   if (!unified)
2789     try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
2790                                         i_cache_line_size, i_cache_size, i_cache_sets, cpuset);
2791   try__add_cache_from_device_tree_cpu(topology, level, unified ? HWLOC_OBJ_CACHE_UNIFIED : HWLOC_OBJ_CACHE_DATA,
2792                                       d_cache_line_size, d_cache_size, d_cache_sets, cpuset);
2793 }
2794 
2795 /*
2796  * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
2797  * which provide NUMA nodes information without any details
2798  */
2799 static void
2800 look_powerpc_device_tree(struct hwloc_topology *topology,
2801                          struct hwloc_linux_backend_data_s *data)
2802 {
2803   device_tree_cpus_t cpus;
2804   const char ofroot[] = "/proc/device-tree/cpus";
2805   unsigned int i;
2806   int root_fd = data->root_fd;
2807   DIR *dt = hwloc_opendir(ofroot, root_fd);
2808   struct dirent *dirent;
2809 
2810   if (NULL == dt)
2811     return;
2812 
2813   /* only works for Power so far, and not useful on ARM */
2814   if (data->arch != HWLOC_LINUX_ARCH_POWER) {
2815     closedir(dt);
2816     return;
2817   }
2818 
2819   cpus.n = 0;
2820   cpus.p = NULL;
2821   cpus.allocated = 0;
2822 
2823   while (NULL != (dirent = readdir(dt))) {
2824     char cpu[256];
2825     char *device_type;
2826     uint32_t reg = -1, l2_cache = -1, phandle = -1;
2827     int err;
2828 
2829     if ('.' == dirent->d_name[0])
2830       continue;
2831 
2832     err = snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
2833     if ((size_t) err >= sizeof(cpu))
2834       continue;
2835 
2836     device_type = hwloc_read_str(cpu, "device_type", root_fd);
2837     if (NULL == device_type)
2838       continue;
2839 
2840     hwloc_read_unit32be(cpu, "reg", &reg, root_fd);
2841     if (hwloc_read_unit32be(cpu, "next-level-cache", &l2_cache, root_fd) == -1)
2842       hwloc_read_unit32be(cpu, "l2-cache", &l2_cache, root_fd);
2843     if (hwloc_read_unit32be(cpu, "phandle", &phandle, root_fd) == -1)
2844       if (hwloc_read_unit32be(cpu, "ibm,phandle", &phandle, root_fd) == -1)
2845         hwloc_read_unit32be(cpu, "linux,phandle", &phandle, root_fd);
2846 
2847     if (0 == strcmp(device_type, "cache")) {
2848       add_device_tree_cpus_node(&cpus, NULL, l2_cache, phandle, dirent->d_name);
2849     }
2850     else if (0 == strcmp(device_type, "cpu")) {
2851       /* Found CPU */
2852       hwloc_bitmap_t cpuset = NULL;
2853       size_t cb = 0;
2854       uint32_t *threads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s", &cb, root_fd);
2855       uint32_t nthreads = cb / sizeof(threads[0]);
2856 
2857       if (NULL != threads) {
2858         cpuset = hwloc_bitmap_alloc();
2859         for (i = 0; i < nthreads; ++i) {
2860           if (hwloc_bitmap_isset(topology->levels[0][0]->complete_cpuset, ntohl(threads[i])))
2861             hwloc_bitmap_set(cpuset, ntohl(threads[i]));
2862         }
2863         free(threads);
2864       } else if ((unsigned int)-1 != reg) {
2865         /* Doesn't work on ARM because cpu "reg" values do not start at 0.
2866          * We know the first cpu "reg" is the lowest. The others are likely
2867          * in order assuming the device-tree shows objects in order.
2868          */
2869         cpuset = hwloc_bitmap_alloc();
2870         hwloc_bitmap_set(cpuset, reg);
2871       }
2872 
2873       if (NULL == cpuset) {
2874         hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
2875       } else {
2876         struct hwloc_obj *core = NULL;
2877         add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);
2878 
2879         if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
2880           /* Add core */
2881           core = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, (unsigned) reg);
2882           core->cpuset = hwloc_bitmap_dup(cpuset);
2883           hwloc_insert_object_by_cpuset(topology, core);
2884         }
2885 
2886         /* Add L1 cache */
2887         try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);
2888 
2889         hwloc_bitmap_free(cpuset);
2890       }
2891     }
2892     free(device_type);
2893   }
2894   closedir(dt);
2895 
2896   /* No cores or caches were found, exiting */
2897   if (0 == cpus.n) {
2898     hwloc_debug("No cores or caches were found in %s, exiting\n", ofroot);
2899     return;
2900   }
2901 
2902 #ifdef HWLOC_DEBUG
2903   for (i = 0; i < cpus.n; ++i) {
2904     hwloc_debug("%u: %s  ibm,phandle=%08X l2_cache=%08X ",
2905       i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
2906     if (NULL == cpus.p[i].cpuset) {
2907       hwloc_debug("%s\n", "no cpuset");
2908     } else {
2909       hwloc_debug_bitmap("cpuset %s\n", cpus.p[i].cpuset);
2910     }
2911   }
2912 #endif
2913 
2914   /* Scan L2/L3/... caches */
2915   for (i = 0; i < cpus.n; ++i) {
2916     unsigned int level = 2;
2917     hwloc_bitmap_t cpuset;
2918     /* Skip real CPUs */
2919     if (NULL != cpus.p[i].cpuset)
2920       continue;
2921 
2922     /* Calculate cache level and CPU mask */
2923     cpuset = hwloc_bitmap_alloc();
2924     if (0 == look_powerpc_device_tree_discover_cache(&cpus,
2925           cpus.p[i].phandle, &level, cpuset)) {
2926       char cpu[256];
2927       snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, cpus.p[i].name);
2928       try_add_cache_from_device_tree_cpu(topology, data, cpu, level, cpuset);
2929     }
2930     hwloc_bitmap_free(cpuset);
2931   }
2932 
2933   /* Do cleanup */
2934   for (i = 0; i < cpus.n; ++i) {
2935     hwloc_bitmap_free(cpus.p[i].cpuset);
2936     free(cpus.p[i].name);
2937   }
2938   free(cpus.p);
2939 }
2940 
2941 struct knl_hwdata {
2942   char memory_mode[32];
2943   char cluster_mode[32];
2944   long long int mcdram_cache_size; /* mcdram_cache_* is valid only if size > 0 */
2945   int mcdram_cache_associativity;
2946   int mcdram_cache_inclusiveness;
2947   int mcdram_cache_line_size;
2948 };
2949 
2950 /* Try to handle knl hwdata properties
2951  * Returns 0 on success and -1 otherwise */
2952 static int hwloc_linux_try_handle_knl_hwdata_properties(struct hwloc_linux_backend_data_s *data,
2953                                                         struct knl_hwdata *hwdata,
2954                                                         unsigned DDR_nbnodes,
2955                                                         unsigned long DDR_numa_size,
2956                                                         unsigned MCDRAM_nbnodes,
2957                                                         unsigned long MCDRAM_numa_size)
2958 {
2959   char *knl_cache_file;
2960   int version = 0;
2961   char buffer[512] = {0};
2962   char *data_beg = NULL;
2963   char * fallback_env = getenv("HWLOC_KNL_HDH_FALLBACK");
2964   int fallback = fallback_env ? atoi(fallback_env) : -1; /* by default, only fallback if needed */
2965 
2966   hwdata->memory_mode[0] = '\0';
2967   hwdata->cluster_mode[0] = '\0';
2968   hwdata->mcdram_cache_size = -1;
2969   hwdata->mcdram_cache_associativity = -1;
2970   hwdata->mcdram_cache_inclusiveness = -1;
2971   hwdata->mcdram_cache_line_size = -1;
2972 
2973   if (fallback == 1) {
2974     hwloc_debug("KNL dumped hwdata ignored, forcing fallback\n");
2975     goto fallback;
2976   }
2977 
2978   if (asprintf(&knl_cache_file, "%s/knl_memoryside_cache", data->dumped_hwdata_dirname) < 0)
2979     goto fallback;
2980 
2981   hwloc_debug("Reading knl cache data from: %s\n", knl_cache_file);
2982   if (hwloc_read_path_by_length(knl_cache_file, buffer, sizeof(buffer), data->root_fd) < 0) {
2983     hwloc_debug("Unable to open KNL data file `%s' (%s)\n", knl_cache_file, strerror(errno));
2984     free(knl_cache_file);
2985     goto fallback;
2986   }
2987   free(knl_cache_file);
2988 
2989   data_beg = &buffer[0];
2990 
2991   /* file must start with version information */
2992   if (sscanf(data_beg, "version: %d", &version) != 1) {
2993     fprintf(stderr, "Invalid knl_memoryside_cache header, expected \"version: <int>\".\n");
2994     goto fallback;
2995   }
2996 
2997   while (1) {
2998     char *line_end = strstr(data_beg, "\n");
2999     if (!line_end)
3000         break;
3001     if (version >= 1) {
3002       if (!strncmp("cache_size:", data_beg, strlen("cache_size"))) {
3003           sscanf(data_beg, "cache_size: %lld", &hwdata->mcdram_cache_size);
3004           hwloc_debug("read cache_size=%lld\n", hwdata->mcdram_cache_size);
3005       } else if (!strncmp("line_size:", data_beg, strlen("line_size:"))) {
3006           sscanf(data_beg, "line_size: %d", &hwdata->mcdram_cache_line_size);
3007           hwloc_debug("read line_size=%d\n", hwdata->mcdram_cache_line_size);
3008       } else if (!strncmp("inclusiveness:", data_beg, strlen("inclusiveness:"))) {
3009           sscanf(data_beg, "inclusiveness: %d", &hwdata->mcdram_cache_inclusiveness);
3010           hwloc_debug("read inclusiveness=%d\n", hwdata->mcdram_cache_inclusiveness);
3011       } else if (!strncmp("associativity:", data_beg, strlen("associativity:"))) {
3012           sscanf(data_beg, "associativity: %d\n", &hwdata->mcdram_cache_associativity);
3013           hwloc_debug("read associativity=%d\n", hwdata->mcdram_cache_associativity);
3014       }
3015     }
3016     if (version >= 2) {
3017       if (!strncmp("cluster_mode: ", data_beg, strlen("cluster_mode: "))) {
3018         size_t length;
3019         data_beg += strlen("cluster_mode: ");
3020         length = line_end-data_beg;
3021         if (length > sizeof(hwdata->cluster_mode)-1)
3022           length = sizeof(hwdata->cluster_mode)-1;
3023         memcpy(hwdata->cluster_mode, data_beg, length);
3024         hwdata->cluster_mode[length] = '\0';
3025         hwloc_debug("read cluster_mode=%s\n", hwdata->cluster_mode);
3026       } else if (!strncmp("memory_mode: ", data_beg, strlen("memory_mode: "))) {
3027         size_t length;
3028         data_beg += strlen("memory_mode: ");
3029         length = line_end-data_beg;
3030         if (length > sizeof(hwdata->memory_mode)-1)
3031           length = sizeof(hwdata->memory_mode)-1;
3032         memcpy(hwdata->memory_mode, data_beg, length);
3033         hwdata->memory_mode[length] = '\0';
3034         hwloc_debug("read memory_mode=%s\n", hwdata->memory_mode);
3035       }
3036     }
3037 
3038     data_beg = line_end + 1;
3039   }
3040 
3041   if (hwdata->mcdram_cache_size == -1
3042       || hwdata->mcdram_cache_line_size == -1
3043       || hwdata->mcdram_cache_associativity == -1
3044       || hwdata->mcdram_cache_inclusiveness == -1) {
3045     hwloc_debug("Incorrect file format cache_size=%lld line_size=%d associativity=%d inclusiveness=%d\n",
3046                 hwdata->mcdram_cache_size,
3047                 hwdata->mcdram_cache_line_size,
3048                 hwdata->mcdram_cache_associativity,
3049                 hwdata->mcdram_cache_inclusiveness);
3050     hwdata->mcdram_cache_size = -1; /* mark cache as invalid */
3051   }
3052 
3053   return 0;
3054 
3055  fallback:
3056   if (fallback == 0) {
3057     hwloc_debug("KNL hwdata fallback disabled\n");
3058     return -1;
3059   }
3060 
3061   hwloc_debug("Falling back to a heuristic\n");
3062 
3063   /* there can be 0 MCDRAM_nbnodes, but we must have at least one DDR node (not cpuless) */
3064   assert(DDR_nbnodes);
3065   /* there are either no MCDRAM nodes, or as many as DDR nodes */
3066   assert(!MCDRAM_nbnodes || MCDRAM_nbnodes == DDR_nbnodes);
3067 
3068   if (!MCDRAM_nbnodes && DDR_numa_size <= 16UL*1024*1024*1024) {
3069     /* We only found DDR numa nodes, but they are <=16GB.
3070      * It could be a DDR-less KNL where numa nodes are actually MCDRAM, we can't know for sure.
3071      * Both cases are unlikely, disable the heuristic for now.
3072      *
3073      * In theory we could check if DDR_numa_size == 8/12/16GB exactly (amount of MCDRAM numa size in H50/H25/Flat modes),
3074      * but that's never the case since some kilobytes are always stolen by the system.
3075      */
3076     hwloc_debug("Cannot guess if MCDRAM is in Cache or if the node is DDR-less (total NUMA node size %lu)\n",
3077                 DDR_numa_size);
3078     return -1;
3079   }
3080 
3081   /* all commercial KNL/KNM have 16GB of MCDRAM */
3082   unsigned long total_cache_size = 16UL*1024*1024*1024 - MCDRAM_numa_size;
3083 
3084   if (!MCDRAM_nbnodes) {
3085     strcpy(hwdata->memory_mode, "Cache");
3086   } else {
3087     if (!total_cache_size)
3088       strcpy(hwdata->memory_mode, "Flat");
3089     else if (total_cache_size == 8UL*1024*1024*1024)
3090       strcpy(hwdata->memory_mode, "Hybrid50");
3091     else if (total_cache_size == 4UL*1024*1024*1024)
3092       strcpy(hwdata->memory_mode, "Hybrid25");
3093     else
3094       fprintf(stderr, "Unexpected KNL MCDRAM cache size %lu\n", total_cache_size);
3095   }
3096   if (DDR_nbnodes == 4) {
3097     strcpy(hwdata->cluster_mode, "SNC4");
3098   } else if (DDR_nbnodes == 2) {
3099     strcpy(hwdata->cluster_mode, "SNC2");
3100   } else if (DDR_nbnodes == 1) {
3101     /* either Quadrant, All2All or Hemisphere */
3102   } else {
3103     fprintf(stderr, "Unexpected number of KNL non-MCDRAM NUMA nodes %u\n", DDR_nbnodes);
3104   }
3105 
3106   hwdata->mcdram_cache_size = total_cache_size/DDR_nbnodes;
3107   hwdata->mcdram_cache_associativity = 1;
3108   hwdata->mcdram_cache_inclusiveness = 1;
3109   hwdata->mcdram_cache_line_size = 64;
3110 
3111   return 0;
3112 }
3113 
3114 
3115 
3116 /**************************************
3117  ****** Sysfs Topology Discovery ******
3118  **************************************/
3119 
3120 static unsigned *
3121 list_sysfsnode(struct hwloc_linux_backend_data_s *data,
3122                const char *path,
3123                unsigned *nbnodesp)
3124 {
3125   DIR *dir;
3126   unsigned osnode, nbnodes = 0;
3127   unsigned *indexes, index_;
3128   hwloc_bitmap_t nodeset;
3129   struct dirent *dirent;
3130 
3131   /* Get the list of nodes first */
3132   dir = hwloc_opendir(path, data->root_fd);
3133   if (!dir)
3134     return NULL;
3135 
3136   nodeset = hwloc_bitmap_alloc();
3137   if (!nodeset)
3138     return NULL;
3139 
3140   while ((dirent = readdir(dir)) != NULL) {
3141     if (strncmp(dirent->d_name, "node", 4))
3142       continue;
3143     osnode = strtoul(dirent->d_name+4, NULL, 0);
3144     hwloc_bitmap_set(nodeset, osnode);
3145     nbnodes++;
3146   }
3147   closedir(dir);
3148 
3149   indexes = calloc(nbnodes, sizeof(*indexes));
3150   if (!indexes) {
3151     hwloc_bitmap_free(nodeset);
3152     return NULL;
3153   }
3154 
3155   /* we don't know whether sysfs returns nodes in order, so we can't merge these two loops */
3156 
3157   /* Unsparsify node indexes.
3158    * We'll need this mapping later because the sysfs distance files only list
3159    * distances for existing nodes, in increasing OS index order.
3160    * It also simplifies things in the meantime.
3161    */
3162   index_ = 0;
3163   hwloc_bitmap_foreach_begin (osnode, nodeset) {
3164     indexes[index_] = osnode;
3165     index_++;
3166   } hwloc_bitmap_foreach_end();
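       /* e.g. if only OS nodes 0, 2 and 5 exist, indexes[] = {0,2,5},
        * i.e. dense index i maps back to OS node indexes[i] */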
3167 
3168   hwloc_bitmap_free(nodeset);
3169 
3170 #ifdef HWLOC_DEBUG
3171   hwloc_debug("%s", "NUMA indexes: ");
3172   for (index_ = 0; index_ < nbnodes; index_++)
3173     hwloc_debug(" %u", indexes[index_]);
3174   hwloc_debug("%s", "\n");
3175 #endif
3176 
3177   *nbnodesp = nbnodes;
3178   return indexes;
3179 }
3180 
3181 static int
3182 look_sysfsnode(struct hwloc_topology *topology,
3183                struct hwloc_linux_backend_data_s *data,
3184                const char *path, unsigned *found)
3185 {
3186   unsigned osnode;
3187   unsigned nbnodes;
3188   hwloc_obj_t * nodes;
3189   unsigned *indexes;
3190   uint64_t * distances;
3191   hwloc_bitmap_t nodes_cpuset;
3192   struct knl_hwdata knl_hwdata;
3193   int failednodes = 0;
3194   unsigned i;
3195 
3196   /* NUMA nodes cannot be filtered out */
3197   indexes = list_sysfsnode(data, path, &nbnodes);
3198   if (!indexes)
3199     return 0;
3200 
3201   nodes = calloc(nbnodes,
3202                  sizeof(hwloc_obj_t));
3203   distances = malloc(nbnodes*nbnodes*sizeof(*distances));
3204   nodes_cpuset  = hwloc_bitmap_alloc();
3205   if (NULL == nodes_cpuset || NULL == nodes || NULL == distances) {
3206     free(nodes);
3207     free(indexes);
3208     free(distances);
3209     hwloc_bitmap_free(nodes_cpuset);
3210     nbnodes = 0;
3211     goto out;
3212   }
3213 
3214       /* Create NUMA objects */
3215       for (i = 0; i < nbnodes; i++) {
3216           hwloc_obj_t node;
3217           int annotate;
3218 
3219           osnode = indexes[i];
3220 
3221           node = hwloc_get_numanode_obj_by_os_index(topology, osnode);
3222           annotate = (node != NULL);
3223           if (!annotate) {
3224             /* create a new node */
3225             char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
3226             hwloc_bitmap_t cpuset;
3227             sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
3228             cpuset = hwloc__alloc_read_path_as_cpumask(nodepath, data->root_fd);
3229             if (!cpuset) {
3230               /* This NUMA object won't be inserted, we'll ignore distances */
3231               failednodes++;
3232               continue;
3233             }
3234             if (hwloc_bitmap_intersects(nodes_cpuset, cpuset)) {
3235               /* crazy BIOS with overlapping NUMA node cpusets, impossible on Linux so far */
3236               hwloc_debug_1arg_bitmap("node P#%u cpuset %s intersects with previous nodes, ignoring that node.\n", osnode, cpuset);
3237               hwloc_bitmap_free(cpuset);
3238               failednodes++;
3239               continue;
3240             } else {
3241               hwloc_bitmap_or(nodes_cpuset, nodes_cpuset, cpuset);
3242             }
3243 
3244             node = hwloc_alloc_setup_object(topology, HWLOC_OBJ_NUMANODE, osnode);
3245             node->cpuset = cpuset;
3246             node->nodeset = hwloc_bitmap_alloc();
3247             hwloc_bitmap_set(node->nodeset, osnode);
3248           }
3249           hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->attr->numanode);
3250 
3251           nodes[i] = node;
3252           hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
3253                                   osnode, node->cpuset);
3254       }
3255       topology->support.discovery->numa = 1;
3256       topology->support.discovery->numa_memory = 1;
3257 
3258       hwloc_bitmap_free(nodes_cpuset);
3259 
3260       if (failednodes || nbnodes <= 1) {
3261         /* failed to read/create some nodes, don't bother reading/fixing
3262          * a distance matrix that would likely be wrong anyway.
3263          */
3264         free(distances);
3265         distances = NULL;
3266       }
3267 
3268       if (distances && hwloc_parse_nodes_distances(path, nbnodes, indexes, distances, data->root_fd) < 0) {
3269         free(distances);
3270         distances = NULL;
3271       }
3272 
3273       free(indexes);
3274 
3275       unsigned nr_knl_clusters = 0;
3276       hwloc_obj_t knl_clusters[4]= { NULL, NULL, NULL, NULL };
3277       int node_knl_cluster[8] = { -1, -1, -1, -1, -1, -1, -1, -1};
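           /* these fixed-size arrays assume at most 8 KNL NUMA nodes
            * (up to 4 DDR + 4 MCDRAM in SNC4), i.e. at most 4 DDR+MCDRAM clusters */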
3278 
3279       if (data->is_knl && !failednodes) {
3280         char *env = getenv("HWLOC_KNL_NUMA_QUIRK");
3281         int noquirk = (env && !atoi(env)) || !distances || !hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP);
3282         int mscache;
3283         unsigned j, closest;
3284         unsigned long MCDRAM_numa_size, DDR_numa_size;
3285         unsigned MCDRAM_nbnodes, DDR_nbnodes;
3286 
3287         DDR_numa_size = 0;
3288         DDR_nbnodes = 0;
3289         MCDRAM_numa_size = 0;
3290         MCDRAM_nbnodes = 0;
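             /* classify NUMA nodes: on KNL, MCDRAM nodes have no local CPUs (empty cpuset)
              * while DDR nodes carry the cores */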
3291         for(i=0; i<nbnodes; i++)
3292           if (hwloc_bitmap_iszero(nodes[i]->cpuset)) {
3293             MCDRAM_numa_size += nodes[i]->attr->numanode.local_memory;
3294             MCDRAM_nbnodes++;
3295           } else {
3296             DDR_numa_size += nodes[i]->attr->numanode.local_memory;
3297             DDR_nbnodes++;
3298           }
3299         assert(DDR_nbnodes + MCDRAM_nbnodes == nbnodes);
3300 
3301         hwloc_linux_try_handle_knl_hwdata_properties(data, &knl_hwdata,
3302                                                      DDR_nbnodes, DDR_numa_size,
3303                                                      MCDRAM_nbnodes, MCDRAM_numa_size);
3304         mscache = knl_hwdata.mcdram_cache_size > 0 && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L3CACHE);
3305 
3306         if (knl_hwdata.cluster_mode[0])
3307           hwloc_obj_add_info(topology->levels[0][0], "ClusterMode", knl_hwdata.cluster_mode);
3308         if (knl_hwdata.memory_mode[0])
3309           hwloc_obj_add_info(topology->levels[0][0], "MemoryMode", knl_hwdata.memory_mode);
3310 
3311         for(i=0; i<nbnodes; i++) {
3312           if (!hwloc_bitmap_iszero(nodes[i]->cpuset)) {
3313             /* DDR, see if there's a MCDRAM cache to add */
3314             if (mscache) {
3315               hwloc_obj_t cache = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L3CACHE, HWLOC_UNKNOWN_INDEX);
3316               if (cache) {
3317                 cache->attr->cache.depth = 3;
3318                 cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
3319                 cache->attr->cache.size = knl_hwdata.mcdram_cache_size;
3320                 cache->attr->cache.linesize = knl_hwdata.mcdram_cache_line_size;
3321                 cache->attr->cache.associativity = knl_hwdata.mcdram_cache_associativity;
3322                 hwloc_obj_add_info(cache, "Inclusive", knl_hwdata.mcdram_cache_inclusiveness ? "1" : "0");
3323                 cache->cpuset = hwloc_bitmap_dup(nodes[i]->cpuset);
3324                 cache->nodeset = hwloc_bitmap_dup(nodes[i]->nodeset); /* only applies to DDR */
3325                 cache->subtype = strdup("MemorySideCache");
3326                 hwloc_insert_object_by_cpuset(topology, cache);
3327               }
3328             }
3329             /* nothing else to do for DDR */
3330             continue;
3331           }
3332           /* MCDRAM */
3333           nodes[i]->subtype = strdup("MCDRAM");
3334 
3335           if (noquirk)
3336             continue;
3337 
3338           /* DDR is the closest node with CPUs */
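               /* distances[] is a dense nbnodes x nbnodes row-major matrix,
                * so distances[i*nbnodes+j] is the latency from node i to node j */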
3339           closest = (unsigned)-1;
3340           for(j=0; j<nbnodes; j++) {
3341             if (j==i)
3342               continue;
3343             if (hwloc_bitmap_iszero(nodes[j]->cpuset))
3344               /* nodes without CPU, that's another MCDRAM, skip it */
3345               continue;
3346             if (closest == (unsigned)-1 || distances[i*nbnodes+j]<distances[i*nbnodes+closest])
3347               closest = j;
3348           }
3349           if (closest != (unsigned) -1) {
3350             /* Change MCDRAM cpuset to DDR cpuset for clarity.
3351              * Not actually useful if we insert with hwloc__attach_memory_object() below.
3352              * The cpuset will be updated by the core later anyway.
3353              */
3354             hwloc_bitmap_copy(nodes[i]->cpuset, nodes[closest]->cpuset);
3355             /* Add a Group for Cluster containing this MCDRAM + DDR */
3356             hwloc_obj_t cluster = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
3357             hwloc_obj_add_other_obj_sets(cluster, nodes[i]);
3358             hwloc_obj_add_other_obj_sets(cluster, nodes[closest]);
3359             cluster->subtype = strdup("Cluster");
3360             cluster->attr->group.kind = HWLOC_GROUP_KIND_INTEL_KNL_SUBNUMA_CLUSTER;
3361             knl_clusters[nr_knl_clusters] = cluster;
3362             node_knl_cluster[i] = nr_knl_clusters;
3363             node_knl_cluster[closest] = nr_knl_clusters;
3364             nr_knl_clusters++;
3365           }
3366         }
3367         if (!noquirk) {
3368           /* drop the distance matrix, it contradicts the above NUMA layout groups */
3369           free(distances);
3370           distances = NULL;
3371         }
3372       }
3373 
3374       /* everything is ready for insertion now */
3375 
3376       /* insert knl clusters */
3377       if (data->is_knl) {
3378         for(i=0; i<nr_knl_clusters; i++) {
3379           knl_clusters[i] = hwloc_insert_object_by_cpuset(topology, knl_clusters[i]);
3380           /* failure or replace can be ignored */
3381         }
3382       }
3383 
3384       /* insert actual numa nodes */
3385       for (i = 0; i < nbnodes; i++) {
3386         hwloc_obj_t node = nodes[i];
3387         if (node) {
3388           hwloc_obj_t res_obj;
3389           if (data->is_knl && node_knl_cluster[i] != -1) {
3390             /* directly attach to the existing cluster */
3391             hwloc_obj_t parent = knl_clusters[node_knl_cluster[i]];
3392             res_obj = hwloc__attach_memory_object(topology, parent, node, hwloc_report_os_error);
3393           } else {
3394             /* we don't know where to attach, let the core find or insert if needed */
3395             res_obj = hwloc__insert_object_by_cpuset(topology, NULL, node, hwloc_report_os_error);
3396           }
3397           if (res_obj != node)
3398             /* This NUMA node got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
3399              * This object disappeared, we'll ignore distances */
3400             failednodes++;
3401         }
3402       }
3403 
3404       if (failednodes) {
3405         free(distances);
3406         distances = NULL;
3407       }
3408 
3409       /* Insert distances now that nodes are properly inserted */
3410       if (distances)
3411         hwloc_internal_distances_add(topology, nbnodes, nodes, distances,
3412                                      HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_MEANS_LATENCY,
3413                                      HWLOC_DISTANCES_ADD_FLAG_GROUP);
3414       else
3415         free(nodes);
3416 
3417  out:
3418   *found = nbnodes - failednodes;
3419   return 0;
3420 }
3421 
3422 /* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
3423 static int
3424 look_sysfscpu(struct hwloc_topology *topology,
3425               struct hwloc_linux_backend_data_s *data,
3426               const char *path,
3427               struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
3428 {
3429   hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
3430 #define CPU_TOPOLOGY_STR_LEN 128
3431   char str[CPU_TOPOLOGY_STR_LEN];
3432   DIR *dir;
3433   int i,j;
3434   unsigned caches_added, merge_buggy_core_siblings;
3435   hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
3436   int threadwithcoreid = data->is_amd_with_CU ? -1 : 0; /* -1 means we don't know yet if threads have their own coreids within thread_siblings */
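       /* threadwithcoreid: -1 = unknown yet, 0 = siblings share one core id (SMT),
        * 1 = each sibling has its own core id (AMD compute units), in which case
        * one Core object is created per thread below */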
3437 
3438   /* fill the cpuset of interesting cpus */
3439   dir = hwloc_opendir(path, data->root_fd);
3440   if (!dir)
3441     return -1;
3442   else {
3443     struct dirent *dirent;
3444     cpuset = hwloc_bitmap_alloc();
3445 
3446     while ((dirent = readdir(dir)) != NULL) {
3447       unsigned long cpu;
3448       char online[2];
3449 
3450       if (strncmp(dirent->d_name, "cpu", 3))
3451         continue;
3452       cpu = strtoul(dirent->d_name+3, NULL, 0);
3453 
3454       /* Even if we have no topology information for this CPU, it at least exists */
3455       hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
3456 
3457       /* check whether this processor is online */
3458       sprintf(str, "%s/cpu%lu/online", path, cpu);
3459       if (hwloc_read_path_by_length(str, online, sizeof(online), data->root_fd) == 0) {
3460         if (!atoi(online)) {
3461           hwloc_debug("os proc %lu is offline\n", cpu);
3462           continue;
3463         }
3464       }
3465 
3466       /* check whether the kernel exports topology information for this cpu */
3467       sprintf(str, "%s/cpu%lu/topology", path, cpu);
3468       if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
3469         hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
3470                    cpu, path, cpu);
3471         continue;
3472       }
3473 
3474       hwloc_bitmap_set(cpuset, cpu);
3475     }
3476     closedir(dir);
3477   }
3478 
3479   topology->support.discovery->pu = 1;
3480   hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
3481              hwloc_bitmap_weight(cpuset), cpuset);
3482 
3483   merge_buggy_core_siblings = (data->arch == HWLOC_LINUX_ARCH_X86);
3484   caches_added = 0;
3485   hwloc_bitmap_foreach_begin(i, cpuset) {
3486     hwloc_bitmap_t packageset, coreset, bookset, threadset;
3487     int tmpint;
3488 
3489     if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
3490       /* look at the package */
3491       sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
3492       packageset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3493       if (packageset) {
3494         hwloc_bitmap_and(packageset, packageset, cpuset);
3495         if (hwloc_bitmap_first(packageset) == i) {
3496           /* first cpu in this package, add the package */
3497           struct hwloc_obj *package;
3498           unsigned mypackageid;
3499           mypackageid = (unsigned) -1;
3500           sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i); /* contains %d at least up to 4.9 */
3501           if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3502             mypackageid = (unsigned) tmpint;
3503 
3504           if (merge_buggy_core_siblings) {
3505             /* check for another package with same physical_package_id */
3506             hwloc_obj_t curpackage = packages;
3507             while (curpackage) {
3508               if (curpackage->os_index == mypackageid) {
3509                 /* found another package with same physical_package_id but different core_siblings.
3510                  * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
3511                  * merge these core_siblings to extend the existing first package object.
3512                  */
3513                 static int reported = 0;
3514                 if (!reported && !hwloc_hide_errors()) {
3515                   char *a, *b;
3516                   hwloc_bitmap_asprintf(&a, curpackage->cpuset);
3517                   hwloc_bitmap_asprintf(&b, packageset);
3518                   fprintf(stderr, "****************************************************************************\n");
3519                   fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
3520                   fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
3521                           mypackageid, a, b);
3522                   fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
3523                   fprintf(stderr, "* does not support this processor correctly.\n");
3524                   fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
3525                   fprintf(stderr, "*\n");
3526                   fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
3527                   fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
3528                   fprintf(stderr, "* along with the files generated by the hwloc-gather-topology script.\n");
3529                   fprintf(stderr, "****************************************************************************\n");
3530                   reported = 1;
3531                   free(a);
3532                   free(b);
3533                 }
3534                 hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
3535                 goto package_done;
3536               }
3537               curpackage = curpackage->next_cousin;
3538             }
3539           }
3540 
3541           /* no package with same physical_package_id, create a new one */
3542           package = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, mypackageid);
3543           package->cpuset = packageset;
3544           hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
3545                                   mypackageid, packageset);
3546           /* add cpuinfo */
3547           if (cpuinfo_Lprocs) {
3548             for(j=0; j<(int) cpuinfo_numprocs; j++)
3549               if ((int) cpuinfo_Lprocs[j].Pproc == i) {
3550                 hwloc__move_infos(&package->infos, &package->infos_count,
3551                                   &cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
3552               }
3553           }
3554           /* insert into a temporary list in case we have to modify the cpuset by merging other core_siblings later.
3555            * we'll actually insert it into the tree at the end of the entire sysfs cpu loop.
3556            */
3557           package->next_cousin = packages;
3558           packages = package;
3559 
3560           packageset = NULL; /* don't free it */
3561         }
3562       package_done:
3563         hwloc_bitmap_free(packageset);
3564       }
3565     }
3566 
3567     if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
3568       /* look at the core */
3569       sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
3570       coreset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3571       if (coreset) {
3572         unsigned mycoreid;
3573         int gotcoreid = 0; /* to avoid reading the coreid twice */
3574         hwloc_bitmap_and(coreset, coreset, cpuset);
3575         if (hwloc_bitmap_weight(coreset) > 1 && threadwithcoreid == -1) {
3576           /* check if this is hyper-threading or different coreids */
3577           unsigned siblingid, siblingcoreid;
3578 
3579           mycoreid = (unsigned) -1;
3580           sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
3581           if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3582             mycoreid = (unsigned) tmpint;
3583           gotcoreid = 1;
3584 
3585           siblingid = hwloc_bitmap_first(coreset);
3586           if (siblingid == (unsigned) i)
3587             siblingid = hwloc_bitmap_next(coreset, i);
3588           siblingcoreid = (unsigned) -1;
3589           sprintf(str, "%s/cpu%u/topology/core_id", path, siblingid); /* contains %d at least up to 4.9 */
3590           if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3591             siblingcoreid = (unsigned) tmpint;
3592           threadwithcoreid = (siblingcoreid != mycoreid);
3593         }
3594         if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
3595           /* regular core */
3596           struct hwloc_obj *core;
3597 
3598           if (!gotcoreid) {
3599             mycoreid = (unsigned) -1;
3600             sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
3601             if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3602               mycoreid = (unsigned) tmpint;
3603           }
3604 
3605           core = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, mycoreid);
3606           if (threadwithcoreid)
3607             /* amd multicore compute-unit, create one core per thread */
3608             hwloc_bitmap_only(coreset, i);
3609           core->cpuset = coreset;
3610           hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
3611                                   mycoreid, core->cpuset);
3612           hwloc_insert_object_by_cpuset(topology, core);
3613           coreset = NULL; /* don't free it */
3614         }
3615         hwloc_bitmap_free(coreset);
3616       }
3617     }
3618 
3619     if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
3620       /* look at the books */
3621       sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
3622       bookset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3623       if (bookset) {
3624         hwloc_bitmap_and(bookset, bookset, cpuset);
3625         if (hwloc_bitmap_first(bookset) == i) {
3626           struct hwloc_obj *book;
3627           unsigned mybookid;
3628           mybookid = (unsigned) -1;
3629           sprintf(str, "%s/cpu%d/topology/book_id", path, i); /* contains %d at least up to 4.9 */
3630           if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0) {
3631             mybookid = (unsigned) tmpint;
3632 
3633             book = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, mybookid);
3634             book->cpuset = bookset;
3635             hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
3636                                     mybookid, bookset);
3637             book->subtype = strdup("Book");
3638             book->attr->group.kind = HWLOC_GROUP_KIND_S390_BOOK;
3639             hwloc_insert_object_by_cpuset(topology, book);
3640             bookset = NULL; /* don't free it */
3641           }
3642         }
3643         hwloc_bitmap_free(bookset);
3644       }
3645     }
3646 
3647     /* PU cannot be filtered-out */
3648     {
3649       /* look at the thread */
3650       struct hwloc_obj *thread = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, (unsigned) i);
3651       threadset = hwloc_bitmap_alloc();
3652       hwloc_bitmap_only(threadset, i);
3653       thread->cpuset = threadset;
3654       hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
3655                  i, threadset);
3656       hwloc_insert_object_by_cpuset(topology, thread);
3657     }
3658 
3659     /* look at the caches */
3660     for(j=0; j<10; j++) {
3661       char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
3662       hwloc_bitmap_t cacheset;
3663 
3664       sprintf(str, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
3665       cacheset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3666       if (cacheset) {
3667         if (hwloc_bitmap_iszero(cacheset)) {
3668           /* ia64 returning empty L3 and L2i? use the core set instead */
3669           hwloc_bitmap_t tmpset;
3670           sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
3671           tmpset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3672           /* only use it if we actually got something */
3673           if (tmpset) {
3674             hwloc_bitmap_free(cacheset);
3675             cacheset = tmpset;
3676           }
3677         }
3678         hwloc_bitmap_and(cacheset, cacheset, cpuset);
3679 
3680         if (hwloc_bitmap_first(cacheset) == i) {
3681           unsigned kB;
3682           unsigned linesize;
3683           unsigned sets, lines_per_tag;
3684           unsigned depth; /* 1 for L1, .... */
3685           hwloc_obj_cache_type_t ctype = HWLOC_OBJ_CACHE_UNIFIED; /* default */
3686           hwloc_obj_type_t otype;
3687           struct hwloc_obj *cache;
3688 
3689           /* get the cache level depth */
3690           sprintf(str, "%s/cpu%d/cache/index%d/level", path, i, j); /* contains %u at least up to 4.9 */
3691           if (hwloc_read_path_as_uint(str, &depth, data->root_fd) < 0) {
3692             hwloc_bitmap_free(cacheset);
3693             continue;
3694           }
3695 
3696           /* cache type */
3697           sprintf(str, "%s/cpu%d/cache/index%d/type", path, i, j);
3698           if (hwloc_read_path_by_length(str, str2, sizeof(str2), data->root_fd) == 0) {
3699             if (!strncmp(str2, "Data", 4))
3700               ctype = HWLOC_OBJ_CACHE_DATA;
3701             else if (!strncmp(str2, "Unified", 7))
3702               ctype = HWLOC_OBJ_CACHE_UNIFIED;
3703             else if (!strncmp(str2, "Instruction", 11))
3704               ctype = HWLOC_OBJ_CACHE_INSTRUCTION;
3705           }
3706 
3707           otype = hwloc_cache_type_by_depth_type(depth, ctype);
3708           if (otype == HWLOC_OBJ_TYPE_NONE
3709               || !hwloc_filter_check_keep_object_type(topology, otype)) {
3710             hwloc_bitmap_free(cacheset);
3711             continue;
3712           }
3713 
3714           /* FIXME: if Bulldozer/Piledriver, add compute unit Groups when L2/L1i filtered-out */
3715           /* FIXME: if KNL, add tile Groups when L2/L1i filtered-out */
3716 
3717           /* get the cache size */
3718           kB = 0;
3719           sprintf(str, "%s/cpu%d/cache/index%d/size", path, i, j); /* contains %uK at least up to 4.9 */
3720           hwloc_read_path_as_uint(str, &kB, data->root_fd);
3721           /* KNL reports L3 with size=0 and full cpuset in cpuid.
3722            * Skip it here; the KNL MCDRAM quirk in look_sysfsnode() inserts a proper MemorySideCache instead.
3723            */
3724           if (!kB && otype == HWLOC_OBJ_L3CACHE && data->is_knl) {
3725             hwloc_bitmap_free(cacheset);
3726             continue;
3727           }
3728 
3729           /* get the line size */
3730           linesize = 0;
3731           sprintf(str, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j); /* contains %u at least up to 4.9 */
3732           hwloc_read_path_as_uint(str, &linesize, data->root_fd);
3733 
3734           /* get the number of sets and lines per tag.
3735            * don't take the associativity directly in "ways_of_associativity" because
3736            * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
3737            */
3738           sets = 0;
3739           sprintf(str, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j); /* contains %u at least up to 4.9 */
3740           hwloc_read_path_as_uint(str, &sets, data->root_fd);
3741 
3742           lines_per_tag = 1;
3743           sprintf(str, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j); /* contains %u at least up to 4.9 */
3744           hwloc_read_path_as_uint(str, &lines_per_tag, data->root_fd);
3745 
3746           /* first cpu in this cache, add the cache */
3747           cache = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
3748           cache->attr->cache.size = ((uint64_t)kB) << 10;
3749           cache->attr->cache.depth = depth;
3750           cache->attr->cache.linesize = linesize;
3751           cache->attr->cache.type = ctype;
3752           if (!linesize || !lines_per_tag || !sets)
3753             cache->attr->cache.associativity = 0; /* unknown */
3754           else if (sets == 1)
3755             cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
3756           else
3757             cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
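               /* e.g. a 256kB cache with 64B lines, 1 line per tag and 512 sets:
                * (256<<10) / 64 / 1 / 512 = 8-way associative */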
3758           cache->cpuset = cacheset;
3759           hwloc_debug_1arg_bitmap("cache depth %u has cpuset %s\n",
3760                                   depth, cacheset);
3761           hwloc_insert_object_by_cpuset(topology, cache);
3762           cacheset = NULL; /* don't free it */
3763           ++caches_added;
3764         }
3765       }
3766       hwloc_bitmap_free(cacheset);
3767      }
3768 
3769   } hwloc_bitmap_foreach_end();
3770 
3771   /* actually insert in the tree now that package cpusets have been fixed-up */
3772   while (packages) {
3773     hwloc_obj_t next = packages->next_cousin;
3774     packages->next_cousin = NULL;
3775     hwloc_insert_object_by_cpuset(topology, packages);
3776     packages = next;
3777   }
3778 
3779   if (0 == caches_added)
3780     look_powerpc_device_tree(topology, data);
3781 
3782   hwloc_bitmap_free(cpuset);
3783 
3784   return 0;
3785 }
3786 
3787 
3788 
3789 /****************************************
3790  ****** cpuinfo Topology Discovery ******
3791  ****************************************/
3792 
3793 static int
3794 hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
3795                               struct hwloc_info_s **infos, unsigned *infos_count,
3796                               int is_global __hwloc_attribute_unused)
3797 {
3798   if (!strcmp("vendor_id", prefix)) {
3799     if (value[0])
3800       hwloc__add_info(infos, infos_count, "CPUVendor", value);
3801   } else if (!strcmp("model name", prefix)) {
3802     if (value[0])
3803       hwloc__add_info(infos, infos_count, "CPUModel", value);
3804   } else if (!strcmp("model", prefix)) {
3805     if (value[0])
3806       hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
3807   } else if (!strcmp("cpu family", prefix)) {
3808     if (value[0])
3809       hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
3810   } else if (!strcmp("stepping", prefix)) {
3811     if (value[0])
3812       hwloc__add_info(infos, infos_count, "CPUStepping", value);
3813   }
3814   return 0;
3815 }
3816 
3817 static int
3818 hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
3819                                struct hwloc_info_s **infos, unsigned *infos_count,
3820                                int is_global __hwloc_attribute_unused)
3821 {
3822   if (!strcmp("vendor", prefix)) {
3823     if (value[0])
3824       hwloc__add_info(infos, infos_count, "CPUVendor", value);
3825   } else if (!strcmp("model name", prefix)) {
3826     if (value[0])
3827       hwloc__add_info(infos, infos_count, "CPUModel", value);
3828   } else if (!strcmp("model", prefix)) {
3829     if (value[0])
3830       hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
3831   } else if (!strcmp("family", prefix)) {
3832     if (value[0])
3833       hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
3834   }
3835   return 0;
3836 }
3837 
3838 static int
3839 hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
3840                               struct hwloc_info_s **infos, unsigned *infos_count,
3841                               int is_global __hwloc_attribute_unused)
3842 {
3843   if (!strcmp("Processor", prefix) /* old kernels with one Processor header */
3844       || !strcmp("model name", prefix) /* new kernels with one model name per core */) {
3845     if (value[0])
3846       hwloc__add_info(infos, infos_count, "CPUModel", value);
3847   } else if (!strcmp("CPU implementer", prefix)) {
3848     if (value[0])
3849       hwloc__add_info(infos, infos_count, "CPUImplementer", value);
3850   } else if (!strcmp("CPU architecture", prefix)) {
3851     if (value[0])
3852       hwloc__add_info(infos, infos_count, "CPUArchitecture", value);
3853   } else if (!strcmp("CPU variant", prefix)) {
3854     if (value[0])
3855       hwloc__add_info(infos, infos_count, "CPUVariant", value);
3856   } else if (!strcmp("CPU part", prefix)) {
3857     if (value[0])
3858       hwloc__add_info(infos, infos_count, "CPUPart", value);
3859   } else if (!strcmp("CPU revision", prefix)) {
3860     if (value[0])
3861       hwloc__add_info(infos, infos_count, "CPURevision", value);
3862   } else if (!strcmp("Hardware", prefix)) {
3863     if (value[0])
3864       hwloc__add_info(infos, infos_count, "HardwareName", value);
3865   } else if (!strcmp("Revision", prefix)) {
3866     if (value[0])
3867       hwloc__add_info(infos, infos_count, "HardwareRevision", value);
3868   } else if (!strcmp("Serial", prefix)) {
3869     if (value[0])
3870       hwloc__add_info(infos, infos_count, "HardwareSerial", value);
3871   }
3872   return 0;
3873 }
3874 
3875 static int
3876 hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
3877                               struct hwloc_info_s **infos, unsigned *infos_count,
3878                               int is_global)
3879 {
3880   /* common fields */
3881   if (!strcmp("cpu", prefix)) {
3882     if (value[0])
3883       hwloc__add_info(infos, infos_count, "CPUModel", value);
3884   } else if (!strcmp("platform", prefix)) {
3885     if (value[0])
3886       hwloc__add_info(infos, infos_count, "PlatformName", value);
3887   } else if (!strcmp("model", prefix)) {
3888     if (value[0])
3889       hwloc__add_info(infos, infos_count, "PlatformModel", value);
3890   }
3891   /* platform-specific fields */
3892   else if (!strcasecmp("vendor", prefix)) {
3893     if (value[0])
3894       hwloc__add_info(infos, infos_count, "PlatformVendor", value);
3895   } else if (!strcmp("Board ID", prefix)) {
3896     if (value[0])
3897       hwloc__add_info(infos, infos_count, "PlatformBoardID", value);
3898   } else if (!strcmp("Board", prefix)
3899              || !strcasecmp("Machine", prefix)) {
3900     /* machine and board are similar (and often more precise) than model above */
3901     if (value[0])
3902       hwloc__add_info_nodup(infos, infos_count, "PlatformModel", value, 1);
3903   } else if (!strcasecmp("Revision", prefix)
3904              || !strcmp("Hardware rev", prefix)) {
3905     if (value[0])
3906       hwloc__add_info(infos, infos_count, is_global ? "PlatformRevision" : "CPURevision", value);
3907   } else if (!strcmp("SVR", prefix)) {
3908     if (value[0])
3909       hwloc__add_info(infos, infos_count, "SystemVersionRegister", value);
3910   } else if (!strcmp("PVR", prefix)) {
3911     if (value[0])
3912       hwloc__add_info(infos, infos_count, "ProcessorVersionRegister", value);
3913   }
3914   /* don't match 'board*' because there's also "board l2" on some platforms */
3915   return 0;
3916 }
3917 
3918 /*
3919  * avr32: "chip type\t:"                        => OK
3920  * blackfin: "model name\t:"                    => OK
3921  * h8300: "CPU:"                                => OK
3922  * m68k: "CPU:"                                 => OK
3923  * mips: "cpu model\t\t:"                       => OK
3924  * openrisc: "CPU:"                             => OK
3925  * sparc: "cpu\t\t:"                            => OK
3926  * tile: "model name\t:"                        => OK
3927  * unicore32: "Processor\t:"                    => OK
3928  * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:" => "cpu" overwritten by "cpu model", no processor indexes
3929  * cris: "cpu\t\t:" + "cpu model\t:"            => only "cpu"
3930  * frv: "CPU-Core:" + "CPU:"                    => only "CPU"
3931  * mn10300: "cpu core   :" + "model name :"     => only "model name"
3932  * parisc: "cpu family\t:" + "cpu\t\t:"         => only "cpu"
3933  *
3934  * not supported because of conflicts with other arch minor lines:
3935  * m32r: "cpu family\t:"                        => KO (adding "cpu family" would break "blackfin")
3936  * microblaze: "CPU-Family:"                    => KO
3937  * sh: "cpu family\t:" + "cpu type\t:"          => KO
3938  * xtensa: "model\t\t:"                         => KO
3939  */
3940 static int
3941 hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
3942                                   struct hwloc_info_s **infos, unsigned *infos_count,
3943                                   int is_global __hwloc_attribute_unused)
3944 {
3945   if (!strcmp("model name", prefix)
3946       || !strcmp("Processor", prefix)
3947       || !strcmp("chip type", prefix)
3948       || !strcmp("cpu model", prefix)
3949       || !strcasecmp("cpu", prefix)) {
3950     /* keep the last one, assume it's more precise than the first one.
3951      * we should have the Architecture keypair for basic information anyway.
3952      */
3953     if (value[0])
3954       hwloc__add_info_nodup(infos, infos_count, "CPUModel", value, 1);
3955   }
3956   return 0;
3957 }
3958 
3959 /* *Lprocs_p is set to NULL unless the return value is > 0 */
3960 static int
3961 hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
3962                           const char *path,
3963                           struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
3964                           struct hwloc_info_s **global_infos, unsigned *global_infos_count)
3965 {
3966   FILE *fd;
3967   unsigned len = 128;
3968   char str[len]; /* vendor/model can be very long */
3969   char *endptr;
3970   unsigned allocated_Lprocs = 0;
3971   struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
3972   unsigned numprocs = 0;
3973   int curproc = -1;
3974   int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_info_s **, unsigned *, int) = NULL;
3975 
3976   if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
3977     {
3978       hwloc_debug("could not open %s\n", path);
3979       return -1;
3980     }
3981 
3982 #      define PROCESSOR "processor"
3983 #      define PACKAGEID "physical id" /* the longest one */
3984 #      define COREID "core id"
3985   hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
3986   while (fgets(str,len,fd)!=NULL) {
3987     unsigned long Ppkg, Pcore, Pproc;
3988     char *end, *dot, *prefix, *value;
3989     int noend = 0;
3990 
3991     /* remove the ending \n */
3992     end = strchr(str, '\n');
3993     if (end)
3994       *end = 0;
3995     else
3996       noend = 1;
3997     /* if empty line, skip and reset curproc */
3998     if (!*str) {
3999       curproc = -1;
4000       continue;
4001     }
4002     /* skip lines without a colon */
4003     dot = strchr(str, ':');
4004     if (!dot)
4005       continue;
4006     /* skip lines not starting with a letter */
4007     if ((*str > 'z' || *str < 'a')
4008         && (*str > 'Z' || *str < 'A'))
4009       continue;
4010 
4011     /* mark the end of the prefix */
4012     prefix = str;
4013     end = dot;
4014     while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
4015     *end = 0;
4016     /* find beginning of value, its end is already marked */
4017     value = dot+1 + strspn(dot+1, " \t");
4018 
4019     /* defines for parsing numbers */
4020 #   define getprocnb_begin(field, var)                                  \
4021     if (!strcmp(field,prefix)) {                                        \
4022       var = strtoul(value,&endptr,0);                                   \
4023       if (endptr==value) {                                              \
4024         hwloc_debug("no number in "field" field of %s\n", path);        \
4025         goto err;                                                       \
4026       } else if (var==ULONG_MAX) {                                      \
4027         hwloc_debug("too big "field" number in %s\n", path);            \
4028         goto err;                                                       \
4029       }                                                                 \
4030       hwloc_debug(field " %lu\n", var)
4031 #   define getprocnb_end()                                              \
4032     }
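         /* e.g. a cpuinfo line "processor\t: 12" yields prefix="processor" and value="12",
          * so getprocnb_begin(PROCESSOR, Pproc) below sets Pproc=12 and a new Lprocs entry is allocated */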
4033     /* actually parse numbers */
4034     getprocnb_begin(PROCESSOR, Pproc);
4035     curproc = numprocs++;
4036     if (numprocs > allocated_Lprocs) {
4037       struct hwloc_linux_cpuinfo_proc * tmp;
4038       if (!allocated_Lprocs)
4039         allocated_Lprocs = 8;
4040       else
4041         allocated_Lprocs *= 2;
4042       tmp = realloc(Lprocs, allocated_Lprocs * sizeof(*Lprocs));
4043       if (!tmp)
4044         goto err;
4045       Lprocs = tmp;
4046     }
4047     Lprocs[curproc].Pproc = Pproc;
4048     Lprocs[curproc].Pcore = -1;
4049     Lprocs[curproc].Ppkg = -1;
4050     Lprocs[curproc].Lcore = -1;
4051     Lprocs[curproc].Lpkg = -1;
4052     Lprocs[curproc].infos = NULL;
4053     Lprocs[curproc].infos_count = 0;
4054     getprocnb_end() else
4055     getprocnb_begin(PACKAGEID, Ppkg);
4056     Lprocs[curproc].Ppkg = Ppkg;
4057     getprocnb_end() else
4058     getprocnb_begin(COREID, Pcore);
4059     Lprocs[curproc].Pcore = Pcore;
4060     getprocnb_end() else {
4061 
4062       /* architecture specific or default routine for parsing cpumodel */
4063       switch (data->arch) {
4064       case HWLOC_LINUX_ARCH_X86:
4065         parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
4066         break;
4067       case HWLOC_LINUX_ARCH_ARM:
4068         parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
4069         break;
4070       case HWLOC_LINUX_ARCH_POWER:
4071         parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
4072         break;
4073       case HWLOC_LINUX_ARCH_IA64:
4074         parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
4075         break;
4076       default:
4077         parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
4078       }
4079 
4080       /* we can't assume that we already got a processor index line:
4081        * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
4082        * tile has a global section with model name before the list of processor lines.
4083        */
4084       parse_cpuinfo_func(prefix, value,
4085                          curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
4086                          curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
4087                          curproc < 0);
4088     }
4089 
4090     if (noend) {
4091       /* the line was truncated, discard the rest of it */
4092       if (fscanf(fd,"%*[^\n]") == EOF)
4093         break;
4094       getc(fd);
4095     }
4096   }
4097   fclose(fd);
4098 
4099   *Lprocs_p = Lprocs;
4100   return numprocs;
4101 
4102  err:
4103   fclose(fd);
4104   free(Lprocs);
4105   *Lprocs_p = NULL;
4106   return -1;
4107 }
4108 
4109 static void
4110 hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
4111                          struct hwloc_info_s *global_infos, unsigned global_infos_count)
4112 {
4113   if (Lprocs) {
4114     unsigned i;
4115     for(i=0; i<numprocs; i++) {
4116       hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
4117     }
4118     free(Lprocs);
4119   }
4120   hwloc__free_infos(global_infos, global_infos_count);
4121 }
4122 
4123 static int
4124 look_cpuinfo(struct hwloc_topology *topology,
4125              struct hwloc_linux_cpuinfo_proc * Lprocs,
4126              unsigned numprocs)
4127 {
4128   /* P for physical/OS index, L for logical (e.g. in the order we get them, not in the final hwloc logical order) */
4129   unsigned *Lcore_to_Pcore;
4130   unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
4131   unsigned *Lpkg_to_Ppkg;
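       /* e.g. if cpuinfo reports physical ids 0 then 3, they get local logical indexes 0 and 1,
        * with Lpkg_to_Ppkg[] = {0, 3} */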
4132   unsigned numpkgs=0;
4133   unsigned numcores=0;
4134   unsigned long Lproc;
4135   unsigned missingpkg;
4136   unsigned missingcore;
4137   unsigned i,j;
4138 
4139   /* initialize misc arrays, there can be at most numprocs entries */
4140   Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
4141   Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
4142   Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
4143   for (i = 0; i < numprocs; i++) {
4144     Lcore_to_Pcore[i] = -1;
4145     Lcore_to_Ppkg[i] = -1;
4146     Lpkg_to_Ppkg[i] = -1;
4147   }
4148 
4149   /* create PU objects */
4150   for(Lproc=0; Lproc<numprocs; Lproc++) {
4151     unsigned long Pproc = Lprocs[Lproc].Pproc;
4152     hwloc_obj_t obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, (unsigned)Pproc);
4153     obj->cpuset = hwloc_bitmap_alloc();
4154     hwloc_bitmap_only(obj->cpuset, Pproc);
4155     hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
4156                              Lproc, Pproc, obj->cpuset);
4157     hwloc_insert_object_by_cpuset(topology, obj);
4158   }
4159 
4160   topology->support.discovery->pu = 1;
4161 
4162   hwloc_debug("%s", "\n * Topology summary *\n");
4163   hwloc_debug("%u processors)\n", numprocs);
4164 
4165   /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
4166   for(Lproc=0; Lproc<numprocs; Lproc++) {
4167     long Ppkg = Lprocs[Lproc].Ppkg;
4168     if (Ppkg != -1) {
4169       unsigned long Pproc = Lprocs[Lproc].Pproc;
4170       for (i=0; i<numpkgs; i++)
4171         if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
4172           break;
4173       Lprocs[Lproc].Lpkg = i;
4174       hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, (unsigned long) Ppkg);
4175       if (i==numpkgs) {
4176         Lpkg_to_Ppkg[numpkgs] = Ppkg;
4177         numpkgs++;
4178       }
4179     }
4180   }
4181   /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
4182    * provide bogus information. We should rather drop it. */
4183   missingpkg=0;
4184   for(j=0; j<numprocs; j++)
4185     if (Lprocs[j].Ppkg == -1) {
4186       missingpkg=1;
4187       break;
4188     }
4189   /* create package objects */
4190   hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
4191   if (!missingpkg && numpkgs>0
4192       && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
4193     for (i = 0; i < numpkgs; i++) {
4194       struct hwloc_obj *obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
4195       int doneinfos = 0;
4196       obj->cpuset = hwloc_bitmap_alloc();
4197       for(j=0; j<numprocs; j++)
4198         if ((unsigned) Lprocs[j].Lpkg == i) {
4199           hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
4200           if (!doneinfos) {
4201             hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
4202             doneinfos = 1;
4203           }
4204         }
4205       hwloc_debug_1arg_bitmap("Package %u has cpuset %s\n", i, obj->cpuset);
4206       hwloc_insert_object_by_cpuset(topology, obj);
4207     }
4208     hwloc_debug("%s", "\n");
4209   }
4210 
4211   /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
4212   for(Lproc=0; Lproc<numprocs; Lproc++) {
4213     long Pcore = Lprocs[Lproc].Pcore;
4214     if (Pcore != -1) {
4215       for (i=0; i<numcores; i++)
4216         if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
4217           break;
4218       Lprocs[Lproc].Lcore = i;
4219       if (i==numcores) {
4220         Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
4221         Lcore_to_Pcore[numcores] = Pcore;
4222         numcores++;
4223       }
4224     }
4225   }
4226   /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
4227    * provide bogus information. We should rather drop it. */
4228   missingcore=0;
4229   for(j=0; j<numprocs; j++)
4230     if (Lprocs[j].Pcore == -1) {
4231       missingcore=1;
4232       break;
4233     }
4234   /* create Core objects */
4235   hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
4236   if (!missingcore && numcores>0
4237       && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
4238     for (i = 0; i < numcores; i++) {
4239       struct hwloc_obj *obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
4240       obj->cpuset = hwloc_bitmap_alloc();
4241       for(j=0; j<numprocs; j++)
4242         if ((unsigned) Lprocs[j].Lcore == i)
4243           hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
4244       hwloc_debug_1arg_bitmap("Core %u has cpuset %s\n", i, obj->cpuset);
4245       hwloc_insert_object_by_cpuset(topology, obj);
4246     }
4247     hwloc_debug("%s", "\n");
4248   }
4249 
4250   free(Lcore_to_Pcore);
4251   free(Lcore_to_Ppkg);
4252   free(Lpkg_to_Ppkg);
4253   return 0;
4254 }
4255 
4256 
4257 
4258 /*************************************
4259  ****** Main Topology Discovery ******
4260  *************************************/
4261 
4262 static void
4263 hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
4264 {
4265   char line[64], *tmp, *end;
4266   if (hwloc_read_path_by_length("/proc/elog", line, sizeof(line), data->root_fd) < 0)
4267     return;
4268   if (strncmp(line, "Card ", 5))
4269     return;
4270   tmp = line + 5;
4271   end = strchr(tmp, ':');
4272   if (!end)
4273     return;
4274   *end = '\0';
4275 
4276   if (tmp[0])
4277     hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", tmp);
4278 }
4279 
4280 static void
4281 hwloc_gather_system_info(struct hwloc_topology *topology,
4282                          struct hwloc_linux_backend_data_s *data)
4283 {
4284   FILE *file;
4285   char line[128]; /* enough for utsname fields */
4286   const char *env;
4287 
4288   /* initialize to something sane, in case !is_thissystem and we can't find things in /proc/hwloc-nofile-info */
4289   memset(&data->utsname, 0, sizeof(data->utsname));
4290   data->fallback_nbprocessors = -1; /* unknown yet */
4291   data->pagesize = 4096;
4292 
4293   /* read thissystem info */
4294   if (topology->is_thissystem) {
4295     uname(&data->utsname);
4296     data->fallback_nbprocessors = hwloc_fallback_nbprocessors(topology); /* errors managed in hwloc_linux_fallback_pu_level() */
4297     data->pagesize = hwloc_getpagesize();
4298   }
4299 
4300   /* overwrite with optional /proc/hwloc-nofile-info */
4301   file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
4302   if (file) {
4303     while (fgets(line, sizeof(line), file)) {
4304       char *tmp = strchr(line, '\n');
4305       if (!strncmp("OSName: ", line, 8)) {
4306         if (tmp)
4307           *tmp = '\0';
4308         strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
4309         data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
4310       } else if (!strncmp("OSRelease: ", line, 11)) {
4311         if (tmp)
4312           *tmp = '\0';
4313         strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
4314         data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
4315       } else if (!strncmp("OSVersion: ", line, 11)) {
4316         if (tmp)
4317           *tmp = '\0';
4318         strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
4319         data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
4320       } else if (!strncmp("HostName: ", line, 10)) {
4321         if (tmp)
4322           *tmp = '\0';
4323         strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
4324         data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
4325       } else if (!strncmp("Architecture: ", line, 14)) {
4326         if (tmp)
4327           *tmp = '\0';
4328         strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
4329         data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
4330       } else if (!strncmp("FallbackNbProcessors: ", line, 22)) {
4331         if (tmp)
4332           *tmp = '\0';
4333         data->fallback_nbprocessors = atoi(line+22);
4334       } else if (!strncmp("PageSize: ", line, 10)) {
4335         if (tmp)
4336          *tmp = '\0';
4337         data->pagesize = strtoull(line+10, NULL, 10);
4338       } else {
4339         hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
4340         /* ignored */
4341       }
4342     }
4343     fclose(file);
4344   }
4345 
4346   env = getenv("HWLOC_DUMP_NOFILE_INFO");
4347   if (env && *env) {
4348     file = fopen(env, "w");
4349     if (file) {
4350       if (*data->utsname.sysname)
4351         fprintf(file, "OSName: %s\n", data->utsname.sysname);
4352       if (*data->utsname.release)
4353         fprintf(file, "OSRelease: %s\n", data->utsname.release);
4354       if (*data->utsname.version)
4355         fprintf(file, "OSVersion: %s\n", data->utsname.version);
4356       if (*data->utsname.nodename)
4357         fprintf(file, "HostName: %s\n", data->utsname.nodename);
4358       if (*data->utsname.machine)
4359         fprintf(file, "Architecture: %s\n", data->utsname.machine);
4360       fprintf(file, "FallbackNbProcessors: %d\n", data->fallback_nbprocessors);
4361       fprintf(file, "PageSize: %llu\n", (unsigned long long) data->pagesize);
4362       fclose(file);
4363     }
4364   }
4365 
4366   /* detect arch for quirks, using configure #defines if possible, or uname */
4367 #if (defined HWLOC_X86_32_ARCH) || (defined HWLOC_X86_64_ARCH) /* does not cover KNC */
4368   if (topology->is_thissystem)
4369     data->arch = HWLOC_LINUX_ARCH_X86;
4370 #endif
4371   if (data->arch == HWLOC_LINUX_ARCH_UNKNOWN && *data->utsname.machine) {
4372     if (!strcmp(data->utsname.machine, "x86_64")
4373         || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
4374         || !strcmp(data->utsname.machine, "k1om"))
4375       data->arch = HWLOC_LINUX_ARCH_X86;
4376     else if (!strncmp(data->utsname.machine, "arm", 3))
4377       data->arch = HWLOC_LINUX_ARCH_ARM;
4378     else if (!strncmp(data->utsname.machine, "ppc", 3)
4379              || !strncmp(data->utsname.machine, "power", 5))
4380       data->arch = HWLOC_LINUX_ARCH_POWER;
4381     else if (!strcmp(data->utsname.machine, "ia64"))
4382       data->arch = HWLOC_LINUX_ARCH_IA64;
4383   }
4384 }
4385 
4386 /* returns 0 on success, -1 on non-match or error during hardwired load */
4387 static int
4388 hwloc_linux_try_hardwired_cpuinfo(struct hwloc_backend *backend)
4389 {
4390   struct hwloc_topology *topology = backend->topology;
4391   struct hwloc_linux_backend_data_s *data = backend->private_data;
4392 
4393   if (getenv("HWLOC_NO_HARDWIRED_TOPOLOGY"))
4394     return -1;
4395 
4396   if (!strcmp(data->utsname.machine, "s64fx")) {
4397     char line[128];
4398     /* Fujitsu K-computer, FX10, and FX100 use specific processors
4399      * whose Linux topology support is broken until 4.1 (acc455cffa75070d55e74fc7802b49edbc080e92)
4400      * and existing machines will likely never be fixed by kernel upgrade.
4401      */
4402 
4403     /* /proc/cpuinfo starts with one of these lines:
4404      * "cpu             : Fujitsu SPARC64 VIIIfx"
4405      * "cpu             : Fujitsu SPARC64 XIfx"
4406      * "cpu             : Fujitsu SPARC64 IXfx"
4407      */
4408     if (hwloc_read_path_by_length("/proc/cpuinfo", line, sizeof(line), data->root_fd) < 0)
4409       return -1;
4410 
4411     if (strncmp(line, "cpu\t", 4))
4412       return -1;
4413 
4414     if (strstr(line, "Fujitsu SPARC64 VIIIfx"))
4415       return hwloc_look_hardwired_fujitsu_k(topology);
4416     else if (strstr(line, "Fujitsu SPARC64 IXfx"))
4417       return hwloc_look_hardwired_fujitsu_fx10(topology);
4418     else if (strstr(line, "FUJITSU SPARC64 XIfx"))
4419       return hwloc_look_hardwired_fujitsu_fx100(topology);
4420   }
4421   return -1;
4422 }
4423 
4424 static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep)
4425 {
4426   char *cpuset_mntpnt, *cgroup_mntpnt, *cpuset_name = NULL;
4427   hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, root_path);
4428   if (cgroup_mntpnt || cpuset_mntpnt) {
4429     cpuset_name = hwloc_read_linux_cpuset_name(root_fd, topology->pid);
4430     if (cpuset_name) {
4431       hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "cpus", topology->allowed_cpuset);
4432       hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "mems", topology->allowed_nodeset);
4433     }
4434     free(cgroup_mntpnt);
4435     free(cpuset_mntpnt);
4436   }
4437   *cpuset_namep = cpuset_name;
4438 }
4439 
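/* Illustrative sketch (not compiled as part of this file): the cgroup/cpuset
 * restrictions gathered above end up in topology->allowed_cpuset and
 * allowed_nodeset, which applications can inspect through the public API as
 * shown below.  Error handling is minimal and a standard hwloc 2.x
 * installation is assumed. */
#if 0
#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  hwloc_topology_t topo;
  char *s = NULL;
  if (hwloc_topology_init(&topo) < 0 || hwloc_topology_load(topo) < 0)
    return 1;
  /* on Linux this reflects the cpuset "cpus" file parsed above */
  if (hwloc_bitmap_asprintf(&s, hwloc_topology_get_allowed_cpuset(topo)) >= 0) {
    printf("allowed cpuset: %s\n", s);
    free(s);
  }
  hwloc_topology_destroy(topo);
  return 0;
}
#endif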
4440 static void
4441 hwloc_linux_fallback_pu_level(struct hwloc_backend *backend)
4442 {
4443   struct hwloc_topology *topology = backend->topology;
4444   struct hwloc_linux_backend_data_s *data = backend->private_data;
4445 
4446   if (data->fallback_nbprocessors >= 1)
4447     topology->support.discovery->pu = 1;
4448   else
4449     data->fallback_nbprocessors = 1;
4450   hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
4451 }
4452 
4453 static int
4454 hwloc_look_linuxfs(struct hwloc_backend *backend)
4455 {
4456   struct hwloc_topology *topology = backend->topology;
4457   struct hwloc_linux_backend_data_s *data = backend->private_data;
4458   unsigned nbnodes;
4459   char *cpuset_name;
4460   struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
4461   struct hwloc_info_s *global_infos = NULL;
4462   unsigned global_infos_count = 0;
4463   int numprocs;
4464   int already_pus;
4465   int already_numanodes;
4466   const char *sysfs_cpu_path;
4467   const char *sysfs_node_path;
4468   int err;
4469 
4470   /* look for sysfs cpu path containing at least one of core_siblings and thread_siblings */
4471   if (!hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd)
4472       || !hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd))
4473     sysfs_cpu_path = "/sys/bus/cpu/devices";
4474   else if (!hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd)
4475       || !hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd))
4476     sysfs_cpu_path = "/sys/devices/system/cpu";
4477   else
4478     sysfs_cpu_path = NULL;
4479   hwloc_debug("Found sysfs cpu files under %s\n", sysfs_cpu_path);
4480 
4481   /* look for sysfs node path */
4482   if (!hwloc_access("/sys/bus/node/devices/node0/cpumap", R_OK, data->root_fd))
4483     sysfs_node_path = "/sys/bus/node/devices";
4484   else if (!hwloc_access("/sys/devices/system/node/node0/cpumap", R_OK, data->root_fd))
4485     sysfs_node_path = "/sys/devices/system/node";
4486   else
4487     sysfs_node_path = NULL;
4488   hwloc_debug("Found sysfs node files under %s\n", sysfs_node_path);
4489 
4490   already_pus = (topology->levels[0][0]->complete_cpuset != NULL
4491                  && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_cpuset));
4492   /* if there are PUs, still look at memory information
4493    * since x86 misses NUMA node information (unless the processor supports topoext)
4494    * as well as memory sizes.
4495    */
4496   already_numanodes = (topology->levels[0][0]->complete_nodeset != NULL
4497                        && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_nodeset));
4498   /* if there are already NUMA nodes, we'll just annotate them with memory information,
4499    * which requires the NUMA level to be connected.
4500    */
4501   if (already_numanodes)
4502     hwloc_topology_reconnect(topology, 0);
4503 
4504   hwloc_alloc_root_sets(topology->levels[0][0]);
4505 
4506   /*********************************
4507    * Platform information for later
4508    */
4509   hwloc_gather_system_info(topology, data);
4510 
4511   /**********************
4512    * /proc/cpuinfo
4513    */
4514   numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
4515   if (numprocs < 0)
4516     numprocs = 0;
4517 
4518   /**************************
4519    * detect model for quirks
4520    */
4521   if (data->arch == HWLOC_LINUX_ARCH_X86 && numprocs > 0) {
4522       unsigned i;
4523       const char *cpuvendor = NULL, *cpufamilynumber = NULL, *cpumodelnumber = NULL;
4524       for(i=0; i<Lprocs[0].infos_count; i++) {
4525         if (!strcmp(Lprocs[0].infos[i].name, "CPUVendor")) {
4526           cpuvendor = Lprocs[0].infos[i].value;
4527         } else if (!strcmp(Lprocs[0].infos[i].name, "CPUFamilyNumber")) {
4528           cpufamilynumber = Lprocs[0].infos[i].value;
4529         } else if (!strcmp(Lprocs[0].infos[i].name, "CPUModelNumber")) {
4530           cpumodelnumber = Lprocs[0].infos[i].value;
4531         }
4532       }
4533       if (cpuvendor && !strcmp(cpuvendor, "GenuineIntel")
4534           && cpufamilynumber && !strcmp(cpufamilynumber, "6")
4535           && cpumodelnumber && (!strcmp(cpumodelnumber, "87")
4536           || !strcmp(cpumodelnumber, "133")))
4537         data->is_knl = 1;
4538       if (cpuvendor && !strcmp(cpuvendor, "AuthenticAMD")
4539           && cpufamilynumber
4540           && (!strcmp(cpufamilynumber, "21")
4541               || !strcmp(cpufamilynumber, "22")))
4542         data->is_amd_with_CU = 1;
4543   }
4544 
4545   /**********************
4546    * Gather the list of admin-disabled cpus and mems
4547    */
4548   hwloc_linux__get_allowed_resources(topology, data->root_path, data->root_fd, &cpuset_name);
4549 
4550   /**********************
4551    * CPU information
4552    */
4553 
4554   /* Don't rediscover CPU resources if already done */
4555   if (already_pus)
4556     goto cpudone;
4557 
4558   /* Gather the list of cpus now */
4559   err = hwloc_linux_try_hardwired_cpuinfo(backend);
4560   if (!err)
4561     goto cpudone;
4562 
4563   /* setup root info */
4564   hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
4565                     &global_infos, &global_infos_count);
4566 
4567   if (getenv("HWLOC_LINUX_USE_CPUINFO") || !sysfs_cpu_path) {
4568     /* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
4569      * or not containing anything interesting */
4570     if (numprocs > 0)
4571       err = look_cpuinfo(topology, Lprocs, numprocs);
4572     else
4573       err = -1;
4574     if (err < 0)
4575       hwloc_linux_fallback_pu_level(backend);
4576     look_powerpc_device_tree(topology, data);
4577 
4578   } else {
4579     /* sysfs */
4580     if (look_sysfscpu(topology, data, sysfs_cpu_path, Lprocs, numprocs) < 0)
4581       /* sysfs but we failed to read cpu topology, fallback */
4582       hwloc_linux_fallback_pu_level(backend);
4583   }
4584 
4585  cpudone:
4586 
4587   /*********************
4588    * Memory information
4589    */
4590 
4591   /* Get the machine memory attributes */
4592   hwloc_get_procfs_meminfo_info(topology, data, &topology->machine_memory);
4593 
4594   /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
4595   if (sysfs_node_path)
4596     look_sysfsnode(topology, data, sysfs_node_path, &nbnodes);
4597   else
4598     nbnodes = 0;
4599 
4600   /**********************
4601    * Misc
4602    */
4603 
4604   /* Gather DMI info */
4605   hwloc__get_dmi_id_info(data, topology->levels[0][0]);
4606 
4607   hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
4608   if (cpuset_name) {
4609     hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
4610     free(cpuset_name);
4611   }
4612 
4613   hwloc__linux_get_mic_sn(topology, data);
4614 
4615   /* data->utsname was filled with real uname or \0, we can safely pass it */
4616   hwloc_add_uname_info(topology, &data->utsname);
4617 
4618   hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
4619   return 0;
4620 }
4621 
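/* Illustrative sketch (not compiled as part of this file): the "Backend" and
 * "LinuxCgroup" info keys attached to the root object above can be read back
 * through the public API.  Either key may be absent (e.g. no cgroup
 * restriction), hence the NULL checks. */
#if 0
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
  hwloc_topology_t topo;
  hwloc_obj_t root;
  const char *backendname, *cgroup;
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);
  root = hwloc_get_root_obj(topo);
  backendname = hwloc_obj_get_info_by_name(root, "Backend");
  cgroup = hwloc_obj_get_info_by_name(root, "LinuxCgroup");
  if (backendname)
    printf("Backend: %s\n", backendname);
  if (cgroup)
    printf("LinuxCgroup: %s\n", cgroup);
  hwloc_topology_destroy(topo);
  return 0;
}
#endif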
4622 
4623 
4624 /****************************************
4625  ***** Linux PCI backend callbacks ******
4626  ****************************************/
4627 
4628 /*
4629  * backend callback for retrieving the location of a pci device
4630  */
4631 static int
4632 hwloc_linux_backend_get_pci_busid_cpuset(struct hwloc_backend *backend,
4633                                          struct hwloc_pcidev_attr_s *busid, hwloc_bitmap_t cpuset)
4634 {
4635   struct hwloc_linux_backend_data_s *data = backend->private_data;
4636   char path[256];
4637   int err;
4638 
4639   snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
4640            busid->domain, busid->bus,
4641            busid->dev, busid->func);
4642   err = hwloc__read_path_as_cpumask(path, cpuset, data->root_fd);
4643   if (!err && !hwloc_bitmap_iszero(cpuset))
4644     return 0;
4645   return -1;
4646 }
4647 
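/* Illustrative sketch (not compiled as part of this file): the callback above
 * reads /sys/bus/pci/devices/<busid>/local_cpus to find the cpumask close to
 * a PCI device.  The standalone snippet below reads the same sysfs file with
 * plain stdio; the busid is hypothetical. */
#if 0
#include <stdio.h>

int main(void)
{
  char path[256], buf[1024];
  unsigned domain = 0, bus = 0x03, dev = 0, func = 0;  /* hypothetical busid */
  FILE *f;
  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
           domain, bus, dev, func);
  f = fopen(path, "r");
  if (f && fgets(buf, sizeof(buf), f))
    printf("local_cpus of %04x:%02x:%02x.%01x: %s", domain, bus, dev, func, buf);
  if (f)
    fclose(f);
  return 0;
}
#endif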
4648 
4649 
4650 /*******************************
4651  ******* Linux component *******
4652  *******************************/
4653 
4654 static void
4655 hwloc_linux_backend_disable(struct hwloc_backend *backend)
4656 {
4657   struct hwloc_linux_backend_data_s *data = backend->private_data;
4658 #ifdef HAVE_OPENAT
4659   free(data->root_path);
4660   close(data->root_fd);
4661 #endif
4662 #ifdef HWLOC_HAVE_LIBUDEV
4663   if (data->udev)
4664     udev_unref(data->udev);
4665 #endif
4666   free(data);
4667 }
4668 
4669 static struct hwloc_backend *
4670 hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
4671                                   const void *_data1 __hwloc_attribute_unused,
4672                                   const void *_data2 __hwloc_attribute_unused,
4673                                   const void *_data3 __hwloc_attribute_unused)
4674 {
4675   struct hwloc_backend *backend;
4676   struct hwloc_linux_backend_data_s *data;
4677   const char * fsroot_path;
4678   int flags, root = -1;
4679 
4680   backend = hwloc_backend_alloc(component);
4681   if (!backend)
4682     goto out;
4683 
4684   data = malloc(sizeof(*data));
4685   if (!data) {
4686     errno = ENOMEM;
4687     goto out_with_backend;
4688   }
4689 
4690   backend->private_data = data;
4691   backend->discover = hwloc_look_linuxfs;
4692   backend->get_pci_busid_cpuset = hwloc_linux_backend_get_pci_busid_cpuset;
4693   backend->disable = hwloc_linux_backend_disable;
4694 
4695   /* default values */
4696   data->arch = HWLOC_LINUX_ARCH_UNKNOWN;
4697   data->is_knl = 0;
4698   data->is_amd_with_CU = 0;
4699   data->is_real_fsroot = 1;
4700   data->root_path = NULL;
4701   fsroot_path = getenv("HWLOC_FSROOT");
4702   if (!fsroot_path)
4703     fsroot_path = "/";
4704 
4705 #ifdef HAVE_OPENAT
4706   root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
4707   if (root < 0)
4708     goto out_with_data;
4709 
4710   if (strcmp(fsroot_path, "/")) {
4711     backend->is_thissystem = 0;
4712     data->is_real_fsroot = 0;
4713     data->root_path = strdup(fsroot_path);
4714   }
4715 
4716   /* Since this fd stays open after hwloc returns, mark it as
4717      close-on-exec so that children don't inherit it.  Stevens says
4718      that we should GETFD before we SETFD, so we do. */
4719   flags = fcntl(root, F_GETFD, 0);
4720   if (-1 == flags ||
4721       -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
4722       close(root);
4723       root = -1;
4724       goto out_with_data;
4725   }
4726 #else
4727   if (strcmp(fsroot_path, "/")) {
4728     errno = ENOSYS;
4729     goto out_with_data;
4730   }
4731 #endif
4732   data->root_fd = root;
4733 
4734 #ifdef HWLOC_HAVE_LIBUDEV
4735   data->udev = NULL;
4736   if (data->is_real_fsroot) {
4737     data->udev = udev_new();
4738   }
4739 #endif
4740 
4741   data->dumped_hwdata_dirname = getenv("HWLOC_DUMPED_HWDATA_DIR");
4742   if (!data->dumped_hwdata_dirname)
4743     data->dumped_hwdata_dirname = (char *) RUNSTATEDIR "/hwloc/";
4744 
4745   return backend;
4746 
4747  out_with_data:
4748 #ifdef HAVE_OPENAT
4749   free(data->root_path);
4750 #endif
4751   free(data);
4752  out_with_backend:
4753   free(backend);
4754  out:
4755   return NULL;
4756 }
4757 
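/* Illustrative sketch (not compiled as part of this file): the HWLOC_FSROOT
 * handling above lets a topology be loaded from a saved filesystem root
 * (e.g. the output of hwloc-gather-topology) instead of the live /proc and
 * /sys.  The directory below is hypothetical. */
#if 0
#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  hwloc_topology_t topo;
  /* must be set before hwloc_topology_load() instantiates the linux backend */
  setenv("HWLOC_FSROOT", "/tmp/gathered-node42", 1);
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);
  /* a non-"/" fsroot clears is_thissystem, so binding is not attempted */
  printf("is_thissystem: %d\n", hwloc_topology_is_thissystem(topo));
  hwloc_topology_destroy(topo);
  return 0;
}
#endif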
4758 static struct hwloc_disc_component hwloc_linux_disc_component = {
4759   HWLOC_DISC_COMPONENT_TYPE_CPU,
4760   "linux",
4761   HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
4762   hwloc_linux_component_instantiate,
4763   50,
4764   1,
4765   NULL
4766 };
4767 
4768 const struct hwloc_component hwloc_linux_component = {
4769   HWLOC_COMPONENT_ABI,
4770   NULL, NULL,
4771   HWLOC_COMPONENT_TYPE_DISC,
4772   0,
4773   &hwloc_linux_disc_component
4774 };
4775 
4776 
4777 
4778 
4779 #ifdef HWLOC_HAVE_LINUXIO
4780 
4781 /***********************************
4782  ******* Linux I/O component *******
4783  ***********************************/
4784 
4785 #define HWLOC_LINUXFS_FIND_OSDEV_FLAG_VIRTUAL (1<<0)
4786 #define HWLOC_LINUXFS_FIND_OSDEV_FLAG_USB (1<<1)
4787 
4788 static hwloc_obj_t
4789 hwloc_linuxfs_find_osdev_parent(struct hwloc_backend *backend, int root_fd,
4790                                 const char *osdevpath, unsigned osdev_flags)
4791 {
4792   struct hwloc_topology *topology = backend->topology;
4793   char path[256], buf[10];
4794   int fd;
4795   int foundpci;
4796   unsigned pcidomain = 0, pcibus = 0, pcidev = 0, pcifunc = 0;
4797   unsigned _pcidomain, _pcibus, _pcidev, _pcifunc;
4798   hwloc_bitmap_t cpuset;
4799   const char *tmp;
4800   hwloc_obj_t parent;
4801   int err;
4802 
4803   err = hwloc_readlink(osdevpath, path, sizeof(path), root_fd);
4804   if (err < 0) {
4805     /* /sys/class/<class>/<name> is a directory instead of a symlink on old kernels (at least around 2.6.18 and 2.6.25).
4806      * The link to parse can be found in /sys/class/<class>/<name>/device instead, at least for "/pci..."
4807      */
4808     char olddevpath[256];
4809     snprintf(olddevpath, sizeof(olddevpath), "%s/device", osdevpath);
4810     err = hwloc_readlink(olddevpath, path, sizeof(path), root_fd);
4811     if (err < 0)
4812       return NULL;
4813   }
4814   path[err] = '\0';
4815 
4816   if (!(osdev_flags & HWLOC_LINUXFS_FIND_OSDEV_FLAG_VIRTUAL)) {
4817     if (strstr(path, "/virtual/"))
4818       return NULL;
4819   }
4820 
4821   if (!(osdev_flags & HWLOC_LINUXFS_FIND_OSDEV_FLAG_USB)) {
4822     if (strstr(path, "/usb"))
4823       return NULL;
4824   }
4825 
4826   tmp = strstr(path, "/pci");
4827   if (!tmp)
4828     goto nopci;
4829   tmp = strchr(tmp+4, '/');
4830   if (!tmp)
4831     goto nopci;
4832   tmp++;
4833 
4834   /* iterate through busid to find the last one (previous ones are bridges) */
4835   foundpci = 0;
4836  nextpci:
4837   if (sscanf(tmp+1, "%x:%x:%x.%x", &_pcidomain, &_pcibus, &_pcidev, &_pcifunc) == 4) {
4838     foundpci = 1;
4839     pcidomain = _pcidomain;
4840     pcibus = _pcibus;
4841     pcidev = _pcidev;
4842     pcifunc = _pcifunc;
4843     tmp += 13;
4844     goto nextpci;
4845   }
4846   if (sscanf(tmp+1, "%x:%x.%x", &_pcibus, &_pcidev, &_pcifunc) == 3) {
4847     foundpci = 1;
4848     pcidomain = 0;
4849     pcibus = _pcibus;
4850     pcidev = _pcidev;
4851     pcifunc = _pcifunc;
4852     tmp += 8;
4853     goto nextpci;
4854   }
4855 
4856   if (foundpci) {
4857     /* attach to a PCI parent */
4858     parent = hwloc_pcidisc_find_by_busid(topology, pcidomain, pcibus, pcidev, pcifunc);
4859     if (parent)
4860       return parent;
4861     /* attach to a normal (non-I/O) parent found by PCI affinity */
4862     parent = hwloc_pcidisc_find_busid_parent(topology, pcidomain, pcibus, pcidev, pcifunc);
4863     if (parent)
4864       return parent;
4865   }
4866 
4867  nopci:
4868   /* attach directly near the right NUMA node */
4869   snprintf(path, sizeof(path), "%s/device/numa_node", osdevpath);
4870   fd = hwloc_open(path, root_fd);
4871   if (fd >= 0) {
4872     err = read(fd, buf, sizeof(buf));
4873     close(fd);
4874     if (err > 0) {
4875       int node = atoi(buf);
4876       if (node >= 0) {
4877         parent = hwloc_get_numanode_obj_by_os_index(topology, (unsigned) node);
4878         if (parent) {
4879           /* don't attach I/O under numa node, attach to the same normal parent */
4880           while (hwloc__obj_type_is_memory(parent->type))
4881             parent = parent->parent;
4882           return parent;
4883         }
4884       }
4885     }
4886   }
4887 
4888   /* attach directly to the right cpuset */
4889   snprintf(path, sizeof(path), "%s/device/local_cpus", osdevpath);
4890   cpuset = hwloc__alloc_read_path_as_cpumask(path, root_fd);
4891   if (cpuset) {
4892     parent = hwloc_find_insert_io_parent_by_complete_cpuset(topology, cpuset);
4893     hwloc_bitmap_free(cpuset);
4894     if (parent)
4895       return parent;
4896   }
4897 
4898   /* FIXME: {numa_node,local_cpus} may be missing when the device link points to a subdirectory.
4899    * For instance, device of scsi blocks may point to foo/ata1/host0/target0:0:0/0:0:0:0/ instead of foo/
4900    * In such case, we should look for device/../../../../{numa_node,local_cpus} instead of device/{numa_node,local_cpus}
4901    * Not needed yet since scsi blocks use the PCI locality above.
4902    */
4903 
4904   /* fallback to the root object */
4905   return hwloc_get_root_obj(topology);
4906 }
4907 
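/* Illustrative sketch (not compiled as part of this file): the function above
 * walks a sysfs symlink target and keeps the last PCI busid it finds (earlier
 * ones are bridges).  The snippet below demonstrates that extraction on a
 * hypothetical symlink target; the domain-less "bb:dd.f" form handled above
 * for old kernels is omitted. */
#if 0
#include <stdio.h>
#include <string.h>

int main(void)
{
  const char *path = "../../devices/pci0000:00/0000:00:1c.0/0000:03:00.0/net/eth0";
  unsigned domain = 0, bus = 0, dev = 0, func = 0, d, b, v, f;
  int found = 0;
  const char *tmp = strstr(path, "/pci");
  if (tmp)
    tmp = strchr(tmp + 4, '/');              /* skip the "pciDDDD:BB" host bridge */
  while (tmp && sscanf(tmp + 1, "%x:%x:%x.%x", &d, &b, &v, &f) == 4) {
    found = 1;                               /* keep the last busid, closest to the device */
    domain = d; bus = b; dev = v; func = f;
    tmp = strchr(tmp + 1, '/');              /* advance to the next path component */
  }
  if (found)
    printf("busid %04x:%02x:%02x.%01x\n", domain, bus, dev, func);  /* 0000:03:00.0 */
  return 0;
}
#endif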
4908 static hwloc_obj_t
4909 hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
4910 {
4911   struct hwloc_topology *topology = backend->topology;
4912   struct hwloc_obj *obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_OS_DEVICE, HWLOC_UNKNOWN_INDEX);
4913   obj->name = strdup(name);
4914   obj->attr->osdev.type = type;
4915 
4916   hwloc_insert_object_by_parent(topology, pcidev, obj);
4917   /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
4918 
4919   return obj;
4920 }
4921 
4922 static void
4923 hwloc_linuxfs_block_class_fillinfos(struct hwloc_backend *backend __hwloc_attribute_unused, int root_fd,
4924                                     struct hwloc_obj *obj, const char *osdevpath)
4925 {
4926 #ifdef HWLOC_HAVE_LIBUDEV
4927   struct hwloc_linux_backend_data_s *data = backend->private_data;
4928 #endif
4929   FILE *file;
4930   char path[256];
4931   char line[128];
4932   char vendor[64] = "";
4933   char model[64] = "";
4934   char serial[64] = "";
4935   char revision[64] = "";
4936   char blocktype[64] = "";
4937   unsigned sectorsize = 0;
4938   unsigned major_id, minor_id;
4939   char *tmp;
4940 
4941   snprintf(path, sizeof(path), "%s/size", osdevpath);
4942   if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
4943     unsigned long long sectors = strtoull(line, NULL, 10);
4944     /* linux always reports size in 512-byte units, we want kB */
4945     snprintf(line, sizeof(line), "%llu", sectors / 2);
4946     hwloc_obj_add_info(obj, "Size", line);
4947   }
4948 
4949   snprintf(path, sizeof(path), "%s/queue/hw_sector_size", osdevpath);
4950   if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
4951     sectorsize = strtoul(line, NULL, 10);
4952   }
4953 
4954   /* pmem have device/devtype containing "nd_btt" (sectors)
4955    * or "nd_namespace_io" (byte-granularity).
4956    * Note that device/sector_size in btt devices includes integrity metadata
4957    * (512/4096 block + 0/N) while queue/hw_sector_size above is the user sectorsize
4958    * without metadata.
4959    */
4960   snprintf(path, sizeof(path), "%s/device/devtype", osdevpath);
4961   if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
4962     if (!strncmp(line, "nd_", 3)) {
4963       strcpy(blocktype, "NVDIMM"); /* Save the blocktype now since udev reports "" so far */
4964       if (!strcmp(line, "nd_namespace_io"))
4965         sectorsize = 1;
4966     }
4967   }
4968   if (sectorsize) {
4969     snprintf(line, sizeof(line), "%u", sectorsize);
4970     hwloc_obj_add_info(obj, "SectorSize", line);
4971   }
4972 
4973   snprintf(path, sizeof(path), "%s/dev", osdevpath);
4974   if (hwloc_read_path_by_length(path, line, sizeof(line), root_fd) < 0)
4975     goto done;
4976   if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
4977     goto done;
4978   tmp = strchr(line, '\n');
4979   if (tmp)
4980     *tmp = '\0';
4981   hwloc_obj_add_info(obj, "LinuxDeviceID", line);
4982 
4983 #ifdef HWLOC_HAVE_LIBUDEV
4984   if (data->udev) {
4985     struct udev_device *dev;
4986     const char *prop;
4987     dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
4988     if (!dev)
4989       goto done;
4990     prop = udev_device_get_property_value(dev, "ID_VENDOR");
4991     if (prop) {
4992       strncpy(vendor, prop, sizeof(vendor));
4993       vendor[sizeof(vendor)-1] = '\0';
4994     }
4995     prop = udev_device_get_property_value(dev, "ID_MODEL");
4996     if (prop) {
4997       strncpy(model, prop, sizeof(model));
4998       model[sizeof(model)-1] = '\0';
4999     }
5000     prop = udev_device_get_property_value(dev, "ID_REVISION");
5001     if (prop) {
5002       strncpy(revision, prop, sizeof(revision));
5003       revision[sizeof(revision)-1] = '\0';
5004     }
5005     prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
5006     if (prop) {
5007       strncpy(serial, prop, sizeof(serial));
5008       serial[sizeof(serial)-1] = '\0';
5009     }
5010     prop = udev_device_get_property_value(dev, "ID_TYPE");
5011     if (prop) {
5012       strncpy(blocktype, prop, sizeof(blocktype));
5013       blocktype[sizeof(blocktype)-1] = '\0';
5014     }
5015 
5016     udev_device_unref(dev);
5017   } else
5018     /* fallback to reading files, works with any fsroot */
5019 #endif
5020  {
5021   snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
5022   file = hwloc_fopen(path, "r", root_fd);
5023   if (!file)
5024     goto done;
5025 
5026   while (NULL != fgets(line, sizeof(line), file)) {
5027     tmp = strchr(line, '\n');
5028     if (tmp)
5029       *tmp = '\0';
5030     if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
5031       strncpy(vendor, line+strlen("E:ID_VENDOR="), sizeof(vendor));
5032       vendor[sizeof(vendor)-1] = '\0';
5033     } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
5034       strncpy(model, line+strlen("E:ID_MODEL="), sizeof(model));
5035       model[sizeof(model)-1] = '\0';
5036     } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
5037       strncpy(revision, line+strlen("E:ID_REVISION="), sizeof(revision));
5038       revision[sizeof(revision)-1] = '\0';
5039     } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
5040       strncpy(serial, line+strlen("E:ID_SERIAL_SHORT="), sizeof(serial));
5041       serial[sizeof(serial)-1] = '\0';
5042     } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
5043       strncpy(blocktype, line+strlen("E:ID_TYPE="), sizeof(blocktype));
5044       blocktype[sizeof(blocktype)-1] = '\0';
5045     }
5046   }
5047   fclose(file);
5048  }
5049 
5050  done:
5051   /* clear fake "ATA" vendor name */
5052   if (!strcasecmp(vendor, "ATA"))
5053     *vendor = '\0';
5054   /* overwrite vendor name from model when possible */
5055   if (!*vendor) {
5056     if (!strncasecmp(model, "wd", 2))
5057       strcpy(vendor, "Western Digital");
5058     else if (!strncasecmp(model, "st", 2))
5059       strcpy(vendor, "Seagate");
5060     else if (!strncasecmp(model, "samsung", 7))
5061       strcpy(vendor, "Samsung");
5062     else if (!strncasecmp(model, "sandisk", 7))
5063       strcpy(vendor, "SanDisk");
5064     else if (!strncasecmp(model, "toshiba", 7))
5065       strcpy(vendor, "Toshiba");
5066   }
5067 
5068   if (*vendor)
5069     hwloc_obj_add_info(obj, "Vendor", vendor);
5070   if (*model)
5071     hwloc_obj_add_info(obj, "Model", model);
5072   if (*revision)
5073     hwloc_obj_add_info(obj, "Revision", revision);
5074   if (*serial)
5075     hwloc_obj_add_info(obj, "SerialNumber", serial);
5076 
5077   if (!strcmp(blocktype, "disk") || !strncmp(obj->name, "nvme", 4))
5078     obj->subtype = strdup("Disk");
5079   else if (!strcmp(blocktype, "NVDIMM")) /* FIXME: set by us above, to workaround udev returning "" so far */
5080     obj->subtype = strdup("NVDIMM");
5081   else if (!strcmp(blocktype, "tape"))
5082     obj->subtype = strdup("Tape");
5083   else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
5084     obj->subtype = strdup("Removable Media Device");
5085   else {
5086     /* generic, usb mass storage/rbc, usb mass storage/scsi */
5087   }
5088 }
5089 
5090 static int
5091 hwloc_linuxfs_lookup_block_class(struct hwloc_backend *backend, unsigned osdev_flags)
5092 {
5093   struct hwloc_linux_backend_data_s *data = backend->private_data;
5094   int root_fd = data->root_fd;
5095   DIR *dir;
5096   struct dirent *dirent;
5097 
5098   dir = hwloc_opendir("/sys/class/block", root_fd);
5099   if (!dir)
5100     return 0;
5101 
5102   while ((dirent = readdir(dir)) != NULL) {
5103     char path[256];
5104     struct stat stbuf;
5105     hwloc_obj_t obj, parent;
5106     int err;
5107 
5108     if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5109       continue;
5110 
5111     /* ignore partitions */
5112     err = snprintf(path, sizeof(path), "/sys/class/block/%s/partition", dirent->d_name);
5113     if ((size_t) err < sizeof(path)
5114         && hwloc_stat(path, &stbuf, root_fd) >= 0)
5115       continue;
5116 
5117     err = snprintf(path, sizeof(path), "/sys/class/block/%s", dirent->d_name);
5118     if ((size_t) err >= sizeof(path))
5119       continue;
5120     parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
5121     if (!parent)
5122       continue;
5123 
5124     /* USB devices are created here but removed later when USB PCI devices get filtered out
5125      * (unless WHOLE_IO is enabled).
5126      */
5127 
5128     obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_BLOCK, dirent->d_name);
5129 
5130     hwloc_linuxfs_block_class_fillinfos(backend, root_fd, obj, path);
5131   }
5132 
5133   closedir(dir);
5134 
5135   return 0;
5136 }
5137 
5138 static void
5139 hwloc_linuxfs_net_class_fillinfos(int root_fd,
5140                                   struct hwloc_obj *obj, const char *osdevpath)
5141 {
5142   struct stat st;
5143   char path[256];
5144   char address[128];
5145   snprintf(path, sizeof(path), "%s/address", osdevpath);
5146   if (!hwloc_read_path_by_length(path, address, sizeof(address), root_fd)) {
5147     char *eol = strchr(address, '\n');
5148     if (eol)
5149       *eol = 0;
5150     hwloc_obj_add_info(obj, "Address", address);
5151   }
5152   snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
5153   if (!hwloc_stat(path, &st, root_fd)) {
5154     char hexid[16];
5155     snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
5156     if (!hwloc_read_path_by_length(path, hexid, sizeof(hexid), root_fd)) {
5157       char *eoid;
5158       unsigned long port;
5159       port = strtoul(hexid, &eoid, 0);
5160       if (eoid != hexid) {
5161         char portstr[16];
5162         snprintf(portstr, sizeof(portstr), "%lu", port+1);
5163         hwloc_obj_add_info(obj, "Port", portstr);
5164       }
5165     }
5166   }
5167 }
5168 
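/* Illustrative sketch (not compiled as part of this file): the "Port" info
 * above comes from /sys/class/net/<dev>/dev_id, which holds the 0-based
 * InfiniBand port number in hex; hwloc stores it 1-based.  The device name
 * below is hypothetical. */
#if 0
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  char buf[16];
  FILE *f = fopen("/sys/class/net/ib0/dev_id", "r");
  if (f && fgets(buf, sizeof(buf), f))
    printf("Port %lu\n", strtoul(buf, NULL, 0) + 1);  /* dev_id is e.g. "0x0" */
  if (f)
    fclose(f);
  return 0;
}
#endif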
5169 static int
5170 hwloc_linuxfs_lookup_net_class(struct hwloc_backend *backend, unsigned osdev_flags)
5171 {
5172   struct hwloc_linux_backend_data_s *data = backend->private_data;
5173   int root_fd = data->root_fd;
5174   DIR *dir;
5175   struct dirent *dirent;
5176 
5177   dir = hwloc_opendir("/sys/class/net", root_fd);
5178   if (!dir)
5179     return 0;
5180 
5181   while ((dirent = readdir(dir)) != NULL) {
5182     char path[256];
5183     hwloc_obj_t obj, parent;
5184     int err;
5185 
5186     if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5187       continue;
5188 
5189     err = snprintf(path, sizeof(path), "/sys/class/net/%s", dirent->d_name);
5190     if ((size_t) err >= sizeof(path))
5191       continue;
5192     parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
5193     if (!parent)
5194       continue;
5195 
5196     obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_NETWORK, dirent->d_name);
5197 
5198     hwloc_linuxfs_net_class_fillinfos(root_fd, obj, path);
5199   }
5200 
5201   closedir(dir);
5202 
5203   return 0;
5204 }
5205 
5206 static void
5207 hwloc_linuxfs_infiniband_class_fillinfos(int root_fd,
5208                                          struct hwloc_obj *obj, const char *osdevpath)
5209 {
5210   char path[256];
5211   char guidvalue[20];
5212   unsigned i,j;
5213 
5214   snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
5215   if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
5216     size_t len;
5217     len = strspn(guidvalue, "0123456789abcdefx:");
5218     guidvalue[len] = '\0';
5219     hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
5220   }
5221 
5222   snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
5223   if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
5224     size_t len;
5225     len = strspn(guidvalue, "0123456789abcdefx:");
5226     guidvalue[len] = '\0';
5227     hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
5228   }
5229 
5230   for(i=1; ; i++) {
5231     char statevalue[2];
5232     char lidvalue[11];
5233     char gidvalue[40];
5234 
5235     snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
5236     if (!hwloc_read_path_by_length(path, statevalue, sizeof(statevalue), root_fd)) {
5237       char statename[32];
5238       statevalue[1] = '\0'; /* only keep the first byte/digit */
5239       snprintf(statename, sizeof(statename), "Port%uState", i);
5240       hwloc_obj_add_info(obj, statename, statevalue);
5241     } else {
5242       /* no such port */
5243       break;
5244     }
5245 
5246     snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
5247     if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
5248       char lidname[32];
5249       size_t len;
5250       len = strspn(lidvalue, "0123456789abcdefx");
5251       lidvalue[len] = '\0';
5252       snprintf(lidname, sizeof(lidname), "Port%uLID", i);
5253       hwloc_obj_add_info(obj, lidname, lidvalue);
5254     }
5255 
5256     snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
5257     if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
5258       char lidname[32];
5259       size_t len;
5260       len = strspn(lidvalue, "0123456789");
5261       lidvalue[len] = '\0';
5262       snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
5263       hwloc_obj_add_info(obj, lidname, lidvalue);
5264     }
5265 
5266     for(j=0; ; j++) {
5267       snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
5268       if (!hwloc_read_path_by_length(path, gidvalue, sizeof(gidvalue), root_fd)) {
5269         char gidname[32];
5270         size_t len;
5271         len = strspn(gidvalue, "0123456789abcdefx:");
5272         gidvalue[len] = '\0';
5273         if (strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
5274           /* only keep initialized GIDs */
5275           snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
5276           hwloc_obj_add_info(obj, gidname, gidvalue);
5277         }
5278       } else {
5279         /* no such port */
5280         break;
5281       }
5282     }
5283   }
5284 }
5285 
5286 static int
5287 hwloc_linuxfs_lookup_infiniband_class(struct hwloc_backend *backend, unsigned osdev_flags)
5288 {
5289   struct hwloc_linux_backend_data_s *data = backend->private_data;
5290   int root_fd = data->root_fd;
5291   DIR *dir;
5292   struct dirent *dirent;
5293 
5294   dir = hwloc_opendir("/sys/class/infiniband", root_fd);
5295   if (!dir)
5296     return 0;
5297 
5298   while ((dirent = readdir(dir)) != NULL) {
5299     char path[256];
5300     hwloc_obj_t obj, parent;
5301     int err;
5302 
5303     if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5304       continue;
5305 
5306     /* blocklist scif* fake devices */
5307     if (!strncmp(dirent->d_name, "scif", 4))
5308       continue;
5309 
5310     err = snprintf(path, sizeof(path), "/sys/class/infiniband/%s", dirent->d_name);
5311     if ((size_t) err >= sizeof(path))
5312       continue;
5313     parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
5314     if (!parent)
5315       continue;
5316 
5317     obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_OPENFABRICS, dirent->d_name);
5318 
5319     hwloc_linuxfs_infiniband_class_fillinfos(root_fd, obj, path);
5320   }
5321 
5322   closedir(dir);
5323 
5324   return 0;
5325 }
5326 
5327 static void
5328 hwloc_linuxfs_mic_class_fillinfos(int root_fd,
5329                                   struct hwloc_obj *obj, const char *osdevpath)
5330 {
5331   char path[256];
5332   char family[64];
5333   char sku[64];
5334   char sn[64];
5335   char string[20];
5336 
5337   obj->subtype = strdup("MIC");
5338 
5339   snprintf(path, sizeof(path), "%s/family", osdevpath);
5340   if (!hwloc_read_path_by_length(path, family, sizeof(family), root_fd)) {
5341     char *eol = strchr(family, '\n');
5342     if (eol)
5343       *eol = 0;
5344     hwloc_obj_add_info(obj, "MICFamily", family);
5345   }
5346 
5347   snprintf(path, sizeof(path), "%s/sku", osdevpath);
5348   if (!hwloc_read_path_by_length(path, sku, sizeof(sku), root_fd)) {
5349     char *eol = strchr(sku, '\n');
5350     if (eol)
5351       *eol = 0;
5352     hwloc_obj_add_info(obj, "MICSKU", sku);
5353   }
5354 
5355   snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
5356   if (!hwloc_read_path_by_length(path, sn, sizeof(sn), root_fd)) {
5357     char *eol;
5358     eol = strchr(sn, '\n');
5359     if (eol)
5360       *eol = 0;
5361     hwloc_obj_add_info(obj, "MICSerialNumber", sn);
5362   }
5363 
5364   snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
5365   if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
5366     unsigned long count = strtoul(string, NULL, 16);
5367     snprintf(string, sizeof(string), "%lu", count);
5368     hwloc_obj_add_info(obj, "MICActiveCores", string);
5369   }
5370 
5371   snprintf(path, sizeof(path), "%s/memsize", osdevpath);
5372   if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
5373     unsigned long count = strtoul(string, NULL, 16);
5374     snprintf(string, sizeof(string), "%lu", count);
5375     hwloc_obj_add_info(obj, "MICMemorySize", string);
5376   }
5377 }
5378 
5379 static int
5380 hwloc_linuxfs_lookup_mic_class(struct hwloc_backend *backend, unsigned osdev_flags)
5381 {
5382   struct hwloc_linux_backend_data_s *data = backend->private_data;
5383   int root_fd = data->root_fd;
5384   unsigned idx;
5385   DIR *dir;
5386   struct dirent *dirent;
5387 
5388   dir = hwloc_opendir("/sys/class/mic", root_fd);
5389   if (!dir)
5390     return 0;
5391 
5392   while ((dirent = readdir(dir)) != NULL) {
5393     char path[256];
5394     hwloc_obj_t obj, parent;
5395 
5396     if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5397       continue;
5398     if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
5399       continue;
5400 
5401     snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
5402     parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
5403     if (!parent)
5404       continue;
5405 
5406     obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_COPROC, dirent->d_name);
5407 
5408     hwloc_linuxfs_mic_class_fillinfos(root_fd, obj, path);
5409   }
5410 
5411   closedir(dir);
5412 
5413   return 0;
5414 }
5415 
5416 static int
5417 hwloc_linuxfs_lookup_drm_class(struct hwloc_backend *backend, unsigned osdev_flags)
5418 {
5419   struct hwloc_linux_backend_data_s *data = backend->private_data;
5420   int root_fd = data->root_fd;
5421   DIR *dir;
5422   struct dirent *dirent;
5423 
5424   dir = hwloc_opendir("/sys/class/drm", root_fd);
5425   if (!dir)
5426     return 0;
5427 
5428   while ((dirent = readdir(dir)) != NULL) {
5429     char path[256];
5430     hwloc_obj_t parent;
5431     struct stat stbuf;
5432     int err;
5433 
5434     if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5435       continue;
5436 
5437     /* only keep main devices, not subdevices for outputs */
5438     err = snprintf(path, sizeof(path), "/sys/class/drm/%s/dev", dirent->d_name);
5439     if ((size_t) err < sizeof(path)
5440         && hwloc_stat(path, &stbuf, root_fd) < 0)
5441       continue;
5442 
5443     /* Most drivers expose a card%d device.
5444      * Some (free?) drivers also expose render%d.
5445      * Old kernels also have a controlD%d. On recent kernels, it's a symlink to card%d (deprecated?).
5446      * There can also exist some output-specific files such as card0-DP-1.
5447      *
5448      * All these aren't very useful compared to CUDA/OpenCL/...
5449      * Hence the DRM class is only enabled when KEEP_ALL.
5450      *
5451      * FIXME: We might want to filter everything out but card%d.
5452      * Maybe look at the driver (read the end of /sys/class/drm/<name>/device/driver symlink),
5453      * to decide whether card%d could be useful (likely not for NVIDIA).
5454      */
5455 
5456     err = snprintf(path, sizeof(path), "/sys/class/drm/%s", dirent->d_name);
5457     if ((size_t) err >= sizeof(path))
5458       continue;
5459     parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
5460     if (!parent)
5461       continue;
5462 
5463     hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_GPU, dirent->d_name);
5464   }
5465 
5466   closedir(dir);
5467 
5468   return 0;
5469 }
5470 
5471 static int
5472 hwloc_linuxfs_lookup_dma_class(struct hwloc_backend *backend, unsigned osdev_flags)
5473 {
5474   struct hwloc_linux_backend_data_s *data = backend->private_data;
5475   int root_fd = data->root_fd;
5476   DIR *dir;
5477   struct dirent *dirent;
5478 
5479   dir = hwloc_opendir("/sys/class/dma", root_fd);
5480   if (!dir)
5481     return 0;
5482 
5483   while ((dirent = readdir(dir)) != NULL) {
5484     char path[256];
5485     hwloc_obj_t parent;
5486     int err;
5487 
5488     if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5489       continue;
5490 
5491     err = snprintf(path, sizeof(path), "/sys/class/dma/%s", dirent->d_name);
5492     if ((size_t) err >= sizeof(path))
5493       continue;
5494     parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
5495     if (!parent)
5496       continue;
5497 
5498     hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_DMA, dirent->d_name);
5499   }
5500 
5501   closedir(dir);
5502 
5503   return 0;
5504 }
5505 
5506 struct hwloc_firmware_dmi_mem_device_header {
5507   unsigned char type;
5508   unsigned char length;
5509   unsigned char handle[2];
5510   unsigned char phy_mem_handle[2];
5511   unsigned char mem_err_handle[2];
5512   unsigned char tot_width[2];
5513   unsigned char dat_width[2];
5514   unsigned char size[2];
5515   unsigned char ff;
5516   unsigned char dev_set;
5517   unsigned char dev_loc_str_num;
5518   unsigned char bank_loc_str_num;
5519   unsigned char mem_type;
5520   unsigned char type_detail[2];
5521   unsigned char speed[2];
5522   unsigned char manuf_str_num;
5523   unsigned char serial_str_num;
5524   unsigned char asset_tag_str_num;
5525   unsigned char part_num_str_num;
5526   /* don't include the following fields since we don't need them,
5527    * some old implementations may miss them.
5528    */
5529 };
5530 
5531 static int check_dmi_entry(const char *buffer)
5532 {
5533   /* reject empty strings */
5534   if (!*buffer)
5535     return 0;
5536   /* reject strings of spaces (at least Dell use this for empty memory slots) */
5537   if (strspn(buffer, " ") == strlen(buffer))
5538     return 0;
5539   return 1;
5540 }
5541 
5542 static int
5543 hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
5544                                         unsigned idx, const char *path, FILE *fd,
5545                                         struct hwloc_firmware_dmi_mem_device_header *header)
5546 {
5547   unsigned slen;
5548   char buffer[256]; /* enough for memory device strings, or at least for each of them */
5549   unsigned foff; /* offset in raw file */
5550   unsigned boff; /* offset in buffer read from raw file */
5551   unsigned i;
5552   struct hwloc_info_s *infos = NULL;
5553   unsigned infos_count = 0;
5554   hwloc_obj_t misc;
5555   int foundinfo = 0;
5556 
5557   /* start after the header */
5558   foff = header->length;
5559   i = 1;
5560   while (1) {
5561     /* read one buffer */
5562     if (fseek(fd, foff, SEEK_SET) < 0)
5563       break;
5564     if (!fgets(buffer, sizeof(buffer), fd))
5565       break;
5566     /* read string at the beginning of the buffer */
5567     boff = 0;
5568     while (1) {
5569       /* stop on empty string */
5570       if (!buffer[boff])
5571         goto done;
5572       /* stop if this string goes to the end of the buffer */
5573       slen = strlen(buffer+boff);
5574       if (boff + slen+1 == sizeof(buffer))
5575         break;
5576       /* string didn't get truncated, should be OK */
5577       if (i == header->manuf_str_num) {
5578         if (check_dmi_entry(buffer+boff)) {
5579           hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
5580           foundinfo = 1;
5581         }
5582       } else if (i == header->serial_str_num) {
5583         if (check_dmi_entry(buffer+boff)) {
5584           hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
5585           foundinfo = 1;
5586         }
5587       } else if (i == header->asset_tag_str_num) {
5588         if (check_dmi_entry(buffer+boff)) {
5589           hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
5590           foundinfo = 1;
5591         }
5592       } else if (i == header->part_num_str_num) {
5593         if (check_dmi_entry(buffer+boff)) {
5594           hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
5595           foundinfo = 1;
5596         }
5597       } else if (i == header->dev_loc_str_num) {
5598         if (check_dmi_entry(buffer+boff)) {
5599           hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
5600           /* only a location, not an actual info about the device */
5601         }
5602       } else if (i == header->bank_loc_str_num) {
5603         if (check_dmi_entry(buffer+boff)) {
5604           hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
5605           /* only a location, not an actual info about the device */
5606         }
5607       } else {
5608         goto done;
5609       }
5610       /* next string in buffer */
5611       boff += slen+1;
5612       i++;
5613     }
5614     /* couldn't read a single full string from that buffer, we're screwed */
5615     if (!boff) {
5616       fprintf(stderr, "hwloc could not read a DMI firmware entry #%u in %s\n",
5617               i, path);
5618       break;
5619     }
5620     /* reread buffer after previous string */
5621     foff += boff;
5622   }
5623 
5624 done:
5625   if (!foundinfo) {
5626     /* found no actual info about the device. if there's only location info, the slot may be empty */
5627     goto out_with_infos;
5628   }
5629 
5630   misc = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MISC, idx);
5631   if (!misc)
5632     goto out_with_infos;
5633 
5634   misc->subtype = strdup("MemoryModule");
5635 
5636   hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
5637   /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
5638    * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
5639    * with the vendor, and it's hard to be 100% sure 'B' is second socket.
5640    * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
5641    * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
5642    */
5643   hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
5644   return 1;
5645 
5646  out_with_infos:
5647   hwloc__free_infos(infos, infos_count);
5648   return 0;
5649 }
5650 
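/* Illustrative sketch (not compiled as part of this file): a raw SMBIOS
 * type-17 entry is a formatted area of header->length bytes followed by a
 * set of NUL-terminated strings ending with an extra NUL; string number N in
 * the header (manuf_str_num, serial_str_num, ...) refers to the Nth string
 * in that set.  dmi_string() and the fabricated buffer below are
 * hypothetical helpers, not hwloc API. */
#if 0
#include <stdio.h>
#include <string.h>

static const char *dmi_string(const char *raw, size_t rawlen,
                              unsigned char hdrlen, unsigned num)
{
  size_t off = hdrlen;                  /* strings start after the formatted area */
  unsigned i;
  if (!num)
    return NULL;                        /* string number 0 means "no string" */
  for (i = 1; off < rawlen && raw[off]; i++) {
    if (i == num)
      return raw + off;
    off += strlen(raw + off) + 1;       /* jump to the next string */
  }
  return NULL;
}

int main(void)
{
  /* fabricated entry: 4-byte formatted area, two strings, then the final NUL */
  char raw[4 + sizeof("DIMM_A1") + sizeof("BANK 0") + 1];
  memset(raw, 0, sizeof(raw));
  memcpy(raw + 4, "DIMM_A1", sizeof("DIMM_A1"));
  memcpy(raw + 4 + sizeof("DIMM_A1"), "BANK 0", sizeof("BANK 0"));
  printf("%s\n", dmi_string(raw, sizeof(raw), 4, 2));   /* prints "BANK 0" */
  return 0;
}
#endif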
5651 static int
5652 hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
5653                                     struct hwloc_linux_backend_data_s *data)
5654 {
5655   char path[128];
5656   unsigned i;
5657 
5658   for(i=0; ; i++) {
5659     FILE *fd;
5660     struct hwloc_firmware_dmi_mem_device_header header;
5661     int err;
5662 
5663     snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
5664     fd = hwloc_fopen(path, "r", data->root_fd);
5665     if (!fd)
5666       break;
5667 
5668     err = fread(&header, sizeof(header), 1, fd);
5669     if (err != 1) {
5670       fclose(fd);
5671       break;
5672     }
5673     if (header.length < sizeof(header)) {
5674       /* invalid, or too old entry/spec that doesn't contain what we need */
5675       fclose(fd);
5676       break;
5677     }
5678 
5679     hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);
5680 
5681     fclose(fd);
5682   }
5683 
5684   return 0;
5685 }
5686 
5687 #ifdef HWLOC_HAVE_LINUXPCI
5688 
5689 #define HWLOC_PCI_REVISION_ID 0x08
5690 #define HWLOC_PCI_CAP_ID_EXP 0x10
5691 #define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
5692 
5693 static int
5694 hwloc_linuxfs_pci_look_pcidevices(struct hwloc_backend *backend)
5695 {
5696   struct hwloc_linux_backend_data_s *data = backend->private_data;
5697   struct hwloc_topology *topology = backend->topology;
5698   hwloc_obj_t tree = NULL;
5699   int root_fd = data->root_fd;
5700   DIR *dir;
5701   struct dirent *dirent;
5702 
5703   /* We could lookup /sys/devices/pci.../.../busid1/.../busid2 recursively
5704    * to build the hierarchy of bridges/devices directly.
5705    * But that would require readdirs in all bridge sysfs subdirectories.
5706    * Do a single readdir in the linear list in /sys/bus/pci/devices/...
5707    * and build the hierarchy manually instead.
5708    */
5709   dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
5710   if (!dir)
5711     return 0;
5712 
5713   while ((dirent = readdir(dir)) != NULL) {
5714 #define CONFIG_SPACE_CACHESIZE 256
5715     unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
5716     unsigned domain, bus, dev, func;
5717     unsigned short class_id;
5718     hwloc_obj_type_t type;
5719     hwloc_obj_t obj;
5720     struct hwloc_pcidev_attr_s *attr;
5721     unsigned offset;
5722     char path[64];
5723     char value[16];
5724     size_t ret;
5725     int fd, err;
5726 
5727     if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
5728       continue;
5729 
5730     /* initialize the config space in case we fail to read it (missing permissions, etc). */
5731     memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
5732     err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
5733     if ((size_t) err < sizeof(path)) {
5734       /* don't use hwloc_read_path_by_length() because we don't want the ending \0 */
5735       fd = hwloc_open(path, root_fd);
5736       if (fd >= 0) {
5737         ret = read(fd, config_space_cache, CONFIG_SPACE_CACHESIZE);
5738         (void) ret; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
5739         close(fd);
5740       }
5741     }
5742 
5743     class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
5744     err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
5745     if ((size_t) err < sizeof(path)
5746         && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5747       class_id = strtoul(value, NULL, 16) >> 8;
5748 
5749     type = hwloc_pcidisc_check_bridge_type(class_id, config_space_cache);
5750 
5751     /* filtered? */
5752     if (type == HWLOC_OBJ_PCI_DEVICE) {
5753       enum hwloc_type_filter_e filter;
5754       hwloc_topology_get_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, &filter);
5755       if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
5756         continue;
5757       if (filter == HWLOC_TYPE_FILTER_KEEP_IMPORTANT
5758           && !hwloc_filter_check_pcidev_subtype_important(class_id))
5759         continue;
5760     } else if (type == HWLOC_OBJ_BRIDGE) {
5761       enum hwloc_type_filter_e filter;
5762       hwloc_topology_get_type_filter(topology, HWLOC_OBJ_BRIDGE, &filter);
5763       if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
5764         continue;
5765       /* HWLOC_TYPE_FILTER_KEEP_IMPORTANT filtered later in the core */
5766     }
5767 
5768     obj = hwloc_alloc_setup_object(topology, type, HWLOC_UNKNOWN_INDEX);
5769     if (!obj)
5770       break;
5771     attr = &obj->attr->pcidev;
5772 
5773     attr->domain = domain;
5774     attr->bus = bus;
5775     attr->dev = dev;
5776     attr->func = func;
5777 
5778     /* default (unknown) values */
5779     attr->vendor_id = 0;
5780     attr->device_id = 0;
5781     attr->class_id = class_id;
5782     attr->revision = 0;
5783     attr->subvendor_id = 0;
5784     attr->subdevice_id = 0;
5785     attr->linkspeed = 0;
5786 
5787     err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
5788     if ((size_t) err < sizeof(path)
5789         && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5790       attr->vendor_id = strtoul(value, NULL, 16);
5791 
5792     err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
5793     if ((size_t) err < sizeof(path)
5794         && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5795       attr->device_id = strtoul(value, NULL, 16);
5796 
5797     err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
5798     if ((size_t) err < sizeof(path)
5799         && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5800       attr->subvendor_id = strtoul(value, NULL, 16);
5801 
5802     err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
5803     if ((size_t) err < sizeof(path)
5804         && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5805       attr->subdevice_id = strtoul(value, NULL, 16);
5806 
5807     /* bridge specific attributes */
5808     if (type == HWLOC_OBJ_BRIDGE) {
5809       if (hwloc_pcidisc_setup_bridge_attr(obj, config_space_cache) < 0)
5810         continue;
5811     }
5812 
5813     /* get the revision */
5814     attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
5815 
5816     /* try to get the link speed */
5817     offset = hwloc_pcidisc_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
5818     if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE)
5819       hwloc_pcidisc_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
5820 
5821     hwloc_pcidisc_tree_insert_by_busid(&tree, obj);
5822   }
5823 
5824   closedir(dir);
5825 
5826   hwloc_pcidisc_tree_attach(backend->topology, tree);
5827   return 0;
5828 }
5829 
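/* Illustrative sketch (not compiled as part of this file): the 16-bit
 * class_id used above comes from the sysfs "class" file, which holds a
 * 24-bit 0xBBSSPP value (base class, subclass, prog-if); shifting out the
 * prog-if byte leaves the [base class | subclass] pair.  The busid below is
 * hypothetical. */
#if 0
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  char buf[16];
  unsigned short class_id = 0;
  FILE *f = fopen("/sys/bus/pci/devices/0000:03:00.0/class", "r");
  if (f && fgets(buf, sizeof(buf), f))
    class_id = (unsigned short) (strtoul(buf, NULL, 16) >> 8);
  if (f)
    fclose(f);
  printf("class id: 0x%04x\n", class_id);   /* e.g. 0x0200 for an Ethernet controller */
  return 0;
}
#endif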
5830 static hwloc_obj_t
5831 hwloc_linuxfs_pci_find_pcislot_obj(struct hwloc_obj *tree,
5832                                    unsigned domain, unsigned bus, unsigned dev)
5833 {
5834   for ( ; tree; tree = tree->next_sibling) {
5835     if (tree->type == HWLOC_OBJ_PCI_DEVICE
5836         || (tree->type == HWLOC_OBJ_BRIDGE
5837             && tree->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI)) {
5838       if (tree->attr->pcidev.domain == domain
5839           && tree->attr->pcidev.bus == bus
5840           && tree->attr->pcidev.dev == dev
5841           && tree->attr->pcidev.func == 0)
5842         /* that's the right bus id */
5843         return tree;
5844       if (tree->attr->pcidev.domain > domain
5845           || (tree->attr->pcidev.domain == domain
5846               && tree->attr->pcidev.bus > bus))
5847         /* bus id too high, won't find anything later */
5848         return NULL;
5849       if (tree->type == HWLOC_OBJ_BRIDGE
5850           && tree->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
5851           && tree->attr->bridge.downstream.pci.domain == domain
5852           && tree->attr->bridge.downstream.pci.secondary_bus <= bus
5853           && tree->attr->bridge.downstream.pci.subordinate_bus >= bus)
5854         /* not the right bus id, but it's included in the bus below that bridge */
5855         return hwloc_linuxfs_pci_find_pcislot_obj(tree->io_first_child, domain, bus, dev);
5856 
5857     } else if (tree->type == HWLOC_OBJ_BRIDGE
5858                && tree->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_PCI
5859                && tree->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
5860                /* non-PCI to PCI bridge, just look at the subordinate bus */
5861                && tree->attr->bridge.downstream.pci.domain == domain
5862                && tree->attr->bridge.downstream.pci.secondary_bus <= bus
5863                && tree->attr->bridge.downstream.pci.subordinate_bus >= bus) {
5864       /* contains our bus, recurse */
5865       return hwloc_linuxfs_pci_find_pcislot_obj(tree->io_first_child, domain, bus, dev);
5866     }
5867   }
5868   return NULL;
5869 }
5870 
5871 static int
5872 hwloc_linuxfs_pci_look_pcislots(struct hwloc_backend *backend)
5873 {
5874   struct hwloc_topology *topology = backend->topology;
5875   struct hwloc_linux_backend_data_s *data = backend->private_data;
5876   int root_fd = data->root_fd;
5877   DIR *dir;
5878   struct dirent *dirent;
5879 
5880   dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
5881   if (dir) {
5882     while ((dirent = readdir(dir)) != NULL) {
5883       char path[64];
5884       char buf[64];
5885       unsigned domain, bus, dev;
5886       int err;
5887 
5888       if (dirent->d_name[0] == '.')
5889         continue;
5890       err = snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
5891       if ((size_t) err < sizeof(path)
5892           && !hwloc_read_path_by_length(path, buf, sizeof(buf), root_fd)
5893           && sscanf(buf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
5894         hwloc_obj_t obj = hwloc_linuxfs_pci_find_pcislot_obj(hwloc_get_root_obj(topology)->io_first_child, domain, bus, dev);
5895         if (obj) {
5896           while (obj && obj->attr->pcidev.dev == dev /* siblings have the same domain+bus */) {
5897             hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
5898             obj = obj->next_sibling;
5899           }
5900         }
5901       }
5902     }
5903     closedir(dir);
5904   }
5905 
5906   return 0;
5907 }
5908 #endif /* HWLOC_HAVE_LINUXPCI */
5909 
5910 static int
5911 hwloc_look_linuxfs_io(struct hwloc_backend *backend)
5912 {
5913   struct hwloc_topology *topology = backend->topology;
5914   struct hwloc_linux_backend_data_s *data = NULL;
5915   struct hwloc_backend *tmpbackend;
5916   enum hwloc_type_filter_e pfilter, bfilter, ofilter, mfilter;
5917   int root_fd = -1;
5918 #ifdef HWLOC_HAVE_LINUXPCI
5919   struct hwloc_obj *tmp;
5920   int needpcidiscovery;
5921 #endif
5922 
5923   hwloc_topology_get_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, &pfilter);
5924   hwloc_topology_get_type_filter(topology, HWLOC_OBJ_BRIDGE, &bfilter);
5925   hwloc_topology_get_type_filter(topology, HWLOC_OBJ_OS_DEVICE, &ofilter);
5926   hwloc_topology_get_type_filter(topology, HWLOC_OBJ_MISC, &mfilter);
5927   if (bfilter == HWLOC_TYPE_FILTER_KEEP_NONE
5928       && pfilter == HWLOC_TYPE_FILTER_KEEP_NONE
5929       && ofilter == HWLOC_TYPE_FILTER_KEEP_NONE
5930       && mfilter == HWLOC_TYPE_FILTER_KEEP_NONE)
5931     return 0;
5932 
5933   /* hackily find the linux backend to steal its private_data (for fsroot) */
5934   tmpbackend = topology->backends;
5935   while (tmpbackend) {
5936     if (tmpbackend->component == &hwloc_linux_disc_component) {
5937       data = tmpbackend->private_data;
5938       break;
5939     }
5940     tmpbackend = tmpbackend->next;
5941   }
5942   if (!data) {
5943     hwloc_debug("linuxio failed to find linux backend private_data, aborting its discovery()\n");
5944     return -1;
5945   }
5946   backend->private_data = data;
5947   root_fd = data->root_fd;
5948   hwloc_debug("linuxio backend stole linux backend root_fd %d\n", root_fd);
5949 
  if (bfilter != HWLOC_TYPE_FILTER_KEEP_NONE
      || pfilter != HWLOC_TYPE_FILTER_KEEP_NONE) {
#ifdef HWLOC_HAVE_LINUXPCI
    /* don't rediscover PCI devices if another backend already did it
     * (they are attached to root until later in the core discovery)
     */
    needpcidiscovery = 1;
    for_each_io_child(tmp, hwloc_get_root_obj(topology)) {
      if (tmp->type == HWLOC_OBJ_PCI_DEVICE
          || (tmp->type == HWLOC_OBJ_BRIDGE && tmp->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI)) {
        hwloc_debug("%s", "PCI objects already added, ignoring linuxio PCI discovery.\n");
        needpcidiscovery = 0;
        break;
      }
    }

    if (needpcidiscovery)
      hwloc_linuxfs_pci_look_pcidevices(backend);

    hwloc_linuxfs_pci_look_pcislots(backend);
#endif /* HWLOC_HAVE_LINUXPCI */
  }
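  /* OS device discovery.  The block, net, infiniband and mic classes are
   * always scanned when OS devices are kept; virtual devices are only
   * reported when the HWLOC_VIRTUAL_LINUX_OSDEV environment variable is set,
   * USB devices only when the filter is KEEP_ALL, and the drm/dma classes
   * are skipped under KEEP_IMPORTANT since they are not deemed important.
   */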
  if (ofilter != HWLOC_TYPE_FILTER_KEEP_NONE) {
    unsigned osdev_flags = 0;
    if (getenv("HWLOC_VIRTUAL_LINUX_OSDEV"))
      osdev_flags |= HWLOC_LINUXFS_FIND_OSDEV_FLAG_VIRTUAL;
    if (ofilter == HWLOC_TYPE_FILTER_KEEP_ALL)
      osdev_flags |= HWLOC_LINUXFS_FIND_OSDEV_FLAG_USB;

    hwloc_linuxfs_lookup_block_class(backend, osdev_flags);
    hwloc_linuxfs_lookup_net_class(backend, osdev_flags);
    hwloc_linuxfs_lookup_infiniband_class(backend, osdev_flags);
    hwloc_linuxfs_lookup_mic_class(backend, osdev_flags);
    if (ofilter != HWLOC_TYPE_FILTER_KEEP_IMPORTANT) {
      hwloc_linuxfs_lookup_drm_class(backend, osdev_flags);
      hwloc_linuxfs_lookup_dma_class(backend, osdev_flags);
    }
  }
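  /* When Misc objects are kept, also import memory-module information from
   * the firmware DMI/SMBIOS tables exposed under /sys/firmware/dmi/ (see
   * hwloc__get_firmware_dmi_memory_info(), defined elsewhere in this file).
   * Assumption worth stating: these entries end up as Misc objects
   * describing the installed memory modules.
   */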
  if (mfilter != HWLOC_TYPE_FILTER_KEEP_NONE) {
    hwloc__get_firmware_dmi_memory_info(topology, data);
  }

  return 0;
}
static struct hwloc_backend *
hwloc_linuxio_component_instantiate(struct hwloc_disc_component *component,
                                    const void *_data1 __hwloc_attribute_unused,
                                    const void *_data2 __hwloc_attribute_unused,
                                    const void *_data3 __hwloc_attribute_unused)
{
  struct hwloc_backend *backend;

  backend = hwloc_backend_alloc(component);
  if (!backend)
    return NULL;
  backend->discover = hwloc_look_linuxfs_io;

  /* backend->is_thissystem should be what the linux backend has,
   * but it is actually useless since both backends change the main topology->is_thissystem in the same way.
   */

  /* backend->private_data will point to the main linux backend's private_data after load(),
   * once the main linux component is guaranteed to have been instantiated.
   * It remains valid until the main linux component gets disabled during topology destroy.
   */
  return backend;
}
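/* Registration glue for the linuxio discovery component.
 *
 * A hedged reading of the initializers below, assuming the usual field order
 * of struct hwloc_disc_component (type, name, excludes, instantiate,
 * priority, enabled_by_default, next): a MISC-type component named
 * "linuxio", not to be combined with GLOBAL components, instantiated by
 * hwloc_linuxio_component_instantiate(), with priority 19 so that it runs
 * after the pci component, enabled by default.
 */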
static struct hwloc_disc_component hwloc_linuxio_disc_component = {
  HWLOC_DISC_COMPONENT_TYPE_MISC,
  "linuxio",
  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
  hwloc_linuxio_component_instantiate,
  19, /* after pci */
  1,
  NULL
};

const struct hwloc_component hwloc_linuxio_component = {
  HWLOC_COMPONENT_ABI,
  NULL, NULL,
  HWLOC_COMPONENT_TYPE_DISC,
  0,
  &hwloc_linuxio_disc_component
};

#endif /* HWLOC_HAVE_LINUXIO */
