This source file includes the following definitions.
- get_ptag
- get_cookie
- mca_btl_ugni_get_nic_address
- mca_btl_ugni_device_init
- mca_btl_ugni_device_fini
- mca_btl_ugni_send_modex
- mca_btl_ugni_fini
- mca_btl_ugni_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 #include "btl_ugni.h"
18 #include "btl_ugni_endpoint.h"
19 #include "btl_ugni_frag.h"
20
21 #include "opal/class/opal_list.h"
22 #include "opal/dss/dss.h"
23 #include "opal/mca/pmix/pmix.h"
24 #include "opal/util/bit_ops.h"
25 #include "opal/mca/hwloc/base/base.h"
26
27 static inline int get_ptag(uint8_t *out_ptag)
28 {
29
30 char *ptr;
31 uint8_t tmp_ptag;
32
33 if (NULL == (ptr = getenv("PMI_GNI_PTAG"))) {
34
35 return OPAL_ERR_NOT_FOUND;
36 }
37 errno = 0;
38 tmp_ptag = (uint8_t)strtoul (ptr, (char **)NULL, 10);
39 if (0 != errno) {
40
41 return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
42 }
43 *out_ptag = tmp_ptag;
44 return OPAL_SUCCESS;
45 }
46
47 static inline int get_cookie (uint32_t *out_cookie)
48 {
49
50 char *ptr;
51 uint32_t tmp_cookie;
52
53 if (NULL == (ptr = getenv("PMI_GNI_COOKIE"))) {
54
55 return OPAL_ERR_NOT_FOUND;
56 }
57 errno = 0;
58 tmp_cookie = (uint32_t) strtoul (ptr, NULL, 10);
59 if (0 != errno) {
60
61 return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
62 }
63
64 *out_cookie = tmp_cookie;
65 return OPAL_SUCCESS;
66 }
67
68 static unsigned int mca_btl_ugni_get_nic_address(int device_id)
69 {
70 unsigned int address, cpu_id;
71 gni_return_t status;
72 int i, alps_dev_id = -1;
73 char *token,*p_ptr;
74
75 p_ptr = getenv("PMI_GNI_DEV_ID");
76 if (!p_ptr) {
77 status = GNI_CdmGetNicAddress(device_id, &address, &cpu_id);
78 if(status != GNI_RC_SUCCESS) {
79 opal_output (0, "FAILED:GNI_CdmGetNicAddress returned error %d", status);
80 return (unsigned int)-1;
81 }
82 return address;
83 }
84
85 while (NULL != (token = strtok(p_ptr, ":"))) {
86 alps_dev_id = atoi(token);
87 if (alps_dev_id == device_id) {
88 break;
89 }
90 p_ptr = NULL;
91 }
92
93 if (OPAL_UNLIKELY(-1 == alps_dev_id)) {
94 return (unsigned int)-1;
95 }
96
97 p_ptr = getenv("PMI_GNI_LOC_ADDR");
98 if (OPAL_UNLIKELY(NULL == p_ptr)) {
99 return (unsigned int)-1;
100 }
101
102 i = 0;
103 while (NULL != (token = strtok(p_ptr, ":"))) {
104 if (i == alps_dev_id) {
105 return strtoul (token, NULL, 10);
106 }
107 p_ptr = NULL;
108 ++i;
109 }
110
111 return (unsigned int)-1;
112 }
113
114 int mca_btl_ugni_device_init (mca_btl_ugni_device_t *device, int virtual_device_id)
115 {
116 uint32_t dev_pe_addr;
117 int rc;
118
119 OBJ_CONSTRUCT(&device->rdma_descs, opal_free_list_t);
120
121
122 rc = GNI_CdmCreate (mca_btl_ugni_component.cdm_id_base | virtual_device_id, mca_btl_ugni_component.ptag,
123 mca_btl_ugni_component.cookie, mca_btl_ugni_component.cdm_flags, &device->dev_cd_handle);
124 if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
125
126 BTL_VERBOSE(("Error: Creating communication domain %d for virtual device %d", rc, virtual_device_id));
127 return mca_btl_rc_ugni_to_opal (rc);
128 }
129
130 device->dev_index = virtual_device_id;
131
132
133 OPAL_OUTPUT((-1, "Got NIC Addr: 0x%08x, CPU ID: %d", mca_btl_ugni_component.dev_addr, 0));
134
135
136 rc = GNI_CdmAttach (device->dev_cd_handle, 0, &dev_pe_addr, &device->dev_handle);
137 if (GNI_RC_SUCCESS != rc) {
138 BTL_VERBOSE(("Error: Attaching to communication domain. rc = %d, virtual device = %d", rc, virtual_device_id));
139 return mca_btl_rc_ugni_to_opal (rc);
140 }
141
142 rc = opal_free_list_init (&device->rdma_descs, sizeof (mca_btl_ugni_rdma_desc_t),
143 64, OBJ_CLASS(mca_btl_ugni_rdma_desc_t), 0, 8, 0,
144 mca_btl_ugni_component.local_rdma_cq_size, 32,
145 NULL, 0, NULL, mca_btl_ugni_rdma_desc_init, (void *) device);
146 if (OPAL_SUCCESS != rc) {
147 OBJ_DESTRUCT(&device->rdma_descs);
148 return rc;
149 }
150
151 device->lock = 0;
152 device->dev_rdma_local_cq.gni_handle = 0;
153 device->dev_rdma_local_cq.active_operations = 0;
154 device->dev_rdma_local_irq_cq.gni_handle = 0;
155 device->dev_rdma_local_irq_cq.active_operations = 0;
156 device->dev_smsg_local_cq.gni_handle = 0;
157 device->dev_smsg_local_cq.active_operations = 0;
158 device->flushed = true;
159
160 return OPAL_SUCCESS;
161 }
162
163 int mca_btl_ugni_device_fini (mca_btl_ugni_device_t *dev)
164 {
165 int rc;
166
167 OBJ_DESTRUCT(&dev->rdma_descs);
168
169 if (0 != dev->dev_rdma_local_cq.gni_handle) {
170 GNI_CqDestroy (dev->dev_rdma_local_cq.gni_handle);
171 dev->dev_rdma_local_cq.gni_handle = 0;
172 }
173
174 if (0 != dev->dev_rdma_local_irq_cq.gni_handle) {
175 GNI_CqDestroy (dev->dev_rdma_local_irq_cq.gni_handle);
176 dev->dev_rdma_local_irq_cq.gni_handle = 0;
177 }
178
179 if (0 != dev->dev_smsg_local_cq.gni_handle) {
180 GNI_CqDestroy (dev->dev_smsg_local_cq.gni_handle);
181 dev->dev_smsg_local_cq.gni_handle = 0;
182 }
183
184 rc = GNI_CdmDestroy (dev->dev_cd_handle);
185 if (GNI_RC_SUCCESS != rc) {
186 BTL_VERBOSE(("error destroying cdm handle"));
187 }
188
189 return OPAL_SUCCESS;
190 }
191
192
193
194
195
196 static int mca_btl_ugni_send_modex (void)
197 {
198 struct mca_btl_ugni_modex_t modex;
199 uint32_t modex_size;
200 char *modex_msg;
201 int rc;
202
203 modex_size = sizeof (struct mca_btl_ugni_modex_t);
204
205 modex_msg = (char *) malloc (modex_size);
206 if (NULL == modex_msg) {
207 OPAL_OUTPUT((-1, "Error allocating memory for modex @ %s:%d",
208 __FILE__, __LINE__));
209 return OPAL_ERR_OUT_OF_RESOURCE;
210 }
211
212 modex.addr = mca_btl_ugni_component.dev_addr;
213 modex.id = mca_btl_ugni_component.cdm_id_base;
214
215 BTL_VERBOSE(("sending modex. addr: %d, id: %d", modex.addr, modex.id));
216
217 memcpy ((void *) modex_msg, (void *) &modex, modex_size);
218
219
220
221
222
223
224 OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
225 &mca_btl_ugni_component.super.btl_version,
226 modex_msg, modex_size);
227
228 free (modex_msg);
229
230 return rc;
231 }
232
233 int mca_btl_ugni_fini (void)
234 {
235 return OPAL_SUCCESS;
236 }
237
238 int mca_btl_ugni_init (void)
239 {
240 int32_t pid_max = 32768;
241 int rc, bit;
242 FILE *fh;
243
244 if (0 == mca_btl_ugni_component.virtual_device_count) {
245 int core_count;
246
247 (void) opal_hwloc_base_get_topology ();
248 core_count = hwloc_get_nbobjs_by_type (opal_hwloc_topology, HWLOC_OBJ_CORE);
249
250 if (core_count <= opal_process_info.num_local_peers || !opal_using_threads()) {
251
252
253 mca_btl_ugni_component.virtual_device_count = 1;
254 } else {
255 mca_btl_ugni_component.virtual_device_count = core_count / (opal_process_info.num_local_peers + 1);
256 }
257 }
258
259 if (MCA_BTL_UGNI_MAX_DEV_HANDLES < mca_btl_ugni_component.virtual_device_count) {
260 mca_btl_ugni_component.virtual_device_count = MCA_BTL_UGNI_MAX_DEV_HANDLES;
261 }
262
263 if (0 == mca_btl_ugni_component.local_rdma_cq_size) {
264 if (1 == mca_btl_ugni_component.virtual_device_count) {
265 mca_btl_ugni_component.local_rdma_cq_size = 2048;
266 } else {
267 mca_btl_ugni_component.local_rdma_cq_size = 256;
268 }
269 }
270
271 if ((mca_btl_ugni_component.virtual_device_count * (1 + opal_process_info.num_local_peers)) < 122) {
272
273
274 mca_base_var_source_t source = MCA_BASE_VAR_SOURCE_DEFAULT;
275
276 mca_base_var_get_value (mca_btl_ugni_component.cdm_flags_id, NULL, &source, NULL);
277 if (MCA_BASE_VAR_SOURCE_DEFAULT == source) {
278 BTL_VERBOSE(("disabling shared FMA sharing"));
279
280 mca_btl_ugni_component.cdm_flags &= ~GNI_CDM_MODE_FMA_SHARED;
281 mca_btl_ugni_component.cdm_flags |= GNI_CDM_MODE_FMA_DEDICATED;
282 }
283 }
284
285 fh = fopen ("/proc/sys/kernel/pid_max", "r");
286 if (NULL != fh) {
287 fscanf (fh, "%d", &pid_max);
288 fclose (fh);
289 }
290
291
292
293
294 bit = opal_hibit (pid_max, 31);
295 if (bit >= 31) {
296 mca_btl_ugni_component.virtual_device_count = 1;
297 mca_btl_ugni_component.cdm_id_base = getpid();
298 } else if (bit >= 30 && mca_btl_ugni_component.virtual_device_count > 2) {
299 mca_btl_ugni_component.virtual_device_count = 2;
300 mca_btl_ugni_component.cdm_id_base = getpid() << 1;
301 } else {
302 mca_btl_ugni_component.cdm_id_base = getpid() << 8;
303 }
304
305
306
307 rc = get_ptag(&mca_btl_ugni_component.ptag);
308 if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
309 return rc;
310 }
311
312 rc = get_cookie(&mca_btl_ugni_component.cookie);
313 if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
314 return rc;
315 }
316
317
318 mca_btl_ugni_component.dev_addr = mca_btl_ugni_get_nic_address (0);
319
320
321 mca_btl_ugni_send_modex ();
322
323 return OPAL_SUCCESS;
324 }