1 /*
2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2006 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
7 * reserved.
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2006 Sandia National Laboratories. All rights
13 * reserved.
14 * Copyright (c) 2013-2017 Cisco Systems, Inc. All rights reserved.
15 * $COPYRIGHT$
16 *
17 * Additional copyrights may follow
18 *
19 * $HEADER$
20 */
21
22 #ifndef OPAL_BTL_USNIC_ENDPOINT_H
23 #define OPAL_BTL_USNIC_ENDPOINT_H
24
25 #include <rdma/fabric.h>
26
27 #include "opal/class/opal_list.h"
28 #include "opal/class/opal_hotel.h"
29 #include "opal/mca/event/event.h"
30
31 #include "btl_usnic.h"
32
33 BEGIN_C_DECLS
34
/*
 * Forward declarations.  This header only refers to these types through
 * pointers, so declaring the struct tags is sufficient and avoids
 * include loops.
 */
struct opal_btl_usnic_send_segment_t;
struct opal_btl_usnic_module_t;
40
/*
 * The window size is a compile-time constant.  It must remain a power
 * of two so that reducing a value modulo WINDOW_SIZE is a single bit
 * mask operation.
 */
#define WINDOW_SIZE 4096

/* Reduce (a) modulo WINDOW_SIZE using a bit mask. */
#define WINDOW_SIZE_MOD(a) ((a) & (WINDOW_SIZE - 1))

/*
 * True when the next sequence number to send is (per SEQ_LT) less than
 * the last ACKed sequence number plus WINDOW_SIZE.
 * Note: (E) is evaluated more than once; pass an expression without
 * side effects.
 */
#define WINDOW_OPEN(E)                                        \
    (SEQ_LT((E)->endpoint_next_seq_to_send,                   \
            ((E)->endpoint_ack_seq_rcvd + WINDOW_SIZE)))

/*
 * True when the sequence number just before the next one to send is
 * the one that has been ACKed.
 * Note: (E) is evaluated more than once.
 */
#define WINDOW_EMPTY(E)                                       \
    (((E)->endpoint_next_seq_to_send - 1) == (E)->endpoint_ack_seq_rcvd)
51
/*
 * An endpoint is "drained" (has nothing left to send) when its window
 * is empty and its fragment send queue is empty.
 * Note: (E) is evaluated more than once.
 */
#define ENDPOINT_DRAINED(E)                                   \
    (WINDOW_EMPTY(E) &&                                       \
     opal_list_is_empty(&(E)->endpoint_frag_send_queue))
57
/*
 * Channel IDs.  USNIC_NUM_CHANNELS is not a channel; it is the count of
 * channels and is used in this header to size per-channel arrays.
 */
typedef enum opal_btl_usnic_channel_id_t {
    USNIC_PRIORITY_CHANNEL = 0,
    USNIC_DATA_CHANNEL     = 1,
    USNIC_NUM_CHANNELS     = 2
} opal_btl_usnic_channel_id_t;
66
67 typedef struct opal_btl_usnic_modex_t {
68 /* Stored in network order */
69 uint32_t ipv4_addr;
70 /* Stored in host order */
71 uint32_t ports[USNIC_NUM_CHANNELS];
72 /* Stored in network order */
73 uint32_t netmask;
74 /* Stored in host order */
75 uint32_t connectivity_udp_port;
76 uint32_t link_speed_mbps;
77 uint16_t max_msg_size;
78 opal_btl_usnic_seq_t isn;
79 uint32_t protocol;
80 } opal_btl_usnic_modex_t;
81
/*
 * Forward declarations for types that this header only references
 * through pointers.  NOTE(review): the send_segment tag appears to be
 * declared earlier in this header as well; repeating it is harmless.
 */
struct opal_btl_usnic_proc_t;
struct opal_btl_usnic_send_segment_t;
84
85 /*
86 * This is a descriptor for an incoming fragment that is broken
87 * into chunks. When the first reference to this frag_id is seen,
88 * memory is allocated for it. When the last byte arrives, the assembled
89 * fragment is passed to the PML.
90 *
91 * The endpoint structure has space for WINDOW_SIZE/2 simultaneous fragments.
92 * This is the largest number of fragments that can possibly be in-flight
93 * to us from a particular endpoint because eash chunked fragment will occupy
94 * at least two segments, and only WINDOW_SIZE segments can be in flight.
95 * OK, so there is an extremely pathological case where we could see
96 * (WINDOW_SIZE/2)+1 "in flight" at once, but just dropping that last one
97 * and waiting for retrans is just fine in this hypothetical hyper-pathological
98 * case, which is what we'll do.
99 */
100 #define MAX_ACTIVE_FRAGS (WINDOW_SIZE/2)
101 typedef struct opal_btl_usnic_rx_frag_info_t {
102 uint32_t rfi_frag_id; /* ID for this fragment */
103 uint32_t rfi_frag_size; /* bytes in this fragment */
104 uint32_t rfi_bytes_left; /* bytes remaining to RX in fragment */
105 bool rfi_data_in_pool; /* data in data_pool if true, else malloced */
106 int rfi_data_pool; /* if <0, data malloced, else rx buf pool */
107 char *rfi_data; /* pointer to assembly area */
108 opal_free_list_item_t *rfi_fl_elt; /* free list elemement from buf pool
109 when rfi_data_pool is nonzero */
110 } opal_btl_usnic_rx_frag_info_t;
111
112 /**
113 * An abstraction that represents a connection to a remote process.
114 * An instance of mca_btl_base_endpoint_t is associated with each
115 * (btl_usnic_proc_t, btl_usnic_module_t) tuple and address
116 * information is exchanged at startup. The usnic BTL is
117 * connectionless, so no connection is ever established.
118 */
119 typedef struct mca_btl_base_endpoint_t {
120 opal_list_item_t super;
121
122 /** BTL module that created this connection */
123 struct opal_btl_usnic_module_t *endpoint_module;
124
125 /** proc that owns this endpoint */
126 struct opal_btl_usnic_proc_t *endpoint_proc;
127 int endpoint_proc_index; /* index in owning proc's endpoint array */
128
129 /** True when proc has been deleted, but still have sends that need ACKs */
130 bool endpoint_exiting;
131
132 /** List item for linking into module "all_endpoints" */
133 opal_list_item_t endpoint_endpoint_li;
134
135 /** List item for linking into "need ack" */
136 opal_list_item_t endpoint_ack_li;
137
138 /** Remote address information */
139 opal_btl_usnic_modex_t endpoint_remote_modex;
140
141 /** Remote address handle. Need one for each
142 channel because each remote channel has different dest port */
143 fi_addr_t endpoint_remote_addrs[USNIC_NUM_CHANNELS];
144
145 /** Send-related data */
146 bool endpoint_ready_to_send;
147 opal_list_t endpoint_frag_send_queue;
148 int32_t endpoint_send_credits;
149 uint32_t endpoint_next_frag_id;
150
151 /** Receive-related data */
152 struct opal_btl_usnic_rx_frag_info_t *endpoint_rx_frag_info;
153
154 /** OPAL hotel to track outstanding stends */
155 opal_hotel_t endpoint_hotel;
156
157 /** Sliding window parameters for this peer */
158 /* Values for the current proc to send to this endpoint on the
159 peer proc */
160 opal_btl_usnic_seq_t endpoint_next_seq_to_send; /* n_t */
161 opal_btl_usnic_seq_t endpoint_ack_seq_rcvd; /* n_a */
162
163 /* Table where sent segments sit while waiting for their ACKs.
164 When a segment is ACKed, it is removed from this table. */
165 struct opal_btl_usnic_send_segment_t *endpoint_sent_segs[WINDOW_SIZE];
166
167 /* Values for the current proc to receive from this endpoint on
168 the peer proc */
169 bool endpoint_ack_needed;
170
171 /* When we receive a packet that needs an ACK, set this
172 * to delay the ACK to allow for piggybacking
173 */
174 uint64_t endpoint_acktime;
175
176 opal_btl_usnic_seq_t endpoint_next_contig_seq_to_recv; /* n_r */
177 opal_btl_usnic_seq_t endpoint_highest_seq_rcvd; /* n_s */
178
179 bool endpoint_rcvd_segs[WINDOW_SIZE];
180 uint32_t endpoint_rfstart;
181
182 bool endpoint_connectivity_checked;
183 bool endpoint_on_all_endpoints;
184 } mca_btl_base_endpoint_t;
185
186 typedef mca_btl_base_endpoint_t opal_btl_usnic_endpoint_t;
187 OBJ_CLASS_DECLARATION(opal_btl_usnic_endpoint_t);
188
189 /*
190 * Helper struct for the asynchornous creation of fi_addr array
191 */
192 typedef struct {
193 opal_btl_usnic_endpoint_t *endpoint;
194 opal_btl_usnic_channel_id_t channel_id;
195 } opal_btl_usnic_addr_context_t;
196
197 /*
198 * Flush all pending sends and resends from and endpoint
199 */
200 void
201 opal_btl_usnic_flush_endpoint(
202 opal_btl_usnic_endpoint_t *endpoint);
203
204 END_C_DECLS
205 #endif