1 /* 2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana 3 * University Research and Technology 4 * Corporation. All rights reserved. 5 * Copyright (c) 2004-2006 The University of Tennessee and The University 6 * of Tennessee Research Foundation. All rights 7 * reserved. 8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 9 * University of Stuttgart. All rights reserved. 10 * Copyright (c) 2004-2005 The Regents of the University of California. 11 * All rights reserved. 12 * Copyright (c) 2006 Sandia National Laboratories. All rights 13 * reserved. 14 * Copyright (c) 2013-2017 Cisco Systems, Inc. All rights reserved. 15 * $COPYRIGHT$ 16 * 17 * Additional copyrights may follow 18 * 19 * $HEADER$ 20 */ 21 22 #ifndef OPAL_BTL_USNIC_ENDPOINT_H 23 #define OPAL_BTL_USNIC_ENDPOINT_H 24 25 #include <rdma/fabric.h> 26 27 #include "opal/class/opal_list.h" 28 #include "opal/class/opal_hotel.h" 29 #include "opal/mca/event/event.h" 30 31 #include "btl_usnic.h" 32 33 BEGIN_C_DECLS 34 35 /* 36 * Forward declarations to avoid include loops 37 */ 38 struct opal_btl_usnic_module_t; 39 struct opal_btl_usnic_send_segment_t; 40 41 /* 42 * Have the window size as a compile-time constant that is a power of 43 * two so that we can take advantage of fast bit operations. 44 */ 45 #define WINDOW_SIZE 4096 46 #define WINDOW_SIZE_MOD(a) (((a) & (WINDOW_SIZE - 1))) 47 #define WINDOW_OPEN(E) (SEQ_LT((E)->endpoint_next_seq_to_send, \ 48 ((E)->endpoint_ack_seq_rcvd + WINDOW_SIZE))) 49 #define WINDOW_EMPTY(E) ((E)->endpoint_ack_seq_rcvd == \ 50 ((E)->endpoint_next_seq_to_send-1)) 51 52 /* 53 * Returns true when an endpoint has nothing left to send 54 */ 55 #define ENDPOINT_DRAINED(E) (WINDOW_EMPTY(E) && \ 56 opal_list_is_empty(&(E)->endpoint_frag_send_queue)) 57 58 /* 59 * Channel IDs 60 */ 61 typedef enum opal_btl_usnic_channel_id_t { 62 USNIC_PRIORITY_CHANNEL, 63 USNIC_DATA_CHANNEL, 64 USNIC_NUM_CHANNELS 65 } opal_btl_usnic_channel_id_t; 66 67 typedef struct opal_btl_usnic_modex_t { 68 /* Stored in network order */ 69 uint32_t ipv4_addr; 70 /* Stored in host order */ 71 uint32_t ports[USNIC_NUM_CHANNELS]; 72 /* Stored in network order */ 73 uint32_t netmask; 74 /* Stored in host order */ 75 uint32_t connectivity_udp_port; 76 uint32_t link_speed_mbps; 77 uint16_t max_msg_size; 78 opal_btl_usnic_seq_t isn; 79 uint32_t protocol; 80 } opal_btl_usnic_modex_t; 81 82 struct opal_btl_usnic_send_segment_t; 83 struct opal_btl_usnic_proc_t; 84 85 /* 86 * This is a descriptor for an incoming fragment that is broken 87 * into chunks. When the first reference to this frag_id is seen, 88 * memory is allocated for it. When the last byte arrives, the assembled 89 * fragment is passed to the PML. 90 * 91 * The endpoint structure has space for WINDOW_SIZE/2 simultaneous fragments. 92 * This is the largest number of fragments that can possibly be in-flight 93 * to us from a particular endpoint because eash chunked fragment will occupy 94 * at least two segments, and only WINDOW_SIZE segments can be in flight. 95 * OK, so there is an extremely pathological case where we could see 96 * (WINDOW_SIZE/2)+1 "in flight" at once, but just dropping that last one 97 * and waiting for retrans is just fine in this hypothetical hyper-pathological 98 * case, which is what we'll do. 99 */ 100 #define MAX_ACTIVE_FRAGS (WINDOW_SIZE/2) 101 typedef struct opal_btl_usnic_rx_frag_info_t { 102 uint32_t rfi_frag_id; /* ID for this fragment */ 103 uint32_t rfi_frag_size; /* bytes in this fragment */ 104 uint32_t rfi_bytes_left; /* bytes remaining to RX in fragment */ 105 bool rfi_data_in_pool; /* data in data_pool if true, else malloced */ 106 int rfi_data_pool; /* if <0, data malloced, else rx buf pool */ 107 char *rfi_data; /* pointer to assembly area */ 108 opal_free_list_item_t *rfi_fl_elt; /* free list elemement from buf pool 109 when rfi_data_pool is nonzero */ 110 } opal_btl_usnic_rx_frag_info_t; 111 112 /** 113 * An abstraction that represents a connection to a remote process. 114 * An instance of mca_btl_base_endpoint_t is associated with each 115 * (btl_usnic_proc_t, btl_usnic_module_t) tuple and address 116 * information is exchanged at startup. The usnic BTL is 117 * connectionless, so no connection is ever established. 118 */ 119 typedef struct mca_btl_base_endpoint_t { 120 opal_list_item_t super; 121 122 /** BTL module that created this connection */ 123 struct opal_btl_usnic_module_t *endpoint_module; 124 125 /** proc that owns this endpoint */ 126 struct opal_btl_usnic_proc_t *endpoint_proc; 127 int endpoint_proc_index; /* index in owning proc's endpoint array */ 128 129 /** True when proc has been deleted, but still have sends that need ACKs */ 130 bool endpoint_exiting; 131 132 /** List item for linking into module "all_endpoints" */ 133 opal_list_item_t endpoint_endpoint_li; 134 135 /** List item for linking into "need ack" */ 136 opal_list_item_t endpoint_ack_li; 137 138 /** Remote address information */ 139 opal_btl_usnic_modex_t endpoint_remote_modex; 140 141 /** Remote address handle. Need one for each 142 channel because each remote channel has different dest port */ 143 fi_addr_t endpoint_remote_addrs[USNIC_NUM_CHANNELS]; 144 145 /** Send-related data */ 146 bool endpoint_ready_to_send; 147 opal_list_t endpoint_frag_send_queue; 148 int32_t endpoint_send_credits; 149 uint32_t endpoint_next_frag_id; 150 151 /** Receive-related data */ 152 struct opal_btl_usnic_rx_frag_info_t *endpoint_rx_frag_info; 153 154 /** OPAL hotel to track outstanding stends */ 155 opal_hotel_t endpoint_hotel; 156 157 /** Sliding window parameters for this peer */ 158 /* Values for the current proc to send to this endpoint on the 159 peer proc */ 160 opal_btl_usnic_seq_t endpoint_next_seq_to_send; /* n_t */ 161 opal_btl_usnic_seq_t endpoint_ack_seq_rcvd; /* n_a */ 162 163 /* Table where sent segments sit while waiting for their ACKs. 164 When a segment is ACKed, it is removed from this table. */ 165 struct opal_btl_usnic_send_segment_t *endpoint_sent_segs[WINDOW_SIZE]; 166 167 /* Values for the current proc to receive from this endpoint on 168 the peer proc */ 169 bool endpoint_ack_needed; 170 171 /* When we receive a packet that needs an ACK, set this 172 * to delay the ACK to allow for piggybacking 173 */ 174 uint64_t endpoint_acktime; 175 176 opal_btl_usnic_seq_t endpoint_next_contig_seq_to_recv; /* n_r */ 177 opal_btl_usnic_seq_t endpoint_highest_seq_rcvd; /* n_s */ 178 179 bool endpoint_rcvd_segs[WINDOW_SIZE]; 180 uint32_t endpoint_rfstart; 181 182 bool endpoint_connectivity_checked; 183 bool endpoint_on_all_endpoints; 184 } mca_btl_base_endpoint_t; 185 186 typedef mca_btl_base_endpoint_t opal_btl_usnic_endpoint_t; 187 OBJ_CLASS_DECLARATION(opal_btl_usnic_endpoint_t); 188 189 /* 190 * Helper struct for the asynchornous creation of fi_addr array 191 */ 192 typedef struct { 193 opal_btl_usnic_endpoint_t *endpoint; 194 opal_btl_usnic_channel_id_t channel_id; 195 } opal_btl_usnic_addr_context_t; 196 197 /* 198 * Flush all pending sends and resends from and endpoint 199 */ 200 void 201 opal_btl_usnic_flush_endpoint( 202 opal_btl_usnic_endpoint_t *endpoint); 203 204 END_C_DECLS 205 #endif