// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 XSKY <haomai@xsky.com>
 *
 * Author: Haomai Wang <haomaiwang@gmail.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
16 | ||
17 | #ifndef CEPH_INFINIBAND_H | |
18 | #define CEPH_INFINIBAND_H | |
19 | ||
#include <cstring>
#include <string>
#include <vector>

#include <infiniband/verbs.h>

#include "include/int_types.h"
#include "include/page.h"
#include "common/debug.h"
#include "common/errno.h"
#include "common/Mutex.h"
#include "msg/msg_types.h"
#include "msg/async/net_handler.h"
32 | ||
// Size of one huge page (2 MiB, the common x86_64 hugepage size).
#define HUGE_PAGE_SIZE (2 * 1024 * 1024)
// Round x up to the next whole multiple of HUGE_PAGE_SIZE
// (integer divide-then-multiply; x must be non-negative).
#define ALIGN_TO_PAGE_SIZE(x) \
  (((x) + HUGE_PAGE_SIZE -1) / HUGE_PAGE_SIZE * HUGE_PAGE_SIZE)
36 | ||
// Handshake message exchanged out-of-band over a TCP socket (see
// Infiniband::send_msg/recv_msg) so that each side learns the peer's
// addressing before the queue pair can be brought up.
// Packed: the struct is sent as raw bytes, so no padding is allowed.
struct IBSYNMsg {
  uint16_t lid;        // sender's local identifier (IB port LID)
  uint32_t qpn;        // sender's queue pair number
  uint32_t psn;        // sender's initial packet sequence number
  uint32_t peer_qpn;   // QPN the sender believes identifies the peer
  union ibv_gid gid;   // sender's global identifier
} __attribute__((packed));
44 | ||
// Forward declarations; full definitions live elsewhere in the tree.
class RDMAStack;
class CephContext;
47 | ||
// One physical port of an opened verbs device.  The fields below are
// caches; they are presumably filled in by the constructor (implemented
// in the .cc) and the getters only return the cached values.
class Port {
  struct ibv_context* ctxt;         // device context this port belongs to
  int port_num;                     // physical port number (verbs numbers ports from 1)
  struct ibv_port_attr* port_attr;  // attributes queried from the device
  uint16_t lid;                     // local identifier of this port
  int gid_idx;                      // index of the GID table entry in use
  union ibv_gid gid;                // GID at gid_idx

 public:
  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
  uint16_t get_lid() { return lid; }
  ibv_gid get_gid() { return gid; }
  int get_port_num() { return port_num; }
  ibv_port_attr* get_port_attr() { return port_attr; }
  int get_gid_idx() { return gid_idx; }
};
64 | ||
65 | ||
66 | class Device { | |
67 | ibv_device *device; | |
68 | const char* name; | |
69 | uint8_t port_cnt; | |
70 | public: | |
71 | explicit Device(CephContext *c, ibv_device* d); | |
72 | ~Device() { | |
73 | if (active_port) { | |
74 | delete active_port; | |
75 | assert(ibv_close_device(ctxt) == 0); | |
76 | } | |
77 | } | |
78 | const char* get_name() { return name;} | |
79 | uint16_t get_lid() { return active_port->get_lid(); } | |
80 | ibv_gid get_gid() { return active_port->get_gid(); } | |
81 | int get_gid_idx() { return active_port->get_gid_idx(); } | |
82 | void binding_port(CephContext *c, int port_num); | |
83 | struct ibv_context *ctxt; | |
84 | ibv_device_attr *device_attr; | |
85 | Port* active_port; | |
86 | }; | |
87 | ||
88 | ||
89 | class DeviceList { | |
90 | struct ibv_device ** device_list; | |
91 | int num; | |
92 | Device** devices; | |
93 | public: | |
94 | DeviceList(CephContext *cct): device_list(ibv_get_device_list(&num)) { | |
95 | if (device_list == NULL || num == 0) { | |
96 | lderr(cct) << __func__ << " failed to get rdma device list. " << cpp_strerror(errno) << dendl; | |
97 | ceph_abort(); | |
98 | } | |
99 | devices = new Device*[num]; | |
100 | ||
101 | for (int i = 0;i < num; ++i) { | |
102 | devices[i] = new Device(cct, device_list[i]); | |
103 | } | |
104 | } | |
105 | ~DeviceList() { | |
106 | for (int i=0; i < num; ++i) { | |
107 | delete devices[i]; | |
108 | } | |
109 | delete []devices; | |
110 | ibv_free_device_list(device_list); | |
111 | } | |
112 | ||
113 | Device* get_device(const char* device_name) { | |
114 | assert(devices); | |
115 | for (int i = 0; i < num; ++i) { | |
116 | if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) { | |
117 | return devices[i]; | |
118 | } | |
119 | } | |
120 | return NULL; | |
121 | } | |
122 | }; | |
123 | ||
124 | ||
125 | class RDMADispatcher; | |
126 | ||
// Central holder of RDMA resources for the async messenger: owns the
// opened device, the protection domain, the registered send/receive
// memory pools and the shared receive queue, and acts as a factory for
// queue pairs, completion channels and completion queues.
class Infiniband {
 public:
  // Thin RAII wrapper around an ibv_pd (protection domain).
  class ProtectionDomain {
   public:
    explicit ProtectionDomain(CephContext *cct, Device *device);
    ~ProtectionDomain();

    ibv_pd* const pd;  // const pointer: fixed for the wrapper's lifetime
  };


  // Owns the registered memory used for RDMA I/O: a "send" pool for TX
  // buffers and a "channel" pool whose chunks are posted for RX.
  class MemoryManager {
   public:
    // One fixed-size slice of a registered memory region.  Carries a
    // write cursor (offset) and a read bound so it can be filled and
    // drained incrementally.
    class Chunk {
     public:
      Chunk(ibv_mr* m, uint32_t len, char* b);
      ~Chunk();

      void set_offset(uint32_t o);
      uint32_t get_offset();
      void set_bound(uint32_t b);
      void prepare_read(uint32_t b);
      uint32_t get_bound();
      uint32_t read(char* buf, uint32_t len);
      uint32_t write(char* buf, uint32_t len);
      bool full();
      bool over();
      void clear();
      // Post this chunk back to ib's shared receive queue.
      void post_srq(Infiniband *ib);

     public:
      ibv_mr* mr;       // registered memory region backing this chunk
      uint32_t bytes;   // capacity of the chunk
      uint32_t bound;   // end of valid data for reading
      uint32_t offset;  // current cursor
      char* buffer;     // start of the chunk's memory
    };

    // A contiguous registered area carved into equal-size Chunks, with a
    // free list protected by `lock`.
    class Cluster {
     public:
      Cluster(MemoryManager& m, uint32_t s);
      ~Cluster();

      int fill(uint32_t num);
      void take_back(std::vector<Chunk*> &ck);
      int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
      // Map a raw pointer back to its containing Chunk.  Only valid when
      // is_my_buffer(c) holds; the integer division relies on the chunk
      // buffers being laid out contiguously from `base`.
      Chunk *get_chunk_by_buffer(const char *c) {
        uint32_t idx = (c - base) / buffer_size;
        Chunk *chunk = chunk_base + idx;
        return chunk;
      }
      bool is_my_buffer(const char *c) const {
        return c >= base && c < end;
      }

      MemoryManager& manager;
      uint32_t buffer_size;             // bytes per chunk
      uint32_t num_chunk;
      Mutex lock;                       // guards free_chunks
      std::vector<Chunk*> free_chunks;
      char *base = nullptr;             // [base, end) is the registered area
      char *end = nullptr;
      Chunk* chunk_base = nullptr;      // Chunk headers, parallel to the buffers
    };

    MemoryManager(Device *d, ProtectionDomain *p, bool hugepage);
    ~MemoryManager();

    void* malloc_huge_pages(size_t size);
    void free_huge_pages(void *ptr);
    void register_rx_tx(uint32_t size, uint32_t rx_num, uint32_t tx_num);
    void return_tx(std::vector<Chunk*> &chunks);
    int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
    int get_channel_buffers(std::vector<Chunk*> &chunks, size_t bytes);
    bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
    bool is_rx_buffer(const char* c) { return channel->is_my_buffer(c); }
    Chunk *get_tx_chunk_by_buffer(const char *c) {
      return send->get_chunk_by_buffer(c);
    }
    uint32_t get_tx_buffer_size() const {
      return send->buffer_size;
    }

    bool enabled_huge_page;  // whether the pools are backed by huge pages

   private:
    Cluster* channel; // RECV pool
    Cluster* send;    // SEND pool
    Device *device;
    ProtectionDomain *pd;
  };

 private:
  uint32_t max_send_wr = 0;
  uint32_t max_recv_wr = 0;
  uint32_t max_sge = 0;
  uint8_t ib_physical_port = 0;
  MemoryManager* memory_manager = nullptr;
  ibv_srq* srq = nullptr;             // shared receive work queue
  Device *device = NULL;
  ProtectionDomain *pd = NULL;
  DeviceList *device_list = nullptr;
  RDMADispatcher *dispatcher = nullptr;
  // Convert between the textual GID form sent during the TCP handshake
  // and the binary ibv_gid (exact format defined in the .cc).
  void wire_gid_to_gid(const char *wgid, union ibv_gid *gid);
  void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]);
  CephContext *cct;
  Mutex lock;    // NOTE(review): presumably guards one-time init(); confirm in the .cc
  bool initialized = false;
  const std::string &device_name;
  uint8_t port_num;

 public:
  explicit Infiniband(CephContext *c, const std::string &device_name, uint8_t p);
  ~Infiniband();
  void init();

  void set_dispatcher(RDMADispatcher *d);

  // Wraps an ibv_comp_channel: an fd-based notification channel a CQ can
  // be bound to, so completion events can be consumed from an event loop.
  class CompletionChannel {
    // NOTE(review): presumably the batching threshold for acking CQ
    // events — confirm against ack_events() in the .cc.
    static const uint32_t MAX_ACK_EVENT = 5000;
    CephContext *cct;
    Infiniband& infiniband;
    ibv_comp_channel *channel;
    ibv_cq *cq;                        // CQ currently bound via bind_cq()
    uint32_t cq_events_that_need_ack;  // events received but not yet acked

   public:
    CompletionChannel(CephContext *c, Infiniband &ib);
    ~CompletionChannel();
    int init();
    bool get_cq_event();
    int get_fd() { return channel->fd; }
    ibv_comp_channel* get_channel() { return channel; }
    void bind_cq(ibv_cq *c) { cq = c; }
    void ack_events();
  };

  // this class encapsulates the creation, use, and destruction of an RC
  // completion queue.
  //
  // You need to call init and it will create a cq and associate to comp channel
  class CompletionQueue {
   public:
    CompletionQueue(CephContext *c, Infiniband &ib,
                    const uint32_t qd, CompletionChannel *cc)
      : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
    ~CompletionQueue();
    int init();
    int poll_cq(int num_entries, ibv_wc *ret_wc_array);

    ibv_cq* get_cq() const { return cq; }
    int rearm_notify(bool solicited_only=true);
    CompletionChannel* get_cc() const { return channel; }
   private:
    CephContext *cct;
    Infiniband& infiniband;  // Infiniband to which this CQ belongs
    CompletionChannel *channel;
    ibv_cq *cq;
    uint32_t queue_depth;
  };

  // this class encapsulates the creation, use, and destruction of an RC
  // queue pair.
  //
  // you need call init and it will create a qp and bring it to the INIT state.
  // after obtaining the lid, qpn, and psn of a remote queue pair, one
  // must call plumb() to bring the queue pair to the RTS state.
  class QueuePair {
   public:
    QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
              int ib_physical_port, ibv_srq *srq,
              Infiniband::CompletionQueue* txcq,
              Infiniband::CompletionQueue* rxcq,
              uint32_t max_send_wr, uint32_t max_recv_wr, uint32_t q_key = 0);
    ~QueuePair();

    int init();

    /**
     * Get the initial packet sequence number for this QueuePair.
     * This is randomly generated on creation. It should not be confused
     * with the remote side's PSN, which is set in #plumb().
     */
    uint32_t get_initial_psn() const { return initial_psn; };
    /**
     * Get the local queue pair number for this QueuePair.
     * QPNs are analogous to UDP/TCP port numbers.
     */
    uint32_t get_local_qp_number() const { return qp->qp_num; };
    /**
     * Get the remote queue pair number for this QueuePair, as set in #plumb().
     * QPNs are analogous to UDP/TCP port numbers.
     */
    int get_remote_qp_number(uint32_t *rqp) const;
    /**
     * Get the remote infiniband address for this QueuePair, as set in #plumb().
     * LIDs are "local IDs" in infiniband terminology. They are short, locally
     * routable addresses.
     */
    int get_remote_lid(uint16_t *lid) const;
    /**
     * Get the state of a QueuePair.
     */
    int get_state() const;
    /**
     * Return true if the queue pair is in an error state, false otherwise.
     */
    bool is_error() const;
    ibv_qp* get_qp() const { return qp; }
    Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
    Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
    int to_dead();
    bool is_dead() const { return dead; }

   private:
    CephContext *cct;
    Infiniband& infiniband;  // Infiniband to which this QP belongs
    ibv_qp_type type;        // QP type (IBV_QPT_RC, etc.)
    ibv_context* ctxt;       // device context of the HCA to use
    int ib_physical_port;
    ibv_pd* pd;              // protection domain
    ibv_srq* srq;            // shared receive queue
    ibv_qp* qp;              // infiniband verbs QP handle
    Infiniband::CompletionQueue* txcq;
    Infiniband::CompletionQueue* rxcq;
    uint32_t initial_psn;    // initial packet sequence number
    uint32_t max_send_wr;
    uint32_t max_recv_wr;
    uint32_t q_key;
    bool dead;               // set by to_dead(); queried via is_dead()
  };

 public:
  typedef MemoryManager::Cluster Cluster;
  typedef MemoryManager::Chunk Chunk;
  QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*, ibv_qp_type type);
  ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
  // Post a single RX chunk / the whole RX pool to the shared receive
  // queue (implementations in the .cc).
  int post_chunk(Chunk* chunk);
  int post_channel_cluster();
  int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
  CompletionChannel *create_comp_channel(CephContext *c);
  CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
  uint8_t get_ib_physical_port() { return ib_physical_port; }
  // Exchange an IBSYNMsg with the peer over an already-connected socket sd.
  int send_msg(CephContext *cct, int sd, IBSYNMsg& msg);
  int recv_msg(CephContext *cct, int sd, IBSYNMsg& msg);
  uint16_t get_lid() { return device->get_lid(); }
  ibv_gid get_gid() { return device->get_gid(); }
  MemoryManager* get_memory_manager() { return memory_manager; }
  Device* get_device() { return device; }
  int get_async_fd() { return device->ctxt->async_fd; }
  bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
  bool is_rx_buffer(const char* c) { return memory_manager->is_rx_buffer(c);}
  Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
  static const char* wc_status_to_string(int status);
  static const char* qp_state_string(int status);
};
383 | ||
384 | #endif |