]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2016 XSKY <haomai@xsky.com> | |
7 | * | |
8 | * Author: Haomai Wang <haomaiwang@gmail.com> | |
9 | * | |
10 | * This is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License version 2.1, as published by the Free Software | |
13 | * Foundation. See file COPYING. | |
14 | * | |
15 | */ | |
16 | ||
17 | #ifndef CEPH_INFINIBAND_H | |
18 | #define CEPH_INFINIBAND_H | |
19 | ||
11fdf7f2 TL |
20 | #include <boost/pool/pool.hpp> |
21 | // need this because boost messes with ceph log/assert definitions | |
22 | #include "include/ceph_assert.h" | |
7c673cae | 23 | |
31f18b77 | 24 | #include <infiniband/verbs.h> |
11fdf7f2 TL |
25 | #include <rdma/rdma_cma.h> |
26 | ||
27 | #include <atomic> | |
9f95a23c | 28 | #include <functional> |
11fdf7f2 TL |
29 | #include <string> |
30 | #include <vector> | |
31f18b77 | 31 | |
9f95a23c | 32 | #include "include/common_fwd.h" |
7c673cae FG |
33 | #include "include/int_types.h" |
34 | #include "include/page.h" | |
9f95a23c | 35 | #include "include/scope_guard.h" |
7c673cae FG |
36 | #include "common/debug.h" |
37 | #include "common/errno.h" | |
9f95a23c | 38 | #include "common/ceph_mutex.h" |
11fdf7f2 | 39 | #include "common/perf_counters.h" |
7c673cae FG |
40 | #include "msg/msg_types.h" |
41 | #include "msg/async/net_handler.h" | |
7c673cae | 42 | |
9f95a23c TL |
43 | #define HUGE_PAGE_SIZE_2MB (2 * 1024 * 1024) |
44 | #define ALIGN_TO_PAGE_2MB(x) \ | |
45 | (((x) + (HUGE_PAGE_SIZE_2MB - 1)) & ~(HUGE_PAGE_SIZE_2MB - 1)) | |
7c673cae | 46 | |
9f95a23c TL |
47 | #define PSN_LEN 24 |
48 | #define PSN_MSK ((1 << PSN_LEN) - 1) | |
49 | ||
50 | #define BEACON_WRID 0xDEADBEEF | |
51 | ||
// Connection-management metadata the two peers exchange over a plain
// socket during the RDMA handshake (see QueuePair::send_cm_meta /
// recv_cm_meta).  Packed: this is a wire format, do not reorder.
struct ib_cm_meta_t {
  uint16_t lid;        // local identifier of the sender's IB port
  uint32_t local_qpn;  // sender's queue pair number
  uint32_t psn;        // initial packet sequence number (24-bit, see PSN_MSK)
  uint32_t peer_qpn;   // remote queue pair number, once learned
  union ibv_gid gid;   // global identifier (RoCE/routable addressing)
} __attribute__((packed));
59 | ||
60 | class RDMAStack; | |
31f18b77 FG |
61 | |
// One physical port of an opened verbs device.  The attributes exposed by
// the getters (lid/gid/port_attr/gid index) are presumably queried once in
// the constructor, which is defined out of line.
class Port {
  struct ibv_context* ctxt;
  int port_num;
  struct ibv_port_attr port_attr;
  uint16_t lid;
  int gid_idx;
  union ibv_gid gid;

 public:
  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
  uint16_t get_lid() { return lid; }
  ibv_gid get_gid() { return gid; }
  int get_port_num() { return port_num; }
  ibv_port_attr* get_port_attr() { return &port_attr; }
  int get_gid_idx() { return gid_idx; }
};
78 | ||
79 | ||
// One RDMA-capable HCA.  Owns the verbs context and, after binding_port(),
// the active Port wrapper.  Can be built either from an ibv_device (plain
// verbs path) or from an existing ibv_context (rdma_cm path).
class Device {
  ibv_device *device;
  const char* name;
  uint8_t port_cnt = 0;
 public:
  explicit Device(CephContext *c, ibv_device* ib_dev);
  explicit Device(CephContext *c, ibv_context *ib_ctx);
  ~Device() {
    if (active_port) {
      delete active_port;
      // NOTE(review): ctxt is only closed when a port was bound; if
      // binding_port() never ran, the context is not released here —
      // confirm whether the rdma_cm path owns ctxt before changing this.
      ceph_assert(ibv_close_device(ctxt) == 0);
    }
  }
  const char* get_name() { return name;}
  // The lid/gid getters dereference active_port: only valid after
  // binding_port() has succeeded.
  uint16_t get_lid() { return active_port->get_lid(); }
  ibv_gid get_gid() { return active_port->get_gid(); }
  int get_gid_idx() { return active_port->get_gid_idx(); }
  void binding_port(CephContext *c, int port_num);
  struct ibv_context *ctxt;
  ibv_device_attr device_attr;
  Port* active_port;
};
102 | ||
103 | ||
104 | class DeviceList { | |
105 | struct ibv_device ** device_list; | |
11fdf7f2 | 106 | struct ibv_context ** device_context_list; |
31f18b77 FG |
107 | int num; |
108 | Device** devices; | |
109 | public: | |
9f95a23c TL |
110 | explicit DeviceList(CephContext *cct): device_list(nullptr), device_context_list(nullptr), |
111 | num(0), devices(nullptr) { | |
112 | device_list = ibv_get_device_list(&num); | |
113 | ceph_assert(device_list); | |
114 | ceph_assert(num); | |
115 | if (cct->_conf->ms_async_rdma_cm) { | |
116 | device_context_list = rdma_get_devices(NULL); | |
117 | ceph_assert(device_context_list); | |
31f18b77 FG |
118 | } |
119 | devices = new Device*[num]; | |
120 | ||
121 | for (int i = 0;i < num; ++i) { | |
9f95a23c TL |
122 | if (cct->_conf->ms_async_rdma_cm) { |
123 | devices[i] = new Device(cct, device_context_list[i]); | |
124 | } else { | |
125 | devices[i] = new Device(cct, device_list[i]); | |
126 | } | |
31f18b77 FG |
127 | } |
128 | } | |
129 | ~DeviceList() { | |
130 | for (int i=0; i < num; ++i) { | |
131 | delete devices[i]; | |
132 | } | |
133 | delete []devices; | |
134 | ibv_free_device_list(device_list); | |
9f95a23c | 135 | rdma_free_devices(device_context_list); |
31f18b77 FG |
136 | } |
137 | ||
138 | Device* get_device(const char* device_name) { | |
31f18b77 FG |
139 | for (int i = 0; i < num; ++i) { |
140 | if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) { | |
141 | return devices[i]; | |
142 | } | |
143 | } | |
144 | return NULL; | |
145 | } | |
146 | }; | |
147 | ||
11fdf7f2 TL |
// stat counters
// Perf-counter indices owned by the RDMA dispatcher (polling side);
// valid values lie in (l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last).
enum {
  l_msgr_rdma_dispatcher_first = 94000,

  l_msgr_rdma_polling,
  l_msgr_rdma_inflight_tx_chunks,
  l_msgr_rdma_rx_bufs_in_use,
  l_msgr_rdma_rx_bufs_total,

  // transmit-side work completions
  l_msgr_rdma_tx_total_wc,
  l_msgr_rdma_tx_total_wc_errors,
  l_msgr_rdma_tx_wc_retry_errors,
  l_msgr_rdma_tx_wc_wr_flush_errors,

  // receive-side work completions
  l_msgr_rdma_rx_total_wc,
  l_msgr_rdma_rx_total_wc_errors,
  l_msgr_rdma_rx_fin,

  l_msgr_rdma_handshake_errors,

  // asynchronous verbs events
  l_msgr_rdma_total_async_events,
  l_msgr_rdma_async_last_wqe_events,

  // queue pair lifecycle
  l_msgr_rdma_created_queue_pair,
  l_msgr_rdma_active_queue_pair,

  l_msgr_rdma_dispatcher_last,
};
176 | ||
// Perf-counter indices for the RDMA stack/connected-socket side; valid
// values lie in (l_msgr_rdma_first, l_msgr_rdma_last).
enum {
  l_msgr_rdma_first = 95000,

  l_msgr_rdma_tx_no_mem,
  l_msgr_rdma_tx_parital_mem,  // [sic] "parital": identifier is established API, keep spelling
  l_msgr_rdma_tx_failed,

  l_msgr_rdma_tx_chunks,
  l_msgr_rdma_tx_bytes,
  l_msgr_rdma_rx_chunks,
  l_msgr_rdma_rx_bytes,
  l_msgr_rdma_pending_sent_conns,

  l_msgr_rdma_last,
};
31f18b77 | 192 | |
7c673cae FG |
193 | class RDMADispatcher; |
194 | ||
195 | class Infiniband { | |
196 | public: | |
  // RAII wrapper around a verbs protection domain: `pd` is allocated for
  // the given device in the (out-of-line) constructor and presumably
  // deallocated in the destructor.  The pointer itself is immutable.
  class ProtectionDomain {
   public:
    explicit ProtectionDomain(CephContext *cct, Device *device);
    ~ProtectionDomain();

    ibv_pd* const pd;
  };
204 | ||
9f95a23c | 205 | class QueuePair; |
7c673cae FG |
206 | class MemoryManager { |
207 | public: | |
    // A single registered memory buffer.  Chunks are carved out of a larger
    // registered region (Cluster for TX, mem_pool for RX) and carry a
    // cursor (offset) plus a data bound used by the read/write helpers.
    class Chunk {
     public:
      Chunk(ibv_mr* m, uint32_t bytes, char* buffer, uint32_t offset = 0, uint32_t bound = 0, uint32_t lkey = 0, QueuePair* qp = nullptr);
      ~Chunk();

      uint32_t get_offset();
      uint32_t get_size() const;
      void prepare_read(uint32_t b);
      uint32_t get_bound();
      uint32_t read(char* buf, uint32_t len);
      uint32_t write(char* buf, uint32_t len);
      bool full();
      void reset_read_chunk();
      void reset_write_chunk();
      // RX bookkeeping: remember which QP this receive buffer is posted on
      // (cleared when the buffer is released back to the pool).
      void set_qp(QueuePair *qp) { this->qp = qp; }
      void clear_qp() { set_qp(nullptr); }
      QueuePair* get_qp() { return qp; }

     public:
      ibv_mr* mr;       // verbs memory region this chunk belongs to
      QueuePair *qp;    // QP the chunk is currently posted on, else nullptr
      uint32_t lkey;    // local key for work requests on this region
      uint32_t bytes;   // usable payload capacity
      uint32_t offset;  // current read/write cursor
      uint32_t bound;   // end of valid data for reads
      char* buffer; // TODO: remove buffer/refactor TX
      char data[0]; // trailing payload when chunk headers are co-allocated with data
    };
236 | ||
    // Fixed pool of equally sized TX chunks carved out of one contiguous
    // registered region [base, end); free chunks are handed out and taken
    // back under `lock`.
    class Cluster {
     public:
      Cluster(MemoryManager& m, uint32_t s);
      ~Cluster();

      int fill(uint32_t num);
      void take_back(std::vector<Chunk*> &ck);
      int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
      // Map an address inside [base, end) back to its owning Chunk header
      // (valid only when is_my_buffer(c) holds).
      Chunk *get_chunk_by_buffer(const char *c) {
        uint32_t idx = (c - base) / buffer_size;
        Chunk *chunk = chunk_base + idx;
        return chunk;
      }
      bool is_my_buffer(const char *c) const {
        return c >= base && c < end;
      }

      MemoryManager& manager;
      uint32_t buffer_size;       // bytes per chunk
      uint32_t num_chunk = 0;
      ceph::mutex lock = ceph::make_mutex("cluster_lock");
      std::vector<Chunk*> free_chunks;
      char *base = nullptr;       // start of the registered region
      char *end = nullptr;        // one past the last byte of the region
      Chunk* chunk_base = nullptr; // array of Chunk headers, one per buffer
    };
263 | ||
11fdf7f2 TL |
    // Per-pool bookkeeping shared between a mem_pool and PoolAllocator:
    // tracks how many buffers were allocated, answers whether more may be
    // allocated, and feeds the perf counters when set.
    class MemPoolContext {
      PerfCounters *perf_logger;

     public:
      MemoryManager *manager;       // owning memory manager
      unsigned n_bufs_allocated;    // buffers handed out so far
      explicit MemPoolContext(MemoryManager *m) :
        perf_logger(nullptr),
        manager(m),
        n_bufs_allocated(0) {}
      // true if it is possible to alloc
      // more memory for the pool
      bool can_alloc(unsigned nbufs);
      void update_stats(int val);
      void set_stat_logger(PerfCounters *logger);
    };
280 | ||
    // UserAllocator plugged into boost::pool.  Each slab carries a mem_info
    // header (ibv_mr, owning context, buffer count, then the Chunk array) so
    // that free() can recover the registration — presumably filled in by the
    // out-of-line malloc().
    class PoolAllocator {
      struct mem_info {
        ibv_mr *mr;
        MemPoolContext *ctx;
        unsigned nbufs;
        Chunk chunks[0];  // trailing array of chunk headers
      };
     public:
      typedef std::size_t size_type;
      typedef std::ptrdiff_t difference_type;

      static char * malloc(const size_type bytes);
      static void free(char * const block);

      // Run func with the thread-global g_ctx set to ctx (under the
      // allocator lock): the boost pool calls back into the static
      // malloc()/free() above, which pick the context up from g_ctx.
      template<typename Func>
      static std::invoke_result_t<Func> with_context(MemPoolContext* ctx,
                                                     Func&& func) {
        std::lock_guard l{get_lock()};
        g_ctx = ctx;
        scope_guard reset_ctx{[] { g_ctx = nullptr; }};
        return std::move(func)();
      }
     private:
      static ceph::mutex& get_lock();
      static MemPoolContext* g_ctx;  // context for the allocation in flight
    };
307 | ||
    /**
     * modify boost pool so that it is possible to
     * have a thread safe 'context' when allocating/freeing
     * the memory. It is needed to allow a different pool
     * configurations and bookkeeping per CephContext and
     * also to be able to use same allocator to deal with
     * RX and TX pool.
     * TODO: use boost pool to allocate TX chunks too
     */
    class mem_pool : public boost::pool<PoolAllocator> {
     private:
      MemPoolContext *ctx;
      void *slow_malloc();  // growth path, defined out of line

     public:
      // callers (get_rx_buffer/release_rx_buffer) take this around
      // malloc()/free()
      ceph::mutex lock = ceph::make_mutex("mem_pool_lock");
      explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
                        const size_type nnext_size = 32,
                        const size_type nmax_size = 0) :
        pool(nrequested_size, nnext_size, nmax_size),
        ctx(ctx) { }

      // Fast path: reuse a chunk from the pool's free list; only fall back
      // to slow_malloc() when the pool has to grow.
      void *malloc() {
        if (!store().empty())
          return (store().malloc)();
        // need to alloc more memory...
        // slow path code
        return slow_malloc();
      }
    };
338 | ||
    MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
    ~MemoryManager();

    // General-purpose allocation; presumably routed through the huge-page
    // helpers below when huge pages are configured — TODO confirm in .cc.
    void* malloc(size_t size);
    void free(void *ptr);

    // TX side: a Cluster of `tx_num` buffers of `size` bytes each.
    void create_tx_pool(uint32_t size, uint32_t tx_num);
    void return_tx(std::vector<Chunk*> &chunks);
    int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
    bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
    Chunk *get_tx_chunk_by_buffer(const char *c) {
      return send->get_chunk_by_buffer(c);
    }
    uint32_t get_tx_buffer_size() const {
      return send->buffer_size;
    }

    // Pop one RX chunk from the boost pool (may return nullptr when the
    // pool cannot grow — see MemPoolContext::can_alloc).
    Chunk *get_rx_buffer() {
      std::lock_guard l{rxbuf_pool.lock};
      return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
    }

    // Give an RX chunk back; drop its QP association first so no stale
    // pointer survives into the next use.
    void release_rx_buffer(Chunk *chunk) {
      std::lock_guard l{rxbuf_pool.lock};
      chunk->clear_qp();
      rxbuf_pool.free(chunk);
    }

    void set_rx_stat_logger(PerfCounters *logger) {
      rxbuf_pool_ctx.set_stat_logger(logger);
    }

    CephContext *cct;
   private:
    // TODO: Cluster -> TxPool txbuf_pool
    //       chunk layout fix
    //
    Cluster* send = nullptr;// SEND
    Device *device;
    ProtectionDomain *pd;
    MemPoolContext rxbuf_pool_ctx;  // bookkeeping/stats for rxbuf_pool
    mem_pool rxbuf_pool;            // boost-pool backed RX buffer pool


    void* huge_pages_malloc(size_t size);
    void huge_pages_free(void *ptr);
7c673cae FG |
385 | }; |
386 | ||
387 | private: | |
11fdf7f2 TL |
  uint32_t tx_queue_len = 0;               // depth of the send queue
  uint32_t rx_queue_len = 0;               // depth of the receive queue
  uint32_t max_sge = 0;                    // max scatter/gather entries per WR
  uint8_t ib_physical_port = 0;
  MemoryManager* memory_manager = nullptr;
  ibv_srq* srq = nullptr;                  // shared receive work queue
  Device *device = NULL;
  ProtectionDomain *pd = NULL;
  DeviceList *device_list = nullptr;
  CephContext *cct;
  ceph::mutex lock = ceph::make_mutex("IB lock");
  bool initialized = false;                // set once init() has run
  const std::string &device_name;
  uint8_t port_num;
  bool support_srq = false;                // use one SRQ shared by all QPs
7c673cae FG |
403 | |
404 | public: | |
11fdf7f2 | 405 | explicit Infiniband(CephContext *c); |
7c673cae FG |
406 | ~Infiniband(); |
407 | void init(); | |
11fdf7f2 | 408 | static void verify_prereq(CephContext *cct); |
7c673cae FG |
409 | |
  // Wraps an ibv_comp_channel — the fd-based notification mechanism a
  // completion queue uses to wake an event loop.  cq_events_that_need_ack
  // and MAX_ACK_EVENT suggest events are acked in batches; see ack_events()
  // in the .cc for the exact policy.
  class CompletionChannel {
    static const uint32_t MAX_ACK_EVENT = 5000;
    CephContext *cct;
    Infiniband& infiniband;
    ibv_comp_channel *channel;
    ibv_cq *cq;                        // CQ bound via bind_cq()
    uint32_t cq_events_that_need_ack;

   public:
    CompletionChannel(CephContext *c, Infiniband &ib);
    ~CompletionChannel();
    int init();
    bool get_cq_event();
    // fd to hand to the event loop for readiness polling
    int get_fd() { return channel->fd; }
    ibv_comp_channel* get_channel() { return channel; }
    void bind_cq(ibv_cq *c) { cq = c; }
    void ack_events();
  };
428 | ||
  // this class encapsulates the creation, use, and destruction of an RC
  // completion queue.
  //
  // You need to call init and it will create a cq and associate to comp channel
  class CompletionQueue {
   public:
    CompletionQueue(CephContext *c, Infiniband &ib,
                    const uint32_t qd, CompletionChannel *cc)
      : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
    ~CompletionQueue();
    int init();
    // Harvest up to num_entries work completions into ret_wc_array.
    int poll_cq(int num_entries, ibv_wc *ret_wc_array);

    ibv_cq* get_cq() const { return cq; }
    // Re-request completion notification on the channel.
    int rearm_notify(bool solicited_only=true);
    CompletionChannel* get_cc() const { return channel; }
   private:
    CephContext *cct;
    Infiniband& infiniband;   // Infiniband to which this CQ belongs
    CompletionChannel *channel;
    ibv_cq *cq;
    uint32_t queue_depth;     // requested CQ depth
  };
452 | ||
453 | // this class encapsulates the creation, use, and destruction of an RC | |
454 | // queue pair. | |
455 | // | |
456 | // you need call init and it will create a qp and bring it to the INIT state. | |
457 | // after obtaining the lid, qpn, and psn of a remote queue pair, one | |
458 | // must call plumb() to bring the queue pair to the RTS state. | |
459 | class QueuePair { | |
460 | public: | |
9f95a23c | 461 | typedef MemoryManager::Chunk Chunk; |
31f18b77 | 462 | QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type, |
7c673cae FG |
463 | int ib_physical_port, ibv_srq *srq, |
464 | Infiniband::CompletionQueue* txcq, | |
465 | Infiniband::CompletionQueue* rxcq, | |
11fdf7f2 | 466 | uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0); |
7c673cae FG |
467 | ~QueuePair(); |
468 | ||
9f95a23c TL |
469 | int modify_qp_to_error(); |
470 | int modify_qp_to_rts(); | |
471 | int modify_qp_to_rtr(); | |
472 | int modify_qp_to_init(); | |
7c673cae FG |
473 | int init(); |
474 | ||
475 | /** | |
476 | * Get the initial packet sequence number for this QueuePair. | |
477 | * This is randomly generated on creation. It should not be confused | |
478 | * with the remote side's PSN, which is set in #plumb(). | |
479 | */ | |
480 | uint32_t get_initial_psn() const { return initial_psn; }; | |
481 | /** | |
482 | * Get the local queue pair number for this QueuePair. | |
483 | * QPNs are analogous to UDP/TCP port numbers. | |
484 | */ | |
485 | uint32_t get_local_qp_number() const { return qp->qp_num; }; | |
486 | /** | |
487 | * Get the remote queue pair number for this QueuePair, as set in #plumb(). | |
488 | * QPNs are analogous to UDP/TCP port numbers. | |
489 | */ | |
490 | int get_remote_qp_number(uint32_t *rqp) const; | |
491 | /** | |
492 | * Get the remote infiniband address for this QueuePair, as set in #plumb(). | |
493 | * LIDs are "local IDs" in infiniband terminology. They are short, locally | |
494 | * routable addresses. | |
495 | */ | |
496 | int get_remote_lid(uint16_t *lid) const; | |
497 | /** | |
498 | * Get the state of a QueuePair. | |
499 | */ | |
500 | int get_state() const; | |
9f95a23c TL |
501 | /* |
502 | * send/receive connection management meta data | |
7c673cae | 503 | */ |
9f95a23c TL |
504 | int send_cm_meta(CephContext *cct, int socket_fd); |
505 | int recv_cm_meta(CephContext *cct, int socket_fd); | |
506 | void wire_gid_to_gid(const char *wgid, ib_cm_meta_t* cm_meta_data); | |
507 | void gid_to_wire_gid(const ib_cm_meta_t& cm_meta_data, char wgid[]); | |
7c673cae FG |
508 | ibv_qp* get_qp() const { return qp; } |
509 | Infiniband::CompletionQueue* get_tx_cq() const { return txcq; } | |
510 | Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; } | |
511 | int to_dead(); | |
512 | bool is_dead() const { return dead; } | |
9f95a23c TL |
513 | ib_cm_meta_t& get_peer_cm_meta() { return peer_cm_meta; } |
514 | ib_cm_meta_t& get_local_cm_meta() { return local_cm_meta; } | |
515 | void add_rq_wr(Chunk* chunk) | |
516 | { | |
517 | if (srq) return; | |
518 | ||
519 | std::lock_guard l{lock}; | |
520 | recv_queue.push_back(chunk); | |
521 | } | |
522 | ||
523 | void remove_rq_wr(Chunk* chunk) { | |
524 | if (srq) return; | |
525 | ||
526 | std::lock_guard l{lock}; | |
527 | auto it = std::find(recv_queue.begin(), recv_queue.end(), chunk); | |
528 | ceph_assert(it != recv_queue.end()); | |
529 | recv_queue.erase(it); | |
530 | } | |
531 | ibv_srq* get_srq() const { return srq; } | |
7c673cae FG |
532 | |
533 | private: | |
534 | CephContext *cct; | |
31f18b77 | 535 | Infiniband& infiniband; // Infiniband to which this QP belongs |
7c673cae FG |
536 | ibv_qp_type type; // QP type (IBV_QPT_RC, etc.) |
537 | ibv_context* ctxt; // device context of the HCA to use | |
538 | int ib_physical_port; | |
539 | ibv_pd* pd; // protection domain | |
540 | ibv_srq* srq; // shared receive queue | |
541 | ibv_qp* qp; // infiniband verbs QP handle | |
11fdf7f2 | 542 | struct rdma_cm_id *cm_id; |
9f95a23c TL |
543 | ib_cm_meta_t peer_cm_meta; |
544 | ib_cm_meta_t local_cm_meta; | |
7c673cae FG |
545 | Infiniband::CompletionQueue* txcq; |
546 | Infiniband::CompletionQueue* rxcq; | |
547 | uint32_t initial_psn; // initial packet sequence number | |
548 | uint32_t max_send_wr; | |
549 | uint32_t max_recv_wr; | |
550 | uint32_t q_key; | |
551 | bool dead; | |
9f95a23c TL |
552 | vector<Chunk*> recv_queue; |
553 | ceph::mutex lock = ceph::make_mutex("queue_pair_lock"); | |
7c673cae FG |
554 | }; |
555 | ||
556 | public: | |
31f18b77 FG |
  typedef MemoryManager::Cluster Cluster;
  typedef MemoryManager::Chunk Chunk;
  QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
                               ibv_qp_type type, struct rdma_cm_id *cm_id);
  ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
  // post rx buffers to srq, return number of buffers actually posted
  int post_chunks_to_rq(int num, QueuePair *qp = nullptr);
  // Return an RX chunk to the memory pool, first unhooking it from its
  // QP's receive-queue bookkeeping (remove_rq_wr is a no-op under SRQ).
  void post_chunk_to_pool(Chunk* chunk) {
    QueuePair *qp = chunk->get_qp();
    if (qp != nullptr) {
      qp->remove_rq_wr(chunk);
    }
    get_memory_manager()->release_rx_buffer(chunk);
  }
  int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
  CompletionChannel *create_comp_channel(CephContext *c);
  CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
  uint8_t get_ib_physical_port() { return ib_physical_port; }
  uint16_t get_lid() { return device->get_lid(); }
  ibv_gid get_gid() { return device->get_gid(); }
  MemoryManager* get_memory_manager() { return memory_manager; }
  Device* get_device() { return device; }
  // fd delivering asynchronous verbs events for the device
  int get_async_fd() { return device->ctxt->async_fd; }
  bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
  Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
  static const char* wc_status_to_string(int status);
  static const char* qp_state_string(int status);
  uint32_t get_rx_queue_len() const { return rx_queue_len; }
7c673cae FG |
585 | }; |
586 | ||
7c673cae | 587 | #endif |