// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 XSKY <haomai@xsky.com>
 *
 * Author: Haomai Wang <haomaiwang@gmail.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_INFINIBAND_H
#define CEPH_INFINIBAND_H

#include <boost/pool/pool.hpp>
// need this because boost messes with ceph log/assert definitions
#include "include/ceph_assert.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>

#include <atomic>
#include <functional>
#include <string>
#include <vector>

#include "include/common_fwd.h"
#include "include/int_types.h"
#include "include/page.h"
#include "include/scope_guard.h"
#include "common/debug.h"
#include "common/errno.h"
#include "common/ceph_mutex.h"
#include "common/perf_counters.h"
#include "msg/msg_types.h"
#include "msg/async/net_handler.h"

#define HUGE_PAGE_SIZE_2MB (2 * 1024 * 1024)
#define ALIGN_TO_PAGE_2MB(x) \
  (((x) + (HUGE_PAGE_SIZE_2MB - 1)) & ~(HUGE_PAGE_SIZE_2MB - 1))
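
// A worked example of the alignment arithmetic (illustrative addition, not
// from the upstream header): sizes are rounded up to the next 2 MB boundary.
static_assert(ALIGN_TO_PAGE_2MB(1) == HUGE_PAGE_SIZE_2MB,
              "1 byte rounds up to one 2 MB page");
static_assert(ALIGN_TO_PAGE_2MB(HUGE_PAGE_SIZE_2MB) == HUGE_PAGE_SIZE_2MB,
              "an already aligned size is unchanged");
static_assert(ALIGN_TO_PAGE_2MB(HUGE_PAGE_SIZE_2MB + 1) == 2 * HUGE_PAGE_SIZE_2MB,
              "one byte over rounds up to the next 2 MB boundary");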

#define PSN_LEN 24
#define PSN_MSK ((1 << PSN_LEN) - 1)

#define BEACON_WRID 0xDEADBEEF

struct ib_cm_meta_t {
  uint16_t lid;
  uint32_t local_qpn;
  uint32_t psn;
  uint32_t peer_qpn;
  union ibv_gid gid;
} __attribute__((packed));

class RDMAStack;

class Port {
  struct ibv_context* ctxt;
  int port_num;
  struct ibv_port_attr port_attr;
  uint16_t lid;
  int gid_idx;
  union ibv_gid gid;

 public:
  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
  uint16_t get_lid() { return lid; }
  ibv_gid get_gid() { return gid; }
  int get_port_num() { return port_num; }
  ibv_port_attr* get_port_attr() { return &port_attr; }
  int get_gid_idx() { return gid_idx; }
};


class Device {
  ibv_device *device;
  const char* name;
  uint8_t port_cnt = 0;
 public:
  explicit Device(CephContext *c, ibv_device* ib_dev);
  explicit Device(CephContext *c, ibv_context *ib_ctx);
  ~Device() {
    if (active_port) {
      delete active_port;
      ceph_assert(ibv_close_device(ctxt) == 0);
    }
  }
  const char* get_name() { return name; }
  uint16_t get_lid() { return active_port->get_lid(); }
  ibv_gid get_gid() { return active_port->get_gid(); }
  int get_gid_idx() { return active_port->get_gid_idx(); }
  void binding_port(CephContext *c, int port_num);
  struct ibv_context *ctxt;
  ibv_device_attr device_attr;
  Port* active_port;
};


class DeviceList {
  struct ibv_device ** device_list;
  struct ibv_context ** device_context_list;
  int num;
  Device** devices;
 public:
  explicit DeviceList(CephContext *cct): device_list(nullptr), device_context_list(nullptr),
                                         num(0), devices(nullptr) {
    device_list = ibv_get_device_list(&num);
    ceph_assert(device_list);
    ceph_assert(num);
    if (cct->_conf->ms_async_rdma_cm) {
      device_context_list = rdma_get_devices(NULL);
      ceph_assert(device_context_list);
    }
    devices = new Device*[num];

    for (int i = 0; i < num; ++i) {
      if (cct->_conf->ms_async_rdma_cm) {
        devices[i] = new Device(cct, device_context_list[i]);
      } else {
        devices[i] = new Device(cct, device_list[i]);
      }
    }
  }
  ~DeviceList() {
    for (int i = 0; i < num; ++i) {
      delete devices[i];
    }
    delete [] devices;
    ibv_free_device_list(device_list);
    rdma_free_devices(device_context_list);
  }

  Device* get_device(const char* device_name) {
    for (int i = 0; i < num; ++i) {
      if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) {
        return devices[i];
      }
    }
    return NULL;
  }
};

// stat counters
enum {
  l_msgr_rdma_dispatcher_first = 94000,

  l_msgr_rdma_polling,
  l_msgr_rdma_inflight_tx_chunks,
  l_msgr_rdma_rx_bufs_in_use,
  l_msgr_rdma_rx_bufs_total,

  l_msgr_rdma_tx_total_wc,
  l_msgr_rdma_tx_total_wc_errors,
  l_msgr_rdma_tx_wc_retry_errors,
  l_msgr_rdma_tx_wc_wr_flush_errors,

  l_msgr_rdma_rx_total_wc,
  l_msgr_rdma_rx_total_wc_errors,
  l_msgr_rdma_rx_fin,

  l_msgr_rdma_handshake_errors,

  l_msgr_rdma_total_async_events,
  l_msgr_rdma_async_last_wqe_events,

  l_msgr_rdma_created_queue_pair,
  l_msgr_rdma_active_queue_pair,

  l_msgr_rdma_dispatcher_last,
};

enum {
  l_msgr_rdma_first = 95000,

  l_msgr_rdma_tx_no_mem,
  l_msgr_rdma_tx_parital_mem,
  l_msgr_rdma_tx_failed,

  l_msgr_rdma_tx_chunks,
  l_msgr_rdma_tx_bytes,
  l_msgr_rdma_rx_chunks,
  l_msgr_rdma_rx_bytes,
  l_msgr_rdma_pending_sent_conns,

  l_msgr_rdma_last,
};

class RDMADispatcher;

class Infiniband {
 public:
  class ProtectionDomain {
   public:
    explicit ProtectionDomain(CephContext *cct, Device *device);
    ~ProtectionDomain();

    ibv_pd* const pd;
  };

  class QueuePair;
  class MemoryManager {
   public:
    class Chunk {
     public:
      Chunk(ibv_mr* m, uint32_t bytes, char* buffer, uint32_t offset = 0, uint32_t bound = 0, uint32_t lkey = 0, QueuePair* qp = nullptr);
      ~Chunk();

      uint32_t get_offset();
      uint32_t get_size() const;
      void prepare_read(uint32_t b);
      uint32_t get_bound();
      uint32_t read(char* buf, uint32_t len);
      uint32_t write(char* buf, uint32_t len);
      bool full();
      void reset_read_chunk();
      void reset_write_chunk();
      void set_qp(QueuePair *qp) { this->qp = qp; }
      void clear_qp() { set_qp(nullptr); }
      QueuePair* get_qp() { return qp; }

     public:
      ibv_mr* mr;
      QueuePair *qp;
      uint32_t lkey;
      uint32_t bytes;
      uint32_t offset;
      uint32_t bound;
      char* buffer; // TODO: remove buffer/refactor TX
      char data[0];
    };

    class Cluster {
     public:
      Cluster(MemoryManager& m, uint32_t s);
      ~Cluster();

      int fill(uint32_t num);
      void take_back(std::vector<Chunk*> &ck);
      int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
      Chunk *get_chunk_by_buffer(const char *c) {
        uint32_t idx = (c - base) / buffer_size;
        Chunk *chunk = chunk_base + idx;
        return chunk;
      }
      bool is_my_buffer(const char *c) const {
        return c >= base && c < end;
      }

      bool is_valid_chunk(const Chunk* c) const {
        return c >= chunk_base && c < chunk_base + num_chunk;
      }
      MemoryManager& manager;
      uint32_t buffer_size;
      uint32_t num_chunk = 0;
      ceph::mutex lock = ceph::make_mutex("cluster_lock");
      std::vector<Chunk*> free_chunks;
      char *base = nullptr;
      char *end = nullptr;
      Chunk* chunk_base = nullptr;
    };

    class MemPoolContext {
      PerfCounters *perf_logger;

     public:
      MemoryManager *manager;
      unsigned n_bufs_allocated;
      explicit MemPoolContext(MemoryManager *m) :
        perf_logger(nullptr),
        manager(m),
        n_bufs_allocated(0) {}
      // true if it is possible to alloc
      // more memory for the pool
      bool can_alloc(unsigned nbufs);
      void update_stats(int val);
      void set_stat_logger(PerfCounters *logger);
    };

    class PoolAllocator {
      struct mem_info {
        ibv_mr *mr;
        MemPoolContext *ctx;
        unsigned nbufs;
        Chunk chunks[0];
      };
     public:
      typedef std::size_t size_type;
      typedef std::ptrdiff_t difference_type;

      static char * malloc(const size_type bytes);
      static void free(char * const block);

      template<typename Func>
      static std::invoke_result_t<Func> with_context(MemPoolContext* ctx,
                                                     Func&& func) {
        std::lock_guard l{get_lock()};
        g_ctx = ctx;
        scope_guard reset_ctx{[] { g_ctx = nullptr; }};
        return std::move(func)();
      }
     private:
      static ceph::mutex& get_lock();
      static MemPoolContext* g_ctx;
    };

    /**
     * modified boost pool that makes it possible to have a thread-safe
     * 'context' when allocating/freeing memory. It is needed to allow
     * different pool configurations and bookkeeping per CephContext, and
     * also to be able to use the same allocator to deal with both the
     * RX and TX pools.
     * TODO: use boost pool to allocate TX chunks too
     */
    class mem_pool : public boost::pool<PoolAllocator> {
     private:
      MemPoolContext *ctx;
      void *slow_malloc();

     public:
      ceph::mutex lock = ceph::make_mutex("mem_pool_lock");
      explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
                        const size_type nnext_size = 32,
                        const size_type nmax_size = 0) :
        pool(nrequested_size, nnext_size, nmax_size),
        ctx(ctx) { }

      void *malloc() {
        if (!store().empty())
          return (store().malloc)();
        // need to alloc more memory...
        // slow path code
        return slow_malloc();
      }
    };
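
    // Illustrative sketch (not from the upstream header) of how the
    // allocator 'context' pattern above fits together: the pool's slow path
    // can run the base-class allocation inside PoolAllocator::with_context(),
    // so that PoolAllocator::malloc() knows which MemPoolContext (and hence
    // which MemoryManager and perf counters) the newly grabbed buffers belong
    // to. A hypothetical shape, assuming the real definition lives in the
    // corresponding .cc file:
    //
    //   void *mem_pool::slow_malloc() {
    //     return PoolAllocator::with_context(ctx, [this] {
    //       return boost::pool<PoolAllocator>::malloc();
    //     });
    //   }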

    MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
    ~MemoryManager();

    void* malloc(size_t size);
    void free(void *ptr);

    void create_tx_pool(uint32_t size, uint32_t tx_num);
    void return_tx(std::vector<Chunk*> &chunks);
    int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
    bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
    bool is_valid_chunk(const Chunk* c) { return send->is_valid_chunk(c); }
    Chunk *get_tx_chunk_by_buffer(const char *c) {
      return send->get_chunk_by_buffer(c);
    }
    uint32_t get_tx_buffer_size() const {
      return send->buffer_size;
    }

    Chunk *get_rx_buffer() {
      std::lock_guard l{rxbuf_pool.lock};
      return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
    }

    void release_rx_buffer(Chunk *chunk) {
      std::lock_guard l{rxbuf_pool.lock};
      chunk->clear_qp();
      rxbuf_pool.free(chunk);
    }

    void set_rx_stat_logger(PerfCounters *logger) {
      rxbuf_pool_ctx.set_stat_logger(logger);
    }

    CephContext *cct;
   private:
    // TODO: Cluster -> TxPool txbuf_pool
    // chunk layout fix
    //
    Cluster* send = nullptr; // SEND
    Device *device;
    ProtectionDomain *pd;
    MemPoolContext rxbuf_pool_ctx;
    mem_pool rxbuf_pool;


    void* huge_pages_malloc(size_t size);
    void huge_pages_free(void *ptr);
  };

 private:
  uint32_t tx_queue_len = 0;
  uint32_t rx_queue_len = 0;
  uint32_t max_sge = 0;
  uint8_t ib_physical_port = 0;
  MemoryManager* memory_manager = nullptr;
  ibv_srq* srq = nullptr;             // shared receive work queue
  Device *device = NULL;
  ProtectionDomain *pd = NULL;
  DeviceList *device_list = nullptr;
  CephContext *cct;
  ceph::mutex lock = ceph::make_mutex("IB lock");
  bool initialized = false;
  const std::string &device_name;
  uint8_t port_num;
  bool support_srq = false;

 public:
  explicit Infiniband(CephContext *c);
  ~Infiniband();
  void init();
  static void verify_prereq(CephContext *cct);

  class CompletionChannel {
    static const uint32_t MAX_ACK_EVENT = 5000;
    CephContext *cct;
    Infiniband& infiniband;
    ibv_comp_channel *channel;
    ibv_cq *cq;
    uint32_t cq_events_that_need_ack;

   public:
    CompletionChannel(CephContext *c, Infiniband &ib);
    ~CompletionChannel();
    int init();
    bool get_cq_event();
    int get_fd() { return channel->fd; }
    ibv_comp_channel* get_channel() { return channel; }
    void bind_cq(ibv_cq *c) { cq = c; }
    void ack_events();
  };

  // this class encapsulates the creation, use, and destruction of an RC
  // completion queue.
  //
  // You need to call init(); it will create a CQ and associate it with the
  // completion channel.
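  //
  // Illustrative usage sketch (assumed typical call sequence, not from the
  // upstream header):
  //
  //   Infiniband::CompletionChannel *cc = ib.create_comp_channel(cct);
  //   Infiniband::CompletionQueue *cq = ib.create_comp_queue(cct, cc);
  //   ibv_wc wc[32];
  //   int n = cq->poll_cq(32, wc);  // harvest up to 32 work completions
  //   cq->rearm_notify();           // request notification of the next completion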
  class CompletionQueue {
   public:
    CompletionQueue(CephContext *c, Infiniband &ib,
                    const uint32_t qd, CompletionChannel *cc)
      : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
    ~CompletionQueue();
    int init();
    int poll_cq(int num_entries, ibv_wc *ret_wc_array);

    ibv_cq* get_cq() const { return cq; }
    int rearm_notify(bool solicited_only=true);
    CompletionChannel* get_cc() const { return channel; }
   private:
    CephContext *cct;
    Infiniband& infiniband;     // Infiniband to which this CQ belongs
    CompletionChannel *channel;
    ibv_cq *cq;
    uint32_t queue_depth;
  };

  // this class encapsulates the creation, use, and destruction of an RC
  // queue pair.
  //
  // You need to call init(); it will create a QP and bring it to the INIT
  // state. After obtaining the lid, qpn, and psn of a remote queue pair, one
  // must call plumb() to bring the queue pair to the RTS state.
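  //
  // Illustrative handshake sketch (assumed typical call sequence, not from
  // the upstream header):
  //
  //   QueuePair *qp = ib.create_queue_pair(cct, tx_cq, rx_cq, IBV_QPT_RC, nullptr);
  //   // exchange ib_cm_meta_t (lid, qpn, psn, gid) with the peer over a TCP socket
  //   qp->send_cm_meta(cct, socket_fd);
  //   qp->recv_cm_meta(cct, socket_fd);
  //   qp->modify_qp_to_rtr();  // ready-to-receive, using the peer's metadata
  //   qp->modify_qp_to_rts();  // ready-to-send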
  class QueuePair {
   public:
    typedef MemoryManager::Chunk Chunk;
    QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
              int ib_physical_port, ibv_srq *srq,
              Infiniband::CompletionQueue* txcq,
              Infiniband::CompletionQueue* rxcq,
              uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
    ~QueuePair();

    int modify_qp_to_error();
    int modify_qp_to_rts();
    int modify_qp_to_rtr();
    int modify_qp_to_init();
    int init();

    /**
     * Get the initial packet sequence number for this QueuePair.
     * This is randomly generated on creation. It should not be confused
     * with the remote side's PSN, which is set in #plumb().
     */
    uint32_t get_initial_psn() const { return initial_psn; };
    /**
     * Get the local queue pair number for this QueuePair.
     * QPNs are analogous to UDP/TCP port numbers.
     */
    uint32_t get_local_qp_number() const { return qp->qp_num; };
    /**
     * Get the remote queue pair number for this QueuePair, as set in #plumb().
     * QPNs are analogous to UDP/TCP port numbers.
     */
    int get_remote_qp_number(uint32_t *rqp) const;
    /**
     * Get the remote infiniband address for this QueuePair, as set in #plumb().
     * LIDs are "local IDs" in infiniband terminology. They are short, locally
     * routable addresses.
     */
    int get_remote_lid(uint16_t *lid) const;
    /**
     * Get the state of a QueuePair.
     */
    int get_state() const;
    /*
     * send/receive connection management meta data
     */
    int send_cm_meta(CephContext *cct, int socket_fd);
    int recv_cm_meta(CephContext *cct, int socket_fd);
    void wire_gid_to_gid(const char *wgid, ib_cm_meta_t* cm_meta_data);
    void gid_to_wire_gid(const ib_cm_meta_t& cm_meta_data, char wgid[]);
    ibv_qp* get_qp() const { return qp; }
    Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
    Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
    int to_dead();
    bool is_dead() const { return dead; }
    ib_cm_meta_t& get_peer_cm_meta() { return peer_cm_meta; }
    ib_cm_meta_t& get_local_cm_meta() { return local_cm_meta; }
    void add_rq_wr(Chunk* chunk)
    {
      if (srq) return;

      std::lock_guard l{lock};
      recv_queue.push_back(chunk);
    }

    void remove_rq_wr(Chunk* chunk) {
      if (srq) return;

      std::lock_guard l{lock};
      auto it = std::find(recv_queue.begin(), recv_queue.end(), chunk);
      ceph_assert(it != recv_queue.end());
      recv_queue.erase(it);
    }
    ibv_srq* get_srq() const { return srq; }

   private:
    CephContext *cct;
    Infiniband& infiniband;     // Infiniband to which this QP belongs
    ibv_qp_type type;           // QP type (IBV_QPT_RC, etc.)
    ibv_context* ctxt;          // device context of the HCA to use
    int ib_physical_port;
    ibv_pd* pd;                 // protection domain
    ibv_srq* srq;               // shared receive queue
    ibv_qp* qp;                 // infiniband verbs QP handle
    struct rdma_cm_id *cm_id;
    ib_cm_meta_t peer_cm_meta;
    ib_cm_meta_t local_cm_meta;
    Infiniband::CompletionQueue* txcq;
    Infiniband::CompletionQueue* rxcq;
    uint32_t initial_psn;       // initial packet sequence number
    uint32_t max_send_wr;
    uint32_t max_recv_wr;
    uint32_t q_key;
    bool dead;
    std::vector<Chunk*> recv_queue;
    ceph::mutex lock = ceph::make_mutex("queue_pair_lock");
  };

 public:
  typedef MemoryManager::Cluster Cluster;
  typedef MemoryManager::Chunk Chunk;
  QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
                               ibv_qp_type type, struct rdma_cm_id *cm_id);
  ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
  // post rx buffers to srq, return number of buffers actually posted
  int post_chunks_to_rq(int num, QueuePair *qp = nullptr);
  void post_chunk_to_pool(Chunk* chunk) {
    QueuePair *qp = chunk->get_qp();
    if (qp != nullptr) {
      qp->remove_rq_wr(chunk);
    }
    get_memory_manager()->release_rx_buffer(chunk);
  }
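
  // Illustrative sketch (assumed receive-path usage, not from the upstream
  // header): once a polled RX chunk has been consumed it goes back to the
  // pool, and the receive queue is replenished.
  //
  //   ib.post_chunk_to_pool(chunk);  // detach from its QP and return to rxbuf_pool
  //   ib.post_chunks_to_rq(1, qp);   // repost one rx buffer for the next receive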
  int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
  CompletionChannel *create_comp_channel(CephContext *c);
  CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
  uint8_t get_ib_physical_port() { return ib_physical_port; }
  uint16_t get_lid() { return device->get_lid(); }
  ibv_gid get_gid() { return device->get_gid(); }
  MemoryManager* get_memory_manager() { return memory_manager; }
  Device* get_device() { return device; }
  int get_async_fd() { return device->ctxt->async_fd; }
  bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c); }
  Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
  static const char* wc_status_to_string(int status);
  static const char* qp_state_string(int status);
  uint32_t get_rx_queue_len() const { return rx_queue_len; }
};

#endif