// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 XSKY <haomai@xsky.com>
 *
 * Author: Haomai Wang <haomaiwang@gmail.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_INFINIBAND_H
#define CEPH_INFINIBAND_H

#include <string>
#include <vector>

#include <infiniband/verbs.h>

#include "include/int_types.h"
#include "include/page.h"
#include "common/debug.h"
#include "common/errno.h"
#include "common/Mutex.h"
#include "msg/msg_types.h"
#include "msg/async/net_handler.h"

#define HUGE_PAGE_SIZE (2 * 1024 * 1024)
#define ALIGN_TO_PAGE_SIZE(x) \
  (((x) + HUGE_PAGE_SIZE - 1) / HUGE_PAGE_SIZE * HUGE_PAGE_SIZE)

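// For example, ALIGN_TO_PAGE_SIZE(3 * 1024 * 1024) rounds a 3 MiB request
// up to 4 MiB, i.e. two whole 2 MiB huge pages.
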
struct IBSYNMsg {
  uint16_t lid;
  uint32_t qpn;
  uint32_t psn;
  uint32_t peer_qpn;
  union ibv_gid gid;
} __attribute__((packed));

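// A minimal usage sketch, assuming an Infiniband instance `ib` and an
// already-connected TCP socket `sd`: these fields are the connection
// parameters exchanged out of band before a queue pair can reach RTS.
//
//   IBSYNMsg mine, peer;
//   ib.send_msg(cct, sd, mine);  // advertise local lid/qpn/psn/gid
//   ib.recv_msg(cct, sd, peer);  // learn the remote side's parameters
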
class RDMAStack;
class CephContext;

class Port {
  struct ibv_context* ctxt;
  int port_num;
  struct ibv_port_attr* port_attr;
  uint16_t lid;
  int gid_idx;
  union ibv_gid gid;

 public:
  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
  uint16_t get_lid() { return lid; }
  ibv_gid get_gid() { return gid; }
  int get_port_num() { return port_num; }
  ibv_port_attr* get_port_attr() { return port_attr; }
  int get_gid_idx() { return gid_idx; }
};


class Device {
  ibv_device *device;
  const char* name;
  uint8_t port_cnt;
 public:
  explicit Device(CephContext *c, ibv_device* d);
  ~Device() {
    if (active_port) {
      delete active_port;
      assert(ibv_close_device(ctxt) == 0);
    }
  }
  const char* get_name() { return name; }
  uint16_t get_lid() { return active_port->get_lid(); }
  ibv_gid get_gid() { return active_port->get_gid(); }
  int get_gid_idx() { return active_port->get_gid_idx(); }
  void binding_port(CephContext *c, int port_num);
  struct ibv_context *ctxt;
  ibv_device_attr *device_attr;
  Port* active_port;
};


class DeviceList {
  struct ibv_device **device_list;
  int num;
  Device** devices;
 public:
  DeviceList(CephContext *cct): device_list(ibv_get_device_list(&num)) {
    if (device_list == NULL || num == 0) {
      lderr(cct) << __func__ << " failed to get rdma device list. " << cpp_strerror(errno) << dendl;
      ceph_abort();
    }
    devices = new Device*[num];

    for (int i = 0; i < num; ++i) {
      devices[i] = new Device(cct, device_list[i]);
    }
  }
  ~DeviceList() {
    for (int i = 0; i < num; ++i) {
      delete devices[i];
    }
    delete []devices;
    ibv_free_device_list(device_list);
  }

  Device* get_device(const char* device_name) {
    assert(devices);
    for (int i = 0; i < num; ++i) {
      if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) {
        return devices[i];
      }
    }
    return NULL;
  }
};
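
// Usage sketch, assuming a valid CephContext *cct. An empty device name
// matches the first device enumerated by libibverbs; "mlx5_0" below is a
// hypothetical adapter name used only for illustration.
//
//   DeviceList devs(cct);
//   Device *any   = devs.get_device("");        // first available HCA
//   Device *named = devs.get_device("mlx5_0");  // select by name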

class RDMADispatcher;

class Infiniband {
 public:
  class ProtectionDomain {
   public:
    explicit ProtectionDomain(CephContext *cct, Device *device);
    ~ProtectionDomain();

    ibv_pd* const pd;
  };


  class MemoryManager {
   public:
    class Chunk {
     public:
      Chunk(ibv_mr* m, uint32_t len, char* b);
      ~Chunk();

      void set_offset(uint32_t o);
      uint32_t get_offset();
      void set_bound(uint32_t b);
      void prepare_read(uint32_t b);
      uint32_t get_bound();
      uint32_t read(char* buf, uint32_t len);
      uint32_t write(char* buf, uint32_t len);
      bool full();
      bool over();
      void clear();
      void post_srq(Infiniband *ib);

     public:
      ibv_mr* mr;
      uint32_t bytes;
      uint32_t bound;
      uint32_t offset;
      char* buffer;
    };

    class Cluster {
     public:
      Cluster(MemoryManager& m, uint32_t s);
      ~Cluster();

      int fill(uint32_t num);
      void take_back(std::vector<Chunk*> &ck);
      int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
      Chunk *get_chunk_by_buffer(const char *c) {
        uint32_t idx = (c - base) / buffer_size;
        Chunk *chunk = chunk_base + idx;
        return chunk;
      }
      bool is_my_buffer(const char *c) const {
        return c >= base && c < end;
      }

      MemoryManager& manager;
      uint32_t buffer_size;
      uint32_t num_chunk;
      Mutex lock;
      std::vector<Chunk*> free_chunks;
      char *base = nullptr;
      char *end = nullptr;
      Chunk* chunk_base = nullptr;
    };

    MemoryManager(Device *d, ProtectionDomain *p, bool hugepage);
    ~MemoryManager();

    void* malloc_huge_pages(size_t size);
    void free_huge_pages(void *ptr);
    void register_rx_tx(uint32_t size, uint32_t rx_num, uint32_t tx_num);
    void return_tx(std::vector<Chunk*> &chunks);
    int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
    int get_channel_buffers(std::vector<Chunk*> &chunks, size_t bytes);
    bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
    bool is_rx_buffer(const char* c) { return channel->is_my_buffer(c); }
    Chunk *get_tx_chunk_by_buffer(const char *c) {
      return send->get_chunk_by_buffer(c);
    }
    uint32_t get_tx_buffer_size() const {
      return send->buffer_size;
    }

    bool enabled_huge_page;

   private:
    Cluster* channel; // RECV
    Cluster* send;    // SEND
    Device *device;
    ProtectionDomain *pd;
  };
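
  // A minimal sketch of the send-buffer path, assuming a registered
  // MemoryManager *mm and a payload `data` of `len` bytes (all names here
  // are illustrative):
  //
  //   std::vector<Infiniband::MemoryManager::Chunk*> chunks;
  //   mm->get_send_buffers(chunks, len);        // may hand back fewer bytes
  //   size_t off = 0;
  //   for (auto c : chunks)
  //     off += c->write(data + off, len - off); // fill chunk by chunk
  //   // once sends complete, chunks are recycled via mm->return_tx(chunks)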

 private:
  uint32_t max_send_wr = 0;
  uint32_t max_recv_wr = 0;
  uint32_t max_sge = 0;
  uint8_t ib_physical_port = 0;
  MemoryManager* memory_manager = nullptr;
  ibv_srq* srq = nullptr;             // shared receive work queue
  Device *device = NULL;
  ProtectionDomain *pd = NULL;
  DeviceList *device_list = nullptr;
  RDMADispatcher *dispatcher = nullptr;
  void wire_gid_to_gid(const char *wgid, union ibv_gid *gid);
  void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]);
  CephContext *cct;
  Mutex lock;
  bool initialized = false;
  const std::string &device_name;
  uint8_t port_num;

 public:
  explicit Infiniband(CephContext *c, const std::string &device_name, uint8_t p);
  ~Infiniband();
  void init();

  void set_dispatcher(RDMADispatcher *d);

  class CompletionChannel {
    static const uint32_t MAX_ACK_EVENT = 5000;
    CephContext *cct;
    Infiniband& infiniband;
    ibv_comp_channel *channel;
    ibv_cq *cq;
    uint32_t cq_events_that_need_ack;

   public:
    CompletionChannel(CephContext *c, Infiniband &ib);
    ~CompletionChannel();
    int init();
    bool get_cq_event();
    int get_fd() { return channel->fd; }
    ibv_comp_channel* get_channel() { return channel; }
    void bind_cq(ibv_cq *c) { cq = c; }
    void ack_events();
  };
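
  // Event-loop sketch, assuming `cc` came from create_comp_channel() and
  // was bound to a CQ via bind_cq(): when the channel's fd is readable,
  // consume the event, poll the bound CQ, and acknowledge in batches.
  //
  //   if (cc->get_cq_event()) {
  //     // ...poll the bound completion queue here...
  //   }
  //   cc->ack_events();  // events are acked in batches, up to MAX_ACK_EVENT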

  // This class encapsulates the creation, use, and destruction of an RC
  // completion queue.
  //
  // You need to call init(); it creates a CQ and associates it with the
  // completion channel.
  class CompletionQueue {
   public:
    CompletionQueue(CephContext *c, Infiniband &ib,
                    const uint32_t qd, CompletionChannel *cc)
      : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
    ~CompletionQueue();
    int init();
    int poll_cq(int num_entries, ibv_wc *ret_wc_array);

    ibv_cq* get_cq() const { return cq; }
    int rearm_notify(bool solicited_only = true);
    CompletionChannel* get_cc() const { return channel; }
   private:
    CephContext *cct;
    Infiniband& infiniband;  // Infiniband to which this CQ belongs
    CompletionChannel *channel;
    ibv_cq *cq;
    uint32_t queue_depth;
  };
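
  // Polling sketch, assuming an Infiniband instance `ib` (whether the
  // factory also calls init() is not shown in this header):
  //
  //   CompletionChannel *cc = ib.create_comp_channel(cct);
  //   CompletionQueue *cq = ib.create_comp_queue(cct, cc);
  //   ibv_wc wc[16];
  //   int n = cq->poll_cq(16, wc);  // completions retrieved, negative on error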

  // This class encapsulates the creation, use, and destruction of an RC
  // queue pair.
  //
  // You need to call init(); it creates a QP and brings it to the INIT
  // state. After obtaining the lid, qpn, and psn of a remote queue pair,
  // call plumb() to bring the local queue pair to the RTS state.
  class QueuePair {
   public:
    QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
              int ib_physical_port, ibv_srq *srq,
              Infiniband::CompletionQueue* txcq,
              Infiniband::CompletionQueue* rxcq,
              uint32_t max_send_wr, uint32_t max_recv_wr, uint32_t q_key = 0);
    ~QueuePair();

    int init();

    /**
     * Get the initial packet sequence number for this QueuePair.
     * This is randomly generated on creation. It should not be confused
     * with the remote side's PSN, which is set in #plumb().
     */
    uint32_t get_initial_psn() const { return initial_psn; }
    /**
     * Get the local queue pair number for this QueuePair.
     * QPNs are analogous to UDP/TCP port numbers.
     */
    uint32_t get_local_qp_number() const { return qp->qp_num; }
    /**
     * Get the remote queue pair number for this QueuePair, as set in #plumb().
     * QPNs are analogous to UDP/TCP port numbers.
     */
    int get_remote_qp_number(uint32_t *rqp) const;
    /**
     * Get the remote infiniband address for this QueuePair, as set in #plumb().
     * LIDs are "local IDs" in infiniband terminology. They are short, locally
     * routable addresses.
     */
    int get_remote_lid(uint16_t *lid) const;
    /**
     * Get the state of a QueuePair.
     */
    int get_state() const;
    /**
     * Return true if the queue pair is in an error state, false otherwise.
     */
    bool is_error() const;
    ibv_qp* get_qp() const { return qp; }
    Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
    Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
    int to_dead();
    bool is_dead() const { return dead; }

   private:
    CephContext *cct;
    Infiniband& infiniband;  // Infiniband to which this QP belongs
    ibv_qp_type type;        // QP type (IBV_QPT_RC, etc.)
    ibv_context* ctxt;       // device context of the HCA to use
    int ib_physical_port;
    ibv_pd* pd;              // protection domain
    ibv_srq* srq;            // shared receive queue
    ibv_qp* qp;              // infiniband verbs QP handle
    Infiniband::CompletionQueue* txcq;
    Infiniband::CompletionQueue* rxcq;
    uint32_t initial_psn;    // initial packet sequence number
    uint32_t max_send_wr;
    uint32_t max_recv_wr;
    uint32_t q_key;
    bool dead;
  };
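
  // Lifecycle sketch, assuming an Infiniband instance `ib` with tx/rx
  // completion queues already set up:
  //
  //   QueuePair *qp = ib.create_queue_pair(cct, tx_cq, rx_cq, IBV_QPT_RC);
  //   IBSYNMsg mine = { ib.get_lid(), qp->get_local_qp_number(),
  //                     qp->get_initial_psn(), 0, ib.get_gid() };
  //   // exchange `mine` for the peer's IBSYNMsg (see send_msg/recv_msg),
  //   // then move the QP toward RTS using the peer's lid/qpn/psn.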

 public:
  typedef MemoryManager::Cluster Cluster;
  typedef MemoryManager::Chunk Chunk;
  QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*, ibv_qp_type type);
  ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
  int post_chunk(Chunk* chunk);
  int post_channel_cluster();
  int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
  CompletionChannel *create_comp_channel(CephContext *c);
  CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc = NULL);
  uint8_t get_ib_physical_port() { return ib_physical_port; }
  int send_msg(CephContext *cct, int sd, IBSYNMsg& msg);
  int recv_msg(CephContext *cct, int sd, IBSYNMsg& msg);
  uint16_t get_lid() { return device->get_lid(); }
  ibv_gid get_gid() { return device->get_gid(); }
  MemoryManager* get_memory_manager() { return memory_manager; }
  Device* get_device() { return device; }
  int get_async_fd() { return device->ctxt->async_fd; }
  bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c); }
  bool is_rx_buffer(const char* c) { return memory_manager->is_rx_buffer(c); }
  Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
  static const char* wc_status_to_string(int status);
  static const char* qp_state_string(int status);
};
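
// Overall setup sketch, assuming a CephContext *cct and an RDMADispatcher
// *d supplied by the caller:
//
//   Infiniband ib(cct, device_name, port_num);
//   ib.init();
//   ib.set_dispatcher(d);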

#endif