// Origin: ceph.git, ceph/src/msg/async/rdma/Infiniband.h (as shown by git.proxmox.com gitweb)
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 XSKY <haomai@xsky.com>
 *
 * Author: Haomai Wang <haomaiwang@gmail.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
17 #ifndef CEPH_INFINIBAND_H
18 #define CEPH_INFINIBAND_H
23 #include <infiniband/verbs.h>
25 #include "include/int_types.h"
26 #include "include/page.h"
27 #include "common/debug.h"
28 #include "common/errno.h"
29 #include "common/Mutex.h"
30 #include "msg/msg_types.h"
31 #include "msg/async/net_handler.h"
// Size of one huge page in bytes: 2 * 1024 * 1024.
#define HUGE_PAGE_SIZE (2 * 1024 * 1024)
// Rounds x up to the next multiple of HUGE_PAGE_SIZE; a value that is already
// a multiple (including 0) is returned unchanged. x is evaluated exactly once.
#define ALIGN_TO_PAGE_SIZE(x) \
  ((((x) + HUGE_PAGE_SIZE - 1) / HUGE_PAGE_SIZE) * HUGE_PAGE_SIZE)
43 } __attribute__((packed
));
// Verbs device context pointer kept by the port.
// NOTE(review): presumably the context handed to the constructor -- confirm.
struct ibv_context* ctxt;
// Port attribute record; this is the pointer returned by get_port_attr().
struct ibv_port_attr* port_attr;
57 explicit Port(CephContext
*cct
, struct ibv_context
* ictxt
, uint8_t ipn
);
58 uint16_t get_lid() { return lid
; }
59 ibv_gid
get_gid() { return gid
; }
60 int get_port_num() { return port_num
; }
61 ibv_port_attr
* get_port_attr() { return port_attr
; }
62 int get_gid_idx() { return gid_idx
; }
71 explicit Device(CephContext
*c
, ibv_device
* d
);
75 assert(ibv_close_device(ctxt
) == 0);
78 const char* get_name() { return name
;}
79 uint16_t get_lid() { return active_port
->get_lid(); }
80 ibv_gid
get_gid() { return active_port
->get_gid(); }
81 int get_gid_idx() { return active_port
->get_gid_idx(); }
82 void binding_port(CephContext
*c
, int port_num
);
83 struct ibv_context
*ctxt
;
84 ibv_device_attr
*device_attr
;
// Array returned by ibv_get_device_list() in the constructor and handed back
// to ibv_free_device_list() later in this class.
struct ibv_device** device_list;
94 DeviceList(CephContext
*cct
): device_list(ibv_get_device_list(&num
)) {
95 if (device_list
== NULL
|| num
== 0) {
96 lderr(cct
) << __func__
<< " failed to get rdma device list. " << cpp_strerror(errno
) << dendl
;
99 devices
= new Device
*[num
];
101 for (int i
= 0;i
< num
; ++i
) {
102 devices
[i
] = new Device(cct
, device_list
[i
]);
106 for (int i
=0; i
< num
; ++i
) {
110 ibv_free_device_list(device_list
);
113 Device
* get_device(const char* device_name
) {
115 for (int i
= 0; i
< num
; ++i
) {
116 if (!strlen(device_name
) || !strcmp(device_name
, devices
[i
]->get_name())) {
// Forward declaration: this header only refers to RDMADispatcher through
// pointers (see set_dispatcher and the dispatcher member).
class RDMADispatcher;
129 class ProtectionDomain
{
131 explicit ProtectionDomain(CephContext
*cct
, Device
*device
);
138 class MemoryManager
{
142 Chunk(ibv_mr
* m
, uint32_t len
, char* b
);
// Chunk cursor and data-movement interface. Declarations only; the definitions
// are not in this header section, so the notes below are assumptions to
// confirm rather than documented behavior.
// NOTE(review): offset/bound look like a read cursor and a fill mark inside
// the chunk's buffer -- confirm.
void set_offset(uint32_t o);
uint32_t get_offset();
void set_bound(uint32_t b);
void prepare_read(uint32_t b);
uint32_t get_bound();
// NOTE(review): both presumably return the number of bytes actually copied,
// which may be less than `len` -- confirm.
uint32_t read(char* buf, uint32_t len);
uint32_t write(char* buf, uint32_t len);
155 void post_srq(Infiniband
*ib
);
167 Cluster(MemoryManager
& m
, uint32_t s
);
170 int fill(uint32_t num
);
171 void take_back(std::vector
<Chunk
*> &ck
);
172 int get_buffers(std::vector
<Chunk
*> &chunks
, size_t bytes
);
173 Chunk
*get_chunk_by_buffer(const char *c
) {
174 uint32_t idx
= (c
- base
) / buffer_size
;
175 Chunk
*chunk
= chunk_base
+ idx
;
178 bool is_my_buffer(const char *c
) const {
179 return c
>= base
&& c
< end
;
182 MemoryManager
& manager
;
183 uint32_t buffer_size
;
186 std::vector
<Chunk
*> free_chunks
;
187 char *base
= nullptr;
189 Chunk
* chunk_base
= nullptr;
192 MemoryManager(Device
*d
, ProtectionDomain
*p
, bool hugepage
);
195 void* malloc_huge_pages(size_t size
);
196 void free_huge_pages(void *ptr
);
197 void register_rx_tx(uint32_t size
, uint32_t rx_num
, uint32_t tx_num
);
198 void return_tx(std::vector
<Chunk
*> &chunks
);
199 int get_send_buffers(std::vector
<Chunk
*> &c
, size_t bytes
);
200 int get_channel_buffers(std::vector
<Chunk
*> &chunks
, size_t bytes
);
201 bool is_tx_buffer(const char* c
) { return send
->is_my_buffer(c
); }
202 bool is_rx_buffer(const char* c
) { return channel
->is_my_buffer(c
); }
203 Chunk
*get_tx_chunk_by_buffer(const char *c
) {
204 return send
->get_chunk_by_buffer(c
);
206 uint32_t get_tx_buffer_size() const {
207 return send
->buffer_size
;
210 bool enabled_huge_page
;
213 Cluster
* channel
;//RECV
214 Cluster
* send
;// SEND
216 ProtectionDomain
*pd
;
220 uint32_t max_send_wr
;
221 uint32_t max_recv_wr
;
223 uint8_t ib_physical_port
;
224 MemoryManager
* memory_manager
;
225 ibv_srq
* srq
; // shared receive work queue
227 ProtectionDomain
*pd
;
228 DeviceList
*device_list
= nullptr;
229 RDMADispatcher
*dispatcher
= nullptr;
// Conversions between a GID's character ("wire") form and union ibv_gid.
// Declarations only; the definitions are not in this header section.
// NOTE(review): the required length of `wgid` is not visible here -- confirm
// before sizing caller buffers.
void wire_gid_to_gid(const char *wgid, union ibv_gid *gid);
void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]);
// Starts out false.
// NOTE(review): presumably set once initialization has completed -- confirm.
bool initialized = false;
// Device name held by value. A reference member would keep pointing at the
// constructor argument and dangle (or observe later edits) if that string is
// a temporary or is changed after construction; a copy cannot.
std::string device_name;
239 explicit Infiniband(CephContext
*c
, const std::string
&device_name
, uint8_t p
);
243 void set_dispatcher(RDMADispatcher
*d
);
245 class CompletionChannel
{
246 static const uint32_t MAX_ACK_EVENT
= 5000;
248 Infiniband
& infiniband
;
249 ibv_comp_channel
*channel
;
251 uint32_t cq_events_that_need_ack
;
254 CompletionChannel(CephContext
*c
, Infiniband
&ib
);
255 ~CompletionChannel();
258 int get_fd() { return channel
->fd
; }
259 ibv_comp_channel
* get_channel() { return channel
; }
260 void bind_cq(ibv_cq
*c
) { cq
= c
; }
// This class encapsulates the creation, use, and destruction of an RC
// completion queue.
//
// You need to call init, which creates a CQ and associates it with the
// completion channel.
268 class CompletionQueue
{
270 CompletionQueue(CephContext
*c
, Infiniband
&ib
,
271 const uint32_t qd
, CompletionChannel
*cc
)
272 : cct(c
), infiniband(ib
), channel(cc
), cq(NULL
), queue_depth(qd
) {}
275 int poll_cq(int num_entries
, ibv_wc
*ret_wc_array
);
277 ibv_cq
* get_cq() const { return cq
; }
278 int rearm_notify(bool solicited_only
=true);
279 CompletionChannel
* get_cc() const { return channel
; }
282 Infiniband
& infiniband
; // Infiniband to which this QP belongs
283 CompletionChannel
*channel
;
285 uint32_t queue_depth
;
// This class encapsulates the creation, use, and destruction of an RC
// queue pair.
//
// You need to call init, which creates a QP and brings it to the INIT state.
// After obtaining the lid, qpn, and psn of a remote queue pair, one
// must call plumb() to bring the queue pair to the RTS state.
296 QueuePair(CephContext
*c
, Infiniband
& infiniband
, ibv_qp_type type
,
297 int ib_physical_port
, ibv_srq
*srq
,
298 Infiniband::CompletionQueue
* txcq
,
299 Infiniband::CompletionQueue
* rxcq
,
300 uint32_t max_send_wr
, uint32_t max_recv_wr
, uint32_t q_key
= 0);
306 * Get the initial packet sequence number for this QueuePair.
307 * This is randomly generated on creation. It should not be confused
308 * with the remote side's PSN, which is set in #plumb().
310 uint32_t get_initial_psn() const { return initial_psn
; };
312 * Get the local queue pair number for this QueuePair.
313 * QPNs are analogous to UDP/TCP port numbers.
315 uint32_t get_local_qp_number() const { return qp
->qp_num
; };
317 * Get the remote queue pair number for this QueuePair, as set in #plumb().
318 * QPNs are analogous to UDP/TCP port numbers.
320 int get_remote_qp_number(uint32_t *rqp
) const;
322 * Get the remote infiniband address for this QueuePair, as set in #plumb().
323 * LIDs are "local IDs" in infiniband terminology. They are short, locally
324 * routable addresses.
326 int get_remote_lid(uint16_t *lid
) const;
328 * Get the state of a QueuePair.
330 int get_state() const;
332 * Return true if the queue pair is in an error state, false otherwise.
334 bool is_error() const;
335 ibv_qp
* get_qp() const { return qp
; }
336 Infiniband::CompletionQueue
* get_tx_cq() const { return txcq
; }
337 Infiniband::CompletionQueue
* get_rx_cq() const { return rxcq
; }
339 bool is_dead() const { return dead
; }
343 Infiniband
& infiniband
; // Infiniband to which this QP belongs
344 ibv_qp_type type
; // QP type (IBV_QPT_RC, etc.)
345 ibv_context
* ctxt
; // device context of the HCA to use
346 int ib_physical_port
;
347 ibv_pd
* pd
; // protection domain
348 ibv_srq
* srq
; // shared receive queue
349 ibv_qp
* qp
; // infiniband verbs QP handle
350 Infiniband::CompletionQueue
* txcq
;
351 Infiniband::CompletionQueue
* rxcq
;
352 uint32_t initial_psn
; // initial packet sequence number
353 uint32_t max_send_wr
;
354 uint32_t max_recv_wr
;
360 typedef MemoryManager::Cluster Cluster
;
361 typedef MemoryManager::Chunk Chunk
;
362 QueuePair
* create_queue_pair(CephContext
*c
, CompletionQueue
*, CompletionQueue
*, ibv_qp_type type
);
363 ibv_srq
* create_shared_receive_queue(uint32_t max_wr
, uint32_t max_sge
);
364 int post_chunk(Chunk
* chunk
);
365 int post_channel_cluster();
366 int get_tx_buffers(std::vector
<Chunk
*> &c
, size_t bytes
);
367 CompletionChannel
*create_comp_channel(CephContext
*c
);
368 CompletionQueue
*create_comp_queue(CephContext
*c
, CompletionChannel
*cc
=NULL
);
369 uint8_t get_ib_physical_port() { return ib_physical_port
; }
370 int send_msg(CephContext
*cct
, int sd
, IBSYNMsg
& msg
);
371 int recv_msg(CephContext
*cct
, int sd
, IBSYNMsg
& msg
);
372 uint16_t get_lid() { return device
->get_lid(); }
373 ibv_gid
get_gid() { return device
->get_gid(); }
374 MemoryManager
* get_memory_manager() { return memory_manager
; }
375 Device
* get_device() { return device
; }
376 int get_async_fd() { return device
->ctxt
->async_fd
; }
377 bool is_tx_buffer(const char* c
) { return memory_manager
->is_tx_buffer(c
);}
378 bool is_rx_buffer(const char* c
) { return memory_manager
->is_rx_buffer(c
);}
379 Chunk
*get_tx_chunk_by_buffer(const char *c
) { return memory_manager
->get_tx_chunk_by_buffer(c
); }
// Static helpers that map a numeric code to a C string. Declarations only;
// the definitions are not in this header section.
// NOTE(review): presumably `status` is an ibv_wc_status value for the first
// and an ibv_qp_state value for the second -- confirm.
static const char* wc_status_to_string(int status);
static const char* qp_state_string(int status);