]>
Commit | Line | Data |
---|---|---|
2a2359b8 XY |
1 | /* |
2 | * Export QEMU block device via VDUSE | |
3 | * | |
4 | * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. | |
5 | * | |
6 | * Author: | |
7 | * Xie Yongji <xieyongji@bytedance.com> | |
8 | * | |
9 | * This work is licensed under the terms of the GNU GPL, version 2 or | |
10 | * later. See the COPYING file in the top-level directory. | |
11 | */ | |
12 | ||
2ca10fae | 13 | #include "qemu/osdep.h" |
2a2359b8 XY |
14 | #include <sys/eventfd.h> |
15 | ||
2a2359b8 XY |
16 | #include "qapi/error.h" |
17 | #include "block/export.h" | |
18 | #include "qemu/error-report.h" | |
19 | #include "util/block-helpers.h" | |
20 | #include "subprojects/libvduse/libvduse.h" | |
21 | #include "virtio-blk-handler.h" | |
22 | ||
23 | #include "standard-headers/linux/virtio_blk.h" | |
24 | ||
25 | #define VDUSE_DEFAULT_NUM_QUEUE 1 | |
26 | #define VDUSE_DEFAULT_QUEUE_SIZE 256 | |
27 | ||
28 | typedef struct VduseBlkExport { | |
29 | BlockExport export; | |
30 | VirtioBlkHandler handler; | |
31 | VduseDev *dev; | |
32 | uint16_t num_queues; | |
d043e2db | 33 | char *recon_file; |
195332c1 SH |
34 | unsigned int inflight; /* atomic */ |
35 | bool vqs_started; | |
2a2359b8 XY |
36 | } VduseBlkExport; |
37 | ||
38 | typedef struct VduseBlkReq { | |
39 | VduseVirtqElement elem; | |
40 | VduseVirtq *vq; | |
41 | } VduseBlkReq; | |
42 | ||
43 | static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp) | |
44 | { | |
195332c1 SH |
45 | if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) { |
46 | /* Prevent export from being deleted */ | |
195332c1 | 47 | blk_exp_ref(&vblk_exp->export); |
195332c1 | 48 | } |
2a2359b8 XY |
49 | } |
50 | ||
51 | static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp) | |
52 | { | |
195332c1 SH |
53 | if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) { |
54 | /* Wake AIO_WAIT_WHILE() */ | |
2a2359b8 | 55 | aio_wait_kick(); |
195332c1 SH |
56 | |
57 | /* Now the export can be deleted */ | |
195332c1 | 58 | blk_exp_unref(&vblk_exp->export); |
2a2359b8 XY |
59 | } |
60 | } | |
61 | ||
62 | static void vduse_blk_req_complete(VduseBlkReq *req, size_t in_len) | |
63 | { | |
64 | vduse_queue_push(req->vq, &req->elem, in_len); | |
65 | vduse_queue_notify(req->vq); | |
66 | ||
67 | free(req); | |
68 | } | |
69 | ||
70 | static void coroutine_fn vduse_blk_virtio_process_req(void *opaque) | |
71 | { | |
72 | VduseBlkReq *req = opaque; | |
73 | VduseVirtq *vq = req->vq; | |
74 | VduseDev *dev = vduse_queue_get_dev(vq); | |
75 | VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); | |
76 | VirtioBlkHandler *handler = &vblk_exp->handler; | |
77 | VduseVirtqElement *elem = &req->elem; | |
78 | struct iovec *in_iov = elem->in_sg; | |
79 | struct iovec *out_iov = elem->out_sg; | |
80 | unsigned in_num = elem->in_num; | |
81 | unsigned out_num = elem->out_num; | |
82 | int in_len; | |
83 | ||
84 | in_len = virtio_blk_process_req(handler, in_iov, | |
85 | out_iov, in_num, out_num); | |
86 | if (in_len < 0) { | |
87 | free(req); | |
88 | return; | |
89 | } | |
90 | ||
91 | vduse_blk_req_complete(req, in_len); | |
92 | vduse_blk_inflight_dec(vblk_exp); | |
93 | } | |
94 | ||
95 | static void vduse_blk_vq_handler(VduseDev *dev, VduseVirtq *vq) | |
96 | { | |
97 | VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); | |
98 | ||
99 | while (1) { | |
100 | VduseBlkReq *req; | |
101 | ||
102 | req = vduse_queue_pop(vq, sizeof(VduseBlkReq)); | |
103 | if (!req) { | |
104 | break; | |
105 | } | |
106 | req->vq = vq; | |
107 | ||
108 | Coroutine *co = | |
109 | qemu_coroutine_create(vduse_blk_virtio_process_req, req); | |
110 | ||
111 | vduse_blk_inflight_inc(vblk_exp); | |
112 | qemu_coroutine_enter(co); | |
113 | } | |
114 | } | |
115 | ||
116 | static void on_vduse_vq_kick(void *opaque) | |
117 | { | |
118 | VduseVirtq *vq = opaque; | |
119 | VduseDev *dev = vduse_queue_get_dev(vq); | |
120 | int fd = vduse_queue_get_fd(vq); | |
121 | eventfd_t kick_data; | |
122 | ||
123 | if (eventfd_read(fd, &kick_data) == -1) { | |
124 | error_report("failed to read data from eventfd"); | |
125 | return; | |
126 | } | |
127 | ||
128 | vduse_blk_vq_handler(dev, vq); | |
129 | } | |
130 | ||
131 | static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq) | |
132 | { | |
133 | VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); | |
134 | ||
195332c1 SH |
135 | if (!vblk_exp->vqs_started) { |
136 | return; /* vduse_blk_drained_end() will start vqs later */ | |
137 | } | |
138 | ||
2a2359b8 | 139 | aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), |
60f782b6 | 140 | on_vduse_vq_kick, NULL, NULL, NULL, vq); |
d043e2db XY |
141 | /* Make sure we don't miss any kick afer reconnecting */ |
142 | eventfd_write(vduse_queue_get_fd(vq), 1); | |
2a2359b8 XY |
143 | } |
144 | ||
145 | static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq) | |
146 | { | |
147 | VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); | |
195332c1 | 148 | int fd = vduse_queue_get_fd(vq); |
2a2359b8 | 149 | |
195332c1 SH |
150 | if (fd < 0) { |
151 | return; | |
152 | } | |
153 | ||
60f782b6 | 154 | aio_set_fd_handler(vblk_exp->export.ctx, fd, |
195332c1 | 155 | NULL, NULL, NULL, NULL, NULL); |
2a2359b8 XY |
156 | } |
157 | ||
158 | static const VduseOps vduse_blk_ops = { | |
159 | .enable_queue = vduse_blk_enable_queue, | |
160 | .disable_queue = vduse_blk_disable_queue, | |
161 | }; | |
162 | ||
163 | static void on_vduse_dev_kick(void *opaque) | |
164 | { | |
165 | VduseDev *dev = opaque; | |
166 | ||
167 | vduse_dev_handler(dev); | |
168 | } | |
169 | ||
170 | static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx) | |
171 | { | |
2a2359b8 | 172 | aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), |
60f782b6 | 173 | on_vduse_dev_kick, NULL, NULL, NULL, |
2a2359b8 XY |
174 | vblk_exp->dev); |
175 | ||
195332c1 | 176 | /* Virtqueues are handled by vduse_blk_drained_end() */ |
2a2359b8 XY |
177 | } |
178 | ||
179 | static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp) | |
180 | { | |
2a2359b8 | 181 | aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), |
60f782b6 | 182 | NULL, NULL, NULL, NULL, NULL); |
2a2359b8 | 183 | |
195332c1 | 184 | /* Virtqueues are handled by vduse_blk_drained_begin() */ |
2a2359b8 XY |
185 | } |
186 | ||
187 | ||
188 | static void blk_aio_attached(AioContext *ctx, void *opaque) | |
189 | { | |
190 | VduseBlkExport *vblk_exp = opaque; | |
191 | ||
192 | vblk_exp->export.ctx = ctx; | |
193 | vduse_blk_attach_ctx(vblk_exp, ctx); | |
194 | } | |
195 | ||
196 | static void blk_aio_detach(void *opaque) | |
197 | { | |
198 | VduseBlkExport *vblk_exp = opaque; | |
199 | ||
200 | vduse_blk_detach_ctx(vblk_exp); | |
201 | vblk_exp->export.ctx = NULL; | |
202 | } | |
203 | ||
9e4dea67 XY |
204 | static void vduse_blk_resize(void *opaque) |
205 | { | |
206 | BlockExport *exp = opaque; | |
207 | VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | |
208 | struct virtio_blk_config config; | |
209 | ||
210 | config.capacity = | |
211 | cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS); | |
212 | vduse_dev_update_config(vblk_exp->dev, sizeof(config.capacity), | |
213 | offsetof(struct virtio_blk_config, capacity), | |
214 | (char *)&config.capacity); | |
215 | } | |
216 | ||
195332c1 SH |
217 | static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp) |
218 | { | |
219 | for (uint16_t i = 0; i < vblk_exp->num_queues; i++) { | |
220 | VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); | |
221 | vduse_blk_disable_queue(vblk_exp->dev, vq); | |
222 | } | |
223 | ||
224 | vblk_exp->vqs_started = false; | |
225 | } | |
226 | ||
227 | static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp) | |
228 | { | |
229 | vblk_exp->vqs_started = true; | |
230 | ||
231 | for (uint16_t i = 0; i < vblk_exp->num_queues; i++) { | |
232 | VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); | |
233 | vduse_blk_enable_queue(vblk_exp->dev, vq); | |
234 | } | |
235 | } | |
236 | ||
237 | static void vduse_blk_drained_begin(void *opaque) | |
238 | { | |
239 | BlockExport *exp = opaque; | |
240 | VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | |
241 | ||
242 | vduse_blk_stop_virtqueues(vblk_exp); | |
243 | } | |
244 | ||
245 | static void vduse_blk_drained_end(void *opaque) | |
246 | { | |
247 | BlockExport *exp = opaque; | |
248 | VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | |
249 | ||
250 | vduse_blk_start_virtqueues(vblk_exp); | |
251 | } | |
252 | ||
253 | static bool vduse_blk_drained_poll(void *opaque) | |
254 | { | |
255 | BlockExport *exp = opaque; | |
256 | VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | |
257 | ||
258 | return qatomic_read(&vblk_exp->inflight) > 0; | |
259 | } | |
260 | ||
9e4dea67 | 261 | static const BlockDevOps vduse_block_ops = { |
195332c1 SH |
262 | .resize_cb = vduse_blk_resize, |
263 | .drained_begin = vduse_blk_drained_begin, | |
264 | .drained_end = vduse_blk_drained_end, | |
265 | .drained_poll = vduse_blk_drained_poll, | |
9e4dea67 XY |
266 | }; |
267 | ||
2a2359b8 XY |
268 | static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, |
269 | Error **errp) | |
270 | { | |
271 | VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | |
272 | BlockExportOptionsVduseBlk *vblk_opts = &opts->u.vduse_blk; | |
273 | uint64_t logical_block_size = VIRTIO_BLK_SECTOR_SIZE; | |
274 | uint16_t num_queues = VDUSE_DEFAULT_NUM_QUEUE; | |
275 | uint16_t queue_size = VDUSE_DEFAULT_QUEUE_SIZE; | |
276 | Error *local_err = NULL; | |
277 | struct virtio_blk_config config = { 0 }; | |
278 | uint64_t features; | |
0862a087 | 279 | int i, ret; |
2a2359b8 XY |
280 | |
281 | if (vblk_opts->has_num_queues) { | |
282 | num_queues = vblk_opts->num_queues; | |
283 | if (num_queues == 0) { | |
284 | error_setg(errp, "num-queues must be greater than 0"); | |
285 | return -EINVAL; | |
286 | } | |
287 | } | |
288 | ||
289 | if (vblk_opts->has_queue_size) { | |
290 | queue_size = vblk_opts->queue_size; | |
291 | if (queue_size <= 2 || !is_power_of_2(queue_size) || | |
292 | queue_size > VIRTQUEUE_MAX_SIZE) { | |
293 | error_setg(errp, "queue-size is invalid"); | |
294 | return -EINVAL; | |
295 | } | |
296 | } | |
297 | ||
298 | if (vblk_opts->has_logical_block_size) { | |
299 | logical_block_size = vblk_opts->logical_block_size; | |
300 | check_block_size(exp->id, "logical-block-size", logical_block_size, | |
301 | &local_err); | |
302 | if (local_err) { | |
303 | error_propagate(errp, local_err); | |
304 | return -EINVAL; | |
305 | } | |
306 | } | |
307 | vblk_exp->num_queues = num_queues; | |
308 | vblk_exp->handler.blk = exp->blk; | |
54fde4ff | 309 | vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: ""); |
2a2359b8 XY |
310 | vblk_exp->handler.logical_block_size = logical_block_size; |
311 | vblk_exp->handler.writable = opts->writable; | |
195332c1 | 312 | vblk_exp->vqs_started = true; |
2a2359b8 XY |
313 | |
314 | config.capacity = | |
315 | cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS); | |
316 | config.seg_max = cpu_to_le32(queue_size - 2); | |
317 | config.min_io_size = cpu_to_le16(1); | |
318 | config.opt_io_size = cpu_to_le32(1); | |
319 | config.num_queues = cpu_to_le16(num_queues); | |
320 | config.blk_size = cpu_to_le32(logical_block_size); | |
321 | config.max_discard_sectors = cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS); | |
322 | config.max_discard_seg = cpu_to_le32(1); | |
323 | config.discard_sector_alignment = | |
324 | cpu_to_le32(logical_block_size >> VIRTIO_BLK_SECTOR_BITS); | |
325 | config.max_write_zeroes_sectors = | |
326 | cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS); | |
327 | config.max_write_zeroes_seg = cpu_to_le32(1); | |
328 | ||
329 | features = vduse_get_virtio_features() | | |
330 | (1ULL << VIRTIO_BLK_F_SEG_MAX) | | |
331 | (1ULL << VIRTIO_BLK_F_TOPOLOGY) | | |
332 | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | | |
333 | (1ULL << VIRTIO_BLK_F_FLUSH) | | |
334 | (1ULL << VIRTIO_BLK_F_DISCARD) | | |
335 | (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); | |
336 | ||
337 | if (num_queues > 1) { | |
338 | features |= 1ULL << VIRTIO_BLK_F_MQ; | |
339 | } | |
340 | if (!opts->writable) { | |
341 | features |= 1ULL << VIRTIO_BLK_F_RO; | |
342 | } | |
343 | ||
779d82e1 | 344 | vblk_exp->dev = vduse_dev_create(vblk_opts->name, VIRTIO_ID_BLOCK, 0, |
2a2359b8 XY |
345 | features, num_queues, |
346 | sizeof(struct virtio_blk_config), | |
347 | (char *)&config, &vduse_blk_ops, | |
348 | vblk_exp); | |
349 | if (!vblk_exp->dev) { | |
350 | error_setg(errp, "failed to create vduse device"); | |
0862a087 XY |
351 | ret = -ENOMEM; |
352 | goto err_dev; | |
2a2359b8 XY |
353 | } |
354 | ||
d043e2db | 355 | vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s", |
779d82e1 | 356 | g_get_tmp_dir(), vblk_opts->name); |
d043e2db XY |
357 | if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) { |
358 | error_setg(errp, "failed to set reconnect log file"); | |
0862a087 XY |
359 | ret = -EINVAL; |
360 | goto err; | |
d043e2db XY |
361 | } |
362 | ||
2a2359b8 XY |
363 | for (i = 0; i < num_queues; i++) { |
364 | vduse_dev_setup_queue(vblk_exp->dev, i, queue_size); | |
365 | } | |
366 | ||
60f782b6 | 367 | aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), |
2a2359b8 XY |
368 | on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev); |
369 | ||
370 | blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, | |
371 | vblk_exp); | |
9e4dea67 XY |
372 | blk_set_dev_ops(exp->blk, &vduse_block_ops, exp); |
373 | ||
195332c1 SH |
374 | /* |
375 | * We handle draining ourselves using an in-flight counter and by disabling | |
376 | * virtqueue fd handlers. Do not queue BlockBackend requests, they need to | |
377 | * complete so the in-flight counter reaches zero. | |
378 | */ | |
379 | blk_set_disable_request_queuing(exp->blk, true); | |
380 | ||
2a2359b8 | 381 | return 0; |
0862a087 XY |
382 | err: |
383 | vduse_dev_destroy(vblk_exp->dev); | |
384 | g_free(vblk_exp->recon_file); | |
385 | err_dev: | |
386 | g_free(vblk_exp->handler.serial); | |
387 | return ret; | |
2a2359b8 XY |
388 | } |
389 | ||
390 | static void vduse_blk_exp_delete(BlockExport *exp) | |
391 | { | |
392 | VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | |
d043e2db | 393 | int ret; |
2a2359b8 | 394 | |
195332c1 SH |
395 | assert(qatomic_read(&vblk_exp->inflight) == 0); |
396 | ||
397 | vduse_blk_detach_ctx(vblk_exp); | |
2a2359b8 XY |
398 | blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, |
399 | vblk_exp); | |
d043e2db XY |
400 | ret = vduse_dev_destroy(vblk_exp->dev); |
401 | if (ret != -EBUSY) { | |
402 | unlink(vblk_exp->recon_file); | |
403 | } | |
404 | g_free(vblk_exp->recon_file); | |
0862a087 | 405 | g_free(vblk_exp->handler.serial); |
2a2359b8 XY |
406 | } |
407 | ||
195332c1 | 408 | /* Called with exp->ctx acquired */ |
2a2359b8 XY |
409 | static void vduse_blk_exp_request_shutdown(BlockExport *exp) |
410 | { | |
411 | VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | |
412 | ||
195332c1 | 413 | vduse_blk_stop_virtqueues(vblk_exp); |
2a2359b8 XY |
414 | } |
415 | ||
416 | const BlockExportDriver blk_exp_vduse_blk = { | |
417 | .type = BLOCK_EXPORT_TYPE_VDUSE_BLK, | |
418 | .instance_size = sizeof(VduseBlkExport), | |
419 | .create = vduse_blk_exp_create, | |
420 | .delete = vduse_blk_exp_delete, | |
421 | .request_shutdown = vduse_blk_exp_request_shutdown, | |
422 | }; |