1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/block-backend.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 #include "qapi-event.h"
39
40 #ifdef CONFIG_BSD
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <sys/ioctl.h>
44 #include <sys/queue.h>
45 #ifndef __DragonFly__
46 #include <sys/disk.h>
47 #endif
48 #endif
49
50 #ifdef _WIN32
51 #include <windows.h>
52 #endif
53
54 struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
57 };
58
59 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60
61 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63 BlockCompletionFunc *cb, void *opaque);
64 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66 BlockCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75 BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78 BdrvRequestFlags flags);
79 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
84 BlockCompletionFunc *cb,
85 void *opaque,
86 bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
93
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
99
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
102
103 #ifdef _WIN32
104 static int is_windows_drive_prefix(const char *filename)
105 {
106 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108 filename[1] == ':');
109 }
110
111 int is_windows_drive(const char *filename)
112 {
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120 }
121 #endif
122
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
126 {
127 int i;
128
129 throttle_config(&bs->throttle_state, cfg);
130
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
133 }
134 }
135
136 /* this function drains all the throttled I/Os */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138 {
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
142
143 bs->io_limits_enabled = false;
144
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
148 }
149 }
150
151 bs->io_limits_enabled = enabled;
152
153 return drained;
154 }
155
156 void bdrv_io_limits_disable(BlockDriverState *bs)
157 {
158 bs->io_limits_enabled = false;
159
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
163 }
164
165 static void bdrv_throttle_read_timer_cb(void *opaque)
166 {
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
169 }
170
171 static void bdrv_throttle_write_timer_cb(void *opaque)
172 {
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
175 }
176
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
179 {
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 bdrv_get_aio_context(bs),
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
188 }
189
190 /* This function makes an I/O request wait if needed
191 *
192 * @bytes: the number of bytes of the I/O
193 * @is_write: whether the I/O is a write
194 */
195 static void bdrv_io_limits_intercept(BlockDriverState *bs,
196 unsigned int bytes,
197 bool is_write)
198 {
199 /* does this I/O have to wait? */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
201
202 /* if it must wait, or any request of this type is already queued, queue the I/O */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
206 }
207
208 /* the IO will be executed, do the accounting */
209 throttle_account(&bs->throttle_state, is_write, bytes);
210
211
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
215 }
216
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
219 }
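/*
 * Illustrative sketch, not part of the original source: how a request
 * path is expected to use the throttling helpers above. The name
 * example_throttled_read() is hypothetical; the bdrv_* calls are the
 * ones declared in this file. Throttling is enabled with
 * bdrv_io_limits_enable(), and each I/O then passes through
 * bdrv_io_limits_intercept() before being issued, which may yield the
 * coroutine until the token bucket allows the request.
 */
#if 0 /* example only, never compiled */
static int coroutine_fn example_throttled_read(BlockDriverState *bs,
                                               int64_t offset,
                                               unsigned int bytes,
                                               QEMUIOVector *qiov)
{
    if (bs->io_limits_enabled) {
        /* may queue this coroutine behind earlier throttled requests */
        bdrv_io_limits_intercept(bs, bytes, false);
    }
    return bdrv_co_do_preadv(bs, offset, bytes, qiov, 0);
}
#endif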
220
221 size_t bdrv_opt_mem_align(BlockDriverState *bs)
222 {
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
226 }
227
228 return bs->bl.opt_mem_alignment;
229 }
230
231 /* check if the path starts with "<protocol>:" */
232 int path_has_protocol(const char *path)
233 {
234 const char *p;
235
236 #ifdef _WIN32
237 if (is_windows_drive(path) ||
238 is_windows_drive_prefix(path)) {
239 return 0;
240 }
241 p = path + strcspn(path, ":/\\");
242 #else
243 p = path + strcspn(path, ":/");
244 #endif
245
246 return *p == ':';
247 }
248
249 int path_is_absolute(const char *path)
250 {
251 #ifdef _WIN32
252 /* specific case for names like: "\\.\d:" */
253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
254 return 1;
255 }
256 return (*path == '/' || *path == '\\');
257 #else
258 return (*path == '/');
259 #endif
260 }
261
262 /* If filename is absolute, just copy it to dest. Otherwise, build a
263 path to it by treating it as relative to base_path. URLs are
264 supported. */
265 void path_combine(char *dest, int dest_size,
266 const char *base_path,
267 const char *filename)
268 {
269 const char *p, *p1;
270 int len;
271
272 if (dest_size <= 0)
273 return;
274 if (path_is_absolute(filename)) {
275 pstrcpy(dest, dest_size, filename);
276 } else {
277 p = strchr(base_path, ':');
278 if (p)
279 p++;
280 else
281 p = base_path;
282 p1 = strrchr(base_path, '/');
283 #ifdef _WIN32
284 {
285 const char *p2;
286 p2 = strrchr(base_path, '\\');
287 if (!p1 || p2 > p1)
288 p1 = p2;
289 }
290 #endif
291 if (p1)
292 p1++;
293 else
294 p1 = base_path;
295 if (p1 > p)
296 p = p1;
297 len = p - base_path;
298 if (len > dest_size - 1)
299 len = dest_size - 1;
300 memcpy(dest, base_path, len);
301 dest[len] = '\0';
302 pstrcat(dest, dest_size, filename);
303 }
304 }
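/*
 * Example, illustrative only: what path_combine() produces for a few
 * representative inputs (the values in the comments follow from the
 * code above).
 */
#if 0 /* example only, never compiled */
static void example_path_combine(void)
{
    char dest[PATH_MAX];

    /* relative name: resolved against the directory of base_path */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "backing.qcow2");
    /* dest == "/images/backing.qcow2" */

    /* absolute name: copied through unchanged */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "/mnt/other.raw");
    /* dest == "/mnt/other.raw" */

    /* protocol prefix of base_path is preserved */
    path_combine(dest, sizeof(dest), "http://host/dir/base.qcow2", "b.qcow2");
    /* dest == "http://host/dir/b.qcow2" */
}
#endif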
305
306 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
307 {
308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
309 pstrcpy(dest, sz, bs->backing_file);
310 } else {
311 path_combine(dest, sz, bs->filename, bs->backing_file);
312 }
313 }
314
315 void bdrv_register(BlockDriver *bdrv)
316 {
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
321
322 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
324 */
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
329 }
330 }
331
332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
333 }
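/*
 * Illustrative sketch, not part of the original source: the minimal
 * shape of a driver registration (real drivers live under block/; the
 * names bdrv_example and bdrv_example_init are hypothetical). Note that
 * only the coroutine read/write callbacks need to be provided;
 * bdrv_register() above fills in the AIO emulation when it is missing.
 */
#if 0 /* example only, never compiled */
static BlockDriver bdrv_example = {
    .format_name   = "example",
    .instance_size = 0,
    /* .bdrv_open, .bdrv_co_readv, .bdrv_co_writev, ... go here */
};

static void bdrv_example_init(void)
{
    bdrv_register(&bdrv_example);
}

block_init(bdrv_example_init);
#endif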
334
335 BlockDriverState *bdrv_new_root(void)
336 {
337 BlockDriverState *bs = bdrv_new();
338
339 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
340 return bs;
341 }
342
343 BlockDriverState *bdrv_new(void)
344 {
345 BlockDriverState *bs;
346 int i;
347
348 bs = g_new0(BlockDriverState, 1);
349 QLIST_INIT(&bs->dirty_bitmaps);
350 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
351 QLIST_INIT(&bs->op_blockers[i]);
352 }
353 bdrv_iostatus_disable(bs);
354 notifier_list_init(&bs->close_notifiers);
355 notifier_with_return_list_init(&bs->before_write_notifiers);
356 qemu_co_queue_init(&bs->throttled_reqs[0]);
357 qemu_co_queue_init(&bs->throttled_reqs[1]);
358 bs->refcnt = 1;
359 bs->aio_context = qemu_get_aio_context();
360
361 return bs;
362 }
363
364 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
365 {
366 notifier_list_add(&bs->close_notifiers, notify);
367 }
368
369 BlockDriver *bdrv_find_format(const char *format_name)
370 {
371 BlockDriver *drv1;
372 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
373 if (!strcmp(drv1->format_name, format_name)) {
374 return drv1;
375 }
376 }
377 return NULL;
378 }
379
380 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
381 {
382 static const char *whitelist_rw[] = {
383 CONFIG_BDRV_RW_WHITELIST
384 };
385 static const char *whitelist_ro[] = {
386 CONFIG_BDRV_RO_WHITELIST
387 };
388 const char **p;
389
390 if (!whitelist_rw[0] && !whitelist_ro[0]) {
391 return 1; /* no whitelist, anything goes */
392 }
393
394 for (p = whitelist_rw; *p; p++) {
395 if (!strcmp(drv->format_name, *p)) {
396 return 1;
397 }
398 }
399 if (read_only) {
400 for (p = whitelist_ro; *p; p++) {
401 if (!strcmp(drv->format_name, *p)) {
402 return 1;
403 }
404 }
405 }
406 return 0;
407 }
408
409 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
410 bool read_only)
411 {
412 BlockDriver *drv = bdrv_find_format(format_name);
413 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
414 }
415
416 typedef struct CreateCo {
417 BlockDriver *drv;
418 char *filename;
419 QemuOpts *opts;
420 int ret;
421 Error *err;
422 } CreateCo;
423
424 static void coroutine_fn bdrv_create_co_entry(void *opaque)
425 {
426 Error *local_err = NULL;
427 int ret;
428
429 CreateCo *cco = opaque;
430 assert(cco->drv);
431
432 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
433 if (local_err) {
434 error_propagate(&cco->err, local_err);
435 }
436 cco->ret = ret;
437 }
438
439 int bdrv_create(BlockDriver *drv, const char* filename,
440 QemuOpts *opts, Error **errp)
441 {
442 int ret;
443
444 Coroutine *co;
445 CreateCo cco = {
446 .drv = drv,
447 .filename = g_strdup(filename),
448 .opts = opts,
449 .ret = NOT_DONE,
450 .err = NULL,
451 };
452
453 if (!drv->bdrv_create) {
454 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
455 ret = -ENOTSUP;
456 goto out;
457 }
458
459 if (qemu_in_coroutine()) {
460 /* Fast-path if already in coroutine context */
461 bdrv_create_co_entry(&cco);
462 } else {
463 co = qemu_coroutine_create(bdrv_create_co_entry);
464 qemu_coroutine_enter(co, &cco);
465 while (cco.ret == NOT_DONE) {
466 aio_poll(qemu_get_aio_context(), true);
467 }
468 }
469
470 ret = cco.ret;
471 if (ret < 0) {
472 if (cco.err) {
473 error_propagate(errp, cco.err);
474 } else {
475 error_setg_errno(errp, -ret, "Could not create image");
476 }
477 }
478
479 out:
480 g_free(cco.filename);
481 return ret;
482 }
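/*
 * Example, illustrative only: creating a 1 GiB qcow2 image through
 * bdrv_create(). This is the same QemuOpts pattern that
 * bdrv_append_temp_snapshot() uses further down; the function name
 * example_create_image() is hypothetical.
 */
#if 0 /* example only, never compiled */
static int example_create_image(const char *filename, Error **errp)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QemuOpts *opts;
    int ret;

    if (!drv) {
        error_setg(errp, "qcow2 driver not available");
        return -ENOENT;
    }

    opts = qemu_opts_create(drv->create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024 * 1024);
    ret = bdrv_create(drv, filename, opts, errp);
    qemu_opts_del(opts);
    return ret;
}
#endif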
483
484 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
485 {
486 BlockDriver *drv;
487 Error *local_err = NULL;
488 int ret;
489
490 drv = bdrv_find_protocol(filename, true);
491 if (drv == NULL) {
492 error_setg(errp, "Could not find protocol for file '%s'", filename);
493 return -ENOENT;
494 }
495
496 ret = bdrv_create(drv, filename, opts, &local_err);
497 if (local_err) {
498 error_propagate(errp, local_err);
499 }
500 return ret;
501 }
502
503 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
504 {
505 BlockDriver *drv = bs->drv;
506 Error *local_err = NULL;
507
508 memset(&bs->bl, 0, sizeof(bs->bl));
509
510 if (!drv) {
511 return;
512 }
513
514 /* Take some limits from the children as a default */
515 if (bs->file) {
516 bdrv_refresh_limits(bs->file, &local_err);
517 if (local_err) {
518 error_propagate(errp, local_err);
519 return;
520 }
521 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
522 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
523 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
524 } else {
525 bs->bl.opt_mem_alignment = 512;
526 }
527
528 if (bs->backing_hd) {
529 bdrv_refresh_limits(bs->backing_hd, &local_err);
530 if (local_err) {
531 error_propagate(errp, local_err);
532 return;
533 }
534 bs->bl.opt_transfer_length =
535 MAX(bs->bl.opt_transfer_length,
536 bs->backing_hd->bl.opt_transfer_length);
537 bs->bl.max_transfer_length =
538 MIN_NON_ZERO(bs->bl.max_transfer_length,
539 bs->backing_hd->bl.max_transfer_length);
540 bs->bl.opt_mem_alignment =
541 MAX(bs->bl.opt_mem_alignment,
542 bs->backing_hd->bl.opt_mem_alignment);
543 }
544
545 /* Then let the driver override it */
546 if (drv->bdrv_refresh_limits) {
547 drv->bdrv_refresh_limits(bs, errp);
548 }
549 }
550
551 /*
552 * Create a uniquely-named empty temporary file.
553 * Return 0 upon success, otherwise a negative errno value.
554 */
555 int get_tmp_filename(char *filename, int size)
556 {
557 #ifdef _WIN32
558 char temp_dir[MAX_PATH];
559 /* GetTempFileName requires that its output buffer (4th param)
560 have length MAX_PATH or greater. */
561 assert(size >= MAX_PATH);
562 return (GetTempPath(MAX_PATH, temp_dir)
563 && GetTempFileName(temp_dir, "qem", 0, filename)
564 ? 0 : -GetLastError());
565 #else
566 int fd;
567 const char *tmpdir;
568 tmpdir = getenv("TMPDIR");
569 if (!tmpdir) {
570 tmpdir = "/var/tmp";
571 }
572 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
573 return -EOVERFLOW;
574 }
575 fd = mkstemp(filename);
576 if (fd < 0) {
577 return -errno;
578 }
579 if (close(fd) != 0) {
580 unlink(filename);
581 return -errno;
582 }
583 return 0;
584 #endif
585 }
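/*
 * Example, illustrative only: typical use of get_tmp_filename(),
 * matching what bdrv_append_temp_snapshot() does further down. The
 * caller owns the buffer and the created file.
 */
#if 0 /* example only, never compiled */
static int example_tmp_file(void)
{
    char *tmp = g_malloc0(PATH_MAX + 1);
    int ret = get_tmp_filename(tmp, PATH_MAX + 1);

    if (ret < 0) { /* negative errno on failure */
        g_free(tmp);
        return ret;
    }
    /* ... use the empty file at 'tmp', unlink it when done ... */
    g_free(tmp);
    return 0;
}
#endif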
586
587 /*
588 * Detect host devices. By convention, /dev/cdrom[N] is always
589 * recognized as a host CDROM.
590 */
591 static BlockDriver *find_hdev_driver(const char *filename)
592 {
593 int score_max = 0, score;
594 BlockDriver *drv = NULL, *d;
595
596 QLIST_FOREACH(d, &bdrv_drivers, list) {
597 if (d->bdrv_probe_device) {
598 score = d->bdrv_probe_device(filename);
599 if (score > score_max) {
600 score_max = score;
601 drv = d;
602 }
603 }
604 }
605
606 return drv;
607 }
608
609 BlockDriver *bdrv_find_protocol(const char *filename,
610 bool allow_protocol_prefix)
611 {
612 BlockDriver *drv1;
613 char protocol[128];
614 int len;
615 const char *p;
616
617 /* TODO Drivers without bdrv_file_open must be specified explicitly */
618
619 /*
620 * XXX(hch): we really should not let host device detection
621 * override an explicit protocol specification, but moving this
622 * later breaks access to device names with colons in them.
623 * Thanks to the brain-dead persistent naming schemes on udev-
624 * based Linux systems those actually are quite common.
625 */
626 drv1 = find_hdev_driver(filename);
627 if (drv1) {
628 return drv1;
629 }
630
631 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
632 return &bdrv_file;
633 }
634
635 p = strchr(filename, ':');
636 assert(p != NULL);
637 len = p - filename;
638 if (len > sizeof(protocol) - 1)
639 len = sizeof(protocol) - 1;
640 memcpy(protocol, filename, len);
641 protocol[len] = '\0';
642 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
643 if (drv1->protocol_name &&
644 !strcmp(drv1->protocol_name, protocol)) {
645 return drv1;
646 }
647 }
648 return NULL;
649 }
650
651 /*
652 * Guess image format by probing its contents.
653 * This is not a good idea when your image is raw (CVE-2008-2004), but
654 * we do it anyway for backward compatibility.
655 *
656 * @buf contains the image's first @buf_size bytes.
657 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
658 * but can be smaller if the image file is smaller)
659 * @filename is its filename.
660 *
661 * For all block drivers, call the bdrv_probe() method to get its
662 * probing score.
663 * Return the first block driver with the highest probing score.
664 */
665 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
666 const char *filename)
667 {
668 int score_max = 0, score;
669 BlockDriver *drv = NULL, *d;
670
671 QLIST_FOREACH(d, &bdrv_drivers, list) {
672 if (d->bdrv_probe) {
673 score = d->bdrv_probe(buf, buf_size, filename);
674 if (score > score_max) {
675 score_max = score;
676 drv = d;
677 }
678 }
679 }
680
681 return drv;
682 }
683
684 static int find_image_format(BlockDriverState *bs, const char *filename,
685 BlockDriver **pdrv, Error **errp)
686 {
687 BlockDriver *drv;
688 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
689 int ret = 0;
690
691 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
692 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
693 *pdrv = &bdrv_raw;
694 return ret;
695 }
696
697 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
698 if (ret < 0) {
699 error_setg_errno(errp, -ret, "Could not read image for determining its "
700 "format");
701 *pdrv = NULL;
702 return ret;
703 }
704
705 drv = bdrv_probe_all(buf, ret, filename);
706 if (!drv) {
707 error_setg(errp, "Could not determine image format: No compatible "
708 "driver found");
709 ret = -ENOENT;
710 }
711 *pdrv = drv;
712 return ret;
713 }
714
715 /**
716 * Set the current 'total_sectors' value
717 * Return 0 on success, -errno on error.
718 */
719 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
720 {
721 BlockDriver *drv = bs->drv;
722
723 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
724 if (bs->sg)
725 return 0;
726
727 /* query actual device if possible, otherwise just trust the hint */
728 if (drv->bdrv_getlength) {
729 int64_t length = drv->bdrv_getlength(bs);
730 if (length < 0) {
731 return length;
732 }
733 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
734 }
735
736 bs->total_sectors = hint;
737 return 0;
738 }
739
740 /**
741 * Set open flags for a given discard mode
742 *
743 * Return 0 on success, -1 if the discard mode was invalid.
744 */
745 int bdrv_parse_discard_flags(const char *mode, int *flags)
746 {
747 *flags &= ~BDRV_O_UNMAP;
748
749 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
750 /* do nothing */
751 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
752 *flags |= BDRV_O_UNMAP;
753 } else {
754 return -1;
755 }
756
757 return 0;
758 }
759
760 /**
761 * Set open flags for a given cache mode
762 *
763 * Return 0 on success, -1 if the cache mode was invalid.
764 */
765 int bdrv_parse_cache_flags(const char *mode, int *flags)
766 {
767 *flags &= ~BDRV_O_CACHE_MASK;
768
769 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
770 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
771 } else if (!strcmp(mode, "directsync")) {
772 *flags |= BDRV_O_NOCACHE;
773 } else if (!strcmp(mode, "writeback")) {
774 *flags |= BDRV_O_CACHE_WB;
775 } else if (!strcmp(mode, "unsafe")) {
776 *flags |= BDRV_O_CACHE_WB;
777 *flags |= BDRV_O_NO_FLUSH;
778 } else if (!strcmp(mode, "writethrough")) {
779 /* this is the default */
780 } else {
781 return -1;
782 }
783
784 return 0;
785 }
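/*
 * Example, illustrative only: combining the discard and cache mode
 * parsers above to translate command-line strings into BDRV_O_* open
 * flags, much as the -drive option handling does.
 */
#if 0 /* example only, never compiled */
static int example_parse_modes(int *flags)
{
    *flags = 0;

    if (bdrv_parse_cache_flags("none", flags) < 0) {
        return -1; /* unknown cache mode */
    }
    /* "none" set BDRV_O_NOCACHE | BDRV_O_CACHE_WB */

    if (bdrv_parse_discard_flags("unmap", flags) < 0) {
        return -1; /* unknown discard mode */
    }
    /* "unmap" added BDRV_O_UNMAP */

    return 0;
}
#endif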
786
787 /**
788 * The copy-on-read flag is actually a reference count so multiple users may
789 * use the feature without worrying about clobbering its previous state.
790 * Copy-on-read stays enabled until all users have called to disable it.
791 */
792 void bdrv_enable_copy_on_read(BlockDriverState *bs)
793 {
794 bs->copy_on_read++;
795 }
796
797 void bdrv_disable_copy_on_read(BlockDriverState *bs)
798 {
799 assert(bs->copy_on_read > 0);
800 bs->copy_on_read--;
801 }
802
803 /*
804 * Returns the flags that a temporary snapshot should get, based on the
805 * originally requested flags (the originally requested image will have flags
806 * like a backing file)
807 */
808 static int bdrv_temp_snapshot_flags(int flags)
809 {
810 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
811 }
812
813 /*
814 * Returns the flags that bs->file should get, based on the given flags for
815 * the parent BDS
816 */
817 static int bdrv_inherited_flags(int flags)
818 {
819 /* Enable protocol handling, disable format probing for bs->file */
820 flags |= BDRV_O_PROTOCOL;
821
822 /* Our block drivers take care to send flushes and respect unmap policy,
823 * so we can enable both unconditionally on lower layers. */
824 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
825
826 /* Clear flags that only apply to the top layer */
827 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
828
829 return flags;
830 }
831
832 /*
833 * Returns the flags that bs->backing_hd should get, based on the given flags
834 * for the parent BDS
835 */
836 static int bdrv_backing_flags(int flags)
837 {
838 /* backing files are always opened read-only */
839 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
840
841 /* snapshot=on is handled on the top layer */
842 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
843
844 return flags;
845 }
846
847 static int bdrv_open_flags(BlockDriverState *bs, int flags)
848 {
849 int open_flags = flags | BDRV_O_CACHE_WB;
850
851 /*
852 * Clear flags that are internal to the block layer before opening the
853 * image.
854 */
855 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
856
857 /*
858 * Snapshots should be writable.
859 */
860 if (flags & BDRV_O_TEMPORARY) {
861 open_flags |= BDRV_O_RDWR;
862 }
863
864 return open_flags;
865 }
866
867 static void bdrv_assign_node_name(BlockDriverState *bs,
868 const char *node_name,
869 Error **errp)
870 {
871 if (!node_name) {
872 return;
873 }
874
875 /* Check for empty string or invalid characters */
876 if (!id_wellformed(node_name)) {
877 error_setg(errp, "Invalid node name");
878 return;
879 }
880
881 /* takes care of avoiding namespace collisions */
882 if (blk_by_name(node_name)) {
883 error_setg(errp, "node-name=%s is conflicting with a device id",
884 node_name);
885 return;
886 }
887
888 /* takes care of avoiding duplicate node names */
889 if (bdrv_find_node(node_name)) {
890 error_setg(errp, "Duplicate node name");
891 return;
892 }
893
894 /* copy node name into the bs and insert it into the graph list */
895 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
896 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
897 }
898
899 /*
900 * Common part for opening disk images and files
901 *
902 * Removes all processed options from *options.
903 */
904 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
905 QDict *options, int flags, BlockDriver *drv, Error **errp)
906 {
907 int ret, open_flags;
908 const char *filename;
909 const char *node_name = NULL;
910 Error *local_err = NULL;
911
912 assert(drv != NULL);
913 assert(bs->file == NULL);
914 assert(options != NULL && bs->options != options);
915
916 if (file != NULL) {
917 filename = file->filename;
918 } else {
919 filename = qdict_get_try_str(options, "filename");
920 }
921
922 if (drv->bdrv_needs_filename && !filename) {
923 error_setg(errp, "The '%s' block driver requires a file name",
924 drv->format_name);
925 return -EINVAL;
926 }
927
928 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
929
930 node_name = qdict_get_try_str(options, "node-name");
931 bdrv_assign_node_name(bs, node_name, &local_err);
932 if (local_err) {
933 error_propagate(errp, local_err);
934 return -EINVAL;
935 }
936 qdict_del(options, "node-name");
937
938 /* bdrv_open() was called directly with a protocol driver as drv. That layer
939 * is already opened, so assign it to bs (while file becomes a closed
940 * BlockDriverState) and return immediately. */
941 if (file != NULL && drv->bdrv_file_open) {
942 bdrv_swap(file, bs);
943 return 0;
944 }
945
946 bs->open_flags = flags;
947 bs->guest_block_size = 512;
948 bs->request_alignment = 512;
949 bs->zero_beyond_eof = true;
950 open_flags = bdrv_open_flags(bs, flags);
951 bs->read_only = !(open_flags & BDRV_O_RDWR);
952 bs->growable = !!(flags & BDRV_O_PROTOCOL);
953
954 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
955 error_setg(errp,
956 !bs->read_only && bdrv_is_whitelisted(drv, true)
957 ? "Driver '%s' can only be used for read-only devices"
958 : "Driver '%s' is not whitelisted",
959 drv->format_name);
960 return -ENOTSUP;
961 }
962
963 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
964 if (flags & BDRV_O_COPY_ON_READ) {
965 if (!bs->read_only) {
966 bdrv_enable_copy_on_read(bs);
967 } else {
968 error_setg(errp, "Can't use copy-on-read on read-only device");
969 return -EINVAL;
970 }
971 }
972
973 if (filename != NULL) {
974 pstrcpy(bs->filename, sizeof(bs->filename), filename);
975 } else {
976 bs->filename[0] = '\0';
977 }
978 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
979
980 bs->drv = drv;
981 bs->opaque = g_malloc0(drv->instance_size);
982
983 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
984
985 /* Open the image, either directly or using a protocol */
986 if (drv->bdrv_file_open) {
987 assert(file == NULL);
988 assert(!drv->bdrv_needs_filename || filename != NULL);
989 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
990 } else {
991 if (file == NULL) {
992 error_setg(errp, "Can't use '%s' as a block driver for the "
993 "protocol level", drv->format_name);
994 ret = -EINVAL;
995 goto free_and_fail;
996 }
997 bs->file = file;
998 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
999 }
1000
1001 if (ret < 0) {
1002 if (local_err) {
1003 error_propagate(errp, local_err);
1004 } else if (bs->filename[0]) {
1005 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
1006 } else {
1007 error_setg_errno(errp, -ret, "Could not open image");
1008 }
1009 goto free_and_fail;
1010 }
1011
1012 ret = refresh_total_sectors(bs, bs->total_sectors);
1013 if (ret < 0) {
1014 error_setg_errno(errp, -ret, "Could not refresh total sector count");
1015 goto free_and_fail;
1016 }
1017
1018 bdrv_refresh_limits(bs, &local_err);
1019 if (local_err) {
1020 error_propagate(errp, local_err);
1021 ret = -EINVAL;
1022 goto free_and_fail;
1023 }
1024
1025 assert(bdrv_opt_mem_align(bs) != 0);
1026 assert((bs->request_alignment != 0) || bs->sg);
1027 return 0;
1028
1029 free_and_fail:
1030 bs->file = NULL;
1031 g_free(bs->opaque);
1032 bs->opaque = NULL;
1033 bs->drv = NULL;
1034 return ret;
1035 }
1036
1037 static QDict *parse_json_filename(const char *filename, Error **errp)
1038 {
1039 QObject *options_obj;
1040 QDict *options;
1041 int ret;
1042
1043 ret = strstart(filename, "json:", &filename);
1044 assert(ret);
1045
1046 options_obj = qobject_from_json(filename);
1047 if (!options_obj) {
1048 error_setg(errp, "Could not parse the JSON options");
1049 return NULL;
1050 }
1051
1052 if (qobject_type(options_obj) != QTYPE_QDICT) {
1053 qobject_decref(options_obj);
1054 error_setg(errp, "Invalid JSON object given");
1055 return NULL;
1056 }
1057
1058 options = qobject_to_qdict(options_obj);
1059 qdict_flatten(options);
1060
1061 return options;
1062 }
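/*
 * Example, illustrative only: what the json: pseudo-protocol accepted
 * above turns into. The function name example_json_filename() is
 * hypothetical; the quoted filename is a sketch.
 */
#if 0 /* example only, never compiled */
static void example_json_filename(Error **errp)
{
    QDict *opts = parse_json_filename(
        "json:{\"driver\": \"qcow2\","
        " \"file\": {\"driver\": \"file\","
        " \"filename\": \"/tmp/test.qcow2\"}}", errp);

    /* after qdict_flatten(), opts contains:
     *   driver        -> "qcow2"
     *   file.driver   -> "file"
     *   file.filename -> "/tmp/test.qcow2"
     */
    if (opts) {
        QDECREF(opts);
    }
}
#endif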
1063
1064 /*
1065 * Fills in default options for opening images and converts the legacy
1066 * filename/flags pair to option QDict entries.
1067 */
1068 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1069 BlockDriver *drv, Error **errp)
1070 {
1071 const char *filename = *pfilename;
1072 const char *drvname;
1073 bool protocol = flags & BDRV_O_PROTOCOL;
1074 bool parse_filename = false;
1075 Error *local_err = NULL;
1076
1077 /* Parse json: pseudo-protocol */
1078 if (filename && g_str_has_prefix(filename, "json:")) {
1079 QDict *json_options = parse_json_filename(filename, &local_err);
1080 if (local_err) {
1081 error_propagate(errp, local_err);
1082 return -EINVAL;
1083 }
1084
1085 /* Options given in the filename have lower priority than options
1086 * specified directly */
1087 qdict_join(*options, json_options, false);
1088 QDECREF(json_options);
1089 *pfilename = filename = NULL;
1090 }
1091
1092 /* Fetch the file name from the options QDict if necessary */
1093 if (protocol && filename) {
1094 if (!qdict_haskey(*options, "filename")) {
1095 qdict_put(*options, "filename", qstring_from_str(filename));
1096 parse_filename = true;
1097 } else {
1098 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1099 "the same time");
1100 return -EINVAL;
1101 }
1102 }
1103
1104 /* Find the right block driver */
1105 filename = qdict_get_try_str(*options, "filename");
1106 drvname = qdict_get_try_str(*options, "driver");
1107
1108 if (drv) {
1109 if (drvname) {
1110 error_setg(errp, "Driver specified twice");
1111 return -EINVAL;
1112 }
1113 drvname = drv->format_name;
1114 qdict_put(*options, "driver", qstring_from_str(drvname));
1115 } else {
1116 if (!drvname && protocol) {
1117 if (filename) {
1118 drv = bdrv_find_protocol(filename, parse_filename);
1119 if (!drv) {
1120 error_setg(errp, "Unknown protocol");
1121 return -EINVAL;
1122 }
1123
1124 drvname = drv->format_name;
1125 qdict_put(*options, "driver", qstring_from_str(drvname));
1126 } else {
1127 error_setg(errp, "Must specify either driver or file");
1128 return -EINVAL;
1129 }
1130 } else if (drvname) {
1131 drv = bdrv_find_format(drvname);
1132 if (!drv) {
1133 error_setg(errp, "Unknown driver '%s'", drvname);
1134 return -ENOENT;
1135 }
1136 }
1137 }
1138
1139 assert(drv || !protocol);
1140
1141 /* Driver-specific filename parsing */
1142 if (drv && drv->bdrv_parse_filename && parse_filename) {
1143 drv->bdrv_parse_filename(filename, *options, &local_err);
1144 if (local_err) {
1145 error_propagate(errp, local_err);
1146 return -EINVAL;
1147 }
1148
1149 if (!drv->bdrv_needs_filename) {
1150 qdict_del(*options, "filename");
1151 }
1152 }
1153
1154 return 0;
1155 }
1156
1157 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1158 {
1159
1160 if (bs->backing_hd) {
1161 assert(bs->backing_blocker);
1162 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1163 } else if (backing_hd) {
1164 error_setg(&bs->backing_blocker,
1165 "device is used as backing hd of '%s'",
1166 bdrv_get_device_name(bs));
1167 }
1168
1169 bs->backing_hd = backing_hd;
1170 if (!backing_hd) {
1171 error_free(bs->backing_blocker);
1172 bs->backing_blocker = NULL;
1173 goto out;
1174 }
1175 bs->open_flags &= ~BDRV_O_NO_BACKING;
1176 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1177 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1178 backing_hd->drv ? backing_hd->drv->format_name : "");
1179
1180 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1181 /* Otherwise we won't be able to commit due to the check in bdrv_commit */
1182 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1183 bs->backing_blocker);
1184 out:
1185 bdrv_refresh_limits(bs, NULL);
1186 }
1187
1188 /*
1189 * Opens the backing file for a BlockDriverState if not yet open
1190 *
1191 * options is a QDict of options to pass to the block drivers, or NULL for an
1192 * empty set of options. The reference to the QDict is transferred to this
1193 * function (even on failure), so if the caller intends to reuse the dictionary,
1194 * it needs to use QINCREF() before calling bdrv_open_backing_file.
1195 */
1196 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1197 {
1198 char *backing_filename = g_malloc0(PATH_MAX);
1199 int ret = 0;
1200 BlockDriverState *backing_hd;
1201 Error *local_err = NULL;
1202
1203 if (bs->backing_hd != NULL) {
1204 QDECREF(options);
1205 goto free_exit;
1206 }
1207
1208 /* NULL means an empty set of options */
1209 if (options == NULL) {
1210 options = qdict_new();
1211 }
1212
1213 bs->open_flags &= ~BDRV_O_NO_BACKING;
1214 if (qdict_haskey(options, "file.filename")) {
1215 backing_filename[0] = '\0';
1216 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1217 QDECREF(options);
1218 goto free_exit;
1219 } else {
1220 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1221 }
1222
1223 if (!bs->drv || !bs->drv->supports_backing) {
1224 ret = -EINVAL;
1225 error_setg(errp, "Driver doesn't support backing files");
1226 QDECREF(options);
1227 goto free_exit;
1228 }
1229
1230 backing_hd = bdrv_new();
1231
1232 if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
1233 qdict_put(options, "driver", qstring_from_str(bs->backing_format));
1234 }
1235
1236 assert(bs->backing_hd == NULL);
1237 ret = bdrv_open(&backing_hd,
1238 *backing_filename ? backing_filename : NULL, NULL, options,
1239 bdrv_backing_flags(bs->open_flags), NULL, &local_err);
1240 if (ret < 0) {
1241 bdrv_unref(backing_hd);
1242 backing_hd = NULL;
1243 bs->open_flags |= BDRV_O_NO_BACKING;
1244 error_setg(errp, "Could not open backing file: %s",
1245 error_get_pretty(local_err));
1246 error_free(local_err);
1247 goto free_exit;
1248 }
1249 bdrv_set_backing_hd(bs, backing_hd);
1250
1251 free_exit:
1252 g_free(backing_filename);
1253 return ret;
1254 }
1255
1256 /*
1257 * Opens a disk image whose options are given as BlockdevRef in another block
1258 * device's options.
1259 *
1260 * If allow_none is true, no image will be opened if filename is NULL and no
1261 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1262 *
1263 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
1264 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1265 * itself, all options starting with "${bdref_key}." are considered part of the
1266 * BlockdevRef.
1267 *
1268 * The BlockdevRef will be removed from the options QDict.
1269 *
1270 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1271 */
1272 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1273 QDict *options, const char *bdref_key, int flags,
1274 bool allow_none, Error **errp)
1275 {
1276 QDict *image_options;
1277 int ret;
1278 char *bdref_key_dot;
1279 const char *reference;
1280
1281 assert(pbs);
1282 assert(*pbs == NULL);
1283
1284 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1285 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1286 g_free(bdref_key_dot);
1287
1288 reference = qdict_get_try_str(options, bdref_key);
1289 if (!filename && !reference && !qdict_size(image_options)) {
1290 if (allow_none) {
1291 ret = 0;
1292 } else {
1293 error_setg(errp, "A block device must be specified for \"%s\"",
1294 bdref_key);
1295 ret = -EINVAL;
1296 }
1297 QDECREF(image_options);
1298 goto done;
1299 }
1300
1301 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1302
1303 done:
1304 qdict_del(options, bdref_key);
1305 return ret;
1306 }
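/*
 * Example, illustrative only: opening the "file" child of a format
 * node the way bdrv_open() does below. With bdref_key = "file", the
 * options QDict may carry either a reference to an existing node
 * ("file" = "node-name") or flattened inline options ("file.driver",
 * "file.filename", ...); both forms are consumed here.
 */
#if 0 /* example only, never compiled */
static int example_open_file_child(QDict *options, Error **errp)
{
    BlockDriverState *file = NULL; /* must be NULL, see the assertions */
    int ret;

    ret = bdrv_open_image(&file, NULL, options, "file",
                          BDRV_O_PROTOCOL, false, errp);
    if (ret < 0) {
        return ret;
    }
    /* ... use file, then release it with bdrv_unref() ... */
    bdrv_unref(file);
    return 0;
}
#endif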
1307
1308 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1309 {
1310 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1311 char *tmp_filename = g_malloc0(PATH_MAX + 1);
1312 int64_t total_size;
1313 QemuOpts *opts = NULL;
1314 QDict *snapshot_options;
1315 BlockDriverState *bs_snapshot;
1316 Error *local_err = NULL;
1317 int ret;
1318
1319 /* if snapshot, we create a temporary backing file and open it
1320 instead of opening 'filename' directly */
1321
1322 /* Get the required size from the image */
1323 total_size = bdrv_getlength(bs);
1324 if (total_size < 0) {
1325 ret = total_size;
1326 error_setg_errno(errp, -total_size, "Could not get image size");
1327 goto out;
1328 }
1329
1330 /* Create the temporary image */
1331 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1332 if (ret < 0) {
1333 error_setg_errno(errp, -ret, "Could not get temporary filename");
1334 goto out;
1335 }
1336
1337 opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
1338 &error_abort);
1339 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
1340 ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
1341 qemu_opts_del(opts);
1342 if (ret < 0) {
1343 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1344 "'%s': %s", tmp_filename,
1345 error_get_pretty(local_err));
1346 error_free(local_err);
1347 goto out;
1348 }
1349
1350 /* Prepare a new options QDict for the temporary file */
1351 snapshot_options = qdict_new();
1352 qdict_put(snapshot_options, "file.driver",
1353 qstring_from_str("file"));
1354 qdict_put(snapshot_options, "file.filename",
1355 qstring_from_str(tmp_filename));
1356
1357 bs_snapshot = bdrv_new();
1358
1359 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1360 flags, &bdrv_qcow2, &local_err);
1361 if (ret < 0) {
1362 error_propagate(errp, local_err);
1363 goto out;
1364 }
1365
1366 bdrv_append(bs_snapshot, bs);
1367
1368 out:
1369 g_free(tmp_filename);
1370 return ret;
1371 }
1372
1373 /*
1374 * Opens a disk image (raw, qcow2, vmdk, ...)
1375 *
1376 * options is a QDict of options to pass to the block drivers, or NULL for an
1377 * empty set of options. The reference to the QDict belongs to the block layer
1378 * after the call (even on failure), so if the caller intends to reuse the
1379 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1380 *
1381 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1382 * If it is not NULL, the referenced BDS will be reused.
1383 *
1384 * The reference parameter may be used to specify an existing block device which
1385 * should be opened. If specified, neither options nor a filename may be given,
1386 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1387 */
1388 int bdrv_open(BlockDriverState **pbs, const char *filename,
1389 const char *reference, QDict *options, int flags,
1390 BlockDriver *drv, Error **errp)
1391 {
1392 int ret;
1393 BlockDriverState *file = NULL, *bs;
1394 const char *drvname;
1395 Error *local_err = NULL;
1396 int snapshot_flags = 0;
1397
1398 assert(pbs);
1399
1400 if (reference) {
1401 bool options_non_empty = options ? qdict_size(options) : false;
1402 QDECREF(options);
1403
1404 if (*pbs) {
1405 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1406 "another block device");
1407 return -EINVAL;
1408 }
1409
1410 if (filename || options_non_empty) {
1411 error_setg(errp, "Cannot reference an existing block device with "
1412 "additional options or a new filename");
1413 return -EINVAL;
1414 }
1415
1416 bs = bdrv_lookup_bs(reference, reference, errp);
1417 if (!bs) {
1418 return -ENODEV;
1419 }
1420 bdrv_ref(bs);
1421 *pbs = bs;
1422 return 0;
1423 }
1424
1425 if (*pbs) {
1426 bs = *pbs;
1427 } else {
1428 bs = bdrv_new();
1429 }
1430
1431 /* NULL means an empty set of options */
1432 if (options == NULL) {
1433 options = qdict_new();
1434 }
1435
1436 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
1437 if (local_err) {
1438 goto fail;
1439 }
1440
1441 /* Find the right image format driver */
1442 drv = NULL;
1443 drvname = qdict_get_try_str(options, "driver");
1444 if (drvname) {
1445 drv = bdrv_find_format(drvname);
1446 qdict_del(options, "driver");
1447 if (!drv) {
1448 error_setg(errp, "Unknown driver: '%s'", drvname);
1449 ret = -EINVAL;
1450 goto fail;
1451 }
1452 }
1453
1454 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1455 if (drv && !drv->bdrv_file_open) {
1456 /* If the user explicitly wants a format driver here, we'll need to add
1457 * another layer for the protocol in bs->file */
1458 flags &= ~BDRV_O_PROTOCOL;
1459 }
1460
1461 bs->options = options;
1462 options = qdict_clone_shallow(options);
1463
1464 /* Open image file without format layer */
1465 if ((flags & BDRV_O_PROTOCOL) == 0) {
1466 if (flags & BDRV_O_RDWR) {
1467 flags |= BDRV_O_ALLOW_RDWR;
1468 }
1469 if (flags & BDRV_O_SNAPSHOT) {
1470 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1471 flags = bdrv_backing_flags(flags);
1472 }
1473
1474 assert(file == NULL);
1475 ret = bdrv_open_image(&file, filename, options, "file",
1476 bdrv_inherited_flags(flags),
1477 true, &local_err);
1478 if (ret < 0) {
1479 goto fail;
1480 }
1481 }
1482
1483 /* Image format probing */
1484 bs->probed = !drv;
1485 if (!drv && file) {
1486 ret = find_image_format(file, filename, &drv, &local_err);
1487 if (ret < 0) {
1488 goto fail;
1489 }
1490 } else if (!drv) {
1491 error_setg(errp, "Must specify either driver or file");
1492 ret = -EINVAL;
1493 goto fail;
1494 }
1495
1496 /* Open the image */
1497 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1498 if (ret < 0) {
1499 goto fail;
1500 }
1501
1502 if (file && (bs->file != file)) {
1503 bdrv_unref(file);
1504 file = NULL;
1505 }
1506
1507 /* If there is a backing file, use it */
1508 if ((flags & BDRV_O_NO_BACKING) == 0) {
1509 QDict *backing_options;
1510
1511 qdict_extract_subqdict(options, &backing_options, "backing.");
1512 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1513 if (ret < 0) {
1514 goto close_and_fail;
1515 }
1516 }
1517
1518 bdrv_refresh_filename(bs);
1519
1520 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1521 * temporary snapshot afterwards. */
1522 if (snapshot_flags) {
1523 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
1524 if (local_err) {
1525 goto close_and_fail;
1526 }
1527 }
1528
1529 /* Check if any unknown options were used */
1530 if (options && (qdict_size(options) != 0)) {
1531 const QDictEntry *entry = qdict_first(options);
1532 if (flags & BDRV_O_PROTOCOL) {
1533 error_setg(errp, "Block protocol '%s' doesn't support the option "
1534 "'%s'", drv->format_name, entry->key);
1535 } else {
1536 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1537 "support the option '%s'", drv->format_name,
1538 bdrv_get_device_name(bs), entry->key);
1539 }
1540
1541 ret = -EINVAL;
1542 goto close_and_fail;
1543 }
1544
1545 if (!bdrv_key_required(bs)) {
1546 if (bs->blk) {
1547 blk_dev_change_media_cb(bs->blk, true);
1548 }
1549 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1550 && !runstate_check(RUN_STATE_INMIGRATE)
1551 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1552 error_setg(errp,
1553 "Guest must be stopped for opening of encrypted image");
1554 ret = -EBUSY;
1555 goto close_and_fail;
1556 }
1557
1558 QDECREF(options);
1559 *pbs = bs;
1560 return 0;
1561
1562 fail:
1563 if (file != NULL) {
1564 bdrv_unref(file);
1565 }
1566 QDECREF(bs->options);
1567 QDECREF(options);
1568 bs->options = NULL;
1569 if (!*pbs) {
1570 /* If *pbs is NULL, a new BDS has been created in this function and
1571 needs to be freed now. Otherwise, it does not need to be closed,
1572 since it has not really been opened yet. */
1573 bdrv_unref(bs);
1574 }
1575 if (local_err) {
1576 error_propagate(errp, local_err);
1577 }
1578 return ret;
1579
1580 close_and_fail:
1581 /* See fail path, but now the BDS has to be always closed */
1582 if (*pbs) {
1583 bdrv_close(bs);
1584 } else {
1585 bdrv_unref(bs);
1586 }
1587 QDECREF(options);
1588 if (local_err) {
1589 error_propagate(errp, local_err);
1590 }
1591 return ret;
1592 }
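/*
 * Example, illustrative only: a minimal caller of bdrv_open(). The
 * reference to the options QDict is consumed by the call even on
 * failure, so no QDECREF() is needed here; this mirrors the contract
 * documented above. The function name example_open() is hypothetical.
 */
#if 0 /* example only, never compiled */
static BlockDriverState *example_open(const char *filename, Error **errp)
{
    BlockDriverState *bs = NULL; /* NULL: let bdrv_open() allocate it */
    QDict *options = qdict_new();

    qdict_put(options, "driver", qstring_from_str("qcow2"));
    if (bdrv_open(&bs, filename, NULL, options,
                  BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, errp) < 0) {
        return NULL;
    }
    return bs; /* release with bdrv_unref() */
}
#endif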
1593
1594 typedef struct BlockReopenQueueEntry {
1595 bool prepared;
1596 BDRVReopenState state;
1597 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1598 } BlockReopenQueueEntry;
1599
1600 /*
1601 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1602 * reopen of multiple devices.
1603 *
1604 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLEQ_INIT
1605 * already performed, or alternatively may be NULL, in which case a new BlockReopenQueue will
1606 * be created and initialized. This newly created BlockReopenQueue should be
1607 * passed back in for subsequent calls that are intended to be of the same
1608 * atomic 'set'.
1609 *
1610 * bs is the BlockDriverState to add to the reopen queue.
1611 *
1612 * flags contains the open flags for the associated bs
1613 *
1614 * returns a pointer to bs_queue, which is either the newly allocated
1615 * bs_queue, or the existing bs_queue being used.
1616 *
1617 */
1618 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1619 BlockDriverState *bs, int flags)
1620 {
1621 assert(bs != NULL);
1622
1623 BlockReopenQueueEntry *bs_entry;
1624 if (bs_queue == NULL) {
1625 bs_queue = g_new0(BlockReopenQueue, 1);
1626 QSIMPLEQ_INIT(bs_queue);
1627 }
1628
1629 /* bdrv_open() masks this flag out */
1630 flags &= ~BDRV_O_PROTOCOL;
1631
1632 if (bs->file) {
1633 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1634 }
1635
1636 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1637 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1638
1639 bs_entry->state.bs = bs;
1640 bs_entry->state.flags = flags;
1641
1642 return bs_queue;
1643 }
1644
1645 /*
1646 * Reopen multiple BlockDriverStates atomically & transactionally.
1647 *
1648 * The queue passed in (bs_queue) must have been built up previously
1649 * via bdrv_reopen_queue().
1650 *
1651 * Reopens all BDS specified in the queue, with the appropriate
1652 * flags. All devices are prepared for reopen, and failure of any
1653 * device will cause all device changes to be abandoned, and intermediate
1654 * data cleaned up.
1655 *
1656 * If all devices prepare successfully, then the changes are committed
1657 * to all devices.
1658 *
1659 */
1660 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1661 {
1662 int ret = -1;
1663 BlockReopenQueueEntry *bs_entry, *next;
1664 Error *local_err = NULL;
1665
1666 assert(bs_queue != NULL);
1667
1668 bdrv_drain_all();
1669
1670 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1671 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1672 error_propagate(errp, local_err);
1673 goto cleanup;
1674 }
1675 bs_entry->prepared = true;
1676 }
1677
1678 /* If we reach this point, we have success and just need to apply the
1679 * changes
1680 */
1681 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1682 bdrv_reopen_commit(&bs_entry->state);
1683 }
1684
1685 ret = 0;
1686
1687 cleanup:
1688 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1689 if (ret && bs_entry->prepared) {
1690 bdrv_reopen_abort(&bs_entry->state);
1691 }
1692 g_free(bs_entry);
1693 }
1694 g_free(bs_queue);
1695 return ret;
1696 }
1697
1698
1699 /* Reopen a single BlockDriverState with the specified flags. */
1700 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1701 {
1702 int ret = -1;
1703 Error *local_err = NULL;
1704 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1705
1706 ret = bdrv_reopen_multiple(queue, &local_err);
1707 if (local_err != NULL) {
1708 error_propagate(errp, local_err);
1709 }
1710 return ret;
1711 }
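/*
 * Example, illustrative only: reopening two devices atomically by
 * building a single queue, the multi-device pattern that bdrv_reopen()
 * above wraps for one device. bdrv_reopen_multiple() frees the queue
 * whether it succeeds or fails.
 */
#if 0 /* example only, never compiled */
static int example_reopen_pair(BlockDriverState *a, BlockDriverState *b,
                               int flags, Error **errp)
{
    BlockReopenQueue *queue;

    queue = bdrv_reopen_queue(NULL, a, flags);  /* allocates the queue */
    queue = bdrv_reopen_queue(queue, b, flags); /* appends to it */

    /* prepares every entry, then commits all or aborts all */
    return bdrv_reopen_multiple(queue, errp);
}
#endif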
1712
1713
1714 /*
1715 * Prepares a BlockDriverState for reopen. All changes are staged in the
1716 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1717 * the block driver's .bdrv_reopen_prepare() callback
1718 *
1719 * bs is the BlockDriverState to reopen
1720 * flags are the new open flags
1721 * queue is the reopen queue
1722 *
1723 * Returns 0 on success, non-zero on error. On error errp will be set
1724 * as well.
1725 *
1726 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1727 * It is the responsibility of the caller to then call the abort() or
1728 * commit() for any other BDS that have been left in a prepare() state
1729 *
1730 */
1731 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1732 Error **errp)
1733 {
1734 int ret = -1;
1735 Error *local_err = NULL;
1736 BlockDriver *drv;
1737
1738 assert(reopen_state != NULL);
1739 assert(reopen_state->bs->drv != NULL);
1740 drv = reopen_state->bs->drv;
1741
1742 /* if we are to stay read-only, do not allow permission change
1743 * to r/w */
1744 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1745 reopen_state->flags & BDRV_O_RDWR) {
1746 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1747 bdrv_get_device_name(reopen_state->bs));
1748 goto error;
1749 }
1750
1751
1752 ret = bdrv_flush(reopen_state->bs);
1753 if (ret) {
1754 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1755 strerror(-ret));
1756 goto error;
1757 }
1758
1759 if (drv->bdrv_reopen_prepare) {
1760 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1761 if (ret) {
1762 if (local_err != NULL) {
1763 error_propagate(errp, local_err);
1764 } else {
1765 error_setg(errp, "failed while preparing to reopen image '%s'",
1766 reopen_state->bs->filename);
1767 }
1768 goto error;
1769 }
1770 } else {
1771 /* It is currently mandatory to have a bdrv_reopen_prepare()
1772 * handler for each supported drv. */
1773 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1774 drv->format_name, bdrv_get_device_name(reopen_state->bs),
1775 "reopening of file");
1776 ret = -1;
1777 goto error;
1778 }
1779
1780 ret = 0;
1781
1782 error:
1783 return ret;
1784 }
1785
1786 /*
1787 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1788 * makes them final by swapping the staging BlockDriverState contents into
1789 * the active BlockDriverState contents.
1790 */
1791 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1792 {
1793 BlockDriver *drv;
1794
1795 assert(reopen_state != NULL);
1796 drv = reopen_state->bs->drv;
1797 assert(drv != NULL);
1798
1799 /* If there are any driver level actions to take */
1800 if (drv->bdrv_reopen_commit) {
1801 drv->bdrv_reopen_commit(reopen_state);
1802 }
1803
1804 /* set BDS specific flags now */
1805 reopen_state->bs->open_flags = reopen_state->flags;
1806 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1807 BDRV_O_CACHE_WB);
1808 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1809
1810 bdrv_refresh_limits(reopen_state->bs, NULL);
1811 }
1812
1813 /*
1814 * Abort the reopen, and delete and free the staged changes in
1815 * reopen_state
1816 */
1817 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1818 {
1819 BlockDriver *drv;
1820
1821 assert(reopen_state != NULL);
1822 drv = reopen_state->bs->drv;
1823 assert(drv != NULL);
1824
1825 if (drv->bdrv_reopen_abort) {
1826 drv->bdrv_reopen_abort(reopen_state);
1827 }
1828 }
1829
1830
1831 void bdrv_close(BlockDriverState *bs)
1832 {
1833 BdrvAioNotifier *ban, *ban_next;
1834
1835 if (bs->job) {
1836 block_job_cancel_sync(bs->job);
1837 }
1838 bdrv_drain_all(); /* complete I/O */
1839 bdrv_flush(bs);
1840 bdrv_drain_all(); /* in case flush left pending I/O */
1841 notifier_list_notify(&bs->close_notifiers, bs);
1842
1843 if (bs->drv) {
1844 if (bs->backing_hd) {
1845 BlockDriverState *backing_hd = bs->backing_hd;
1846 bdrv_set_backing_hd(bs, NULL);
1847 bdrv_unref(backing_hd);
1848 }
1849 bs->drv->bdrv_close(bs);
1850 g_free(bs->opaque);
1851 bs->opaque = NULL;
1852 bs->drv = NULL;
1853 bs->copy_on_read = 0;
1854 bs->backing_file[0] = '\0';
1855 bs->backing_format[0] = '\0';
1856 bs->total_sectors = 0;
1857 bs->encrypted = 0;
1858 bs->valid_key = 0;
1859 bs->sg = 0;
1860 bs->growable = 0;
1861 bs->zero_beyond_eof = false;
1862 QDECREF(bs->options);
1863 bs->options = NULL;
1864 QDECREF(bs->full_open_options);
1865 bs->full_open_options = NULL;
1866
1867 if (bs->file != NULL) {
1868 bdrv_unref(bs->file);
1869 bs->file = NULL;
1870 }
1871 }
1872
1873 if (bs->blk) {
1874 blk_dev_change_media_cb(bs->blk, false);
1875 }
1876
1877 /* throttling disk I/O limits */
1878 if (bs->io_limits_enabled) {
1879 bdrv_io_limits_disable(bs);
1880 }
1881
1882 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1883 g_free(ban);
1884 }
1885 QLIST_INIT(&bs->aio_notifiers);
1886 }
1887
1888 void bdrv_close_all(void)
1889 {
1890 BlockDriverState *bs;
1891
1892 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1893 AioContext *aio_context = bdrv_get_aio_context(bs);
1894
1895 aio_context_acquire(aio_context);
1896 bdrv_close(bs);
1897 aio_context_release(aio_context);
1898 }
1899 }
1900
1901 /* Check if any requests are in-flight (including throttled requests) */
1902 static bool bdrv_requests_pending(BlockDriverState *bs)
1903 {
1904 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1905 return true;
1906 }
1907 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1908 return true;
1909 }
1910 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1911 return true;
1912 }
1913 if (bs->file && bdrv_requests_pending(bs->file)) {
1914 return true;
1915 }
1916 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1917 return true;
1918 }
1919 return false;
1920 }
1921
1922 static bool bdrv_drain_one(BlockDriverState *bs)
1923 {
1924 bool bs_busy;
1925
1926 bdrv_flush_io_queue(bs);
1927 bdrv_start_throttled_reqs(bs);
1928 bs_busy = bdrv_requests_pending(bs);
1929 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1930 return bs_busy;
1931 }
1932
1933 /*
1934 * Wait for pending requests to complete on a single BlockDriverState subtree
1935 *
1936 * See the warning in bdrv_drain_all(). This function can only be called if
1937 * you are sure nothing can generate I/O because you have op blockers
1938 * installed.
1939 *
1940 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
1941 * AioContext.
1942 */
1943 void bdrv_drain(BlockDriverState *bs)
1944 {
1945 while (bdrv_drain_one(bs)) {
1946 /* Keep iterating */
1947 }
1948 }
1949
1950 /*
1951 * Wait for pending requests to complete across all BlockDriverStates
1952 *
1953 * This function does not flush data to disk, use bdrv_flush_all() for that
1954 * after calling this function.
1955 *
1956 * Note that completion of an asynchronous I/O operation can trigger any
1957 * number of other I/O operations on other devices---for example a coroutine
1958 * can be arbitrarily complex and a constant flow of I/O can come until the
1959 * coroutine is complete. Because of this, it is not possible to have a
1960 * function to drain a single device's I/O queue.
1961 */
1962 void bdrv_drain_all(void)
1963 {
1964 /* Always run first iteration so any pending completion BHs run */
1965 bool busy = true;
1966 BlockDriverState *bs;
1967
1968 while (busy) {
1969 busy = false;
1970
1971 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1972 AioContext *aio_context = bdrv_get_aio_context(bs);
1973
1974 aio_context_acquire(aio_context);
1975 busy |= bdrv_drain_one(bs);
1976 aio_context_release(aio_context);
1977 }
1978 }
1979 }
1980
1981 /* make a BlockDriverState anonymous by removing it from the bdrv_states and
1982 * graph_bdrv_states lists.
1983 Also, clear the node_name to prevent double removal */
1984 void bdrv_make_anon(BlockDriverState *bs)
1985 {
1986 /*
1987 * Take care to remove bs from bdrv_states only when it's actually
1988 * in it. Note that bs->device_list.tqe_prev is initially null,
1989 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
1990 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1991 * resetting it to null on remove.
1992 */
1993 if (bs->device_list.tqe_prev) {
1994 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1995 bs->device_list.tqe_prev = NULL;
1996 }
1997 if (bs->node_name[0] != '\0') {
1998 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1999 }
2000 bs->node_name[0] = '\0';
2001 }
2002
2003 static void bdrv_rebind(BlockDriverState *bs)
2004 {
2005 if (bs->drv && bs->drv->bdrv_rebind) {
2006 bs->drv->bdrv_rebind(bs);
2007 }
2008 }
2009
2010 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2011 BlockDriverState *bs_src)
2012 {
2013 /* move some fields that need to stay attached to the device */
2014
2015 /* dev info */
2016 bs_dest->guest_block_size = bs_src->guest_block_size;
2017 bs_dest->copy_on_read = bs_src->copy_on_read;
2018
2019 bs_dest->enable_write_cache = bs_src->enable_write_cache;
2020
2021 /* i/o throttled req */
2022 memcpy(&bs_dest->throttle_state,
2023 &bs_src->throttle_state,
2024 sizeof(ThrottleState));
2025 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2026 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
2027 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
2028
2029 /* r/w error */
2030 bs_dest->on_read_error = bs_src->on_read_error;
2031 bs_dest->on_write_error = bs_src->on_write_error;
2032
2033 /* i/o status */
2034 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2035 bs_dest->iostatus = bs_src->iostatus;
2036
2037 /* dirty bitmap */
2038 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
2039
2040 /* reference count */
2041 bs_dest->refcnt = bs_src->refcnt;
2042
2043 /* job */
2044 bs_dest->job = bs_src->job;
2045
2046 /* keep the same entry in bdrv_states */
2047 bs_dest->device_list = bs_src->device_list;
2048 bs_dest->blk = bs_src->blk;
2049
2050 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2051 sizeof(bs_dest->op_blockers));
2052 }
2053
2054 /*
2055 * Swap the contents of two live BlockDriverStates (image chains),
2056 * keeping the required fields on the BlockDriverState that is
2057 * actually attached to a device.
2058 *
2059 * This will modify the BlockDriverState fields, and swap contents
2060 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2061 *
2062 * bs_new must not be attached to a BlockBackend.
2063 *
2064 * This function does not create any image files.
2065 */
2066 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2067 {
2068 BlockDriverState tmp;
2069
2070 /* The code needs to swap the node_name but simply swapping node_list won't
2071 * work so first remove the nodes from the graph list, do the swap then
2072 * insert them back if needed.
2073 */
2074 if (bs_new->node_name[0] != '\0') {
2075 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2076 }
2077 if (bs_old->node_name[0] != '\0') {
2078 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2079 }
2080
2081 /* bs_new must be unattached and shouldn't have anything fancy enabled */
2082 assert(!bs_new->blk);
2083 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2084 assert(bs_new->job == NULL);
2085 assert(bs_new->io_limits_enabled == false);
2086 assert(!throttle_have_timer(&bs_new->throttle_state));
2087
2088 tmp = *bs_new;
2089 *bs_new = *bs_old;
2090 *bs_old = tmp;
2091
2092 /* there are some fields that should not be swapped, move them back */
2093 bdrv_move_feature_fields(&tmp, bs_old);
2094 bdrv_move_feature_fields(bs_old, bs_new);
2095 bdrv_move_feature_fields(bs_new, &tmp);
2096
2097 /* bs_new must remain unattached */
2098 assert(!bs_new->blk);
2099
2100 /* Check a few fields that should remain attached to the device */
2101 assert(bs_new->job == NULL);
2102 assert(bs_new->io_limits_enabled == false);
2103 assert(!throttle_have_timer(&bs_new->throttle_state));
2104
2105 /* insert the nodes back into the graph node list if needed */
2106 if (bs_new->node_name[0] != '\0') {
2107 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2108 }
2109 if (bs_old->node_name[0] != '\0') {
2110 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2111 }
2112
2113 bdrv_rebind(bs_new);
2114 bdrv_rebind(bs_old);
2115 }
2116
2117 /*
2118 * Add new bs contents at the top of an image chain while the chain is
2119 * live, while keeping required fields on the top layer.
2120 *
2121 * This will modify the BlockDriverState fields, and swap contents
2122 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2123 *
2124 * bs_new must not be attached to a BlockBackend.
2125 *
2126 * This function does not create any image files.
2127 */
2128 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2129 {
2130 bdrv_swap(bs_new, bs_top);
2131
2132 /* After the swap, bs_new holds the old top layer's contents, so
2133 * make it the backing file of bs_top. */
2134 bdrv_set_backing_hd(bs_top, bs_new);
2135 }
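/*
 * Sketch of the intended use: installing a freshly opened, empty overlay
 * on top of the currently active image (e.g. for a live snapshot). The
 * variable names are illustrative only:
 *
 *     BlockDriverState *overlay = ...;   // new empty image, not attached
 *                                        // to any BlockBackend
 *     bdrv_append(overlay, active_bs);
 *     // active_bs now holds the overlay, with the old active image
 *     // as its backing file
 */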
2136
2137 static void bdrv_delete(BlockDriverState *bs)
2138 {
2139 assert(!bs->job);
2140 assert(bdrv_op_blocker_is_empty(bs));
2141 assert(!bs->refcnt);
2142 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2143
2144 bdrv_close(bs);
2145
2146 /* remove from list, if necessary */
2147 bdrv_make_anon(bs);
2148
2149 g_free(bs);
2150 }
2151
2152 /*
2153 * Run consistency checks on an image
2154 *
2155 * Returns 0 if the check could be completed (it doesn't mean that the image is
2156 * free of errors) or -errno when an internal error occurred. The results of the
2157 * check are stored in res.
2158 */
2159 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2160 {
2161 if (bs->drv == NULL) {
2162 return -ENOMEDIUM;
2163 }
2164 if (bs->drv->bdrv_check == NULL) {
2165 return -ENOTSUP;
2166 }
2167
2168 memset(res, 0, sizeof(*res));
2169 return bs->drv->bdrv_check(bs, res, fix);
2170 }
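/*
 * Example caller (sketch): run a report-only consistency check. The
 * BdrvCheckResult counters used here (corruptions, leaks) are assumed
 * from the check API:
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, 0);   // fix == 0: report, don't repair
 *     if (ret < 0) {
 *         // internal error, the check itself could not run
 *     } else if (res.corruptions || res.leaks) {
 *         // the check completed and found damage or leaked clusters
 *     }
 */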
2171
2172 #define COMMIT_BUF_SECTORS 2048
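/* i.e. 2048 sectors of 512 bytes = 1 MiB copied per iteration of the commit loop */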
2173
2174 /* commit COW file into the raw image */
2175 int bdrv_commit(BlockDriverState *bs)
2176 {
2177 BlockDriver *drv = bs->drv;
2178 int64_t sector, total_sectors, length, backing_length;
2179 int n, ro, open_flags;
2180 int ret = 0;
2181 uint8_t *buf = NULL;
2182 char filename[PATH_MAX];
2183
2184 if (!drv)
2185 return -ENOMEDIUM;
2186
2187 if (!bs->backing_hd) {
2188 return -ENOTSUP;
2189 }
2190
2191 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2192 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2193 return -EBUSY;
2194 }
2195
2196 ro = bs->backing_hd->read_only;
2197 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2198 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2199 open_flags = bs->backing_hd->open_flags;
2200
2201 if (ro) {
2202 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2203 return -EACCES;
2204 }
2205 }
2206
2207 length = bdrv_getlength(bs);
2208 if (length < 0) {
2209 ret = length;
2210 goto ro_cleanup;
2211 }
2212
2213 backing_length = bdrv_getlength(bs->backing_hd);
2214 if (backing_length < 0) {
2215 ret = backing_length;
2216 goto ro_cleanup;
2217 }
2218
2219 /* If our top snapshot is larger than the backing file image,
2220 * grow the backing file image if possible. If not possible,
2221 * we must return an error */
2222 if (length > backing_length) {
2223 ret = bdrv_truncate(bs->backing_hd, length);
2224 if (ret < 0) {
2225 goto ro_cleanup;
2226 }
2227 }
2228
2229 total_sectors = length >> BDRV_SECTOR_BITS;
2230
2231 /* qemu_try_blockalign() for bs will choose an alignment that works for
2232 * bs->backing_hd as well, so no need to compare the alignment manually. */
2233 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2234 if (buf == NULL) {
2235 ret = -ENOMEM;
2236 goto ro_cleanup;
2237 }
2238
2239 for (sector = 0; sector < total_sectors; sector += n) {
2240 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2241 if (ret < 0) {
2242 goto ro_cleanup;
2243 }
2244 if (ret) {
2245 ret = bdrv_read(bs, sector, buf, n);
2246 if (ret < 0) {
2247 goto ro_cleanup;
2248 }
2249
2250 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2251 if (ret < 0) {
2252 goto ro_cleanup;
2253 }
2254 }
2255 }
2256
2257 if (drv->bdrv_make_empty) {
2258 ret = drv->bdrv_make_empty(bs);
2259 if (ret < 0) {
2260 goto ro_cleanup;
2261 }
2262 bdrv_flush(bs);
2263 }
2264
2265 /*
2266 * Make sure all data we wrote to the backing device is actually
2267 * stable on disk.
2268 */
2269 if (bs->backing_hd) {
2270 bdrv_flush(bs->backing_hd);
2271 }
2272
2273 ret = 0;
2274 ro_cleanup:
2275 qemu_vfree(buf);
2276
2277 if (ro) {
2278 /* ignoring error return here */
2279 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2280 }
2281
2282 return ret;
2283 }
2284
2285 int bdrv_commit_all(void)
2286 {
2287 BlockDriverState *bs;
2288
2289 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2290 AioContext *aio_context = bdrv_get_aio_context(bs);
2291
2292 aio_context_acquire(aio_context);
2293 if (bs->drv && bs->backing_hd) {
2294 int ret = bdrv_commit(bs);
2295 if (ret < 0) {
2296 aio_context_release(aio_context);
2297 return ret;
2298 }
2299 }
2300 aio_context_release(aio_context);
2301 }
2302 return 0;
2303 }
2304
2305 /**
2306 * Remove an active request from the tracked requests list
2307 *
2308 * This function should be called when a tracked request is completing.
2309 */
2310 static void tracked_request_end(BdrvTrackedRequest *req)
2311 {
2312 if (req->serialising) {
2313 req->bs->serialising_in_flight--;
2314 }
2315
2316 QLIST_REMOVE(req, list);
2317 qemu_co_queue_restart_all(&req->wait_queue);
2318 }
2319
2320 /**
2321 * Add an active request to the tracked requests list
2322 */
2323 static void tracked_request_begin(BdrvTrackedRequest *req,
2324 BlockDriverState *bs,
2325 int64_t offset,
2326 unsigned int bytes, bool is_write)
2327 {
2328 *req = (BdrvTrackedRequest){
2329 .bs = bs,
2330 .offset = offset,
2331 .bytes = bytes,
2332 .is_write = is_write,
2333 .co = qemu_coroutine_self(),
2334 .serialising = false,
2335 .overlap_offset = offset,
2336 .overlap_bytes = bytes,
2337 };
2338
2339 qemu_co_queue_init(&req->wait_queue);
2340
2341 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2342 }
2343
2344 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2345 {
2346 int64_t overlap_offset = req->offset & ~(align - 1);
2347 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2348 - overlap_offset;
2349
2350 if (!req->serialising) {
2351 req->bs->serialising_in_flight++;
2352 req->serialising = true;
2353 }
2354
2355 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2356 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2357 }
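/*
 * Worked example of the rounding above: for offset == 1536, bytes == 1024
 * and align == 4096, overlap_offset becomes 0 and overlap_bytes becomes
 * ROUND_UP(2560, 4096) == 4096, so the request conflicts with anything
 * touching the first 4 KiB.
 */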
2358
2359 /**
2360 * Round a region to cluster boundaries
2361 */
2362 void bdrv_round_to_clusters(BlockDriverState *bs,
2363 int64_t sector_num, int nb_sectors,
2364 int64_t *cluster_sector_num,
2365 int *cluster_nb_sectors)
2366 {
2367 BlockDriverInfo bdi;
2368
2369 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2370 *cluster_sector_num = sector_num;
2371 *cluster_nb_sectors = nb_sectors;
2372 } else {
2373 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2374 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2375 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2376 nb_sectors, c);
2377 }
2378 }
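/*
 * Worked example: with a 64 KiB cluster size (c == 128 sectors), a request
 * at sector_num == 100 for nb_sectors == 50 is widened to
 * cluster_sector_num == 0 and cluster_nb_sectors == 256, i.e. both
 * clusters touched by the original request.
 */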
2379
2380 static int bdrv_get_cluster_size(BlockDriverState *bs)
2381 {
2382 BlockDriverInfo bdi;
2383 int ret;
2384
2385 ret = bdrv_get_info(bs, &bdi);
2386 if (ret < 0 || bdi.cluster_size == 0) {
2387 return bs->request_alignment;
2388 } else {
2389 return bdi.cluster_size;
2390 }
2391 }
2392
2393 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2394 int64_t offset, unsigned int bytes)
2395 {
2396 /* aaaa bbbb */
2397 if (offset >= req->overlap_offset + req->overlap_bytes) {
2398 return false;
2399 }
2400 /* bbbb aaaa */
2401 if (req->overlap_offset >= offset + bytes) {
2402 return false;
2403 }
2404 return true;
2405 }
2406
2407 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2408 {
2409 BlockDriverState *bs = self->bs;
2410 BdrvTrackedRequest *req;
2411 bool retry;
2412 bool waited = false;
2413
2414 if (!bs->serialising_in_flight) {
2415 return false;
2416 }
2417
2418 do {
2419 retry = false;
2420 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2421 if (req == self || (!req->serialising && !self->serialising)) {
2422 continue;
2423 }
2424 if (tracked_request_overlaps(req, self->overlap_offset,
2425 self->overlap_bytes))
2426 {
2427 /* Hitting this means there was a reentrant request, for
2428 * example, a block driver issuing nested requests. This must
2429 * never happen since it means deadlock.
2430 */
2431 assert(qemu_coroutine_self() != req->co);
2432
2433 /* If the request is already (indirectly) waiting for us, or
2434 * will wait for us as soon as it wakes up, then just go on
2435 * (instead of producing a deadlock in the former case). */
2436 if (!req->waiting_for) {
2437 self->waiting_for = req;
2438 qemu_co_queue_wait(&req->wait_queue);
2439 self->waiting_for = NULL;
2440 retry = true;
2441 waited = true;
2442 break;
2443 }
2444 }
2445 }
2446 } while (retry);
2447
2448 return waited;
2449 }
2450
2451 /*
2452 * Return values:
2453 * 0 - success
2454 * -EINVAL - backing format specified, but no file
2455 * -ENOSPC - can't update the backing file because no space is left in the
2456 * image file header
2457 * -ENOTSUP - format driver doesn't support changing the backing file
2458 */
2459 int bdrv_change_backing_file(BlockDriverState *bs,
2460 const char *backing_file, const char *backing_fmt)
2461 {
2462 BlockDriver *drv = bs->drv;
2463 int ret;
2464
2465 /* Backing file format doesn't make sense without a backing file */
2466 if (backing_fmt && !backing_file) {
2467 return -EINVAL;
2468 }
2469
2470 if (drv->bdrv_change_backing_file != NULL) {
2471 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2472 } else {
2473 ret = -ENOTSUP;
2474 }
2475
2476 if (ret == 0) {
2477 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2478 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2479 }
2480 return ret;
2481 }
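/*
 * Example (sketch; the file name and format are illustrative): repoint the
 * image at a new backing file and record its format, or clear the backing
 * file entirely by passing NULL for both arguments (assuming the format
 * driver supports the update at all, else -ENOTSUP):
 *
 *     ret = bdrv_change_backing_file(bs, "base.qcow2", "qcow2");
 *     ret = bdrv_change_backing_file(bs, NULL, NULL);   // standalone image
 */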
2482
2483 /*
2484 * Finds the image layer in the chain that has 'bs' as its backing file.
2485 *
2486 * active is the current topmost image.
2487 *
2488 * Returns NULL if bs is not found in active's image chain,
2489 * or if active == bs.
2490 *
2491 * Returns the bottommost base image if bs == NULL.
2492 */
2493 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2494 BlockDriverState *bs)
2495 {
2496 while (active && bs != active->backing_hd) {
2497 active = active->backing_hd;
2498 }
2499
2500 return active;
2501 }
2502
2503 /* Given a BDS, searches for the base layer. */
2504 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2505 {
2506 return bdrv_find_overlay(bs, NULL);
2507 }
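/*
 * Example: in the chain  base <- mid <- top  (with 'top' the active
 * layer), bdrv_find_overlay(top, base) returns mid,
 * bdrv_find_overlay(top, mid) returns top, and bdrv_find_base(top)
 * returns base.
 */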
2508
2509 typedef struct BlkIntermediateStates {
2510 BlockDriverState *bs;
2511 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2512 } BlkIntermediateStates;
2513
2514
2515 /*
2516 * Drops images above 'base' up to and including 'top', and sets the image
2517 * above 'top' to have base as its backing file.
2518 *
2519 * Requires that the overlay to 'top' is opened r/w, so that its backing
2520 * file information can be properly updated.
2521 *
2522 * E.g., this will convert the following chain:
2523 * bottom <- base <- intermediate <- top <- active
2524 *
2525 * to
2526 *
2527 * bottom <- base <- active
2528 *
2529 * It is allowed for bottom==base, in which case it converts:
2530 *
2531 * base <- intermediate <- top <- active
2532 *
2533 * to
2534 *
2535 * base <- active
2536 *
2537 * If backing_file_str is non-NULL, it will be used when modifying top's
2538 * overlay image metadata.
2539 *
2540 * Error conditions:
2541 * if active == top, that is considered an error
2542 *
2543 */
2544 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2545 BlockDriverState *base, const char *backing_file_str)
2546 {
2547 BlockDriverState *intermediate;
2548 BlockDriverState *base_bs = NULL;
2549 BlockDriverState *new_top_bs = NULL;
2550 BlkIntermediateStates *intermediate_state, *next;
2551 int ret = -EIO;
2552
2553 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2554 QSIMPLEQ_INIT(&states_to_delete);
2555
2556 if (!top->drv || !base->drv) {
2557 goto exit;
2558 }
2559
2560 new_top_bs = bdrv_find_overlay(active, top);
2561
2562 if (new_top_bs == NULL) {
2563 /* we could not find the image above 'top', this is an error */
2564 goto exit;
2565 }
2566
2567 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2568 * to do, no intermediate images */
2569 if (new_top_bs->backing_hd == base) {
2570 ret = 0;
2571 goto exit;
2572 }
2573
2574 intermediate = top;
2575
2576 /* now we will go down through the list, and add each BDS we find
2577 * into our deletion queue, until we hit the 'base'
2578 */
2579 while (intermediate) {
2580 intermediate_state = g_new0(BlkIntermediateStates, 1);
2581 intermediate_state->bs = intermediate;
2582 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2583
2584 if (intermediate->backing_hd == base) {
2585 base_bs = intermediate->backing_hd;
2586 break;
2587 }
2588 intermediate = intermediate->backing_hd;
2589 }
2590 if (base_bs == NULL) {
2591 /* something went wrong, we did not end at the base. safely
2592 * unravel everything, and exit with error */
2593 goto exit;
2594 }
2595
2596 /* success - we can delete the intermediate states, and link top->base */
2597 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2598 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2599 base_bs->drv ? base_bs->drv->format_name : "");
2600 if (ret) {
2601 goto exit;
2602 }
2603 bdrv_set_backing_hd(new_top_bs, base_bs);
2604
2605 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2606 /* so that bdrv_close() does not recursively close the chain */
2607 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2608 bdrv_unref(intermediate_state->bs);
2609 }
2610 ret = 0;
2611
2612 exit:
2613 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2614 g_free(intermediate_state);
2615 }
2616 return ret;
2617 }
2618
2619
2620 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2621 size_t size)
2622 {
2623 int64_t len;
2624
2625 if (size > INT_MAX) {
2626 return -EIO;
2627 }
2628
2629 if (!bdrv_is_inserted(bs))
2630 return -ENOMEDIUM;
2631
2632 if (bs->growable)
2633 return 0;
2634
2635 len = bdrv_getlength(bs);
2636
2637 if (offset < 0)
2638 return -EIO;
2639
2640 if ((offset > len) || (len - offset < size))
2641 return -EIO;
2642
2643 return 0;
2644 }
2645
2646 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2647 int nb_sectors)
2648 {
2649 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2650 return -EIO;
2651 }
2652
2653 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2654 nb_sectors * BDRV_SECTOR_SIZE);
2655 }
2656
2657 typedef struct RwCo {
2658 BlockDriverState *bs;
2659 int64_t offset;
2660 QEMUIOVector *qiov;
2661 bool is_write;
2662 int ret;
2663 BdrvRequestFlags flags;
2664 } RwCo;
2665
2666 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2667 {
2668 RwCo *rwco = opaque;
2669
2670 if (!rwco->is_write) {
2671 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2672 rwco->qiov->size, rwco->qiov,
2673 rwco->flags);
2674 } else {
2675 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2676 rwco->qiov->size, rwco->qiov,
2677 rwco->flags);
2678 }
2679 }
2680
2681 /*
2682 * Process a vectored synchronous request using coroutines
2683 */
2684 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2685 QEMUIOVector *qiov, bool is_write,
2686 BdrvRequestFlags flags)
2687 {
2688 Coroutine *co;
2689 RwCo rwco = {
2690 .bs = bs,
2691 .offset = offset,
2692 .qiov = qiov,
2693 .is_write = is_write,
2694 .ret = NOT_DONE,
2695 .flags = flags,
2696 };
2697
2698 /**
2699 * In a synchronous call context the vcpu is blocked, so the throttling
2700 * timer will not fire; I/O throttling therefore has to be disabled here
2701 * if it has been enabled.
2702 */
2703 if (bs->io_limits_enabled) {
2704 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2705 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2706 bdrv_io_limits_disable(bs);
2707 }
2708
2709 if (qemu_in_coroutine()) {
2710 /* Fast-path if already in coroutine context */
2711 bdrv_rw_co_entry(&rwco);
2712 } else {
2713 AioContext *aio_context = bdrv_get_aio_context(bs);
2714
2715 co = qemu_coroutine_create(bdrv_rw_co_entry);
2716 qemu_coroutine_enter(co, &rwco);
2717 while (rwco.ret == NOT_DONE) {
2718 aio_poll(aio_context, true);
2719 }
2720 }
2721 return rwco.ret;
2722 }
2723
2724 /*
2725 * Process a synchronous request using coroutines
2726 */
2727 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2728 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2729 {
2730 QEMUIOVector qiov;
2731 struct iovec iov = {
2732 .iov_base = (void *)buf,
2733 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2734 };
2735
2736 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2737 return -EINVAL;
2738 }
2739
2740 qemu_iovec_init_external(&qiov, &iov, 1);
2741 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2742 &qiov, is_write, flags);
2743 }
2744
2745 /* return < 0 if error. See bdrv_write() for the return codes */
2746 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2747 uint8_t *buf, int nb_sectors)
2748 {
2749 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2750 }
2751
2752 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2753 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2754 uint8_t *buf, int nb_sectors)
2755 {
2756 bool enabled;
2757 int ret;
2758
2759 enabled = bs->io_limits_enabled;
2760 bs->io_limits_enabled = false;
2761 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2762 bs->io_limits_enabled = enabled;
2763 return ret;
2764 }
2765
2766 /* Return < 0 if error. Important errors are:
2767 -EIO generic I/O error (may happen for all errors)
2768 -ENOMEDIUM No media inserted.
2769 -EINVAL Invalid sector number or nb_sectors
2770 -EACCES Trying to write a read-only device
2771 */
2772 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2773 const uint8_t *buf, int nb_sectors)
2774 {
2775 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2776 }
2777
2778 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2779 int nb_sectors, BdrvRequestFlags flags)
2780 {
2781 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2782 BDRV_REQ_ZERO_WRITE | flags);
2783 }
2784
2785 /*
2786 * Completely zero out a block device with the help of bdrv_write_zeroes.
2787 * The operation is sped up by checking the block status and writing zeroes
2788 * only to those ranges that do not already read back as zeroes. Optional
2789 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2790 *
2791 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2792 */
2793 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2794 {
2795 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2796 int n;
2797
2798 target_sectors = bdrv_nb_sectors(bs);
2799 if (target_sectors < 0) {
2800 return target_sectors;
2801 }
2802
2803 for (;;) {
2804 nb_sectors = target_sectors - sector_num;
2805 if (nb_sectors <= 0) {
2806 return 0;
2807 }
2808 if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2809 nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2810 }
2811 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2812 if (ret < 0) {
2813 error_report("error getting block status at sector %" PRId64 ": %s",
2814 sector_num, strerror(-ret));
2815 return ret;
2816 }
2817 if (ret & BDRV_BLOCK_ZERO) {
2818 sector_num += n;
2819 continue;
2820 }
2821 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2822 if (ret < 0) {
2823 error_report("error writing zeroes at sector %" PRId64 ": %s",
2824 sector_num, strerror(-ret));
2825 return ret;
2826 }
2827 sector_num += n;
2828 }
2829 }
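/*
 * Example caller (sketch): zero an entire image, allowing the driver to
 * unmap/discard ranges instead of writing literal zeroes where it can:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         // the failing sector has already been logged via error_report()
 *     }
 */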
2830
2831 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2832 {
2833 QEMUIOVector qiov;
2834 struct iovec iov = {
2835 .iov_base = (void *)buf,
2836 .iov_len = bytes,
2837 };
2838 int ret;
2839
2840 if (bytes < 0) {
2841 return -EINVAL;
2842 }
2843
2844 qemu_iovec_init_external(&qiov, &iov, 1);
2845 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2846 if (ret < 0) {
2847 return ret;
2848 }
2849
2850 return bytes;
2851 }
2852
2853 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2854 {
2855 int ret;
2856
2857 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2858 if (ret < 0) {
2859 return ret;
2860 }
2861
2862 return qiov->size;
2863 }
2864
2865 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2866 const void *buf, int bytes)
2867 {
2868 QEMUIOVector qiov;
2869 struct iovec iov = {
2870 .iov_base = (void *) buf,
2871 .iov_len = bytes,
2872 };
2873
2874 if (bytes < 0) {
2875 return -EINVAL;
2876 }
2877
2878 qemu_iovec_init_external(&qiov, &iov, 1);
2879 return bdrv_pwritev(bs, offset, &qiov);
2880 }
2881
2882 /*
2883 * Writes to the file and ensures that no writes are reordered across this
2884 * request (acts as a barrier)
2885 *
2886 * Returns 0 on success, -errno in error cases.
2887 */
2888 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2889 const void *buf, int count)
2890 {
2891 int ret;
2892
2893 ret = bdrv_pwrite(bs, offset, buf, count);
2894 if (ret < 0) {
2895 return ret;
2896 }
2897
2898 /* No flush needed for cache modes that already do it */
2899 if (bs->enable_write_cache) {
2900 bdrv_flush(bs);
2901 }
2902
2903 return 0;
2904 }
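/*
 * Example (sketch; 'header' and its layout are hypothetical): format
 * drivers use this barrier write when updating on-disk metadata that
 * later writes depend on:
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;   // do not continue on top of a stale header
 *     }
 */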
2905
2906 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2907 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2908 {
2909 /* Perform I/O through a temporary buffer so that users who scribble over
2910 * their read buffer while the operation is in progress do not end up
2911 * modifying the image file. This is critical for zero-copy guest I/O
2912 * where anything might happen inside guest memory.
2913 */
2914 void *bounce_buffer;
2915
2916 BlockDriver *drv = bs->drv;
2917 struct iovec iov;
2918 QEMUIOVector bounce_qiov;
2919 int64_t cluster_sector_num;
2920 int cluster_nb_sectors;
2921 size_t skip_bytes;
2922 int ret;
2923
2924 /* Cover the entire cluster so no additional backing file I/O is required
2925 * when allocating the cluster in the image file.
2926 */
2927 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2928 &cluster_sector_num, &cluster_nb_sectors);
2929
2930 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2931 cluster_sector_num, cluster_nb_sectors);
2932
2933 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2934 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2935 if (bounce_buffer == NULL) {
2936 ret = -ENOMEM;
2937 goto err;
2938 }
2939
2940 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2941
2942 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2943 &bounce_qiov);
2944 if (ret < 0) {
2945 goto err;
2946 }
2947
2948 if (drv->bdrv_co_write_zeroes &&
2949 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2950 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2951 cluster_nb_sectors, 0);
2952 } else {
2953 /* This does not change the data on the disk, it is not necessary
2954 * to flush even in cache=writethrough mode.
2955 */
2956 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2957 &bounce_qiov);
2958 }
2959
2960 if (ret < 0) {
2961 /* It might be okay to ignore write errors for guest requests. If this
2962 * is a deliberate copy-on-read then we don't want to ignore the error.
2963 * Simply report it in all cases.
2964 */
2965 goto err;
2966 }
2967
2968 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2969 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2970 nb_sectors * BDRV_SECTOR_SIZE);
2971
2972 err:
2973 qemu_vfree(bounce_buffer);
2974 return ret;
2975 }
2976
2977 /*
2978 * Forwards an already correctly aligned request to the BlockDriver. This
2979 * handles copy on read and zeroing after EOF; any other features must be
2980 * implemented by the caller.
2981 */
2982 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2983 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2984 int64_t align, QEMUIOVector *qiov, int flags)
2985 {
2986 BlockDriver *drv = bs->drv;
2987 int ret;
2988
2989 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2990 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2991
2992 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2993 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2994 assert(!qiov || bytes == qiov->size);
2995
2996 /* Handle Copy on Read and associated serialisation */
2997 if (flags & BDRV_REQ_COPY_ON_READ) {
2998 /* If we touch the same cluster it counts as an overlap. This
2999 * guarantees that allocating writes will be serialized and not race
3000 * with each other for the same cluster. For example, in copy-on-read
3001 * it ensures that the CoR read and write operations are atomic and
3002 * guest writes cannot interleave between them. */
3003 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3004 }
3005
3006 wait_serialising_requests(req);
3007
3008 if (flags & BDRV_REQ_COPY_ON_READ) {
3009 int pnum;
3010
3011 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3012 if (ret < 0) {
3013 goto out;
3014 }
3015
3016 if (!ret || pnum != nb_sectors) {
3017 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3018 goto out;
3019 }
3020 }
3021
3022 /* Forward the request to the BlockDriver */
3023 if (!(bs->zero_beyond_eof && bs->growable)) {
3024 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3025 } else {
3026 /* Read zeroes after EOF of growable BDSes */
3027 int64_t total_sectors, max_nb_sectors;
3028
3029 total_sectors = bdrv_nb_sectors(bs);
3030 if (total_sectors < 0) {
3031 ret = total_sectors;
3032 goto out;
3033 }
3034
3035 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3036 align >> BDRV_SECTOR_BITS);
3037 if (nb_sectors < max_nb_sectors) {
3038 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3039 } else if (max_nb_sectors > 0) {
3040 QEMUIOVector local_qiov;
3041
3042 qemu_iovec_init(&local_qiov, qiov->niov);
3043 qemu_iovec_concat(&local_qiov, qiov, 0,
3044 max_nb_sectors * BDRV_SECTOR_SIZE);
3045
3046 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
3047 &local_qiov);
3048
3049 qemu_iovec_destroy(&local_qiov);
3050 } else {
3051 ret = 0;
3052 }
3053
3054 /* Reading beyond end of file is supposed to produce zeroes */
3055 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3056 uint64_t offset = MAX(0, total_sectors - sector_num);
3057 uint64_t bytes = (sector_num + nb_sectors - offset) *
3058 BDRV_SECTOR_SIZE;
3059 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3060 }
3061 }
3062
3063 out:
3064 return ret;
3065 }
3066
3067 /*
3068 * Handle a read request in coroutine context
3069 */
3070 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3071 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3072 BdrvRequestFlags flags)
3073 {
3074 BlockDriver *drv = bs->drv;
3075 BdrvTrackedRequest req;
3076
3077 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3078 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3079 uint8_t *head_buf = NULL;
3080 uint8_t *tail_buf = NULL;
3081 QEMUIOVector local_qiov;
3082 bool use_local_qiov = false;
3083 int ret;
3084
3085 if (!drv) {
3086 return -ENOMEDIUM;
3087 }
3088 if (bdrv_check_byte_request(bs, offset, bytes)) {
3089 return -EIO;
3090 }
3091
3092 if (bs->copy_on_read) {
3093 flags |= BDRV_REQ_COPY_ON_READ;
3094 }
3095
3096 /* throttling disk I/O */
3097 if (bs->io_limits_enabled) {
3098 bdrv_io_limits_intercept(bs, bytes, false);
3099 }
3100
3101 /* Align read if necessary by padding qiov */
3102 if (offset & (align - 1)) {
3103 head_buf = qemu_blockalign(bs, align);
3104 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3105 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3106 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3107 use_local_qiov = true;
3108
3109 bytes += offset & (align - 1);
3110 offset = offset & ~(align - 1);
3111 }
3112
3113 if ((offset + bytes) & (align - 1)) {
3114 if (!use_local_qiov) {
3115 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3116 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3117 use_local_qiov = true;
3118 }
3119 tail_buf = qemu_blockalign(bs, align);
3120 qemu_iovec_add(&local_qiov, tail_buf,
3121 align - ((offset + bytes) & (align - 1)));
3122
3123 bytes = ROUND_UP(bytes, align);
3124 }
3125
3126 tracked_request_begin(&req, bs, offset, bytes, false);
3127 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3128 use_local_qiov ? &local_qiov : qiov,
3129 flags);
3130 tracked_request_end(&req);
3131
3132 if (use_local_qiov) {
3133 qemu_iovec_destroy(&local_qiov);
3134 qemu_vfree(head_buf);
3135 qemu_vfree(tail_buf);
3136 }
3137
3138 return ret;
3139 }
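/*
 * Worked example of the padding above, with align == 512: a request for
 * offset == 1000, bytes == 100 gets a 488-byte head pad (1000 % 512) and
 * a 436-byte tail pad, turning it into an aligned read of 1024 bytes
 * covering [512, 1536); the caller's qiov still receives only its
 * original 100 bytes.
 */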
3140
3141 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3142 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3143 BdrvRequestFlags flags)
3144 {
3145 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3146 return -EINVAL;
3147 }
3148
3149 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3150 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3151 }
3152
3153 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3154 int nb_sectors, QEMUIOVector *qiov)
3155 {
3156 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3157
3158 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3159 }
3160
3161 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3162 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3163 {
3164 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3165
3166 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3167 BDRV_REQ_COPY_ON_READ);
3168 }
3169
3170 /* if no limit is specified in the BlockLimits use a default
3171 * of 32768 512-byte sectors (16 MiB) per request.
3172 */
3173 #define MAX_WRITE_ZEROES_DEFAULT 32768
3174
3175 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3176 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3177 {
3178 BlockDriver *drv = bs->drv;
3179 QEMUIOVector qiov;
3180 struct iovec iov = {0};
3181 int ret = 0;
3182
3183 int max_write_zeroes = bs->bl.max_write_zeroes ?
3184 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3185
3186 while (nb_sectors > 0 && !ret) {
3187 int num = nb_sectors;
3188
3189 /* Align request. Block drivers can expect the "bulk" of the request
3190 * to be aligned.
3191 */
3192 if (bs->bl.write_zeroes_alignment
3193 && num > bs->bl.write_zeroes_alignment) {
3194 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3195 /* Make a small request up to the first aligned sector. */
3196 num = bs->bl.write_zeroes_alignment;
3197 num -= sector_num % bs->bl.write_zeroes_alignment;
3198 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3199 /* Shorten the request to the last aligned sector. num cannot
3200 * underflow because num > bs->bl.write_zeroes_alignment.
3201 */
3202 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3203 }
3204 }
3205
3206 /* limit request size */
3207 if (num > max_write_zeroes) {
3208 num = max_write_zeroes;
3209 }
3210
3211 ret = -ENOTSUP;
3212 /* First try the efficient write zeroes operation */
3213 if (drv->bdrv_co_write_zeroes) {
3214 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3215 }
3216
3217 if (ret == -ENOTSUP) {
3218 /* Fall back to bounce buffer if write zeroes is unsupported */
3219 iov.iov_len = num * BDRV_SECTOR_SIZE;
3220 if (iov.iov_base == NULL) {
3221 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3222 if (iov.iov_base == NULL) {
3223 ret = -ENOMEM;
3224 goto fail;
3225 }
3226 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3227 }
3228 qemu_iovec_init_external(&qiov, &iov, 1);
3229
3230 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3231
3232 /* Keep the bounce buffer around if it is big enough for
3233 * all future requests.
3234 */
3235 if (num < max_write_zeroes) {
3236 qemu_vfree(iov.iov_base);
3237 iov.iov_base = NULL;
3238 }
3239 }
3240
3241 sector_num += num;
3242 nb_sectors -= num;
3243 }
3244
3245 fail:
3246 qemu_vfree(iov.iov_base);
3247 return ret;
3248 }
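/*
 * Worked example of the alignment logic above: with
 * bs->bl.write_zeroes_alignment == 8, a request at sector_num == 5 for
 * nb_sectors == 100 is issued as a 3-sector head (sectors 5-7), a
 * 96-sector aligned body, and a 1-sector tail.
 */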
3249
3250 /*
3251 * Forwards an already correctly aligned write request to the BlockDriver.
3252 */
3253 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3254 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3255 QEMUIOVector *qiov, int flags)
3256 {
3257 BlockDriver *drv = bs->drv;
3258 bool waited;
3259 int ret;
3260
3261 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3262 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3263
3264 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3265 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3266 assert(!qiov || bytes == qiov->size);
3267
3268 waited = wait_serialising_requests(req);
3269 assert(!waited || !req->serialising);
3270 assert(req->overlap_offset <= offset);
3271 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3272
3273 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3274
3275 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3276 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3277 qemu_iovec_is_zero(qiov)) {
3278 flags |= BDRV_REQ_ZERO_WRITE;
3279 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3280 flags |= BDRV_REQ_MAY_UNMAP;
3281 }
3282 }
3283
3284 if (ret < 0) {
3285 /* Do nothing, write notifier decided to fail this request */
3286 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3287 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3288 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3289 } else {
3290 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3291 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3292 }
3293 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3294
3295 if (ret == 0 && !bs->enable_write_cache) {
3296 ret = bdrv_co_flush(bs);
3297 }
3298
3299 bdrv_set_dirty(bs, sector_num, nb_sectors);
3300
3301 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3302
3303 if (bs->growable && ret >= 0) {
3304 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3305 }
3306
3307 return ret;
3308 }
3309
3310 /*
3311 * Handle a write request in coroutine context
3312 */
3313 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3314 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3315 BdrvRequestFlags flags)
3316 {
3317 BdrvTrackedRequest req;
3318 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3319 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3320 uint8_t *head_buf = NULL;
3321 uint8_t *tail_buf = NULL;
3322 QEMUIOVector local_qiov;
3323 bool use_local_qiov = false;
3324 int ret;
3325
3326 if (!bs->drv) {
3327 return -ENOMEDIUM;
3328 }
3329 if (bs->read_only) {
3330 return -EACCES;
3331 }
3332 if (bdrv_check_byte_request(bs, offset, bytes)) {
3333 return -EIO;
3334 }
3335
3336 /* throttling disk I/O */
3337 if (bs->io_limits_enabled) {
3338 bdrv_io_limits_intercept(bs, bytes, true);
3339 }
3340
3341 /*
3342 * Align write if necessary by performing a read-modify-write cycle.
3343 * Pad qiov with the read parts and be sure to have a tracked request not
3344 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3345 */
3346 tracked_request_begin(&req, bs, offset, bytes, true);
3347
3348 if (offset & (align - 1)) {
3349 QEMUIOVector head_qiov;
3350 struct iovec head_iov;
3351
3352 mark_request_serialising(&req, align);
3353 wait_serialising_requests(&req);
3354
3355 head_buf = qemu_blockalign(bs, align);
3356 head_iov = (struct iovec) {
3357 .iov_base = head_buf,
3358 .iov_len = align,
3359 };
3360 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3361
3362 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3363 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3364 align, &head_qiov, 0);
3365 if (ret < 0) {
3366 goto fail;
3367 }
3368 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3369
3370 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3371 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3372 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3373 use_local_qiov = true;
3374
3375 bytes += offset & (align - 1);
3376 offset = offset & ~(align - 1);
3377 }
3378
3379 if ((offset + bytes) & (align - 1)) {
3380 QEMUIOVector tail_qiov;
3381 struct iovec tail_iov;
3382 size_t tail_bytes;
3383 bool waited;
3384
3385 mark_request_serialising(&req, align);
3386 waited = wait_serialising_requests(&req);
3387 assert(!waited || !use_local_qiov);
3388
3389 tail_buf = qemu_blockalign(bs, align);
3390 tail_iov = (struct iovec) {
3391 .iov_base = tail_buf,
3392 .iov_len = align,
3393 };
3394 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3395
3396 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3397 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3398 align, &tail_qiov, 0);
3399 if (ret < 0) {
3400 goto fail;
3401 }
3402 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3403
3404 if (!use_local_qiov) {
3405 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3406 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3407 use_local_qiov = true;
3408 }
3409
3410 tail_bytes = (offset + bytes) & (align - 1);
3411 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3412
3413 bytes = ROUND_UP(bytes, align);
3414 }
3415
3416 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3417 use_local_qiov ? &local_qiov : qiov,
3418 flags);
3419
3420 fail:
3421 tracked_request_end(&req);
3422
3423 if (use_local_qiov) {
3424 qemu_iovec_destroy(&local_qiov);
3425 }
3426 qemu_vfree(head_buf);
3427 qemu_vfree(tail_buf);
3428
3429 return ret;
3430 }
3431
3432 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3433 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3434 BdrvRequestFlags flags)
3435 {
3436 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3437 return -EINVAL;
3438 }
3439
3440 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3441 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3442 }
3443
3444 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3445 int nb_sectors, QEMUIOVector *qiov)
3446 {
3447 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3448
3449 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3450 }
3451
3452 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3453 int64_t sector_num, int nb_sectors,
3454 BdrvRequestFlags flags)
3455 {
3456 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3457
3458 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3459 flags &= ~BDRV_REQ_MAY_UNMAP;
3460 }
3461
3462 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3463 BDRV_REQ_ZERO_WRITE | flags);
3464 }
3465
3466 /**
3467 * Truncate file to 'offset' bytes (needed only for file protocols)
3468 */
3469 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3470 {
3471 BlockDriver *drv = bs->drv;
3472 int ret;
3473 if (!drv)
3474 return -ENOMEDIUM;
3475 if (!drv->bdrv_truncate)
3476 return -ENOTSUP;
3477 if (bs->read_only)
3478 return -EACCES;
3479
3480 ret = drv->bdrv_truncate(bs, offset);
3481 if (ret == 0) {
3482 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3483 if (bs->blk) {
3484 blk_dev_resize_cb(bs->blk);
3485 }
3486 }
3487 return ret;
3488 }
3489
3490 /**
3491 * Length of an allocated file in bytes. Sparse files are counted by actual
3492 * allocated space. Returns < 0 on error or if unknown.
3493 */
3494 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3495 {
3496 BlockDriver *drv = bs->drv;
3497 if (!drv) {
3498 return -ENOMEDIUM;
3499 }
3500 if (drv->bdrv_get_allocated_file_size) {
3501 return drv->bdrv_get_allocated_file_size(bs);
3502 }
3503 if (bs->file) {
3504 return bdrv_get_allocated_file_size(bs->file);
3505 }
3506 return -ENOTSUP;
3507 }
3508
3509 /**
3510 * Return number of sectors on success, -errno on error.
3511 */
3512 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3513 {
3514 BlockDriver *drv = bs->drv;
3515
3516 if (!drv)
3517 return -ENOMEDIUM;
3518
3519 if (drv->has_variable_length) {
3520 int ret = refresh_total_sectors(bs, bs->total_sectors);
3521 if (ret < 0) {
3522 return ret;
3523 }
3524 }
3525 return bs->total_sectors;
3526 }
3527
3528 /**
3529 * Return length in bytes on success, -errno on error.
3530 * The length is always a multiple of BDRV_SECTOR_SIZE.
3531 */
3532 int64_t bdrv_getlength(BlockDriverState *bs)
3533 {
3534 int64_t ret = bdrv_nb_sectors(bs);
3535
3536 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3537 }
3538
3539 /* return 0 as the number of sectors if no device is present or on error */
3540 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3541 {
3542 int64_t nb_sectors = bdrv_nb_sectors(bs);
3543
3544 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3545 }
3546
3547 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3548 BlockdevOnError on_write_error)
3549 {
3550 bs->on_read_error = on_read_error;
3551 bs->on_write_error = on_write_error;
3552 }
3553
3554 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3555 {
3556 return is_read ? bs->on_read_error : bs->on_write_error;
3557 }
3558
3559 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3560 {
3561 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3562
3563 switch (on_err) {
3564 case BLOCKDEV_ON_ERROR_ENOSPC:
3565 return (error == ENOSPC) ?
3566 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3567 case BLOCKDEV_ON_ERROR_STOP:
3568 return BLOCK_ERROR_ACTION_STOP;
3569 case BLOCKDEV_ON_ERROR_REPORT:
3570 return BLOCK_ERROR_ACTION_REPORT;
3571 case BLOCKDEV_ON_ERROR_IGNORE:
3572 return BLOCK_ERROR_ACTION_IGNORE;
3573 default:
3574 abort();
3575 }
3576 }
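/*
 * Example: with on_write_error == BLOCKDEV_ON_ERROR_ENOSPC, a write
 * failing with ENOSPC yields BLOCK_ERROR_ACTION_STOP (pause the VM until
 * space is freed), while any other errno, e.g. EIO, yields
 * BLOCK_ERROR_ACTION_REPORT (forward the error to the guest).
 */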
3577
3578 static void send_qmp_error_event(BlockDriverState *bs,
3579 BlockErrorAction action,
3580 bool is_read, int error)
3581 {
3582 IoOperationType optype;
3583
3584 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3585 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3586 bdrv_iostatus_is_enabled(bs),
3587 error == ENOSPC, strerror(error),
3588 &error_abort);
3589 }
3590
3591 /* This is done by device models because, while the block layer knows
3592 * about the error, it does not know whether an operation comes from
3593 * the device or the block layer (from a job, for example).
3594 */
3595 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3596 bool is_read, int error)
3597 {
3598 assert(error >= 0);
3599
3600 if (action == BLOCK_ERROR_ACTION_STOP) {
3601 /* First set the iostatus, so that "info block" returns an iostatus
3602 * that matches the events raised so far (an additional error iostatus
3603 * is fine, but not a lost one).
3604 */
3605 bdrv_iostatus_set_err(bs, error);
3606
3607 /* Then raise the request to stop the VM and the event.
3608 * qemu_system_vmstop_request_prepare has two effects. First,
3609 * it ensures that the STOP event always comes after the
3610 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3611 * observes the BLOCK_IO_ERROR event and does a "cont" before the STOP
3612 * event is issued, the VM will not stop. In this case, vm_start()
3613 * also ensures that the STOP/RESUME pair of events is emitted.
3614 */
3615 qemu_system_vmstop_request_prepare();
3616 send_qmp_error_event(bs, action, is_read, error);
3617 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3618 } else {
3619 send_qmp_error_event(bs, action, is_read, error);
3620 }
3621 }
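/*
 * Typical device-model usage (sketch; the request requeueing is
 * hypothetical and lives in the device emulation code):
 *
 *     BlockErrorAction action = bdrv_get_error_action(bs, is_read, -ret);
 *     if (action == BLOCK_ERROR_ACTION_STOP) {
 *         // park the failed request so it can be retried after "cont"
 *     }
 *     bdrv_error_action(bs, action, is_read, -ret);
 */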
3622
3623 int bdrv_is_read_only(BlockDriverState *bs)
3624 {
3625 return bs->read_only;
3626 }
3627
3628 int bdrv_is_sg(BlockDriverState *bs)
3629 {
3630 return bs->sg;
3631 }
3632
3633 int bdrv_enable_write_cache(BlockDriverState *bs)
3634 {
3635 return bs->enable_write_cache;
3636 }
3637
3638 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3639 {
3640 bs->enable_write_cache = wce;
3641
3642 /* so a reopen() will preserve wce */
3643 if (wce) {
3644 bs->open_flags |= BDRV_O_CACHE_WB;
3645 } else {
3646 bs->open_flags &= ~BDRV_O_CACHE_WB;
3647 }
3648 }
3649
3650 int bdrv_is_encrypted(BlockDriverState *bs)
3651 {
3652 if (bs->backing_hd && bs->backing_hd->encrypted)
3653 return 1;
3654 return bs->encrypted;
3655 }
3656
3657 int bdrv_key_required(BlockDriverState *bs)
3658 {
3659 BlockDriverState *backing_hd = bs->backing_hd;
3660
3661 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3662 return 1;
3663 return (bs->encrypted && !bs->valid_key);
3664 }
3665
3666 int bdrv_set_key(BlockDriverState *bs, const char *key)
3667 {
3668 int ret;
3669 if (bs->backing_hd && bs->backing_hd->encrypted) {
3670 ret = bdrv_set_key(bs->backing_hd, key);
3671 if (ret < 0)
3672 return ret;
3673 if (!bs->encrypted)
3674 return 0;
3675 }
3676 if (!bs->encrypted) {
3677 return -EINVAL;
3678 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3679 return -ENOMEDIUM;
3680 }
3681 ret = bs->drv->bdrv_set_key(bs, key);
3682 if (ret < 0) {
3683 bs->valid_key = 0;
3684 } else if (!bs->valid_key) {
3685 bs->valid_key = 1;
3686 if (bs->blk) {
3687 /* call the change callback now, we skipped it on open */
3688 blk_dev_change_media_cb(bs->blk, true);
3689 }
3690 }
3691 return ret;
3692 }
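/*
 * Example caller (sketch; obtaining 'password' is outside this file's
 * scope): only prompt for and set a key when one is actually required:
 *
 *     if (bdrv_key_required(bs)) {
 *         ret = bdrv_set_key(bs, password);
 *         if (ret < 0) {
 *             // wrong or unusable key; bs->valid_key remains 0
 *         }
 *     }
 */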
3693
3694 const char *bdrv_get_format_name(BlockDriverState *bs)
3695 {
3696 return bs->drv ? bs->drv->format_name : NULL;
3697 }
3698
3699 static int qsort_strcmp(const void *a, const void *b)
3700 {
3701 return strcmp(a, b);
3702 }
3703
3704 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3705 void *opaque)
3706 {
3707 BlockDriver *drv;
3708 int count = 0;
3709 int i;
3710 const char **formats = NULL;
3711
3712 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3713 if (drv->format_name) {
3714 bool found = false;
3715 int i = count;
3716 while (formats && i && !found) {
3717 found = !strcmp(formats[--i], drv->format_name);
3718 }
3719
3720 if (!found) {
3721 formats = g_renew(const char *, formats, count + 1);
3722 formats[count++] = drv->format_name;
3723 }
3724 }
3725 }
3726
3727 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3728
3729 for (i = 0; i < count; i++) {
3730 it(opaque, formats[i]);
3731 }
3732
3733 g_free(formats);
3734 }
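/*
 * Example caller (sketch): print the sorted, de-duplicated format names:
 *
 *     static void print_format(void *opaque, const char *name)
 *     {
 *         printf("%s\n", name);
 *     }
 *
 *     bdrv_iterate_format(print_format, NULL);
 */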
3735
3736 /* Find the BlockDriverState of the block backend with the given name */
3737 /* TODO convert callers to blk_by_name(), then remove */
3738 BlockDriverState *bdrv_find(const char *name)
3739 {
3740 BlockBackend *blk = blk_by_name(name);
3741
3742 return blk ? blk_bs(blk) : NULL;
3743 }
3744
3745 /* Find a node in the BDS graph by its node name */
3746 BlockDriverState *bdrv_find_node(const char *node_name)
3747 {
3748 BlockDriverState *bs;
3749
3750 assert(node_name);
3751
3752 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3753 if (!strcmp(node_name, bs->node_name)) {
3754 return bs;
3755 }
3756 }
3757 return NULL;
3758 }
3759
3760 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3761 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3762 {
3763 BlockDeviceInfoList *list, *entry;
3764 BlockDriverState *bs;
3765
3766 list = NULL;
3767 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3768 entry = g_malloc0(sizeof(*entry));
3769 entry->value = bdrv_block_device_info(bs);
3770 entry->next = list;
3771 list = entry;
3772 }
3773
3774 return list;
3775 }
3776
3777 BlockDriverState *bdrv_lookup_bs(const char *device,
3778 const char *node_name,
3779 Error **errp)
3780 {
3781 BlockBackend *blk;
3782 BlockDriverState *bs;
3783
3784 if (device) {
3785 blk = blk_by_name(device);
3786
3787 if (blk) {
3788 return blk_bs(blk);
3789 }
3790 }
3791
3792 if (node_name) {
3793 bs = bdrv_find_node(node_name);
3794
3795 if (bs) {
3796 return bs;
3797 }
3798 }
3799
3800 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3801 device ? device : "",
3802 node_name ? node_name : "");
3803 return NULL;
3804 }
3805
3806 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3807 * return false. If either argument is NULL, return false. */
3808 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3809 {
3810 while (top && top != base) {
3811 top = top->backing_hd;
3812 }
3813
3814 return top != NULL;
3815 }
3816
3817 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3818 {
3819 if (!bs) {
3820 return QTAILQ_FIRST(&graph_bdrv_states);
3821 }
3822 return QTAILQ_NEXT(bs, node_list);
3823 }
3824
3825 BlockDriverState *bdrv_next(BlockDriverState *bs)
3826 {
3827 if (!bs) {
3828 return QTAILQ_FIRST(&bdrv_states);
3829 }
3830 return QTAILQ_NEXT(bs, device_list);
3831 }
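/*
 * Both iterators use the usual "start with NULL, stop at NULL" pattern:
 *
 *     BlockDriverState *bs;
 *     for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
 *         // visits every BDS on the bdrv_states device list
 *     }
 */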
3832
3833 const char *bdrv_get_node_name(const BlockDriverState *bs)
3834 {
3835 return bs->node_name;
3836 }
3837
3838 /* TODO check what callers really want: bs->node_name or blk_name() */
3839 const char *bdrv_get_device_name(const BlockDriverState *bs)
3840 {
3841 return bs->blk ? blk_name(bs->blk) : "";
3842 }
3843
3844 int bdrv_get_flags(BlockDriverState *bs)
3845 {
3846 return bs->open_flags;
3847 }
3848
3849 int bdrv_flush_all(void)
3850 {
3851 BlockDriverState *bs;
3852 int result = 0;
3853
3854 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3855 AioContext *aio_context = bdrv_get_aio_context(bs);
3856 int ret;
3857
3858 aio_context_acquire(aio_context);
3859 ret = bdrv_flush(bs);
3860 if (ret < 0 && !result) {
3861 result = ret;
3862 }
3863 aio_context_release(aio_context);
3864 }
3865
3866 return result;
3867 }
3868
3869 int bdrv_has_zero_init_1(BlockDriverState *bs)
3870 {
3871 return 1;
3872 }
3873
3874 int bdrv_has_zero_init(BlockDriverState *bs)
3875 {
3876 assert(bs->drv);
3877
3878 /* If BS is a copy-on-write image, it is initialized to
3879 * the contents of the base image, which may not be zeroes. */
3880 if (bs->backing_hd) {
3881 return 0;
3882 }
3883 if (bs->drv->bdrv_has_zero_init) {
3884 return bs->drv->bdrv_has_zero_init(bs);
3885 }
3886
3887 /* safe default */
3888 return 0;
3889 }
3890
3891 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3892 {
3893 BlockDriverInfo bdi;
3894
3895 if (bs->backing_hd) {
3896 return false;
3897 }
3898
3899 if (bdrv_get_info(bs, &bdi) == 0) {
3900 return bdi.unallocated_blocks_are_zero;
3901 }
3902
3903 return false;
3904 }
3905
3906 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3907 {
3908 BlockDriverInfo bdi;
3909
3910 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3911 return false;
3912 }
3913
3914 if (bdrv_get_info(bs, &bdi) == 0) {
3915 return bdi.can_write_zeroes_with_unmap;
3916 }
3917
3918 return false;
3919 }
3920
3921 typedef struct BdrvCoGetBlockStatusData {
3922 BlockDriverState *bs;
3923 BlockDriverState *base;
3924 int64_t sector_num;
3925 int nb_sectors;
3926 int *pnum;
3927 int64_t ret;
3928 bool done;
3929 } BdrvCoGetBlockStatusData;
3930
3931 /*
3932 * Returns the allocation status of the specified sectors.
3933 * Drivers not implementing the functionality are assumed to not support
3934 * backing files, hence all their sectors are reported as allocated.
3935 *
3936 * If 'sector_num' is beyond the end of the disk image the return value is 0
3937 * and 'pnum' is set to 0.
3938 *
3939 * 'pnum' is set to the number of sectors (including and immediately following
3940 * the specified sector) that are known to be in the same
3941 * allocated/unallocated state.
3942 *
3943 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3944 * beyond the end of the disk image it will be clamped.
3945 */
3946 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3947 int64_t sector_num,
3948 int nb_sectors, int *pnum)
3949 {
3950 int64_t total_sectors;
3951 int64_t n;
3952 int64_t ret, ret2;
3953
3954 total_sectors = bdrv_nb_sectors(bs);
3955 if (total_sectors < 0) {
3956 return total_sectors;
3957 }
3958
3959 if (sector_num >= total_sectors) {
3960 *pnum = 0;
3961 return 0;
3962 }
3963
3964 n = total_sectors - sector_num;
3965 if (n < nb_sectors) {
3966 nb_sectors = n;
3967 }
3968
3969 if (!bs->drv->bdrv_co_get_block_status) {
3970 *pnum = nb_sectors;
3971 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3972 if (bs->drv->protocol_name) {
3973 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3974 }
3975 return ret;
3976 }
3977
3978 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3979 if (ret < 0) {
3980 *pnum = 0;
3981 return ret;
3982 }
3983
3984 if (ret & BDRV_BLOCK_RAW) {
3985 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3986 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3987 *pnum, pnum);
3988 }
3989
3990 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3991 ret |= BDRV_BLOCK_ALLOCATED;
3992 }
3993
3994 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3995 if (bdrv_unallocated_blocks_are_zero(bs)) {
3996 ret |= BDRV_BLOCK_ZERO;
3997 } else if (bs->backing_hd) {
3998 BlockDriverState *bs2 = bs->backing_hd;
3999 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4000 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4001 ret |= BDRV_BLOCK_ZERO;
4002 }
4003 }
4004 }
4005
4006 if (bs->file &&
4007 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4008 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4009 int file_pnum;
4010
4011 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4012 *pnum, &file_pnum);
4013 if (ret2 >= 0) {
4014 /* Ignore errors. This is just providing extra information; it
4015 * is useful but not necessary.
4016 */
4017 if (!file_pnum) {
4018 /* !file_pnum indicates an offset at or beyond the EOF; it is
4019 * perfectly valid for the format block driver to point to such
4020 * offsets, so catch it and mark everything as zero */
4021 ret |= BDRV_BLOCK_ZERO;
4022 } else {
4023 /* Limit request to the range reported by the protocol driver */
4024 *pnum = file_pnum;
4025 ret |= (ret2 & BDRV_BLOCK_ZERO);
4026 }
4027 }
4028 }
4029
4030 return ret;
4031 }
4032
4033 /* Coroutine wrapper for bdrv_get_block_status() */
4034 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4035 {
4036 BdrvCoGetBlockStatusData *data = opaque;
4037 BlockDriverState *bs = data->bs;
4038
4039 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4040 data->pnum);
4041 data->done = true;
4042 }
4043
4044 /*
4045 * Synchronous wrapper around bdrv_co_get_block_status().
4046 *
4047 * See bdrv_co_get_block_status() for details.
4048 */
4049 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4050 int nb_sectors, int *pnum)
4051 {
4052 Coroutine *co;
4053 BdrvCoGetBlockStatusData data = {
4054 .bs = bs,
4055 .sector_num = sector_num,
4056 .nb_sectors = nb_sectors,
4057 .pnum = pnum,
4058 .done = false,
4059 };
4060
4061 if (qemu_in_coroutine()) {
4062 /* Fast-path if already in coroutine context */
4063 bdrv_get_block_status_co_entry(&data);
4064 } else {
4065 AioContext *aio_context = bdrv_get_aio_context(bs);
4066
4067 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4068 qemu_coroutine_enter(co, &data);
4069 while (!data.done) {
4070 aio_poll(aio_context, true);
4071 }
4072 }
4073 return data.ret;
4074 }
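
/*
 * Illustrative sketch (not part of this file): walking an image's allocation
 * map with the synchronous wrapper. 'pnum' tells the caller how far the
 * returned status extends, so the loop advances in variable-sized steps.
 *
 *     int64_t sector = 0;
 *     int64_t total = bdrv_nb_sectors(bs);
 *     while (sector < total) {
 *         int pnum;
 *         int64_t ret = bdrv_get_block_status(bs, sector,
 *                                             total - sector > INT_MAX
 *                                                 ? INT_MAX : total - sector,
 *                                             &pnum);
 *         if (ret < 0 || pnum == 0) {
 *             break;                   // error, or past the end of the image
 *         }
 *         if (ret & BDRV_BLOCK_ZERO) {
 *             // [sector, sector + pnum) reads as zeroes
 *         }
 *         sector += pnum;
 *     }
 */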
4075
4076 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4077 int nb_sectors, int *pnum)
4078 {
4079 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4080 if (ret < 0) {
4081 return ret;
4082 }
4083 return !!(ret & BDRV_BLOCK_ALLOCATED);
4084 }
4085
4086 /*
4087 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4088 *
4089 * Return true if the given sector is allocated in any image between
4090 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4091 * sector is allocated in any image of the chain. Return false otherwise.
4092 *
4093 * 'pnum' is set to the number of sectors (including and immediately following
4094 * the specified sector) that are known to be in the same
4095 * allocated/unallocated state.
4096 *
4097 */
4098 int bdrv_is_allocated_above(BlockDriverState *top,
4099 BlockDriverState *base,
4100 int64_t sector_num,
4101 int nb_sectors, int *pnum)
4102 {
4103 BlockDriverState *intermediate;
4104 int ret, n = nb_sectors;
4105
4106 intermediate = top;
4107 while (intermediate && intermediate != base) {
4108 int pnum_inter;
4109 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4110 &pnum_inter);
4111 if (ret < 0) {
4112 return ret;
4113 } else if (ret) {
4114 *pnum = pnum_inter;
4115 return 1;
4116 }
4117
4118 /*
4119 * [sector_num, sector_num + pnum_inter) is unallocated on top, but an
4120 * intermediate image might have
4121 *
4122 * [sector_num + x, sector_num + nb_sectors) allocated.
4123 */
4124 if (n > pnum_inter &&
4125 (intermediate == top ||
4126 sector_num + pnum_inter < intermediate->total_sectors)) {
4127 n = pnum_inter;
4128 }
4129
4130 intermediate = intermediate->backing_hd;
4131 }
4132
4133 *pnum = n;
4134 return 0;
4135 }
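
/*
 * Illustrative sketch (not part of this file): for a chain
 * base <- mid <- top, checking whether a range is backed anywhere above
 * 'base' (e.g. before deciding that a copy operation may skip it).
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
 *                                       &pnum);
 *     if (ret < 0) {
 *         // error
 *     } else if (ret) {
 *         // the first pnum sectors are allocated in top or an intermediate
 *     } else {
 *         // the first pnum sectors come from base (or read as zeroes)
 *     }
 */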
4136
4137 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4138 {
4139 if (bs->backing_hd && bs->backing_hd->encrypted)
4140 return bs->backing_file;
4141 else if (bs->encrypted)
4142 return bs->filename;
4143 else
4144 return NULL;
4145 }
4146
4147 void bdrv_get_backing_filename(BlockDriverState *bs,
4148 char *filename, int filename_size)
4149 {
4150 pstrcpy(filename, filename_size, bs->backing_file);
4151 }
4152
4153 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4154 const uint8_t *buf, int nb_sectors)
4155 {
4156 BlockDriver *drv = bs->drv;
4157 if (!drv)
4158 return -ENOMEDIUM;
4159 if (!drv->bdrv_write_compressed)
4160 return -ENOTSUP;
4161 if (bdrv_check_request(bs, sector_num, nb_sectors))
4162 return -EIO;
4163
4164 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4165
4166 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4167 }
4168
4169 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4170 {
4171 BlockDriver *drv = bs->drv;
4172 if (!drv)
4173 return -ENOMEDIUM;
4174 if (!drv->bdrv_get_info)
4175 return -ENOTSUP;
4176 memset(bdi, 0, sizeof(*bdi));
4177 return drv->bdrv_get_info(bs, bdi);
4178 }
4179
4180 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4181 {
4182 BlockDriver *drv = bs->drv;
4183 if (drv && drv->bdrv_get_specific_info) {
4184 return drv->bdrv_get_specific_info(bs);
4185 }
4186 return NULL;
4187 }
4188
4189 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4190 int64_t pos, int size)
4191 {
4192 QEMUIOVector qiov;
4193 struct iovec iov = {
4194 .iov_base = (void *) buf,
4195 .iov_len = size,
4196 };
4197
4198 qemu_iovec_init_external(&qiov, &iov, 1);
4199 return bdrv_writev_vmstate(bs, &qiov, pos);
4200 }
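
/*
 * Illustrative sketch (not part of this file): round-tripping a buffer
 * through the vmstate area, as migration code does for device state.
 *
 *     uint8_t in[512], out[512];
 *     if (bdrv_save_vmstate(bs, in, 0, sizeof(in)) < 0 ||
 *         bdrv_load_vmstate(bs, out, 0, sizeof(out)) < 0) {
 *         // the format (e.g. raw) may not support a vmstate area: -ENOTSUP
 *     }
 */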
4201
4202 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4203 {
4204 BlockDriver *drv = bs->drv;
4205
4206 if (!drv) {
4207 return -ENOMEDIUM;
4208 } else if (drv->bdrv_save_vmstate) {
4209 return drv->bdrv_save_vmstate(bs, qiov, pos);
4210 } else if (bs->file) {
4211 return bdrv_writev_vmstate(bs->file, qiov, pos);
4212 }
4213
4214 return -ENOTSUP;
4215 }
4216
4217 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4218 int64_t pos, int size)
4219 {
4220 BlockDriver *drv = bs->drv;
4221 if (!drv)
4222 return -ENOMEDIUM;
4223 if (drv->bdrv_load_vmstate)
4224 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4225 if (bs->file)
4226 return bdrv_load_vmstate(bs->file, buf, pos, size);
4227 return -ENOTSUP;
4228 }
4229
4230 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4231 {
4232 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4233 return;
4234 }
4235
4236 bs->drv->bdrv_debug_event(bs, event);
4237 }
4238
4239 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4240 const char *tag)
4241 {
4242 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4243 bs = bs->file;
4244 }
4245
4246 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4247 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4248 }
4249
4250 return -ENOTSUP;
4251 }
4252
4253 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4254 {
4255 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4256 bs = bs->file;
4257 }
4258
4259 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4260 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4261 }
4262
4263 return -ENOTSUP;
4264 }
4265
4266 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4267 {
4268 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4269 bs = bs->file;
4270 }
4271
4272 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4273 return bs->drv->bdrv_debug_resume(bs, tag);
4274 }
4275
4276 return -ENOTSUP;
4277 }
4278
4279 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4280 {
4281 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4282 bs = bs->file;
4283 }
4284
4285 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4286 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4287 }
4288
4289 return false;
4290 }
4291
4292 int bdrv_is_snapshot(BlockDriverState *bs)
4293 {
4294 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4295 }
4296
4297 /* backing_file can either be relative, or absolute, or a protocol. If it is
4298 * relative, it must be relative to the chain. So, passing in bs->filename
4299 * from a BDS as backing_file should not be done, as that may be relative to
4300 * the CWD rather than the chain. */
4301 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4302 const char *backing_file)
4303 {
4304 char *filename_full = NULL;
4305 char *backing_file_full = NULL;
4306 char *filename_tmp = NULL;
4307 int is_protocol = 0;
4308 BlockDriverState *curr_bs = NULL;
4309 BlockDriverState *retval = NULL;
4310
4311 if (!bs || !bs->drv || !backing_file) {
4312 return NULL;
4313 }
4314
4315 filename_full = g_malloc(PATH_MAX);
4316 backing_file_full = g_malloc(PATH_MAX);
4317 filename_tmp = g_malloc(PATH_MAX);
4318
4319 is_protocol = path_has_protocol(backing_file);
4320
4321 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4322
4323 /* If either of the filename paths is actually a protocol, then
4324 * compare unmodified paths; otherwise make paths relative */
4325 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4326 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4327 retval = curr_bs->backing_hd;
4328 break;
4329 }
4330 } else {
4331 /* If not an absolute filename path, make it relative to the current
4332 * image's filename path */
4333 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4334 backing_file);
4335
4336 /* We are going to compare absolute pathnames */
4337 if (!realpath(filename_tmp, filename_full)) {
4338 continue;
4339 }
4340
4341 /* We need to make sure the backing filename we are comparing against
4342 * is relative to the current image filename (or absolute) */
4343 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4344 curr_bs->backing_file);
4345
4346 if (!realpath(filename_tmp, backing_file_full)) {
4347 continue;
4348 }
4349
4350 if (strcmp(backing_file_full, filename_full) == 0) {
4351 retval = curr_bs->backing_hd;
4352 break;
4353 }
4354 }
4355 }
4356
4357 g_free(filename_full);
4358 g_free(backing_file_full);
4359 g_free(filename_tmp);
4360 return retval;
4361 }
4362
4363 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4364 {
4365 if (!bs->drv) {
4366 return 0;
4367 }
4368
4369 if (!bs->backing_hd) {
4370 return 0;
4371 }
4372
4373 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4374 }
4375
4376 /**************************************************************/
4377 /* async I/Os */
4378
4379 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4380 QEMUIOVector *qiov, int nb_sectors,
4381 BlockCompletionFunc *cb, void *opaque)
4382 {
4383 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4384
4385 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4386 cb, opaque, false);
4387 }
4388
4389 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4390 QEMUIOVector *qiov, int nb_sectors,
4391 BlockCompletionFunc *cb, void *opaque)
4392 {
4393 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4394
4395 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4396 cb, opaque, true);
4397 }
4398
4399 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4400 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4401 BlockCompletionFunc *cb, void *opaque)
4402 {
4403 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4404
4405 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4406 BDRV_REQ_ZERO_WRITE | flags,
4407 cb, opaque, true);
4408 }
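
/*
 * Illustrative sketch (not part of this file): submitting an asynchronous
 * read. The callback runs in the BDS's AioContext once the request
 * completes; 'buf' and 'status' are hypothetical names.
 *
 *     static void read_done(void *opaque, int ret)
 *     {
 *         *(int *)opaque = ret;            // record the completion status
 *     }
 *
 *     QEMUIOVector qiov;
 *     struct iovec iov = { .iov_base = buf, .iov_len = 512 };
 *     int status = -EINPROGRESS;
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     bdrv_aio_readv(bs, 0, &qiov, 1, read_done, &status);
 */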
4409
4410
4411 typedef struct MultiwriteCB {
4412 int error;
4413 int num_requests;
4414 int num_callbacks;
4415 struct {
4416 BlockCompletionFunc *cb;
4417 void *opaque;
4418 QEMUIOVector *free_qiov;
4419 } callbacks[];
4420 } MultiwriteCB;
4421
4422 static void multiwrite_user_cb(MultiwriteCB *mcb)
4423 {
4424 int i;
4425
4426 for (i = 0; i < mcb->num_callbacks; i++) {
4427 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4428 if (mcb->callbacks[i].free_qiov) {
4429 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4430 }
4431 g_free(mcb->callbacks[i].free_qiov);
4432 }
4433 }
4434
4435 static void multiwrite_cb(void *opaque, int ret)
4436 {
4437 MultiwriteCB *mcb = opaque;
4438
4439 trace_multiwrite_cb(mcb, ret);
4440
4441 if (ret < 0 && !mcb->error) {
4442 mcb->error = ret;
4443 }
4444
4445 mcb->num_requests--;
4446 if (mcb->num_requests == 0) {
4447 multiwrite_user_cb(mcb);
4448 g_free(mcb);
4449 }
4450 }
4451
4452 static int multiwrite_req_compare(const void *a, const void *b)
4453 {
4454 const BlockRequest *req1 = a, *req2 = b;
4455
4456 /*
4457 * Note that we can't simply subtract req2->sector from req1->sector
4458 * here as that could overflow the return value.
4459 */
4460 if (req1->sector > req2->sector) {
4461 return 1;
4462 } else if (req1->sector < req2->sector) {
4463 return -1;
4464 } else {
4465 return 0;
4466 }
4467 }
4468
4469 /*
4470 * Takes a bunch of requests and tries to merge them. Returns the number of
4471 * requests that remain after merging.
4472 */
4473 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4474 int num_reqs, MultiwriteCB *mcb)
4475 {
4476 int i, outidx;
4477
4478 // Sort requests by start sector
4479 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4480
4481 // Check if adjacent requests touch the same clusters. If so, combine them.
4482 // Only sequential or overlapping requests are merged, so no gaps remain.
4483 outidx = 0;
4484 for (i = 1; i < num_reqs; i++) {
4485 int merge = 0;
4486 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4487
4488 // Handle exactly sequential writes and overlapping writes.
4489 if (reqs[i].sector <= oldreq_last) {
4490 merge = 1;
4491 }
4492
4493 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4494 merge = 0;
4495 }
4496
4497 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4498 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4499 merge = 0;
4500 }
4501
4502 if (merge) {
4503 size_t size;
4504 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4505 qemu_iovec_init(qiov,
4506 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4507
4508 // Add the first request to the merged one. If the requests are
4509 // overlapping, drop the last sectors of the first request.
4510 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4511 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4512
4513 // We shouldn't need to add any zeros between the two requests
4514 assert (reqs[i].sector <= oldreq_last);
4515
4516 // Add the second request
4517 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4518
4519 // Add tail of first request, if necessary
4520 if (qiov->size < reqs[outidx].qiov->size) {
4521 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4522 reqs[outidx].qiov->size - qiov->size);
4523 }
4524
4525 reqs[outidx].nb_sectors = qiov->size >> 9;
4526 reqs[outidx].qiov = qiov;
4527
4528 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4529 } else {
4530 outidx++;
4531 reqs[outidx].sector = reqs[i].sector;
4532 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4533 reqs[outidx].qiov = reqs[i].qiov;
4534 }
4535 }
4536
4537 return outidx + 1;
4538 }
4539
4540 /*
4541 * Submit multiple AIO write requests at once.
4542 *
4543 * On success, the function returns 0 and all requests in the reqs array have
4544 * been submitted. On error, this function returns -1, and any of the
4545 * requests may or may not have been submitted yet. In particular, this means
4546 * that the callback will be called for some requests and not for others. The
4547 * caller must check the error field of each BlockRequest to know which
4548 * callbacks to wait for (if error != 0, no callback will be called).
4549 *
4550 * The implementation may modify the contents of the reqs array, e.g. to merge
4551 * requests. However, the fields opaque and error are left unmodified as they
4552 * are used to signal failure for a single request to the caller.
4553 */
4554 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4555 {
4556 MultiwriteCB *mcb;
4557 int i;
4558
4559 /* don't submit writes if we don't have a medium */
4560 if (bs->drv == NULL) {
4561 for (i = 0; i < num_reqs; i++) {
4562 reqs[i].error = -ENOMEDIUM;
4563 }
4564 return -1;
4565 }
4566
4567 if (num_reqs == 0) {
4568 return 0;
4569 }
4570
4571 // Create MultiwriteCB structure
4572 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4573 mcb->num_requests = 0;
4574 mcb->num_callbacks = num_reqs;
4575
4576 for (i = 0; i < num_reqs; i++) {
4577 mcb->callbacks[i].cb = reqs[i].cb;
4578 mcb->callbacks[i].opaque = reqs[i].opaque;
4579 }
4580
4581 // Check for mergeable requests
4582 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4583
4584 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4585
4586 /* Run the aio requests. */
4587 mcb->num_requests = num_reqs;
4588 for (i = 0; i < num_reqs; i++) {
4589 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4590 reqs[i].nb_sectors, reqs[i].flags,
4591 multiwrite_cb, mcb,
4592 true);
4593 }
4594
4595 return 0;
4596 }
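
/*
 * Illustrative sketch (not part of this file): batching two writes that a
 * device model such as virtio-blk received in one notification. Adjacent
 * requests may be merged by multiwrite_merge() before submission; the
 * names q0, q1, r0, r1 and write_done are hypothetical.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &q0,
 *           .cb = write_done, .opaque = r0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &q1,
 *           .cb = write_done, .opaque = r1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // check reqs[i].error to see which callbacks will still run
 *     }
 */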
4597
4598 void bdrv_aio_cancel(BlockAIOCB *acb)
4599 {
4600 qemu_aio_ref(acb);
4601 bdrv_aio_cancel_async(acb);
4602 while (acb->refcnt > 1) {
4603 if (acb->aiocb_info->get_aio_context) {
4604 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4605 } else if (acb->bs) {
4606 aio_poll(bdrv_get_aio_context(acb->bs), true);
4607 } else {
4608 abort();
4609 }
4610 }
4611 qemu_aio_unref(acb);
4612 }
4613
4614 /* Async version of aio cancel. The caller is not blocked: if the acb
4615 * implements cancel_async, it is invoked; otherwise we do nothing and let the
4616 * request complete normally. In either case the completion callback must be called. */
4617 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4618 {
4619 if (acb->aiocb_info->cancel_async) {
4620 acb->aiocb_info->cancel_async(acb);
4621 }
4622 }
4623
4624 /**************************************************************/
4625 /* async block device emulation */
4626
4627 typedef struct BlockAIOCBSync {
4628 BlockAIOCB common;
4629 QEMUBH *bh;
4630 int ret;
4631 /* vector translation state */
4632 QEMUIOVector *qiov;
4633 uint8_t *bounce;
4634 int is_write;
4635 } BlockAIOCBSync;
4636
4637 static const AIOCBInfo bdrv_em_aiocb_info = {
4638 .aiocb_size = sizeof(BlockAIOCBSync),
4639 };
4640
4641 static void bdrv_aio_bh_cb(void *opaque)
4642 {
4643 BlockAIOCBSync *acb = opaque;
4644
4645 if (!acb->is_write && acb->ret >= 0) {
4646 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4647 }
4648 qemu_vfree(acb->bounce);
4649 acb->common.cb(acb->common.opaque, acb->ret);
4650 qemu_bh_delete(acb->bh);
4651 acb->bh = NULL;
4652 qemu_aio_unref(acb);
4653 }
4654
4655 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4656 int64_t sector_num,
4657 QEMUIOVector *qiov,
4658 int nb_sectors,
4659 BlockCompletionFunc *cb,
4660 void *opaque,
4661 int is_write)
4662
4663 {
4664 BlockAIOCBSync *acb;
4665
4666 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4667 acb->is_write = is_write;
4668 acb->qiov = qiov;
4669 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4670 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4671
4672 if (acb->bounce == NULL) {
4673 acb->ret = -ENOMEM;
4674 } else if (is_write) {
4675 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4676 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4677 } else {
4678 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4679 }
4680
4681 qemu_bh_schedule(acb->bh);
4682
4683 return &acb->common;
4684 }
4685
4686 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4687 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4688 BlockCompletionFunc *cb, void *opaque)
4689 {
4690 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4691 }
4692
4693 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4694 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4695 BlockCompletionFunc *cb, void *opaque)
4696 {
4697 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4698 }
4699
4700
4701 typedef struct BlockAIOCBCoroutine {
4702 BlockAIOCB common;
4703 BlockRequest req;
4704 bool is_write;
4705 bool *done;
4706 QEMUBH* bh;
4707 } BlockAIOCBCoroutine;
4708
4709 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4710 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4711 };
4712
4713 static void bdrv_co_em_bh(void *opaque)
4714 {
4715 BlockAIOCBCoroutine *acb = opaque;
4716
4717 acb->common.cb(acb->common.opaque, acb->req.error);
4718
4719 qemu_bh_delete(acb->bh);
4720 qemu_aio_unref(acb);
4721 }
4722
4723 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4724 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4725 {
4726 BlockAIOCBCoroutine *acb = opaque;
4727 BlockDriverState *bs = acb->common.bs;
4728
4729 if (!acb->is_write) {
4730 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4731 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4732 } else {
4733 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4734 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4735 }
4736
4737 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4738 qemu_bh_schedule(acb->bh);
4739 }
4740
4741 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4742 int64_t sector_num,
4743 QEMUIOVector *qiov,
4744 int nb_sectors,
4745 BdrvRequestFlags flags,
4746 BlockCompletionFunc *cb,
4747 void *opaque,
4748 bool is_write)
4749 {
4750 Coroutine *co;
4751 BlockAIOCBCoroutine *acb;
4752
4753 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4754 acb->req.sector = sector_num;
4755 acb->req.nb_sectors = nb_sectors;
4756 acb->req.qiov = qiov;
4757 acb->req.flags = flags;
4758 acb->is_write = is_write;
4759
4760 co = qemu_coroutine_create(bdrv_co_do_rw);
4761 qemu_coroutine_enter(co, acb);
4762
4763 return &acb->common;
4764 }
4765
4766 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4767 {
4768 BlockAIOCBCoroutine *acb = opaque;
4769 BlockDriverState *bs = acb->common.bs;
4770
4771 acb->req.error = bdrv_co_flush(bs);
4772 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4773 qemu_bh_schedule(acb->bh);
4774 }
4775
4776 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4777 BlockCompletionFunc *cb, void *opaque)
4778 {
4779 trace_bdrv_aio_flush(bs, opaque);
4780
4781 Coroutine *co;
4782 BlockAIOCBCoroutine *acb;
4783
4784 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4785
4786 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4787 qemu_coroutine_enter(co, acb);
4788
4789 return &acb->common;
4790 }
4791
4792 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4793 {
4794 BlockAIOCBCoroutine *acb = opaque;
4795 BlockDriverState *bs = acb->common.bs;
4796
4797 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4798 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4799 qemu_bh_schedule(acb->bh);
4800 }
4801
4802 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4803 int64_t sector_num, int nb_sectors,
4804 BlockCompletionFunc *cb, void *opaque)
4805 {
4806 Coroutine *co;
4807 BlockAIOCBCoroutine *acb;
4808
4809 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4810
4811 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4812 acb->req.sector = sector_num;
4813 acb->req.nb_sectors = nb_sectors;
4814 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4815 qemu_coroutine_enter(co, acb);
4816
4817 return &acb->common;
4818 }
4819
4820 void bdrv_init(void)
4821 {
4822 module_call_init(MODULE_INIT_BLOCK);
4823 }
4824
4825 void bdrv_init_with_whitelist(void)
4826 {
4827 use_bdrv_whitelist = 1;
4828 bdrv_init();
4829 }
4830
4831 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4832 BlockCompletionFunc *cb, void *opaque)
4833 {
4834 BlockAIOCB *acb;
4835
4836 acb = g_slice_alloc(aiocb_info->aiocb_size);
4837 acb->aiocb_info = aiocb_info;
4838 acb->bs = bs;
4839 acb->cb = cb;
4840 acb->opaque = opaque;
4841 acb->refcnt = 1;
4842 return acb;
4843 }
4844
4845 void qemu_aio_ref(void *p)
4846 {
4847 BlockAIOCB *acb = p;
4848 acb->refcnt++;
4849 }
4850
4851 void qemu_aio_unref(void *p)
4852 {
4853 BlockAIOCB *acb = p;
4854 assert(acb->refcnt > 0);
4855 if (--acb->refcnt == 0) {
4856 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4857 }
4858 }
4859
4860 /**************************************************************/
4861 /* Coroutine block device emulation */
4862
4863 typedef struct CoroutineIOCompletion {
4864 Coroutine *coroutine;
4865 int ret;
4866 } CoroutineIOCompletion;
4867
4868 static void bdrv_co_io_em_complete(void *opaque, int ret)
4869 {
4870 CoroutineIOCompletion *co = opaque;
4871
4872 co->ret = ret;
4873 qemu_coroutine_enter(co->coroutine, NULL);
4874 }
4875
4876 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4877 int nb_sectors, QEMUIOVector *iov,
4878 bool is_write)
4879 {
4880 CoroutineIOCompletion co = {
4881 .coroutine = qemu_coroutine_self(),
4882 };
4883 BlockAIOCB *acb;
4884
4885 if (is_write) {
4886 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4887 bdrv_co_io_em_complete, &co);
4888 } else {
4889 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4890 bdrv_co_io_em_complete, &co);
4891 }
4892
4893 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4894 if (!acb) {
4895 return -EIO;
4896 }
4897 qemu_coroutine_yield();
4898
4899 return co.ret;
4900 }
4901
4902 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4903 int64_t sector_num, int nb_sectors,
4904 QEMUIOVector *iov)
4905 {
4906 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4907 }
4908
4909 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4910 int64_t sector_num, int nb_sectors,
4911 QEMUIOVector *iov)
4912 {
4913 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4914 }
4915
4916 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4917 {
4918 RwCo *rwco = opaque;
4919
4920 rwco->ret = bdrv_co_flush(rwco->bs);
4921 }
4922
4923 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4924 {
4925 int ret;
4926
4927 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4928 return 0;
4929 }
4930
4931 /* Write back cached data to the OS even with cache=unsafe */
4932 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4933 if (bs->drv->bdrv_co_flush_to_os) {
4934 ret = bs->drv->bdrv_co_flush_to_os(bs);
4935 if (ret < 0) {
4936 return ret;
4937 }
4938 }
4939
4940 /* But don't actually force it to the disk with cache=unsafe */
4941 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4942 goto flush_parent;
4943 }
4944
4945 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4946 if (bs->drv->bdrv_co_flush_to_disk) {
4947 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4948 } else if (bs->drv->bdrv_aio_flush) {
4949 BlockAIOCB *acb;
4950 CoroutineIOCompletion co = {
4951 .coroutine = qemu_coroutine_self(),
4952 };
4953
4954 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4955 if (acb == NULL) {
4956 ret = -EIO;
4957 } else {
4958 qemu_coroutine_yield();
4959 ret = co.ret;
4960 }
4961 } else {
4962 /*
4963 * Some block drivers always operate in either writethrough or unsafe
4964 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4965 * know how the server works (because the behaviour is hardcoded or
4966 * depends on server-side configuration), so we can't ensure that
4967 * everything is safe on disk. Returning an error doesn't work because
4968 * that would break guests even if the server operates in writethrough
4969 * mode.
4970 *
4971 * Let's hope the user knows what he's doing.
4972 */
4973 ret = 0;
4974 }
4975 if (ret < 0) {
4976 return ret;
4977 }
4978
4979 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4980 * in the case of cache=unsafe, so there are no useless flushes.
4981 */
4982 flush_parent:
4983 return bdrv_co_flush(bs->file);
4984 }
4985
4986 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4987 {
4988 Error *local_err = NULL;
4989 int ret;
4990
4991 if (!bs->drv) {
4992 return;
4993 }
4994
4995 if (!(bs->open_flags & BDRV_O_INCOMING)) {
4996 return;
4997 }
4998 bs->open_flags &= ~BDRV_O_INCOMING;
4999
5000 if (bs->drv->bdrv_invalidate_cache) {
5001 bs->drv->bdrv_invalidate_cache(bs, &local_err);
5002 } else if (bs->file) {
5003 bdrv_invalidate_cache(bs->file, &local_err);
5004 }
5005 if (local_err) {
5006 error_propagate(errp, local_err);
5007 return;
5008 }
5009
5010 ret = refresh_total_sectors(bs, bs->total_sectors);
5011 if (ret < 0) {
5012 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5013 return;
5014 }
5015 }
5016
5017 void bdrv_invalidate_cache_all(Error **errp)
5018 {
5019 BlockDriverState *bs;
5020 Error *local_err = NULL;
5021
5022 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5023 AioContext *aio_context = bdrv_get_aio_context(bs);
5024
5025 aio_context_acquire(aio_context);
5026 bdrv_invalidate_cache(bs, &local_err);
5027 aio_context_release(aio_context);
5028 if (local_err) {
5029 error_propagate(errp, local_err);
5030 return;
5031 }
5032 }
5033 }
5034
5035 int bdrv_flush(BlockDriverState *bs)
5036 {
5037 Coroutine *co;
5038 RwCo rwco = {
5039 .bs = bs,
5040 .ret = NOT_DONE,
5041 };
5042
5043 if (qemu_in_coroutine()) {
5044 /* Fast-path if already in coroutine context */
5045 bdrv_flush_co_entry(&rwco);
5046 } else {
5047 AioContext *aio_context = bdrv_get_aio_context(bs);
5048
5049 co = qemu_coroutine_create(bdrv_flush_co_entry);
5050 qemu_coroutine_enter(co, &rwco);
5051 while (rwco.ret == NOT_DONE) {
5052 aio_poll(aio_context, true);
5053 }
5054 }
5055
5056 return rwco.ret;
5057 }
5058
5059 typedef struct DiscardCo {
5060 BlockDriverState *bs;
5061 int64_t sector_num;
5062 int nb_sectors;
5063 int ret;
5064 } DiscardCo;
5065 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5066 {
5067 DiscardCo *rwco = opaque;
5068
5069 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5070 }
5071
5072 /* if no limit is specified in the BlockLimits, use a default
5073 * of 32768 512-byte sectors (16 MiB) per request.
5074 */
5075 #define MAX_DISCARD_DEFAULT 32768
5076
5077 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5078 int nb_sectors)
5079 {
5080 int max_discard;
5081
5082 if (!bs->drv) {
5083 return -ENOMEDIUM;
5084 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5085 return -EIO;
5086 } else if (bs->read_only) {
5087 return -EROFS;
5088 }
5089
5090 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5091
5092 /* Do nothing if disabled. */
5093 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5094 return 0;
5095 }
5096
5097 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5098 return 0;
5099 }
5100
5101 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5102 while (nb_sectors > 0) {
5103 int ret;
5104 int num = nb_sectors;
5105
5106 /* align request */
5107 if (bs->bl.discard_alignment &&
5108 num >= bs->bl.discard_alignment &&
5109 sector_num % bs->bl.discard_alignment) {
5110 if (num > bs->bl.discard_alignment) {
5111 num = bs->bl.discard_alignment;
5112 }
5113 num -= sector_num % bs->bl.discard_alignment;
5114 }
5115
5116 /* limit request size */
5117 if (num > max_discard) {
5118 num = max_discard;
5119 }
5120
5121 if (bs->drv->bdrv_co_discard) {
5122 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5123 } else {
5124 BlockAIOCB *acb;
5125 CoroutineIOCompletion co = {
5126 .coroutine = qemu_coroutine_self(),
5127 };
5128
5129 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5130 bdrv_co_io_em_complete, &co);
5131 if (acb == NULL) {
5132 return -EIO;
5133 } else {
5134 qemu_coroutine_yield();
5135 ret = co.ret;
5136 }
5137 }
5138 if (ret && ret != -ENOTSUP) {
5139 return ret;
5140 }
5141
5142 sector_num += num;
5143 nb_sectors -= num;
5144 }
5145 return 0;
5146 }
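
/*
 * Worked example of the alignment step above (illustrative): with
 * bl.discard_alignment == 8, a request for sectors [5, 25) is split so
 * that later chunks start on an aligned boundary:
 *
 *     1st iteration: num = 8 - (5 % 8) = 3    -> discards [5, 8)
 *     2nd iteration: sector_num = 8, num = 17 -> discards [8, 25) and finishes
 */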
5147
5148 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5149 {
5150 Coroutine *co;
5151 DiscardCo rwco = {
5152 .bs = bs,
5153 .sector_num = sector_num,
5154 .nb_sectors = nb_sectors,
5155 .ret = NOT_DONE,
5156 };
5157
5158 if (qemu_in_coroutine()) {
5159 /* Fast-path if already in coroutine context */
5160 bdrv_discard_co_entry(&rwco);
5161 } else {
5162 AioContext *aio_context = bdrv_get_aio_context(bs);
5163
5164 co = qemu_coroutine_create(bdrv_discard_co_entry);
5165 qemu_coroutine_enter(co, &rwco);
5166 while (rwco.ret == NOT_DONE) {
5167 aio_poll(aio_context, true);
5168 }
5169 }
5170
5171 return rwco.ret;
5172 }
5173
5174 /**************************************************************/
5175 /* removable device support */
5176
5177 /**
5178 * Return TRUE if the media is present
5179 */
5180 int bdrv_is_inserted(BlockDriverState *bs)
5181 {
5182 BlockDriver *drv = bs->drv;
5183
5184 if (!drv)
5185 return 0;
5186 if (!drv->bdrv_is_inserted)
5187 return 1;
5188 return drv->bdrv_is_inserted(bs);
5189 }
5190
5191 /**
5192 * Return whether the media changed since the last call to this
5193 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5194 */
5195 int bdrv_media_changed(BlockDriverState *bs)
5196 {
5197 BlockDriver *drv = bs->drv;
5198
5199 if (drv && drv->bdrv_media_changed) {
5200 return drv->bdrv_media_changed(bs);
5201 }
5202 return -ENOTSUP;
5203 }
5204
5205 /**
5206 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5207 */
5208 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5209 {
5210 BlockDriver *drv = bs->drv;
5211 const char *device_name;
5212
5213 if (drv && drv->bdrv_eject) {
5214 drv->bdrv_eject(bs, eject_flag);
5215 }
5216
5217 device_name = bdrv_get_device_name(bs);
5218 if (device_name[0] != '\0') {
5219 qapi_event_send_device_tray_moved(device_name,
5220 eject_flag, &error_abort);
5221 }
5222 }
5223
5224 /**
5225 * Lock or unlock the media (if it is locked, the user won't be able
5226 * to eject it manually).
5227 */
5228 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5229 {
5230 BlockDriver *drv = bs->drv;
5231
5232 trace_bdrv_lock_medium(bs, locked);
5233
5234 if (drv && drv->bdrv_lock_medium) {
5235 drv->bdrv_lock_medium(bs, locked);
5236 }
5237 }
5238
5239 /* needed for generic scsi interface */
5240
5241 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5242 {
5243 BlockDriver *drv = bs->drv;
5244
5245 if (drv && drv->bdrv_ioctl)
5246 return drv->bdrv_ioctl(bs, req, buf);
5247 return -ENOTSUP;
5248 }
5249
5250 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5251 unsigned long int req, void *buf,
5252 BlockCompletionFunc *cb, void *opaque)
5253 {
5254 BlockDriver *drv = bs->drv;
5255
5256 if (drv && drv->bdrv_aio_ioctl)
5257 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5258 return NULL;
5259 }
5260
5261 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5262 {
5263 bs->guest_block_size = align;
5264 }
5265
5266 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5267 {
5268 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5269 }
5270
5271 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5272 {
5273 return memset(qemu_blockalign(bs, size), 0, size);
5274 }
5275
5276 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5277 {
5278 size_t align = bdrv_opt_mem_align(bs);
5279
5280 /* Ensure that NULL is never returned on success */
5281 assert(align > 0);
5282 if (size == 0) {
5283 size = align;
5284 }
5285
5286 return qemu_try_memalign(align, size);
5287 }
5288
5289 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5290 {
5291 void *mem = qemu_try_blockalign(bs, size);
5292
5293 if (mem) {
5294 memset(mem, 0, size);
5295 }
5296
5297 return mem;
5298 }
5299
5300 /*
5301 * Check if all memory in this vector is aligned to bdrv_opt_mem_align(bs).
5302 */
5303 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5304 {
5305 int i;
5306 size_t alignment = bdrv_opt_mem_align(bs);
5307
5308 for (i = 0; i < qiov->niov; i++) {
5309 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5310 return false;
5311 }
5312 if (qiov->iov[i].iov_len % alignment) {
5313 return false;
5314 }
5315 }
5316
5317 return true;
5318 }
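
/*
 * Illustrative sketch (not part of this file): a caller preparing buffers
 * for a driver that needs aligned memory (e.g. a file opened with O_DIRECT)
 * can allocate with qemu_blockalign() and verify a vector with
 * bdrv_qiov_is_aligned() before deciding whether to bounce:
 *
 *     void *buf = qemu_blockalign(bs, len);        // aborts on failure
 *     void *maybe = qemu_try_blockalign(bs, len);  // NULL on failure
 *
 *     if (!bdrv_qiov_is_aligned(bs, qiov)) {
 *         // copy through an aligned bounce buffer instead
 *     }
 */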
5319
5320 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5321 Error **errp)
5322 {
5323 int64_t bitmap_size;
5324 BdrvDirtyBitmap *bitmap;
5325
5326 assert((granularity & (granularity - 1)) == 0);
5327
5328 granularity >>= BDRV_SECTOR_BITS;
5329 assert(granularity);
5330 bitmap_size = bdrv_nb_sectors(bs);
5331 if (bitmap_size < 0) {
5332 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5333 errno = -bitmap_size;
5334 return NULL;
5335 }
5336 bitmap = g_new0(BdrvDirtyBitmap, 1);
5337 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5338 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5339 return bitmap;
5340 }
5341
5342 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5343 {
5344 BdrvDirtyBitmap *bm, *next;
5345 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5346 if (bm == bitmap) {
5347 QLIST_REMOVE(bitmap, list);
5348 hbitmap_free(bitmap->bitmap);
5349 g_free(bitmap);
5350 return;
5351 }
5352 }
5353 }
5354
5355 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5356 {
5357 BdrvDirtyBitmap *bm;
5358 BlockDirtyInfoList *list = NULL;
5359 BlockDirtyInfoList **plist = &list;
5360
5361 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5362 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5363 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5364 info->count = bdrv_get_dirty_count(bs, bm);
5365 info->granularity =
5366 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5367 entry->value = info;
5368 *plist = entry;
5369 plist = &entry->next;
5370 }
5371
5372 return list;
5373 }
5374
5375 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5376 {
5377 if (bitmap) {
5378 return hbitmap_get(bitmap->bitmap, sector);
5379 } else {
5380 return 0;
5381 }
5382 }
5383
5384 void bdrv_dirty_iter_init(BlockDriverState *bs,
5385 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5386 {
5387 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5388 }
5389
5390 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5391 int nr_sectors)
5392 {
5393 BdrvDirtyBitmap *bitmap;
5394 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5395 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5396 }
5397 }
5398
5399 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5400 {
5401 BdrvDirtyBitmap *bitmap;
5402 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5403 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5404 }
5405 }
5406
5407 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5408 {
5409 return hbitmap_count(bitmap->bitmap);
5410 }
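
/*
 * Illustrative sketch (not part of this file): how a backup/mirror-style
 * user might track guest writes with a dirty bitmap and then visit the
 * dirty sectors with an HBitmapIter.
 *
 *     Error *err = NULL;
 *     BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, &err);
 *     // ... guest writes mark sectors dirty via bdrv_set_dirty() ...
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         // copy the granularity-sized chunk containing 'sector'
 *     }
 *     bdrv_release_dirty_bitmap(bs, bitmap);
 */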
5411
5412 /* Get a reference to bs */
5413 void bdrv_ref(BlockDriverState *bs)
5414 {
5415 bs->refcnt++;
5416 }
5417
5418 /* Release a previously grabbed reference to bs.
5419 * If, after releasing, the reference count drops to zero, the
5420 * BlockDriverState is deleted. */
5421 void bdrv_unref(BlockDriverState *bs)
5422 {
5423 if (!bs) {
5424 return;
5425 }
5426 assert(bs->refcnt > 0);
5427 if (--bs->refcnt == 0) {
5428 bdrv_delete(bs);
5429 }
5430 }
5431
5432 struct BdrvOpBlocker {
5433 Error *reason;
5434 QLIST_ENTRY(BdrvOpBlocker) list;
5435 };
5436
5437 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5438 {
5439 BdrvOpBlocker *blocker;
5440 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5441 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5442 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5443 if (errp) {
5444 error_setg(errp, "Device '%s' is busy: %s",
5445 bdrv_get_device_name(bs),
5446 error_get_pretty(blocker->reason));
5447 }
5448 return true;
5449 }
5450 return false;
5451 }
5452
5453 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5454 {
5455 BdrvOpBlocker *blocker;
5456 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5457
5458 blocker = g_new0(BdrvOpBlocker, 1);
5459 blocker->reason = reason;
5460 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5461 }
5462
5463 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5464 {
5465 BdrvOpBlocker *blocker, *next;
5466 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5467 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5468 if (blocker->reason == reason) {
5469 QLIST_REMOVE(blocker, list);
5470 g_free(blocker);
5471 }
5472 }
5473 }
5474
5475 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5476 {
5477 int i;
5478 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5479 bdrv_op_block(bs, i, reason);
5480 }
5481 }
5482
5483 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5484 {
5485 int i;
5486 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5487 bdrv_op_unblock(bs, i, reason);
5488 }
5489 }
5490
5491 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5492 {
5493 int i;
5494
5495 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5496 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5497 return false;
5498 }
5499 }
5500 return true;
5501 }
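
/*
 * Illustrative sketch (not part of this file): a block job protecting its
 * device against conflicting operations for the duration of the job.
 *
 *     Error *reason = NULL;
 *     error_setg(&reason, "block device is in use by a job");
 *     bdrv_op_block_all(bs, reason);
 *     // the job may selectively re-allow operations it tolerates:
 *     bdrv_op_unblock(bs, BLOCK_OP_TYPE_RESIZE, reason);
 *     // ... job runs ...
 *     bdrv_op_unblock_all(bs, reason);
 *     error_free(reason);
 */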
5502
5503 void bdrv_iostatus_enable(BlockDriverState *bs)
5504 {
5505 bs->iostatus_enabled = true;
5506 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5507 }
5508
5509 /* The I/O status is only enabled if the drive explicitly
5510 * enables it _and_ the VM is configured to stop on errors */
5511 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5512 {
5513 return (bs->iostatus_enabled &&
5514 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5515 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5516 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5517 }
5518
5519 void bdrv_iostatus_disable(BlockDriverState *bs)
5520 {
5521 bs->iostatus_enabled = false;
5522 }
5523
5524 void bdrv_iostatus_reset(BlockDriverState *bs)
5525 {
5526 if (bdrv_iostatus_is_enabled(bs)) {
5527 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5528 if (bs->job) {
5529 block_job_iostatus_reset(bs->job);
5530 }
5531 }
5532 }
5533
5534 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5535 {
5536 assert(bdrv_iostatus_is_enabled(bs));
5537 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5538 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5539 BLOCK_DEVICE_IO_STATUS_FAILED;
5540 }
5541 }
5542
5543 void bdrv_img_create(const char *filename, const char *fmt,
5544 const char *base_filename, const char *base_fmt,
5545 char *options, uint64_t img_size, int flags,
5546 Error **errp, bool quiet)
5547 {
5548 QemuOptsList *create_opts = NULL;
5549 QemuOpts *opts = NULL;
5550 const char *backing_fmt, *backing_file;
5551 int64_t size;
5552 BlockDriver *drv, *proto_drv;
5553 BlockDriver *backing_drv = NULL;
5554 Error *local_err = NULL;
5555 int ret = 0;
5556
5557 /* Find driver and parse its options */
5558 drv = bdrv_find_format(fmt);
5559 if (!drv) {
5560 error_setg(errp, "Unknown file format '%s'", fmt);
5561 return;
5562 }
5563
5564 proto_drv = bdrv_find_protocol(filename, true);
5565 if (!proto_drv) {
5566 error_setg(errp, "Unknown protocol '%s'", filename);
5567 return;
5568 }
5569
5570 if (!drv->create_opts) {
5571 error_setg(errp, "Format driver '%s' does not support image creation",
5572 drv->format_name);
5573 return;
5574 }
5575
5576 if (!proto_drv->create_opts) {
5577 error_setg(errp, "Protocol driver '%s' does not support image creation",
5578 proto_drv->format_name);
5579 return;
5580 }
5581
5582 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5583 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5584
5585 /* Create parameter list with default values */
5586 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5587 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5588
5589 /* Parse -o options */
5590 if (options) {
5591 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5592 error_setg(errp, "Invalid options for file format '%s'", fmt);
5593 goto out;
5594 }
5595 }
5596
5597 if (base_filename) {
5598 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5599 error_setg(errp, "Backing file not supported for file format '%s'",
5600 fmt);
5601 goto out;
5602 }
5603 }
5604
5605 if (base_fmt) {
5606 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5607 error_setg(errp, "Backing file format not supported for file "
5608 "format '%s'", fmt);
5609 goto out;
5610 }
5611 }
5612
5613 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5614 if (backing_file) {
5615 if (!strcmp(filename, backing_file)) {
5616 error_setg(errp, "Error: Trying to create an image with the "
5617 "same filename as the backing file");
5618 goto out;
5619 }
5620 }
5621
5622 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5623 if (backing_fmt) {
5624 backing_drv = bdrv_find_format(backing_fmt);
5625 if (!backing_drv) {
5626 error_setg(errp, "Unknown backing file format '%s'",
5627 backing_fmt);
5628 goto out;
5629 }
5630 }
5631
5632 // The size for the image must always be specified, with one exception:
5633 // If we are using a backing file, we can obtain the size from there
5634 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5635 if (size == -1) {
5636 if (backing_file) {
5637 BlockDriverState *bs;
5638 int64_t size;
5639 int back_flags;
5640
5641 /* backing files always opened read-only */
5642 back_flags =
5643 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5644
5645 bs = NULL;
5646 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5647 backing_drv, &local_err);
5648 if (ret < 0) {
5649 goto out;
5650 }
5651 size = bdrv_getlength(bs);
5652 if (size < 0) {
5653 error_setg_errno(errp, -size, "Could not get size of '%s'",
5654 backing_file);
5655 bdrv_unref(bs);
5656 goto out;
5657 }
5658
5659 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5660
5661 bdrv_unref(bs);
5662 } else {
5663 error_setg(errp, "Image creation needs a size parameter");
5664 goto out;
5665 }
5666 }
5667
5668 if (!quiet) {
5669 printf("Formatting '%s', fmt=%s", filename, fmt);
5670 qemu_opts_print(opts, " ");
5671 puts("");
5672 }
5673
5674 ret = bdrv_create(drv, filename, opts, &local_err);
5675
5676 if (ret == -EFBIG) {
5677 /* This is generally a better message than whatever the driver would
5678 * deliver (especially because of the cluster_size_hint), since that
5679 * is most probably not much different from "image too large". */
5680 const char *cluster_size_hint = "";
5681 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5682 cluster_size_hint = " (try using a larger cluster size)";
5683 }
5684 error_setg(errp, "The image size is too large for file format '%s'"
5685 "%s", fmt, cluster_size_hint);
5686 error_free(local_err);
5687 local_err = NULL;
5688 }
5689
5690 out:
5691 qemu_opts_del(opts);
5692 qemu_opts_free(create_opts);
5693 if (local_err) {
5694 error_propagate(errp, local_err);
5695 }
5696 }
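
/*
 * Illustrative sketch (not part of this file): creating a 1 GiB qcow2
 * image, roughly what 'qemu-img create -f qcow2 test.qcow2 1G' does.
 *
 *     Error *local_err = NULL;
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                     1024 * 1024 * 1024, 0, &local_err, true);
 *     if (local_err) {
 *         // report and free the error
 *     }
 */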
5697
5698 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5699 {
5700 return bs->aio_context;
5701 }
5702
5703 void bdrv_detach_aio_context(BlockDriverState *bs)
5704 {
5705 BdrvAioNotifier *baf;
5706
5707 if (!bs->drv) {
5708 return;
5709 }
5710
5711 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5712 baf->detach_aio_context(baf->opaque);
5713 }
5714
5715 if (bs->io_limits_enabled) {
5716 throttle_detach_aio_context(&bs->throttle_state);
5717 }
5718 if (bs->drv->bdrv_detach_aio_context) {
5719 bs->drv->bdrv_detach_aio_context(bs);
5720 }
5721 if (bs->file) {
5722 bdrv_detach_aio_context(bs->file);
5723 }
5724 if (bs->backing_hd) {
5725 bdrv_detach_aio_context(bs->backing_hd);
5726 }
5727
5728 bs->aio_context = NULL;
5729 }
5730
5731 void bdrv_attach_aio_context(BlockDriverState *bs,
5732 AioContext *new_context)
5733 {
5734 BdrvAioNotifier *ban;
5735
5736 if (!bs->drv) {
5737 return;
5738 }
5739
5740 bs->aio_context = new_context;
5741
5742 if (bs->backing_hd) {
5743 bdrv_attach_aio_context(bs->backing_hd, new_context);
5744 }
5745 if (bs->file) {
5746 bdrv_attach_aio_context(bs->file, new_context);
5747 }
5748 if (bs->drv->bdrv_attach_aio_context) {
5749 bs->drv->bdrv_attach_aio_context(bs, new_context);
5750 }
5751 if (bs->io_limits_enabled) {
5752 throttle_attach_aio_context(&bs->throttle_state, new_context);
5753 }
5754
5755 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5756 ban->attached_aio_context(new_context, ban->opaque);
5757 }
5758 }
5759
5760 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5761 {
5762 bdrv_drain_all(); /* ensure there are no in-flight requests */
5763
5764 bdrv_detach_aio_context(bs);
5765
5766 /* This function executes in the old AioContext so acquire the new one in
5767 * case it runs in a different thread.
5768 */
5769 aio_context_acquire(new_context);
5770 bdrv_attach_aio_context(bs, new_context);
5771 aio_context_release(new_context);
5772 }
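
/*
 * Illustrative sketch (not part of this file): handing a BDS over to an
 * IOThread's AioContext, as virtio-blk dataplane does; 'iothread' is a
 * hypothetical variable. All subsequent processing for bs then happens
 * in that context.
 *
 *     AioContext *ctx = iothread_get_aio_context(iothread);
 *     bdrv_set_aio_context(bs, ctx);   // drains in-flight requests first
 */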
5773
5774 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5775 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5776 void (*detach_aio_context)(void *opaque), void *opaque)
5777 {
5778 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5779 *ban = (BdrvAioNotifier){
5780 .attached_aio_context = attached_aio_context,
5781 .detach_aio_context = detach_aio_context,
5782 .opaque = opaque
5783 };
5784
5785 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5786 }
5787
5788 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5789 void (*attached_aio_context)(AioContext *,
5790 void *),
5791 void (*detach_aio_context)(void *),
5792 void *opaque)
5793 {
5794 BdrvAioNotifier *ban, *ban_next;
5795
5796 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5797 if (ban->attached_aio_context == attached_aio_context &&
5798 ban->detach_aio_context == detach_aio_context &&
5799 ban->opaque == opaque)
5800 {
5801 QLIST_REMOVE(ban, list);
5802 g_free(ban);
5803
5804 return;
5805 }
5806 }
5807
5808 abort();
5809 }
5810
5811 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5812 NotifierWithReturn *notifier)
5813 {
5814 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5815 }
5816
5817 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5818 BlockDriverAmendStatusCB *status_cb)
5819 {
5820 if (!bs->drv->bdrv_amend_options) {
5821 return -ENOTSUP;
5822 }
5823 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5824 }
5825
5826 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5827 * of block filters and by bdrv_is_first_non_filter.
5828 * It is used to test whether the given bs is the candidate, or to recurse
5829 * further down the node graph.
5830 */
5831 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5832 BlockDriverState *candidate)
5833 {
5834 /* return false if basic checks fail */
5835 if (!bs || !bs->drv) {
5836 return false;
5837 }
5838
5839 /* the code reached a non-filter block driver -> check whether the bs is
5840 * the same as the candidate; this is the recursion's termination condition.
5841 */
5842 if (!bs->drv->is_filter) {
5843 return bs == candidate;
5844 }
5845 /* Down this path the driver is a block filter driver */
5846
5847 /* If the block filter recursion method is defined use it to recurse down
5848 * the node graph.
5849 */
5850 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5851 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5852 }
5853
5854 /* the driver is a block filter but does not allow recursion -> return false
5855 */
5856 return false;
5857 }
5858
5859 /* This function checks if the candidate is the first non-filter bs down its
5860 * bs chain. Since we don't have pointers to parents, it explores all bs chains
5861 * from the top. Some filters can choose not to pass down the recursion.
5862 */
5863 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5864 {
5865 BlockDriverState *bs;
5866
5867 /* walk down the bs forest recursively */
5868 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5869 bool perm;
5870
5871 /* try to recurse in this top level bs */
5872 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5873
5874 /* candidate is the first non filter */
5875 if (perm) {
5876 return true;
5877 }
5878 }
5879
5880 return false;
5881 }
5882
5883 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5884 {
5885 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5886 AioContext *aio_context;
5887
5888 if (!to_replace_bs) {
5889 error_setg(errp, "Node name '%s' not found", node_name);
5890 return NULL;
5891 }
5892
5893 aio_context = bdrv_get_aio_context(to_replace_bs);
5894 aio_context_acquire(aio_context);
5895
5896 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5897 to_replace_bs = NULL;
5898 goto out;
5899 }
5900
5901 /* To prevent data corruption, we don't want an arbitrary node of the BDS
5902 * chain to be replaced, only the topmost non-filter node.
5903 * Another benefit is that this test excludes backing files, which are
5904 * blocked by the backing blockers.
5905 */
5906 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5907 error_setg(errp, "Only top most non filter can be replaced");
5908 to_replace_bs = NULL;
5909 goto out;
5910 }
5911
5912 out:
5913 aio_context_release(aio_context);
5914 return to_replace_bs;
5915 }
5916
5917 void bdrv_io_plug(BlockDriverState *bs)
5918 {
5919 BlockDriver *drv = bs->drv;
5920 if (drv && drv->bdrv_io_plug) {
5921 drv->bdrv_io_plug(bs);
5922 } else if (bs->file) {
5923 bdrv_io_plug(bs->file);
5924 }
5925 }
5926
5927 void bdrv_io_unplug(BlockDriverState *bs)
5928 {
5929 BlockDriver *drv = bs->drv;
5930 if (drv && drv->bdrv_io_unplug) {
5931 drv->bdrv_io_unplug(bs);
5932 } else if (bs->file) {
5933 bdrv_io_unplug(bs->file);
5934 }
5935 }
5936
5937 void bdrv_flush_io_queue(BlockDriverState *bs)
5938 {
5939 BlockDriver *drv = bs->drv;
5940 if (drv && drv->bdrv_flush_io_queue) {
5941 drv->bdrv_flush_io_queue(bs);
5942 } else if (bs->file) {
5943 bdrv_flush_io_queue(bs->file);
5944 }
5945 }
5946
5947 static bool append_open_options(QDict *d, BlockDriverState *bs)
5948 {
5949 const QDictEntry *entry;
5950 bool found_any = false;
5951
5952 for (entry = qdict_first(bs->options); entry;
5953 entry = qdict_next(bs->options, entry))
5954 {
5955 /* Only take options for this level and exclude all non-driver-specific
5956 * options */
5957 if (!strchr(qdict_entry_key(entry), '.') &&
5958 strcmp(qdict_entry_key(entry), "node-name"))
5959 {
5960 qobject_incref(qdict_entry_value(entry));
5961 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5962 found_any = true;
5963 }
5964 }
5965
5966 return found_any;
5967 }
5968
5969 /* Updates the following BDS fields:
5970 * - exact_filename: A filename which may be used for opening a block device
5971 * which (mostly) equals the given BDS (even without any
5972 * other options; so reading and writing must return the same
5973 * results, but caching etc. may be different)
5974 * - full_open_options: Options which, when given when opening a block device
5975 * (without a filename), result in a BDS (mostly)
5976 * equalling the given one
5977 * - filename: If exact_filename is set, it is copied here. Otherwise,
5978 * full_open_options is converted to a JSON object, prefixed with
5979 * "json:" (for use through the JSON pseudo protocol) and put here.
5980 */
5981 void bdrv_refresh_filename(BlockDriverState *bs)
5982 {
5983 BlockDriver *drv = bs->drv;
5984 QDict *opts;
5985
5986 if (!drv) {
5987 return;
5988 }
5989
5990 /* This BDS's file name will most probably depend on its file's name, so
5991 * refresh that first */
5992 if (bs->file) {
5993 bdrv_refresh_filename(bs->file);
5994 }
5995
5996 if (drv->bdrv_refresh_filename) {
5997 /* Obsolete information is of no use here, so drop the old file name
5998 * information before refreshing it */
5999 bs->exact_filename[0] = '\0';
6000 if (bs->full_open_options) {
6001 QDECREF(bs->full_open_options);
6002 bs->full_open_options = NULL;
6003 }
6004
6005 drv->bdrv_refresh_filename(bs);
6006 } else if (bs->file) {
6007 /* Try to reconstruct valid information from the underlying file */
6008 bool has_open_options;
6009
6010 bs->exact_filename[0] = '\0';
6011 if (bs->full_open_options) {
6012 QDECREF(bs->full_open_options);
6013 bs->full_open_options = NULL;
6014 }
6015
6016 opts = qdict_new();
6017 has_open_options = append_open_options(opts, bs);
6018
6019 /* If no specific options have been given for this BDS, the filename of
6020 * the underlying file should suffice for this one as well */
6021 if (bs->file->exact_filename[0] && !has_open_options) {
6022 strcpy(bs->exact_filename, bs->file->exact_filename);
6023 }
6024 /* Reconstructing the full options QDict is simple for most format block
6025 * drivers, as long as the full options are known for the underlying
6026 * file BDS. The full options QDict of that file BDS should somehow
6027 * contain a representation of the filename, therefore the following
6028 * suffices without querying the (exact_)filename of this BDS. */
6029 if (bs->file->full_open_options) {
6030 qdict_put_obj(opts, "driver",
6031 QOBJECT(qstring_from_str(drv->format_name)));
6032 QINCREF(bs->file->full_open_options);
6033 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6034
6035 bs->full_open_options = opts;
6036 } else {
6037 QDECREF(opts);
6038 }
6039 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6040 /* There is no underlying file BDS (at least referenced by BDS.file),
6041 * so the full options QDict should be equal to the options given
6042 * specifically for this block device when it was opened (plus the
6043 * driver specification).
6044 * Because those options don't change, there is no need to update
6045 * full_open_options when it's already set. */
6046
6047 opts = qdict_new();
6048 append_open_options(opts, bs);
6049 qdict_put_obj(opts, "driver",
6050 QOBJECT(qstring_from_str(drv->format_name)));
6051
6052 if (bs->exact_filename[0]) {
6053 /* This may not work for all block protocol drivers (some may
6054 * require this filename to be parsed), but we have to find some
6055 * default solution here, so just include it. If some block driver
6056 * does not support pure options without any filename at all or
6057 * needs some special format of the options QDict, it needs to
6058 * implement the driver-specific bdrv_refresh_filename() function.
6059 */
6060 qdict_put_obj(opts, "filename",
6061 QOBJECT(qstring_from_str(bs->exact_filename)));
6062 }
6063
6064 bs->full_open_options = opts;
6065 }
6066
6067 if (bs->exact_filename[0]) {
6068 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6069 } else if (bs->full_open_options) {
6070 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6071 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6072 qstring_get_str(json));
6073 QDECREF(json);
6074 }
6075 }
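
/*
 * Illustrative example (not part of this file): when no plain filename can
 * represent the BDS, bs->filename ends up holding the "json:" form built
 * from full_open_options, e.g. for qcow2 on a raw file:
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "test.qcow2"}}
 */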
6076
6077 /* The purpose of this accessor function is to allow the device models to
6078 * access the BlockAcctStats structure embedded inside a BlockDriverState
6079 * without being aware of the BlockDriverState structure layout.
6080 * It will go away when the BlockAcctStats structure is moved inside
6081 * the device models.
6082 */
6083 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6084 {
6085 return &bs->stats;
6086 }