/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "sysemu/qtest.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

/**
 * A BdrvDirtyBitmap can be in three possible states:
 * (1) successor is NULL and disabled is false: full r/w mode
 * (2) successor is NULL and disabled is true: read only mode ("disabled")
 * (3) successor is set: frozen mode.
 *     A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set,
 *     or enabled. A frozen bitmap can only abdicate() or reclaim().
 */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
    char *name;                 /* Optional non-empty unique ID */
    int64_t size;               /* Size of the bitmap (Number of sectors) */
    bool disabled;              /* Bitmap is read-only */
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors);
static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
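
/*
 * Illustrative examples (not part of the original source): "c:" and "D:"
 * satisfy is_windows_drive(), as do the device-namespace forms
 * "\\.\PhysicalDrive0" and "//./d:". A path such as "c:\disk.img" only
 * satisfies is_windows_drive_prefix(), not is_windows_drive().
 */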

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    int clock_type = QEMU_CLOCK_REALTIME;

    if (qtest_enabled()) {
        /* For testing block IO throttling only */
        clock_type = QEMU_CLOCK_VIRTUAL;
    }
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  clock_type,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
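
/*
 * Illustrative usage sketch (assumes the ThrottleConfig layout of this
 * era's qemu/throttle.h; not taken from this file):
 *
 *     ThrottleConfig cfg = {
 *         .buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024,  // 10 MiB/s
 *     };
 *     bdrv_io_limits_enable(bs);     // per the comment above, enable first
 *     bdrv_set_io_limits(bs, &cfg);
 */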

/* This function makes an IO wait if needed
 *
 * @bytes: the number of bytes of the IO
 * @is_write: is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* check if this IO must wait */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this IO */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the IO will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
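
/*
 * Examples: path_has_protocol("nbd:localhost:10809") returns true;
 * path_has_protocol("disk.qcow2") and path_has_protocol("/a/b:c") return
 * false, since only a ':' before the first path separator counts. On
 * Windows, drive specifications such as "d:\disk.raw" are excluded first.
 */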

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
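
/*
 * Examples: path_combine(dest, sz, "/vm/top.qcow2", "base.qcow2") yields
 * "/vm/base.qcow2"; an absolute filename such as "/other/base.qcow2" is
 * copied verbatim. URL-style base paths keep their prefix, e.g.
 * ("http://host/dir/top.qcow2", "base.qcow2") -> "http://host/dir/base.qcow2".
 */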

void bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                  const char *backing,
                                                  char *dest, size_t sz,
                                                  Error **errp)
{
    if (backing[0] == '\0' || path_has_protocol(backing) ||
        path_is_absolute(backing))
    {
        pstrcpy(dest, sz, backing);
    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
        error_setg(errp, "Cannot use relative backing file names for '%s'",
                   backed);
    } else {
        path_combine(dest, sz, backed, backing);
    }
}
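
/*
 * For instance, a backed image "/vm/top.qcow2" whose backing entry is
 * "base.qcow2" resolves to "/vm/base.qcow2", while a "json:{...}" backed
 * filename cannot anchor a relative backing name and sets an error instead.
 */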

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
                                    Error **errp)
{
    char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;

    bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
                                                 dest, sz, errp);
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
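
/*
 * Illustrative sketch of a direct bdrv_create() call, mirroring the
 * pattern used by bdrv_append_temp_snapshot() later in this file (the
 * filename and size here are arbitrary examples):
 *
 *     QemuOpts *opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
 *                                       &error_abort);
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024,
 *                         &error_abort);
 *     ret = bdrv_create(&bdrv_qcow2, "/tmp/overlay.qcow2", opts, &local_err);
 *     qemu_opts_del(opts);
 */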

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true, errp);
    if (drv == NULL) {
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * Try to get @bs's logical and physical block size.
 * On success, store them in @bsz struct and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_blocksizes) {
        return drv->bdrv_probe_blocksizes(bs, bsz);
    }

    return -ENOTSUP;
}

/**
 * Try to get @bs's geometry (cyls, heads, sectors).
 * On success, store them in @geo struct and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_geometry) {
        return drv->bdrv_probe_geometry(bs, geo);
    }

    return -ENOTSUP;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix,
                                Error **errp)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }

    error_setg(errp, "Unknown protocol '%s'", protocol);
    return NULL;
}
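
/*
 * For example, "nbd:localhost:10809" selects the driver whose
 * protocol_name is "nbd", whereas a plain path like "/vm/disk.img" (no
 * "<protocol>:" prefix) falls back to &bdrv_file.
 */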

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf contains the image's first @buf_size bytes.
 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 * but can be smaller if the image file is smaller)
 * @filename is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
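
/*
 * Enable/disable calls must therefore pair up; e.g. with two independent
 * users:
 *
 *     bdrv_enable_copy_on_read(bs);   // user A
 *     bdrv_enable_copy_on_read(bs);   // user B
 *     bdrv_disable_copy_on_read(bs);  // A done, copy-on-read still active
 *     bdrv_disable_copy_on_read(bs);  // B done, copy-on-read now off
 */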

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    if (bs->encrypted) {
        error_report("Encrypted images are deprecated");
        error_printf("Support for them will be removed in a future release.\n"
                     "You can use 'qemu-img convert' to convert your image"
                     " to an unencrypted one.\n");
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
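
/*
 * For example, the pseudo-filename
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "/vm/disk.qcow2"}}
 *
 * parses into a QDict that qdict_flatten() turns into
 * { "driver": "qcow2", "file.driver": "file",
 *   "file.filename": "/vm/disk.qcow2" }.
 */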

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                       "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename, errp);
                if (!drv) {
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{

    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "node is used as backing hd of '%s'",
                   bdrv_get_device_or_node_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
                                       &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            QDECREF(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
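
/*
 * For example, with bdref_key "file" and flattened options
 * { "file.driver": "file", "file.filename": "/vm/disk.raw" }, the
 * extracted image_options are { "driver": "file",
 * "filename": "/vm/disk.raw" }, which are then handed to bdrv_open()
 * for the child node.
 */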

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
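
/*
 * Sketch of the multi-device transactional pattern built from the helpers
 * above (flag values are illustrative):
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs_a, flags_a);
 *     queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *     ret = bdrv_reopen_multiple(queue, &err);
 *     // prepares every entry, then commits all on success or aborts all
 *     // on failure; the queue and its entries are freed either way
 */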

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_setg(errp, "Node '%s' is read only",
                   bdrv_get_device_or_node_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_setg(errp, "Block format '%s' used by node '%s' "
                   "does not support reopening files", drv->format_name,
                   bdrv_get_device_or_node_name(reopen_state->bs));
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
1917
1918
1919 void bdrv_close(BlockDriverState *bs)
1920 {
1921 BdrvAioNotifier *ban, *ban_next;
1922
1923 if (bs->job) {
1924 block_job_cancel_sync(bs->job);
1925 }
1926 bdrv_drain_all(); /* complete I/O */
1927 bdrv_flush(bs);
1928 bdrv_drain_all(); /* in case flush left pending I/O */
1929 notifier_list_notify(&bs->close_notifiers, bs);
1930
1931 if (bs->drv) {
1932 if (bs->backing_hd) {
1933 BlockDriverState *backing_hd = bs->backing_hd;
1934 bdrv_set_backing_hd(bs, NULL);
1935 bdrv_unref(backing_hd);
1936 }
1937 bs->drv->bdrv_close(bs);
1938 g_free(bs->opaque);
1939 bs->opaque = NULL;
1940 bs->drv = NULL;
1941 bs->copy_on_read = 0;
1942 bs->backing_file[0] = '\0';
1943 bs->backing_format[0] = '\0';
1944 bs->total_sectors = 0;
1945 bs->encrypted = 0;
1946 bs->valid_key = 0;
1947 bs->sg = 0;
1948 bs->zero_beyond_eof = false;
1949 QDECREF(bs->options);
1950 bs->options = NULL;
1951 QDECREF(bs->full_open_options);
1952 bs->full_open_options = NULL;
1953
1954 if (bs->file != NULL) {
1955 bdrv_unref(bs->file);
1956 bs->file = NULL;
1957 }
1958 }
1959
1960 if (bs->blk) {
1961 blk_dev_change_media_cb(bs->blk, false);
1962 }
1963
1964 /*throttling disk I/O limits*/
1965 if (bs->io_limits_enabled) {
1966 bdrv_io_limits_disable(bs);
1967 }
1968
1969 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1970 g_free(ban);
1971 }
1972 QLIST_INIT(&bs->aio_notifiers);
1973 }
1974
1975 void bdrv_close_all(void)
1976 {
1977 BlockDriverState *bs;
1978
1979 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1980 AioContext *aio_context = bdrv_get_aio_context(bs);
1981
1982 aio_context_acquire(aio_context);
1983 bdrv_close(bs);
1984 aio_context_release(aio_context);
1985 }
1986 }
1987
1988 /* Check if any requests are in-flight (including throttled requests) */
1989 static bool bdrv_requests_pending(BlockDriverState *bs)
1990 {
1991 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1992 return true;
1993 }
1994 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1995 return true;
1996 }
1997 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1998 return true;
1999 }
2000 if (bs->file && bdrv_requests_pending(bs->file)) {
2001 return true;
2002 }
2003 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
2004 return true;
2005 }
2006 return false;
2007 }
2008
2009 static bool bdrv_drain_one(BlockDriverState *bs)
2010 {
2011 bool bs_busy;
2012
2013 bdrv_flush_io_queue(bs);
2014 bdrv_start_throttled_reqs(bs);
2015 bs_busy = bdrv_requests_pending(bs);
2016 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
2017 return bs_busy;
2018 }
2019
2020 /*
2021 * Wait for pending requests to complete on a single BlockDriverState subtree
2022 *
2023 * See the warning in bdrv_drain_all(). This function can only be called if
2024 * you are sure nothing can generate I/O because you have op blockers
2025 * installed.
2026 *
2027 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
2028 * AioContext.
2029 */
2030 void bdrv_drain(BlockDriverState *bs)
2031 {
2032 while (bdrv_drain_one(bs)) {
2033 /* Keep iterating */
2034 }
2035 }
2036
2037 /*
2038 * Wait for pending requests to complete across all BlockDriverStates
2039 *
2040 * This function does not flush data to disk, use bdrv_flush_all() for that
2041 * after calling this function.
2042 *
2043 * Note that completion of an asynchronous I/O operation can trigger any
2044 * number of other I/O operations on other devices---for example a coroutine
2045 * can be arbitrarily complex and a constant flow of I/O can come until the
2046 * coroutine is complete. Because of this, it is not possible to have a
2047 * function to drain a single device's I/O queue.
2048 */
2049 void bdrv_drain_all(void)
2050 {
2051 /* Always run first iteration so any pending completion BHs run */
2052 bool busy = true;
2053 BlockDriverState *bs;
2054
2055 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2056 AioContext *aio_context = bdrv_get_aio_context(bs);
2057
2058 aio_context_acquire(aio_context);
2059 if (bs->job) {
2060 block_job_pause(bs->job);
2061 }
2062 aio_context_release(aio_context);
2063 }
2064
2065 while (busy) {
2066 busy = false;
2067
2068 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2069 AioContext *aio_context = bdrv_get_aio_context(bs);
2070
2071 aio_context_acquire(aio_context);
2072 busy |= bdrv_drain_one(bs);
2073 aio_context_release(aio_context);
2074 }
2075 }
2076
2077 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2078 AioContext *aio_context = bdrv_get_aio_context(bs);
2079
2080 aio_context_acquire(aio_context);
2081 if (bs->job) {
2082 block_job_resume(bs->job);
2083 }
2084 aio_context_release(aio_context);
2085 }
2086 }
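
/*
 * Illustrative sketch (not part of the original file): the typical
 * quiesce-then-flush sequence a caller might use before a global
 * operation. bdrv_drain_all() only waits for requests to complete;
 * data is made stable by the separate bdrv_flush_all() call, as the
 * comment above notes.
 */
static int example_quiesce_all_devices(void)
{
    int ret;

    bdrv_drain_all();        /* wait for in-flight and throttled requests */
    ret = bdrv_flush_all();  /* then push completed writes to stable storage */
    return ret;              /* first error encountered, or 0 */
}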
2087
2088 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
2089 * and graph_bdrv_states lists. Also clear node_name to prevent a double
2090 * remove. */
2091 void bdrv_make_anon(BlockDriverState *bs)
2092 {
2093 /*
2094 * Take care to remove bs from bdrv_states only when it's actually
2095 * in it. Note that bs->device_list.tqe_prev is initially null,
2096 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
2097 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
2098 * resetting it to null on remove.
2099 */
2100 if (bs->device_list.tqe_prev) {
2101 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
2102 bs->device_list.tqe_prev = NULL;
2103 }
2104 if (bs->node_name[0] != '\0') {
2105 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2106 }
2107 bs->node_name[0] = '\0';
2108 }
2109
2110 static void bdrv_rebind(BlockDriverState *bs)
2111 {
2112 if (bs->drv && bs->drv->bdrv_rebind) {
2113 bs->drv->bdrv_rebind(bs);
2114 }
2115 }
2116
2117 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2118 BlockDriverState *bs_src)
2119 {
2120 /* move some fields that need to stay attached to the device */
2121
2122 /* dev info */
2123 bs_dest->guest_block_size = bs_src->guest_block_size;
2124 bs_dest->copy_on_read = bs_src->copy_on_read;
2125
2126 bs_dest->enable_write_cache = bs_src->enable_write_cache;
2127
2128 /* i/o throttled req */
2129 memcpy(&bs_dest->throttle_state,
2130 &bs_src->throttle_state,
2131 sizeof(ThrottleState));
2132 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2133 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
2134 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
2135
2136 /* r/w error */
2137 bs_dest->on_read_error = bs_src->on_read_error;
2138 bs_dest->on_write_error = bs_src->on_write_error;
2139
2140 /* i/o status */
2141 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2142 bs_dest->iostatus = bs_src->iostatus;
2143
2144 /* dirty bitmap */
2145 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
2146
2147 /* reference count */
2148 bs_dest->refcnt = bs_src->refcnt;
2149
2150 /* job */
2151 bs_dest->job = bs_src->job;
2152
2153 /* keep the same entry in bdrv_states */
2154 bs_dest->device_list = bs_src->device_list;
2155 bs_dest->blk = bs_src->blk;
2156
2157 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2158 sizeof(bs_dest->op_blockers));
2159 }
2160
2161 /*
2162 * Swap bs contents for two image chains while they are live,
2163 * while keeping required fields on the BlockDriverState that is
2164 * actually attached to a device.
2165 *
2166 * This will modify the BlockDriverState fields, and swap contents
2167 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2168 *
2169 * bs_new must not be attached to a BlockBackend.
2170 *
2171 * This function does not create any image files.
2172 */
2173 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2174 {
2175 BlockDriverState tmp;
2176
2177 /* The code needs to swap the node_name, but simply swapping node_list won't
2178 * work, so first remove the nodes from the graph list, do the swap, then
2179 * insert them back if needed.
2180 */
2181 if (bs_new->node_name[0] != '\0') {
2182 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2183 }
2184 if (bs_old->node_name[0] != '\0') {
2185 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2186 }
2187
2188 /* bs_new must be unattached and shouldn't have anything fancy enabled */
2189 assert(!bs_new->blk);
2190 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2191 assert(bs_new->job == NULL);
2192 assert(bs_new->io_limits_enabled == false);
2193 assert(!throttle_have_timer(&bs_new->throttle_state));
2194
2195 tmp = *bs_new;
2196 *bs_new = *bs_old;
2197 *bs_old = tmp;
2198
2199 /* there are some fields that should not be swapped, move them back */
2200 bdrv_move_feature_fields(&tmp, bs_old);
2201 bdrv_move_feature_fields(bs_old, bs_new);
2202 bdrv_move_feature_fields(bs_new, &tmp);
2203
2204 /* bs_new must remain unattached */
2205 assert(!bs_new->blk);
2206
2207 /* Check a few fields that should remain attached to the device */
2208 assert(bs_new->job == NULL);
2209 assert(bs_new->io_limits_enabled == false);
2210 assert(!throttle_have_timer(&bs_new->throttle_state));
2211
2212 /* insert the nodes back into the graph node list if needed */
2213 if (bs_new->node_name[0] != '\0') {
2214 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2215 }
2216 if (bs_old->node_name[0] != '\0') {
2217 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2218 }
2219
2220 bdrv_rebind(bs_new);
2221 bdrv_rebind(bs_old);
2222 }
2223
2224 /*
2225 * Add new bs contents at the top of an image chain while the chain is
2226 * live, while keeping required fields on the top layer.
2227 *
2228 * This will modify the BlockDriverState fields, and swap contents
2229 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2230 *
2231 * bs_new must not be attached to a BlockBackend.
2232 *
2233 * This function does not create any image files.
2234 */
2235 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2236 {
2237 bdrv_swap(bs_new, bs_top);
2238
2239 /* After the swap, bs_new holds the old top-layer contents, so make it
2240 * the backing file of the new top. */
2241 bdrv_set_backing_hd(bs_top, bs_new);
2242 }
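
/*
 * Illustrative sketch (not part of the original file): how external
 * snapshot code conceptually uses bdrv_append(). "overlay_bs" is a
 * hypothetical, freshly opened and still unattached BDS for the new
 * top image.
 */
static void example_install_snapshot_overlay(BlockDriverState *overlay_bs,
                                             BlockDriverState *active_bs)
{
    /* After this call the device keeps using active_bs, whose contents
     * are now those of the overlay; the old top sits below it as the
     * backing file. */
    bdrv_append(overlay_bs, active_bs);
}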
2243
2244 static void bdrv_delete(BlockDriverState *bs)
2245 {
2246 assert(!bs->job);
2247 assert(bdrv_op_blocker_is_empty(bs));
2248 assert(!bs->refcnt);
2249 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2250
2251 bdrv_close(bs);
2252
2253 /* remove from list, if necessary */
2254 bdrv_make_anon(bs);
2255
2256 g_free(bs);
2257 }
2258
2259 /*
2260 * Run consistency checks on an image
2261 *
2262 * Returns 0 if the check could be completed (it doesn't mean that the image is
2263 * free of errors) or -errno when an internal error occurred. The results of the
2264 * check are stored in res.
2265 */
2266 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2267 {
2268 if (bs->drv == NULL) {
2269 return -ENOMEDIUM;
2270 }
2271 if (bs->drv->bdrv_check == NULL) {
2272 return -ENOTSUP;
2273 }
2274
2275 memset(res, 0, sizeof(*res));
2276 return bs->drv->bdrv_check(bs, res, fix);
2277 }
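
/*
 * Illustrative sketch (not part of the original file): driving a
 * repairing consistency check, roughly what "qemu-img check -r leaks"
 * does. The BdrvCheckResult field names (corruptions, leaks) follow
 * the definition in block.h of this tree.
 */
static int example_check_and_repair_leaks(BlockDriverState *bs)
{
    BdrvCheckResult result;
    int ret = bdrv_check(bs, &result, BDRV_FIX_LEAKS);

    if (ret < 0) {
        return ret;                 /* the check itself failed */
    }
    /* ret == 0 only means the check ran; inspect the result */
    return (result.corruptions || result.leaks) ? -EIO : 0;
}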
2278
2279 #define COMMIT_BUF_SECTORS 2048
2280
2281 /* commit COW file into the raw image */
2282 int bdrv_commit(BlockDriverState *bs)
2283 {
2284 BlockDriver *drv = bs->drv;
2285 int64_t sector, total_sectors, length, backing_length;
2286 int n, ro, open_flags;
2287 int ret = 0;
2288 uint8_t *buf = NULL;
2289
2290 if (!drv)
2291 return -ENOMEDIUM;
2292
2293 if (!bs->backing_hd) {
2294 return -ENOTSUP;
2295 }
2296
2297 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
2298 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
2299 return -EBUSY;
2300 }
2301
2302 ro = bs->backing_hd->read_only;
2303 open_flags = bs->backing_hd->open_flags;
2304
2305 if (ro) {
2306 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2307 return -EACCES;
2308 }
2309 }
2310
2311 length = bdrv_getlength(bs);
2312 if (length < 0) {
2313 ret = length;
2314 goto ro_cleanup;
2315 }
2316
2317 backing_length = bdrv_getlength(bs->backing_hd);
2318 if (backing_length < 0) {
2319 ret = backing_length;
2320 goto ro_cleanup;
2321 }
2322
2323 /* If our top snapshot is larger than the backing file image,
2324 * grow the backing file image if possible. If not possible,
2325 * we must return an error */
2326 if (length > backing_length) {
2327 ret = bdrv_truncate(bs->backing_hd, length);
2328 if (ret < 0) {
2329 goto ro_cleanup;
2330 }
2331 }
2332
2333 total_sectors = length >> BDRV_SECTOR_BITS;
2334
2335 /* qemu_try_blockalign() for bs will choose an alignment that works for
2336 * bs->backing_hd as well, so no need to compare the alignment manually. */
2337 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2338 if (buf == NULL) {
2339 ret = -ENOMEM;
2340 goto ro_cleanup;
2341 }
2342
2343 for (sector = 0; sector < total_sectors; sector += n) {
2344 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2345 if (ret < 0) {
2346 goto ro_cleanup;
2347 }
2348 if (ret) {
2349 ret = bdrv_read(bs, sector, buf, n);
2350 if (ret < 0) {
2351 goto ro_cleanup;
2352 }
2353
2354 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2355 if (ret < 0) {
2356 goto ro_cleanup;
2357 }
2358 }
2359 }
2360
2361 if (drv->bdrv_make_empty) {
2362 ret = drv->bdrv_make_empty(bs);
2363 if (ret < 0) {
2364 goto ro_cleanup;
2365 }
2366 bdrv_flush(bs);
2367 }
2368
2369 /*
2370 * Make sure all data we wrote to the backing device is actually
2371 * stable on disk.
2372 */
2373 if (bs->backing_hd) {
2374 bdrv_flush(bs->backing_hd);
2375 }
2376
2377 ret = 0;
2378 ro_cleanup:
2379 qemu_vfree(buf);
2380
2381 if (ro) {
2382 /* ignoring error return here */
2383 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2384 }
2385
2386 return ret;
2387 }
2388
2389 int bdrv_commit_all(void)
2390 {
2391 BlockDriverState *bs;
2392
2393 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2394 AioContext *aio_context = bdrv_get_aio_context(bs);
2395
2396 aio_context_acquire(aio_context);
2397 if (bs->drv && bs->backing_hd) {
2398 int ret = bdrv_commit(bs);
2399 if (ret < 0) {
2400 aio_context_release(aio_context);
2401 return ret;
2402 }
2403 }
2404 aio_context_release(aio_context);
2405 }
2406 return 0;
2407 }
2408
2409 /**
2410 * Remove an active request from the tracked requests list
2411 *
2412 * This function should be called when a tracked request is completing.
2413 */
2414 static void tracked_request_end(BdrvTrackedRequest *req)
2415 {
2416 if (req->serialising) {
2417 req->bs->serialising_in_flight--;
2418 }
2419
2420 QLIST_REMOVE(req, list);
2421 qemu_co_queue_restart_all(&req->wait_queue);
2422 }
2423
2424 /**
2425 * Add an active request to the tracked requests list
2426 */
2427 static void tracked_request_begin(BdrvTrackedRequest *req,
2428 BlockDriverState *bs,
2429 int64_t offset,
2430 unsigned int bytes, bool is_write)
2431 {
2432 *req = (BdrvTrackedRequest){
2433 .bs = bs,
2434 .offset = offset,
2435 .bytes = bytes,
2436 .is_write = is_write,
2437 .co = qemu_coroutine_self(),
2438 .serialising = false,
2439 .overlap_offset = offset,
2440 .overlap_bytes = bytes,
2441 };
2442
2443 qemu_co_queue_init(&req->wait_queue);
2444
2445 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2446 }
2447
2448 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2449 {
2450 int64_t overlap_offset = req->offset & ~(align - 1);
2451 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2452 - overlap_offset;
2453
2454 if (!req->serialising) {
2455 req->bs->serialising_in_flight++;
2456 req->serialising = true;
2457 }
2458
2459 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2460 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2461 }
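
/*
 * Worked example (added for clarity, not in the original file): with
 * align = 512, a request at offset 700 with 100 bytes gets
 * overlap_offset = 700 & ~511 = 512 and
 * overlap_bytes  = ROUND_UP(800, 512) - 512 = 1024 - 512 = 512,
 * i.e. the serialising window grows to cover whole alignment units.
 */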
2462
2463 /**
2464 * Round a region to cluster boundaries
2465 */
2466 void bdrv_round_to_clusters(BlockDriverState *bs,
2467 int64_t sector_num, int nb_sectors,
2468 int64_t *cluster_sector_num,
2469 int *cluster_nb_sectors)
2470 {
2471 BlockDriverInfo bdi;
2472
2473 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2474 *cluster_sector_num = sector_num;
2475 *cluster_nb_sectors = nb_sectors;
2476 } else {
2477 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2478 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2479 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2480 nb_sectors, c);
2481 }
2482 }
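
/*
 * Worked example (added for clarity, not in the original file): for a
 * 64 KiB cluster size, c = 65536 / 512 = 128 sectors. A request at
 * sector_num = 300 with nb_sectors = 10 yields
 * *cluster_sector_num = QEMU_ALIGN_DOWN(300, 128) = 256 and
 * *cluster_nb_sectors = QEMU_ALIGN_UP(300 - 256 + 10, 128) = 128,
 * i.e. the region [300, 310) widens to the single cluster [256, 384).
 */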
2483
2484 static int bdrv_get_cluster_size(BlockDriverState *bs)
2485 {
2486 BlockDriverInfo bdi;
2487 int ret;
2488
2489 ret = bdrv_get_info(bs, &bdi);
2490 if (ret < 0 || bdi.cluster_size == 0) {
2491 return bs->request_alignment;
2492 } else {
2493 return bdi.cluster_size;
2494 }
2495 }
2496
2497 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2498 int64_t offset, unsigned int bytes)
2499 {
2500 /* aaaa bbbb */
2501 if (offset >= req->overlap_offset + req->overlap_bytes) {
2502 return false;
2503 }
2504 /* bbbb aaaa */
2505 if (req->overlap_offset >= offset + bytes) {
2506 return false;
2507 }
2508 return true;
2509 }
2510
2511 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2512 {
2513 BlockDriverState *bs = self->bs;
2514 BdrvTrackedRequest *req;
2515 bool retry;
2516 bool waited = false;
2517
2518 if (!bs->serialising_in_flight) {
2519 return false;
2520 }
2521
2522 do {
2523 retry = false;
2524 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2525 if (req == self || (!req->serialising && !self->serialising)) {
2526 continue;
2527 }
2528 if (tracked_request_overlaps(req, self->overlap_offset,
2529 self->overlap_bytes))
2530 {
2531 /* Hitting this means there was a reentrant request, for
2532 * example, a block driver issuing nested requests. This must
2533 * never happen since it means deadlock.
2534 */
2535 assert(qemu_coroutine_self() != req->co);
2536
2537 /* If the request is already (indirectly) waiting for us, or
2538 * will wait for us as soon as it wakes up, then just go on
2539 * (instead of producing a deadlock in the former case). */
2540 if (!req->waiting_for) {
2541 self->waiting_for = req;
2542 qemu_co_queue_wait(&req->wait_queue);
2543 self->waiting_for = NULL;
2544 retry = true;
2545 waited = true;
2546 break;
2547 }
2548 }
2549 }
2550 } while (retry);
2551
2552 return waited;
2553 }
2554
2555 /*
2556 * Return values:
2557 * 0 - success
2558 * -EINVAL - backing format specified, but no file
2559 * -ENOSPC - can't update the backing file because no space is left in the
2560 * image file header
2561 * -ENOTSUP - format driver doesn't support changing the backing file
2562 */
2563 int bdrv_change_backing_file(BlockDriverState *bs,
2564 const char *backing_file, const char *backing_fmt)
2565 {
2566 BlockDriver *drv = bs->drv;
2567 int ret;
2568
2569 /* Backing file format doesn't make sense without a backing file */
2570 if (backing_fmt && !backing_file) {
2571 return -EINVAL;
2572 }
2573
2574 if (drv->bdrv_change_backing_file != NULL) {
2575 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2576 } else {
2577 ret = -ENOTSUP;
2578 }
2579
2580 if (ret == 0) {
2581 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2582 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2583 }
2584 return ret;
2585 }
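
/*
 * Illustrative sketch (not part of the original file): rewriting the
 * backing-file reference of an image, e.g. after a backing file was
 * moved. The "new_backing.qcow2" path and "qcow2" format are made up
 * for the example.
 */
static int example_retarget_backing_file(BlockDriverState *bs)
{
    int ret = bdrv_change_backing_file(bs, "new_backing.qcow2", "qcow2");

    if (ret == -ENOTSUP) {
        /* the format driver cannot rewrite its header */
    }
    return ret;
}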
2586
2587 /*
2588 * Finds the image layer in the chain that has 'bs' as its backing file.
2589 *
2590 * active is the current topmost image.
2591 *
2592 * Returns NULL if bs is not found in active's image chain,
2593 * or if active == bs.
2594 *
2595 * Returns the bottommost base image if bs == NULL.
2596 */
2597 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2598 BlockDriverState *bs)
2599 {
2600 while (active && bs != active->backing_hd) {
2601 active = active->backing_hd;
2602 }
2603
2604 return active;
2605 }
2606
2607 /* Given a BDS, searches for the base layer. */
2608 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2609 {
2610 return bdrv_find_overlay(bs, NULL);
2611 }
2612
2613 typedef struct BlkIntermediateStates {
2614 BlockDriverState *bs;
2615 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2616 } BlkIntermediateStates;
2617
2618
2619 /*
2620 * Drops images above 'base' up to and including 'top', and sets the image
2621 * above 'top' to have base as its backing file.
2622 *
2623 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2624 * information in that overlay can be properly updated.
2625 *
2626 * E.g., this will convert the following chain:
2627 * bottom <- base <- intermediate <- top <- active
2628 *
2629 * to
2630 *
2631 * bottom <- base <- active
2632 *
2633 * It is allowed for bottom==base, in which case it converts:
2634 *
2635 * base <- intermediate <- top <- active
2636 *
2637 * to
2638 *
2639 * base <- active
2640 *
2641 * If backing_file_str is non-NULL, it will be used when modifying top's
2642 * overlay image metadata.
2643 *
2644 * Error conditions:
2645 * if active == top, that is considered an error
2646 *
2647 */
2648 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2649 BlockDriverState *base, const char *backing_file_str)
2650 {
2651 BlockDriverState *intermediate;
2652 BlockDriverState *base_bs = NULL;
2653 BlockDriverState *new_top_bs = NULL;
2654 BlkIntermediateStates *intermediate_state, *next;
2655 int ret = -EIO;
2656
2657 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2658 QSIMPLEQ_INIT(&states_to_delete);
2659
2660 if (!top->drv || !base->drv) {
2661 goto exit;
2662 }
2663
2664 new_top_bs = bdrv_find_overlay(active, top);
2665
2666 if (new_top_bs == NULL) {
2667 /* we could not find the image above 'top', this is an error */
2668 goto exit;
2669 }
2670
2671 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2672 * to do, no intermediate images */
2673 if (new_top_bs->backing_hd == base) {
2674 ret = 0;
2675 goto exit;
2676 }
2677
2678 intermediate = top;
2679
2680 /* now we will go down through the list, and add each BDS we find
2681 * into our deletion queue, until we hit the 'base'
2682 */
2683 while (intermediate) {
2684 intermediate_state = g_new0(BlkIntermediateStates, 1);
2685 intermediate_state->bs = intermediate;
2686 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2687
2688 if (intermediate->backing_hd == base) {
2689 base_bs = intermediate->backing_hd;
2690 break;
2691 }
2692 intermediate = intermediate->backing_hd;
2693 }
2694 if (base_bs == NULL) {
2695 /* Something went wrong; we did not end at the base. Safely
2696 * unravel everything and exit with an error. */
2697 goto exit;
2698 }
2699
2700 /* success - we can delete the intermediate states, and link top->base */
2701 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2702 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2703 base_bs->drv ? base_bs->drv->format_name : "");
2704 if (ret) {
2705 goto exit;
2706 }
2707 bdrv_set_backing_hd(new_top_bs, base_bs);
2708
2709 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2710 /* so that bdrv_close() does not recursively close the chain */
2711 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2712 bdrv_unref(intermediate_state->bs);
2713 }
2714 ret = 0;
2715
2716 exit:
2717 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2718 g_free(intermediate_state);
2719 }
2720 return ret;
2721 }
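
/*
 * Illustrative sketch (not part of the original file): collapsing the
 * chain documented above. With base <- intermediate <- top <- active,
 * dropping [intermediate, top] leaves base <- active; passing NULL for
 * backing_file_str records base's own filename in active's metadata.
 */
static int example_collapse_chain(BlockDriverState *active,
                                  BlockDriverState *top,
                                  BlockDriverState *base)
{
    return bdrv_drop_intermediate(active, top, base, NULL);
}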
2722
2723
2724 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2725 size_t size)
2726 {
2727 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
2728 return -EIO;
2729 }
2730
2731 if (!bdrv_is_inserted(bs)) {
2732 return -ENOMEDIUM;
2733 }
2734
2735 if (offset < 0) {
2736 return -EIO;
2737 }
2738
2739 return 0;
2740 }
2741
2742 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2743 int nb_sectors)
2744 {
2745 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2746 return -EIO;
2747 }
2748
2749 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2750 nb_sectors * BDRV_SECTOR_SIZE);
2751 }
2752
2753 typedef struct RwCo {
2754 BlockDriverState *bs;
2755 int64_t offset;
2756 QEMUIOVector *qiov;
2757 bool is_write;
2758 int ret;
2759 BdrvRequestFlags flags;
2760 } RwCo;
2761
2762 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2763 {
2764 RwCo *rwco = opaque;
2765
2766 if (!rwco->is_write) {
2767 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2768 rwco->qiov->size, rwco->qiov,
2769 rwco->flags);
2770 } else {
2771 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2772 rwco->qiov->size, rwco->qiov,
2773 rwco->flags);
2774 }
2775 }
2776
2777 /*
2778 * Process a vectored synchronous request using coroutines
2779 */
2780 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2781 QEMUIOVector *qiov, bool is_write,
2782 BdrvRequestFlags flags)
2783 {
2784 Coroutine *co;
2785 RwCo rwco = {
2786 .bs = bs,
2787 .offset = offset,
2788 .qiov = qiov,
2789 .is_write = is_write,
2790 .ret = NOT_DONE,
2791 .flags = flags,
2792 };
2793
2794 /**
2795 * In sync call context, when the vcpu is blocked, this throttling timer
2796 * will not fire; so the I/O throttling function has to be disabled here
2797 * if it has been enabled.
2798 */
2799 if (bs->io_limits_enabled) {
2800 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2801 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2802 bdrv_io_limits_disable(bs);
2803 }
2804
2805 if (qemu_in_coroutine()) {
2806 /* Fast-path if already in coroutine context */
2807 bdrv_rw_co_entry(&rwco);
2808 } else {
2809 AioContext *aio_context = bdrv_get_aio_context(bs);
2810
2811 co = qemu_coroutine_create(bdrv_rw_co_entry);
2812 qemu_coroutine_enter(co, &rwco);
2813 while (rwco.ret == NOT_DONE) {
2814 aio_poll(aio_context, true);
2815 }
2816 }
2817 return rwco.ret;
2818 }
2819
2820 /*
2821 * Process a synchronous request using coroutines
2822 */
2823 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2824 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2825 {
2826 QEMUIOVector qiov;
2827 struct iovec iov = {
2828 .iov_base = (void *)buf,
2829 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2830 };
2831
2832 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2833 return -EINVAL;
2834 }
2835
2836 qemu_iovec_init_external(&qiov, &iov, 1);
2837 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2838 &qiov, is_write, flags);
2839 }
2840
2841 /* return < 0 if error. See bdrv_write() for the return codes */
2842 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2843 uint8_t *buf, int nb_sectors)
2844 {
2845 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2846 }
2847
2848 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2849 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2850 uint8_t *buf, int nb_sectors)
2851 {
2852 bool enabled;
2853 int ret;
2854
2855 enabled = bs->io_limits_enabled;
2856 bs->io_limits_enabled = false;
2857 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2858 bs->io_limits_enabled = enabled;
2859 return ret;
2860 }
2861
2862 /* Return < 0 if error. Important errors are:
2863 -EIO generic I/O error (may happen for all errors)
2864 -ENOMEDIUM No media inserted.
2865 -EINVAL Invalid sector number or nb_sectors
2866 -EACCES Trying to write a read-only device
2867 */
2868 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2869 const uint8_t *buf, int nb_sectors)
2870 {
2871 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2872 }
2873
2874 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2875 int nb_sectors, BdrvRequestFlags flags)
2876 {
2877 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2878 BDRV_REQ_ZERO_WRITE | flags);
2879 }
2880
2881 /*
2882 * Completely zero out a block device with the help of bdrv_write_zeroes.
2883 * The operation is sped up by checking the block status and only writing
2884 * zeroes to regions that do not already read back as zeroes. Optional
2885 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2886 *
2887 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2888 */
2889 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2890 {
2891 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2892 int n;
2893
2894 target_sectors = bdrv_nb_sectors(bs);
2895 if (target_sectors < 0) {
2896 return target_sectors;
2897 }
2898
2899 for (;;) {
2900 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
2901 if (nb_sectors <= 0) {
2902 return 0;
2903 }
2904 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2905 if (ret < 0) {
2906 error_report("error getting block status at sector %" PRId64 ": %s",
2907 sector_num, strerror(-ret));
2908 return ret;
2909 }
2910 if (ret & BDRV_BLOCK_ZERO) {
2911 sector_num += n;
2912 continue;
2913 }
2914 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2915 if (ret < 0) {
2916 error_report("error writing zeroes at sector %" PRId64 ": %s",
2917 sector_num, strerror(-ret));
2918 return ret;
2919 }
2920 sector_num += n;
2921 }
2922 }
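
/*
 * Illustrative sketch (not part of the original file): fully zeroing a
 * device while allowing the driver to discard blocks instead of
 * writing them, as a convert-style operation might do.
 */
static int example_zero_device(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}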
2923
2924 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2925 {
2926 QEMUIOVector qiov;
2927 struct iovec iov = {
2928 .iov_base = (void *)buf,
2929 .iov_len = bytes,
2930 };
2931 int ret;
2932
2933 if (bytes < 0) {
2934 return -EINVAL;
2935 }
2936
2937 qemu_iovec_init_external(&qiov, &iov, 1);
2938 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2939 if (ret < 0) {
2940 return ret;
2941 }
2942
2943 return bytes;
2944 }
2945
2946 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2947 {
2948 int ret;
2949
2950 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2951 if (ret < 0) {
2952 return ret;
2953 }
2954
2955 return qiov->size;
2956 }
2957
2958 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2959 const void *buf, int bytes)
2960 {
2961 QEMUIOVector qiov;
2962 struct iovec iov = {
2963 .iov_base = (void *) buf,
2964 .iov_len = bytes,
2965 };
2966
2967 if (bytes < 0) {
2968 return -EINVAL;
2969 }
2970
2971 qemu_iovec_init_external(&qiov, &iov, 1);
2972 return bdrv_pwritev(bs, offset, &qiov);
2973 }
2974
2975 /*
2976 * Writes to the file and ensures that no writes are reordered across this
2977 * request (acts as a barrier)
2978 *
2979 * Returns 0 on success, -errno in error cases.
2980 */
2981 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2982 const void *buf, int count)
2983 {
2984 int ret;
2985
2986 ret = bdrv_pwrite(bs, offset, buf, count);
2987 if (ret < 0) {
2988 return ret;
2989 }
2990
2991 /* No flush needed for cache modes that already do it */
2992 if (bs->enable_write_cache) {
2993 bdrv_flush(bs);
2994 }
2995
2996 return 0;
2997 }
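
/*
 * Illustrative sketch (not part of the original file): the kind of
 * ordered metadata update bdrv_pwrite_sync() exists for. A format
 * driver must not let the reference to a new table reach the disk
 * before the table itself; the implicit flush provides that barrier.
 * The offsets and buffers here are hypothetical.
 */
static int example_ordered_metadata_update(BlockDriverState *bs,
                                           const void *table, int table_len,
                                           int64_t table_ofs,
                                           const void *header, int header_len)
{
    int ret = bdrv_pwrite_sync(bs, table_ofs, table, table_len);
    if (ret < 0) {
        return ret;
    }
    /* Only now may the header point at the new table */
    return bdrv_pwrite_sync(bs, 0, header, header_len);
}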
2998
2999 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
3000 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3001 {
3002 /* Perform I/O through a temporary buffer so that users who scribble over
3003 * their read buffer while the operation is in progress do not end up
3004 * modifying the image file. This is critical for zero-copy guest I/O
3005 * where anything might happen inside guest memory.
3006 */
3007 void *bounce_buffer;
3008
3009 BlockDriver *drv = bs->drv;
3010 struct iovec iov;
3011 QEMUIOVector bounce_qiov;
3012 int64_t cluster_sector_num;
3013 int cluster_nb_sectors;
3014 size_t skip_bytes;
3015 int ret;
3016
3017 /* Cover the entire cluster so no additional backing file I/O is required
3018 * when allocating the cluster in the image file.
3019 */
3020 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3021 &cluster_sector_num, &cluster_nb_sectors);
3022
3023 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3024 cluster_sector_num, cluster_nb_sectors);
3025
3026 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
3027 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
3028 if (bounce_buffer == NULL) {
3029 ret = -ENOMEM;
3030 goto err;
3031 }
3032
3033 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3034
3035 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3036 &bounce_qiov);
3037 if (ret < 0) {
3038 goto err;
3039 }
3040
3041 if (drv->bdrv_co_write_zeroes &&
3042 buffer_is_zero(bounce_buffer, iov.iov_len)) {
3043 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3044 cluster_nb_sectors, 0);
3045 } else {
3046 /* This does not change the data on the disk, so it is not necessary
3047 * to flush even in cache=writethrough mode.
3048 */
3049 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3050 &bounce_qiov);
3051 }
3052
3053 if (ret < 0) {
3054 /* It might be okay to ignore write errors for guest requests. If this
3055 * is a deliberate copy-on-read then we don't want to ignore the error.
3056 * Simply report it in all cases.
3057 */
3058 goto err;
3059 }
3060
3061 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3062 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3063 nb_sectors * BDRV_SECTOR_SIZE);
3064
3065 err:
3066 qemu_vfree(bounce_buffer);
3067 return ret;
3068 }
3069
3070 /*
3071 * Forwards an already correctly aligned request to the BlockDriver. This
3072 * handles copy on read and zeroing after EOF; any other features must be
3073 * implemented by the caller.
3074 */
3075 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3076 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3077 int64_t align, QEMUIOVector *qiov, int flags)
3078 {
3079 BlockDriver *drv = bs->drv;
3080 int ret;
3081
3082 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3083 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3084
3085 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3086 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3087 assert(!qiov || bytes == qiov->size);
3088
3089 /* Handle Copy on Read and associated serialisation */
3090 if (flags & BDRV_REQ_COPY_ON_READ) {
3091 /* If we touch the same cluster it counts as an overlap. This
3092 * guarantees that allocating writes will be serialized and not race
3093 * with each other for the same cluster. For example, in copy-on-read
3094 * it ensures that the CoR read and write operations are atomic and
3095 * guest writes cannot interleave between them. */
3096 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3097 }
3098
3099 wait_serialising_requests(req);
3100
3101 if (flags & BDRV_REQ_COPY_ON_READ) {
3102 int pnum;
3103
3104 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3105 if (ret < 0) {
3106 goto out;
3107 }
3108
3109 if (!ret || pnum != nb_sectors) {
3110 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3111 goto out;
3112 }
3113 }
3114
3115 /* Forward the request to the BlockDriver */
3116 if (!bs->zero_beyond_eof) {
3117 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3118 } else {
3119 /* Read zeros after EOF */
3120 int64_t total_sectors, max_nb_sectors;
3121
3122 total_sectors = bdrv_nb_sectors(bs);
3123 if (total_sectors < 0) {
3124 ret = total_sectors;
3125 goto out;
3126 }
3127
3128 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3129 align >> BDRV_SECTOR_BITS);
3130 if (nb_sectors < max_nb_sectors) {
3131 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3132 } else if (max_nb_sectors > 0) {
3133 QEMUIOVector local_qiov;
3134
3135 qemu_iovec_init(&local_qiov, qiov->niov);
3136 qemu_iovec_concat(&local_qiov, qiov, 0,
3137 max_nb_sectors * BDRV_SECTOR_SIZE);
3138
3139 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
3140 &local_qiov);
3141
3142 qemu_iovec_destroy(&local_qiov);
3143 } else {
3144 ret = 0;
3145 }
3146
3147 /* Reading beyond end of file is supposed to produce zeroes */
3148 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3149 uint64_t offset = MAX(0, total_sectors - sector_num);
3150 uint64_t bytes = (sector_num + nb_sectors - offset) *
3151 BDRV_SECTOR_SIZE;
3152 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3153 }
3154 }
3155
3156 out:
3157 return ret;
3158 }
3159
3160 static inline uint64_t bdrv_get_align(BlockDriverState *bs)
3161 {
3162 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3163 return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3164 }
3165
3166 static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
3167 int64_t offset, size_t bytes)
3168 {
3169 int64_t align = bdrv_get_align(bs);
3170 return !(offset & (align - 1) || (bytes & (align - 1)));
3171 }
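
/*
 * Worked example (added for clarity, not in the original file): with
 * request_alignment = 512, align = MAX(512, 512) = 512. A request at
 * offset 1536 with 512 bytes is aligned (both values are multiples of
 * 512); a request at offset 1000 is not, since 1000 & 511 == 488, and
 * the read/write paths below then pad it with head/tail buffers.
 */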
3172
3173 /*
3174 * Handle a read request in coroutine context
3175 */
3176 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3177 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3178 BdrvRequestFlags flags)
3179 {
3180 BlockDriver *drv = bs->drv;
3181 BdrvTrackedRequest req;
3182
3183 uint64_t align = bdrv_get_align(bs);
3184 uint8_t *head_buf = NULL;
3185 uint8_t *tail_buf = NULL;
3186 QEMUIOVector local_qiov;
3187 bool use_local_qiov = false;
3188 int ret;
3189
3190 if (!drv) {
3191 return -ENOMEDIUM;
3192 }
3193
3194 ret = bdrv_check_byte_request(bs, offset, bytes);
3195 if (ret < 0) {
3196 return ret;
3197 }
3198
3199 if (bs->copy_on_read) {
3200 flags |= BDRV_REQ_COPY_ON_READ;
3201 }
3202
3203 /* throttling disk I/O */
3204 if (bs->io_limits_enabled) {
3205 bdrv_io_limits_intercept(bs, bytes, false);
3206 }
3207
3208 /* Align read if necessary by padding qiov */
3209 if (offset & (align - 1)) {
3210 head_buf = qemu_blockalign(bs, align);
3211 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3212 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3213 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3214 use_local_qiov = true;
3215
3216 bytes += offset & (align - 1);
3217 offset = offset & ~(align - 1);
3218 }
3219
3220 if ((offset + bytes) & (align - 1)) {
3221 if (!use_local_qiov) {
3222 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3223 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3224 use_local_qiov = true;
3225 }
3226 tail_buf = qemu_blockalign(bs, align);
3227 qemu_iovec_add(&local_qiov, tail_buf,
3228 align - ((offset + bytes) & (align - 1)));
3229
3230 bytes = ROUND_UP(bytes, align);
3231 }
3232
3233 tracked_request_begin(&req, bs, offset, bytes, false);
3234 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3235 use_local_qiov ? &local_qiov : qiov,
3236 flags);
3237 tracked_request_end(&req);
3238
3239 if (use_local_qiov) {
3240 qemu_iovec_destroy(&local_qiov);
3241 qemu_vfree(head_buf);
3242 qemu_vfree(tail_buf);
3243 }
3244
3245 return ret;
3246 }
3247
3248 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3249 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3250 BdrvRequestFlags flags)
3251 {
3252 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3253 return -EINVAL;
3254 }
3255
3256 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3257 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3258 }
3259
3260 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3261 int nb_sectors, QEMUIOVector *qiov)
3262 {
3263 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3264
3265 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3266 }
3267
3268 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3269 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3270 {
3271 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3272
3273 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3274 BDRV_REQ_COPY_ON_READ);
3275 }
3276
3277 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
3278
3279 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3280 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3281 {
3282 BlockDriver *drv = bs->drv;
3283 QEMUIOVector qiov;
3284 struct iovec iov = {0};
3285 int ret = 0;
3286
3287 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
3288 BDRV_REQUEST_MAX_SECTORS);
3289
3290 while (nb_sectors > 0 && !ret) {
3291 int num = nb_sectors;
3292
3293 /* Align request. Block drivers can expect the "bulk" of the request
3294 * to be aligned.
3295 */
3296 if (bs->bl.write_zeroes_alignment
3297 && num > bs->bl.write_zeroes_alignment) {
3298 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3299 /* Make a small request up to the first aligned sector. */
3300 num = bs->bl.write_zeroes_alignment;
3301 num -= sector_num % bs->bl.write_zeroes_alignment;
3302 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3303 /* Shorten the request to the last aligned sector. num cannot
3304 * underflow because num > bs->bl.write_zeroes_alignment.
3305 */
3306 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3307 }
3308 }
3309
3310 /* limit request size */
3311 if (num > max_write_zeroes) {
3312 num = max_write_zeroes;
3313 }
3314
3315 ret = -ENOTSUP;
3316 /* First try the efficient write zeroes operation */
3317 if (drv->bdrv_co_write_zeroes) {
3318 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3319 }
3320
3321 if (ret == -ENOTSUP) {
3322 /* Fall back to bounce buffer if write zeroes is unsupported */
3323 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
3324 MAX_WRITE_ZEROES_BOUNCE_BUFFER);
3325 num = MIN(num, max_xfer_len);
3326 iov.iov_len = num * BDRV_SECTOR_SIZE;
3327 if (iov.iov_base == NULL) {
3328 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3329 if (iov.iov_base == NULL) {
3330 ret = -ENOMEM;
3331 goto fail;
3332 }
3333 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3334 }
3335 qemu_iovec_init_external(&qiov, &iov, 1);
3336
3337 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3338
3339 /* Keep the bounce buffer around if it is big enough for all
3340 * future requests.
3341 */
3342 if (num < max_xfer_len) {
3343 qemu_vfree(iov.iov_base);
3344 iov.iov_base = NULL;
3345 }
3346 }
3347
3348 sector_num += num;
3349 nb_sectors -= num;
3350 }
3351
3352 fail:
3353 qemu_vfree(iov.iov_base);
3354 return ret;
3355 }
3356
3357 /*
3358 * Forwards an already correctly aligned write request to the BlockDriver.
3359 */
3360 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3361 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3362 QEMUIOVector *qiov, int flags)
3363 {
3364 BlockDriver *drv = bs->drv;
3365 bool waited;
3366 int ret;
3367
3368 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3369 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3370
3371 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3372 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3373 assert(!qiov || bytes == qiov->size);
3374
3375 waited = wait_serialising_requests(req);
3376 assert(!waited || !req->serialising);
3377 assert(req->overlap_offset <= offset);
3378 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3379
3380 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3381
3382 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3383 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3384 qemu_iovec_is_zero(qiov)) {
3385 flags |= BDRV_REQ_ZERO_WRITE;
3386 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3387 flags |= BDRV_REQ_MAY_UNMAP;
3388 }
3389 }
3390
3391 if (ret < 0) {
3392 /* Do nothing, write notifier decided to fail this request */
3393 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3394 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3395 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3396 } else {
3397 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3398 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3399 }
3400 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3401
3402 if (ret == 0 && !bs->enable_write_cache) {
3403 ret = bdrv_co_flush(bs);
3404 }
3405
3406 bdrv_set_dirty(bs, sector_num, nb_sectors);
3407
3408 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3409
3410 if (ret >= 0) {
3411 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3412 }
3413
3414 return ret;
3415 }
3416
3417 /*
3418 * Handle a write request in coroutine context
3419 */
3420 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3421 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3422 BdrvRequestFlags flags)
3423 {
3424 BdrvTrackedRequest req;
3425 uint64_t align = bdrv_get_align(bs);
3426 uint8_t *head_buf = NULL;
3427 uint8_t *tail_buf = NULL;
3428 QEMUIOVector local_qiov;
3429 bool use_local_qiov = false;
3430 int ret;
3431
3432 if (!bs->drv) {
3433 return -ENOMEDIUM;
3434 }
3435 if (bs->read_only) {
3436 return -EACCES;
3437 }
3438
3439 ret = bdrv_check_byte_request(bs, offset, bytes);
3440 if (ret < 0) {
3441 return ret;
3442 }
3443
3444 /* throttling disk I/O */
3445 if (bs->io_limits_enabled) {
3446 bdrv_io_limits_intercept(bs, bytes, true);
3447 }
3448
3449 /*
3450 * Align write if necessary by performing a read-modify-write cycle.
3451 * Pad qiov with the read parts and be sure to have a tracked request not
3452 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3453 */
3454 tracked_request_begin(&req, bs, offset, bytes, true);
3455
3456 if (offset & (align - 1)) {
3457 QEMUIOVector head_qiov;
3458 struct iovec head_iov;
3459
3460 mark_request_serialising(&req, align);
3461 wait_serialising_requests(&req);
3462
3463 head_buf = qemu_blockalign(bs, align);
3464 head_iov = (struct iovec) {
3465 .iov_base = head_buf,
3466 .iov_len = align,
3467 };
3468 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3469
3470 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3471 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3472 align, &head_qiov, 0);
3473 if (ret < 0) {
3474 goto fail;
3475 }
3476 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3477
3478 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3479 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3480 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3481 use_local_qiov = true;
3482
3483 bytes += offset & (align - 1);
3484 offset = offset & ~(align - 1);
3485 }
3486
3487 if ((offset + bytes) & (align - 1)) {
3488 QEMUIOVector tail_qiov;
3489 struct iovec tail_iov;
3490 size_t tail_bytes;
3491 bool waited;
3492
3493 mark_request_serialising(&req, align);
3494 waited = wait_serialising_requests(&req);
3495 assert(!waited || !use_local_qiov);
3496
3497 tail_buf = qemu_blockalign(bs, align);
3498 tail_iov = (struct iovec) {
3499 .iov_base = tail_buf,
3500 .iov_len = align,
3501 };
3502 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3503
3504 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3505 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3506 align, &tail_qiov, 0);
3507 if (ret < 0) {
3508 goto fail;
3509 }
3510 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3511
3512 if (!use_local_qiov) {
3513 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3514 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3515 use_local_qiov = true;
3516 }
3517
3518 tail_bytes = (offset + bytes) & (align - 1);
3519 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3520
3521 bytes = ROUND_UP(bytes, align);
3522 }
3523
3524 if (use_local_qiov) {
3525 /* Local buffer may have non-zero data. */
3526 flags &= ~BDRV_REQ_ZERO_WRITE;
3527 }
3528 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3529 use_local_qiov ? &local_qiov : qiov,
3530 flags);
3531
3532 fail:
3533 tracked_request_end(&req);
3534
3535 if (use_local_qiov) {
3536 qemu_iovec_destroy(&local_qiov);
3537 }
3538 qemu_vfree(head_buf);
3539 qemu_vfree(tail_buf);
3540
3541 return ret;
3542 }
3543
3544 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3545 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3546 BdrvRequestFlags flags)
3547 {
3548 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3549 return -EINVAL;
3550 }
3551
3552 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3553 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3554 }
3555
3556 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3557 int nb_sectors, QEMUIOVector *qiov)
3558 {
3559 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3560
3561 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3562 }
3563
3564 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3565 int64_t sector_num, int nb_sectors,
3566 BdrvRequestFlags flags)
3567 {
3568 int ret;
3569
3570 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3571
3572 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3573 flags &= ~BDRV_REQ_MAY_UNMAP;
3574 }
3575 if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
3576 nb_sectors << BDRV_SECTOR_BITS)) {
3577 ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3578 BDRV_REQ_ZERO_WRITE | flags);
3579 } else {
3580 uint8_t *buf;
3581 QEMUIOVector local_qiov;
3582 size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
3583
3584 buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
3585 memset(buf, 0, bytes);
3586 qemu_iovec_init(&local_qiov, 1);
3587 qemu_iovec_add(&local_qiov, buf, bytes);
3588
3589 ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
3590 BDRV_REQ_ZERO_WRITE | flags);
3591 qemu_vfree(buf);
3592 }
3593 return ret;
3594 }
3595
3596 /**
3597 * Truncate file to 'offset' bytes (needed only for file protocols)
3598 */
3599 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3600 {
3601 BlockDriver *drv = bs->drv;
3602 int ret;
3603 if (!drv)
3604 return -ENOMEDIUM;
3605 if (!drv->bdrv_truncate)
3606 return -ENOTSUP;
3607 if (bs->read_only)
3608 return -EACCES;
3609
3610 ret = drv->bdrv_truncate(bs, offset);
3611 if (ret == 0) {
3612 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3613 if (bs->blk) {
3614 blk_dev_resize_cb(bs->blk);
3615 }
3616 }
3617 return ret;
3618 }
3619
3620 /**
3621 * Length of an allocated file in bytes. Sparse files are counted by actual
3622 * allocated space. Return < 0 on error or if unknown.
3623 */
3624 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3625 {
3626 BlockDriver *drv = bs->drv;
3627 if (!drv) {
3628 return -ENOMEDIUM;
3629 }
3630 if (drv->bdrv_get_allocated_file_size) {
3631 return drv->bdrv_get_allocated_file_size(bs);
3632 }
3633 if (bs->file) {
3634 return bdrv_get_allocated_file_size(bs->file);
3635 }
3636 return -ENOTSUP;
3637 }
3638
3639 /**
3640 * Return number of sectors on success, -errno on error.
3641 */
3642 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3643 {
3644 BlockDriver *drv = bs->drv;
3645
3646 if (!drv)
3647 return -ENOMEDIUM;
3648
3649 if (drv->has_variable_length) {
3650 int ret = refresh_total_sectors(bs, bs->total_sectors);
3651 if (ret < 0) {
3652 return ret;
3653 }
3654 }
3655 return bs->total_sectors;
3656 }
3657
3658 /**
3659 * Return length in bytes on success, -errno on error.
3660 * The length is always a multiple of BDRV_SECTOR_SIZE.
3661 */
3662 int64_t bdrv_getlength(BlockDriverState *bs)
3663 {
3664 int64_t ret = bdrv_nb_sectors(bs);
3665
3666 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3667 }
3668
3669 /* return 0 as number of sectors if no device present or error */
3670 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3671 {
3672 int64_t nb_sectors = bdrv_nb_sectors(bs);
3673
3674 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3675 }
3676
3677 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3678 BlockdevOnError on_write_error)
3679 {
3680 bs->on_read_error = on_read_error;
3681 bs->on_write_error = on_write_error;
3682 }
3683
3684 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3685 {
3686 return is_read ? bs->on_read_error : bs->on_write_error;
3687 }
3688
3689 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3690 {
3691 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3692
3693 switch (on_err) {
3694 case BLOCKDEV_ON_ERROR_ENOSPC:
3695 return (error == ENOSPC) ?
3696 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3697 case BLOCKDEV_ON_ERROR_STOP:
3698 return BLOCK_ERROR_ACTION_STOP;
3699 case BLOCKDEV_ON_ERROR_REPORT:
3700 return BLOCK_ERROR_ACTION_REPORT;
3701 case BLOCKDEV_ON_ERROR_IGNORE:
3702 return BLOCK_ERROR_ACTION_IGNORE;
3703 default:
3704 abort();
3705 }
3706 }
3707
3708 static void send_qmp_error_event(BlockDriverState *bs,
3709 BlockErrorAction action,
3710 bool is_read, int error)
3711 {
3712 IoOperationType optype;
3713
3714 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3715 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3716 bdrv_iostatus_is_enabled(bs),
3717 error == ENOSPC, strerror(error),
3718 &error_abort);
3719 }
3720
3721 /* This is done by device models because, while the block layer knows
3722 * about the error, it does not know whether an operation comes from
3723 * the device or the block layer (from a job, for example).
3724 */
3725 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3726 bool is_read, int error)
3727 {
3728 assert(error >= 0);
3729
3730 if (action == BLOCK_ERROR_ACTION_STOP) {
3731 /* First set the iostatus, so that "info block" returns an iostatus
3732 * that matches the events raised so far (an additional error iostatus
3733 * is fine, but not a lost one).
3734 */
3735 bdrv_iostatus_set_err(bs, error);
3736
3737 /* Then raise the request to stop the VM and the event.
3738 * qemu_system_vmstop_request_prepare has two effects. First,
3739 * it ensures that the STOP event always comes after the
3740 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3741 * can observe the STOP event and do a "cont" before the STOP
3742 * event is issued, the VM will not stop. In this case, vm_start()
3743 * also ensures that the STOP/RESUME pair of events is emitted.
3744 */
3745 qemu_system_vmstop_request_prepare();
3746 send_qmp_error_event(bs, action, is_read, error);
3747 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3748 } else {
3749 send_qmp_error_event(bs, action, is_read, error);
3750 }
3751 }
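
/*
 * Illustrative sketch (not part of the original file): the two-step
 * protocol a device model follows on a failed request; the errno value
 * is passed in positive, as bdrv_error_action() asserts. "complete_req"
 * stands in for whatever device-specific completion would run when the
 * error is only reported or ignored.
 */
static void example_handle_device_error(BlockDriverState *bs,
                                        bool is_read, int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    bdrv_error_action(bs, action, is_read, error);
    if (action != BLOCK_ERROR_ACTION_STOP) {
        /* complete_req(...); -- hypothetical device completion */
    }
    /* For BLOCK_ERROR_ACTION_STOP the request is typically retried
     * after the VM is resumed. */
}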
3752
3753 int bdrv_is_read_only(BlockDriverState *bs)
3754 {
3755 return bs->read_only;
3756 }
3757
3758 int bdrv_is_sg(BlockDriverState *bs)
3759 {
3760 return bs->sg;
3761 }
3762
3763 int bdrv_enable_write_cache(BlockDriverState *bs)
3764 {
3765 return bs->enable_write_cache;
3766 }
3767
3768 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3769 {
3770 bs->enable_write_cache = wce;
3771
3772 /* so a reopen() will preserve wce */
3773 if (wce) {
3774 bs->open_flags |= BDRV_O_CACHE_WB;
3775 } else {
3776 bs->open_flags &= ~BDRV_O_CACHE_WB;
3777 }
3778 }
3779
3780 int bdrv_is_encrypted(BlockDriverState *bs)
3781 {
3782 if (bs->backing_hd && bs->backing_hd->encrypted)
3783 return 1;
3784 return bs->encrypted;
3785 }
3786
3787 int bdrv_key_required(BlockDriverState *bs)
3788 {
3789 BlockDriverState *backing_hd = bs->backing_hd;
3790
3791 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3792 return 1;
3793 return (bs->encrypted && !bs->valid_key);
3794 }
3795
3796 int bdrv_set_key(BlockDriverState *bs, const char *key)
3797 {
3798 int ret;
3799 if (bs->backing_hd && bs->backing_hd->encrypted) {
3800 ret = bdrv_set_key(bs->backing_hd, key);
3801 if (ret < 0)
3802 return ret;
3803 if (!bs->encrypted)
3804 return 0;
3805 }
3806 if (!bs->encrypted) {
3807 return -EINVAL;
3808 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3809 return -ENOMEDIUM;
3810 }
3811 ret = bs->drv->bdrv_set_key(bs, key);
3812 if (ret < 0) {
3813 bs->valid_key = 0;
3814 } else if (!bs->valid_key) {
3815 bs->valid_key = 1;
3816 if (bs->blk) {
3817 /* call the change callback now, we skipped it on open */
3818 blk_dev_change_media_cb(bs->blk, true);
3819 }
3820 }
3821 return ret;
3822 }
3823
3824 /*
3825 * Provide an encryption key for @bs.
3826 * If @key is non-null:
3827 * If @bs is not encrypted, fail.
3828 * Else if the key is invalid, fail.
3829 * Else set @bs's key to @key, replacing the existing key, if any.
3830 * If @key is null:
3831 * If @bs is encrypted and still lacks a key, fail.
3832 * Else do nothing.
3833 * On failure, store an error object through @errp if non-null.
3834 */
3835 void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
3836 {
3837 if (key) {
3838 if (!bdrv_is_encrypted(bs)) {
3839 error_setg(errp, "Node '%s' is not encrypted",
3840 bdrv_get_device_or_node_name(bs));
3841 } else if (bdrv_set_key(bs, key) < 0) {
3842 error_set(errp, QERR_INVALID_PASSWORD);
3843 }
3844 } else {
3845 if (bdrv_key_required(bs)) {
3846 error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
3847 "'%s' (%s) is encrypted",
3848 bdrv_get_device_or_node_name(bs),
3849 bdrv_get_encrypted_filename(bs));
3850 }
3851 }
3852 }
3853
3854 const char *bdrv_get_format_name(BlockDriverState *bs)
3855 {
3856 return bs->drv ? bs->drv->format_name : NULL;
3857 }
3858
3859 static int qsort_strcmp(const void *a, const void *b)
3860 {
3861 return strcmp(a, b);
3862 }
3863
3864 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3865 void *opaque)
3866 {
3867 BlockDriver *drv;
3868 int count = 0;
3869 int i;
3870 const char **formats = NULL;
3871
3872 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3873 if (drv->format_name) {
3874 bool found = false;
3875 int i = count;
3876 while (formats && i && !found) {
3877 found = !strcmp(formats[--i], drv->format_name);
3878 }
3879
3880 if (!found) {
3881 formats = g_renew(const char *, formats, count + 1);
3882 formats[count++] = drv->format_name;
3883 }
3884 }
3885 }
3886
3887 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3888
3889 for (i = 0; i < count; i++) {
3890 it(opaque, formats[i]);
3891 }
3892
3893 g_free(formats);
3894 }
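
/*
 * Illustrative sketch (not part of the original file): consuming the
 * sorted, de-duplicated format list, e.g. for help-style output. The
 * callback signature matches bdrv_iterate_format() above.
 */
static void example_print_format(void *opaque, const char *name)
{
    fprintf((FILE *)opaque, " %s", name);
}

static void example_list_formats(void)
{
    bdrv_iterate_format(example_print_format, stdout);
}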
3895
3896 /* Find a node in the graph of named BlockDriverStates by its node name */
3897 BlockDriverState *bdrv_find_node(const char *node_name)
3898 {
3899 BlockDriverState *bs;
3900
3901 assert(node_name);
3902
3903 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3904 if (!strcmp(node_name, bs->node_name)) {
3905 return bs;
3906 }
3907 }
3908 return NULL;
3909 }
3910
3911 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3912 BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
3913 {
3914 BlockDeviceInfoList *list, *entry;
3915 BlockDriverState *bs;
3916
3917 list = NULL;
3918 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3919 BlockDeviceInfo *info = bdrv_block_device_info(bs, errp);
3920 if (!info) {
3921 qapi_free_BlockDeviceInfoList(list);
3922 return NULL;
3923 }
3924 entry = g_malloc0(sizeof(*entry));
3925 entry->value = info;
3926 entry->next = list;
3927 list = entry;
3928 }
3929
3930 return list;
3931 }
3932
3933 BlockDriverState *bdrv_lookup_bs(const char *device,
3934 const char *node_name,
3935 Error **errp)
3936 {
3937 BlockBackend *blk;
3938 BlockDriverState *bs;
3939
3940 if (device) {
3941 blk = blk_by_name(device);
3942
3943 if (blk) {
3944 return blk_bs(blk);
3945 }
3946 }
3947
3948 if (node_name) {
3949 bs = bdrv_find_node(node_name);
3950
3951 if (bs) {
3952 return bs;
3953 }
3954 }
3955
3956 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3957 device ? device : "",
3958 node_name ? node_name : "");
3959 return NULL;
3960 }
3961
3962 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3963 * return false. If either argument is NULL, return false. */
3964 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3965 {
3966 while (top && top != base) {
3967 top = top->backing_hd;
3968 }
3969
3970 return top != NULL;
3971 }
3972
3973 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3974 {
3975 if (!bs) {
3976 return QTAILQ_FIRST(&graph_bdrv_states);
3977 }
3978 return QTAILQ_NEXT(bs, node_list);
3979 }
3980
3981 BlockDriverState *bdrv_next(BlockDriverState *bs)
3982 {
3983 if (!bs) {
3984 return QTAILQ_FIRST(&bdrv_states);
3985 }
3986 return QTAILQ_NEXT(bs, device_list);
3987 }
3988
3989 const char *bdrv_get_node_name(const BlockDriverState *bs)
3990 {
3991 return bs->node_name;
3992 }
3993
3994 /* TODO check what callers really want: bs->node_name or blk_name() */
3995 const char *bdrv_get_device_name(const BlockDriverState *bs)
3996 {
3997 return bs->blk ? blk_name(bs->blk) : "";
3998 }
3999
4000 /* This can be used to identify nodes that might not have a device
4001 * name associated. Since node and device names live in the same
4002 * namespace, the result is unambiguous. The exception is if both are
4003 * absent, then this returns an empty (non-null) string. */
4004 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
4005 {
4006 return bs->blk ? blk_name(bs->blk) : bs->node_name;
4007 }
4008
4009 int bdrv_get_flags(BlockDriverState *bs)
4010 {
4011 return bs->open_flags;
4012 }
4013
4014 int bdrv_flush_all(void)
4015 {
4016 BlockDriverState *bs;
4017 int result = 0;
4018
4019 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4020 AioContext *aio_context = bdrv_get_aio_context(bs);
4021 int ret;
4022
4023 aio_context_acquire(aio_context);
4024 ret = bdrv_flush(bs);
4025 if (ret < 0 && !result) {
4026 result = ret;
4027 }
4028 aio_context_release(aio_context);
4029 }
4030
4031 return result;
4032 }
4033
4034 int bdrv_has_zero_init_1(BlockDriverState *bs)
4035 {
4036 return 1;
4037 }
4038
4039 int bdrv_has_zero_init(BlockDriverState *bs)
4040 {
4041 assert(bs->drv);
4042
4043 /* If BS is a copy-on-write image, it is initialized to
4044 the contents of the base image, which may not be zeroes. */
4045 if (bs->backing_hd) {
4046 return 0;
4047 }
4048 if (bs->drv->bdrv_has_zero_init) {
4049 return bs->drv->bdrv_has_zero_init(bs);
4050 }
4051
4052 /* safe default */
4053 return 0;
4054 }
4055
4056 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
4057 {
4058 BlockDriverInfo bdi;
4059
4060 if (bs->backing_hd) {
4061 return false;
4062 }
4063
4064 if (bdrv_get_info(bs, &bdi) == 0) {
4065 return bdi.unallocated_blocks_are_zero;
4066 }
4067
4068 return false;
4069 }
4070
4071 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
4072 {
4073 BlockDriverInfo bdi;
4074
4075 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
4076 return false;
4077 }
4078
4079 if (bdrv_get_info(bs, &bdi) == 0) {
4080 return bdi.can_write_zeroes_with_unmap;
4081 }
4082
4083 return false;
4084 }
4085
4086 typedef struct BdrvCoGetBlockStatusData {
4087 BlockDriverState *bs;
4088 BlockDriverState *base;
4089 int64_t sector_num;
4090 int nb_sectors;
4091 int *pnum;
4092 int64_t ret;
4093 bool done;
4094 } BdrvCoGetBlockStatusData;
4095
4096 /*
4097 * Returns the allocation status of the specified sectors.
4098 * Drivers not implementing the functionality are assumed to not support
4099 * backing files, hence all their sectors are reported as allocated.
4100 *
4101 * If 'sector_num' is beyond the end of the disk image the return value is 0
4102 * and 'pnum' is set to 0.
4103 *
4104 * 'pnum' is set to the number of sectors (including and immediately following
4105 * the specified sector) that are known to be in the same
4106 * allocated/unallocated state.
4107 *
4108 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
4109 * beyond the end of the disk image it will be clamped.
4110 */
4111 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4112 int64_t sector_num,
4113 int nb_sectors, int *pnum)
4114 {
4115 int64_t total_sectors;
4116 int64_t n;
4117 int64_t ret, ret2;
4118
4119 total_sectors = bdrv_nb_sectors(bs);
4120 if (total_sectors < 0) {
4121 return total_sectors;
4122 }
4123
4124 if (sector_num >= total_sectors) {
4125 *pnum = 0;
4126 return 0;
4127 }
4128
4129 n = total_sectors - sector_num;
4130 if (n < nb_sectors) {
4131 nb_sectors = n;
4132 }
4133
4134 if (!bs->drv->bdrv_co_get_block_status) {
4135 *pnum = nb_sectors;
4136 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
4137 if (bs->drv->protocol_name) {
4138 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4139 }
4140 return ret;
4141 }
4142
4143 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4144 if (ret < 0) {
4145 *pnum = 0;
4146 return ret;
4147 }
4148
4149 if (ret & BDRV_BLOCK_RAW) {
4150 assert(ret & BDRV_BLOCK_OFFSET_VALID);
4151 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4152 *pnum, pnum);
4153 }
4154
4155 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4156 ret |= BDRV_BLOCK_ALLOCATED;
4157 }
4158
4159 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4160 if (bdrv_unallocated_blocks_are_zero(bs)) {
4161 ret |= BDRV_BLOCK_ZERO;
4162 } else if (bs->backing_hd) {
4163 BlockDriverState *bs2 = bs->backing_hd;
4164 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4165 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4166 ret |= BDRV_BLOCK_ZERO;
4167 }
4168 }
4169 }
4170
4171 if (bs->file &&
4172 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4173 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4174 int file_pnum;
4175
4176 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4177 *pnum, &file_pnum);
4178 if (ret2 >= 0) {
4179 /* Ignore errors. This just provides extra information; it
4180 * is useful but not necessary.
4181 */
4182 if (!file_pnum) {
4183 /* !file_pnum indicates an offset at or beyond the EOF; it is
4184 * perfectly valid for the format block driver to point to such
4185 * offsets, so catch it and mark everything as zero */
4186 ret |= BDRV_BLOCK_ZERO;
4187 } else {
4188 /* Limit request to the range reported by the protocol driver */
4189 *pnum = file_pnum;
4190 ret |= (ret2 & BDRV_BLOCK_ZERO);
4191 }
4192 }
4193 }
4194
4195 return ret;
4196 }
4197
4198 /* Coroutine wrapper for bdrv_get_block_status() */
4199 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4200 {
4201 BdrvCoGetBlockStatusData *data = opaque;
4202 BlockDriverState *bs = data->bs;
4203
4204 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4205 data->pnum);
4206 data->done = true;
4207 }
4208
4209 /*
4210 * Synchronous wrapper around bdrv_co_get_block_status().
4211 *
4212 * See bdrv_co_get_block_status() for details.
4213 */
4214 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4215 int nb_sectors, int *pnum)
4216 {
4217 Coroutine *co;
4218 BdrvCoGetBlockStatusData data = {
4219 .bs = bs,
4220 .sector_num = sector_num,
4221 .nb_sectors = nb_sectors,
4222 .pnum = pnum,
4223 .done = false,
4224 };
4225
4226 if (qemu_in_coroutine()) {
4227 /* Fast-path if already in coroutine context */
4228 bdrv_get_block_status_co_entry(&data);
4229 } else {
4230 AioContext *aio_context = bdrv_get_aio_context(bs);
4231
4232 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4233 qemu_coroutine_enter(co, &data);
4234 while (!data.done) {
4235 aio_poll(aio_context, true);
4236 }
4237 }
4238 return data.ret;
4239 }
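
/*
 * Hedged sketch of a bdrv_get_block_status() consumer: walk an image
 * extent by extent and classify each range. example_dump_extents() is
 * illustrative and not part of the original file.
 */
static void example_dump_extents(BlockDriverState *bs)
{
    int64_t sector = 0;
    int64_t total = bdrv_nb_sectors(bs);

    while (sector < total) {
        int num;
        int64_t ret = bdrv_get_block_status(bs, sector, total - sector,
                                            &num);
        if (ret < 0 || num == 0) {
            break;
        }
        printf("%" PRId64 "+%d: %s%s\n", sector, num,
               (ret & BDRV_BLOCK_ALLOCATED) ? "allocated" : "unallocated",
               (ret & BDRV_BLOCK_ZERO) ? ", reads as zero" : "");
        sector += num;
    }
}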
4240
4241 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4242 int nb_sectors, int *pnum)
4243 {
4244 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4245 if (ret < 0) {
4246 return ret;
4247 }
4248 return !!(ret & BDRV_BLOCK_ALLOCATED);
4249 }
4250
4251 /*
4252 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4253 *
4254 * Return true if the given sector is allocated in any image between
4255 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4256 * sector is allocated in any image of the chain. Return false otherwise.
4257 *
4258 * 'pnum' is set to the number of sectors (including and immediately following
4259 * the specified sector) that are known to be in the same
4260 * allocated/unallocated state.
4261 *
4262 */
4263 int bdrv_is_allocated_above(BlockDriverState *top,
4264 BlockDriverState *base,
4265 int64_t sector_num,
4266 int nb_sectors, int *pnum)
4267 {
4268 BlockDriverState *intermediate;
4269 int ret, n = nb_sectors;
4270
4271 intermediate = top;
4272 while (intermediate && intermediate != base) {
4273 int pnum_inter;
4274 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4275 &pnum_inter);
4276 if (ret < 0) {
4277 return ret;
4278 } else if (ret) {
4279 *pnum = pnum_inter;
4280 return 1;
4281 }
4282
4283 /*
4284 * [sector_num, nb_sectors] is unallocated on top, but an
4285 * intermediate image might have
4286 *
4287 * [sector_num+x, nb_sectors] allocated.
4288 */
4289 if (n > pnum_inter &&
4290 (intermediate == top ||
4291 sector_num + pnum_inter < intermediate->total_sectors)) {
4292 n = pnum_inter;
4293 }
4294
4295 intermediate = intermediate->backing_hd;
4296 }
4297
4298 *pnum = n;
4299 return 0;
4300 }
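
/*
 * Sketch: report whether any image in [base, top] holds data for a
 * sector range, e.g. when deciding what a streaming job still needs to
 * copy. The example_* name is illustrative.
 */
static bool example_range_has_data(BlockDriverState *top,
                                   BlockDriverState *base,
                                   int64_t sector_num, int nb_sectors)
{
    int pnum;
    int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
                                      &pnum);

    /* On error, conservatively assume the range holds data */
    return ret != 0;
}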
4301
4302 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4303 {
4304 if (bs->backing_hd && bs->backing_hd->encrypted)
4305 return bs->backing_file;
4306 else if (bs->encrypted)
4307 return bs->filename;
4308 else
4309 return NULL;
4310 }
4311
4312 void bdrv_get_backing_filename(BlockDriverState *bs,
4313 char *filename, int filename_size)
4314 {
4315 pstrcpy(filename, filename_size, bs->backing_file);
4316 }
4317
4318 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4319 const uint8_t *buf, int nb_sectors)
4320 {
4321 BlockDriver *drv = bs->drv;
4322 int ret;
4323
4324 if (!drv) {
4325 return -ENOMEDIUM;
4326 }
4327 if (!drv->bdrv_write_compressed) {
4328 return -ENOTSUP;
4329 }
4330 ret = bdrv_check_request(bs, sector_num, nb_sectors);
4331 if (ret < 0) {
4332 return ret;
4333 }
4334
4335 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4336
4337 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4338 }
4339
4340 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4341 {
4342 BlockDriver *drv = bs->drv;
4343 if (!drv)
4344 return -ENOMEDIUM;
4345 if (!drv->bdrv_get_info)
4346 return -ENOTSUP;
4347 memset(bdi, 0, sizeof(*bdi));
4348 return drv->bdrv_get_info(bs, bdi);
4349 }
4350
4351 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4352 {
4353 BlockDriver *drv = bs->drv;
4354 if (drv && drv->bdrv_get_specific_info) {
4355 return drv->bdrv_get_specific_info(bs);
4356 }
4357 return NULL;
4358 }
4359
4360 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4361 int64_t pos, int size)
4362 {
4363 QEMUIOVector qiov;
4364 struct iovec iov = {
4365 .iov_base = (void *) buf,
4366 .iov_len = size,
4367 };
4368
4369 qemu_iovec_init_external(&qiov, &iov, 1);
4370 return bdrv_writev_vmstate(bs, &qiov, pos);
4371 }
4372
4373 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4374 {
4375 BlockDriver *drv = bs->drv;
4376
4377 if (!drv) {
4378 return -ENOMEDIUM;
4379 } else if (drv->bdrv_save_vmstate) {
4380 return drv->bdrv_save_vmstate(bs, qiov, pos);
4381 } else if (bs->file) {
4382 return bdrv_writev_vmstate(bs->file, qiov, pos);
4383 }
4384
4385 return -ENOTSUP;
4386 }
4387
4388 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4389 int64_t pos, int size)
4390 {
4391 BlockDriver *drv = bs->drv;
4392 if (!drv)
4393 return -ENOMEDIUM;
4394 if (drv->bdrv_load_vmstate)
4395 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4396 if (bs->file)
4397 return bdrv_load_vmstate(bs->file, buf, pos, size);
4398 return -ENOTSUP;
4399 }
4400
4401 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4402 {
4403 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4404 return;
4405 }
4406
4407 bs->drv->bdrv_debug_event(bs, event);
4408 }
4409
4410 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4411 const char *tag)
4412 {
4413 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4414 bs = bs->file;
4415 }
4416
4417 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4418 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4419 }
4420
4421 return -ENOTSUP;
4422 }
4423
4424 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4425 {
4426 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4427 bs = bs->file;
4428 }
4429
4430 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4431 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4432 }
4433
4434 return -ENOTSUP;
4435 }
4436
4437 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4438 {
4439 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4440 bs = bs->file;
4441 }
4442
4443 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4444 return bs->drv->bdrv_debug_resume(bs, tag);
4445 }
4446
4447 return -ENOTSUP;
4448 }
4449
4450 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4451 {
4452 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4453 bs = bs->file;
4454 }
4455
4456 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4457 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4458 }
4459
4460 return false;
4461 }
4462
4463 int bdrv_is_snapshot(BlockDriverState *bs)
4464 {
4465 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4466 }
4467
4468 /* backing_file can be relative, absolute, or a protocol. If it is
4469 * relative, it must be relative to the chain. So, passing in bs->filename
4470 * from a BDS as backing_file should not be done, as that may be relative to
4471 * the CWD rather than the chain. */
4472 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4473 const char *backing_file)
4474 {
4475 char *filename_full = NULL;
4476 char *backing_file_full = NULL;
4477 char *filename_tmp = NULL;
4478 int is_protocol = 0;
4479 BlockDriverState *curr_bs = NULL;
4480 BlockDriverState *retval = NULL;
4481
4482 if (!bs || !bs->drv || !backing_file) {
4483 return NULL;
4484 }
4485
4486 filename_full = g_malloc(PATH_MAX);
4487 backing_file_full = g_malloc(PATH_MAX);
4488 filename_tmp = g_malloc(PATH_MAX);
4489
4490 is_protocol = path_has_protocol(backing_file);
4491
4492 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4493
4494 /* If either of the filename paths is actually a protocol, then
4495 * compare unmodified paths; otherwise make paths relative */
4496 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4497 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4498 retval = curr_bs->backing_hd;
4499 break;
4500 }
4501 } else {
4502 /* If not an absolute filename path, make it relative to the current
4503 * image's filename path */
4504 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4505 backing_file);
4506
4507 /* We are going to compare absolute pathnames */
4508 if (!realpath(filename_tmp, filename_full)) {
4509 continue;
4510 }
4511
4512 /* We need to make sure the backing filename we are comparing against
4513 * is relative to the current image filename (or absolute) */
4514 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4515 curr_bs->backing_file);
4516
4517 if (!realpath(filename_tmp, backing_file_full)) {
4518 continue;
4519 }
4520
4521 if (strcmp(backing_file_full, filename_full) == 0) {
4522 retval = curr_bs->backing_hd;
4523 break;
4524 }
4525 }
4526 }
4527
4528 g_free(filename_full);
4529 g_free(backing_file_full);
4530 g_free(filename_tmp);
4531 return retval;
4532 }
4533
4534 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4535 {
4536 if (!bs->drv) {
4537 return 0;
4538 }
4539
4540 if (!bs->backing_hd) {
4541 return 0;
4542 }
4543
4544 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4545 }
4546
4547 /**************************************************************/
4548 /* async I/Os */
4549
4550 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4551 QEMUIOVector *qiov, int nb_sectors,
4552 BlockCompletionFunc *cb, void *opaque)
4553 {
4554 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4555
4556 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4557 cb, opaque, false);
4558 }
4559
4560 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4561 QEMUIOVector *qiov, int nb_sectors,
4562 BlockCompletionFunc *cb, void *opaque)
4563 {
4564 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4565
4566 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4567 cb, opaque, true);
4568 }
4569
4570 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4571 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4572 BlockCompletionFunc *cb, void *opaque)
4573 {
4574 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4575
4576 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4577 BDRV_REQ_ZERO_WRITE | flags,
4578 cb, opaque, true);
4579 }
4580
4581
4582 typedef struct MultiwriteCB {
4583 int error;
4584 int num_requests;
4585 int num_callbacks;
4586 struct {
4587 BlockCompletionFunc *cb;
4588 void *opaque;
4589 QEMUIOVector *free_qiov;
4590 } callbacks[];
4591 } MultiwriteCB;
4592
4593 static void multiwrite_user_cb(MultiwriteCB *mcb)
4594 {
4595 int i;
4596
4597 for (i = 0; i < mcb->num_callbacks; i++) {
4598 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4599 if (mcb->callbacks[i].free_qiov) {
4600 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4601 }
4602 g_free(mcb->callbacks[i].free_qiov);
4603 }
4604 }
4605
4606 static void multiwrite_cb(void *opaque, int ret)
4607 {
4608 MultiwriteCB *mcb = opaque;
4609
4610 trace_multiwrite_cb(mcb, ret);
4611
4612 if (ret < 0 && !mcb->error) {
4613 mcb->error = ret;
4614 }
4615
4616 mcb->num_requests--;
4617 if (mcb->num_requests == 0) {
4618 multiwrite_user_cb(mcb);
4619 g_free(mcb);
4620 }
4621 }
4622
4623 static int multiwrite_req_compare(const void *a, const void *b)
4624 {
4625 const BlockRequest *req1 = a, *req2 = b;
4626
4627 /*
4628 * Note that we can't simply subtract req2->sector from req1->sector
4629 * here as that could overflow the return value.
4630 */
4631 if (req1->sector > req2->sector) {
4632 return 1;
4633 } else if (req1->sector < req2->sector) {
4634 return -1;
4635 } else {
4636 return 0;
4637 }
4638 }
4639
4640 /*
4641 * Takes a bunch of requests and tries to merge them. Returns the number of
4642 * requests that remain after merging.
4643 */
4644 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4645 int num_reqs, MultiwriteCB *mcb)
4646 {
4647 int i, outidx;
4648
4649 // Sort requests by start sector
4650 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4651
4652 // Check if adjacent requests touch the same clusters. If so, combine them,
4653 // filling up gaps with zero sectors.
4654 outidx = 0;
4655 for (i = 1; i < num_reqs; i++) {
4656 int merge = 0;
4657 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4658
4659 // Handle exactly sequential writes and overlapping writes.
4660 if (reqs[i].sector <= oldreq_last) {
4661 merge = 1;
4662 }
4663
4664 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4665 merge = 0;
4666 }
4667
4668 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4669 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4670 merge = 0;
4671 }
4672
4673 if (merge) {
4674 size_t size;
4675 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4676 qemu_iovec_init(qiov,
4677 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4678
4679 // Add the first request to the merged one. If the requests are
4680 // overlapping, drop the last sectors of the first request.
4681 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4682 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4683
4684 // We shouldn't need to add any zeros between the two requests
4685 assert(reqs[i].sector <= oldreq_last);
4686
4687 // Add the second request
4688 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4689
4690 // Add tail of first request, if necessary
4691 if (qiov->size < reqs[outidx].qiov->size) {
4692 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4693 reqs[outidx].qiov->size - qiov->size);
4694 }
4695
4696 reqs[outidx].nb_sectors = qiov->size >> 9;
4697 reqs[outidx].qiov = qiov;
4698
4699 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4700 } else {
4701 outidx++;
4702 reqs[outidx].sector = reqs[i].sector;
4703 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4704 reqs[outidx].qiov = reqs[i].qiov;
4705 }
4706 }
4707
4708 block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
4709
4710 return outidx + 1;
4711 }
4712
4713 /*
4714 * Submit multiple AIO write requests at once.
4715 *
4716 * On success, the function returns 0 and all requests in the reqs array have
4717 * been submitted. In the error case this function returns -1, and any of the
4718 * requests may or may not have been submitted yet. In particular, this means
4719 * that the callback will be called for some of the requests and not for
4720 * others. The caller must check the error field of the BlockRequest to wait
4721 * for the right callbacks (if error != 0, no callback will be called).
4722 *
4723 * The implementation may modify the contents of the reqs array, e.g. to merge
4724 * requests. However, the fields opaque and error are left unmodified as they
4725 * are used to signal failure for a single request to the caller.
4726 */
4727 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4728 {
4729 MultiwriteCB *mcb;
4730 int i;
4731
4732 /* don't submit writes if we don't have a medium */
4733 if (bs->drv == NULL) {
4734 for (i = 0; i < num_reqs; i++) {
4735 reqs[i].error = -ENOMEDIUM;
4736 }
4737 return -1;
4738 }
4739
4740 if (num_reqs == 0) {
4741 return 0;
4742 }
4743
4744 // Create MultiwriteCB structure
4745 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4746 mcb->num_requests = 0;
4747 mcb->num_callbacks = num_reqs;
4748
4749 for (i = 0; i < num_reqs; i++) {
4750 mcb->callbacks[i].cb = reqs[i].cb;
4751 mcb->callbacks[i].opaque = reqs[i].opaque;
4752 }
4753
4754 // Check for mergeable requests
4755 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4756
4757 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4758
4759 /* Run the aio requests. */
4760 mcb->num_requests = num_reqs;
4761 for (i = 0; i < num_reqs; i++) {
4762 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4763 reqs[i].nb_sectors, reqs[i].flags,
4764 multiwrite_cb, mcb,
4765 true);
4766 }
4767
4768 return 0;
4769 }
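
/*
 * Hedged sketch of batching two writes through bdrv_aio_multiwrite();
 * the example_* names are illustrative and the two ranges are assumed
 * not to overlap.
 */
static void example_multiwrite_cb(void *opaque, int ret)
{
    /* ret < 0 signals failure of the (possibly merged) request */
}

static int example_submit_pair(BlockDriverState *bs,
                               int64_t sector0, QEMUIOVector *qiov0,
                               int64_t sector1, QEMUIOVector *qiov1)
{
    BlockRequest reqs[] = {
        {
            .sector     = sector0,
            .nb_sectors = qiov0->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov0,
            .cb         = example_multiwrite_cb,
        },
        {
            .sector     = sector1,
            .nb_sectors = qiov1->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov1,
            .cb         = example_multiwrite_cb,
        },
    };

    return bdrv_aio_multiwrite(bs, reqs, ARRAY_SIZE(reqs));
}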
4770
4771 void bdrv_aio_cancel(BlockAIOCB *acb)
4772 {
4773 qemu_aio_ref(acb);
4774 bdrv_aio_cancel_async(acb);
4775 while (acb->refcnt > 1) {
4776 if (acb->aiocb_info->get_aio_context) {
4777 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4778 } else if (acb->bs) {
4779 aio_poll(bdrv_get_aio_context(acb->bs), true);
4780 } else {
4781 abort();
4782 }
4783 }
4784 qemu_aio_unref(acb);
4785 }
4786
4787 /* Async version of aio cancel. The caller is not blocked if the acb implements
4788 * cancel_async; otherwise we do nothing and let the request complete normally.
4789 * In either case the completion callback must be called. */
4790 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4791 {
4792 if (acb->aiocb_info->cancel_async) {
4793 acb->aiocb_info->cancel_async(acb);
4794 }
4795 }
4796
4797 /**************************************************************/
4798 /* async block device emulation */
4799
4800 typedef struct BlockAIOCBSync {
4801 BlockAIOCB common;
4802 QEMUBH *bh;
4803 int ret;
4804 /* vector translation state */
4805 QEMUIOVector *qiov;
4806 uint8_t *bounce;
4807 int is_write;
4808 } BlockAIOCBSync;
4809
4810 static const AIOCBInfo bdrv_em_aiocb_info = {
4811 .aiocb_size = sizeof(BlockAIOCBSync),
4812 };
4813
4814 static void bdrv_aio_bh_cb(void *opaque)
4815 {
4816 BlockAIOCBSync *acb = opaque;
4817
4818 if (!acb->is_write && acb->ret >= 0) {
4819 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4820 }
4821 qemu_vfree(acb->bounce);
4822 acb->common.cb(acb->common.opaque, acb->ret);
4823 qemu_bh_delete(acb->bh);
4824 acb->bh = NULL;
4825 qemu_aio_unref(acb);
4826 }
4827
4828 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4829 int64_t sector_num,
4830 QEMUIOVector *qiov,
4831 int nb_sectors,
4832 BlockCompletionFunc *cb,
4833 void *opaque,
4834 int is_write)
4835
4836 {
4837 BlockAIOCBSync *acb;
4838
4839 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4840 acb->is_write = is_write;
4841 acb->qiov = qiov;
4842 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4843 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4844
4845 if (acb->bounce == NULL) {
4846 acb->ret = -ENOMEM;
4847 } else if (is_write) {
4848 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4849 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4850 } else {
4851 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4852 }
4853
4854 qemu_bh_schedule(acb->bh);
4855
4856 return &acb->common;
4857 }
4858
4859 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4860 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4861 BlockCompletionFunc *cb, void *opaque)
4862 {
4863 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4864 }
4865
4866 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4867 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4868 BlockCompletionFunc *cb, void *opaque)
4869 {
4870 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4871 }
4872
4873
4874 typedef struct BlockAIOCBCoroutine {
4875 BlockAIOCB common;
4876 BlockRequest req;
4877 bool is_write;
4878 bool need_bh;
4879 bool *done;
4880 QEMUBH* bh;
4881 } BlockAIOCBCoroutine;
4882
4883 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4884 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4885 };
4886
4887 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
4888 {
4889 if (!acb->need_bh) {
4890 acb->common.cb(acb->common.opaque, acb->req.error);
4891 qemu_aio_unref(acb);
4892 }
4893 }
4894
4895 static void bdrv_co_em_bh(void *opaque)
4896 {
4897 BlockAIOCBCoroutine *acb = opaque;
4898
4899 assert(!acb->need_bh);
4900 qemu_bh_delete(acb->bh);
4901 bdrv_co_complete(acb);
4902 }
4903
4904 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
4905 {
4906 acb->need_bh = false;
4907 if (acb->req.error != -EINPROGRESS) {
4908 BlockDriverState *bs = acb->common.bs;
4909
4910 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4911 qemu_bh_schedule(acb->bh);
4912 }
4913 }
4914
4915 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4916 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4917 {
4918 BlockAIOCBCoroutine *acb = opaque;
4919 BlockDriverState *bs = acb->common.bs;
4920
4921 if (!acb->is_write) {
4922 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4923 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4924 } else {
4925 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4926 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4927 }
4928
4929 bdrv_co_complete(acb);
4930 }
4931
4932 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4933 int64_t sector_num,
4934 QEMUIOVector *qiov,
4935 int nb_sectors,
4936 BdrvRequestFlags flags,
4937 BlockCompletionFunc *cb,
4938 void *opaque,
4939 bool is_write)
4940 {
4941 Coroutine *co;
4942 BlockAIOCBCoroutine *acb;
4943
4944 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4945 acb->need_bh = true;
4946 acb->req.error = -EINPROGRESS;
4947 acb->req.sector = sector_num;
4948 acb->req.nb_sectors = nb_sectors;
4949 acb->req.qiov = qiov;
4950 acb->req.flags = flags;
4951 acb->is_write = is_write;
4952
4953 co = qemu_coroutine_create(bdrv_co_do_rw);
4954 qemu_coroutine_enter(co, acb);
4955
4956 bdrv_co_maybe_schedule_bh(acb);
4957 return &acb->common;
4958 }
4959
4960 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4961 {
4962 BlockAIOCBCoroutine *acb = opaque;
4963 BlockDriverState *bs = acb->common.bs;
4964
4965 acb->req.error = bdrv_co_flush(bs);
4966 bdrv_co_complete(acb);
4967 }
4968
4969 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4970 BlockCompletionFunc *cb, void *opaque)
4971 {
4972 trace_bdrv_aio_flush(bs, opaque);
4973
4974 Coroutine *co;
4975 BlockAIOCBCoroutine *acb;
4976
4977 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4978 acb->need_bh = true;
4979 acb->req.error = -EINPROGRESS;
4980
4981 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4982 qemu_coroutine_enter(co, acb);
4983
4984 bdrv_co_maybe_schedule_bh(acb);
4985 return &acb->common;
4986 }
4987
4988 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4989 {
4990 BlockAIOCBCoroutine *acb = opaque;
4991 BlockDriverState *bs = acb->common.bs;
4992
4993 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4994 bdrv_co_complete(acb);
4995 }
4996
4997 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4998 int64_t sector_num, int nb_sectors,
4999 BlockCompletionFunc *cb, void *opaque)
5000 {
5001 Coroutine *co;
5002 BlockAIOCBCoroutine *acb;
5003
5004 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
5005
5006 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
5007 acb->need_bh = true;
5008 acb->req.error = -EINPROGRESS;
5009 acb->req.sector = sector_num;
5010 acb->req.nb_sectors = nb_sectors;
5011 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
5012 qemu_coroutine_enter(co, acb);
5013
5014 bdrv_co_maybe_schedule_bh(acb);
5015 return &acb->common;
5016 }
5017
5018 void bdrv_init(void)
5019 {
5020 module_call_init(MODULE_INIT_BLOCK);
5021 }
5022
5023 void bdrv_init_with_whitelist(void)
5024 {
5025 use_bdrv_whitelist = 1;
5026 bdrv_init();
5027 }
5028
5029 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
5030 BlockCompletionFunc *cb, void *opaque)
5031 {
5032 BlockAIOCB *acb;
5033
5034 acb = g_slice_alloc(aiocb_info->aiocb_size);
5035 acb->aiocb_info = aiocb_info;
5036 acb->bs = bs;
5037 acb->cb = cb;
5038 acb->opaque = opaque;
5039 acb->refcnt = 1;
5040 return acb;
5041 }
5042
5043 void qemu_aio_ref(void *p)
5044 {
5045 BlockAIOCB *acb = p;
5046 acb->refcnt++;
5047 }
5048
5049 void qemu_aio_unref(void *p)
5050 {
5051 BlockAIOCB *acb = p;
5052 assert(acb->refcnt > 0);
5053 if (--acb->refcnt == 0) {
5054 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
5055 }
5056 }
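
/*
 * Sketch of a driver-private AIOCB built on qemu_aio_get(); the
 * ExampleAIOCB type and example_aio_start() are illustrative, showing
 * only the allocation side of the refcounted AIOCB protocol.
 */
typedef struct ExampleAIOCB {
    BlockAIOCB common;
    int ret;
} ExampleAIOCB;

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(ExampleAIOCB),
};

static BlockAIOCB *example_aio_start(BlockDriverState *bs,
                                     BlockCompletionFunc *cb, void *opaque)
{
    ExampleAIOCB *acb = qemu_aio_get(&example_aiocb_info, bs, cb, opaque);

    acb->ret = 0;
    /* the completion path would invoke cb() and then qemu_aio_unref() */
    return &acb->common;
}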
5057
5058 /**************************************************************/
5059 /* Coroutine block device emulation */
5060
5061 typedef struct CoroutineIOCompletion {
5062 Coroutine *coroutine;
5063 int ret;
5064 } CoroutineIOCompletion;
5065
5066 static void bdrv_co_io_em_complete(void *opaque, int ret)
5067 {
5068 CoroutineIOCompletion *co = opaque;
5069
5070 co->ret = ret;
5071 qemu_coroutine_enter(co->coroutine, NULL);
5072 }
5073
5074 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
5075 int nb_sectors, QEMUIOVector *iov,
5076 bool is_write)
5077 {
5078 CoroutineIOCompletion co = {
5079 .coroutine = qemu_coroutine_self(),
5080 };
5081 BlockAIOCB *acb;
5082
5083 if (is_write) {
5084 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
5085 bdrv_co_io_em_complete, &co);
5086 } else {
5087 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
5088 bdrv_co_io_em_complete, &co);
5089 }
5090
5091 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
5092 if (!acb) {
5093 return -EIO;
5094 }
5095 qemu_coroutine_yield();
5096
5097 return co.ret;
5098 }
5099
5100 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
5101 int64_t sector_num, int nb_sectors,
5102 QEMUIOVector *iov)
5103 {
5104 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
5105 }
5106
5107 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
5108 int64_t sector_num, int nb_sectors,
5109 QEMUIOVector *iov)
5110 {
5111 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
5112 }
5113
5114 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
5115 {
5116 RwCo *rwco = opaque;
5117
5118 rwco->ret = bdrv_co_flush(rwco->bs);
5119 }
5120
5121 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
5122 {
5123 int ret;
5124
5125 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
5126 return 0;
5127 }
5128
5129 /* Write back cached data to the OS even with cache=unsafe */
5130 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
5131 if (bs->drv->bdrv_co_flush_to_os) {
5132 ret = bs->drv->bdrv_co_flush_to_os(bs);
5133 if (ret < 0) {
5134 return ret;
5135 }
5136 }
5137
5138 /* But don't actually force it to the disk with cache=unsafe */
5139 if (bs->open_flags & BDRV_O_NO_FLUSH) {
5140 goto flush_parent;
5141 }
5142
5143 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
5144 if (bs->drv->bdrv_co_flush_to_disk) {
5145 ret = bs->drv->bdrv_co_flush_to_disk(bs);
5146 } else if (bs->drv->bdrv_aio_flush) {
5147 BlockAIOCB *acb;
5148 CoroutineIOCompletion co = {
5149 .coroutine = qemu_coroutine_self(),
5150 };
5151
5152 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
5153 if (acb == NULL) {
5154 ret = -EIO;
5155 } else {
5156 qemu_coroutine_yield();
5157 ret = co.ret;
5158 }
5159 } else {
5160 /*
5161 * Some block drivers always operate in either writethrough or unsafe
5162 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
5163 * know how the server works (because the behaviour is hardcoded or
5164 * depends on server-side configuration), so we can't ensure that
5165 * everything is safe on disk. Returning an error doesn't work because
5166 * that would break guests even if the server operates in writethrough
5167 * mode.
5168 *
5169 * Let's hope the user knows what he's doing.
5170 */
5171 ret = 0;
5172 }
5173 if (ret < 0) {
5174 return ret;
5175 }
5176
5177 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
5178 * in the case of cache=unsafe, so there are no useless flushes.
5179 */
5180 flush_parent:
5181 return bdrv_co_flush(bs->file);
5182 }
5183
5184 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
5185 {
5186 Error *local_err = NULL;
5187 int ret;
5188
5189 if (!bs->drv) {
5190 return;
5191 }
5192
5193 if (!(bs->open_flags & BDRV_O_INCOMING)) {
5194 return;
5195 }
5196 bs->open_flags &= ~BDRV_O_INCOMING;
5197
5198 if (bs->drv->bdrv_invalidate_cache) {
5199 bs->drv->bdrv_invalidate_cache(bs, &local_err);
5200 } else if (bs->file) {
5201 bdrv_invalidate_cache(bs->file, &local_err);
5202 }
5203 if (local_err) {
5204 error_propagate(errp, local_err);
5205 return;
5206 }
5207
5208 ret = refresh_total_sectors(bs, bs->total_sectors);
5209 if (ret < 0) {
5210 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5211 return;
5212 }
5213 }
5214
5215 void bdrv_invalidate_cache_all(Error **errp)
5216 {
5217 BlockDriverState *bs;
5218 Error *local_err = NULL;
5219
5220 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5221 AioContext *aio_context = bdrv_get_aio_context(bs);
5222
5223 aio_context_acquire(aio_context);
5224 bdrv_invalidate_cache(bs, &local_err);
5225 aio_context_release(aio_context);
5226 if (local_err) {
5227 error_propagate(errp, local_err);
5228 return;
5229 }
5230 }
5231 }
5232
5233 int bdrv_flush(BlockDriverState *bs)
5234 {
5235 Coroutine *co;
5236 RwCo rwco = {
5237 .bs = bs,
5238 .ret = NOT_DONE,
5239 };
5240
5241 if (qemu_in_coroutine()) {
5242 /* Fast-path if already in coroutine context */
5243 bdrv_flush_co_entry(&rwco);
5244 } else {
5245 AioContext *aio_context = bdrv_get_aio_context(bs);
5246
5247 co = qemu_coroutine_create(bdrv_flush_co_entry);
5248 qemu_coroutine_enter(co, &rwco);
5249 while (rwco.ret == NOT_DONE) {
5250 aio_poll(aio_context, true);
5251 }
5252 }
5253
5254 return rwco.ret;
5255 }
5256
5257 typedef struct DiscardCo {
5258 BlockDriverState *bs;
5259 int64_t sector_num;
5260 int nb_sectors;
5261 int ret;
5262 } DiscardCo;
5263 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5264 {
5265 DiscardCo *rwco = opaque;
5266
5267 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5268 }
5269
5270 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5271 int nb_sectors)
5272 {
5273 int max_discard, ret;
5274
5275 if (!bs->drv) {
5276 return -ENOMEDIUM;
5277 }
5278
5279 ret = bdrv_check_request(bs, sector_num, nb_sectors);
5280 if (ret < 0) {
5281 return ret;
5282 } else if (bs->read_only) {
5283 return -EROFS;
5284 }
5285
5286 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5287
5288 /* Do nothing if disabled. */
5289 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5290 return 0;
5291 }
5292
5293 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5294 return 0;
5295 }
5296
5297 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
5298 while (nb_sectors > 0) {
5299 int ret;
5300 int num = nb_sectors;
5301
5302 /* align request */
5303 if (bs->bl.discard_alignment &&
5304 num >= bs->bl.discard_alignment &&
5305 sector_num % bs->bl.discard_alignment) {
5306 if (num > bs->bl.discard_alignment) {
5307 num = bs->bl.discard_alignment;
5308 }
5309 num -= sector_num % bs->bl.discard_alignment;
5310 }
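        /*
         * Worked example (illustrative): with discard_alignment == 8
         * and sector_num == 5, num is first capped to 8 and then
         * reduced by sector_num % 8 == 5, so this pass discards 3
         * sectors and the next iteration starts aligned at sector 8.
         */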
5311
5312 /* limit request size */
5313 if (num > max_discard) {
5314 num = max_discard;
5315 }
5316
5317 if (bs->drv->bdrv_co_discard) {
5318 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5319 } else {
5320 BlockAIOCB *acb;
5321 CoroutineIOCompletion co = {
5322 .coroutine = qemu_coroutine_self(),
5323 };
5324
5325 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5326 bdrv_co_io_em_complete, &co);
5327 if (acb == NULL) {
5328 return -EIO;
5329 } else {
5330 qemu_coroutine_yield();
5331 ret = co.ret;
5332 }
5333 }
5334 if (ret && ret != -ENOTSUP) {
5335 return ret;
5336 }
5337
5338 sector_num += num;
5339 nb_sectors -= num;
5340 }
5341 return 0;
5342 }
5343
5344 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5345 {
5346 Coroutine *co;
5347 DiscardCo rwco = {
5348 .bs = bs,
5349 .sector_num = sector_num,
5350 .nb_sectors = nb_sectors,
5351 .ret = NOT_DONE,
5352 };
5353
5354 if (qemu_in_coroutine()) {
5355 /* Fast-path if already in coroutine context */
5356 bdrv_discard_co_entry(&rwco);
5357 } else {
5358 AioContext *aio_context = bdrv_get_aio_context(bs);
5359
5360 co = qemu_coroutine_create(bdrv_discard_co_entry);
5361 qemu_coroutine_enter(co, &rwco);
5362 while (rwco.ret == NOT_DONE) {
5363 aio_poll(aio_context, true);
5364 }
5365 }
5366
5367 return rwco.ret;
5368 }
5369
5370 /**************************************************************/
5371 /* removable device support */
5372
5373 /**
5374 * Return TRUE if the media is present
5375 */
5376 int bdrv_is_inserted(BlockDriverState *bs)
5377 {
5378 BlockDriver *drv = bs->drv;
5379
5380 if (!drv)
5381 return 0;
5382 if (!drv->bdrv_is_inserted)
5383 return 1;
5384 return drv->bdrv_is_inserted(bs);
5385 }
5386
5387 /**
5388 * Return whether the media changed since the last call to this
5389 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5390 */
5391 int bdrv_media_changed(BlockDriverState *bs)
5392 {
5393 BlockDriver *drv = bs->drv;
5394
5395 if (drv && drv->bdrv_media_changed) {
5396 return drv->bdrv_media_changed(bs);
5397 }
5398 return -ENOTSUP;
5399 }
5400
5401 /**
5402 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5403 */
5404 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5405 {
5406 BlockDriver *drv = bs->drv;
5407 const char *device_name;
5408
5409 if (drv && drv->bdrv_eject) {
5410 drv->bdrv_eject(bs, eject_flag);
5411 }
5412
5413 device_name = bdrv_get_device_name(bs);
5414 if (device_name[0] != '\0') {
5415 qapi_event_send_device_tray_moved(device_name,
5416 eject_flag, &error_abort);
5417 }
5418 }
5419
5420 /**
5421 * Lock or unlock the media (if it is locked, the user won't be able
5422 * to eject it manually).
5423 */
5424 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5425 {
5426 BlockDriver *drv = bs->drv;
5427
5428 trace_bdrv_lock_medium(bs, locked);
5429
5430 if (drv && drv->bdrv_lock_medium) {
5431 drv->bdrv_lock_medium(bs, locked);
5432 }
5433 }
5434
5435 /* needed for generic scsi interface */
5436
5437 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5438 {
5439 BlockDriver *drv = bs->drv;
5440
5441 if (drv && drv->bdrv_ioctl)
5442 return drv->bdrv_ioctl(bs, req, buf);
5443 return -ENOTSUP;
5444 }
5445
5446 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5447 unsigned long int req, void *buf,
5448 BlockCompletionFunc *cb, void *opaque)
5449 {
5450 BlockDriver *drv = bs->drv;
5451
5452 if (drv && drv->bdrv_aio_ioctl)
5453 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5454 return NULL;
5455 }
5456
5457 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5458 {
5459 bs->guest_block_size = align;
5460 }
5461
5462 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5463 {
5464 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5465 }
5466
5467 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5468 {
5469 return memset(qemu_blockalign(bs, size), 0, size);
5470 }
5471
5472 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5473 {
5474 size_t align = bdrv_opt_mem_align(bs);
5475
5476 /* Ensure that NULL is never returned on success */
5477 assert(align > 0);
5478 if (size == 0) {
5479 size = align;
5480 }
5481
5482 return qemu_try_memalign(align, size);
5483 }
5484
5485 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5486 {
5487 void *mem = qemu_try_blockalign(bs, size);
5488
5489 if (mem) {
5490 memset(mem, 0, size);
5491 }
5492
5493 return mem;
5494 }
5495
5496 /*
5497 * Check if all memory in this vector meets the alignment from bdrv_opt_mem_align().
5498 */
5499 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5500 {
5501 int i;
5502 size_t alignment = bdrv_opt_mem_align(bs);
5503
5504 for (i = 0; i < qiov->niov; i++) {
5505 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5506 return false;
5507 }
5508 if (qiov->iov[i].iov_len % alignment) {
5509 return false;
5510 }
5511 }
5512
5513 return true;
5514 }
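
/*
 * Sketch: allocate a driver-aligned bounce buffer only when a vector is
 * not already suitably aligned. Illustrative only; a real caller would
 * copy data through the buffer and release it with qemu_vfree().
 */
static void *example_bounce_if_needed(BlockDriverState *bs,
                                      QEMUIOVector *qiov)
{
    if (bdrv_qiov_is_aligned(bs, qiov)) {
        return NULL; /* the I/O can use the vector directly */
    }

    return qemu_try_blockalign(bs, qiov->size);
}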
5515
5516 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
5517 {
5518 BdrvDirtyBitmap *bm;
5519
5520 assert(name);
5521 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5522 if (bm->name && !strcmp(name, bm->name)) {
5523 return bm;
5524 }
5525 }
5526 return NULL;
5527 }
5528
5529 void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
5530 {
5531 assert(!bdrv_dirty_bitmap_frozen(bitmap));
5532 g_free(bitmap->name);
5533 bitmap->name = NULL;
5534 }
5535
5536 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
5537 uint32_t granularity,
5538 const char *name,
5539 Error **errp)
5540 {
5541 int64_t bitmap_size;
5542 BdrvDirtyBitmap *bitmap;
5543 uint32_t sector_granularity;
5544
5545 assert((granularity & (granularity - 1)) == 0);
5546
5547 if (name && bdrv_find_dirty_bitmap(bs, name)) {
5548 error_setg(errp, "Bitmap already exists: %s", name);
5549 return NULL;
5550 }
5551 sector_granularity = granularity >> BDRV_SECTOR_BITS;
5552 assert(sector_granularity);
5553 bitmap_size = bdrv_nb_sectors(bs);
5554 if (bitmap_size < 0) {
5555 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5556 errno = -bitmap_size;
5557 return NULL;
5558 }
5559 bitmap = g_new0(BdrvDirtyBitmap, 1);
5560 bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
5561 bitmap->size = bitmap_size;
5562 bitmap->name = g_strdup(name);
5563 bitmap->disabled = false;
5564 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5565 return bitmap;
5566 }
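
/*
 * Minimal sketch: create a named bitmap at the default granularity.
 * The name "bitmap0" is illustrative; passing NULL instead would create
 * an anonymous bitmap.
 */
static BdrvDirtyBitmap *example_new_bitmap(BlockDriverState *bs, Error **errp)
{
    uint32_t granularity = bdrv_get_default_bitmap_granularity(bs);

    return bdrv_create_dirty_bitmap(bs, granularity, "bitmap0", errp);
}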
5567
5568 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
5569 {
5570 return bitmap->successor;
5571 }
5572
5573 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
5574 {
5575 return !(bitmap->disabled || bitmap->successor);
5576 }
5577
5578 /**
5579 * Create a successor bitmap destined to replace this bitmap after an operation.
5580 * Requires that the bitmap is not frozen and has no successor.
5581 */
5582 int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
5583 BdrvDirtyBitmap *bitmap, Error **errp)
5584 {
5585 uint64_t granularity;
5586 BdrvDirtyBitmap *child;
5587
5588 if (bdrv_dirty_bitmap_frozen(bitmap)) {
5589 error_setg(errp, "Cannot create a successor for a bitmap that is "
5590 "currently frozen");
5591 return -1;
5592 }
5593 assert(!bitmap->successor);
5594
5595 /* Create an anonymous successor */
5596 granularity = bdrv_dirty_bitmap_granularity(bitmap);
5597 child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
5598 if (!child) {
5599 return -1;
5600 }
5601
5602 /* Successor will be on or off based on our current state. */
5603 child->disabled = bitmap->disabled;
5604
5605 /* Install the successor and freeze the parent */
5606 bitmap->successor = child;
5607 return 0;
5608 }
5609
5610 /**
5611 * For a bitmap with a successor, yield our name to the successor,
5612 * delete the old bitmap, and return a handle to the new bitmap.
5613 */
5614 BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
5615 BdrvDirtyBitmap *bitmap,
5616 Error **errp)
5617 {
5618 char *name;
5619 BdrvDirtyBitmap *successor = bitmap->successor;
5620
5621 if (successor == NULL) {
5622 error_setg(errp, "Cannot relinquish control if "
5623 "there's no successor present");
5624 return NULL;
5625 }
5626
5627 name = bitmap->name;
5628 bitmap->name = NULL;
5629 successor->name = name;
5630 bitmap->successor = NULL;
5631 bdrv_release_dirty_bitmap(bs, bitmap);
5632
5633 return successor;
5634 }
5635
5636 /**
5637 * In cases of failure where we can no longer safely delete the parent,
5638 * we may wish to re-join the parent and child/successor.
5639 * The merged parent will be un-frozen, but not explicitly re-enabled.
5640 */
5641 BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
5642 BdrvDirtyBitmap *parent,
5643 Error **errp)
5644 {
5645 BdrvDirtyBitmap *successor = parent->successor;
5646
5647 if (!successor) {
5648 error_setg(errp, "Cannot reclaim a successor when none is present");
5649 return NULL;
5650 }
5651
5652 if (!hbitmap_merge(parent->bitmap, successor->bitmap)) {
5653 error_setg(errp, "Merging of parent and successor bitmap failed");
5654 return NULL;
5655 }
5656 bdrv_release_dirty_bitmap(bs, successor);
5657 parent->successor = NULL;
5658
5659 return parent;
5660 }
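
/*
 * Hedged sketch of the freeze/abdicate/reclaim cycle around a
 * backup-style operation. example_run_backup() is a hypothetical
 * placeholder for the real job; here it merely flushes the node.
 */
static int example_run_backup(BlockDriverState *bs)
{
    return bdrv_flush(bs); /* hypothetical stand-in for the real work */
}

static int example_backup_with_bitmap(BlockDriverState *bs,
                                      BdrvDirtyBitmap *bitmap, Error **errp)
{
    if (bdrv_dirty_bitmap_create_successor(bs, bitmap, errp) < 0) {
        return -1;
    }

    if (example_run_backup(bs) == 0) {
        /* Success: the successor keeps the name and the writes made
         * while the parent was frozen. */
        bdrv_dirty_bitmap_abdicate(bs, bitmap, errp);
    } else {
        /* Failure: merge the successor back in and unfreeze the parent */
        bdrv_reclaim_dirty_bitmap(bs, bitmap, errp);
    }
    return 0;
}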
5661
5662 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5663 {
5664 BdrvDirtyBitmap *bm, *next;
5665 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5666 if (bm == bitmap) {
5667 assert(!bdrv_dirty_bitmap_frozen(bm));
5668 QLIST_REMOVE(bitmap, list);
5669 hbitmap_free(bitmap->bitmap);
5670 g_free(bitmap->name);
5671 g_free(bitmap);
5672 return;
5673 }
5674 }
5675 }
5676
5677 void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
5678 {
5679 assert(!bdrv_dirty_bitmap_frozen(bitmap));
5680 bitmap->disabled = true;
5681 }
5682
5683 void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
5684 {
5685 assert(!bdrv_dirty_bitmap_frozen(bitmap));
5686 bitmap->disabled = false;
5687 }
5688
5689 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5690 {
5691 BdrvDirtyBitmap *bm;
5692 BlockDirtyInfoList *list = NULL;
5693 BlockDirtyInfoList **plist = &list;
5694
5695 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5696 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5697 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5698 info->count = bdrv_get_dirty_count(bm);
5699 info->granularity = bdrv_dirty_bitmap_granularity(bm);
5700 info->has_name = !!bm->name;
5701 info->name = g_strdup(bm->name);
5702 info->frozen = bdrv_dirty_bitmap_frozen(bm);
5703 entry->value = info;
5704 *plist = entry;
5705 plist = &entry->next;
5706 }
5707
5708 return list;
5709 }
5710
5711 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5712 {
5713 if (bitmap) {
5714 return hbitmap_get(bitmap->bitmap, sector);
5715 } else {
5716 return 0;
5717 }
5718 }
5719
5720 /**
5721 * Chooses a default granularity based on the existing cluster size,
5722 * but clamped between [4K, 64K]. Defaults to 64K when no cluster size
5723 * information is available.
5724 */
5725 uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
5726 {
5727 BlockDriverInfo bdi;
5728 uint32_t granularity;
5729
5730 if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
5731 granularity = MAX(4096, bdi.cluster_size);
5732 granularity = MIN(65536, granularity);
5733 } else {
5734 granularity = 65536;
5735 }
5736
5737 return granularity;
5738 }
5739
5740 uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
5741 {
5742 return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
5743 }
5744
5745 void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5746 {
5747 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5748 }
5749
5750 void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
5751 int64_t cur_sector, int nr_sectors)
5752 {
5753 assert(bdrv_dirty_bitmap_enabled(bitmap));
5754 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5755 }
5756
5757 void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
5758 int64_t cur_sector, int nr_sectors)
5759 {
5760 assert(bdrv_dirty_bitmap_enabled(bitmap));
5761 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5762 }
5763
5764 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap)
5765 {
5766 assert(bdrv_dirty_bitmap_enabled(bitmap));
5767 hbitmap_reset(bitmap->bitmap, 0, bitmap->size);
5768 }
5769
5770 static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5771 int nr_sectors)
5772 {
5773 BdrvDirtyBitmap *bitmap;
5774 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5775 if (!bdrv_dirty_bitmap_enabled(bitmap)) {
5776 continue;
5777 }
5778 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5779 }
5780 }
5781
5782 static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
5783 int nr_sectors)
5784 {
5785 BdrvDirtyBitmap *bitmap;
5786 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5787 if (!bdrv_dirty_bitmap_enabled(bitmap)) {
5788 continue;
5789 }
5790 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5791 }
5792 }
5793
5794 /**
5795 * Advance an HBitmapIter to an arbitrary offset.
5796 */
5797 void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
5798 {
5799 assert(hbi->hb);
5800 hbitmap_iter_init(hbi, hbi->hb, offset);
5801 }
5802
5803 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
5804 {
5805 return hbitmap_count(bitmap->bitmap);
5806 }
5807
5808 /* Get a reference to bs */
5809 void bdrv_ref(BlockDriverState *bs)
5810 {
5811 bs->refcnt++;
5812 }
5813
5814 /* Release a previously grabbed reference to bs.
5815 * If, after releasing, the reference count drops to zero, the
5816 * BlockDriverState is deleted. */
5817 void bdrv_unref(BlockDriverState *bs)
5818 {
5819 if (!bs) {
5820 return;
5821 }
5822 assert(bs->refcnt > 0);
5823 if (--bs->refcnt == 0) {
5824 bdrv_delete(bs);
5825 }
5826 }
5827
5828 struct BdrvOpBlocker {
5829 Error *reason;
5830 QLIST_ENTRY(BdrvOpBlocker) list;
5831 };
5832
5833 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5834 {
5835 BdrvOpBlocker *blocker;
5836 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5837 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5838 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5839 if (errp) {
5840 error_setg(errp, "Node '%s' is busy: %s",
5841 bdrv_get_device_or_node_name(bs),
5842 error_get_pretty(blocker->reason));
5843 }
5844 return true;
5845 }
5846 return false;
5847 }
5848
5849 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5850 {
5851 BdrvOpBlocker *blocker;
5852 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5853
5854 blocker = g_new0(BdrvOpBlocker, 1);
5855 blocker->reason = reason;
5856 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5857 }
5858
5859 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5860 {
5861 BdrvOpBlocker *blocker, *next;
5862 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5863 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5864 if (blocker->reason == reason) {
5865 QLIST_REMOVE(blocker, list);
5866 g_free(blocker);
5867 }
5868 }
5869 }
5870
5871 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5872 {
5873 int i;
5874 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5875 bdrv_op_block(bs, i, reason);
5876 }
5877 }
5878
5879 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5880 {
5881 int i;
5882 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5883 bdrv_op_unblock(bs, i, reason);
5884 }
5885 }
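
/*
 * Sketch: guard a node for the duration of some work, using a single
 * Error object as the blocker token (blockers are matched by pointer).
 * example_guard_node() is illustrative and not part of the original file.
 */
static void example_guard_node(BlockDriverState *bs)
{
    Error *blocker = NULL;

    error_setg(&blocker, "Node is in use by an example job");
    bdrv_op_block_all(bs, blocker);

    /* ... perform the guarded work ... */

    bdrv_op_unblock_all(bs, blocker);
    error_free(blocker);
}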
5886
5887 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5888 {
5889 int i;
5890
5891 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5892 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5893 return false;
5894 }
5895 }
5896 return true;
5897 }
5898
5899 void bdrv_iostatus_enable(BlockDriverState *bs)
5900 {
5901 bs->iostatus_enabled = true;
5902 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5903 }
5904
5905 /* The I/O status is only enabled if the drive explicitly
5906 * enables it _and_ the VM is configured to stop on errors */
5907 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5908 {
5909 return (bs->iostatus_enabled &&
5910 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5911 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5912 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5913 }
5914
5915 void bdrv_iostatus_disable(BlockDriverState *bs)
5916 {
5917 bs->iostatus_enabled = false;
5918 }
5919
5920 void bdrv_iostatus_reset(BlockDriverState *bs)
5921 {
5922 if (bdrv_iostatus_is_enabled(bs)) {
5923 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5924 if (bs->job) {
5925 block_job_iostatus_reset(bs->job);
5926 }
5927 }
5928 }
5929
5930 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5931 {
5932 assert(bdrv_iostatus_is_enabled(bs));
5933 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5934 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5935 BLOCK_DEVICE_IO_STATUS_FAILED;
5936 }
5937 }
5938
5939 void bdrv_img_create(const char *filename, const char *fmt,
5940 const char *base_filename, const char *base_fmt,
5941 char *options, uint64_t img_size, int flags,
5942 Error **errp, bool quiet)
5943 {
5944 QemuOptsList *create_opts = NULL;
5945 QemuOpts *opts = NULL;
5946 const char *backing_fmt, *backing_file;
5947 int64_t size;
5948 BlockDriver *drv, *proto_drv;
5949 BlockDriver *backing_drv = NULL;
5950 Error *local_err = NULL;
5951 int ret = 0;
5952
5953 /* Find driver and parse its options */
5954 drv = bdrv_find_format(fmt);
5955 if (!drv) {
5956 error_setg(errp, "Unknown file format '%s'", fmt);
5957 return;
5958 }
5959
5960 proto_drv = bdrv_find_protocol(filename, true, errp);
5961 if (!proto_drv) {
5962 return;
5963 }
5964
5965 if (!drv->create_opts) {
5966 error_setg(errp, "Format driver '%s' does not support image creation",
5967 drv->format_name);
5968 return;
5969 }
5970
5971 if (!proto_drv->create_opts) {
5972 error_setg(errp, "Protocol driver '%s' does not support image creation",
5973 proto_drv->format_name);
5974 return;
5975 }
5976
5977 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5978 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5979
5980 /* Create parameter list with default values */
5981 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5982 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
5983
5984 /* Parse -o options */
5985 if (options) {
5986 qemu_opts_do_parse(opts, options, NULL, &local_err);
5987 if (local_err) {
5988 error_report_err(local_err);
5989 local_err = NULL;
5990 error_setg(errp, "Invalid options for file format '%s'", fmt);
5991 goto out;
5992 }
5993 }
5994
5995 if (base_filename) {
5996 qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
5997 if (local_err) {
5998 error_setg(errp, "Backing file not supported for file format '%s'",
5999 fmt);
6000 goto out;
6001 }
6002 }
6003
6004 if (base_fmt) {
6005 qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
6006 if (local_err) {
6007 error_setg(errp, "Backing file format not supported for file "
6008 "format '%s'", fmt);
6009 goto out;
6010 }
6011 }
6012
6013 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
6014 if (backing_file) {
6015 if (!strcmp(filename, backing_file)) {
6016             error_setg(errp, "Trying to create an image with the "
6017                         "same filename as the backing file");
6018 goto out;
6019 }
6020 }
6021
6022 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
6023 if (backing_fmt) {
6024 backing_drv = bdrv_find_format(backing_fmt);
6025 if (!backing_drv) {
6026 error_setg(errp, "Unknown backing file format '%s'",
6027 backing_fmt);
6028 goto out;
6029 }
6030 }
6031
6032     /* The size for the image must always be specified, with one exception:
6033      * if we are using a backing file, we can obtain the size from there. */
6034 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
6035 if (size == -1) {
6036 if (backing_file) {
6037 BlockDriverState *bs;
6038 char *full_backing = g_new0(char, PATH_MAX);
6039             int64_t backing_size;
6040 int back_flags;
6041
6042 bdrv_get_full_backing_filename_from_filename(filename, backing_file,
6043 full_backing, PATH_MAX,
6044 &local_err);
6045 if (local_err) {
6046 g_free(full_backing);
6047 goto out;
6048 }
6049
6050             /* backing files are always opened read-only */
6051 back_flags =
6052 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
6053
6054 bs = NULL;
6055 ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
6056 backing_drv, &local_err);
6057 g_free(full_backing);
6058 if (ret < 0) {
6059 goto out;
6060 }
6061             backing_size = bdrv_getlength(bs);
6062             if (backing_size < 0) {
6063                 error_setg_errno(errp, -backing_size,
6064                                  "Could not get size of '%s'", backing_file);
6065 bdrv_unref(bs);
6066 goto out;
6067 }
6068
6069             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, backing_size,
6070                                 &error_abort);
6070
6071 bdrv_unref(bs);
6072 } else {
6073 error_setg(errp, "Image creation needs a size parameter");
6074 goto out;
6075 }
6076 }
6077
6078 if (!quiet) {
6079 printf("Formatting '%s', fmt=%s", filename, fmt);
6080 qemu_opts_print(opts, " ");
6081 puts("");
6082 }
6083
6084 ret = bdrv_create(drv, filename, opts, &local_err);
6085
6086 if (ret == -EFBIG) {
6086         /* This is generally a better message than whatever the driver would
6087          * deliver (especially with the cluster_size_hint appended), since the
6088          * driver's message is most probably just a variant of "image too
6089          * large". */
6090 const char *cluster_size_hint = "";
6091 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
6092 cluster_size_hint = " (try using a larger cluster size)";
6093 }
6094 error_setg(errp, "The image size is too large for file format '%s'"
6095 "%s", fmt, cluster_size_hint);
6096 error_free(local_err);
6097 local_err = NULL;
6098 }
6099
6100 out:
6101 qemu_opts_del(opts);
6102 qemu_opts_free(create_opts);
6103 if (local_err) {
6104 error_propagate(errp, local_err);
6105 }
6106 }
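
/* A minimal caller sketch (hypothetical; the file names and the example_*
 * identifier are assumptions): creating a qcow2 overlay whose size is
 * taken from its backing file, which is why (uint64_t)-1 is passed as
 * img_size and read back above as size == -1. */
static void example_create_overlay(Error **errp)
{
    bdrv_img_create("overlay.qcow2", "qcow2",
                    "base.qcow2", "qcow2",  /* backing file and its format */
                    NULL,                   /* no extra -o option string */
                    (uint64_t)-1,           /* size comes from backing file */
                    0,                      /* flags */
                    errp, false);           /* not quiet */
}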
6107
6108 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
6109 {
6110 return bs->aio_context;
6111 }
6112
6113 void bdrv_detach_aio_context(BlockDriverState *bs)
6114 {
6115 BdrvAioNotifier *baf;
6116
6117 if (!bs->drv) {
6118 return;
6119 }
6120
6121 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
6122 baf->detach_aio_context(baf->opaque);
6123 }
6124
6125 if (bs->io_limits_enabled) {
6126 throttle_detach_aio_context(&bs->throttle_state);
6127 }
6128 if (bs->drv->bdrv_detach_aio_context) {
6129 bs->drv->bdrv_detach_aio_context(bs);
6130 }
6131 if (bs->file) {
6132 bdrv_detach_aio_context(bs->file);
6133 }
6134 if (bs->backing_hd) {
6135 bdrv_detach_aio_context(bs->backing_hd);
6136 }
6137
6138 bs->aio_context = NULL;
6139 }
6140
6141 void bdrv_attach_aio_context(BlockDriverState *bs,
6142 AioContext *new_context)
6143 {
6144 BdrvAioNotifier *ban;
6145
6146 if (!bs->drv) {
6147 return;
6148 }
6149
6150 bs->aio_context = new_context;
6151
6152 if (bs->backing_hd) {
6153 bdrv_attach_aio_context(bs->backing_hd, new_context);
6154 }
6155 if (bs->file) {
6156 bdrv_attach_aio_context(bs->file, new_context);
6157 }
6158 if (bs->drv->bdrv_attach_aio_context) {
6159 bs->drv->bdrv_attach_aio_context(bs, new_context);
6160 }
6161 if (bs->io_limits_enabled) {
6162 throttle_attach_aio_context(&bs->throttle_state, new_context);
6163 }
6164
6165 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
6166 ban->attached_aio_context(new_context, ban->opaque);
6167 }
6168 }
6169
6170 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
6171 {
6172 bdrv_drain_all(); /* ensure there are no in-flight requests */
6173
6174 bdrv_detach_aio_context(bs);
6175
6176 /* This function executes in the old AioContext so acquire the new one in
6177 * case it runs in a different thread.
6178 */
6179 aio_context_acquire(new_context);
6180 bdrv_attach_aio_context(bs, new_context);
6181 aio_context_release(new_context);
6182 }
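
/* A minimal sketch (hypothetical; assumes "sysemu/iothread.h" is
 * available to the caller): handing a BDS over to an I/O thread. This
 * must run in the main loop with no other users of bs active, because
 * bdrv_set_aio_context() only drains requests that are already in
 * flight. */
static void example_move_to_iothread(BlockDriverState *bs, IOThread *iothread)
{
    bdrv_set_aio_context(bs, iothread_get_aio_context(iothread));
}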
6183
6184 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
6185 void (*attached_aio_context)(AioContext *new_context, void *opaque),
6186 void (*detach_aio_context)(void *opaque), void *opaque)
6187 {
6188 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
6189 *ban = (BdrvAioNotifier){
6190 .attached_aio_context = attached_aio_context,
6191 .detach_aio_context = detach_aio_context,
6192 .opaque = opaque
6193 };
6194
6195 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
6196 }
6197
6198 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
6199 void (*attached_aio_context)(AioContext *,
6200 void *),
6201 void (*detach_aio_context)(void *),
6202 void *opaque)
6203 {
6204 BdrvAioNotifier *ban, *ban_next;
6205
6206 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
6207 if (ban->attached_aio_context == attached_aio_context &&
6208 ban->detach_aio_context == detach_aio_context &&
6209 ban->opaque == opaque)
6210 {
6211 QLIST_REMOVE(ban, list);
6212 g_free(ban);
6213
6214 return;
6215 }
6216 }
6217
6218 abort();
6219 }
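
/* A minimal notifier sketch (hypothetical example_* names): a user that
 * caches per-context resources, e.g. a QEMUBH, would recreate them
 * whenever the BDS changes AioContext. Removal must pass exactly the
 * same two function pointers and opaque value as registration did,
 * otherwise the abort() above fires. */
static void example_attached_aio_context(AioContext *new_context, void *opaque)
{
    /* recreate a bottom half (or similar) in new_context for 'opaque' */
}

static void example_detach_aio_context(void *opaque)
{
    /* tear down whatever the attach callback created */
}

static void example_register_notifier(BlockDriverState *bs, void *state)
{
    bdrv_add_aio_context_notifier(bs, example_attached_aio_context,
                                  example_detach_aio_context, state);
}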
6220
6221 void bdrv_add_before_write_notifier(BlockDriverState *bs,
6222 NotifierWithReturn *notifier)
6223 {
6224 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
6225 }
6226
6227 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
6228 BlockDriverAmendStatusCB *status_cb)
6229 {
6230 if (!bs->drv->bdrv_amend_options) {
6231 return -ENOTSUP;
6232 }
6233 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
6234 }
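
/* For illustration (hypothetical command line): "qemu-img amend -o
 * compat=0.10 test.qcow2" ends up here with opts parsed against the
 * format driver's create_opts, and the qcow2 driver then rewrites the
 * image header to downgrade its version. */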
6235
6236 /* This function is called by the bdrv_recurse_is_first_non_filter method
6237  * of block filters and by bdrv_is_first_non_filter.
6238  * It tests whether the given bs is the candidate, or else recurses further
6239  * down the node graph.
6240  */
6241 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
6242 BlockDriverState *candidate)
6243 {
6244     /* return false if basic checks fail */
6245 if (!bs || !bs->drv) {
6246 return false;
6247 }
6248
6249     /* the code reached a non-filter driver -> check whether this bs is
6250      * the candidate. This is the recursion's termination condition.
6251      */
6252 if (!bs->drv->is_filter) {
6253 return bs == candidate;
6254 }
6255 /* Down this path the driver is a block filter driver */
6256
6257     /* If the block filter recursion method is defined, use it to recurse down
6258      * the node graph.
6259      */
6260 if (bs->drv->bdrv_recurse_is_first_non_filter) {
6261 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
6262 }
6263
6264     /* the driver is a block filter but does not allow recursion -> return
6265      * false */
6266 return false;
6267 }
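
/* A minimal sketch (hypothetical) of the hook a single-child block
 * filter would install as its .bdrv_recurse_is_first_non_filter
 * callback: it simply passes the query on to its child, so the
 * recursion above continues below the filter. */
static bool example_filter_recurse_is_first_non_filter(
        BlockDriverState *bs, BlockDriverState *candidate)
{
    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
}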
6268
6269 /* This function checks whether the candidate is the first non-filter bs down
6270  * its bs chain. Since we don't have pointers to parents, it explores all bs
6271  * chains from the top. Some filters can choose not to pass down the recursion.
6272  */
6273 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
6274 {
6275 BlockDriverState *bs;
6276
6277 /* walk down the bs forest recursively */
6278 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
6279 bool perm;
6280
6281 /* try to recurse in this top level bs */
6282 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
6283
6284 /* candidate is the first non filter */
6285 if (perm) {
6286 return true;
6287 }
6288 }
6289
6290 return false;
6291 }
6292
6293 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
6294 {
6295 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
6296 AioContext *aio_context;
6297
6298 if (!to_replace_bs) {
6299 error_setg(errp, "Node name '%s' not found", node_name);
6300 return NULL;
6301 }
6302
6303 aio_context = bdrv_get_aio_context(to_replace_bs);
6304 aio_context_acquire(aio_context);
6305
6306 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
6307 to_replace_bs = NULL;
6308 goto out;
6309 }
6310
6311     /* We don't want an arbitrary node of the BDS chain to be replaced, only
6312      * the topmost non-filter, in order to prevent data corruption.
6313      * Another benefit is that this test excludes backing files, which are
6314      * blocked by the backing blockers.
6315      */
6316 if (!bdrv_is_first_non_filter(to_replace_bs)) {
6317 error_setg(errp, "Only top most non filter can be replaced");
6318 to_replace_bs = NULL;
6319 goto out;
6320 }
6321
6322 out:
6323 aio_context_release(aio_context);
6324 return to_replace_bs;
6325 }
6326
6327 void bdrv_io_plug(BlockDriverState *bs)
6328 {
6329 BlockDriver *drv = bs->drv;
6330 if (drv && drv->bdrv_io_plug) {
6331 drv->bdrv_io_plug(bs);
6332 } else if (bs->file) {
6333 bdrv_io_plug(bs->file);
6334 }
6335 }
6336
6337 void bdrv_io_unplug(BlockDriverState *bs)
6338 {
6339 BlockDriver *drv = bs->drv;
6340 if (drv && drv->bdrv_io_unplug) {
6341 drv->bdrv_io_unplug(bs);
6342 } else if (bs->file) {
6343 bdrv_io_unplug(bs->file);
6344 }
6345 }
6346
6347 void bdrv_flush_io_queue(BlockDriverState *bs)
6348 {
6349 BlockDriver *drv = bs->drv;
6350 if (drv && drv->bdrv_flush_io_queue) {
6351 drv->bdrv_flush_io_queue(bs);
6352 } else if (bs->file) {
6353 bdrv_flush_io_queue(bs->file);
6354 }
6355 }
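
/* A minimal batching sketch (hypothetical; the request arrays and the
 * example_* name are assumptions): plugging around a burst of
 * submissions lets drivers with a plug implementation (e.g. linux-aio)
 * queue the requests and issue them with a single syscall on unplug. */
static void example_submit_batch(BlockDriverState *bs, int64_t *sectors,
                                 QEMUIOVector *qiovs, int *nb_sectors,
                                 int num_reqs, BlockCompletionFunc *cb,
                                 void *opaque)
{
    int i;

    bdrv_io_plug(bs);
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, sectors[i], &qiovs[i], nb_sectors[i], cb, opaque);
    }
    bdrv_io_unplug(bs);
}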
6356
6357 static bool append_open_options(QDict *d, BlockDriverState *bs)
6358 {
6359 const QDictEntry *entry;
6360 bool found_any = false;
6361
6362 for (entry = qdict_first(bs->options); entry;
6363 entry = qdict_next(bs->options, entry))
6364 {
6365 /* Only take options for this level and exclude all non-driver-specific
6366 * options */
6367 if (!strchr(qdict_entry_key(entry), '.') &&
6368 strcmp(qdict_entry_key(entry), "node-name"))
6369 {
6370 qobject_incref(qdict_entry_value(entry));
6371 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
6372 found_any = true;
6373 }
6374 }
6375
6376 return found_any;
6377 }
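
/* For illustration (hypothetical option set): given a bs->options of
 *   { "node-name": "n0", "lazy-refcounts": "on", "file.filename": "t.img" },
 * only "lazy-refcounts" is copied into d: "node-name" is skipped
 * explicitly, and the dotted "file.filename" key belongs to the child
 * rather than to this level. */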
6378
6379 /* Updates the following BDS fields:
6380 * - exact_filename: A filename which may be used for opening a block device
6381 * which (mostly) equals the given BDS (even without any
6382 * other options; so reading and writing must return the same
6383 * results, but caching etc. may be different)
6384 * - full_open_options: Options which, when given when opening a block device
6385 * (without a filename), result in a BDS (mostly)
6386 * equalling the given one
6387 * - filename: If exact_filename is set, it is copied here. Otherwise,
6388 * full_open_options is converted to a JSON object, prefixed with
6389 * "json:" (for use through the JSON pseudo protocol) and put here.
6390 */
6391 void bdrv_refresh_filename(BlockDriverState *bs)
6392 {
6393 BlockDriver *drv = bs->drv;
6394 QDict *opts;
6395
6396 if (!drv) {
6397 return;
6398 }
6399
6400 /* This BDS's file name will most probably depend on its file's name, so
6401 * refresh that first */
6402 if (bs->file) {
6403 bdrv_refresh_filename(bs->file);
6404 }
6405
6406 if (drv->bdrv_refresh_filename) {
6407 /* Obsolete information is of no use here, so drop the old file name
6408 * information before refreshing it */
6409 bs->exact_filename[0] = '\0';
6410 if (bs->full_open_options) {
6411 QDECREF(bs->full_open_options);
6412 bs->full_open_options = NULL;
6413 }
6414
6415 drv->bdrv_refresh_filename(bs);
6416 } else if (bs->file) {
6417 /* Try to reconstruct valid information from the underlying file */
6418 bool has_open_options;
6419
6420 bs->exact_filename[0] = '\0';
6421 if (bs->full_open_options) {
6422 QDECREF(bs->full_open_options);
6423 bs->full_open_options = NULL;
6424 }
6425
6426 opts = qdict_new();
6427 has_open_options = append_open_options(opts, bs);
6428
6429 /* If no specific options have been given for this BDS, the filename of
6430 * the underlying file should suffice for this one as well */
6431 if (bs->file->exact_filename[0] && !has_open_options) {
6432 strcpy(bs->exact_filename, bs->file->exact_filename);
6433 }
6434 /* Reconstructing the full options QDict is simple for most format block
6435 * drivers, as long as the full options are known for the underlying
6436 * file BDS. The full options QDict of that file BDS should somehow
6437 * contain a representation of the filename, therefore the following
6438 * suffices without querying the (exact_)filename of this BDS. */
6439 if (bs->file->full_open_options) {
6440 qdict_put_obj(opts, "driver",
6441 QOBJECT(qstring_from_str(drv->format_name)));
6442 QINCREF(bs->file->full_open_options);
6443 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6444
6445 bs->full_open_options = opts;
6446 } else {
6447 QDECREF(opts);
6448 }
6449 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6450 /* There is no underlying file BDS (at least referenced by BDS.file),
6451 * so the full options QDict should be equal to the options given
6452 * specifically for this block device when it was opened (plus the
6453 * driver specification).
6454 * Because those options don't change, there is no need to update
6455 * full_open_options when it's already set. */
6456
6457 opts = qdict_new();
6458 append_open_options(opts, bs);
6459 qdict_put_obj(opts, "driver",
6460 QOBJECT(qstring_from_str(drv->format_name)));
6461
6462 if (bs->exact_filename[0]) {
6463 /* This may not work for all block protocol drivers (some may
6464 * require this filename to be parsed), but we have to find some
6465 * default solution here, so just include it. If some block driver
6466 * does not support pure options without any filename at all or
6467 * needs some special format of the options QDict, it needs to
6468 * implement the driver-specific bdrv_refresh_filename() function.
6469 */
6470 qdict_put_obj(opts, "filename",
6471 QOBJECT(qstring_from_str(bs->exact_filename)));
6472 }
6473
6474 bs->full_open_options = opts;
6475 }
6476
6477 if (bs->exact_filename[0]) {
6478 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6479 } else if (bs->full_open_options) {
6480 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6481 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6482 qstring_get_str(json));
6483 QDECREF(json);
6484 }
6485 }
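
/* For illustration (hypothetical values): a qcow2 BDS whose file cannot
 * be described by a plain filename might end up with
 *   json:{"driver": "qcow2", "file": {"driver": "nbd", "host": "h",
 *         "port": "10809"}}
 * in bs->filename; the "json:" pseudo protocol lets this string be fed
 * back into bdrv_open(). */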
6486
6487 /* The purpose of this accessor function is to allow the device models to
6488  * access the BlockAcctStats structure embedded inside a BlockDriverState
6489  * without being aware of the BlockDriverState structure layout.
6490  * It will go away once the BlockAcctStats structure is moved inside
6491  * the device models.
6492  */
6493 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6494 {
6495 return &bs->stats;
6496 }