BlockLimits: introduce max_transfer_length
[mirror_qemu.git] / block.c
1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/block-backend.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 #include "qapi-event.h"
39
40 #ifdef CONFIG_BSD
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <sys/ioctl.h>
44 #include <sys/queue.h>
45 #ifndef __DragonFly__
46 #include <sys/disk.h>
47 #endif
48 #endif
49
50 #ifdef _WIN32
51 #include <windows.h>
52 #endif
53
54 struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
57 };
58
59 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60
61 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63 BlockCompletionFunc *cb, void *opaque);
64 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66 BlockCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75 BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78 BdrvRequestFlags flags);
79 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
84 BlockCompletionFunc *cb,
85 void *opaque,
86 bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
93
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
99
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
102
103 #ifdef _WIN32
104 static int is_windows_drive_prefix(const char *filename)
105 {
106 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108 filename[1] == ':');
109 }
110
111 int is_windows_drive(const char *filename)
112 {
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120 }
121 #endif
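/*
 * Worked examples (illustrative, not part of the original file):
 *   is_windows_drive("c:")                    -> 1  (bare drive letter)
 *   is_windows_drive("c:\\disk.img")          -> 0  (prefix, but not bare)
 *   is_windows_drive("\\\\.\\PhysicalDrive0") -> 1  (device namespace)
 *   is_windows_drive_prefix("c:\\disk.img")   -> 1
 */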
122
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
126 {
127 int i;
128
129 throttle_config(&bs->throttle_state, cfg);
130
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
133 }
134 }
135
136 /* this function drains all the throttled I/Os */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138 {
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
142
143 bs->io_limits_enabled = false;
144
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
148 }
149 }
150
151 bs->io_limits_enabled = enabled;
152
153 return drained;
154 }
155
156 void bdrv_io_limits_disable(BlockDriverState *bs)
157 {
158 bs->io_limits_enabled = false;
159
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
163 }
164
165 static void bdrv_throttle_read_timer_cb(void *opaque)
166 {
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
169 }
170
171 static void bdrv_throttle_write_timer_cb(void *opaque)
172 {
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
175 }
176
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
179 {
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 bdrv_get_aio_context(bs),
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
188 }
189
190 /* This function makes an I/O request wait if needed
191 *
192 * @bytes: the size of the I/O in bytes
193 * @is_write: whether the I/O is a write
194 */
195 static void bdrv_io_limits_intercept(BlockDriverState *bs,
196 unsigned int bytes,
197 bool is_write)
198 {
199 /* does this I/O have to wait? */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
201
202 /* if it must wait, or any request of this type is already queued, queue the I/O */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
206 }
207
208 /* the I/O will be executed, do the accounting */
209 throttle_account(&bs->throttle_state, is_write, bytes);
210
211
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
215 }
216
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
219 }
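/*
 * Illustrative sketch (not part of the original file): the shape of a
 * throttled request path. bdrv_co_do_preadv()/bdrv_co_do_pwritev() perform
 * essentially this check before issuing the driver request.
 */
static void coroutine_fn example_throttled_request(BlockDriverState *bs,
                                                   unsigned int bytes,
                                                   bool is_write)
{
    if (bs->io_limits_enabled) {
        /* may yield here until a throttle timer fires or the queue drains */
        bdrv_io_limits_intercept(bs, bytes, is_write);
    }
    /* ... the actual driver request would be issued at this point ... */
}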
220
221 size_t bdrv_opt_mem_align(BlockDriverState *bs)
222 {
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
226 }
227
228 return bs->bl.opt_mem_alignment;
229 }
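/*
 * Illustrative sketch (not part of the original file): allocating a buffer
 * that honours this alignment; the block layer's qemu_blockalign() helper
 * is implemented in exactly these terms.
 */
static void *example_alloc_aligned_buf(BlockDriverState *bs, size_t size)
{
    /* qemu_memalign() aborts on allocation failure, so no NULL check */
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}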
230
231 /* check if the path starts with "<protocol>:" */
232 static int path_has_protocol(const char *path)
233 {
234 const char *p;
235
236 #ifdef _WIN32
237 if (is_windows_drive(path) ||
238 is_windows_drive_prefix(path)) {
239 return 0;
240 }
241 p = path + strcspn(path, ":/\\");
242 #else
243 p = path + strcspn(path, ":/");
244 #endif
245
246 return *p == ':';
247 }
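/*
 * Worked examples (illustrative, not part of the original file):
 *   path_has_protocol("nbd:unix:/tmp/nbd.sock") -> 1 (':' seen before '/')
 *   path_has_protocol("/images/a:b.qcow2")      -> 0 ('/' comes first)
 *   path_has_protocol("c:\\disk.img")           -> 0 on Windows (drive prefix)
 */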
248
249 int path_is_absolute(const char *path)
250 {
251 #ifdef _WIN32
252 /* specific case for names like: "\\.\d:" */
253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
254 return 1;
255 }
256 return (*path == '/' || *path == '\\');
257 #else
258 return (*path == '/');
259 #endif
260 }
261
262 /* If filename is absolute, just copy it to dest. Otherwise, build a
263    path to it by considering it relative to base_path. URLs are
264    supported. */
265 void path_combine(char *dest, int dest_size,
266 const char *base_path,
267 const char *filename)
268 {
269 const char *p, *p1;
270 int len;
271
272 if (dest_size <= 0)
273 return;
274 if (path_is_absolute(filename)) {
275 pstrcpy(dest, dest_size, filename);
276 } else {
277 p = strchr(base_path, ':');
278 if (p)
279 p++;
280 else
281 p = base_path;
282 p1 = strrchr(base_path, '/');
283 #ifdef _WIN32
284 {
285 const char *p2;
286 p2 = strrchr(base_path, '\\');
287 if (!p1 || p2 > p1)
288 p1 = p2;
289 }
290 #endif
291 if (p1)
292 p1++;
293 else
294 p1 = base_path;
295 if (p1 > p)
296 p = p1;
297 len = p - base_path;
298 if (len > dest_size - 1)
299 len = dest_size - 1;
300 memcpy(dest, base_path, len);
301 dest[len] = '\0';
302 pstrcat(dest, dest_size, filename);
303 }
304 }
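/*
 * Worked examples for path_combine() (illustrative, not part of the
 * original file):
 *   base_path = "/img/base.qcow2", filename = "backing.qcow2"
 *       -> dest = "/img/backing.qcow2"   (directory of base_path is kept)
 *   base_path = "/img/base.qcow2", filename = "/abs/backing.qcow2"
 *       -> dest = "/abs/backing.qcow2"   (absolute names are copied as-is)
 */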
305
306 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
307 {
308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
309 pstrcpy(dest, sz, bs->backing_file);
310 } else {
311 path_combine(dest, sz, bs->filename, bs->backing_file);
312 }
313 }
314
315 void bdrv_register(BlockDriver *bdrv)
316 {
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
321
322 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
324 */
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
329 }
330 }
331
332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
333 }
334
335 BlockDriverState *bdrv_new_root(void)
336 {
337 BlockDriverState *bs = bdrv_new();
338
339 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
340 return bs;
341 }
342
343 BlockDriverState *bdrv_new(void)
344 {
345 BlockDriverState *bs;
346 int i;
347
348 bs = g_new0(BlockDriverState, 1);
349 QLIST_INIT(&bs->dirty_bitmaps);
350 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
351 QLIST_INIT(&bs->op_blockers[i]);
352 }
353 bdrv_iostatus_disable(bs);
354 notifier_list_init(&bs->close_notifiers);
355 notifier_with_return_list_init(&bs->before_write_notifiers);
356 qemu_co_queue_init(&bs->throttled_reqs[0]);
357 qemu_co_queue_init(&bs->throttled_reqs[1]);
358 bs->refcnt = 1;
359 bs->aio_context = qemu_get_aio_context();
360
361 return bs;
362 }
363
364 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
365 {
366 notifier_list_add(&bs->close_notifiers, notify);
367 }
368
369 BlockDriver *bdrv_find_format(const char *format_name)
370 {
371 BlockDriver *drv1;
372 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
373 if (!strcmp(drv1->format_name, format_name)) {
374 return drv1;
375 }
376 }
377 return NULL;
378 }
379
380 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
381 {
382 static const char *whitelist_rw[] = {
383 CONFIG_BDRV_RW_WHITELIST
384 };
385 static const char *whitelist_ro[] = {
386 CONFIG_BDRV_RO_WHITELIST
387 };
388 const char **p;
389
390 if (!whitelist_rw[0] && !whitelist_ro[0]) {
391 return 1; /* no whitelist, anything goes */
392 }
393
394 for (p = whitelist_rw; *p; p++) {
395 if (!strcmp(drv->format_name, *p)) {
396 return 1;
397 }
398 }
399 if (read_only) {
400 for (p = whitelist_ro; *p; p++) {
401 if (!strcmp(drv->format_name, *p)) {
402 return 1;
403 }
404 }
405 }
406 return 0;
407 }
408
409 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
410 bool read_only)
411 {
412 BlockDriver *drv = bdrv_find_format(format_name);
413 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
414 }
415
416 typedef struct CreateCo {
417 BlockDriver *drv;
418 char *filename;
419 QemuOpts *opts;
420 int ret;
421 Error *err;
422 } CreateCo;
423
424 static void coroutine_fn bdrv_create_co_entry(void *opaque)
425 {
426 Error *local_err = NULL;
427 int ret;
428
429 CreateCo *cco = opaque;
430 assert(cco->drv);
431
432 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
433 if (local_err) {
434 error_propagate(&cco->err, local_err);
435 }
436 cco->ret = ret;
437 }
438
439 int bdrv_create(BlockDriver *drv, const char* filename,
440 QemuOpts *opts, Error **errp)
441 {
442 int ret;
443
444 Coroutine *co;
445 CreateCo cco = {
446 .drv = drv,
447 .filename = g_strdup(filename),
448 .opts = opts,
449 .ret = NOT_DONE,
450 .err = NULL,
451 };
452
453 if (!drv->bdrv_create) {
454 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
455 ret = -ENOTSUP;
456 goto out;
457 }
458
459 if (qemu_in_coroutine()) {
460 /* Fast-path if already in coroutine context */
461 bdrv_create_co_entry(&cco);
462 } else {
463 co = qemu_coroutine_create(bdrv_create_co_entry);
464 qemu_coroutine_enter(co, &cco);
465 while (cco.ret == NOT_DONE) {
466 aio_poll(qemu_get_aio_context(), true);
467 }
468 }
469
470 ret = cco.ret;
471 if (ret < 0) {
472 if (cco.err) {
473 error_propagate(errp, cco.err);
474 } else {
475 error_setg_errno(errp, -ret, "Could not create image");
476 }
477 }
478
479 out:
480 g_free(cco.filename);
481 return ret;
482 }
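/*
 * Illustrative sketch (not part of the original file): creating a 1 GiB
 * image through a specific driver, following the same pattern that
 * bdrv_append_temp_snapshot() below uses for its qcow2 overlay.
 */
static int example_create_image(const char *filename, Error **errp)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QemuOpts *opts;
    int ret;

    if (!drv) {
        error_setg(errp, "qcow2 driver not available");
        return -ENOENT;
    }
    opts = qemu_opts_create(drv->create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
    ret = bdrv_create(drv, filename, opts, errp);
    qemu_opts_del(opts);
    return ret;
}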
483
484 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
485 {
486 BlockDriver *drv;
487 Error *local_err = NULL;
488 int ret;
489
490 drv = bdrv_find_protocol(filename, true);
491 if (drv == NULL) {
492 error_setg(errp, "Could not find protocol for file '%s'", filename);
493 return -ENOENT;
494 }
495
496 ret = bdrv_create(drv, filename, opts, &local_err);
497 if (local_err) {
498 error_propagate(errp, local_err);
499 }
500 return ret;
501 }
502
503 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
504 {
505 BlockDriver *drv = bs->drv;
506 Error *local_err = NULL;
507
508 memset(&bs->bl, 0, sizeof(bs->bl));
509
510 if (!drv) {
511 return;
512 }
513
514 /* Take some limits from the children as a default */
515 if (bs->file) {
516 bdrv_refresh_limits(bs->file, &local_err);
517 if (local_err) {
518 error_propagate(errp, local_err);
519 return;
520 }
521 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
522 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
523 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
524 } else {
525 bs->bl.opt_mem_alignment = 512;
526 }
527
528 if (bs->backing_hd) {
529 bdrv_refresh_limits(bs->backing_hd, &local_err);
530 if (local_err) {
531 error_propagate(errp, local_err);
532 return;
533 }
534 bs->bl.opt_transfer_length =
535 MAX(bs->bl.opt_transfer_length,
536 bs->backing_hd->bl.opt_transfer_length);
537 bs->bl.max_transfer_length =
538 MIN_NON_ZERO(bs->bl.max_transfer_length,
539 bs->backing_hd->bl.max_transfer_length);
540 bs->bl.opt_mem_alignment =
541 MAX(bs->bl.opt_mem_alignment,
542 bs->backing_hd->bl.opt_mem_alignment);
543 }
544
545 /* Then let the driver override it */
546 if (drv->bdrv_refresh_limits) {
547 drv->bdrv_refresh_limits(bs, errp);
548 }
549 }
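/*
 * Worked example (illustrative, not part of the original file): a qcow2
 * image with both bs->file and a backing file. If
 * bs->file->bl.max_transfer_length is 8192 sectors and the backing file
 * reports 4096, the code above first inherits 8192 and then
 * MIN_NON_ZERO(8192, 4096) = 4096: the most restrictive non-zero maximum
 * wins, while 0 keeps meaning "no limit". opt_transfer_length and
 * opt_mem_alignment instead take the MAX across the children.
 */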
550
551 /*
552 * Create a uniquely-named empty temporary file.
553 * Return 0 upon success, otherwise a negative errno value.
554 */
555 int get_tmp_filename(char *filename, int size)
556 {
557 #ifdef _WIN32
558 char temp_dir[MAX_PATH];
559 /* GetTempFileName requires that its output buffer (4th param)
560 have length MAX_PATH or greater. */
561 assert(size >= MAX_PATH);
562 return (GetTempPath(MAX_PATH, temp_dir)
563 && GetTempFileName(temp_dir, "qem", 0, filename)
564 ? 0 : -GetLastError());
565 #else
566 int fd;
567 const char *tmpdir;
568 tmpdir = getenv("TMPDIR");
569 if (!tmpdir) {
570 tmpdir = "/var/tmp";
571 }
572 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
573 return -EOVERFLOW;
574 }
575 fd = mkstemp(filename);
576 if (fd < 0) {
577 return -errno;
578 }
579 if (close(fd) != 0) {
580 unlink(filename);
581 return -errno;
582 }
583 return 0;
584 #endif
585 }
586
587 /*
588 * Detect host devices. By convention, /dev/cdrom[N] is always
589 * recognized as a host CDROM.
590 */
591 static BlockDriver *find_hdev_driver(const char *filename)
592 {
593 int score_max = 0, score;
594 BlockDriver *drv = NULL, *d;
595
596 QLIST_FOREACH(d, &bdrv_drivers, list) {
597 if (d->bdrv_probe_device) {
598 score = d->bdrv_probe_device(filename);
599 if (score > score_max) {
600 score_max = score;
601 drv = d;
602 }
603 }
604 }
605
606 return drv;
607 }
608
609 BlockDriver *bdrv_find_protocol(const char *filename,
610 bool allow_protocol_prefix)
611 {
612 BlockDriver *drv1;
613 char protocol[128];
614 int len;
615 const char *p;
616
617 /* TODO Drivers without bdrv_file_open must be specified explicitly */
618
619 /*
620 * XXX(hch): we really should not let host device detection
621 * override an explicit protocol specification, but moving this
622 * later breaks access to device names with colons in them.
623 * Thanks to the brain-dead persistent naming schemes on udev-
624 * based Linux systems those actually are quite common.
625 */
626 drv1 = find_hdev_driver(filename);
627 if (drv1) {
628 return drv1;
629 }
630
631 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
632 return bdrv_find_format("file");
633 }
634
635 p = strchr(filename, ':');
636 assert(p != NULL);
637 len = p - filename;
638 if (len > sizeof(protocol) - 1)
639 len = sizeof(protocol) - 1;
640 memcpy(protocol, filename, len);
641 protocol[len] = '\0';
642 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
643 if (drv1->protocol_name &&
644 !strcmp(drv1->protocol_name, protocol)) {
645 return drv1;
646 }
647 }
648 return NULL;
649 }
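/*
 * Worked examples (illustrative, not part of the original file):
 *   bdrv_find_protocol("nbd:localhost:10809", true) -> the "nbd" driver
 *   bdrv_find_protocol("disk.img", true)            -> the "file" driver
 *                                             (no "<protocol>:" prefix)
 *   bdrv_find_protocol("/dev/cdrom", true)          -> a host device driver,
 *                             since device probing wins (see the XXX above)
 */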
650
651 static int find_image_format(BlockDriverState *bs, const char *filename,
652 BlockDriver **pdrv, Error **errp)
653 {
654 int score, score_max;
655 BlockDriver *drv1, *drv;
656 uint8_t buf[2048];
657 int ret = 0;
658
659 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
660 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
661 drv = bdrv_find_format("raw");
662 if (!drv) {
663 error_setg(errp, "Could not find raw image format");
664 ret = -ENOENT;
665 }
666 *pdrv = drv;
667 return ret;
668 }
669
670 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
671 if (ret < 0) {
672 error_setg_errno(errp, -ret, "Could not read image for determining its "
673 "format");
674 *pdrv = NULL;
675 return ret;
676 }
677
678 score_max = 0;
679 drv = NULL;
680 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
681 if (drv1->bdrv_probe) {
682 score = drv1->bdrv_probe(buf, ret, filename);
683 if (score > score_max) {
684 score_max = score;
685 drv = drv1;
686 }
687 }
688 }
689 if (!drv) {
690 error_setg(errp, "Could not determine image format: No compatible "
691 "driver found");
692 ret = -ENOENT;
693 }
694 *pdrv = drv;
695 return ret;
696 }
697
698 /**
699 * Set the current 'total_sectors' value.
700 * Return 0 on success, -errno on error.
701 */
702 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
703 {
704 BlockDriver *drv = bs->drv;
705
706 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
707 if (bs->sg)
708 return 0;
709
710 /* query actual device if possible, otherwise just trust the hint */
711 if (drv->bdrv_getlength) {
712 int64_t length = drv->bdrv_getlength(bs);
713 if (length < 0) {
714 return length;
715 }
716 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
717 }
718
719 bs->total_sectors = hint;
720 return 0;
721 }
722
723 /**
724 * Set open flags for a given discard mode
725 *
726 * Return 0 on success, -1 if the discard mode was invalid.
727 */
728 int bdrv_parse_discard_flags(const char *mode, int *flags)
729 {
730 *flags &= ~BDRV_O_UNMAP;
731
732 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
733 /* do nothing */
734 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
735 *flags |= BDRV_O_UNMAP;
736 } else {
737 return -1;
738 }
739
740 return 0;
741 }
742
743 /**
744 * Set open flags for a given cache mode
745 *
746 * Return 0 on success, -1 if the cache mode was invalid.
747 */
748 int bdrv_parse_cache_flags(const char *mode, int *flags)
749 {
750 *flags &= ~BDRV_O_CACHE_MASK;
751
752 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
753 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
754 } else if (!strcmp(mode, "directsync")) {
755 *flags |= BDRV_O_NOCACHE;
756 } else if (!strcmp(mode, "writeback")) {
757 *flags |= BDRV_O_CACHE_WB;
758 } else if (!strcmp(mode, "unsafe")) {
759 *flags |= BDRV_O_CACHE_WB;
760 *flags |= BDRV_O_NO_FLUSH;
761 } else if (!strcmp(mode, "writethrough")) {
762 /* this is the default */
763 } else {
764 return -1;
765 }
766
767 return 0;
768 }
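/*
 * Illustrative sketch (not part of the original file): how a drive
 * front end would combine both parsers above on user-supplied modes.
 */
static int example_parse_drive_modes(const char *cache, const char *discard,
                                     int *flags)
{
    if (bdrv_parse_cache_flags(cache, flags) != 0) {
        return -EINVAL; /* e.g. cache="fast" is not a recognized mode */
    }
    if (bdrv_parse_discard_flags(discard, flags) != 0) {
        return -EINVAL; /* only off/ignore and on/unmap are accepted */
    }
    return 0;
}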
769
770 /**
771 * The copy-on-read flag is actually a reference count so multiple users may
772 * use the feature without worrying about clobbering its previous state.
773 * Copy-on-read stays enabled until all users have called to disable it.
774 */
775 void bdrv_enable_copy_on_read(BlockDriverState *bs)
776 {
777 bs->copy_on_read++;
778 }
779
780 void bdrv_disable_copy_on_read(BlockDriverState *bs)
781 {
782 assert(bs->copy_on_read > 0);
783 bs->copy_on_read--;
784 }
785
786 /*
787 * Returns the flags that a temporary snapshot should get, based on the
788 * originally requested flags (the originally requested image will have flags
789 * like a backing file)
790 */
791 static int bdrv_temp_snapshot_flags(int flags)
792 {
793 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
794 }
795
796 /*
797 * Returns the flags that bs->file should get, based on the given flags for
798 * the parent BDS
799 */
800 static int bdrv_inherited_flags(int flags)
801 {
802 /* Enable protocol handling, disable format probing for bs->file */
803 flags |= BDRV_O_PROTOCOL;
804
805 /* Our block drivers take care to send flushes and respect unmap policy,
806 * so we can enable both unconditionally on lower layers. */
807 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
808
809 /* Clear flags that only apply to the top layer */
810 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
811
812 return flags;
813 }
814
815 /*
816 * Returns the flags that bs->backing_hd should get, based on the given flags
817 * for the parent BDS
818 */
819 static int bdrv_backing_flags(int flags)
820 {
821 /* backing files are always opened read-only */
822 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
823
824 /* snapshot=on is handled on the top layer */
825 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
826
827 return flags;
828 }
829
830 static int bdrv_open_flags(BlockDriverState *bs, int flags)
831 {
832 int open_flags = flags | BDRV_O_CACHE_WB;
833
834 /*
835 * Clear flags that are internal to the block layer before opening the
836 * image.
837 */
838 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
839
840 /*
841 * Snapshots should be writable.
842 */
843 if (flags & BDRV_O_TEMPORARY) {
844 open_flags |= BDRV_O_RDWR;
845 }
846
847 return open_flags;
848 }
849
850 static void bdrv_assign_node_name(BlockDriverState *bs,
851 const char *node_name,
852 Error **errp)
853 {
854 if (!node_name) {
855 return;
856 }
857
858 /* Check for empty string or invalid characters */
859 if (!id_wellformed(node_name)) {
860 error_setg(errp, "Invalid node name");
861 return;
862 }
863
864 /* takes care of avoiding namespace collisions */
865 if (blk_by_name(node_name)) {
866 error_setg(errp, "node-name=%s is conflicting with a device id",
867 node_name);
868 return;
869 }
870
871 /* takes care of avoiding duplicate node names */
872 if (bdrv_find_node(node_name)) {
873 error_setg(errp, "Duplicate node name");
874 return;
875 }
876
877 /* copy node name into the bs and insert it into the graph list */
878 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
879 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
880 }
881
882 /*
883 * Common part for opening disk images and files
884 *
885 * Removes all processed options from *options.
886 */
887 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
888 QDict *options, int flags, BlockDriver *drv, Error **errp)
889 {
890 int ret, open_flags;
891 const char *filename;
892 const char *node_name = NULL;
893 Error *local_err = NULL;
894
895 assert(drv != NULL);
896 assert(bs->file == NULL);
897 assert(options != NULL && bs->options != options);
898
899 if (file != NULL) {
900 filename = file->filename;
901 } else {
902 filename = qdict_get_try_str(options, "filename");
903 }
904
905 if (drv->bdrv_needs_filename && !filename) {
906 error_setg(errp, "The '%s' block driver requires a file name",
907 drv->format_name);
908 return -EINVAL;
909 }
910
911 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
912
913 node_name = qdict_get_try_str(options, "node-name");
914 bdrv_assign_node_name(bs, node_name, &local_err);
915 if (local_err) {
916 error_propagate(errp, local_err);
917 return -EINVAL;
918 }
919 qdict_del(options, "node-name");
920
921 /* bdrv_open() was called directly with a protocol driver as drv. That layer
922 * is already opened, so assign it to bs (while file becomes a closed
923 * BlockDriverState) and return immediately. */
924 if (file != NULL && drv->bdrv_file_open) {
925 bdrv_swap(file, bs);
926 return 0;
927 }
928
929 bs->open_flags = flags;
930 bs->guest_block_size = 512;
931 bs->request_alignment = 512;
932 bs->zero_beyond_eof = true;
933 open_flags = bdrv_open_flags(bs, flags);
934 bs->read_only = !(open_flags & BDRV_O_RDWR);
935 bs->growable = !!(flags & BDRV_O_PROTOCOL);
936
937 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
938 error_setg(errp,
939 !bs->read_only && bdrv_is_whitelisted(drv, true)
940 ? "Driver '%s' can only be used for read-only devices"
941 : "Driver '%s' is not whitelisted",
942 drv->format_name);
943 return -ENOTSUP;
944 }
945
946 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
947 if (flags & BDRV_O_COPY_ON_READ) {
948 if (!bs->read_only) {
949 bdrv_enable_copy_on_read(bs);
950 } else {
951 error_setg(errp, "Can't use copy-on-read on read-only device");
952 return -EINVAL;
953 }
954 }
955
956 if (filename != NULL) {
957 pstrcpy(bs->filename, sizeof(bs->filename), filename);
958 } else {
959 bs->filename[0] = '\0';
960 }
961 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
962
963 bs->drv = drv;
964 bs->opaque = g_malloc0(drv->instance_size);
965
966 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
967
968 /* Open the image, either directly or using a protocol */
969 if (drv->bdrv_file_open) {
970 assert(file == NULL);
971 assert(!drv->bdrv_needs_filename || filename != NULL);
972 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
973 } else {
974 if (file == NULL) {
975 error_setg(errp, "Can't use '%s' as a block driver for the "
976 "protocol level", drv->format_name);
977 ret = -EINVAL;
978 goto free_and_fail;
979 }
980 bs->file = file;
981 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
982 }
983
984 if (ret < 0) {
985 if (local_err) {
986 error_propagate(errp, local_err);
987 } else if (bs->filename[0]) {
988 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
989 } else {
990 error_setg_errno(errp, -ret, "Could not open image");
991 }
992 goto free_and_fail;
993 }
994
995 ret = refresh_total_sectors(bs, bs->total_sectors);
996 if (ret < 0) {
997 error_setg_errno(errp, -ret, "Could not refresh total sector count");
998 goto free_and_fail;
999 }
1000
1001 bdrv_refresh_limits(bs, &local_err);
1002 if (local_err) {
1003 error_propagate(errp, local_err);
1004 ret = -EINVAL;
1005 goto free_and_fail;
1006 }
1007
1008 assert(bdrv_opt_mem_align(bs) != 0);
1009 assert((bs->request_alignment != 0) || bs->sg);
1010 return 0;
1011
1012 free_and_fail:
1013 bs->file = NULL;
1014 g_free(bs->opaque);
1015 bs->opaque = NULL;
1016 bs->drv = NULL;
1017 return ret;
1018 }
1019
1020 static QDict *parse_json_filename(const char *filename, Error **errp)
1021 {
1022 QObject *options_obj;
1023 QDict *options;
1024 int ret;
1025
1026 ret = strstart(filename, "json:", &filename);
1027 assert(ret);
1028
1029 options_obj = qobject_from_json(filename);
1030 if (!options_obj) {
1031 error_setg(errp, "Could not parse the JSON options");
1032 return NULL;
1033 }
1034
1035 if (qobject_type(options_obj) != QTYPE_QDICT) {
1036 qobject_decref(options_obj);
1037 error_setg(errp, "Invalid JSON object given");
1038 return NULL;
1039 }
1040
1041 options = qobject_to_qdict(options_obj);
1042 qdict_flatten(options);
1043
1044 return options;
1045 }
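/*
 * Worked example (illustrative, not part of the original file): a filename
 * such as
 *   json:{"driver": "qcow2", "file": {"driver": "file",
 *                                     "filename": "disk.qcow2"}}
 * is parsed into a QDict and flattened by qdict_flatten() into the entries
 *   driver=qcow2, file.driver=file, file.filename=disk.qcow2
 * bdrv_fill_options() below then clears *pfilename, so the open proceeds
 * purely option-driven.
 */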
1046
1047 /*
1048 * Fills in default options for opening images and converts the legacy
1049 * filename/flags pair to option QDict entries.
1050 */
1051 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1052 BlockDriver *drv, Error **errp)
1053 {
1054 const char *filename = *pfilename;
1055 const char *drvname;
1056 bool protocol = flags & BDRV_O_PROTOCOL;
1057 bool parse_filename = false;
1058 Error *local_err = NULL;
1059
1060 /* Parse json: pseudo-protocol */
1061 if (filename && g_str_has_prefix(filename, "json:")) {
1062 QDict *json_options = parse_json_filename(filename, &local_err);
1063 if (local_err) {
1064 error_propagate(errp, local_err);
1065 return -EINVAL;
1066 }
1067
1068 /* Options given in the filename have lower priority than options
1069 * specified directly */
1070 qdict_join(*options, json_options, false);
1071 QDECREF(json_options);
1072 *pfilename = filename = NULL;
1073 }
1074
1075 /* Fetch the file name from the options QDict if necessary */
1076 if (protocol && filename) {
1077 if (!qdict_haskey(*options, "filename")) {
1078 qdict_put(*options, "filename", qstring_from_str(filename));
1079 parse_filename = true;
1080 } else {
1081 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1082 "the same time");
1083 return -EINVAL;
1084 }
1085 }
1086
1087 /* Find the right block driver */
1088 filename = qdict_get_try_str(*options, "filename");
1089 drvname = qdict_get_try_str(*options, "driver");
1090
1091 if (drv) {
1092 if (drvname) {
1093 error_setg(errp, "Driver specified twice");
1094 return -EINVAL;
1095 }
1096 drvname = drv->format_name;
1097 qdict_put(*options, "driver", qstring_from_str(drvname));
1098 } else {
1099 if (!drvname && protocol) {
1100 if (filename) {
1101 drv = bdrv_find_protocol(filename, parse_filename);
1102 if (!drv) {
1103 error_setg(errp, "Unknown protocol");
1104 return -EINVAL;
1105 }
1106
1107 drvname = drv->format_name;
1108 qdict_put(*options, "driver", qstring_from_str(drvname));
1109 } else {
1110 error_setg(errp, "Must specify either driver or file");
1111 return -EINVAL;
1112 }
1113 } else if (drvname) {
1114 drv = bdrv_find_format(drvname);
1115 if (!drv) {
1116 error_setg(errp, "Unknown driver '%s'", drvname);
1117 return -ENOENT;
1118 }
1119 }
1120 }
1121
1122 assert(drv || !protocol);
1123
1124 /* Driver-specific filename parsing */
1125 if (drv && drv->bdrv_parse_filename && parse_filename) {
1126 drv->bdrv_parse_filename(filename, *options, &local_err);
1127 if (local_err) {
1128 error_propagate(errp, local_err);
1129 return -EINVAL;
1130 }
1131
1132 if (!drv->bdrv_needs_filename) {
1133 qdict_del(*options, "filename");
1134 }
1135 }
1136
1137 return 0;
1138 }
1139
1140 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1141 {
1142
1143 if (bs->backing_hd) {
1144 assert(bs->backing_blocker);
1145 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1146 } else if (backing_hd) {
1147 error_setg(&bs->backing_blocker,
1148 "device is used as backing hd of '%s'",
1149 bdrv_get_device_name(bs));
1150 }
1151
1152 bs->backing_hd = backing_hd;
1153 if (!backing_hd) {
1154 error_free(bs->backing_blocker);
1155 bs->backing_blocker = NULL;
1156 goto out;
1157 }
1158 bs->open_flags &= ~BDRV_O_NO_BACKING;
1159 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1160 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1161 backing_hd->drv ? backing_hd->drv->format_name : "");
1162
1163 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1164 /* Otherwise we won't be able to commit due to the check in bdrv_commit */
1165 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1166 bs->backing_blocker);
1167 out:
1168 bdrv_refresh_limits(bs, NULL);
1169 }
1170
1171 /*
1172 * Opens the backing file for a BlockDriverState if not yet open
1173 *
1174 * options is a QDict of options to pass to the block drivers, or NULL for an
1175 * empty set of options. The reference to the QDict is transferred to this
1176 * function (even on failure), so if the caller intends to reuse the dictionary,
1177 * it needs to use QINCREF() before calling bdrv_open_backing_file.
1178 */
1179 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1180 {
1181 char *backing_filename = g_malloc0(PATH_MAX);
1182 int ret = 0;
1183 BlockDriver *back_drv = NULL;
1184 BlockDriverState *backing_hd;
1185 Error *local_err = NULL;
1186
1187 if (bs->backing_hd != NULL) {
1188 QDECREF(options);
1189 goto free_exit;
1190 }
1191
1192 /* NULL means an empty set of options */
1193 if (options == NULL) {
1194 options = qdict_new();
1195 }
1196
1197 bs->open_flags &= ~BDRV_O_NO_BACKING;
1198 if (qdict_haskey(options, "file.filename")) {
1199 backing_filename[0] = '\0';
1200 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1201 QDECREF(options);
1202 goto free_exit;
1203 } else {
1204 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1205 }
1206
1207 if (!bs->drv || !bs->drv->supports_backing) {
1208 ret = -EINVAL;
1209 error_setg(errp, "Driver doesn't support backing files");
1210 QDECREF(options);
1211 goto free_exit;
1212 }
1213
1214 backing_hd = bdrv_new();
1215
1216 if (bs->backing_format[0] != '\0') {
1217 back_drv = bdrv_find_format(bs->backing_format);
1218 }
1219
1220 assert(bs->backing_hd == NULL);
1221 ret = bdrv_open(&backing_hd,
1222 *backing_filename ? backing_filename : NULL, NULL, options,
1223 bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
1224 if (ret < 0) {
1225 bdrv_unref(backing_hd);
1226 backing_hd = NULL;
1227 bs->open_flags |= BDRV_O_NO_BACKING;
1228 error_setg(errp, "Could not open backing file: %s",
1229 error_get_pretty(local_err));
1230 error_free(local_err);
1231 goto free_exit;
1232 }
1233 bdrv_set_backing_hd(bs, backing_hd);
1234
1235 free_exit:
1236 g_free(backing_filename);
1237 return ret;
1238 }
1239
1240 /*
1241 * Opens a disk image whose options are given as BlockdevRef in another block
1242 * device's options.
1243 *
1244 * If allow_none is true, no image will be opened if filename is NULL and no
1245 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1246 *
1247 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
1248 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1249 * itself, all options starting with "${bdref_key}." are considered part of the
1250 * BlockdevRef.
1251 *
1252 * The BlockdevRef will be removed from the options QDict.
1253 *
1254 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1255 */
1256 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1257 QDict *options, const char *bdref_key, int flags,
1258 bool allow_none, Error **errp)
1259 {
1260 QDict *image_options;
1261 int ret;
1262 char *bdref_key_dot;
1263 const char *reference;
1264
1265 assert(pbs);
1266 assert(*pbs == NULL);
1267
1268 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1269 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1270 g_free(bdref_key_dot);
1271
1272 reference = qdict_get_try_str(options, bdref_key);
1273 if (!filename && !reference && !qdict_size(image_options)) {
1274 if (allow_none) {
1275 ret = 0;
1276 } else {
1277 error_setg(errp, "A block device must be specified for \"%s\"",
1278 bdref_key);
1279 ret = -EINVAL;
1280 }
1281 QDECREF(image_options);
1282 goto done;
1283 }
1284
1285 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1286
1287 done:
1288 qdict_del(options, bdref_key);
1289 return ret;
1290 }
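/*
 * Worked example (illustrative, not part of the original file): with
 * bdref_key = "backing" and options containing
 *   backing.driver=qcow2, backing.file.filename=base.qcow2
 * qdict_extract_subqdict() moves those entries into image_options as
 *   driver=qcow2, file.filename=base.qcow2
 * for the nested bdrv_open(). A plain string entry backing=<node-name> is
 * instead treated as a reference to an existing block device.
 */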
1291
1292 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1293 {
1294 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1295 char *tmp_filename = g_malloc0(PATH_MAX + 1);
1296 int64_t total_size;
1297 BlockDriver *bdrv_qcow2;
1298 QemuOpts *opts = NULL;
1299 QDict *snapshot_options;
1300 BlockDriverState *bs_snapshot;
1301 Error *local_err;
1302 int ret;
1303
1304 /* if snapshot, we create a temporary backing file and open it
1305 instead of opening 'filename' directly */
1306
1307 /* Get the required size from the image */
1308 total_size = bdrv_getlength(bs);
1309 if (total_size < 0) {
1310 ret = total_size;
1311 error_setg_errno(errp, -total_size, "Could not get image size");
1312 goto out;
1313 }
1314
1315 /* Create the temporary image */
1316 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1317 if (ret < 0) {
1318 error_setg_errno(errp, -ret, "Could not get temporary filename");
1319 goto out;
1320 }
1321
1322 bdrv_qcow2 = bdrv_find_format("qcow2");
1323 opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1324 &error_abort);
1325 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
1326 ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
1327 qemu_opts_del(opts);
1328 if (ret < 0) {
1329 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1330 "'%s': %s", tmp_filename,
1331 error_get_pretty(local_err));
1332 error_free(local_err);
1333 goto out;
1334 }
1335
1336 /* Prepare a new options QDict for the temporary file */
1337 snapshot_options = qdict_new();
1338 qdict_put(snapshot_options, "file.driver",
1339 qstring_from_str("file"));
1340 qdict_put(snapshot_options, "file.filename",
1341 qstring_from_str(tmp_filename));
1342
1343 bs_snapshot = bdrv_new();
1344
1345 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1346 flags, bdrv_qcow2, &local_err);
1347 if (ret < 0) {
1348 error_propagate(errp, local_err);
1349 goto out;
1350 }
1351
1352 bdrv_append(bs_snapshot, bs);
1353
1354 out:
1355 g_free(tmp_filename);
1356 return ret;
1357 }
1358
1359 /*
1360 * Opens a disk image (raw, qcow2, vmdk, ...)
1361 *
1362 * options is a QDict of options to pass to the block drivers, or NULL for an
1363 * empty set of options. The reference to the QDict belongs to the block layer
1364 * after the call (even on failure), so if the caller intends to reuse the
1365 * dictionary, it needs to use QINCREF() before calling bdrv_open.
1366 *
1367 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1368 * If it is not NULL, the referenced BDS will be reused.
1369 *
1370 * The reference parameter may be used to specify an existing block device which
1371 * should be opened. If specified, neither options nor a filename may be given,
1372 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1373 */
1374 int bdrv_open(BlockDriverState **pbs, const char *filename,
1375 const char *reference, QDict *options, int flags,
1376 BlockDriver *drv, Error **errp)
1377 {
1378 int ret;
1379 BlockDriverState *file = NULL, *bs;
1380 const char *drvname;
1381 Error *local_err = NULL;
1382 int snapshot_flags = 0;
1383
1384 assert(pbs);
1385
1386 if (reference) {
1387 bool options_non_empty = options ? qdict_size(options) : false;
1388 QDECREF(options);
1389
1390 if (*pbs) {
1391 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1392 "another block device");
1393 return -EINVAL;
1394 }
1395
1396 if (filename || options_non_empty) {
1397 error_setg(errp, "Cannot reference an existing block device with "
1398 "additional options or a new filename");
1399 return -EINVAL;
1400 }
1401
1402 bs = bdrv_lookup_bs(reference, reference, errp);
1403 if (!bs) {
1404 return -ENODEV;
1405 }
1406 bdrv_ref(bs);
1407 *pbs = bs;
1408 return 0;
1409 }
1410
1411 if (*pbs) {
1412 bs = *pbs;
1413 } else {
1414 bs = bdrv_new();
1415 }
1416
1417 /* NULL means an empty set of options */
1418 if (options == NULL) {
1419 options = qdict_new();
1420 }
1421
1422 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
1423 if (local_err) {
1424 goto fail;
1425 }
1426
1427 /* Find the right image format driver */
1428 drv = NULL;
1429 drvname = qdict_get_try_str(options, "driver");
1430 if (drvname) {
1431 drv = bdrv_find_format(drvname);
1432 qdict_del(options, "driver");
1433 if (!drv) {
1434 error_setg(errp, "Unknown driver: '%s'", drvname);
1435 ret = -EINVAL;
1436 goto fail;
1437 }
1438 }
1439
1440 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1441 if (drv && !drv->bdrv_file_open) {
1442 /* If the user explicitly wants a format driver here, we'll need to add
1443 * another layer for the protocol in bs->file */
1444 flags &= ~BDRV_O_PROTOCOL;
1445 }
1446
1447 bs->options = options;
1448 options = qdict_clone_shallow(options);
1449
1450 /* Open image file without format layer */
1451 if ((flags & BDRV_O_PROTOCOL) == 0) {
1452 if (flags & BDRV_O_RDWR) {
1453 flags |= BDRV_O_ALLOW_RDWR;
1454 }
1455 if (flags & BDRV_O_SNAPSHOT) {
1456 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1457 flags = bdrv_backing_flags(flags);
1458 }
1459
1460 assert(file == NULL);
1461 ret = bdrv_open_image(&file, filename, options, "file",
1462 bdrv_inherited_flags(flags),
1463 true, &local_err);
1464 if (ret < 0) {
1465 goto fail;
1466 }
1467 }
1468
1469 /* Image format probing */
1470 if (!drv && file) {
1471 ret = find_image_format(file, filename, &drv, &local_err);
1472 if (ret < 0) {
1473 goto fail;
1474 }
1475 } else if (!drv) {
1476 error_setg(errp, "Must specify either driver or file");
1477 ret = -EINVAL;
1478 goto fail;
1479 }
1480
1481 /* Open the image */
1482 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1483 if (ret < 0) {
1484 goto fail;
1485 }
1486
1487 if (file && (bs->file != file)) {
1488 bdrv_unref(file);
1489 file = NULL;
1490 }
1491
1492 /* If there is a backing file, use it */
1493 if ((flags & BDRV_O_NO_BACKING) == 0) {
1494 QDict *backing_options;
1495
1496 qdict_extract_subqdict(options, &backing_options, "backing.");
1497 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1498 if (ret < 0) {
1499 goto close_and_fail;
1500 }
1501 }
1502
1503 bdrv_refresh_filename(bs);
1504
1505 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1506 * temporary snapshot afterwards. */
1507 if (snapshot_flags) {
1508 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
1509 if (local_err) {
1510 goto close_and_fail;
1511 }
1512 }
1513
1514 /* Check if any unknown options were used */
1515 if (options && (qdict_size(options) != 0)) {
1516 const QDictEntry *entry = qdict_first(options);
1517 if (flags & BDRV_O_PROTOCOL) {
1518 error_setg(errp, "Block protocol '%s' doesn't support the option "
1519 "'%s'", drv->format_name, entry->key);
1520 } else {
1521 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1522 "support the option '%s'", drv->format_name,
1523 bdrv_get_device_name(bs), entry->key);
1524 }
1525
1526 ret = -EINVAL;
1527 goto close_and_fail;
1528 }
1529
1530 if (!bdrv_key_required(bs)) {
1531 if (bs->blk) {
1532 blk_dev_change_media_cb(bs->blk, true);
1533 }
1534 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1535 && !runstate_check(RUN_STATE_INMIGRATE)
1536 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1537 error_setg(errp,
1538 "Guest must be stopped for opening of encrypted image");
1539 ret = -EBUSY;
1540 goto close_and_fail;
1541 }
1542
1543 QDECREF(options);
1544 *pbs = bs;
1545 return 0;
1546
1547 fail:
1548 if (file != NULL) {
1549 bdrv_unref(file);
1550 }
1551 QDECREF(bs->options);
1552 QDECREF(options);
1553 bs->options = NULL;
1554 if (!*pbs) {
1555 /* If *pbs is NULL, a new BDS has been created in this function and
1556 needs to be freed now. Otherwise, it does not need to be closed,
1557 since it has not really been opened yet. */
1558 bdrv_unref(bs);
1559 }
1560 if (local_err) {
1561 error_propagate(errp, local_err);
1562 }
1563 return ret;
1564
1565 close_and_fail:
1566 /* See fail path, but now the BDS has to be always closed */
1567 if (*pbs) {
1568 bdrv_close(bs);
1569 } else {
1570 bdrv_unref(bs);
1571 }
1572 QDECREF(options);
1573 if (local_err) {
1574 error_propagate(errp, local_err);
1575 }
1576 return ret;
1577 }
1578
1579 typedef struct BlockReopenQueueEntry {
1580 bool prepared;
1581 BDRVReopenState state;
1582 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1583 } BlockReopenQueueEntry;
1584
1585 /*
1586 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1587 * reopen of multiple devices.
1588 *
1589 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLEQ_INIT
1590 * already performed, or alternatively may be NULL, in which case a new
1591 * BlockReopenQueue will be created and initialized. It should then be
1592 * passed back in for subsequent calls that are intended to be of the same
1593 * atomic 'set'.
1594 *
1595 * bs is the BlockDriverState to add to the reopen queue.
1596 *
1597 * flags contains the open flags for the associated bs
1598 *
1599 * returns a pointer to bs_queue, which is either the newly allocated
1600 * bs_queue, or the existing bs_queue being used.
1601 *
1602 */
1603 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1604 BlockDriverState *bs, int flags)
1605 {
1606 assert(bs != NULL);
1607
1608 BlockReopenQueueEntry *bs_entry;
1609 if (bs_queue == NULL) {
1610 bs_queue = g_new0(BlockReopenQueue, 1);
1611 QSIMPLEQ_INIT(bs_queue);
1612 }
1613
1614 /* bdrv_open() masks this flag out */
1615 flags &= ~BDRV_O_PROTOCOL;
1616
1617 if (bs->file) {
1618 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1619 }
1620
1621 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1622 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1623
1624 bs_entry->state.bs = bs;
1625 bs_entry->state.flags = flags;
1626
1627 return bs_queue;
1628 }
1629
1630 /*
1631 * Reopen multiple BlockDriverStates atomically & transactionally.
1632 *
1633 * The queue passed in (bs_queue) must have been built up previously
1634 * via bdrv_reopen_queue().
1635 *
1636 * Reopens all BDS specified in the queue, with the appropriate
1637 * flags. All devices are prepared for reopen, and failure of any
1638 * device will cause all device changes to be abandoned, and intermediate
1639 * data cleaned up.
1640 *
1641 * If all devices prepare successfully, then the changes are committed
1642 * to all devices.
1643 *
1644 */
1645 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1646 {
1647 int ret = -1;
1648 BlockReopenQueueEntry *bs_entry, *next;
1649 Error *local_err = NULL;
1650
1651 assert(bs_queue != NULL);
1652
1653 bdrv_drain_all();
1654
1655 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1656 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1657 error_propagate(errp, local_err);
1658 goto cleanup;
1659 }
1660 bs_entry->prepared = true;
1661 }
1662
1663 /* If we reach this point, we have success and just need to apply the
1664 * changes
1665 */
1666 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1667 bdrv_reopen_commit(&bs_entry->state);
1668 }
1669
1670 ret = 0;
1671
1672 cleanup:
1673 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1674 if (ret && bs_entry->prepared) {
1675 bdrv_reopen_abort(&bs_entry->state);
1676 }
1677 g_free(bs_entry);
1678 }
1679 g_free(bs_queue);
1680 return ret;
1681 }
1682
1683
1684 /* Reopen a single BlockDriverState with the specified flags. */
1685 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1686 {
1687 int ret = -1;
1688 Error *local_err = NULL;
1689 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1690
1691 ret = bdrv_reopen_multiple(queue, &local_err);
1692 if (local_err != NULL) {
1693 error_propagate(errp, local_err);
1694 }
1695 return ret;
1696 }
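/*
 * Illustrative sketch (not part of the original file): reopening two
 * devices read-only in one atomic set, generalizing what bdrv_reopen()
 * above does for a single device.
 */
static int example_reopen_pair_read_only(BlockDriverState *a,
                                         BlockDriverState *b, Error **errp)
{
    BlockReopenQueue *queue = NULL;

    queue = bdrv_reopen_queue(queue, a, a->open_flags & ~BDRV_O_RDWR);
    queue = bdrv_reopen_queue(queue, b, b->open_flags & ~BDRV_O_RDWR);
    /* prepares every entry, commits on success, aborts on any failure,
     * and frees the queue in all cases */
    return bdrv_reopen_multiple(queue, errp);
}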
1697
1698
1699 /*
1700 * Prepares a BlockDriverState for reopen. All changes are staged in the
1701 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1702 * the block driver's .bdrv_reopen_prepare()
1703 *
1704 * bs is the BlockDriverState to reopen
1705 * flags are the new open flags
1706 * queue is the reopen queue
1707 *
1708 * Returns 0 on success, non-zero on error. On error errp will be set
1709 * as well.
1710 *
1711 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1712 * It is the responsibility of the caller to then call the abort() or
1713 * commit() for any other BDS that have been left in a prepare() state
1714 *
1715 */
1716 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1717 Error **errp)
1718 {
1719 int ret = -1;
1720 Error *local_err = NULL;
1721 BlockDriver *drv;
1722
1723 assert(reopen_state != NULL);
1724 assert(reopen_state->bs->drv != NULL);
1725 drv = reopen_state->bs->drv;
1726
1727 /* if we are to stay read-only, do not allow permission change
1728 * to r/w */
1729 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1730 reopen_state->flags & BDRV_O_RDWR) {
1731 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1732 bdrv_get_device_name(reopen_state->bs));
1733 goto error;
1734 }
1735
1736
1737 ret = bdrv_flush(reopen_state->bs);
1738 if (ret) {
1739 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1740 strerror(-ret));
1741 goto error;
1742 }
1743
1744 if (drv->bdrv_reopen_prepare) {
1745 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1746 if (ret) {
1747 if (local_err != NULL) {
1748 error_propagate(errp, local_err);
1749 } else {
1750 error_setg(errp, "failed while preparing to reopen image '%s'",
1751 reopen_state->bs->filename);
1752 }
1753 goto error;
1754 }
1755 } else {
1756 /* It is currently mandatory to have a bdrv_reopen_prepare()
1757 * handler for each supported drv. */
1758 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1759 drv->format_name, bdrv_get_device_name(reopen_state->bs),
1760 "reopening of file");
1761 ret = -1;
1762 goto error;
1763 }
1764
1765 ret = 0;
1766
1767 error:
1768 return ret;
1769 }
1770
1771 /*
1772 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1773 * makes them final by swapping the staging BlockDriverState contents into
1774 * the active BlockDriverState contents.
1775 */
1776 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1777 {
1778 BlockDriver *drv;
1779
1780 assert(reopen_state != NULL);
1781 drv = reopen_state->bs->drv;
1782 assert(drv != NULL);
1783
1784 /* If there are any driver level actions to take */
1785 if (drv->bdrv_reopen_commit) {
1786 drv->bdrv_reopen_commit(reopen_state);
1787 }
1788
1789 /* set BDS specific flags now */
1790 reopen_state->bs->open_flags = reopen_state->flags;
1791 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1792 BDRV_O_CACHE_WB);
1793 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1794
1795 bdrv_refresh_limits(reopen_state->bs, NULL);
1796 }
1797
1798 /*
1799 * Abort the reopen, and delete and free the staged changes in
1800 * reopen_state
1801 */
1802 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1803 {
1804 BlockDriver *drv;
1805
1806 assert(reopen_state != NULL);
1807 drv = reopen_state->bs->drv;
1808 assert(drv != NULL);
1809
1810 if (drv->bdrv_reopen_abort) {
1811 drv->bdrv_reopen_abort(reopen_state);
1812 }
1813 }
1814
1815
1816 void bdrv_close(BlockDriverState *bs)
1817 {
1818 BdrvAioNotifier *ban, *ban_next;
1819
1820 if (bs->job) {
1821 block_job_cancel_sync(bs->job);
1822 }
1823 bdrv_drain_all(); /* complete I/O */
1824 bdrv_flush(bs);
1825 bdrv_drain_all(); /* in case flush left pending I/O */
1826 notifier_list_notify(&bs->close_notifiers, bs);
1827
1828 if (bs->drv) {
1829 if (bs->backing_hd) {
1830 BlockDriverState *backing_hd = bs->backing_hd;
1831 bdrv_set_backing_hd(bs, NULL);
1832 bdrv_unref(backing_hd);
1833 }
1834 bs->drv->bdrv_close(bs);
1835 g_free(bs->opaque);
1836 bs->opaque = NULL;
1837 bs->drv = NULL;
1838 bs->copy_on_read = 0;
1839 bs->backing_file[0] = '\0';
1840 bs->backing_format[0] = '\0';
1841 bs->total_sectors = 0;
1842 bs->encrypted = 0;
1843 bs->valid_key = 0;
1844 bs->sg = 0;
1845 bs->growable = 0;
1846 bs->zero_beyond_eof = false;
1847 QDECREF(bs->options);
1848 bs->options = NULL;
1849 QDECREF(bs->full_open_options);
1850 bs->full_open_options = NULL;
1851
1852 if (bs->file != NULL) {
1853 bdrv_unref(bs->file);
1854 bs->file = NULL;
1855 }
1856 }
1857
1858 if (bs->blk) {
1859 blk_dev_change_media_cb(bs->blk, false);
1860 }
1861
1862 /*throttling disk I/O limits*/
1863 if (bs->io_limits_enabled) {
1864 bdrv_io_limits_disable(bs);
1865 }
1866
1867 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1868 g_free(ban);
1869 }
1870 QLIST_INIT(&bs->aio_notifiers);
1871 }
1872
1873 void bdrv_close_all(void)
1874 {
1875 BlockDriverState *bs;
1876
1877 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1878 AioContext *aio_context = bdrv_get_aio_context(bs);
1879
1880 aio_context_acquire(aio_context);
1881 bdrv_close(bs);
1882 aio_context_release(aio_context);
1883 }
1884 }
1885
1886 /* Check if any requests are in-flight (including throttled requests) */
1887 static bool bdrv_requests_pending(BlockDriverState *bs)
1888 {
1889 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1890 return true;
1891 }
1892 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1893 return true;
1894 }
1895 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1896 return true;
1897 }
1898 if (bs->file && bdrv_requests_pending(bs->file)) {
1899 return true;
1900 }
1901 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1902 return true;
1903 }
1904 return false;
1905 }
1906
1907 /*
1908 * Wait for pending requests to complete across all BlockDriverStates
1909 *
1910 * This function does not flush data to disk, use bdrv_flush_all() for that
1911 * after calling this function.
1912 *
1913 * Note that completion of an asynchronous I/O operation can trigger any
1914 * number of other I/O operations on other devices---for example a coroutine
1915 * can be arbitrarily complex and a constant flow of I/O can come until the
1916 * coroutine is complete. Because of this, it is not possible to have a
1917 * function to drain a single device's I/O queue.
1918 */
1919 void bdrv_drain_all(void)
1920 {
1921 /* Always run first iteration so any pending completion BHs run */
1922 bool busy = true;
1923 BlockDriverState *bs;
1924
1925 while (busy) {
1926 busy = false;
1927
1928 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1929 AioContext *aio_context = bdrv_get_aio_context(bs);
1930 bool bs_busy;
1931
1932 aio_context_acquire(aio_context);
1933 bdrv_flush_io_queue(bs);
1934 bdrv_start_throttled_reqs(bs);
1935 bs_busy = bdrv_requests_pending(bs);
1936 bs_busy |= aio_poll(aio_context, bs_busy);
1937 aio_context_release(aio_context);
1938
1939 busy |= bs_busy;
1940 }
1941 }
1942 }
1943
1944 /* make a BlockDriverState anonymous by removing it from the bdrv_states and
1945 * graph_bdrv_states lists.
1946 * Also, NUL-terminate the node_name to prevent a double remove */
1947 void bdrv_make_anon(BlockDriverState *bs)
1948 {
1949 /*
1950 * Take care to remove bs from bdrv_states only when it's actually
1951 * in it. Note that bs->device_list.tqe_prev is initially null,
1952 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
1953 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1954 * resetting it to null on remove.
1955 */
1956 if (bs->device_list.tqe_prev) {
1957 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1958 bs->device_list.tqe_prev = NULL;
1959 }
1960 if (bs->node_name[0] != '\0') {
1961 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1962 }
1963 bs->node_name[0] = '\0';
1964 }
1965
1966 static void bdrv_rebind(BlockDriverState *bs)
1967 {
1968 if (bs->drv && bs->drv->bdrv_rebind) {
1969 bs->drv->bdrv_rebind(bs);
1970 }
1971 }
1972
1973 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1974 BlockDriverState *bs_src)
1975 {
1976 /* move some fields that need to stay attached to the device */
1977
1978 /* dev info */
1979 bs_dest->guest_block_size = bs_src->guest_block_size;
1980 bs_dest->copy_on_read = bs_src->copy_on_read;
1981
1982 bs_dest->enable_write_cache = bs_src->enable_write_cache;
1983
1984 /* i/o throttled req */
1985 memcpy(&bs_dest->throttle_state,
1986 &bs_src->throttle_state,
1987 sizeof(ThrottleState));
1988 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
1989 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
1990 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
1991
1992 /* r/w error */
1993 bs_dest->on_read_error = bs_src->on_read_error;
1994 bs_dest->on_write_error = bs_src->on_write_error;
1995
1996 /* i/o status */
1997 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
1998 bs_dest->iostatus = bs_src->iostatus;
1999
2000 /* dirty bitmap */
2001 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
2002
2003 /* reference count */
2004 bs_dest->refcnt = bs_src->refcnt;
2005
2006 /* job */
2007 bs_dest->job = bs_src->job;
2008
2009 /* keep the same entry in bdrv_states */
2010 bs_dest->device_list = bs_src->device_list;
2011 bs_dest->blk = bs_src->blk;
2012
2013 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2014 sizeof(bs_dest->op_blockers));
2015 }
2016
2017 /*
2018 * Swap bs contents for two image chains while they are live,
2019 * while keeping required fields on the BlockDriverState that is
2020 * actually attached to a device.
2021 *
2022 * This will modify the BlockDriverState fields, and swap contents
2023 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2024 *
2025 * bs_new must not be attached to a BlockBackend.
2026 *
2027 * This function does not create any image files.
2028 */
2029 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2030 {
2031 BlockDriverState tmp;
2032
2033 /* The code needs to swap the node_name, but simply swapping node_list won't
2034 * work: first remove the nodes from the graph list, then do the swap, and
2035 * finally insert them back if needed.
2036 */
2037 if (bs_new->node_name[0] != '\0') {
2038 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2039 }
2040 if (bs_old->node_name[0] != '\0') {
2041 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2042 }
2043
2044 /* bs_new must be unattached and shouldn't have anything fancy enabled */
2045 assert(!bs_new->blk);
2046 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2047 assert(bs_new->job == NULL);
2048 assert(bs_new->io_limits_enabled == false);
2049 assert(!throttle_have_timer(&bs_new->throttle_state));
2050
2051 tmp = *bs_new;
2052 *bs_new = *bs_old;
2053 *bs_old = tmp;
2054
2055 /* there are some fields that should not be swapped, move them back */
2056 bdrv_move_feature_fields(&tmp, bs_old);
2057 bdrv_move_feature_fields(bs_old, bs_new);
2058 bdrv_move_feature_fields(bs_new, &tmp);
2059
2060 /* bs_new must remain unattached */
2061 assert(!bs_new->blk);
2062
2063 /* Check a few fields that should remain attached to the device */
2064 assert(bs_new->job == NULL);
2065 assert(bs_new->io_limits_enabled == false);
2066 assert(!throttle_have_timer(&bs_new->throttle_state));
2067
2068 /* insert the nodes back into the graph node list if needed */
2069 if (bs_new->node_name[0] != '\0') {
2070 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2071 }
2072 if (bs_old->node_name[0] != '\0') {
2073 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2074 }
2075
2076 bdrv_rebind(bs_new);
2077 bdrv_rebind(bs_old);
2078 }
2079
2080 /*
2081 * Add new bs contents at the top of an image chain while the chain is
2082 * live, while keeping required fields on the top layer.
2083 *
2084 * This will modify the BlockDriverState fields, and swap contents
2085 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2086 *
2087 * bs_new must not be attached to a BlockBackend.
2088 *
2089 * This function does not create any image files.
2090 */
2091 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2092 {
2093 bdrv_swap(bs_new, bs_top);
2094
2095 /* After the swap, bs_new holds what used to be the top layer's
2096 * contents; link it in as the backing file of the new top. */
2097 bdrv_set_backing_hd(bs_top, bs_new);
2098 }
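
/* A hedged sketch of a typical bdrv_append() use: installing an external
 * snapshot overlay while the guest keeps its BlockDriverState pointer.
 * 'overlay' stands for a hypothetical, already-opened, unattached BDS
 * whose backing file is the current top image.
 */
static void example_install_overlay(BlockDriverState *overlay,
                                    BlockDriverState *top)
{
    /* before: base <- top (attached to the device)
     * after:  base <- old top contents (now in 'overlay') <- device's BDS
     */
    bdrv_append(overlay, top);
}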
2099
2100 static void bdrv_delete(BlockDriverState *bs)
2101 {
2102 assert(!bs->job);
2103 assert(bdrv_op_blocker_is_empty(bs));
2104 assert(!bs->refcnt);
2105 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2106
2107 bdrv_close(bs);
2108
2109 /* remove from list, if necessary */
2110 bdrv_make_anon(bs);
2111
2112 g_free(bs);
2113 }
2114
2115 /*
2116 * Run consistency checks on an image
2117 *
2118 * Returns 0 if the check could be completed (it doesn't mean that the image is
2119 * free of errors) or -errno when an internal error occurred. The results of the
2120 * check are stored in res.
2121 */
2122 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2123 {
2124 if (bs->drv == NULL) {
2125 return -ENOMEDIUM;
2126 }
2127 if (bs->drv->bdrv_check == NULL) {
2128 return -ENOTSUP;
2129 }
2130
2131 memset(res, 0, sizeof(*res));
2132 return bs->drv->bdrv_check(bs, res, fix);
2133 }
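
/* A minimal sketch (not part of block.c) of driving bdrv_check(): a return
 * of 0 only means the check ran to completion, so the caller must inspect
 * the BdrvCheckResult for actual damage. The helper name is hypothetical.
 */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, 0 /* report only, fix nothing */);

    if (ret < 0) {
        return ret;             /* the check itself could not run */
    }
    if (res.corruptions || res.leaks) {
        return -EIO;            /* check ran fine; the image is damaged */
    }
    return 0;
}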
2134
2135 #define COMMIT_BUF_SECTORS 2048
2136
2137 /* commit COW file into the raw image */
2138 int bdrv_commit(BlockDriverState *bs)
2139 {
2140 BlockDriver *drv = bs->drv;
2141 int64_t sector, total_sectors, length, backing_length;
2142 int n, ro, open_flags;
2143 int ret = 0;
2144 uint8_t *buf = NULL;
2145 char filename[PATH_MAX];
2146
2147 if (!drv)
2148 return -ENOMEDIUM;
2149
2150 if (!bs->backing_hd) {
2151 return -ENOTSUP;
2152 }
2153
2154 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2155 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2156 return -EBUSY;
2157 }
2158
2159 ro = bs->backing_hd->read_only;
2160 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2161 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2162 open_flags = bs->backing_hd->open_flags;
2163
2164 if (ro) {
2165 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2166 return -EACCES;
2167 }
2168 }
2169
2170 length = bdrv_getlength(bs);
2171 if (length < 0) {
2172 ret = length;
2173 goto ro_cleanup;
2174 }
2175
2176 backing_length = bdrv_getlength(bs->backing_hd);
2177 if (backing_length < 0) {
2178 ret = backing_length;
2179 goto ro_cleanup;
2180 }
2181
2182 /* If our top snapshot is larger than the backing file image,
2183 * grow the backing file image if possible. If not possible,
2184 * we must return an error */
2185 if (length > backing_length) {
2186 ret = bdrv_truncate(bs->backing_hd, length);
2187 if (ret < 0) {
2188 goto ro_cleanup;
2189 }
2190 }
2191
2192 total_sectors = length >> BDRV_SECTOR_BITS;
2193
2194 /* qemu_try_blockalign() for bs will choose an alignment that works for
2195 * bs->backing_hd as well, so no need to compare the alignment manually. */
2196 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2197 if (buf == NULL) {
2198 ret = -ENOMEM;
2199 goto ro_cleanup;
2200 }
2201
2202 for (sector = 0; sector < total_sectors; sector += n) {
2203 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2204 if (ret < 0) {
2205 goto ro_cleanup;
2206 }
2207 if (ret) {
2208 ret = bdrv_read(bs, sector, buf, n);
2209 if (ret < 0) {
2210 goto ro_cleanup;
2211 }
2212
2213 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2214 if (ret < 0) {
2215 goto ro_cleanup;
2216 }
2217 }
2218 }
2219
2220 if (drv->bdrv_make_empty) {
2221 ret = drv->bdrv_make_empty(bs);
2222 if (ret < 0) {
2223 goto ro_cleanup;
2224 }
2225 bdrv_flush(bs);
2226 }
2227
2228 /*
2229 * Make sure all data we wrote to the backing device is actually
2230 * stable on disk.
2231 */
2232 if (bs->backing_hd) {
2233 bdrv_flush(bs->backing_hd);
2234 }
2235
2236 ret = 0;
2237 ro_cleanup:
2238 qemu_vfree(buf);
2239
2240 if (ro) {
2241 /* ignoring error return here */
2242 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2243 }
2244
2245 return ret;
2246 }
2247
2248 int bdrv_commit_all(void)
2249 {
2250 BlockDriverState *bs;
2251
2252 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2253 AioContext *aio_context = bdrv_get_aio_context(bs);
2254
2255 aio_context_acquire(aio_context);
2256 if (bs->drv && bs->backing_hd) {
2257 int ret = bdrv_commit(bs);
2258 if (ret < 0) {
2259 aio_context_release(aio_context);
2260 return ret;
2261 }
2262 }
2263 aio_context_release(aio_context);
2264 }
2265 return 0;
2266 }
2267
2268 /**
2269 * Remove an active request from the tracked requests list
2270 *
2271 * This function should be called when a tracked request is completing.
2272 */
2273 static void tracked_request_end(BdrvTrackedRequest *req)
2274 {
2275 if (req->serialising) {
2276 req->bs->serialising_in_flight--;
2277 }
2278
2279 QLIST_REMOVE(req, list);
2280 qemu_co_queue_restart_all(&req->wait_queue);
2281 }
2282
2283 /**
2284 * Add an active request to the tracked requests list
2285 */
2286 static void tracked_request_begin(BdrvTrackedRequest *req,
2287 BlockDriverState *bs,
2288 int64_t offset,
2289 unsigned int bytes, bool is_write)
2290 {
2291 *req = (BdrvTrackedRequest){
2292 .bs = bs,
2293 .offset = offset,
2294 .bytes = bytes,
2295 .is_write = is_write,
2296 .co = qemu_coroutine_self(),
2297 .serialising = false,
2298 .overlap_offset = offset,
2299 .overlap_bytes = bytes,
2300 };
2301
2302 qemu_co_queue_init(&req->wait_queue);
2303
2304 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2305 }
2306
2307 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2308 {
2309 int64_t overlap_offset = req->offset & ~(align - 1);
2310 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2311 - overlap_offset;
2312
2313 if (!req->serialising) {
2314 req->bs->serialising_in_flight++;
2315 req->serialising = true;
2316 }
2317
2318 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2319 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2320 }
2321
2322 /**
2323 * Round a region to cluster boundaries
2324 */
2325 void bdrv_round_to_clusters(BlockDriverState *bs,
2326 int64_t sector_num, int nb_sectors,
2327 int64_t *cluster_sector_num,
2328 int *cluster_nb_sectors)
2329 {
2330 BlockDriverInfo bdi;
2331
2332 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2333 *cluster_sector_num = sector_num;
2334 *cluster_nb_sectors = nb_sectors;
2335 } else {
2336 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2337 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2338 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2339 nb_sectors, c);
2340 }
2341 }
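
/* Worked example for the rounding above, assuming a hypothetical 64 KiB
 * cluster size (128 sectors): a request for sectors [130, 140) yields
 * cluster_sector_num = QEMU_ALIGN_DOWN(130, 128) = 128 and
 * cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 10, 128) = 128, i.e. the
 * single enclosing cluster [128, 256).
 */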
2342
2343 static int bdrv_get_cluster_size(BlockDriverState *bs)
2344 {
2345 BlockDriverInfo bdi;
2346 int ret;
2347
2348 ret = bdrv_get_info(bs, &bdi);
2349 if (ret < 0 || bdi.cluster_size == 0) {
2350 return bs->request_alignment;
2351 } else {
2352 return bdi.cluster_size;
2353 }
2354 }
2355
2356 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2357 int64_t offset, unsigned int bytes)
2358 {
2359 /* aaaa bbbb */
2360 if (offset >= req->overlap_offset + req->overlap_bytes) {
2361 return false;
2362 }
2363 /* bbbb aaaa */
2364 if (req->overlap_offset >= offset + bytes) {
2365 return false;
2366 }
2367 return true;
2368 }
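
/* Illustration with hypothetical values: a request with overlap_offset = 4096
 * and overlap_bytes = 4096 covers the half-open range [4096, 8192), so
 * [0, 4096) and [8192, 12288) do not overlap it, while any range containing
 * a byte in 4096..8191 does.
 */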
2369
2370 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2371 {
2372 BlockDriverState *bs = self->bs;
2373 BdrvTrackedRequest *req;
2374 bool retry;
2375 bool waited = false;
2376
2377 if (!bs->serialising_in_flight) {
2378 return false;
2379 }
2380
2381 do {
2382 retry = false;
2383 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2384 if (req == self || (!req->serialising && !self->serialising)) {
2385 continue;
2386 }
2387 if (tracked_request_overlaps(req, self->overlap_offset,
2388 self->overlap_bytes))
2389 {
2390 /* Hitting this means there was a reentrant request, for
2391 * example, a block driver issuing nested requests. This must
2392 * never happen since it means deadlock.
2393 */
2394 assert(qemu_coroutine_self() != req->co);
2395
2396 /* If the request is already (indirectly) waiting for us, or
2397 * will wait for us as soon as it wakes up, then just go on
2398 * (instead of producing a deadlock in the former case). */
2399 if (!req->waiting_for) {
2400 self->waiting_for = req;
2401 qemu_co_queue_wait(&req->wait_queue);
2402 self->waiting_for = NULL;
2403 retry = true;
2404 waited = true;
2405 break;
2406 }
2407 }
2408 }
2409 } while (retry);
2410
2411 return waited;
2412 }
2413
2414 /*
2415 * Return values:
2416 * 0 - success
2417 * -EINVAL - backing format specified, but no file
2418 * -ENOSPC - can't update the backing file because no space is left in the
2419 * image file header
2420 * -ENOTSUP - format driver doesn't support changing the backing file
2421 */
2422 int bdrv_change_backing_file(BlockDriverState *bs,
2423 const char *backing_file, const char *backing_fmt)
2424 {
2425 BlockDriver *drv = bs->drv;
2426 int ret;
2427
2428 /* Backing file format doesn't make sense without a backing file */
2429 if (backing_fmt && !backing_file) {
2430 return -EINVAL;
2431 }
2432
2433 if (drv->bdrv_change_backing_file != NULL) {
2434 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2435 } else {
2436 ret = -ENOTSUP;
2437 }
2438
2439 if (ret == 0) {
2440 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2441 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2442 }
2443 return ret;
2444 }
2445
2446 /*
2447 * Finds the image layer in the chain that has 'bs' as its backing file.
2448 *
2449 * active is the current topmost image.
2450 *
2451 * Returns NULL if bs is not found in active's image chain,
2452 * or if active == bs.
2453 *
2454 * Returns the bottommost base image if bs == NULL.
2455 */
2456 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2457 BlockDriverState *bs)
2458 {
2459 while (active && bs != active->backing_hd) {
2460 active = active->backing_hd;
2461 }
2462
2463 return active;
2464 }
2465
2466 /* Given a BDS, searches for the base layer. */
2467 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2468 {
2469 return bdrv_find_overlay(bs, NULL);
2470 }
2471
2472 typedef struct BlkIntermediateStates {
2473 BlockDriverState *bs;
2474 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2475 } BlkIntermediateStates;
2476
2477
2478 /*
2479 * Drops images above 'base' up to and including 'top', and sets the image
2480 * above 'top' to have base as its backing file.
2481 *
2482 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2483 * information in 'bs' can be properly updated.
2484 *
2485 * E.g., this will convert the following chain:
2486 * bottom <- base <- intermediate <- top <- active
2487 *
2488 * to
2489 *
2490 * bottom <- base <- active
2491 *
2492 * It is allowed for bottom==base, in which case it converts:
2493 *
2494 * base <- intermediate <- top <- active
2495 *
2496 * to
2497 *
2498 * base <- active
2499 *
2500 * If backing_file_str is non-NULL, it will be used when modifying top's
2501 * overlay image metadata.
2502 *
2503 * Error conditions:
2504 * if active == top, that is considered an error
2505 *
2506 */
2507 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2508 BlockDriverState *base, const char *backing_file_str)
2509 {
2510 BlockDriverState *intermediate;
2511 BlockDriverState *base_bs = NULL;
2512 BlockDriverState *new_top_bs = NULL;
2513 BlkIntermediateStates *intermediate_state, *next;
2514 int ret = -EIO;
2515
2516 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2517 QSIMPLEQ_INIT(&states_to_delete);
2518
2519 if (!top->drv || !base->drv) {
2520 goto exit;
2521 }
2522
2523 new_top_bs = bdrv_find_overlay(active, top);
2524
2525 if (new_top_bs == NULL) {
2526 /* we could not find the image above 'top'; this is an error */
2527 goto exit;
2528 }
2529
2530 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2531 * to do, no intermediate images */
2532 if (new_top_bs->backing_hd == base) {
2533 ret = 0;
2534 goto exit;
2535 }
2536
2537 intermediate = top;
2538
2539 /* now we will go down through the list, and add each BDS we find
2540 * into our deletion queue, until we hit the 'base'
2541 */
2542 while (intermediate) {
2543 intermediate_state = g_new0(BlkIntermediateStates, 1);
2544 intermediate_state->bs = intermediate;
2545 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2546
2547 if (intermediate->backing_hd == base) {
2548 base_bs = intermediate->backing_hd;
2549 break;
2550 }
2551 intermediate = intermediate->backing_hd;
2552 }
2553 if (base_bs == NULL) {
2554 /* Something went wrong: we did not end at the base. Safely
2555 * unravel everything and exit with an error. */
2556 goto exit;
2557 }
2558
2559 /* success - we can delete the intermediate states, and link top->base */
2560 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2561 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2562 base_bs->drv ? base_bs->drv->format_name : "");
2563 if (ret) {
2564 goto exit;
2565 }
2566 bdrv_set_backing_hd(new_top_bs, base_bs);
2567
2568 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2569 /* so that bdrv_close() does not recursively close the chain */
2570 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2571 bdrv_unref(intermediate_state->bs);
2572 }
2573 ret = 0;
2574
2575 exit:
2576 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2577 g_free(intermediate_state);
2578 }
2579 return ret;
2580 }
2581
2582
2583 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2584 size_t size)
2585 {
2586 int64_t len;
2587
2588 if (size > INT_MAX) {
2589 return -EIO;
2590 }
2591
2592 if (!bdrv_is_inserted(bs))
2593 return -ENOMEDIUM;
2594
2595 if (bs->growable)
2596 return 0;
2597
2598 len = bdrv_getlength(bs);
2599
2600 if (offset < 0)
2601 return -EIO;
2602
2603 if ((offset > len) || (len - offset < size))
2604 return -EIO;
2605
2606 return 0;
2607 }
2608
2609 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2610 int nb_sectors)
2611 {
2612 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2613 return -EIO;
2614 }
2615
2616 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2617 nb_sectors * BDRV_SECTOR_SIZE);
2618 }
2619
2620 typedef struct RwCo {
2621 BlockDriverState *bs;
2622 int64_t offset;
2623 QEMUIOVector *qiov;
2624 bool is_write;
2625 int ret;
2626 BdrvRequestFlags flags;
2627 } RwCo;
2628
2629 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2630 {
2631 RwCo *rwco = opaque;
2632
2633 if (!rwco->is_write) {
2634 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2635 rwco->qiov->size, rwco->qiov,
2636 rwco->flags);
2637 } else {
2638 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2639 rwco->qiov->size, rwco->qiov,
2640 rwco->flags);
2641 }
2642 }
2643
2644 /*
2645 * Process a vectored synchronous request using coroutines
2646 */
2647 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2648 QEMUIOVector *qiov, bool is_write,
2649 BdrvRequestFlags flags)
2650 {
2651 Coroutine *co;
2652 RwCo rwco = {
2653 .bs = bs,
2654 .offset = offset,
2655 .qiov = qiov,
2656 .is_write = is_write,
2657 .ret = NOT_DONE,
2658 .flags = flags,
2659 };
2660
2661 /**
2662 * In a synchronous call context, when the vcpu is blocked, the throttling
2663 * timer will not fire, so I/O throttling has to be disabled here if it
2664 * has been enabled.
2665 */
2666 if (bs->io_limits_enabled) {
2667 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2668 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2669 bdrv_io_limits_disable(bs);
2670 }
2671
2672 if (qemu_in_coroutine()) {
2673 /* Fast-path if already in coroutine context */
2674 bdrv_rw_co_entry(&rwco);
2675 } else {
2676 AioContext *aio_context = bdrv_get_aio_context(bs);
2677
2678 co = qemu_coroutine_create(bdrv_rw_co_entry);
2679 qemu_coroutine_enter(co, &rwco);
2680 while (rwco.ret == NOT_DONE) {
2681 aio_poll(aio_context, true);
2682 }
2683 }
2684 return rwco.ret;
2685 }
2686
2687 /*
2688 * Process a synchronous request using coroutines
2689 */
2690 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2691 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2692 {
2693 QEMUIOVector qiov;
2694 struct iovec iov = {
2695 .iov_base = (void *)buf,
2696 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2697 };
2698
2699 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2700 return -EINVAL;
2701 }
2702
2703 qemu_iovec_init_external(&qiov, &iov, 1);
2704 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2705 &qiov, is_write, flags);
2706 }
2707
2708 /* Return < 0 on error. See bdrv_write() for the return codes. */
2709 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2710 uint8_t *buf, int nb_sectors)
2711 {
2712 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2713 }
2714
2715 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2716 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2717 uint8_t *buf, int nb_sectors)
2718 {
2719 bool enabled;
2720 int ret;
2721
2722 enabled = bs->io_limits_enabled;
2723 bs->io_limits_enabled = false;
2724 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2725 bs->io_limits_enabled = enabled;
2726 return ret;
2727 }
2728
2729 /* Return < 0 if error. Important errors are:
2730 -EIO generic I/O error (may happen for all errors)
2731 -ENOMEDIUM No media inserted.
2732 -EINVAL Invalid sector number or nb_sectors
2733 -EACCES Trying to write a read-only device
2734 */
2735 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2736 const uint8_t *buf, int nb_sectors)
2737 {
2738 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2739 }
2740
2741 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2742 int nb_sectors, BdrvRequestFlags flags)
2743 {
2744 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2745 BDRV_REQ_ZERO_WRITE | flags);
2746 }
2747
2748 /*
2749 * Completely zero out a block device with the help of bdrv_write_zeroes.
2750 * The operation is sped up by checking the block status and only writing
2751 * zeroes to the device if they currently do not return zeroes. Optional
2752 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2753 *
2754 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2755 */
2756 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2757 {
2758 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2759 int n;
2760
2761 target_sectors = bdrv_nb_sectors(bs);
2762 if (target_sectors < 0) {
2763 return target_sectors;
2764 }
2765
2766 for (;;) {
2767 nb_sectors = target_sectors - sector_num;
2768 if (nb_sectors <= 0) {
2769 return 0;
2770 }
2771 if (nb_sectors > INT_MAX) {
2772 nb_sectors = INT_MAX;
2773 }
2774 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2775 if (ret < 0) {
2776 error_report("error getting block status at sector %" PRId64 ": %s",
2777 sector_num, strerror(-ret));
2778 return ret;
2779 }
2780 if (ret & BDRV_BLOCK_ZERO) {
2781 sector_num += n;
2782 continue;
2783 }
2784 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2785 if (ret < 0) {
2786 error_report("error writing zeroes at sector %" PRId64 ": %s",
2787 sector_num, strerror(-ret));
2788 return ret;
2789 }
2790 sector_num += n;
2791 }
2792 }
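
/* Typical call, as a sketch: zero out a whole device while letting the
 * driver unmap/discard instead of physically writing zeroes where it can:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 */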
2793
2794 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2795 {
2796 QEMUIOVector qiov;
2797 struct iovec iov = {
2798 .iov_base = (void *)buf,
2799 .iov_len = bytes,
2800 };
2801 int ret;
2802
2803 if (bytes < 0) {
2804 return -EINVAL;
2805 }
2806
2807 qemu_iovec_init_external(&qiov, &iov, 1);
2808 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2809 if (ret < 0) {
2810 return ret;
2811 }
2812
2813 return bytes;
2814 }
2815
2816 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2817 {
2818 int ret;
2819
2820 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2821 if (ret < 0) {
2822 return ret;
2823 }
2824
2825 return qiov->size;
2826 }
2827
2828 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2829 const void *buf, int bytes)
2830 {
2831 QEMUIOVector qiov;
2832 struct iovec iov = {
2833 .iov_base = (void *) buf,
2834 .iov_len = bytes,
2835 };
2836
2837 if (bytes < 0) {
2838 return -EINVAL;
2839 }
2840
2841 qemu_iovec_init_external(&qiov, &iov, 1);
2842 return bdrv_pwritev(bs, offset, &qiov);
2843 }
2844
2845 /*
2846 * Writes to the file and ensures that no writes are reordered across this
2847 * request (acts as a barrier)
2848 *
2849 * Returns 0 on success, -errno in error cases.
2850 */
2851 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2852 const void *buf, int count)
2853 {
2854 int ret;
2855
2856 ret = bdrv_pwrite(bs, offset, buf, count);
2857 if (ret < 0) {
2858 return ret;
2859 }
2860
2861 /* No flush needed for cache modes that already do it */
2862 if (bs->enable_write_cache) {
2863 bdrv_flush(bs);
2864 }
2865
2866 return 0;
2867 }
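
/* A sketch (not part of block.c) of the kind of update this barrier is for:
 * a format driver persisting a metadata field through bs->file before any
 * dependent writes may proceed. The field and its offset are purely
 * illustrative assumptions.
 */
static int example_update_header_field(BlockDriverState *bs,
                                       uint64_t feature_bits)
{
    uint64_t be_val = cpu_to_be64(feature_bits);

    /* offset 8 is a hypothetical header field location */
    return bdrv_pwrite_sync(bs->file, 8, &be_val, sizeof(be_val));
}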
2868
2869 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2870 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2871 {
2872 /* Perform I/O through a temporary buffer so that users who scribble over
2873 * their read buffer while the operation is in progress do not end up
2874 * modifying the image file. This is critical for zero-copy guest I/O
2875 * where anything might happen inside guest memory.
2876 */
2877 void *bounce_buffer;
2878
2879 BlockDriver *drv = bs->drv;
2880 struct iovec iov;
2881 QEMUIOVector bounce_qiov;
2882 int64_t cluster_sector_num;
2883 int cluster_nb_sectors;
2884 size_t skip_bytes;
2885 int ret;
2886
2887 /* Cover the entire cluster so no additional backing file I/O is required
2888 * when allocating a cluster in the image file.
2889 */
2890 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2891 &cluster_sector_num, &cluster_nb_sectors);
2892
2893 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2894 cluster_sector_num, cluster_nb_sectors);
2895
2896 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2897 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2898 if (bounce_buffer == NULL) {
2899 ret = -ENOMEM;
2900 goto err;
2901 }
2902
2903 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2904
2905 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2906 &bounce_qiov);
2907 if (ret < 0) {
2908 goto err;
2909 }
2910
2911 if (drv->bdrv_co_write_zeroes &&
2912 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2913 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2914 cluster_nb_sectors, 0);
2915 } else {
2916 /* This does not change the data on the disk, it is not necessary
2917 * to flush even in cache=writethrough mode.
2918 */
2919 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2920 &bounce_qiov);
2921 }
2922
2923 if (ret < 0) {
2924 /* It might be okay to ignore write errors for guest requests. If this
2925 * is a deliberate copy-on-read then we don't want to ignore the error.
2926 * Simply report it in all cases.
2927 */
2928 goto err;
2929 }
2930
2931 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2932 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2933 nb_sectors * BDRV_SECTOR_SIZE);
2934
2935 err:
2936 qemu_vfree(bounce_buffer);
2937 return ret;
2938 }
2939
2940 /*
2941 * Forwards an already correctly aligned request to the BlockDriver. This
2942 * handles copy on read and zeroing after EOF; any other features must be
2943 * implemented by the caller.
2944 */
2945 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2946 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2947 int64_t align, QEMUIOVector *qiov, int flags)
2948 {
2949 BlockDriver *drv = bs->drv;
2950 int ret;
2951
2952 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2953 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2954
2955 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2956 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2957 assert(!qiov || bytes == qiov->size);
2958
2959 /* Handle Copy on Read and associated serialisation */
2960 if (flags & BDRV_REQ_COPY_ON_READ) {
2961 /* If we touch the same cluster it counts as an overlap. This
2962 * guarantees that allocating writes will be serialized and not race
2963 * with each other for the same cluster. For example, in copy-on-read
2964 * it ensures that the CoR read and write operations are atomic and
2965 * guest writes cannot interleave between them. */
2966 mark_request_serialising(req, bdrv_get_cluster_size(bs));
2967 }
2968
2969 wait_serialising_requests(req);
2970
2971 if (flags & BDRV_REQ_COPY_ON_READ) {
2972 int pnum;
2973
2974 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2975 if (ret < 0) {
2976 goto out;
2977 }
2978
2979 if (!ret || pnum != nb_sectors) {
2980 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2981 goto out;
2982 }
2983 }
2984
2985 /* Forward the request to the BlockDriver */
2986 if (!(bs->zero_beyond_eof && bs->growable)) {
2987 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2988 } else {
2989 /* Read zeros after EOF of growable BDSes */
2990 int64_t total_sectors, max_nb_sectors;
2991
2992 total_sectors = bdrv_nb_sectors(bs);
2993 if (total_sectors < 0) {
2994 ret = total_sectors;
2995 goto out;
2996 }
2997
2998 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2999 align >> BDRV_SECTOR_BITS);
3000 if (max_nb_sectors > 0) {
3001 QEMUIOVector local_qiov;
3002 size_t local_sectors;
3003
3004 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3005 local_sectors = MIN(max_nb_sectors, nb_sectors);
3006
3007 qemu_iovec_init(&local_qiov, qiov->niov);
3008 qemu_iovec_concat(&local_qiov, qiov, 0,
3009 local_sectors * BDRV_SECTOR_SIZE);
3010
3011 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3012 &local_qiov);
3013
3014 qemu_iovec_destroy(&local_qiov);
3015 } else {
3016 ret = 0;
3017 }
3018
3019 /* Reading beyond end of file is supposed to produce zeroes */
3020 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3021 uint64_t offset = MAX(0, total_sectors - sector_num);
3022 uint64_t bytes = (sector_num + nb_sectors - offset) *
3023 BDRV_SECTOR_SIZE;
3024 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3025 }
3026 }
3027
3028 out:
3029 return ret;
3030 }
3031
3032 /*
3033 * Handle a read request in coroutine context
3034 */
3035 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3036 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3037 BdrvRequestFlags flags)
3038 {
3039 BlockDriver *drv = bs->drv;
3040 BdrvTrackedRequest req;
3041
3042 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3043 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3044 uint8_t *head_buf = NULL;
3045 uint8_t *tail_buf = NULL;
3046 QEMUIOVector local_qiov;
3047 bool use_local_qiov = false;
3048 int ret;
3049
3050 if (!drv) {
3051 return -ENOMEDIUM;
3052 }
3053 if (bdrv_check_byte_request(bs, offset, bytes)) {
3054 return -EIO;
3055 }
3056
3057 if (bs->copy_on_read) {
3058 flags |= BDRV_REQ_COPY_ON_READ;
3059 }
3060
3061 /* throttling disk I/O */
3062 if (bs->io_limits_enabled) {
3063 bdrv_io_limits_intercept(bs, bytes, false);
3064 }
3065
3066 /* Align read if necessary by padding qiov */
3067 if (offset & (align - 1)) {
3068 head_buf = qemu_blockalign(bs, align);
3069 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3070 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3071 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3072 use_local_qiov = true;
3073
3074 bytes += offset & (align - 1);
3075 offset = offset & ~(align - 1);
3076 }
3077
3078 if ((offset + bytes) & (align - 1)) {
3079 if (!use_local_qiov) {
3080 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3081 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3082 use_local_qiov = true;
3083 }
3084 tail_buf = qemu_blockalign(bs, align);
3085 qemu_iovec_add(&local_qiov, tail_buf,
3086 align - ((offset + bytes) & (align - 1)));
3087
3088 bytes = ROUND_UP(bytes, align);
3089 }
3090
3091 tracked_request_begin(&req, bs, offset, bytes, false);
3092 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3093 use_local_qiov ? &local_qiov : qiov,
3094 flags);
3095 tracked_request_end(&req);
3096
3097 if (use_local_qiov) {
3098 qemu_iovec_destroy(&local_qiov);
3099 qemu_vfree(head_buf);
3100 qemu_vfree(tail_buf);
3101 }
3102
3103 return ret;
3104 }
3105
3106 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3107 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3108 BdrvRequestFlags flags)
3109 {
3110 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3111 return -EINVAL;
3112 }
3113
3114 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3115 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3116 }
3117
3118 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3119 int nb_sectors, QEMUIOVector *qiov)
3120 {
3121 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3122
3123 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3124 }
3125
3126 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3127 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3128 {
3129 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3130
3131 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3132 BDRV_REQ_COPY_ON_READ);
3133 }
3134
3135 /* if no limit is specified in the BlockLimits use a default
3136 * of 32768 512-byte sectors (16 MiB) per request.
3137 */
3138 #define MAX_WRITE_ZEROES_DEFAULT 32768
3139
3140 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3141 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3142 {
3143 BlockDriver *drv = bs->drv;
3144 QEMUIOVector qiov;
3145 struct iovec iov = {0};
3146 int ret = 0;
3147
3148 int max_write_zeroes = bs->bl.max_write_zeroes ?
3149 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3150
3151 while (nb_sectors > 0 && !ret) {
3152 int num = nb_sectors;
3153
3154 /* Align request. Block drivers can expect the "bulk" of the request
3155 * to be aligned.
3156 */
3157 if (bs->bl.write_zeroes_alignment
3158 && num > bs->bl.write_zeroes_alignment) {
3159 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3160 /* Make a small request up to the first aligned sector. */
3161 num = bs->bl.write_zeroes_alignment;
3162 num -= sector_num % bs->bl.write_zeroes_alignment;
3163 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3164 /* Shorten the request to the last aligned sector. num cannot
3165 * underflow because num > bs->bl.write_zeroes_alignment.
3166 */
3167 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3168 }
3169 }
3170
3171 /* limit request size */
3172 if (num > max_write_zeroes) {
3173 num = max_write_zeroes;
3174 }
3175
3176 ret = -ENOTSUP;
3177 /* First try the efficient write zeroes operation */
3178 if (drv->bdrv_co_write_zeroes) {
3179 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3180 }
3181
3182 if (ret == -ENOTSUP) {
3183 /* Fall back to bounce buffer if write zeroes is unsupported */
3184 iov.iov_len = num * BDRV_SECTOR_SIZE;
3185 if (iov.iov_base == NULL) {
3186 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3187 if (iov.iov_base == NULL) {
3188 ret = -ENOMEM;
3189 goto fail;
3190 }
3191 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3192 }
3193 qemu_iovec_init_external(&qiov, &iov, 1);
3194
3195 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3196
3197 /* Keep the bounce buffer around if it is big enough for all
3198 * future requests.
3199 */
3200 if (num < max_write_zeroes) {
3201 qemu_vfree(iov.iov_base);
3202 iov.iov_base = NULL;
3203 }
3204 }
3205
3206 sector_num += num;
3207 nb_sectors -= num;
3208 }
3209
3210 fail:
3211 qemu_vfree(iov.iov_base);
3212 return ret;
3213 }
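
/* Worked example for the alignment loop above, assuming a hypothetical
 * write_zeroes_alignment of 128 sectors and a request for sectors
 * [200, 1200):
 *   1st pass: num = 128 - (200 % 128) = 56        -> covers [200, 256)
 *   2nd pass: num = 944 - ((256 + 944) % 128)
 *                 = 944 - 48 = 896                -> covers [256, 1152)
 *   3rd pass: num = 48                            -> covers [1152, 1200)
 * so the driver sees one fully aligned bulk request plus small head and
 * tail pieces.
 */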
3214
3215 /*
3216 * Forwards an already correctly aligned write request to the BlockDriver.
3217 */
3218 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3219 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3220 QEMUIOVector *qiov, int flags)
3221 {
3222 BlockDriver *drv = bs->drv;
3223 bool waited;
3224 int ret;
3225
3226 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3227 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3228
3229 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3230 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3231 assert(!qiov || bytes == qiov->size);
3232
3233 waited = wait_serialising_requests(req);
3234 assert(!waited || !req->serialising);
3235 assert(req->overlap_offset <= offset);
3236 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3237
3238 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3239
3240 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3241 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3242 qemu_iovec_is_zero(qiov)) {
3243 flags |= BDRV_REQ_ZERO_WRITE;
3244 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3245 flags |= BDRV_REQ_MAY_UNMAP;
3246 }
3247 }
3248
3249 if (ret < 0) {
3250 /* Do nothing, write notifier decided to fail this request */
3251 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3252 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3253 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3254 } else {
3255 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3256 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3257 }
3258 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3259
3260 if (ret == 0 && !bs->enable_write_cache) {
3261 ret = bdrv_co_flush(bs);
3262 }
3263
3264 bdrv_set_dirty(bs, sector_num, nb_sectors);
3265
3266 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3267
3268 if (bs->growable && ret >= 0) {
3269 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3270 }
3271
3272 return ret;
3273 }
3274
3275 /*
3276 * Handle a write request in coroutine context
3277 */
3278 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3279 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3280 BdrvRequestFlags flags)
3281 {
3282 BdrvTrackedRequest req;
3283 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3284 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3285 uint8_t *head_buf = NULL;
3286 uint8_t *tail_buf = NULL;
3287 QEMUIOVector local_qiov;
3288 bool use_local_qiov = false;
3289 int ret;
3290
3291 if (!bs->drv) {
3292 return -ENOMEDIUM;
3293 }
3294 if (bs->read_only) {
3295 return -EACCES;
3296 }
3297 if (bdrv_check_byte_request(bs, offset, bytes)) {
3298 return -EIO;
3299 }
3300
3301 /* throttling disk I/O */
3302 if (bs->io_limits_enabled) {
3303 bdrv_io_limits_intercept(bs, bytes, true);
3304 }
3305
3306 /*
3307 * Align write if necessary by performing a read-modify-write cycle.
3308 * Pad qiov with the read parts and be sure to have a tracked request not
3309 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3310 */
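    /* Worked example (hypothetical 512-byte alignment): a 1000-byte write at
     * offset 700 reads back the head sector [512, 1024) and the tail sector
     * [1536, 2048), pads the qiov with the read-back parts, and issues one
     * aligned write covering [512, 2048).
     */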
3311 tracked_request_begin(&req, bs, offset, bytes, true);
3312
3313 if (offset & (align - 1)) {
3314 QEMUIOVector head_qiov;
3315 struct iovec head_iov;
3316
3317 mark_request_serialising(&req, align);
3318 wait_serialising_requests(&req);
3319
3320 head_buf = qemu_blockalign(bs, align);
3321 head_iov = (struct iovec) {
3322 .iov_base = head_buf,
3323 .iov_len = align,
3324 };
3325 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3326
3327 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3328 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3329 align, &head_qiov, 0);
3330 if (ret < 0) {
3331 goto fail;
3332 }
3333 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3334
3335 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3336 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3337 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3338 use_local_qiov = true;
3339
3340 bytes += offset & (align - 1);
3341 offset = offset & ~(align - 1);
3342 }
3343
3344 if ((offset + bytes) & (align - 1)) {
3345 QEMUIOVector tail_qiov;
3346 struct iovec tail_iov;
3347 size_t tail_bytes;
3348 bool waited;
3349
3350 mark_request_serialising(&req, align);
3351 waited = wait_serialising_requests(&req);
3352 assert(!waited || !use_local_qiov);
3353
3354 tail_buf = qemu_blockalign(bs, align);
3355 tail_iov = (struct iovec) {
3356 .iov_base = tail_buf,
3357 .iov_len = align,
3358 };
3359 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3360
3361 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3362 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3363 align, &tail_qiov, 0);
3364 if (ret < 0) {
3365 goto fail;
3366 }
3367 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3368
3369 if (!use_local_qiov) {
3370 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3371 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3372 use_local_qiov = true;
3373 }
3374
3375 tail_bytes = (offset + bytes) & (align - 1);
3376 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3377
3378 bytes = ROUND_UP(bytes, align);
3379 }
3380
3381 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3382 use_local_qiov ? &local_qiov : qiov,
3383 flags);
3384
3385 fail:
3386 tracked_request_end(&req);
3387
3388 if (use_local_qiov) {
3389 qemu_iovec_destroy(&local_qiov);
3390 }
3391 qemu_vfree(head_buf);
3392 qemu_vfree(tail_buf);
3393
3394 return ret;
3395 }
3396
3397 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3398 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3399 BdrvRequestFlags flags)
3400 {
3401 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3402 return -EINVAL;
3403 }
3404
3405 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3406 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3407 }
3408
3409 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3410 int nb_sectors, QEMUIOVector *qiov)
3411 {
3412 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3413
3414 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3415 }
3416
3417 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3418 int64_t sector_num, int nb_sectors,
3419 BdrvRequestFlags flags)
3420 {
3421 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3422
3423 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3424 flags &= ~BDRV_REQ_MAY_UNMAP;
3425 }
3426
3427 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3428 BDRV_REQ_ZERO_WRITE | flags);
3429 }
3430
3431 /**
3432 * Truncate file to 'offset' bytes (needed only for file protocols)
3433 */
3434 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3435 {
3436 BlockDriver *drv = bs->drv;
3437 int ret;
3438 if (!drv)
3439 return -ENOMEDIUM;
3440 if (!drv->bdrv_truncate)
3441 return -ENOTSUP;
3442 if (bs->read_only)
3443 return -EACCES;
3444
3445 ret = drv->bdrv_truncate(bs, offset);
3446 if (ret == 0) {
3447 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3448 if (bs->blk) {
3449 blk_dev_resize_cb(bs->blk);
3450 }
3451 }
3452 return ret;
3453 }
3454
3455 /**
3456 * Length of an allocated file in bytes. Sparse files are counted by actual
3457 * allocated space. Return < 0 on error or if unknown.
3458 */
3459 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3460 {
3461 BlockDriver *drv = bs->drv;
3462 if (!drv) {
3463 return -ENOMEDIUM;
3464 }
3465 if (drv->bdrv_get_allocated_file_size) {
3466 return drv->bdrv_get_allocated_file_size(bs);
3467 }
3468 if (bs->file) {
3469 return bdrv_get_allocated_file_size(bs->file);
3470 }
3471 return -ENOTSUP;
3472 }
3473
3474 /**
3475 * Return number of sectors on success, -errno on error.
3476 */
3477 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3478 {
3479 BlockDriver *drv = bs->drv;
3480
3481 if (!drv)
3482 return -ENOMEDIUM;
3483
3484 if (drv->has_variable_length) {
3485 int ret = refresh_total_sectors(bs, bs->total_sectors);
3486 if (ret < 0) {
3487 return ret;
3488 }
3489 }
3490 return bs->total_sectors;
3491 }
3492
3493 /**
3494 * Return length in bytes on success, -errno on error.
3495 * The length is always a multiple of BDRV_SECTOR_SIZE.
3496 */
3497 int64_t bdrv_getlength(BlockDriverState *bs)
3498 {
3499 int64_t ret = bdrv_nb_sectors(bs);
3500
3501 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3502 }
3503
3504 /* return 0 as number of sectors if no device present or error */
3505 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3506 {
3507 int64_t nb_sectors = bdrv_nb_sectors(bs);
3508
3509 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3510 }
3511
3512 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3513 BlockdevOnError on_write_error)
3514 {
3515 bs->on_read_error = on_read_error;
3516 bs->on_write_error = on_write_error;
3517 }
3518
3519 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3520 {
3521 return is_read ? bs->on_read_error : bs->on_write_error;
3522 }
3523
3524 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3525 {
3526 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3527
3528 switch (on_err) {
3529 case BLOCKDEV_ON_ERROR_ENOSPC:
3530 return (error == ENOSPC) ?
3531 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3532 case BLOCKDEV_ON_ERROR_STOP:
3533 return BLOCK_ERROR_ACTION_STOP;
3534 case BLOCKDEV_ON_ERROR_REPORT:
3535 return BLOCK_ERROR_ACTION_REPORT;
3536 case BLOCKDEV_ON_ERROR_IGNORE:
3537 return BLOCK_ERROR_ACTION_IGNORE;
3538 default:
3539 abort();
3540 }
3541 }
3542
3543 static void send_qmp_error_event(BlockDriverState *bs,
3544 BlockErrorAction action,
3545 bool is_read, int error)
3546 {
3547 IoOperationType optype;
3548
3549 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3550 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3551 bdrv_iostatus_is_enabled(bs),
3552 error == ENOSPC, strerror(error),
3553 &error_abort);
3554 }
3555
3556 /* This is done by device models because, while the block layer knows
3557 * about the error, it does not know whether an operation comes from
3558 * the device or the block layer (from a job, for example).
3559 */
3560 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3561 bool is_read, int error)
3562 {
3563 assert(error >= 0);
3564
3565 if (action == BLOCK_ERROR_ACTION_STOP) {
3566 /* First set the iostatus, so that "info block" returns an iostatus
3567 * that matches the events raised so far (an additional error iostatus
3568 * is fine, but not a lost one).
3569 */
3570 bdrv_iostatus_set_err(bs, error);
3571
3572 /* Then raise the request to stop the VM and the event.
3573 * qemu_system_vmstop_request_prepare has two effects. First,
3574 * it ensures that the STOP event always comes after the
3575 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3576 * can observe the STOP event and do a "cont" before the STOP
3577 * event is issued, the VM will not stop. In this case, vm_start()
3578 * also ensures that the STOP/RESUME pair of events is emitted.
3579 */
3580 qemu_system_vmstop_request_prepare();
3581 send_qmp_error_event(bs, action, is_read, error);
3582 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3583 } else {
3584 send_qmp_error_event(bs, action, is_read, error);
3585 }
3586 }
3587
3588 int bdrv_is_read_only(BlockDriverState *bs)
3589 {
3590 return bs->read_only;
3591 }
3592
3593 int bdrv_is_sg(BlockDriverState *bs)
3594 {
3595 return bs->sg;
3596 }
3597
3598 int bdrv_enable_write_cache(BlockDriverState *bs)
3599 {
3600 return bs->enable_write_cache;
3601 }
3602
3603 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3604 {
3605 bs->enable_write_cache = wce;
3606
3607 /* so a reopen() will preserve wce */
3608 if (wce) {
3609 bs->open_flags |= BDRV_O_CACHE_WB;
3610 } else {
3611 bs->open_flags &= ~BDRV_O_CACHE_WB;
3612 }
3613 }
3614
3615 int bdrv_is_encrypted(BlockDriverState *bs)
3616 {
3617 if (bs->backing_hd && bs->backing_hd->encrypted)
3618 return 1;
3619 return bs->encrypted;
3620 }
3621
3622 int bdrv_key_required(BlockDriverState *bs)
3623 {
3624 BlockDriverState *backing_hd = bs->backing_hd;
3625
3626 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3627 return 1;
3628 return (bs->encrypted && !bs->valid_key);
3629 }
3630
3631 int bdrv_set_key(BlockDriverState *bs, const char *key)
3632 {
3633 int ret;
3634 if (bs->backing_hd && bs->backing_hd->encrypted) {
3635 ret = bdrv_set_key(bs->backing_hd, key);
3636 if (ret < 0)
3637 return ret;
3638 if (!bs->encrypted)
3639 return 0;
3640 }
3641 if (!bs->encrypted) {
3642 return -EINVAL;
3643 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3644 return -ENOMEDIUM;
3645 }
3646 ret = bs->drv->bdrv_set_key(bs, key);
3647 if (ret < 0) {
3648 bs->valid_key = 0;
3649 } else if (!bs->valid_key) {
3650 bs->valid_key = 1;
3651 if (bs->blk) {
3652 /* call the change callback now, we skipped it on open */
3653 blk_dev_change_media_cb(bs->blk, true);
3654 }
3655 }
3656 return ret;
3657 }
3658
3659 const char *bdrv_get_format_name(BlockDriverState *bs)
3660 {
3661 return bs->drv ? bs->drv->format_name : NULL;
3662 }
3663
3664 static int qsort_strcmp(const void *a, const void *b)
3665 {
3666 return strcmp(a, b);
3667 }
3668
3669 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3670 void *opaque)
3671 {
3672 BlockDriver *drv;
3673 int count = 0;
3674 int i;
3675 const char **formats = NULL;
3676
3677 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3678 if (drv->format_name) {
3679 bool found = false;
3680 int i = count;
3681 while (formats && i && !found) {
3682 found = !strcmp(formats[--i], drv->format_name);
3683 }
3684
3685 if (!found) {
3686 formats = g_renew(const char *, formats, count + 1);
3687 formats[count++] = drv->format_name;
3688 }
3689 }
3690 }
3691
3692 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3693
3694 for (i = 0; i < count; i++) {
3695 it(opaque, formats[i]);
3696 }
3697
3698 g_free(formats);
3699 }
3700
3701 /* Find the BlockDriverState attached to the named block backend */
3702 /* TODO convert callers to blk_by_name(), then remove */
3703 BlockDriverState *bdrv_find(const char *name)
3704 {
3705 BlockBackend *blk = blk_by_name(name);
3706
3707 return blk ? blk_bs(blk) : NULL;
3708 }
3709
3710 /* Find a node by name in the BDS graph */
3711 BlockDriverState *bdrv_find_node(const char *node_name)
3712 {
3713 BlockDriverState *bs;
3714
3715 assert(node_name);
3716
3717 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3718 if (!strcmp(node_name, bs->node_name)) {
3719 return bs;
3720 }
3721 }
3722 return NULL;
3723 }
3724
3725 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3726 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3727 {
3728 BlockDeviceInfoList *list, *entry;
3729 BlockDriverState *bs;
3730
3731 list = NULL;
3732 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3733 entry = g_malloc0(sizeof(*entry));
3734 entry->value = bdrv_block_device_info(bs);
3735 entry->next = list;
3736 list = entry;
3737 }
3738
3739 return list;
3740 }
3741
3742 BlockDriverState *bdrv_lookup_bs(const char *device,
3743 const char *node_name,
3744 Error **errp)
3745 {
3746 BlockBackend *blk;
3747 BlockDriverState *bs;
3748
3749 if (device) {
3750 blk = blk_by_name(device);
3751
3752 if (blk) {
3753 return blk_bs(blk);
3754 }
3755 }
3756
3757 if (node_name) {
3758 bs = bdrv_find_node(node_name);
3759
3760 if (bs) {
3761 return bs;
3762 }
3763 }
3764
3765 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3766 device ? device : "",
3767 node_name ? node_name : "");
3768 return NULL;
3769 }
3770
3771 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3772 * return false. If either argument is NULL, return false. */
3773 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3774 {
3775 while (top && top != base) {
3776 top = top->backing_hd;
3777 }
3778
3779 return top != NULL;
3780 }
3781
3782 BlockDriverState *bdrv_next(BlockDriverState *bs)
3783 {
3784 if (!bs) {
3785 return QTAILQ_FIRST(&bdrv_states);
3786 }
3787 return QTAILQ_NEXT(bs, device_list);
3788 }
3789
3790 /* TODO check what callers really want: bs->node_name or blk_name() */
3791 const char *bdrv_get_device_name(const BlockDriverState *bs)
3792 {
3793 return bs->blk ? blk_name(bs->blk) : "";
3794 }
3795
3796 int bdrv_get_flags(BlockDriverState *bs)
3797 {
3798 return bs->open_flags;
3799 }
3800
3801 int bdrv_flush_all(void)
3802 {
3803 BlockDriverState *bs;
3804 int result = 0;
3805
3806 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3807 AioContext *aio_context = bdrv_get_aio_context(bs);
3808 int ret;
3809
3810 aio_context_acquire(aio_context);
3811 ret = bdrv_flush(bs);
3812 if (ret < 0 && !result) {
3813 result = ret;
3814 }
3815 aio_context_release(aio_context);
3816 }
3817
3818 return result;
3819 }
3820
3821 int bdrv_has_zero_init_1(BlockDriverState *bs)
3822 {
3823 return 1;
3824 }
3825
3826 int bdrv_has_zero_init(BlockDriverState *bs)
3827 {
3828 assert(bs->drv);
3829
3830 /* If BS is a copy-on-write image, it is initialized to
3831 * the contents of the base image, which may not be zeroes. */
3832 if (bs->backing_hd) {
3833 return 0;
3834 }
3835 if (bs->drv->bdrv_has_zero_init) {
3836 return bs->drv->bdrv_has_zero_init(bs);
3837 }
3838
3839 /* safe default */
3840 return 0;
3841 }
3842
3843 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3844 {
3845 BlockDriverInfo bdi;
3846
3847 if (bs->backing_hd) {
3848 return false;
3849 }
3850
3851 if (bdrv_get_info(bs, &bdi) == 0) {
3852 return bdi.unallocated_blocks_are_zero;
3853 }
3854
3855 return false;
3856 }
3857
3858 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3859 {
3860 BlockDriverInfo bdi;
3861
3862 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3863 return false;
3864 }
3865
3866 if (bdrv_get_info(bs, &bdi) == 0) {
3867 return bdi.can_write_zeroes_with_unmap;
3868 }
3869
3870 return false;
3871 }
3872
3873 typedef struct BdrvCoGetBlockStatusData {
3874 BlockDriverState *bs;
3875 BlockDriverState *base;
3876 int64_t sector_num;
3877 int nb_sectors;
3878 int *pnum;
3879 int64_t ret;
3880 bool done;
3881 } BdrvCoGetBlockStatusData;
3882
3883 /*
3884 * Returns true iff the specified sector is present in the disk image. Drivers
3885 * not implementing the functionality are assumed to not support backing files,
3886 * hence all their sectors are reported as allocated.
3887 *
3888 * If 'sector_num' is beyond the end of the disk image the return value is 0
3889 * and 'pnum' is set to 0.
3890 *
3891 * 'pnum' is set to the number of sectors (including and immediately following
3892 * the specified sector) that are known to be in the same
3893 * allocated/unallocated state.
3894 *
3895 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3896 * beyond the end of the disk image it will be clamped.
3897 */
3898 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3899 int64_t sector_num,
3900 int nb_sectors, int *pnum)
3901 {
3902 int64_t total_sectors;
3903 int64_t n;
3904 int64_t ret, ret2;
3905
3906 total_sectors = bdrv_nb_sectors(bs);
3907 if (total_sectors < 0) {
3908 return total_sectors;
3909 }
3910
3911 if (sector_num >= total_sectors) {
3912 *pnum = 0;
3913 return 0;
3914 }
3915
3916 n = total_sectors - sector_num;
3917 if (n < nb_sectors) {
3918 nb_sectors = n;
3919 }
3920
3921 if (!bs->drv->bdrv_co_get_block_status) {
3922 *pnum = nb_sectors;
3923 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3924 if (bs->drv->protocol_name) {
3925 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3926 }
3927 return ret;
3928 }
3929
3930 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3931 if (ret < 0) {
3932 *pnum = 0;
3933 return ret;
3934 }
3935
3936 if (ret & BDRV_BLOCK_RAW) {
3937 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3938 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3939 *pnum, pnum);
3940 }
3941
3942 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3943 ret |= BDRV_BLOCK_ALLOCATED;
3944 }
3945
3946 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3947 if (bdrv_unallocated_blocks_are_zero(bs)) {
3948 ret |= BDRV_BLOCK_ZERO;
3949 } else if (bs->backing_hd) {
3950 BlockDriverState *bs2 = bs->backing_hd;
3951 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
3952 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
3953 ret |= BDRV_BLOCK_ZERO;
3954 }
3955 }
3956 }
3957
3958 if (bs->file &&
3959 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3960 (ret & BDRV_BLOCK_OFFSET_VALID)) {
3961 int file_pnum;
3962
3963 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3964 *pnum, &file_pnum);
3965 if (ret2 >= 0) {
3966 /* Ignore errors: this just provides extra information; it
3967 * is useful but not required.
3968 */
3969 if (!file_pnum) {
3970 /* !file_pnum indicates an offset at or beyond the EOF; it is
3971 * perfectly valid for the format block driver to point to such
3972 * offsets, so catch it and mark everything as zero */
3973 ret |= BDRV_BLOCK_ZERO;
3974 } else {
3975 /* Limit request to the range reported by the protocol driver */
3976 *pnum = file_pnum;
3977 ret |= (ret2 & BDRV_BLOCK_ZERO);
3978 }
3979 }
3980 }
3981
3982 return ret;
3983 }
3984
3985 /* Coroutine wrapper for bdrv_get_block_status() */
3986 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3987 {
3988 BdrvCoGetBlockStatusData *data = opaque;
3989 BlockDriverState *bs = data->bs;
3990
3991 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3992 data->pnum);
3993 data->done = true;
3994 }
3995
3996 /*
3997 * Synchronous wrapper around bdrv_co_get_block_status().
3998 *
3999 * See bdrv_co_get_block_status() for details.
4000 */
4001 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4002 int nb_sectors, int *pnum)
4003 {
4004 Coroutine *co;
4005 BdrvCoGetBlockStatusData data = {
4006 .bs = bs,
4007 .sector_num = sector_num,
4008 .nb_sectors = nb_sectors,
4009 .pnum = pnum,
4010 .done = false,
4011 };
4012
4013 if (qemu_in_coroutine()) {
4014 /* Fast-path if already in coroutine context */
4015 bdrv_get_block_status_co_entry(&data);
4016 } else {
4017 AioContext *aio_context = bdrv_get_aio_context(bs);
4018
4019 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4020 qemu_coroutine_enter(co, &data);
4021 while (!data.done) {
4022 aio_poll(aio_context, true);
4023 }
4024 }
4025 return data.ret;
4026 }
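
/*
 * Example: a minimal sketch of walking an image and decoding the returned
 * BDRV_BLOCK_* flags ('bs' is assumed to be an open BlockDriverState):
 *
 *     int64_t sector = 0, total = bdrv_nb_sectors(bs);
 *
 *     while (total > 0 && sector < total) {
 *         int pnum;
 *         int64_t ret = bdrv_get_block_status(bs, sector,
 *                                             MIN(total - sector, 1 << 16),
 *                                             &pnum);
 *         if (ret < 0) {
 *             break;
 *         }
 *         if (ret & BDRV_BLOCK_ZERO) {
 *             ... sectors [sector, sector + pnum) read back as zeroes ...
 *         } else if (ret & BDRV_BLOCK_ALLOCATED) {
 *             ... sectors are allocated in this layer ...
 *         }
 *         sector += pnum;
 *     }
 */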
4027
4028 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4029 int nb_sectors, int *pnum)
4030 {
4031 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4032 if (ret < 0) {
4033 return ret;
4034 }
4035 return !!(ret & BDRV_BLOCK_ALLOCATED);
4036 }
4037
4038 /*
4039 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4040 *
4041 * Return true if the given sector is allocated in any image between
4042 * BASE (excluded) and TOP (included). BASE can be NULL to check if the
4043 * given sector is allocated in any image of the chain; return false otherwise.
4044 *
4045 * 'pnum' is set to the number of sectors (including and immediately following
4046 * the specified sector) that are known to be in the same
4047 * allocated/unallocated state.
4048 *
4049 */
4050 int bdrv_is_allocated_above(BlockDriverState *top,
4051 BlockDriverState *base,
4052 int64_t sector_num,
4053 int nb_sectors, int *pnum)
4054 {
4055 BlockDriverState *intermediate;
4056 int ret, n = nb_sectors;
4057
4058 intermediate = top;
4059 while (intermediate && intermediate != base) {
4060 int pnum_inter;
4061 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4062 &pnum_inter);
4063 if (ret < 0) {
4064 return ret;
4065 } else if (ret) {
4066 *pnum = pnum_inter;
4067 return 1;
4068 }
4069
4070 /*
4071 * [sector_num, sector_num + nb_sectors) is unallocated on top but an
4072 * intermediate image might have
4073 *
4074 * [sector_num + x, sector_num + nb_sectors) allocated.
4075 */
4076 if (n > pnum_inter &&
4077 (intermediate == top ||
4078 sector_num + pnum_inter < intermediate->total_sectors)) {
4079 n = pnum_inter;
4080 }
4081
4082 intermediate = intermediate->backing_hd;
4083 }
4084
4085 *pnum = n;
4086 return 0;
4087 }
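
/*
 * Example: a minimal sketch, assuming a chain base <- mid <- top, of testing
 * whether a range still needs to be copied when flattening top and mid into
 * base:
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
 *                                       &pnum);
 *
 *     if (ret < 0) {
 *         ... error ...
 *     } else if (ret) {
 *         ... [sector_num, sector_num + pnum) is allocated in top or mid ...
 *     } else {
 *         ... the first pnum sectors are already served by base (or zero) ...
 *     }
 */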
4088
4089 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4090 {
4091 if (bs->backing_hd && bs->backing_hd->encrypted)
4092 return bs->backing_file;
4093 else if (bs->encrypted)
4094 return bs->filename;
4095 else
4096 return NULL;
4097 }
4098
4099 void bdrv_get_backing_filename(BlockDriverState *bs,
4100 char *filename, int filename_size)
4101 {
4102 pstrcpy(filename, filename_size, bs->backing_file);
4103 }
4104
4105 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4106 const uint8_t *buf, int nb_sectors)
4107 {
4108 BlockDriver *drv = bs->drv;
4109 if (!drv)
4110 return -ENOMEDIUM;
4111 if (!drv->bdrv_write_compressed)
4112 return -ENOTSUP;
4113 if (bdrv_check_request(bs, sector_num, nb_sectors))
4114 return -EIO;
4115
4116 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4117
4118 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4119 }
4120
4121 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4122 {
4123 BlockDriver *drv = bs->drv;
4124 if (!drv)
4125 return -ENOMEDIUM;
4126 if (!drv->bdrv_get_info)
4127 return -ENOTSUP;
4128 memset(bdi, 0, sizeof(*bdi));
4129 return drv->bdrv_get_info(bs, bdi);
4130 }
4131
4132 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4133 {
4134 BlockDriver *drv = bs->drv;
4135 if (drv && drv->bdrv_get_specific_info) {
4136 return drv->bdrv_get_specific_info(bs);
4137 }
4138 return NULL;
4139 }
4140
4141 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4142 int64_t pos, int size)
4143 {
4144 QEMUIOVector qiov;
4145 struct iovec iov = {
4146 .iov_base = (void *) buf,
4147 .iov_len = size,
4148 };
4149
4150 qemu_iovec_init_external(&qiov, &iov, 1);
4151 return bdrv_writev_vmstate(bs, &qiov, pos);
4152 }
4153
4154 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4155 {
4156 BlockDriver *drv = bs->drv;
4157
4158 if (!drv) {
4159 return -ENOMEDIUM;
4160 } else if (drv->bdrv_save_vmstate) {
4161 return drv->bdrv_save_vmstate(bs, qiov, pos);
4162 } else if (bs->file) {
4163 return bdrv_writev_vmstate(bs->file, qiov, pos);
4164 }
4165
4166 return -ENOTSUP;
4167 }
4168
4169 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4170 int64_t pos, int size)
4171 {
4172 BlockDriver *drv = bs->drv;
4173 if (!drv)
4174 return -ENOMEDIUM;
4175 if (drv->bdrv_load_vmstate)
4176 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4177 if (bs->file)
4178 return bdrv_load_vmstate(bs->file, buf, pos, size);
4179 return -ENOTSUP;
4180 }
4181
4182 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4183 {
4184 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4185 return;
4186 }
4187
4188 bs->drv->bdrv_debug_event(bs, event);
4189 }
4190
4191 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4192 const char *tag)
4193 {
4194 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4195 bs = bs->file;
4196 }
4197
4198 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4199 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4200 }
4201
4202 return -ENOTSUP;
4203 }
4204
4205 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4206 {
4207 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4208 bs = bs->file;
4209 }
4210
4211 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4212 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4213 }
4214
4215 return -ENOTSUP;
4216 }
4217
4218 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4219 {
4220 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4221 bs = bs->file;
4222 }
4223
4224 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4225 return bs->drv->bdrv_debug_resume(bs, tag);
4226 }
4227
4228 return -ENOTSUP;
4229 }
4230
4231 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4232 {
4233 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4234 bs = bs->file;
4235 }
4236
4237 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4238 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4239 }
4240
4241 return false;
4242 }
4243
4244 int bdrv_is_snapshot(BlockDriverState *bs)
4245 {
4246 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4247 }
4248
4249 /* backing_file can either be relative, or absolute, or a protocol. If it is
4250 * relative, it must be relative to the chain. So, passing in bs->filename
4251 * from a BDS as backing_file should not be done, as that may be relative to
4252 * the CWD rather than the chain. */
4253 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4254 const char *backing_file)
4255 {
4256 char *filename_full = NULL;
4257 char *backing_file_full = NULL;
4258 char *filename_tmp = NULL;
4259 int is_protocol = 0;
4260 BlockDriverState *curr_bs = NULL;
4261 BlockDriverState *retval = NULL;
4262
4263 if (!bs || !bs->drv || !backing_file) {
4264 return NULL;
4265 }
4266
4267 filename_full = g_malloc(PATH_MAX);
4268 backing_file_full = g_malloc(PATH_MAX);
4269 filename_tmp = g_malloc(PATH_MAX);
4270
4271 is_protocol = path_has_protocol(backing_file);
4272
4273 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4274
4275 /* If either of the filename paths is actually a protocol, then
4276 * compare unmodified paths; otherwise make paths relative */
4277 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4278 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4279 retval = curr_bs->backing_hd;
4280 break;
4281 }
4282 } else {
4283 /* If not an absolute filename path, make it relative to the current
4284 * image's filename path */
4285 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4286 backing_file);
4287
4288 /* We are going to compare absolute pathnames */
4289 if (!realpath(filename_tmp, filename_full)) {
4290 continue;
4291 }
4292
4293 /* We need to make sure the backing filename we are comparing against
4294 * is relative to the current image filename (or absolute) */
4295 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4296 curr_bs->backing_file);
4297
4298 if (!realpath(filename_tmp, backing_file_full)) {
4299 continue;
4300 }
4301
4302 if (strcmp(backing_file_full, filename_full) == 0) {
4303 retval = curr_bs->backing_hd;
4304 break;
4305 }
4306 }
4307 }
4308
4309 g_free(filename_full);
4310 g_free(backing_file_full);
4311 g_free(filename_tmp);
4312 return retval;
4313 }
4314
4315 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4316 {
4317 if (!bs->drv) {
4318 return 0;
4319 }
4320
4321 if (!bs->backing_hd) {
4322 return 0;
4323 }
4324
4325 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4326 }
4327
4328 /**************************************************************/
4329 /* async I/Os */
4330
4331 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4332 QEMUIOVector *qiov, int nb_sectors,
4333 BlockCompletionFunc *cb, void *opaque)
4334 {
4335 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4336
4337 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4338 cb, opaque, false);
4339 }
4340
4341 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4342 QEMUIOVector *qiov, int nb_sectors,
4343 BlockCompletionFunc *cb, void *opaque)
4344 {
4345 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4346
4347 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4348 cb, opaque, true);
4349 }
4350
4351 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4352 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4353 BlockCompletionFunc *cb, void *opaque)
4354 {
4355 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4356
4357 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4358 BDRV_REQ_ZERO_WRITE | flags,
4359 cb, opaque, true);
4360 }
4361
4362
4363 typedef struct MultiwriteCB {
4364 int error;
4365 int num_requests;
4366 int num_callbacks;
4367 struct {
4368 BlockCompletionFunc *cb;
4369 void *opaque;
4370 QEMUIOVector *free_qiov;
4371 } callbacks[];
4372 } MultiwriteCB;
4373
4374 static void multiwrite_user_cb(MultiwriteCB *mcb)
4375 {
4376 int i;
4377
4378 for (i = 0; i < mcb->num_callbacks; i++) {
4379 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4380 if (mcb->callbacks[i].free_qiov) {
4381 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4382 }
4383 g_free(mcb->callbacks[i].free_qiov);
4384 }
4385 }
4386
4387 static void multiwrite_cb(void *opaque, int ret)
4388 {
4389 MultiwriteCB *mcb = opaque;
4390
4391 trace_multiwrite_cb(mcb, ret);
4392
4393 if (ret < 0 && !mcb->error) {
4394 mcb->error = ret;
4395 }
4396
4397 mcb->num_requests--;
4398 if (mcb->num_requests == 0) {
4399 multiwrite_user_cb(mcb);
4400 g_free(mcb);
4401 }
4402 }
4403
4404 static int multiwrite_req_compare(const void *a, const void *b)
4405 {
4406 const BlockRequest *req1 = a, *req2 = b;
4407
4408 /*
4409 * Note that we can't simply subtract req2->sector from req1->sector
4410 * here as that could overflow the return value.
4411 */
4412 if (req1->sector > req2->sector) {
4413 return 1;
4414 } else if (req1->sector < req2->sector) {
4415 return -1;
4416 } else {
4417 return 0;
4418 }
4419 }
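
/*
 * For example, with req1->sector == (1LL << 31) and req2->sector == 0 the
 * difference is 2^31, which does not fit in the int return value and would
 * typically wrap to a negative number, inverting the comparison result. The
 * explicit three-way comparison above avoids this.
 */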
4420
4421 /*
4422 * Takes a bunch of requests and tries to merge them. Returns the number of
4423 * requests that remain after merging.
4424 */
4425 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4426 int num_reqs, MultiwriteCB *mcb)
4427 {
4428 int i, outidx;
4429
4430 // Sort requests by start sector
4431 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4432
4433 // Check if adjacent requests overlap or are exactly sequential. If so,
4434 // combine them (no gaps can occur, so no zero padding is needed).
4435 outidx = 0;
4436 for (i = 1; i < num_reqs; i++) {
4437 int merge = 0;
4438 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4439
4440 // Handle exactly sequential writes and overlapping writes.
4441 if (reqs[i].sector <= oldreq_last) {
4442 merge = 1;
4443 }
4444
4445 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4446 merge = 0;
4447 }
4448
4449 if (merge) {
4450 size_t size;
4451 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4452 qemu_iovec_init(qiov,
4453 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4454
4455 // Add the first request to the merged one. If the requests are
4456 // overlapping, drop the last sectors of the first request.
4457 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4458 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4459
4460 // We shouldn't need to add any zeros between the two requests
4461 assert(reqs[i].sector <= oldreq_last);
4462
4463 // Add the second request
4464 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4465
4466 // Add tail of first request, if necessary
4467 if (qiov->size < reqs[outidx].qiov->size) {
4468 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4469 reqs[outidx].qiov->size - qiov->size);
4470 }
4471
4472 reqs[outidx].nb_sectors = qiov->size >> 9;
4473 reqs[outidx].qiov = qiov;
4474
4475 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4476 } else {
4477 outidx++;
4478 reqs[outidx].sector = reqs[i].sector;
4479 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4480 reqs[outidx].qiov = reqs[i].qiov;
4481 }
4482 }
4483
4484 return outidx + 1;
4485 }
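
/*
 * Worked example of the merge rules above, assuming 512-byte sectors:
 * request A = { sector 0, 8 sectors } and B = { sector 4, 8 sectors }.
 * After sorting, B starts at or before the end of A (4 <= 8), so they merge
 * into one request covering sectors 0..11: the first 4 sectors (2048 bytes)
 * of A's qiov, then all of B's qiov. A's tail is not re-added because B
 * overwrites it, and qiov->size >> 9 yields the merged nb_sectors of 12.
 */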
4486
4487 /*
4488 * Submit multiple AIO write requests at once.
4489 *
4490 * On success, the function returns 0 and all requests in the reqs array have
4491 * been submitted. In the error case, this function returns -1, and any of the
4492 * requests may or may not have been submitted yet. In particular, this means
4493 * that the callback will be called for some of the requests and not for
4494 * others. The caller must check the error field of the BlockRequest to wait
4495 * for the right callbacks (if error != 0, no callback will be called).
4496 *
4497 * The implementation may modify the contents of the reqs array, e.g. to merge
4498 * requests. However, the fields opaque and error are left unmodified as they
4499 * are used to signal failure for a single request to the caller.
4500 */
4501 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4502 {
4503 MultiwriteCB *mcb;
4504 int i;
4505
4506 /* don't submit writes if we don't have a medium */
4507 if (bs->drv == NULL) {
4508 for (i = 0; i < num_reqs; i++) {
4509 reqs[i].error = -ENOMEDIUM;
4510 }
4511 return -1;
4512 }
4513
4514 if (num_reqs == 0) {
4515 return 0;
4516 }
4517
4518 // Create MultiwriteCB structure
4519 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4520 mcb->num_requests = 0;
4521 mcb->num_callbacks = num_reqs;
4522
4523 for (i = 0; i < num_reqs; i++) {
4524 mcb->callbacks[i].cb = reqs[i].cb;
4525 mcb->callbacks[i].opaque = reqs[i].opaque;
4526 }
4527
4528 // Check for mergeable requests
4529 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4530
4531 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4532
4533 /* Run the aio requests. */
4534 mcb->num_requests = num_reqs;
4535 for (i = 0; i < num_reqs; i++) {
4536 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4537 reqs[i].nb_sectors, reqs[i].flags,
4538 multiwrite_cb, mcb,
4539 true);
4540 }
4541
4542 return 0;
4543 }
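
/*
 * Example: a minimal sketch of submitting two writes in one batch; the
 * callback 'my_write_cb' and the vectors 'qiov0'/'qiov1' are assumed to be
 * provided by the caller:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = qiov0,
 *           .cb = my_write_cb, .opaque = ctx0 },
 *         { .sector = 16, .nb_sectors = 8, .qiov = qiov1,
 *           .cb = my_write_cb, .opaque = ctx1 },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... check reqs[i].error to see which callbacks will still run ...
 *     }
 */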
4544
4545 void bdrv_aio_cancel(BlockAIOCB *acb)
4546 {
4547 qemu_aio_ref(acb);
4548 bdrv_aio_cancel_async(acb);
4549 while (acb->refcnt > 1) {
4550 if (acb->aiocb_info->get_aio_context) {
4551 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4552 } else if (acb->bs) {
4553 aio_poll(bdrv_get_aio_context(acb->bs), true);
4554 } else {
4555 abort();
4556 }
4557 }
4558 qemu_aio_unref(acb);
4559 }
4560
4561 /* Async version of aio cancel. The caller is not blocked if the acb implements
4562 * cancel_async; otherwise we do nothing and let the request complete normally.
4563 * In either case the completion callback must be called. */
4564 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4565 {
4566 if (acb->aiocb_info->cancel_async) {
4567 acb->aiocb_info->cancel_async(acb);
4568 }
4569 }
4570
4571 /**************************************************************/
4572 /* async block device emulation */
4573
4574 typedef struct BlockAIOCBSync {
4575 BlockAIOCB common;
4576 QEMUBH *bh;
4577 int ret;
4578 /* vector translation state */
4579 QEMUIOVector *qiov;
4580 uint8_t *bounce;
4581 int is_write;
4582 } BlockAIOCBSync;
4583
4584 static const AIOCBInfo bdrv_em_aiocb_info = {
4585 .aiocb_size = sizeof(BlockAIOCBSync),
4586 };
4587
4588 static void bdrv_aio_bh_cb(void *opaque)
4589 {
4590 BlockAIOCBSync *acb = opaque;
4591
4592 if (!acb->is_write && acb->ret >= 0) {
4593 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4594 }
4595 qemu_vfree(acb->bounce);
4596 acb->common.cb(acb->common.opaque, acb->ret);
4597 qemu_bh_delete(acb->bh);
4598 acb->bh = NULL;
4599 qemu_aio_unref(acb);
4600 }
4601
4602 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4603 int64_t sector_num,
4604 QEMUIOVector *qiov,
4605 int nb_sectors,
4606 BlockCompletionFunc *cb,
4607 void *opaque,
4608 int is_write)
4609
4610 {
4611 BlockAIOCBSync *acb;
4612
4613 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4614 acb->is_write = is_write;
4615 acb->qiov = qiov;
4616 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4617 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4618
4619 if (acb->bounce == NULL) {
4620 acb->ret = -ENOMEM;
4621 } else if (is_write) {
4622 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4623 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4624 } else {
4625 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4626 }
4627
4628 qemu_bh_schedule(acb->bh);
4629
4630 return &acb->common;
4631 }
4632
4633 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4634 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4635 BlockCompletionFunc *cb, void *opaque)
4636 {
4637 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4638 }
4639
4640 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4641 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4642 BlockCompletionFunc *cb, void *opaque)
4643 {
4644 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4645 }
4646
4647
4648 typedef struct BlockAIOCBCoroutine {
4649 BlockAIOCB common;
4650 BlockRequest req;
4651 bool is_write;
4652 bool *done;
4653 QEMUBH *bh;
4654 } BlockAIOCBCoroutine;
4655
4656 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4657 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4658 };
4659
4660 static void bdrv_co_em_bh(void *opaque)
4661 {
4662 BlockAIOCBCoroutine *acb = opaque;
4663
4664 acb->common.cb(acb->common.opaque, acb->req.error);
4665
4666 qemu_bh_delete(acb->bh);
4667 qemu_aio_unref(acb);
4668 }
4669
4670 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4671 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4672 {
4673 BlockAIOCBCoroutine *acb = opaque;
4674 BlockDriverState *bs = acb->common.bs;
4675
4676 if (!acb->is_write) {
4677 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4678 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4679 } else {
4680 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4681 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4682 }
4683
4684 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4685 qemu_bh_schedule(acb->bh);
4686 }
4687
4688 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4689 int64_t sector_num,
4690 QEMUIOVector *qiov,
4691 int nb_sectors,
4692 BdrvRequestFlags flags,
4693 BlockCompletionFunc *cb,
4694 void *opaque,
4695 bool is_write)
4696 {
4697 Coroutine *co;
4698 BlockAIOCBCoroutine *acb;
4699
4700 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4701 acb->req.sector = sector_num;
4702 acb->req.nb_sectors = nb_sectors;
4703 acb->req.qiov = qiov;
4704 acb->req.flags = flags;
4705 acb->is_write = is_write;
4706
4707 co = qemu_coroutine_create(bdrv_co_do_rw);
4708 qemu_coroutine_enter(co, acb);
4709
4710 return &acb->common;
4711 }
4712
4713 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4714 {
4715 BlockAIOCBCoroutine *acb = opaque;
4716 BlockDriverState *bs = acb->common.bs;
4717
4718 acb->req.error = bdrv_co_flush(bs);
4719 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4720 qemu_bh_schedule(acb->bh);
4721 }
4722
4723 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4724 BlockCompletionFunc *cb, void *opaque)
4725 {
4726 trace_bdrv_aio_flush(bs, opaque);
4727
4728 Coroutine *co;
4729 BlockAIOCBCoroutine *acb;
4730
4731 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4732
4733 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4734 qemu_coroutine_enter(co, acb);
4735
4736 return &acb->common;
4737 }
4738
4739 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4740 {
4741 BlockAIOCBCoroutine *acb = opaque;
4742 BlockDriverState *bs = acb->common.bs;
4743
4744 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4745 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4746 qemu_bh_schedule(acb->bh);
4747 }
4748
4749 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4750 int64_t sector_num, int nb_sectors,
4751 BlockCompletionFunc *cb, void *opaque)
4752 {
4753 Coroutine *co;
4754 BlockAIOCBCoroutine *acb;
4755
4756 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4757
4758 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4759 acb->req.sector = sector_num;
4760 acb->req.nb_sectors = nb_sectors;
4761 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4762 qemu_coroutine_enter(co, acb);
4763
4764 return &acb->common;
4765 }
4766
4767 void bdrv_init(void)
4768 {
4769 module_call_init(MODULE_INIT_BLOCK);
4770 }
4771
4772 void bdrv_init_with_whitelist(void)
4773 {
4774 use_bdrv_whitelist = 1;
4775 bdrv_init();
4776 }
4777
4778 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4779 BlockCompletionFunc *cb, void *opaque)
4780 {
4781 BlockAIOCB *acb;
4782
4783 acb = g_slice_alloc(aiocb_info->aiocb_size);
4784 acb->aiocb_info = aiocb_info;
4785 acb->bs = bs;
4786 acb->cb = cb;
4787 acb->opaque = opaque;
4788 acb->refcnt = 1;
4789 return acb;
4790 }
4791
4792 void qemu_aio_ref(void *p)
4793 {
4794 BlockAIOCB *acb = p;
4795 acb->refcnt++;
4796 }
4797
4798 void qemu_aio_unref(void *p)
4799 {
4800 BlockAIOCB *acb = p;
4801 assert(acb->refcnt > 0);
4802 if (--acb->refcnt == 0) {
4803 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4804 }
4805 }
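
/*
 * Example: a minimal sketch of the AIOCB lifecycle for a driver-defined
 * callback type ('MyAIOCB' is hypothetical):
 *
 *     typedef struct MyAIOCB {
 *         BlockAIOCB common;      // must be the first member
 *         int ret;
 *     } MyAIOCB;
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 *     // ... start the request; when it completes:
 *     acb->common.cb(acb->common.opaque, acb->ret);
 *     qemu_aio_unref(acb);        // drops the initial reference
 */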
4806
4807 /**************************************************************/
4808 /* Coroutine block device emulation */
4809
4810 typedef struct CoroutineIOCompletion {
4811 Coroutine *coroutine;
4812 int ret;
4813 } CoroutineIOCompletion;
4814
4815 static void bdrv_co_io_em_complete(void *opaque, int ret)
4816 {
4817 CoroutineIOCompletion *co = opaque;
4818
4819 co->ret = ret;
4820 qemu_coroutine_enter(co->coroutine, NULL);
4821 }
4822
4823 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4824 int nb_sectors, QEMUIOVector *iov,
4825 bool is_write)
4826 {
4827 CoroutineIOCompletion co = {
4828 .coroutine = qemu_coroutine_self(),
4829 };
4830 BlockAIOCB *acb;
4831
4832 if (is_write) {
4833 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4834 bdrv_co_io_em_complete, &co);
4835 } else {
4836 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4837 bdrv_co_io_em_complete, &co);
4838 }
4839
4840 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4841 if (!acb) {
4842 return -EIO;
4843 }
4844 qemu_coroutine_yield();
4845
4846 return co.ret;
4847 }
4848
4849 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4850 int64_t sector_num, int nb_sectors,
4851 QEMUIOVector *iov)
4852 {
4853 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4854 }
4855
4856 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4857 int64_t sector_num, int nb_sectors,
4858 QEMUIOVector *iov)
4859 {
4860 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4861 }
4862
4863 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4864 {
4865 RwCo *rwco = opaque;
4866
4867 rwco->ret = bdrv_co_flush(rwco->bs);
4868 }
4869
4870 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4871 {
4872 int ret;
4873
4874 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4875 return 0;
4876 }
4877
4878 /* Write back cached data to the OS even with cache=unsafe */
4879 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4880 if (bs->drv->bdrv_co_flush_to_os) {
4881 ret = bs->drv->bdrv_co_flush_to_os(bs);
4882 if (ret < 0) {
4883 return ret;
4884 }
4885 }
4886
4887 /* But don't actually force it to the disk with cache=unsafe */
4888 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4889 goto flush_parent;
4890 }
4891
4892 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4893 if (bs->drv->bdrv_co_flush_to_disk) {
4894 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4895 } else if (bs->drv->bdrv_aio_flush) {
4896 BlockAIOCB *acb;
4897 CoroutineIOCompletion co = {
4898 .coroutine = qemu_coroutine_self(),
4899 };
4900
4901 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4902 if (acb == NULL) {
4903 ret = -EIO;
4904 } else {
4905 qemu_coroutine_yield();
4906 ret = co.ret;
4907 }
4908 } else {
4909 /*
4910 * Some block drivers always operate in either writethrough or unsafe
4911 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4912 * know how the server works (because the behaviour is hardcoded or
4913 * depends on server-side configuration), so we can't ensure that
4914 * everything is safe on disk. Returning an error doesn't work because
4915 * that would break guests even if the server operates in writethrough
4916 * mode.
4917 *
4918 * Let's hope the user knows what they're doing.
4919 */
4920 ret = 0;
4921 }
4922 if (ret < 0) {
4923 return ret;
4924 }
4925
4926 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4927 * set in the case of cache=unsafe, so there are no useless flushes.
4928 */
4929 flush_parent:
4930 return bdrv_co_flush(bs->file);
4931 }
4932
4933 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4934 {
4935 Error *local_err = NULL;
4936 int ret;
4937
4938 if (!bs->drv) {
4939 return;
4940 }
4941
4942 if (!(bs->open_flags & BDRV_O_INCOMING)) {
4943 return;
4944 }
4945 bs->open_flags &= ~BDRV_O_INCOMING;
4946
4947 if (bs->drv->bdrv_invalidate_cache) {
4948 bs->drv->bdrv_invalidate_cache(bs, &local_err);
4949 } else if (bs->file) {
4950 bdrv_invalidate_cache(bs->file, &local_err);
4951 }
4952 if (local_err) {
4953 error_propagate(errp, local_err);
4954 return;
4955 }
4956
4957 ret = refresh_total_sectors(bs, bs->total_sectors);
4958 if (ret < 0) {
4959 error_setg_errno(errp, -ret, "Could not refresh total sector count");
4960 return;
4961 }
4962 }
4963
4964 void bdrv_invalidate_cache_all(Error **errp)
4965 {
4966 BlockDriverState *bs;
4967 Error *local_err = NULL;
4968
4969 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4970 AioContext *aio_context = bdrv_get_aio_context(bs);
4971
4972 aio_context_acquire(aio_context);
4973 bdrv_invalidate_cache(bs, &local_err);
4974 aio_context_release(aio_context);
4975 if (local_err) {
4976 error_propagate(errp, local_err);
4977 return;
4978 }
4979 }
4980 }
4981
4982 int bdrv_flush(BlockDriverState *bs)
4983 {
4984 Coroutine *co;
4985 RwCo rwco = {
4986 .bs = bs,
4987 .ret = NOT_DONE,
4988 };
4989
4990 if (qemu_in_coroutine()) {
4991 /* Fast-path if already in coroutine context */
4992 bdrv_flush_co_entry(&rwco);
4993 } else {
4994 AioContext *aio_context = bdrv_get_aio_context(bs);
4995
4996 co = qemu_coroutine_create(bdrv_flush_co_entry);
4997 qemu_coroutine_enter(co, &rwco);
4998 while (rwco.ret == NOT_DONE) {
4999 aio_poll(aio_context, true);
5000 }
5001 }
5002
5003 return rwco.ret;
5004 }
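
/*
 * Note: bdrv_flush() uses the same synchronous-wrapper pattern as
 * bdrv_get_block_status() above and bdrv_discard() below: when already in
 * coroutine context the entry function runs directly; otherwise a coroutine
 * is spawned and the BlockDriverState's AioContext is driven with aio_poll()
 * until the coroutine signals completion (the NOT_DONE sentinel here, a
 * 'done' flag elsewhere).
 */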
5005
5006 typedef struct DiscardCo {
5007 BlockDriverState *bs;
5008 int64_t sector_num;
5009 int nb_sectors;
5010 int ret;
5011 } DiscardCo;

5012 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5013 {
5014 DiscardCo *rwco = opaque;
5015
5016 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5017 }
5018
5019 /* If no limit is specified in the BlockLimits, use a default
5020 * of 32768 512-byte sectors (16 MiB) per request.
5021 */
5022 #define MAX_DISCARD_DEFAULT 32768
5023
5024 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5025 int nb_sectors)
5026 {
5027 int max_discard;
5028
5029 if (!bs->drv) {
5030 return -ENOMEDIUM;
5031 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5032 return -EIO;
5033 } else if (bs->read_only) {
5034 return -EROFS;
5035 }
5036
5037 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5038
5039 /* Do nothing if disabled. */
5040 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5041 return 0;
5042 }
5043
5044 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5045 return 0;
5046 }
5047
5048 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5049 while (nb_sectors > 0) {
5050 int ret;
5051 int num = nb_sectors;
5052
5053 /* align request */
5054 if (bs->bl.discard_alignment &&
5055 num >= bs->bl.discard_alignment &&
5056 sector_num % bs->bl.discard_alignment) {
5057 if (num > bs->bl.discard_alignment) {
5058 num = bs->bl.discard_alignment;
5059 }
5060 num -= sector_num % bs->bl.discard_alignment;
5061 }
5062
5063 /* limit request size */
5064 if (num > max_discard) {
5065 num = max_discard;
5066 }
5067
5068 if (bs->drv->bdrv_co_discard) {
5069 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5070 } else {
5071 BlockAIOCB *acb;
5072 CoroutineIOCompletion co = {
5073 .coroutine = qemu_coroutine_self(),
5074 };
5075
5076 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5077 bdrv_co_io_em_complete, &co);
5078 if (acb == NULL) {
5079 return -EIO;
5080 } else {
5081 qemu_coroutine_yield();
5082 ret = co.ret;
5083 }
5084 }
5085 if (ret && ret != -ENOTSUP) {
5086 return ret;
5087 }
5088
5089 sector_num += num;
5090 nb_sectors -= num;
5091 }
5092 return 0;
5093 }
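
/*
 * Worked example of the alignment step above: with
 * bs->bl.discard_alignment == 8, sector_num == 5 and nb_sectors == 20, the
 * first iteration trims num to 8 and then subtracts 5 % 8, so only sectors
 * 5..7 (num == 3) are discarded. The next iteration starts at sector 8,
 * which is aligned, and proceeds in chunks of at most max_discard sectors.
 */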
5094
5095 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5096 {
5097 Coroutine *co;
5098 DiscardCo rwco = {
5099 .bs = bs,
5100 .sector_num = sector_num,
5101 .nb_sectors = nb_sectors,
5102 .ret = NOT_DONE,
5103 };
5104
5105 if (qemu_in_coroutine()) {
5106 /* Fast-path if already in coroutine context */
5107 bdrv_discard_co_entry(&rwco);
5108 } else {
5109 AioContext *aio_context = bdrv_get_aio_context(bs);
5110
5111 co = qemu_coroutine_create(bdrv_discard_co_entry);
5112 qemu_coroutine_enter(co, &rwco);
5113 while (rwco.ret == NOT_DONE) {
5114 aio_poll(aio_context, true);
5115 }
5116 }
5117
5118 return rwco.ret;
5119 }
5120
5121 /**************************************************************/
5122 /* removable device support */
5123
5124 /**
5125 * Return TRUE if the media is present
5126 */
5127 int bdrv_is_inserted(BlockDriverState *bs)
5128 {
5129 BlockDriver *drv = bs->drv;
5130
5131 if (!drv)
5132 return 0;
5133 if (!drv->bdrv_is_inserted)
5134 return 1;
5135 return drv->bdrv_is_inserted(bs);
5136 }
5137
5138 /**
5139 * Return whether the media changed since the last call to this
5140 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5141 */
5142 int bdrv_media_changed(BlockDriverState *bs)
5143 {
5144 BlockDriver *drv = bs->drv;
5145
5146 if (drv && drv->bdrv_media_changed) {
5147 return drv->bdrv_media_changed(bs);
5148 }
5149 return -ENOTSUP;
5150 }
5151
5152 /**
5153 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5154 */
5155 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5156 {
5157 BlockDriver *drv = bs->drv;
5158 const char *device_name;
5159
5160 if (drv && drv->bdrv_eject) {
5161 drv->bdrv_eject(bs, eject_flag);
5162 }
5163
5164 device_name = bdrv_get_device_name(bs);
5165 if (device_name[0] != '\0') {
5166 qapi_event_send_device_tray_moved(device_name,
5167 eject_flag, &error_abort);
5168 }
5169 }
5170
5171 /**
5172 * Lock or unlock the media (if it is locked, the user won't be able
5173 * to eject it manually).
5174 */
5175 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5176 {
5177 BlockDriver *drv = bs->drv;
5178
5179 trace_bdrv_lock_medium(bs, locked);
5180
5181 if (drv && drv->bdrv_lock_medium) {
5182 drv->bdrv_lock_medium(bs, locked);
5183 }
5184 }
5185
5186 /* needed for generic scsi interface */
5187
5188 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5189 {
5190 BlockDriver *drv = bs->drv;
5191
5192 if (drv && drv->bdrv_ioctl)
5193 return drv->bdrv_ioctl(bs, req, buf);
5194 return -ENOTSUP;
5195 }
5196
5197 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5198 unsigned long int req, void *buf,
5199 BlockCompletionFunc *cb, void *opaque)
5200 {
5201 BlockDriver *drv = bs->drv;
5202
5203 if (drv && drv->bdrv_aio_ioctl)
5204 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5205 return NULL;
5206 }
5207
5208 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5209 {
5210 bs->guest_block_size = align;
5211 }
5212
5213 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5214 {
5215 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5216 }
5217
5218 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5219 {
5220 return memset(qemu_blockalign(bs, size), 0, size);
5221 }
5222
5223 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5224 {
5225 size_t align = bdrv_opt_mem_align(bs);
5226
5227 /* Ensure that NULL is never returned on success */
5228 assert(align > 0);
5229 if (size == 0) {
5230 size = align;
5231 }
5232
5233 return qemu_try_memalign(align, size);
5234 }
5235
5236 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5237 {
5238 void *mem = qemu_try_blockalign(bs, size);
5239
5240 if (mem) {
5241 memset(mem, 0, size);
5242 }
5243
5244 return mem;
5245 }
5246
5247 /*
5248 * Check if all memory in this vector is aligned as required by
5249 * bdrv_opt_mem_align().
5250 */
5250 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5251 {
5252 int i;
5253 size_t alignment = bdrv_opt_mem_align(bs);
5254
5255 for (i = 0; i < qiov->niov; i++) {
5256 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5257 return false;
5258 }
5259 if (qiov->iov[i].iov_len % alignment) {
5260 return false;
5261 }
5262 }
5263
5264 return true;
5265 }
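
/*
 * Example: a minimal sketch of building a vector that passes this check,
 * assuming the required alignment divides 4096 (true for the common 512-
 * and 4096-byte alignments):
 *
 *     void *buf = qemu_blockalign(bs, 4096);
 *     struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
 *     QEMUIOVector qiov;
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *     // ... issue I/O ...
 *     qemu_vfree(buf);
 */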
5266
5267 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5268 Error **errp)
5269 {
5270 int64_t bitmap_size;
5271 BdrvDirtyBitmap *bitmap;
5272
5273 assert((granularity & (granularity - 1)) == 0);
5274
5275 granularity >>= BDRV_SECTOR_BITS;
5276 assert(granularity);
5277 bitmap_size = bdrv_nb_sectors(bs);
5278 if (bitmap_size < 0) {
5279 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5280 errno = -bitmap_size;
5281 return NULL;
5282 }
5283 bitmap = g_new0(BdrvDirtyBitmap, 1);
5284 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5285 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5286 return bitmap;
5287 }
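
/*
 * Example: a minimal sketch of tracking writes with a 64 KiB granularity;
 * the granularity is given in bytes and must be a power of two of at least
 * BDRV_SECTOR_SIZE:
 *
 *     Error *err = NULL;
 *     BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, &err);
 *
 *     if (!bitmap) {
 *         ... report err ...
 *     }
 *     // writes mark sectors through bdrv_set_dirty(); later:
 *     if (bdrv_get_dirty(bs, bitmap, sector)) {
 *         ... this sector changed since the bitmap was created ...
 *     }
 *     bdrv_release_dirty_bitmap(bs, bitmap);
 */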
5288
5289 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5290 {
5291 BdrvDirtyBitmap *bm, *next;
5292 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5293 if (bm == bitmap) {
5294 QLIST_REMOVE(bitmap, list);
5295 hbitmap_free(bitmap->bitmap);
5296 g_free(bitmap);
5297 return;
5298 }
5299 }
5300 }
5301
5302 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5303 {
5304 BdrvDirtyBitmap *bm;
5305 BlockDirtyInfoList *list = NULL;
5306 BlockDirtyInfoList **plist = &list;
5307
5308 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5309 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5310 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5311 info->count = bdrv_get_dirty_count(bs, bm);
5312 info->granularity =
5313 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5314 entry->value = info;
5315 *plist = entry;
5316 plist = &entry->next;
5317 }
5318
5319 return list;
5320 }
5321
5322 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5323 {
5324 if (bitmap) {
5325 return hbitmap_get(bitmap->bitmap, sector);
5326 } else {
5327 return 0;
5328 }
5329 }
5330
5331 void bdrv_dirty_iter_init(BlockDriverState *bs,
5332 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5333 {
5334 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5335 }
5336
5337 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5338 int nr_sectors)
5339 {
5340 BdrvDirtyBitmap *bitmap;
5341 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5342 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5343 }
5344 }
5345
5346 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5347 {
5348 BdrvDirtyBitmap *bitmap;
5349 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5350 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5351 }
5352 }
5353
5354 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5355 {
5356 return hbitmap_count(bitmap->bitmap);
5357 }
5358
5359 /* Get a reference to bs */
5360 void bdrv_ref(BlockDriverState *bs)
5361 {
5362 bs->refcnt++;
5363 }
5364
5365 /* Release a previously grabbed reference to bs.
5366 * If, after releasing, the reference count drops to zero, the
5367 * BlockDriverState is deleted. */
5368 void bdrv_unref(BlockDriverState *bs)
5369 {
5370 if (!bs) {
5371 return;
5372 }
5373 assert(bs->refcnt > 0);
5374 if (--bs->refcnt == 0) {
5375 bdrv_delete(bs);
5376 }
5377 }
5378
5379 struct BdrvOpBlocker {
5380 Error *reason;
5381 QLIST_ENTRY(BdrvOpBlocker) list;
5382 };
5383
5384 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5385 {
5386 BdrvOpBlocker *blocker;
5387 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5388 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5389 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5390 if (errp) {
5391 error_setg(errp, "Device '%s' is busy: %s",
5392 bdrv_get_device_name(bs),
5393 error_get_pretty(blocker->reason));
5394 }
5395 return true;
5396 }
5397 return false;
5398 }
5399
5400 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5401 {
5402 BdrvOpBlocker *blocker;
5403 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5404
5405 blocker = g_new0(BdrvOpBlocker, 1);
5406 blocker->reason = reason;
5407 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5408 }
5409
5410 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5411 {
5412 BdrvOpBlocker *blocker, *next;
5413 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5414 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5415 if (blocker->reason == reason) {
5416 QLIST_REMOVE(blocker, list);
5417 g_free(blocker);
5418 }
5419 }
5420 }
5421
5422 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5423 {
5424 int i;
5425 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5426 bdrv_op_block(bs, i, reason);
5427 }
5428 }
5429
5430 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5431 {
5432 int i;
5433 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5434 bdrv_op_unblock(bs, i, reason);
5435 }
5436 }
5437
5438 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5439 {
5440 int i;
5441
5442 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5443 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5444 return false;
5445 }
5446 }
5447 return true;
5448 }
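
/*
 * Example: a minimal sketch of how a long-running job can fence off
 * conflicting operations with the blockers above; the same Error pointer
 * acts as the key for unblocking:
 *
 *     Error *blocker = NULL;
 *
 *     error_setg(&blocker, "block device is in use by my job");
 *     bdrv_op_block_all(bs, blocker);
 *     // elsewhere, before starting a conflicting operation:
 *     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_REPLACE, errp)) {
 *         ... refuse the operation ...
 *     }
 *     // when the job finishes:
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */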
5449
5450 void bdrv_iostatus_enable(BlockDriverState *bs)
5451 {
5452 bs->iostatus_enabled = true;
5453 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5454 }
5455
5456 /* The I/O status is only enabled if the drive explicitly
5457 * enables it _and_ the VM is configured to stop on errors */
5458 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5459 {
5460 return (bs->iostatus_enabled &&
5461 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5462 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5463 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5464 }
5465
5466 void bdrv_iostatus_disable(BlockDriverState *bs)
5467 {
5468 bs->iostatus_enabled = false;
5469 }
5470
5471 void bdrv_iostatus_reset(BlockDriverState *bs)
5472 {
5473 if (bdrv_iostatus_is_enabled(bs)) {
5474 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5475 if (bs->job) {
5476 block_job_iostatus_reset(bs->job);
5477 }
5478 }
5479 }
5480
5481 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5482 {
5483 assert(bdrv_iostatus_is_enabled(bs));
5484 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5485 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5486 BLOCK_DEVICE_IO_STATUS_FAILED;
5487 }
5488 }
5489
5490 void bdrv_img_create(const char *filename, const char *fmt,
5491 const char *base_filename, const char *base_fmt,
5492 char *options, uint64_t img_size, int flags,
5493 Error **errp, bool quiet)
5494 {
5495 QemuOptsList *create_opts = NULL;
5496 QemuOpts *opts = NULL;
5497 const char *backing_fmt, *backing_file;
5498 int64_t size;
5499 BlockDriver *drv, *proto_drv;
5500 BlockDriver *backing_drv = NULL;
5501 Error *local_err = NULL;
5502 int ret = 0;
5503
5504 /* Find driver and parse its options */
5505 drv = bdrv_find_format(fmt);
5506 if (!drv) {
5507 error_setg(errp, "Unknown file format '%s'", fmt);
5508 return;
5509 }
5510
5511 proto_drv = bdrv_find_protocol(filename, true);
5512 if (!proto_drv) {
5513 error_setg(errp, "Unknown protocol '%s'", filename);
5514 return;
5515 }
5516
5517 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5518 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5519
5520 /* Create parameter list with default values */
5521 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5522 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5523
5524 /* Parse -o options */
5525 if (options) {
5526 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5527 error_setg(errp, "Invalid options for file format '%s'", fmt);
5528 goto out;
5529 }
5530 }
5531
5532 if (base_filename) {
5533 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5534 error_setg(errp, "Backing file not supported for file format '%s'",
5535 fmt);
5536 goto out;
5537 }
5538 }
5539
5540 if (base_fmt) {
5541 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5542 error_setg(errp, "Backing file format not supported for file "
5543 "format '%s'", fmt);
5544 goto out;
5545 }
5546 }
5547
5548 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5549 if (backing_file) {
5550 if (!strcmp(filename, backing_file)) {
5551 error_setg(errp, "Error: Trying to create an image with the "
5552 "same filename as the backing file");
5553 goto out;
5554 }
5555 }
5556
5557 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5558 if (backing_fmt) {
5559 backing_drv = bdrv_find_format(backing_fmt);
5560 if (!backing_drv) {
5561 error_setg(errp, "Unknown backing file format '%s'",
5562 backing_fmt);
5563 goto out;
5564 }
5565 }
5566
5567 // The size for the image must always be specified, with one exception:
5568 // If we are using a backing file, we can obtain the size from there
5569 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5570 if (size == -1) {
5571 if (backing_file) {
5572 BlockDriverState *bs;
5573 int64_t size;
5574 int back_flags;
5575
5576 /* backing files always opened read-only */
5577 back_flags =
5578 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5579
5580 bs = NULL;
5581 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5582 backing_drv, &local_err);
5583 if (ret < 0) {
5584 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5585 backing_file,
5586 error_get_pretty(local_err));
5587 error_free(local_err);
5588 local_err = NULL;
5589 goto out;
5590 }
5591 size = bdrv_getlength(bs);
5592 if (size < 0) {
5593 error_setg_errno(errp, -size, "Could not get size of '%s'",
5594 backing_file);
5595 bdrv_unref(bs);
5596 goto out;
5597 }
5598
5599 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5600
5601 bdrv_unref(bs);
5602 } else {
5603 error_setg(errp, "Image creation needs a size parameter");
5604 goto out;
5605 }
5606 }
5607
5608 if (!quiet) {
5609 printf("Formatting '%s', fmt=%s ", filename, fmt);
5610 qemu_opts_print(opts);
5611 puts("");
5612 }
5613
5614 ret = bdrv_create(drv, filename, opts, &local_err);
5615
5616 if (ret == -EFBIG) {
5617 /* This is generally a better message than whatever the driver would
5618 * deliver (especially because of the cluster_size_hint), since that
5619 * is most probably not much different from "image too large". */
5620 const char *cluster_size_hint = "";
5621 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5622 cluster_size_hint = " (try using a larger cluster size)";
5623 }
5624 error_setg(errp, "The image size is too large for file format '%s'"
5625 "%s", fmt, cluster_size_hint);
5626 error_free(local_err);
5627 local_err = NULL;
5628 }
5629
5630 out:
5631 qemu_opts_del(opts);
5632 qemu_opts_free(create_opts);
5633 if (local_err) {
5634 error_propagate(errp, local_err);
5635 }
5636 }
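
/*
 * Example: a minimal sketch equivalent to
 * "qemu-img create -f qcow2 test.qcow2 1G":
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                     1 * 1024 * 1024 * 1024, 0, &err, true);
 *     if (err) {
 *         ... report and free err ...
 *     }
 */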
5637
5638 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5639 {
5640 return bs->aio_context;
5641 }
5642
5643 void bdrv_detach_aio_context(BlockDriverState *bs)
5644 {
5645 BdrvAioNotifier *baf;
5646
5647 if (!bs->drv) {
5648 return;
5649 }
5650
5651 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5652 baf->detach_aio_context(baf->opaque);
5653 }
5654
5655 if (bs->io_limits_enabled) {
5656 throttle_detach_aio_context(&bs->throttle_state);
5657 }
5658 if (bs->drv->bdrv_detach_aio_context) {
5659 bs->drv->bdrv_detach_aio_context(bs);
5660 }
5661 if (bs->file) {
5662 bdrv_detach_aio_context(bs->file);
5663 }
5664 if (bs->backing_hd) {
5665 bdrv_detach_aio_context(bs->backing_hd);
5666 }
5667
5668 bs->aio_context = NULL;
5669 }
5670
5671 void bdrv_attach_aio_context(BlockDriverState *bs,
5672 AioContext *new_context)
5673 {
5674 BdrvAioNotifier *ban;
5675
5676 if (!bs->drv) {
5677 return;
5678 }
5679
5680 bs->aio_context = new_context;
5681
5682 if (bs->backing_hd) {
5683 bdrv_attach_aio_context(bs->backing_hd, new_context);
5684 }
5685 if (bs->file) {
5686 bdrv_attach_aio_context(bs->file, new_context);
5687 }
5688 if (bs->drv->bdrv_attach_aio_context) {
5689 bs->drv->bdrv_attach_aio_context(bs, new_context);
5690 }
5691 if (bs->io_limits_enabled) {
5692 throttle_attach_aio_context(&bs->throttle_state, new_context);
5693 }
5694
5695 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5696 ban->attached_aio_context(new_context, ban->opaque);
5697 }
5698 }
5699
5700 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5701 {
5702 bdrv_drain_all(); /* ensure there are no in-flight requests */
5703
5704 bdrv_detach_aio_context(bs);
5705
5706 /* This function executes in the old AioContext so acquire the new one in
5707 * case it runs in a different thread.
5708 */
5709 aio_context_acquire(new_context);
5710 bdrv_attach_aio_context(bs, new_context);
5711 aio_context_release(new_context);
5712 }
5713
5714 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5715 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5716 void (*detach_aio_context)(void *opaque), void *opaque)
5717 {
5718 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5719 *ban = (BdrvAioNotifier){
5720 .attached_aio_context = attached_aio_context,
5721 .detach_aio_context = detach_aio_context,
5722 .opaque = opaque
5723 };
5724
5725 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5726 }
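
/*
 * Example: a minimal sketch of keeping per-device resources on the right
 * AioContext as the BDS migrates between threads; 'my_attached' and
 * 'my_detach' are hypothetical callbacks:
 *
 *     static void my_attached(AioContext *ctx, void *opaque)
 *     {
 *         // re-create timers/fd handlers in ctx
 *     }
 *
 *     static void my_detach(void *opaque)
 *     {
 *         // tear down timers/fd handlers
 *     }
 *
 *     bdrv_add_aio_context_notifier(bs, my_attached, my_detach, dev);
 *     // ... and symmetrically on cleanup:
 *     bdrv_remove_aio_context_notifier(bs, my_attached, my_detach, dev);
 */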
5727
5728 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5729 void (*attached_aio_context)(AioContext *,
5730 void *),
5731 void (*detach_aio_context)(void *),
5732 void *opaque)
5733 {
5734 BdrvAioNotifier *ban, *ban_next;
5735
5736 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5737 if (ban->attached_aio_context == attached_aio_context &&
5738 ban->detach_aio_context == detach_aio_context &&
5739 ban->opaque == opaque)
5740 {
5741 QLIST_REMOVE(ban, list);
5742 g_free(ban);
5743
5744 return;
5745 }
5746 }
5747
5748 abort();
5749 }
5750
5751 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5752 NotifierWithReturn *notifier)
5753 {
5754 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5755 }
5756
5757 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
5758 {
5759 if (!bs->drv->bdrv_amend_options) {
5760 return -ENOTSUP;
5761 }
5762 return bs->drv->bdrv_amend_options(bs, opts);
5763 }
5764
5765 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5766 * of block filters and by bdrv_is_first_non_filter.
5767 * It is used to test whether the given bs is the candidate, or to recurse
5768 * further into the node graph.
5769 */
5770 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5771 BlockDriverState *candidate)
5772 {
5773 /* return false if basic checks fail */
5774 if (!bs || !bs->drv) {
5775 return false;
5776 }
5777
5778 /* the code reached a non-filter block driver -> check if the bs is
5779 * the same as the candidate. This is the recursion termination condition.
5780 */
5781 if (!bs->drv->is_filter) {
5782 return bs == candidate;
5783 }
5784 /* Down this path the driver is a block filter driver */
5785
5786 /* If the block filter recursion method is defined use it to recurse down
5787 * the node graph.
5788 */
5789 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5790 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5791 }
5792
5793 /* the driver is a block filter but does not allow recursion -> return false
5794 */
5795 return false;
5796 }
5797
5798 /* This function checks if the candidate is the first non-filter bs down its
5799 * bs chain. Since we don't have pointers to parents, it explores all bs
5800 * chains from the top. Some filters can choose not to pass down the recursion.
5801 */
5802 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5803 {
5804 BlockDriverState *bs;
5805
5806 /* walk down the bs forest recursively */
5807 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5808 bool perm;
5809
5810 /* try to recurse in this top level bs */
5811 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5812
5813 /* candidate is the first non filter */
5814 if (perm) {
5815 return true;
5816 }
5817 }
5818
5819 return false;
5820 }
5821
5822 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5823 {
5824 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5825 if (!to_replace_bs) {
5826 error_setg(errp, "Node name '%s' not found", node_name);
5827 return NULL;
5828 }
5829
5830 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5831 return NULL;
5832 }
5833
5834 /* To prevent data corruption, we don't want an arbitrary node of the BDS
5835 * chain to be replaced, only the topmost non-filter node.
5836 * Another benefit is that this test excludes backing files, which are
5837 * blocked by the backing blockers.
5838 */
5839 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5840 error_setg(errp, "Only top most non filter can be replaced");
5841 return NULL;
5842 }
5843
5844 return to_replace_bs;
5845 }
5846
5847 void bdrv_io_plug(BlockDriverState *bs)
5848 {
5849 BlockDriver *drv = bs->drv;
5850 if (drv && drv->bdrv_io_plug) {
5851 drv->bdrv_io_plug(bs);
5852 } else if (bs->file) {
5853 bdrv_io_plug(bs->file);
5854 }
5855 }
5856
5857 void bdrv_io_unplug(BlockDriverState *bs)
5858 {
5859 BlockDriver *drv = bs->drv;
5860 if (drv && drv->bdrv_io_unplug) {
5861 drv->bdrv_io_unplug(bs);
5862 } else if (bs->file) {
5863 bdrv_io_unplug(bs->file);
5864 }
5865 }
5866
5867 void bdrv_flush_io_queue(BlockDriverState *bs)
5868 {
5869 BlockDriver *drv = bs->drv;
5870 if (drv && drv->bdrv_flush_io_queue) {
5871 drv->bdrv_flush_io_queue(bs);
5872 } else if (bs->file) {
5873 bdrv_flush_io_queue(bs->file);
5874 }
5875 }
5876
5877 static bool append_open_options(QDict *d, BlockDriverState *bs)
5878 {
5879 const QDictEntry *entry;
5880 bool found_any = false;
5881
5882 for (entry = qdict_first(bs->options); entry;
5883 entry = qdict_next(bs->options, entry))
5884 {
5885 /* Only take options for this level and exclude all non-driver-specific
5886 * options */
5887 if (!strchr(qdict_entry_key(entry), '.') &&
5888 strcmp(qdict_entry_key(entry), "node-name"))
5889 {
5890 qobject_incref(qdict_entry_value(entry));
5891 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5892 found_any = true;
5893 }
5894 }
5895
5896 return found_any;
5897 }
5898
5899 /* Updates the following BDS fields:
5900 * - exact_filename: A filename which may be used for opening a block device
5901 * which (mostly) equals the given BDS (even without any
5902 * other options; so reading and writing must return the same
5903 * results, but caching etc. may be different)
5904 * - full_open_options: Options which, when given when opening a block device
5905 * (without a filename), result in a BDS (mostly)
5906 * equalling the given one
5907 * - filename: If exact_filename is set, it is copied here. Otherwise,
5908 * full_open_options is converted to a JSON object, prefixed with
5909 * "json:" (for use through the JSON pseudo protocol) and put here.
5910 */
5911 void bdrv_refresh_filename(BlockDriverState *bs)
5912 {
5913 BlockDriver *drv = bs->drv;
5914 QDict *opts;
5915
5916 if (!drv) {
5917 return;
5918 }
5919
5920 /* This BDS's file name will most probably depend on its file's name, so
5921 * refresh that first */
5922 if (bs->file) {
5923 bdrv_refresh_filename(bs->file);
5924 }
5925
5926 if (drv->bdrv_refresh_filename) {
5927 /* Obsolete information is of no use here, so drop the old file name
5928 * information before refreshing it */
5929 bs->exact_filename[0] = '\0';
5930 if (bs->full_open_options) {
5931 QDECREF(bs->full_open_options);
5932 bs->full_open_options = NULL;
5933 }
5934
5935 drv->bdrv_refresh_filename(bs);
5936 } else if (bs->file) {
5937 /* Try to reconstruct valid information from the underlying file */
5938 bool has_open_options;
5939
5940 bs->exact_filename[0] = '\0';
5941 if (bs->full_open_options) {
5942 QDECREF(bs->full_open_options);
5943 bs->full_open_options = NULL;
5944 }
5945
5946 opts = qdict_new();
5947 has_open_options = append_open_options(opts, bs);
5948
5949 /* If no specific options have been given for this BDS, the filename of
5950 * the underlying file should suffice for this one as well */
5951 if (bs->file->exact_filename[0] && !has_open_options) {
5952 strcpy(bs->exact_filename, bs->file->exact_filename);
5953 }
5954 /* Reconstructing the full options QDict is simple for most format block
5955 * drivers, as long as the full options are known for the underlying
5956 * file BDS. The full options QDict of that file BDS should somehow
5957 * contain a representation of the filename, therefore the following
5958 * suffices without querying the (exact_)filename of this BDS. */
5959 if (bs->file->full_open_options) {
5960 qdict_put_obj(opts, "driver",
5961 QOBJECT(qstring_from_str(drv->format_name)));
5962 QINCREF(bs->file->full_open_options);
5963 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
5964
5965 bs->full_open_options = opts;
5966 } else {
5967 QDECREF(opts);
5968 }
5969 } else if (!bs->full_open_options && qdict_size(bs->options)) {
5970 /* There is no underlying file BDS (at least referenced by BDS.file),
5971 * so the full options QDict should be equal to the options given
5972 * specifically for this block device when it was opened (plus the
5973 * driver specification).
5974 * Because those options don't change, there is no need to update
5975 * full_open_options when it's already set. */
5976
5977 opts = qdict_new();
5978 append_open_options(opts, bs);
5979 qdict_put_obj(opts, "driver",
5980 QOBJECT(qstring_from_str(drv->format_name)));
5981
5982 if (bs->exact_filename[0]) {
5983 /* This may not work for all block protocol drivers (some may
5984 * require this filename to be parsed), but we have to find some
5985 * default solution here, so just include it. If some block driver
5986 * does not support pure options without any filename at all or
5987 * needs some special format of the options QDict, it needs to
5988 * implement the driver-specific bdrv_refresh_filename() function.
5989 */
5990 qdict_put_obj(opts, "filename",
5991 QOBJECT(qstring_from_str(bs->exact_filename)));
5992 }
5993
5994 bs->full_open_options = opts;
5995 }
5996
5997 if (bs->exact_filename[0]) {
5998 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
5999 } else if (bs->full_open_options) {
6000 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6001 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6002 qstring_get_str(json));
6003 QDECREF(json);
6004 }
6005 }
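
/*
 * Example: for a qcow2 image on a plain file with no usable exact_filename,
 * the generated pseudo-filename would look roughly like (illustrative
 * output only):
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *           "filename": "/path/to/image.qcow2"}}
 */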
6006
6007 /* The purpose of this accessor function is to allow the device models to
6008 * access the BlockAcctStats structure embedded inside a BlockDriverState
6009 * without being aware of the BlockDriverState structure layout.
6010 * It will go away when the BlockAcctStats structure is moved inside the
6011 * device models.
6012 */
6013 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6014 {
6015 return &bs->stats;
6016 }