/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

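/* A dirty bitmap tracks which parts of an image have been written; each
 * BlockDriverState keeps a list of them (bs->dirty_bitmaps). */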
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an IO wait if needed
 *
 * @bytes:    the number of bytes of the IO
 * @is_write: is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this IO have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this IO */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the IO will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

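/* Allocate a new, empty BlockDriverState. The refcount starts at 1, so the
 * caller owns a reference and eventually releases it with bdrv_unref(). */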
BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

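/* Look up a registered block driver by format name; returns NULL if no
 * driver matches. */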
BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

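/* Create an image synchronously: the actual creation runs in a coroutine
 * (bdrv_create_co_entry), and this function polls the main AioContext until
 * the coroutine completes. */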
int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf contains the image's first @buf_size bytes.
 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 * but can be smaller if the image file is smaller)
 * @filename is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
static BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                                   const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

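/* Read the first bytes of the image and run the drivers' probe functions on
 * them to determine the format driver; the result is stored in *pdrv. */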
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

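/* Turn a "json:..." pseudo-filename into a flattened options QDict. */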
static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

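/* Install backing_hd as the backing file of bs, setting up the op blockers
 * that protect the backing file; passing NULL detaches the current backing
 * file instead. */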
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{

    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}

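/* For snapshot=on: create a temporary qcow2 overlay backed by bs and insert
 * it on top of bs with bdrv_append(). */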
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

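/* Tear down an open image: cancel any block job, drain and flush pending
 * I/O, notify close notifiers, and release the driver state and bs->file.
 * The BlockDriverState itself is not freed here. */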
void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

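/* Run one iteration of draining for bs: flush the I/O queue, restart
 * throttled requests, and poll the AioContext; returns true if there were
 * pending requests or the event loop made progress. */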
static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete. Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists. Also, NULL terminate the device_name to prevent
 * double remove. */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it. Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

2010 static void bdrv_rebind(BlockDriverState *bs)
2011 {
2012 if (bs->drv && bs->drv->bdrv_rebind) {
2013 bs->drv->bdrv_rebind(bs);
2014 }
2015 }
2016
2017 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2018 BlockDriverState *bs_src)
2019 {
2020 /* move some fields that need to stay attached to the device */
2021
2022 /* dev info */
2023 bs_dest->guest_block_size = bs_src->guest_block_size;
2024 bs_dest->copy_on_read = bs_src->copy_on_read;
2025
2026 bs_dest->enable_write_cache = bs_src->enable_write_cache;
2027
2028 /* i/o throttled req */
2029 memcpy(&bs_dest->throttle_state,
2030 &bs_src->throttle_state,
2031 sizeof(ThrottleState));
2032 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2033 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
2034 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
2035
2036 /* r/w error */
2037 bs_dest->on_read_error = bs_src->on_read_error;
2038 bs_dest->on_write_error = bs_src->on_write_error;
2039
2040 /* i/o status */
2041 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2042 bs_dest->iostatus = bs_src->iostatus;
2043
2044 /* dirty bitmap */
2045 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
2046
2047 /* reference count */
2048 bs_dest->refcnt = bs_src->refcnt;
2049
2050 /* job */
2051 bs_dest->job = bs_src->job;
2052
2053 /* keep the same entry in bdrv_states */
2054 bs_dest->device_list = bs_src->device_list;
2055 bs_dest->blk = bs_src->blk;
2056
2057 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2058 sizeof(bs_dest->op_blockers));
2059 }
2060
2061 /*
2062 * Swap bs contents for two image chains while they are live,
2063 * while keeping required fields on the BlockDriverState that is
2064 * actually attached to a device.
2065 *
2066 * This will modify the BlockDriverState fields, and swap contents
2067 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2068 *
2069 * bs_new must not be attached to a BlockBackend.
2070 *
2071 * This function does not create any image files.
2072 */
2073 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2074 {
2075 BlockDriverState tmp;
2076
2077 /* The code needs to swap the node_name, but simply swapping node_list won't
2078 * work: first remove the nodes from the graph list, do the swap, then
2079 * insert them back if needed.
2080 */
2081 if (bs_new->node_name[0] != '\0') {
2082 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2083 }
2084 if (bs_old->node_name[0] != '\0') {
2085 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2086 }
2087
2088 /* bs_new must be unattached and shouldn't have anything fancy enabled */
2089 assert(!bs_new->blk);
2090 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2091 assert(bs_new->job == NULL);
2092 assert(bs_new->io_limits_enabled == false);
2093 assert(!throttle_have_timer(&bs_new->throttle_state));
2094
2095 tmp = *bs_new;
2096 *bs_new = *bs_old;
2097 *bs_old = tmp;
2098
2099 /* there are some fields that should not be swapped, move them back */
2100 bdrv_move_feature_fields(&tmp, bs_old);
2101 bdrv_move_feature_fields(bs_old, bs_new);
2102 bdrv_move_feature_fields(bs_new, &tmp);
2103
2104 /* bs_new must remain unattached */
2105 assert(!bs_new->blk);
2106
2107 /* Check a few fields that should remain attached to the device */
2108 assert(bs_new->job == NULL);
2109 assert(bs_new->io_limits_enabled == false);
2110 assert(!throttle_have_timer(&bs_new->throttle_state));
2111
2112 /* insert the nodes back into the graph node list if needed */
2113 if (bs_new->node_name[0] != '\0') {
2114 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2115 }
2116 if (bs_old->node_name[0] != '\0') {
2117 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2118 }
2119
2120 bdrv_rebind(bs_new);
2121 bdrv_rebind(bs_old);
2122 }
2123
2124 /*
2125 * Add new bs contents at the top of an image chain while the chain is
2126 * live, while keeping required fields on the top layer.
2127 *
2128 * This will modify the BlockDriverState fields, and swap contents
2129 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2130 *
2131 * bs_new must not be attached to a BlockBackend.
2132 *
2133 * This function does not create any image files.
2134 */
2135 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2136 {
2137 bdrv_swap(bs_new, bs_top);
2138
2139 /* After the swap, the BDS at bs_top holds the new contents and bs_new
2140 * holds the old top-layer contents; make the old top its backing file. */
2141 bdrv_set_backing_hd(bs_top, bs_new);
2142 }
2143
2144 static void bdrv_delete(BlockDriverState *bs)
2145 {
2146 assert(!bs->job);
2147 assert(bdrv_op_blocker_is_empty(bs));
2148 assert(!bs->refcnt);
2149 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2150
2151 bdrv_close(bs);
2152
2153 /* remove from list, if necessary */
2154 bdrv_make_anon(bs);
2155
2156 g_free(bs);
2157 }
2158
2159 /*
2160 * Run consistency checks on an image
2161 *
2162 * Returns 0 if the check could be completed (it doesn't mean that the image is
2163 * free of errors) or -errno when an internal error occurred. The results of the
2164 * check are stored in res.
2165 */
2166 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2167 {
2168 if (bs->drv == NULL) {
2169 return -ENOMEDIUM;
2170 }
2171 if (bs->drv->bdrv_check == NULL) {
2172 return -ENOTSUP;
2173 }
2174
2175 memset(res, 0, sizeof(*res));
2176 return bs->drv->bdrv_check(bs, res, fix);
2177 }
2178
2179 #define COMMIT_BUF_SECTORS 2048
2180
2181 /* commit the COW image into its backing file */
2182 int bdrv_commit(BlockDriverState *bs)
2183 {
2184 BlockDriver *drv = bs->drv;
2185 int64_t sector, total_sectors, length, backing_length;
2186 int n, ro, open_flags;
2187 int ret = 0;
2188 uint8_t *buf = NULL;
2189 char filename[PATH_MAX];
2190
2191 if (!drv)
2192 return -ENOMEDIUM;
2193
2194 if (!bs->backing_hd) {
2195 return -ENOTSUP;
2196 }
2197
2198 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2199 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2200 return -EBUSY;
2201 }
2202
2203 ro = bs->backing_hd->read_only;
2204 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2205 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2206 open_flags = bs->backing_hd->open_flags;
2207
2208 if (ro) {
2209 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2210 return -EACCES;
2211 }
2212 }
2213
2214 length = bdrv_getlength(bs);
2215 if (length < 0) {
2216 ret = length;
2217 goto ro_cleanup;
2218 }
2219
2220 backing_length = bdrv_getlength(bs->backing_hd);
2221 if (backing_length < 0) {
2222 ret = backing_length;
2223 goto ro_cleanup;
2224 }
2225
2226 /* If our top snapshot is larger than the backing file image,
2227 * grow the backing file image if possible. If not possible,
2228 * we must return an error */
2229 if (length > backing_length) {
2230 ret = bdrv_truncate(bs->backing_hd, length);
2231 if (ret < 0) {
2232 goto ro_cleanup;
2233 }
2234 }
2235
2236 total_sectors = length >> BDRV_SECTOR_BITS;
2237
2238 /* qemu_try_blockalign() for bs will choose an alignment that works for
2239 * bs->backing_hd as well, so no need to compare the alignment manually. */
2240 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2241 if (buf == NULL) {
2242 ret = -ENOMEM;
2243 goto ro_cleanup;
2244 }
2245
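/* Copy every allocated chunk of the top image down into the backing file */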
2246 for (sector = 0; sector < total_sectors; sector += n) {
2247 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2248 if (ret < 0) {
2249 goto ro_cleanup;
2250 }
2251 if (ret) {
2252 ret = bdrv_read(bs, sector, buf, n);
2253 if (ret < 0) {
2254 goto ro_cleanup;
2255 }
2256
2257 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2258 if (ret < 0) {
2259 goto ro_cleanup;
2260 }
2261 }
2262 }
2263
2264 if (drv->bdrv_make_empty) {
2265 ret = drv->bdrv_make_empty(bs);
2266 if (ret < 0) {
2267 goto ro_cleanup;
2268 }
2269 bdrv_flush(bs);
2270 }
2271
2272 /*
2273 * Make sure all data we wrote to the backing device is actually
2274 * stable on disk.
2275 */
2276 if (bs->backing_hd) {
2277 bdrv_flush(bs->backing_hd);
2278 }
2279
2280 ret = 0;
2281 ro_cleanup:
2282 qemu_vfree(buf);
2283
2284 if (ro) {
2285 /* ignoring error return here */
2286 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2287 }
2288
2289 return ret;
2290 }
2291
2292 int bdrv_commit_all(void)
2293 {
2294 BlockDriverState *bs;
2295
2296 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2297 AioContext *aio_context = bdrv_get_aio_context(bs);
2298
2299 aio_context_acquire(aio_context);
2300 if (bs->drv && bs->backing_hd) {
2301 int ret = bdrv_commit(bs);
2302 if (ret < 0) {
2303 aio_context_release(aio_context);
2304 return ret;
2305 }
2306 }
2307 aio_context_release(aio_context);
2308 }
2309 return 0;
2310 }
2311
2312 /**
2313 * Remove an active request from the tracked requests list
2314 *
2315 * This function should be called when a tracked request is completing.
2316 */
2317 static void tracked_request_end(BdrvTrackedRequest *req)
2318 {
2319 if (req->serialising) {
2320 req->bs->serialising_in_flight--;
2321 }
2322
2323 QLIST_REMOVE(req, list);
2324 qemu_co_queue_restart_all(&req->wait_queue);
2325 }
2326
2327 /**
2328 * Add an active request to the tracked requests list
2329 */
2330 static void tracked_request_begin(BdrvTrackedRequest *req,
2331 BlockDriverState *bs,
2332 int64_t offset,
2333 unsigned int bytes, bool is_write)
2334 {
2335 *req = (BdrvTrackedRequest){
2336 .bs = bs,
2337 .offset = offset,
2338 .bytes = bytes,
2339 .is_write = is_write,
2340 .co = qemu_coroutine_self(),
2341 .serialising = false,
2342 .overlap_offset = offset,
2343 .overlap_bytes = bytes,
2344 };
2345
2346 qemu_co_queue_init(&req->wait_queue);
2347
2348 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2349 }
2350
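/*
 * Widen the request's overlap window to 'align' boundaries and flag it as
 * serialising, so that overlapping requests will wait for it in
 * wait_serialising_requests().
 */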
2351 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2352 {
2353 int64_t overlap_offset = req->offset & ~(align - 1);
2354 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2355 - overlap_offset;
2356
2357 if (!req->serialising) {
2358 req->bs->serialising_in_flight++;
2359 req->serialising = true;
2360 }
2361
2362 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2363 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2364 }
2365
2366 /**
2367 * Round a region to cluster boundaries
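 *
 * Worked example (assuming a 64 KiB cluster size, i.e. 128 sectors):
 * sector_num = 130, nb_sectors = 100 rounds to cluster_sector_num = 128
 * and cluster_nb_sectors = 128, i.e. the sector range [128, 256).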
2368 */
2369 void bdrv_round_to_clusters(BlockDriverState *bs,
2370 int64_t sector_num, int nb_sectors,
2371 int64_t *cluster_sector_num,
2372 int *cluster_nb_sectors)
2373 {
2374 BlockDriverInfo bdi;
2375
2376 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2377 *cluster_sector_num = sector_num;
2378 *cluster_nb_sectors = nb_sectors;
2379 } else {
2380 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2381 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2382 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2383 nb_sectors, c);
2384 }
2385 }
2386
2387 static int bdrv_get_cluster_size(BlockDriverState *bs)
2388 {
2389 BlockDriverInfo bdi;
2390 int ret;
2391
2392 ret = bdrv_get_info(bs, &bdi);
2393 if (ret < 0 || bdi.cluster_size == 0) {
2394 return bs->request_alignment;
2395 } else {
2396 return bdi.cluster_size;
2397 }
2398 }
2399
2400 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2401 int64_t offset, unsigned int bytes)
2402 {
2403 /* aaaa bbbb */
2404 if (offset >= req->overlap_offset + req->overlap_bytes) {
2405 return false;
2406 }
2407 /* bbbb aaaa */
2408 if (req->overlap_offset >= offset + bytes) {
2409 return false;
2410 }
2411 return true;
2412 }
2413
2414 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2415 {
2416 BlockDriverState *bs = self->bs;
2417 BdrvTrackedRequest *req;
2418 bool retry;
2419 bool waited = false;
2420
2421 if (!bs->serialising_in_flight) {
2422 return false;
2423 }
2424
2425 do {
2426 retry = false;
2427 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2428 if (req == self || (!req->serialising && !self->serialising)) {
2429 continue;
2430 }
2431 if (tracked_request_overlaps(req, self->overlap_offset,
2432 self->overlap_bytes))
2433 {
2434 /* Hitting this means there was a reentrant request, for
2435 * example, a block driver issuing nested requests. This must
2436 * never happen since it means deadlock.
2437 */
2438 assert(qemu_coroutine_self() != req->co);
2439
2440 /* If the request is already (indirectly) waiting for us, or
2441 * will wait for us as soon as it wakes up, then just go on
2442 * (instead of producing a deadlock in the former case). */
2443 if (!req->waiting_for) {
2444 self->waiting_for = req;
2445 qemu_co_queue_wait(&req->wait_queue);
2446 self->waiting_for = NULL;
2447 retry = true;
2448 waited = true;
2449 break;
2450 }
2451 }
2452 }
2453 } while (retry);
2454
2455 return waited;
2456 }
2457
2458 /*
2459 * Return values:
2460 * 0 - success
2461 * -EINVAL - backing format specified, but no file
2462 * -ENOSPC - can't update the backing file because no space is left in the
2463 * image file header
2464 * -ENOTSUP - format driver doesn't support changing the backing file
2465 */
2466 int bdrv_change_backing_file(BlockDriverState *bs,
2467 const char *backing_file, const char *backing_fmt)
2468 {
2469 BlockDriver *drv = bs->drv;
2470 int ret;
2471
2472 /* Backing file format doesn't make sense without a backing file */
2473 if (backing_fmt && !backing_file) {
2474 return -EINVAL;
2475 }
2476
2477 if (drv->bdrv_change_backing_file != NULL) {
2478 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2479 } else {
2480 ret = -ENOTSUP;
2481 }
2482
2483 if (ret == 0) {
2484 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2485 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2486 }
2487 return ret;
2488 }
2489
2490 /*
2491 * Finds the image layer in the chain that has 'bs' as its backing file.
2492 *
2493 * active is the current topmost image.
2494 *
2495 * Returns NULL if bs is not found in active's image chain,
2496 * or if active == bs.
2497 *
2498 * Returns the bottommost base image if bs == NULL.
2499 */
2500 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2501 BlockDriverState *bs)
2502 {
2503 while (active && bs != active->backing_hd) {
2504 active = active->backing_hd;
2505 }
2506
2507 return active;
2508 }
2509
2510 /* Given a BDS, searches for the base layer. */
2511 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2512 {
2513 return bdrv_find_overlay(bs, NULL);
2514 }
2515
2516 typedef struct BlkIntermediateStates {
2517 BlockDriverState *bs;
2518 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2519 } BlkIntermediateStates;
2520
2521
2522 /*
2523 * Drops images above 'base' up to and including 'top', and sets the image
2524 * above 'top' to have base as its backing file.
2525 *
2526 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2527 * information in the overlay can be properly updated.
2528 *
2529 * E.g., this will convert the following chain:
2530 * bottom <- base <- intermediate <- top <- active
2531 *
2532 * to
2533 *
2534 * bottom <- base <- active
2535 *
2536 * It is allowed for bottom==base, in which case it converts:
2537 *
2538 * base <- intermediate <- top <- active
2539 *
2540 * to
2541 *
2542 * base <- active
2543 *
2544 * If backing_file_str is non-NULL, it will be used when modifying top's
2545 * overlay image metadata.
2546 *
2547 * Error conditions:
2548 * if active == top, that is considered an error
2549 *
2550 */
2551 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2552 BlockDriverState *base, const char *backing_file_str)
2553 {
2554 BlockDriverState *intermediate;
2555 BlockDriverState *base_bs = NULL;
2556 BlockDriverState *new_top_bs = NULL;
2557 BlkIntermediateStates *intermediate_state, *next;
2558 int ret = -EIO;
2559
2560 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2561 QSIMPLEQ_INIT(&states_to_delete);
2562
2563 if (!top->drv || !base->drv) {
2564 goto exit;
2565 }
2566
2567 new_top_bs = bdrv_find_overlay(active, top);
2568
2569 if (new_top_bs == NULL) {
2570 /* we could not find the image above 'top', this is an error */
2571 goto exit;
2572 }
2573
2574 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2575 * to do, no intermediate images */
2576 if (new_top_bs->backing_hd == base) {
2577 ret = 0;
2578 goto exit;
2579 }
2580
2581 intermediate = top;
2582
2583 /* now we will go down through the list, and add each BDS we find
2584 * into our deletion queue, until we hit the 'base'
2585 */
2586 while (intermediate) {
2587 intermediate_state = g_new0(BlkIntermediateStates, 1);
2588 intermediate_state->bs = intermediate;
2589 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2590
2591 if (intermediate->backing_hd == base) {
2592 base_bs = intermediate->backing_hd;
2593 break;
2594 }
2595 intermediate = intermediate->backing_hd;
2596 }
2597 if (base_bs == NULL) {
2598 /* Something went wrong: we did not end at the base. Safely
2599 * unravel everything and exit with an error. */
2600 goto exit;
2601 }
2602
2603 /* success - we can delete the intermediate states, and link top->base */
2604 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2605 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2606 base_bs->drv ? base_bs->drv->format_name : "");
2607 if (ret) {
2608 goto exit;
2609 }
2610 bdrv_set_backing_hd(new_top_bs, base_bs);
2611
2612 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2613 /* so that bdrv_close() does not recursively close the chain */
2614 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2615 bdrv_unref(intermediate_state->bs);
2616 }
2617 ret = 0;
2618
2619 exit:
2620 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2621 g_free(intermediate_state);
2622 }
2623 return ret;
2624 }
2625
2626
2627 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2628 size_t size)
2629 {
2630 int64_t len;
2631
2632 if (size > INT_MAX) {
2633 return -EIO;
2634 }
2635
2636 if (!bdrv_is_inserted(bs))
2637 return -ENOMEDIUM;
2638
2639 if (bs->growable)
2640 return 0;
2641
2642 len = bdrv_getlength(bs);
2643
2644 if (offset < 0)
2645 return -EIO;
2646
2647 if ((offset > len) || (len - offset < size))
2648 return -EIO;
2649
2650 return 0;
2651 }
2652
2653 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2654 int nb_sectors)
2655 {
2656 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2657 return -EIO;
2658 }
2659
2660 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2661 nb_sectors * BDRV_SECTOR_SIZE);
2662 }
2663
2664 typedef struct RwCo {
2665 BlockDriverState *bs;
2666 int64_t offset;
2667 QEMUIOVector *qiov;
2668 bool is_write;
2669 int ret;
2670 BdrvRequestFlags flags;
2671 } RwCo;
2672
2673 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2674 {
2675 RwCo *rwco = opaque;
2676
2677 if (!rwco->is_write) {
2678 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2679 rwco->qiov->size, rwco->qiov,
2680 rwco->flags);
2681 } else {
2682 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2683 rwco->qiov->size, rwco->qiov,
2684 rwco->flags);
2685 }
2686 }
2687
2688 /*
2689 * Process a vectored synchronous request using coroutines
2690 */
2691 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2692 QEMUIOVector *qiov, bool is_write,
2693 BdrvRequestFlags flags)
2694 {
2695 Coroutine *co;
2696 RwCo rwco = {
2697 .bs = bs,
2698 .offset = offset,
2699 .qiov = qiov,
2700 .is_write = is_write,
2701 .ret = NOT_DONE,
2702 .flags = flags,
2703 };
2704
2705 /**
2706 * In a synchronous call context, when the vcpu is blocked, the throttling
2707 * timer will not fire; so I/O throttling has to be disabled here if it
2708 * has been enabled.
2709 */
2710 if (bs->io_limits_enabled) {
2711 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2712 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2713 bdrv_io_limits_disable(bs);
2714 }
2715
2716 if (qemu_in_coroutine()) {
2717 /* Fast-path if already in coroutine context */
2718 bdrv_rw_co_entry(&rwco);
2719 } else {
2720 AioContext *aio_context = bdrv_get_aio_context(bs);
2721
2722 co = qemu_coroutine_create(bdrv_rw_co_entry);
2723 qemu_coroutine_enter(co, &rwco);
2724 while (rwco.ret == NOT_DONE) {
2725 aio_poll(aio_context, true);
2726 }
2727 }
2728 return rwco.ret;
2729 }
2730
2731 /*
2732 * Process a synchronous request using coroutines
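 *
 * (buf may be NULL for zero-write requests; the iovec then only conveys
 * the request length and the buffer is never read)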
2733 */
2734 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2735 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2736 {
2737 QEMUIOVector qiov;
2738 struct iovec iov = {
2739 .iov_base = (void *)buf,
2740 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2741 };
2742
2743 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2744 return -EINVAL;
2745 }
2746
2747 qemu_iovec_init_external(&qiov, &iov, 1);
2748 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2749 &qiov, is_write, flags);
2750 }
2751
2752 /* return < 0 if error. See bdrv_write() for the return codes */
2753 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2754 uint8_t *buf, int nb_sectors)
2755 {
2756 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2757 }
2758
2759 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2760 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2761 uint8_t *buf, int nb_sectors)
2762 {
2763 bool enabled;
2764 int ret;
2765
2766 enabled = bs->io_limits_enabled;
2767 bs->io_limits_enabled = false;
2768 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2769 bs->io_limits_enabled = enabled;
2770 return ret;
2771 }
2772
2773 /* Return < 0 if error. Important errors are:
2774 -EIO generic I/O error (may happen for all errors)
2775 -ENOMEDIUM No media inserted.
2776 -EINVAL Invalid sector number or nb_sectors
2777 -EACCES Trying to write a read-only device
2778 */
2779 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2780 const uint8_t *buf, int nb_sectors)
2781 {
2782 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2783 }
2784
2785 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2786 int nb_sectors, BdrvRequestFlags flags)
2787 {
2788 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2789 BDRV_REQ_ZERO_WRITE | flags);
2790 }
2791
2792 /*
2793 * Completely zero out a block device with the help of bdrv_write_zeroes.
2794 * The operation is sped up by checking the block status and only writing
2795 * zeroes to the device if they currently do not return zeroes. Optional
2796 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2797 *
2798 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2799 */
2800 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2801 {
2802 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2803 int n;
2804
2805 target_sectors = bdrv_nb_sectors(bs);
2806 if (target_sectors < 0) {
2807 return target_sectors;
2808 }
2809
2810 for (;;) {
2811 nb_sectors = target_sectors - sector_num;
2812 if (nb_sectors <= 0) {
2813 return 0;
2814 }
2815 if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2816 nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2817 }
2818 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2819 if (ret < 0) {
2820 error_report("error getting block status at sector %" PRId64 ": %s",
2821 sector_num, strerror(-ret));
2822 return ret;
2823 }
2824 if (ret & BDRV_BLOCK_ZERO) {
2825 sector_num += n;
2826 continue;
2827 }
2828 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2829 if (ret < 0) {
2830 error_report("error writing zeroes at sector %" PRId64 ": %s",
2831 sector_num, strerror(-ret));
2832 return ret;
2833 }
2834 sector_num += n;
2835 }
2836 }
2837
2838 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2839 {
2840 QEMUIOVector qiov;
2841 struct iovec iov = {
2842 .iov_base = (void *)buf,
2843 .iov_len = bytes,
2844 };
2845 int ret;
2846
2847 if (bytes < 0) {
2848 return -EINVAL;
2849 }
2850
2851 qemu_iovec_init_external(&qiov, &iov, 1);
2852 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2853 if (ret < 0) {
2854 return ret;
2855 }
2856
2857 return bytes;
2858 }
2859
2860 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2861 {
2862 int ret;
2863
2864 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2865 if (ret < 0) {
2866 return ret;
2867 }
2868
2869 return qiov->size;
2870 }
2871
2872 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2873 const void *buf, int bytes)
2874 {
2875 QEMUIOVector qiov;
2876 struct iovec iov = {
2877 .iov_base = (void *) buf,
2878 .iov_len = bytes,
2879 };
2880
2881 if (bytes < 0) {
2882 return -EINVAL;
2883 }
2884
2885 qemu_iovec_init_external(&qiov, &iov, 1);
2886 return bdrv_pwritev(bs, offset, &qiov);
2887 }
2888
2889 /*
2890 * Writes to the file and ensures that no writes are reordered across this
2891 * request (acts as a barrier)
2892 *
2893 * Returns 0 on success, -errno in error cases.
2894 */
2895 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2896 const void *buf, int count)
2897 {
2898 int ret;
2899
2900 ret = bdrv_pwrite(bs, offset, buf, count);
2901 if (ret < 0) {
2902 return ret;
2903 }
2904
2905 /* No flush needed for cache modes that already do it */
2906 if (bs->enable_write_cache) {
2907 bdrv_flush(bs);
2908 }
2909
2910 return 0;
2911 }
2912
2913 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2914 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2915 {
2916 /* Perform I/O through a temporary buffer so that users who scribble over
2917 * their read buffer while the operation is in progress do not end up
2918 * modifying the image file. This is critical for zero-copy guest I/O
2919 * where anything might happen inside guest memory.
2920 */
2921 void *bounce_buffer;
2922
2923 BlockDriver *drv = bs->drv;
2924 struct iovec iov;
2925 QEMUIOVector bounce_qiov;
2926 int64_t cluster_sector_num;
2927 int cluster_nb_sectors;
2928 size_t skip_bytes;
2929 int ret;
2930
2931 /* Cover the entire cluster so no additional backing file I/O is required
2932 * when allocating a cluster in the image file.
2933 */
2934 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2935 &cluster_sector_num, &cluster_nb_sectors);
2936
2937 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2938 cluster_sector_num, cluster_nb_sectors);
2939
2940 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2941 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2942 if (bounce_buffer == NULL) {
2943 ret = -ENOMEM;
2944 goto err;
2945 }
2946
2947 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2948
2949 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2950 &bounce_qiov);
2951 if (ret < 0) {
2952 goto err;
2953 }
2954
2955 if (drv->bdrv_co_write_zeroes &&
2956 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2957 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2958 cluster_nb_sectors, 0);
2959 } else {
2960 /* This does not change the data on the disk, so it is not necessary
2961 * to flush even in cache=writethrough mode.
2962 */
2963 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2964 &bounce_qiov);
2965 }
2966
2967 if (ret < 0) {
2968 /* It might be okay to ignore write errors for guest requests. If this
2969 * is a deliberate copy-on-read then we don't want to ignore the error.
2970 * Simply report it in all cases.
2971 */
2972 goto err;
2973 }
2974
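/* Copy the requested window out of the cluster-sized bounce buffer */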
2975 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2976 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2977 nb_sectors * BDRV_SECTOR_SIZE);
2978
2979 err:
2980 qemu_vfree(bounce_buffer);
2981 return ret;
2982 }
2983
2984 /*
2985 * Forwards an already correctly aligned request to the BlockDriver. This
2986 * handles copy on read and zeroing after EOF; any other features must be
2987 * implemented by the caller.
2988 */
2989 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2990 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2991 int64_t align, QEMUIOVector *qiov, int flags)
2992 {
2993 BlockDriver *drv = bs->drv;
2994 int ret;
2995
2996 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2997 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2998
2999 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3000 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3001 assert(!qiov || bytes == qiov->size);
3002
3003 /* Handle Copy on Read and associated serialisation */
3004 if (flags & BDRV_REQ_COPY_ON_READ) {
3005 /* If we touch the same cluster it counts as an overlap. This
3006 * guarantees that allocating writes will be serialized and not race
3007 * with each other for the same cluster. For example, in copy-on-read
3008 * it ensures that the CoR read and write operations are atomic and
3009 * guest writes cannot interleave between them. */
3010 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3011 }
3012
3013 wait_serialising_requests(req);
3014
3015 if (flags & BDRV_REQ_COPY_ON_READ) {
3016 int pnum;
3017
3018 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3019 if (ret < 0) {
3020 goto out;
3021 }
3022
3023 if (!ret || pnum != nb_sectors) {
3024 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3025 goto out;
3026 }
3027 }
3028
3029 /* Forward the request to the BlockDriver */
3030 if (!(bs->zero_beyond_eof && bs->growable)) {
3031 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3032 } else {
3033 /* Read zeros after EOF of growable BDSes */
3034 int64_t total_sectors, max_nb_sectors;
3035
3036 total_sectors = bdrv_nb_sectors(bs);
3037 if (total_sectors < 0) {
3038 ret = total_sectors;
3039 goto out;
3040 }
3041
3042 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3043 align >> BDRV_SECTOR_BITS);
3044 if (max_nb_sectors > 0) {
3045 QEMUIOVector local_qiov;
3046 size_t local_sectors;
3047
/* Clamp so that the byte count local_sectors * BDRV_SECTOR_SIZE below
 * cannot overflow size_t: divide by the sector size, not the shift. */
3048 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3049 local_sectors = MIN(max_nb_sectors, nb_sectors);
3050
3051 qemu_iovec_init(&local_qiov, qiov->niov);
3052 qemu_iovec_concat(&local_qiov, qiov, 0,
3053 local_sectors * BDRV_SECTOR_SIZE);
3054
3055 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3056 &local_qiov);
3057
3058 qemu_iovec_destroy(&local_qiov);
3059 } else {
3060 ret = 0;
3061 }
3062
3063 /* Reading beyond end of file is supposed to produce zeroes */
3064 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3065 uint64_t offset = MAX(0, total_sectors - sector_num);
3066 uint64_t bytes = (sector_num + nb_sectors - offset) *
3067 BDRV_SECTOR_SIZE;
3068 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3069 }
3070 }
3071
3072 out:
3073 return ret;
3074 }
3075
3076 /*
3077 * Handle a read request in coroutine context
3078 */
3079 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3080 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3081 BdrvRequestFlags flags)
3082 {
3083 BlockDriver *drv = bs->drv;
3084 BdrvTrackedRequest req;
3085
3086 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3087 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3088 uint8_t *head_buf = NULL;
3089 uint8_t *tail_buf = NULL;
3090 QEMUIOVector local_qiov;
3091 bool use_local_qiov = false;
3092 int ret;
3093
3094 if (!drv) {
3095 return -ENOMEDIUM;
3096 }
3097 if (bdrv_check_byte_request(bs, offset, bytes)) {
3098 return -EIO;
3099 }
3100
3101 if (bs->copy_on_read) {
3102 flags |= BDRV_REQ_COPY_ON_READ;
3103 }
3104
3105 /* throttling disk I/O */
3106 if (bs->io_limits_enabled) {
3107 bdrv_io_limits_intercept(bs, bytes, false);
3108 }
3109
3110 /* Align read if necessary by padding qiov */
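/* Worked example (assuming align = 512): a 100-byte read at offset 700 is
 * widened to [512, 1024); head_buf receives bytes [512, 700) and tail_buf
 * bytes [800, 1024), while the guest qiov still only sees [700, 800). */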
3111 if (offset & (align - 1)) {
3112 head_buf = qemu_blockalign(bs, align);
3113 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3114 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3115 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3116 use_local_qiov = true;
3117
3118 bytes += offset & (align - 1);
3119 offset = offset & ~(align - 1);
3120 }
3121
3122 if ((offset + bytes) & (align - 1)) {
3123 if (!use_local_qiov) {
3124 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3125 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3126 use_local_qiov = true;
3127 }
3128 tail_buf = qemu_blockalign(bs, align);
3129 qemu_iovec_add(&local_qiov, tail_buf,
3130 align - ((offset + bytes) & (align - 1)));
3131
3132 bytes = ROUND_UP(bytes, align);
3133 }
3134
3135 tracked_request_begin(&req, bs, offset, bytes, false);
3136 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3137 use_local_qiov ? &local_qiov : qiov,
3138 flags);
3139 tracked_request_end(&req);
3140
3141 if (use_local_qiov) {
3142 qemu_iovec_destroy(&local_qiov);
3143 qemu_vfree(head_buf);
3144 qemu_vfree(tail_buf);
3145 }
3146
3147 return ret;
3148 }
3149
3150 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3151 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3152 BdrvRequestFlags flags)
3153 {
3154 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3155 return -EINVAL;
3156 }
3157
3158 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3159 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3160 }
3161
3162 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3163 int nb_sectors, QEMUIOVector *qiov)
3164 {
3165 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3166
3167 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3168 }
3169
3170 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3171 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3172 {
3173 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3174
3175 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3176 BDRV_REQ_COPY_ON_READ);
3177 }
3178
3179 /* If no limit is specified in the BlockLimits, use a default
3180 * of 32768 512-byte sectors (16 MiB) per request.
3181 */
3182 #define MAX_WRITE_ZEROES_DEFAULT 32768
3183
3184 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3185 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3186 {
3187 BlockDriver *drv = bs->drv;
3188 QEMUIOVector qiov;
3189 struct iovec iov = {0};
3190 int ret = 0;
3191
3192 int max_write_zeroes = bs->bl.max_write_zeroes ?
3193 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3194
3195 while (nb_sectors > 0 && !ret) {
3196 int num = nb_sectors;
3197
3198 /* Align request. Block drivers can expect the "bulk" of the request
3199 * to be aligned.
3200 */
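/* E.g. with write_zeroes_alignment = 8: a request starting at sector 5 is
 * first shortened to 3 sectors (up to the boundary at sector 8), and a
 * request with an unaligned end is trimmed back to the last boundary. */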
3201 if (bs->bl.write_zeroes_alignment
3202 && num > bs->bl.write_zeroes_alignment) {
3203 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3204 /* Make a small request up to the first aligned sector. */
3205 num = bs->bl.write_zeroes_alignment;
3206 num -= sector_num % bs->bl.write_zeroes_alignment;
3207 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3208 /* Shorten the request to the last aligned sector. num cannot
3209 * underflow because num > bs->bl.write_zeroes_alignment.
3210 */
3211 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3212 }
3213 }
3214
3215 /* limit request size */
3216 if (num > max_write_zeroes) {
3217 num = max_write_zeroes;
3218 }
3219
3220 ret = -ENOTSUP;
3221 /* First try the efficient write zeroes operation */
3222 if (drv->bdrv_co_write_zeroes) {
3223 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3224 }
3225
3226 if (ret == -ENOTSUP) {
3227 /* Fall back to bounce buffer if write zeroes is unsupported */
3228 iov.iov_len = num * BDRV_SECTOR_SIZE;
3229 if (iov.iov_base == NULL) {
3230 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3231 if (iov.iov_base == NULL) {
3232 ret = -ENOMEM;
3233 goto fail;
3234 }
3235 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3236 }
3237 qemu_iovec_init_external(&qiov, &iov, 1);
3238
3239 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3240
3241 /* Keep the bounce buffer around if it is big enough for
3242 * all future requests.
3243 */
3244 if (num < max_write_zeroes) {
3245 qemu_vfree(iov.iov_base);
3246 iov.iov_base = NULL;
3247 }
3248 }
3249
3250 sector_num += num;
3251 nb_sectors -= num;
3252 }
3253
3254 fail:
3255 qemu_vfree(iov.iov_base);
3256 return ret;
3257 }
3258
3259 /*
3260 * Forwards an already correctly aligned write request to the BlockDriver.
3261 */
3262 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3263 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3264 QEMUIOVector *qiov, int flags)
3265 {
3266 BlockDriver *drv = bs->drv;
3267 bool waited;
3268 int ret;
3269
3270 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3271 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3272
3273 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3274 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3275 assert(!qiov || bytes == qiov->size);
3276
3277 waited = wait_serialising_requests(req);
3278 assert(!waited || !req->serialising);
3279 assert(req->overlap_offset <= offset);
3280 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3281
3282 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3283
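/* With detect-zeroes enabled, turn an all-zero payload into an efficient
 * zero write (and additionally allow unmapping in "unmap" mode). */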
3284 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3285 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3286 qemu_iovec_is_zero(qiov)) {
3287 flags |= BDRV_REQ_ZERO_WRITE;
3288 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3289 flags |= BDRV_REQ_MAY_UNMAP;
3290 }
3291 }
3292
3293 if (ret < 0) {
3294 /* Do nothing, write notifier decided to fail this request */
3295 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3296 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3297 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3298 } else {
3299 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3300 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3301 }
3302 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3303
3304 if (ret == 0 && !bs->enable_write_cache) {
3305 ret = bdrv_co_flush(bs);
3306 }
3307
3308 bdrv_set_dirty(bs, sector_num, nb_sectors);
3309
3310 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3311
3312 if (bs->growable && ret >= 0) {
3313 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3314 }
3315
3316 return ret;
3317 }
3318
3319 /*
3320 * Handle a write request in coroutine context
3321 */
3322 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3323 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3324 BdrvRequestFlags flags)
3325 {
3326 BdrvTrackedRequest req;
3327 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3328 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3329 uint8_t *head_buf = NULL;
3330 uint8_t *tail_buf = NULL;
3331 QEMUIOVector local_qiov;
3332 bool use_local_qiov = false;
3333 int ret;
3334
3335 if (!bs->drv) {
3336 return -ENOMEDIUM;
3337 }
3338 if (bs->read_only) {
3339 return -EACCES;
3340 }
3341 if (bdrv_check_byte_request(bs, offset, bytes)) {
3342 return -EIO;
3343 }
3344
3345 /* throttling disk I/O */
3346 if (bs->io_limits_enabled) {
3347 bdrv_io_limits_intercept(bs, bytes, true);
3348 }
3349
3350 /*
3351 * Align write if necessary by performing a read-modify-write cycle.
3352 * Pad qiov with the read parts and be sure to have a tracked request not
3353 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3354 */
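/* Worked example (assuming align = 512): a 1000-byte write at offset 700
 * reads back the 512-byte blocks at offsets 512 and 1536, pads the qiov
 * with the unmodified head [512, 700) and tail [1700, 2048), and then
 * issues a single aligned write covering [512, 2048). */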
3355 tracked_request_begin(&req, bs, offset, bytes, true);
3356
3357 if (offset & (align - 1)) {
3358 QEMUIOVector head_qiov;
3359 struct iovec head_iov;
3360
3361 mark_request_serialising(&req, align);
3362 wait_serialising_requests(&req);
3363
3364 head_buf = qemu_blockalign(bs, align);
3365 head_iov = (struct iovec) {
3366 .iov_base = head_buf,
3367 .iov_len = align,
3368 };
3369 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3370
3371 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3372 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3373 align, &head_qiov, 0);
3374 if (ret < 0) {
3375 goto fail;
3376 }
3377 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3378
3379 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3380 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3381 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3382 use_local_qiov = true;
3383
3384 bytes += offset & (align - 1);
3385 offset = offset & ~(align - 1);
3386 }
3387
3388 if ((offset + bytes) & (align - 1)) {
3389 QEMUIOVector tail_qiov;
3390 struct iovec tail_iov;
3391 size_t tail_bytes;
3392 bool waited;
3393
3394 mark_request_serialising(&req, align);
3395 waited = wait_serialising_requests(&req);
3396 assert(!waited || !use_local_qiov);
3397
3398 tail_buf = qemu_blockalign(bs, align);
3399 tail_iov = (struct iovec) {
3400 .iov_base = tail_buf,
3401 .iov_len = align,
3402 };
3403 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3404
3405 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3406 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3407 align, &tail_qiov, 0);
3408 if (ret < 0) {
3409 goto fail;
3410 }
3411 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3412
3413 if (!use_local_qiov) {
3414 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3415 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3416 use_local_qiov = true;
3417 }
3418
3419 tail_bytes = (offset + bytes) & (align - 1);
3420 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3421
3422 bytes = ROUND_UP(bytes, align);
3423 }
3424
3425 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3426 use_local_qiov ? &local_qiov : qiov,
3427 flags);
3428
3429 fail:
3430 tracked_request_end(&req);
3431
3432 if (use_local_qiov) {
3433 qemu_iovec_destroy(&local_qiov);
3434 }
3435 qemu_vfree(head_buf);
3436 qemu_vfree(tail_buf);
3437
3438 return ret;
3439 }
3440
3441 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3442 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3443 BdrvRequestFlags flags)
3444 {
3445 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3446 return -EINVAL;
3447 }
3448
3449 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3450 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3451 }
3452
3453 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3454 int nb_sectors, QEMUIOVector *qiov)
3455 {
3456 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3457
3458 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3459 }
3460
3461 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3462 int64_t sector_num, int nb_sectors,
3463 BdrvRequestFlags flags)
3464 {
3465 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3466
3467 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3468 flags &= ~BDRV_REQ_MAY_UNMAP;
3469 }
3470
3471 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3472 BDRV_REQ_ZERO_WRITE | flags);
3473 }
3474
3475 /**
3476 * Truncate file to 'offset' bytes (needed only for file protocols)
3477 */
3478 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3479 {
3480 BlockDriver *drv = bs->drv;
3481 int ret;
3482 if (!drv)
3483 return -ENOMEDIUM;
3484 if (!drv->bdrv_truncate)
3485 return -ENOTSUP;
3486 if (bs->read_only)
3487 return -EACCES;
3488
3489 ret = drv->bdrv_truncate(bs, offset);
3490 if (ret == 0) {
3491 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3492 if (bs->blk) {
3493 blk_dev_resize_cb(bs->blk);
3494 }
3495 }
3496 return ret;
3497 }
3498
3499 /**
3500 * Length of an allocated file in bytes. Sparse files are counted by actual
3501 * allocated space. Return < 0 on error or if unknown.
3502 */
3503 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3504 {
3505 BlockDriver *drv = bs->drv;
3506 if (!drv) {
3507 return -ENOMEDIUM;
3508 }
3509 if (drv->bdrv_get_allocated_file_size) {
3510 return drv->bdrv_get_allocated_file_size(bs);
3511 }
3512 if (bs->file) {
3513 return bdrv_get_allocated_file_size(bs->file);
3514 }
3515 return -ENOTSUP;
3516 }
3517
3518 /**
3519 * Return number of sectors on success, -errno on error.
3520 */
3521 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3522 {
3523 BlockDriver *drv = bs->drv;
3524
3525 if (!drv)
3526 return -ENOMEDIUM;
3527
3528 if (drv->has_variable_length) {
3529 int ret = refresh_total_sectors(bs, bs->total_sectors);
3530 if (ret < 0) {
3531 return ret;
3532 }
3533 }
3534 return bs->total_sectors;
3535 }
3536
3537 /**
3538 * Return length in bytes on success, -errno on error.
3539 * The length is always a multiple of BDRV_SECTOR_SIZE.
3540 */
3541 int64_t bdrv_getlength(BlockDriverState *bs)
3542 {
3543 int64_t ret = bdrv_nb_sectors(bs);
3544
3545 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3546 }
3547
3548 /* return 0 as number of sectors if no device present or error */
3549 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3550 {
3551 int64_t nb_sectors = bdrv_nb_sectors(bs);
3552
3553 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3554 }
3555
3556 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3557 BlockdevOnError on_write_error)
3558 {
3559 bs->on_read_error = on_read_error;
3560 bs->on_write_error = on_write_error;
3561 }
3562
3563 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3564 {
3565 return is_read ? bs->on_read_error : bs->on_write_error;
3566 }
3567
3568 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3569 {
3570 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3571
3572 switch (on_err) {
3573 case BLOCKDEV_ON_ERROR_ENOSPC:
3574 return (error == ENOSPC) ?
3575 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3576 case BLOCKDEV_ON_ERROR_STOP:
3577 return BLOCK_ERROR_ACTION_STOP;
3578 case BLOCKDEV_ON_ERROR_REPORT:
3579 return BLOCK_ERROR_ACTION_REPORT;
3580 case BLOCKDEV_ON_ERROR_IGNORE:
3581 return BLOCK_ERROR_ACTION_IGNORE;
3582 default:
3583 abort();
3584 }
3585 }
3586
3587 static void send_qmp_error_event(BlockDriverState *bs,
3588 BlockErrorAction action,
3589 bool is_read, int error)
3590 {
3591 IoOperationType optype;
3592
3593 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3594 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3595 bdrv_iostatus_is_enabled(bs),
3596 error == ENOSPC, strerror(error),
3597 &error_abort);
3598 }
3599
3600 /* This is done by device models because, while the block layer knows
3601 * about the error, it does not know whether an operation comes from
3602 * the device or the block layer (from a job, for example).
3603 */
3604 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3605 bool is_read, int error)
3606 {
3607 assert(error >= 0);
3608
3609 if (action == BLOCK_ERROR_ACTION_STOP) {
3610 /* First set the iostatus, so that "info block" returns an iostatus
3611 * that matches the events raised so far (an additional error iostatus
3612 * is fine, but not a lost one).
3613 */
3614 bdrv_iostatus_set_err(bs, error);
3615
3616 /* Then raise the request to stop the VM and the event.
3617 * qemu_system_vmstop_request_prepare has two effects. First,
3618 * it ensures that the STOP event always comes after the
3619 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3620 * can observe the STOP event and do a "cont" before the STOP
3621 * event is issued, the VM will not stop. In this case, vm_start()
3622 * also ensures that the STOP/RESUME pair of events is emitted.
3623 */
3624 qemu_system_vmstop_request_prepare();
3625 send_qmp_error_event(bs, action, is_read, error);
3626 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3627 } else {
3628 send_qmp_error_event(bs, action, is_read, error);
3629 }
3630 }
3631
3632 int bdrv_is_read_only(BlockDriverState *bs)
3633 {
3634 return bs->read_only;
3635 }
3636
3637 int bdrv_is_sg(BlockDriverState *bs)
3638 {
3639 return bs->sg;
3640 }
3641
3642 int bdrv_enable_write_cache(BlockDriverState *bs)
3643 {
3644 return bs->enable_write_cache;
3645 }
3646
3647 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3648 {
3649 bs->enable_write_cache = wce;
3650
3651 /* so a reopen() will preserve wce */
3652 if (wce) {
3653 bs->open_flags |= BDRV_O_CACHE_WB;
3654 } else {
3655 bs->open_flags &= ~BDRV_O_CACHE_WB;
3656 }
3657 }
3658
3659 int bdrv_is_encrypted(BlockDriverState *bs)
3660 {
3661 if (bs->backing_hd && bs->backing_hd->encrypted)
3662 return 1;
3663 return bs->encrypted;
3664 }
3665
3666 int bdrv_key_required(BlockDriverState *bs)
3667 {
3668 BlockDriverState *backing_hd = bs->backing_hd;
3669
3670 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3671 return 1;
3672 return (bs->encrypted && !bs->valid_key);
3673 }
3674
3675 int bdrv_set_key(BlockDriverState *bs, const char *key)
3676 {
3677 int ret;
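/* An encrypted backing file needs its key set first; if only the
 * backing file is encrypted, we are done after that. */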
3678 if (bs->backing_hd && bs->backing_hd->encrypted) {
3679 ret = bdrv_set_key(bs->backing_hd, key);
3680 if (ret < 0)
3681 return ret;
3682 if (!bs->encrypted)
3683 return 0;
3684 }
3685 if (!bs->encrypted) {
3686 return -EINVAL;
3687 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3688 return -ENOMEDIUM;
3689 }
3690 ret = bs->drv->bdrv_set_key(bs, key);
3691 if (ret < 0) {
3692 bs->valid_key = 0;
3693 } else if (!bs->valid_key) {
3694 bs->valid_key = 1;
3695 if (bs->blk) {
3696 /* call the change callback now, we skipped it on open */
3697 blk_dev_change_media_cb(bs->blk, true);
3698 }
3699 }
3700 return ret;
3701 }
3702
3703 const char *bdrv_get_format_name(BlockDriverState *bs)
3704 {
3705 return bs->drv ? bs->drv->format_name : NULL;
3706 }
3707
3708 static int qsort_strcmp(const void *a, const void *b)
3709 {
3710 return strcmp(a, b);
3711 }
3712
3713 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3714 void *opaque)
3715 {
3716 BlockDriver *drv;
3717 int count = 0;
3718 int i;
3719 const char **formats = NULL;
3720
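/* Collect the driver format names, dropping duplicates, then sort them
 * before handing them to the callback */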
3721 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3722 if (drv->format_name) {
3723 bool found = false;
3724 int i = count;
3725 while (formats && i && !found) {
3726 found = !strcmp(formats[--i], drv->format_name);
3727 }
3728
3729 if (!found) {
3730 formats = g_renew(const char *, formats, count + 1);
3731 formats[count++] = drv->format_name;
3732 }
3733 }
3734 }
3735
3736 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3737
3738 for (i = 0; i < count; i++) {
3739 it(opaque, formats[i]);
3740 }
3741
3742 g_free(formats);
3743 }
3744
3745 /* Find the BlockDriverState attached to the named block backend */
3746 /* TODO convert callers to blk_by_name(), then remove */
3747 BlockDriverState *bdrv_find(const char *name)
3748 {
3749 BlockBackend *blk = blk_by_name(name);
3750
3751 return blk ? blk_bs(blk) : NULL;
3752 }
3753
3754 /* Find a node in the BDS graph by its node name */
3755 BlockDriverState *bdrv_find_node(const char *node_name)
3756 {
3757 BlockDriverState *bs;
3758
3759 assert(node_name);
3760
3761 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3762 if (!strcmp(node_name, bs->node_name)) {
3763 return bs;
3764 }
3765 }
3766 return NULL;
3767 }
3768
3769 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3770 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3771 {
3772 BlockDeviceInfoList *list, *entry;
3773 BlockDriverState *bs;
3774
3775 list = NULL;
3776 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3777 entry = g_malloc0(sizeof(*entry));
3778 entry->value = bdrv_block_device_info(bs);
3779 entry->next = list;
3780 list = entry;
3781 }
3782
3783 return list;
3784 }
3785
3786 BlockDriverState *bdrv_lookup_bs(const char *device,
3787 const char *node_name,
3788 Error **errp)
3789 {
3790 BlockBackend *blk;
3791 BlockDriverState *bs;
3792
3793 if (device) {
3794 blk = blk_by_name(device);
3795
3796 if (blk) {
3797 return blk_bs(blk);
3798 }
3799 }
3800
3801 if (node_name) {
3802 bs = bdrv_find_node(node_name);
3803
3804 if (bs) {
3805 return bs;
3806 }
3807 }
3808
3809 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3810 device ? device : "",
3811 node_name ? node_name : "");
3812 return NULL;
3813 }
3814
3815 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3816 * return false. If either argument is NULL, return false. */
3817 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3818 {
3819 while (top && top != base) {
3820 top = top->backing_hd;
3821 }
3822
3823 return top != NULL;
3824 }
3825
3826 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3827 {
3828 if (!bs) {
3829 return QTAILQ_FIRST(&graph_bdrv_states);
3830 }
3831 return QTAILQ_NEXT(bs, node_list);
3832 }
3833
3834 BlockDriverState *bdrv_next(BlockDriverState *bs)
3835 {
3836 if (!bs) {
3837 return QTAILQ_FIRST(&bdrv_states);
3838 }
3839 return QTAILQ_NEXT(bs, device_list);
3840 }
3841
3842 const char *bdrv_get_node_name(const BlockDriverState *bs)
3843 {
3844 return bs->node_name;
3845 }
3846
3847 /* TODO check what callers really want: bs->node_name or blk_name() */
3848 const char *bdrv_get_device_name(const BlockDriverState *bs)
3849 {
3850 return bs->blk ? blk_name(bs->blk) : "";
3851 }
3852
3853 int bdrv_get_flags(BlockDriverState *bs)
3854 {
3855 return bs->open_flags;
3856 }
3857
3858 int bdrv_flush_all(void)
3859 {
3860 BlockDriverState *bs;
3861 int result = 0;
3862
3863 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3864 AioContext *aio_context = bdrv_get_aio_context(bs);
3865 int ret;
3866
3867 aio_context_acquire(aio_context);
3868 ret = bdrv_flush(bs);
3869 if (ret < 0 && !result) {
3870 result = ret;
3871 }
3872 aio_context_release(aio_context);
3873 }
3874
3875 return result;
3876 }
3877
3878 int bdrv_has_zero_init_1(BlockDriverState *bs)
3879 {
3880 return 1;
3881 }
3882
3883 int bdrv_has_zero_init(BlockDriverState *bs)
3884 {
3885 assert(bs->drv);
3886
3887 /* If BS is a copy on write image, it is initialized to
3888 the contents of the base image, which may not be zeroes. */
3889 if (bs->backing_hd) {
3890 return 0;
3891 }
3892 if (bs->drv->bdrv_has_zero_init) {
3893 return bs->drv->bdrv_has_zero_init(bs);
3894 }
3895
3896 /* safe default */
3897 return 0;
3898 }
3899
3900 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3901 {
3902 BlockDriverInfo bdi;
3903
3904 if (bs->backing_hd) {
3905 return false;
3906 }
3907
3908 if (bdrv_get_info(bs, &bdi) == 0) {
3909 return bdi.unallocated_blocks_are_zero;
3910 }
3911
3912 return false;
3913 }
3914
3915 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3916 {
3917 BlockDriverInfo bdi;
3918
3919 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3920 return false;
3921 }
3922
3923 if (bdrv_get_info(bs, &bdi) == 0) {
3924 return bdi.can_write_zeroes_with_unmap;
3925 }
3926
3927 return false;
3928 }
3929
3930 typedef struct BdrvCoGetBlockStatusData {
3931 BlockDriverState *bs;
3932 BlockDriverState *base;
3933 int64_t sector_num;
3934 int nb_sectors;
3935 int *pnum;
3936 int64_t ret;
3937 bool done;
3938 } BdrvCoGetBlockStatusData;
3939
3940 /*
3941 * Returns the allocation status of the specified sectors.
3942 * Drivers not implementing the functionality are assumed to not support
3943 * backing files, hence all their sectors are reported as allocated.
3944 *
3945 * If 'sector_num' is beyond the end of the disk image the return value is 0
3946 * and 'pnum' is set to 0.
3947 *
3948 * 'pnum' is set to the number of sectors (including and immediately following
3949 * the specified sector) that are known to be in the same
3950 * allocated/unallocated state.
3951 *
3952 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3953 * beyond the end of the disk image it will be clamped.
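 *
 * The return value is a bitmask of BDRV_BLOCK_* flags; when
 * BDRV_BLOCK_OFFSET_VALID is set, the host offset of the data is encoded
 * in the upper bits of the return value.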
3954 */
3955 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3956 int64_t sector_num,
3957 int nb_sectors, int *pnum)
3958 {
3959 int64_t total_sectors;
3960 int64_t n;
3961 int64_t ret, ret2;
3962
3963 total_sectors = bdrv_nb_sectors(bs);
3964 if (total_sectors < 0) {
3965 return total_sectors;
3966 }
3967
3968 if (sector_num >= total_sectors) {
3969 *pnum = 0;
3970 return 0;
3971 }
3972
3973 n = total_sectors - sector_num;
3974 if (n < nb_sectors) {
3975 nb_sectors = n;
3976 }
3977
3978 if (!bs->drv->bdrv_co_get_block_status) {
3979 *pnum = nb_sectors;
3980 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3981 if (bs->drv->protocol_name) {
3982 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3983 }
3984 return ret;
3985 }
3986
3987 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3988 if (ret < 0) {
3989 *pnum = 0;
3990 return ret;
3991 }
3992
3993 if (ret & BDRV_BLOCK_RAW) {
3994 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3995 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3996 *pnum, pnum);
3997 }
3998
3999 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4000 ret |= BDRV_BLOCK_ALLOCATED;
4001 }
4002
4003 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4004 if (bdrv_unallocated_blocks_are_zero(bs)) {
4005 ret |= BDRV_BLOCK_ZERO;
4006 } else if (bs->backing_hd) {
4007 BlockDriverState *bs2 = bs->backing_hd;
4008 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4009 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4010 ret |= BDRV_BLOCK_ZERO;
4011 }
4012 }
4013 }
4014
4015 if (bs->file &&
4016 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4017 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4018 int file_pnum;
4019
4020 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4021 *pnum, &file_pnum);
4022 if (ret2 >= 0) {
4023 /* Ignore errors. This is just providing extra information; it
4024 * is useful but not necessary.
4025 */
4026 if (!file_pnum) {
4027 /* !file_pnum indicates an offset at or beyond the EOF; it is
4028 * perfectly valid for the format block driver to point to such
4029 * offsets, so catch it and mark everything as zero */
4030 ret |= BDRV_BLOCK_ZERO;
4031 } else {
4032 /* Limit request to the range reported by the protocol driver */
4033 *pnum = file_pnum;
4034 ret |= (ret2 & BDRV_BLOCK_ZERO);
4035 }
4036 }
4037 }
4038
4039 return ret;
4040 }
4041
4042 /* Coroutine wrapper for bdrv_get_block_status() */
4043 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4044 {
4045 BdrvCoGetBlockStatusData *data = opaque;
4046 BlockDriverState *bs = data->bs;
4047
4048 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4049 data->pnum);
4050 data->done = true;
4051 }
4052
4053 /*
4054 * Synchronous wrapper around bdrv_co_get_block_status().
4055 *
4056 * See bdrv_co_get_block_status() for details.
4057 */
4058 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4059 int nb_sectors, int *pnum)
4060 {
4061 Coroutine *co;
4062 BdrvCoGetBlockStatusData data = {
4063 .bs = bs,
4064 .sector_num = sector_num,
4065 .nb_sectors = nb_sectors,
4066 .pnum = pnum,
4067 .done = false,
4068 };
4069
4070 if (qemu_in_coroutine()) {
4071 /* Fast-path if already in coroutine context */
4072 bdrv_get_block_status_co_entry(&data);
4073 } else {
4074 AioContext *aio_context = bdrv_get_aio_context(bs);
4075
4076 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4077 qemu_coroutine_enter(co, &data);
4078 while (!data.done) {
4079 aio_poll(aio_context, true);
4080 }
4081 }
4082 return data.ret;
4083 }
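
/* Illustrative sketch (hypothetical helper, not part of this file's API):
 * walking an image with the synchronous wrapper above and decoding the
 * returned flags. The chunk size of 65536 sectors is an arbitrary choice. */
#if 0
static void example_dump_block_status(BlockDriverState *bs)
{
    int64_t total = bdrv_nb_sectors(bs);
    int64_t sector = 0;

    while (sector < total) {
        int pnum;
        int nb = total - sector < 65536 ? (int)(total - sector) : 65536;
        int64_t ret = bdrv_get_block_status(bs, sector, nb, &pnum);

        if (ret < 0 || pnum == 0) {
            break;
        }
        printf("%" PRId64 "+%d:%s%s%s\n", sector, pnum,
               ret & BDRV_BLOCK_DATA ? " data" : "",
               ret & BDRV_BLOCK_ZERO ? " zero" : "",
               ret & BDRV_BLOCK_ALLOCATED ? " allocated" : "");
        sector += pnum;
    }
}
#endif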
4084
4085 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4086 int nb_sectors, int *pnum)
4087 {
4088 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4089 if (ret < 0) {
4090 return ret;
4091 }
4092 return !!(ret & BDRV_BLOCK_ALLOCATED);
4093 }
4094
4095 /*
4096 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4097 *
4098 * Return true if the given sector is allocated in any image between
4099 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4100 * sector is allocated in any image of the chain. Return false otherwise.
4101 *
4102 * 'pnum' is set to the number of sectors (including and immediately following
4103 * the specified sector) that are known to be in the same
4104 * allocated/unallocated state.
4105 *
4106 */
4107 int bdrv_is_allocated_above(BlockDriverState *top,
4108 BlockDriverState *base,
4109 int64_t sector_num,
4110 int nb_sectors, int *pnum)
4111 {
4112 BlockDriverState *intermediate;
4113 int ret, n = nb_sectors;
4114
4115 intermediate = top;
4116 while (intermediate && intermediate != base) {
4117 int pnum_inter;
4118 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4119 &pnum_inter);
4120 if (ret < 0) {
4121 return ret;
4122 } else if (ret) {
4123 *pnum = pnum_inter;
4124 return 1;
4125 }
4126
4127 /*
4128 * [sector_num, nb_sectors] is unallocated on top but intermediate
4129 * might have
4130 *
4131 * [sector_num+x, nb_sectors-x] allocated.
4132 */
4133 if (n > pnum_inter &&
4134 (intermediate == top ||
4135 sector_num + pnum_inter < intermediate->total_sectors)) {
4136 n = pnum_inter;
4137 }
4138
4139 intermediate = intermediate->backing_hd;
4140 }
4141
4142 *pnum = n;
4143 return 0;
4144 }
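
/* Illustrative sketch (hypothetical helper): a commit-style loop that only
 * copies ranges which some image above "base" actually provides, using the
 * function above to skip over unallocated holes. */
#if 0
static int example_scan_above(BlockDriverState *top, BlockDriverState *base)
{
    int64_t total = bdrv_nb_sectors(top);
    int64_t sector = 0;

    while (sector < total) {
        int pnum;
        int nb = total - sector < 65536 ? (int)(total - sector) : 65536;
        int ret = bdrv_is_allocated_above(top, base, sector, nb, &pnum);

        if (ret < 0) {
            return ret;
        }
        if (ret) {
            /* [sector, sector + pnum) is allocated above base: copy it */
        }
        sector += pnum;
    }
    return 0;
}
#endif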
4145
4146 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4147 {
4148 if (bs->backing_hd && bs->backing_hd->encrypted)
4149 return bs->backing_file;
4150 else if (bs->encrypted)
4151 return bs->filename;
4152 else
4153 return NULL;
4154 }
4155
4156 void bdrv_get_backing_filename(BlockDriverState *bs,
4157 char *filename, int filename_size)
4158 {
4159 pstrcpy(filename, filename_size, bs->backing_file);
4160 }
4161
4162 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4163 const uint8_t *buf, int nb_sectors)
4164 {
4165 BlockDriver *drv = bs->drv;
4166 if (!drv)
4167 return -ENOMEDIUM;
4168 if (!drv->bdrv_write_compressed)
4169 return -ENOTSUP;
4170 if (bdrv_check_request(bs, sector_num, nb_sectors))
4171 return -EIO;
4172
4173 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4174
4175 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4176 }
4177
4178 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4179 {
4180 BlockDriver *drv = bs->drv;
4181 if (!drv)
4182 return -ENOMEDIUM;
4183 if (!drv->bdrv_get_info)
4184 return -ENOTSUP;
4185 memset(bdi, 0, sizeof(*bdi));
4186 return drv->bdrv_get_info(bs, bdi);
4187 }
4188
4189 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4190 {
4191 BlockDriver *drv = bs->drv;
4192 if (drv && drv->bdrv_get_specific_info) {
4193 return drv->bdrv_get_specific_info(bs);
4194 }
4195 return NULL;
4196 }
4197
4198 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4199 int64_t pos, int size)
4200 {
4201 QEMUIOVector qiov;
4202 struct iovec iov = {
4203 .iov_base = (void *) buf,
4204 .iov_len = size,
4205 };
4206
4207 qemu_iovec_init_external(&qiov, &iov, 1);
4208 return bdrv_writev_vmstate(bs, &qiov, pos);
4209 }
4210
4211 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4212 {
4213 BlockDriver *drv = bs->drv;
4214
4215 if (!drv) {
4216 return -ENOMEDIUM;
4217 } else if (drv->bdrv_save_vmstate) {
4218 return drv->bdrv_save_vmstate(bs, qiov, pos);
4219 } else if (bs->file) {
4220 return bdrv_writev_vmstate(bs->file, qiov, pos);
4221 }
4222
4223 return -ENOTSUP;
4224 }
4225
4226 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4227 int64_t pos, int size)
4228 {
4229 BlockDriver *drv = bs->drv;
4230 if (!drv)
4231 return -ENOMEDIUM;
4232 if (drv->bdrv_load_vmstate)
4233 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4234 if (bs->file)
4235 return bdrv_load_vmstate(bs->file, buf, pos, size);
4236 return -ENOTSUP;
4237 }
4238
4239 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4240 {
4241 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4242 return;
4243 }
4244
4245 bs->drv->bdrv_debug_event(bs, event);
4246 }
4247
4248 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4249 const char *tag)
4250 {
4251 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4252 bs = bs->file;
4253 }
4254
4255 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4256 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4257 }
4258
4259 return -ENOTSUP;
4260 }
4261
4262 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4263 {
4264 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4265 bs = bs->file;
4266 }
4267
4268 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4269 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4270 }
4271
4272 return -ENOTSUP;
4273 }
4274
4275 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4276 {
4277 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4278 bs = bs->file;
4279 }
4280
4281 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4282 return bs->drv->bdrv_debug_resume(bs, tag);
4283 }
4284
4285 return -ENOTSUP;
4286 }
4287
4288 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4289 {
4290 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4291 bs = bs->file;
4292 }
4293
4294 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4295 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4296 }
4297
4298 return false;
4299 }
4300
4301 int bdrv_is_snapshot(BlockDriverState *bs)
4302 {
4303 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4304 }
4305
4306 /* backing_file can be relative, absolute, or a protocol. If it is
4307 * relative, it must be relative to the chain. So, passing in bs->filename
4308 * from a BDS as backing_file should not be done, as that may be relative to
4309 * the CWD rather than the chain. */
4310 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4311 const char *backing_file)
4312 {
4313 char *filename_full = NULL;
4314 char *backing_file_full = NULL;
4315 char *filename_tmp = NULL;
4316 int is_protocol = 0;
4317 BlockDriverState *curr_bs = NULL;
4318 BlockDriverState *retval = NULL;
4319
4320 if (!bs || !bs->drv || !backing_file) {
4321 return NULL;
4322 }
4323
4324 filename_full = g_malloc(PATH_MAX);
4325 backing_file_full = g_malloc(PATH_MAX);
4326 filename_tmp = g_malloc(PATH_MAX);
4327
4328 is_protocol = path_has_protocol(backing_file);
4329
4330 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4331
4332 /* If either of the filename paths is actually a protocol, then
4333 * compare unmodified paths; otherwise make paths relative */
4334 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4335 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4336 retval = curr_bs->backing_hd;
4337 break;
4338 }
4339 } else {
4340 /* If not an absolute filename path, make it relative to the current
4341 * image's filename path */
4342 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4343 backing_file);
4344
4345 /* We are going to compare absolute pathnames */
4346 if (!realpath(filename_tmp, filename_full)) {
4347 continue;
4348 }
4349
4350 /* We need to make sure the backing filename we are comparing against
4351 * is relative to the current image filename (or absolute) */
4352 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4353 curr_bs->backing_file);
4354
4355 if (!realpath(filename_tmp, backing_file_full)) {
4356 continue;
4357 }
4358
4359 if (strcmp(backing_file_full, filename_full) == 0) {
4360 retval = curr_bs->backing_hd;
4361 break;
4362 }
4363 }
4364 }
4365
4366 g_free(filename_full);
4367 g_free(backing_file_full);
4368 g_free(filename_tmp);
4369 return retval;
4370 }
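
/* Worked example: for a chain /a/top.qcow2 -> /a/mid.qcow2 recorded with
 * backing_file "mid.qcow2" (hypothetical paths), a lookup of "mid.qcow2"
 * is combined with /a/top.qcow2 into /a/mid.qcow2; that realpath matches
 * the combined-and-resolved backing file of top, so the BDS of
 * /a/mid.qcow2 is returned. */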
4371
4372 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4373 {
4374 if (!bs->drv) {
4375 return 0;
4376 }
4377
4378 if (!bs->backing_hd) {
4379 return 0;
4380 }
4381
4382 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4383 }
4384
4385 /**************************************************************/
4386 /* async I/Os */
4387
4388 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4389 QEMUIOVector *qiov, int nb_sectors,
4390 BlockCompletionFunc *cb, void *opaque)
4391 {
4392 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4393
4394 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4395 cb, opaque, false);
4396 }
4397
4398 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4399 QEMUIOVector *qiov, int nb_sectors,
4400 BlockCompletionFunc *cb, void *opaque)
4401 {
4402 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4403
4404 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4405 cb, opaque, true);
4406 }
4407
4408 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4409 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4410 BlockCompletionFunc *cb, void *opaque)
4411 {
4412 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4413
4414 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4415 BDRV_REQ_ZERO_WRITE | flags,
4416 cb, opaque, true);
4417 }
4418
4419
4420 typedef struct MultiwriteCB {
4421 int error;
4422 int num_requests;
4423 int num_callbacks;
4424 struct {
4425 BlockCompletionFunc *cb;
4426 void *opaque;
4427 QEMUIOVector *free_qiov;
4428 } callbacks[];
4429 } MultiwriteCB;
4430
4431 static void multiwrite_user_cb(MultiwriteCB *mcb)
4432 {
4433 int i;
4434
4435 for (i = 0; i < mcb->num_callbacks; i++) {
4436 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4437 if (mcb->callbacks[i].free_qiov) {
4438 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4439 }
4440 g_free(mcb->callbacks[i].free_qiov);
4441 }
4442 }
4443
4444 static void multiwrite_cb(void *opaque, int ret)
4445 {
4446 MultiwriteCB *mcb = opaque;
4447
4448 trace_multiwrite_cb(mcb, ret);
4449
4450 if (ret < 0 && !mcb->error) {
4451 mcb->error = ret;
4452 }
4453
4454 mcb->num_requests--;
4455 if (mcb->num_requests == 0) {
4456 multiwrite_user_cb(mcb);
4457 g_free(mcb);
4458 }
4459 }
4460
4461 static int multiwrite_req_compare(const void *a, const void *b)
4462 {
4463 const BlockRequest *req1 = a, *req2 = b;
4464
4465 /*
4466 * Note that we can't simply subtract req2->sector from req1->sector
4467 * here as that could overflow the return value.
4468 */
4469 if (req1->sector > req2->sector) {
4470 return 1;
4471 } else if (req1->sector < req2->sector) {
4472 return -1;
4473 } else {
4474 return 0;
4475 }
4476 }
4477
4478 /*
4479 * Takes a bunch of requests and tries to merge them. Returns the number of
4480 * requests that remain after merging.
4481 */
4482 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4483 int num_reqs, MultiwriteCB *mcb)
4484 {
4485 int i, outidx;
4486
4487 // Sort requests by start sector
4488 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4489
4490 // Check if adjacent requests are exactly sequential or overlapping.
4491 // If so, combine them into a single request.
4492 outidx = 0;
4493 for (i = 1; i < num_reqs; i++) {
4494 int merge = 0;
4495 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4496
4497 // Handle exactly sequential writes and overlapping writes.
4498 if (reqs[i].sector <= oldreq_last) {
4499 merge = 1;
4500 }
4501
4502 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4503 merge = 0;
4504 }
4505
4506 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4507 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4508 merge = 0;
4509 }
4510
4511 if (merge) {
4512 size_t size;
4513 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4514 qemu_iovec_init(qiov,
4515 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4516
4517 // Add the first request to the merged one. If the requests are
4518 // overlapping, drop the last sectors of the first request.
4519 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4520 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4521
4522 // We should not need to add any zeros between the two requests
4523 assert(reqs[i].sector <= oldreq_last);
4524
4525 // Add the second request
4526 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4527
4528 // Add tail of first request, if necessary
4529 if (qiov->size < reqs[outidx].qiov->size) {
4530 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4531 reqs[outidx].qiov->size - qiov->size);
4532 }
4533
4534 reqs[outidx].nb_sectors = qiov->size >> 9;
4535 reqs[outidx].qiov = qiov;
4536
4537 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4538 } else {
4539 outidx++;
4540 reqs[outidx].sector = reqs[i].sector;
4541 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4542 reqs[outidx].qiov = reqs[i].qiov;
4543 }
4544 }
4545
4546 return outidx + 1;
4547 }
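
/* Worked example: two requests of 8 sectors each, at sectors 0 and 4, sort
 * into that order; 4 <= oldreq_last (8), so they merge. The merged qiov
 * takes the first 4 sectors of the first request plus all 8 of the second
 * (overlapping sectors come from the later request), yielding one request
 * of 12 sectors at sector 0 with no tail left to re-append. */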
4548
4549 /*
4550 * Submit multiple AIO write requests at once.
4551 *
4552 * On success, the function returns 0 and all requests in the reqs array have
4553 * been submitted. In the error case this function returns -1, and any of the
4554 * requests may or may not be submitted yet. In particular, this means that the
4555 * callback will be called for some of the requests but not for others. The
4556 * caller must check the error field of the BlockRequest to wait for the right
4557 * callbacks (if error != 0, no callback will be called).
4558 *
4559 * The implementation may modify the contents of the reqs array, e.g. to merge
4560 * requests. However, the fields opaque and error are left unmodified as they
4561 * are used to signal failure for a single request to the caller.
4562 */
4563 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4564 {
4565 MultiwriteCB *mcb;
4566 int i;
4567
4568 /* don't submit writes if we don't have a medium */
4569 if (bs->drv == NULL) {
4570 for (i = 0; i < num_reqs; i++) {
4571 reqs[i].error = -ENOMEDIUM;
4572 }
4573 return -1;
4574 }
4575
4576 if (num_reqs == 0) {
4577 return 0;
4578 }
4579
4580 // Create MultiwriteCB structure
4581 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4582 mcb->num_requests = 0;
4583 mcb->num_callbacks = num_reqs;
4584
4585 for (i = 0; i < num_reqs; i++) {
4586 mcb->callbacks[i].cb = reqs[i].cb;
4587 mcb->callbacks[i].opaque = reqs[i].opaque;
4588 }
4589
4590 // Check for mergeable requests
4591 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4592
4593 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4594
4595 /* Run the aio requests. */
4596 mcb->num_requests = num_reqs;
4597 for (i = 0; i < num_reqs; i++) {
4598 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4599 reqs[i].nb_sectors, reqs[i].flags,
4600 multiwrite_cb, mcb,
4601 true);
4602 }
4603
4604 return 0;
4605 }
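
/* Illustrative sketch (hypothetical names): preparing two requests for the
 * function above. Only the fields consumed by bdrv_aio_multiwrite() are
 * filled in; each qiov is assumed to be initialized by the caller. */
#if 0
static void example_multiwrite(BlockDriverState *bs,
                               QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                               BlockCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = (int)(qiov0->size >> 9),
          .qiov = qiov0, .cb = cb, .opaque = opaque },
        { .sector = 64, .nb_sectors = (int)(qiov1->size >> 9),
          .qiov = qiov1, .cb = cb, .opaque = opaque },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* check reqs[i].error: requests with error != 0 get no callback */
    }
}
#endif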
4606
4607 void bdrv_aio_cancel(BlockAIOCB *acb)
4608 {
4609 qemu_aio_ref(acb);
4610 bdrv_aio_cancel_async(acb);
4611 while (acb->refcnt > 1) {
4612 if (acb->aiocb_info->get_aio_context) {
4613 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4614 } else if (acb->bs) {
4615 aio_poll(bdrv_get_aio_context(acb->bs), true);
4616 } else {
4617 abort();
4618 }
4619 }
4620 qemu_aio_unref(acb);
4621 }
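
/* Usage sketch (assuming "bs", a one-sector "qiov" and a callback "cb"
 * exist elsewhere): the synchronous variant is typically used on device
 * reset paths.
 *
 *     BlockAIOCB *acb = bdrv_aio_readv(bs, 0, qiov, 1, cb, NULL);
 *     bdrv_aio_cancel(acb);
 *
 * By the time bdrv_aio_cancel() returns, cb has been invoked (with the
 * request's actual result or an error). */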
4622
4623 /* Async version of aio cancel. The caller is not blocked if the acb implements
4624 * cancel_async; otherwise we do nothing and let the request complete normally.
4625 * In either case the completion callback must be called. */
4626 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4627 {
4628 if (acb->aiocb_info->cancel_async) {
4629 acb->aiocb_info->cancel_async(acb);
4630 }
4631 }
4632
4633 /**************************************************************/
4634 /* async block device emulation */
4635
4636 typedef struct BlockAIOCBSync {
4637 BlockAIOCB common;
4638 QEMUBH *bh;
4639 int ret;
4640 /* vector translation state */
4641 QEMUIOVector *qiov;
4642 uint8_t *bounce;
4643 int is_write;
4644 } BlockAIOCBSync;
4645
4646 static const AIOCBInfo bdrv_em_aiocb_info = {
4647 .aiocb_size = sizeof(BlockAIOCBSync),
4648 };
4649
4650 static void bdrv_aio_bh_cb(void *opaque)
4651 {
4652 BlockAIOCBSync *acb = opaque;
4653
4654 if (!acb->is_write && acb->ret >= 0) {
4655 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4656 }
4657 qemu_vfree(acb->bounce);
4658 acb->common.cb(acb->common.opaque, acb->ret);
4659 qemu_bh_delete(acb->bh);
4660 acb->bh = NULL;
4661 qemu_aio_unref(acb);
4662 }
4663
4664 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4665 int64_t sector_num,
4666 QEMUIOVector *qiov,
4667 int nb_sectors,
4668 BlockCompletionFunc *cb,
4669 void *opaque,
4670 int is_write)
4671
4672 {
4673 BlockAIOCBSync *acb;
4674
4675 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4676 acb->is_write = is_write;
4677 acb->qiov = qiov;
4678 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4679 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4680
4681 if (acb->bounce == NULL) {
4682 acb->ret = -ENOMEM;
4683 } else if (is_write) {
4684 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4685 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4686 } else {
4687 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4688 }
4689
4690 qemu_bh_schedule(acb->bh);
4691
4692 return &acb->common;
4693 }
4694
4695 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4696 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4697 BlockCompletionFunc *cb, void *opaque)
4698 {
4699 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4700 }
4701
4702 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4703 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4704 BlockCompletionFunc *cb, void *opaque)
4705 {
4706 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4707 }
4708
4709
4710 typedef struct BlockAIOCBCoroutine {
4711 BlockAIOCB common;
4712 BlockRequest req;
4713 bool is_write;
4714 bool *done;
4715 QEMUBH* bh;
4716 } BlockAIOCBCoroutine;
4717
4718 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4719 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4720 };
4721
4722 static void bdrv_co_em_bh(void *opaque)
4723 {
4724 BlockAIOCBCoroutine *acb = opaque;
4725
4726 acb->common.cb(acb->common.opaque, acb->req.error);
4727
4728 qemu_bh_delete(acb->bh);
4729 qemu_aio_unref(acb);
4730 }
4731
4732 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4733 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4734 {
4735 BlockAIOCBCoroutine *acb = opaque;
4736 BlockDriverState *bs = acb->common.bs;
4737
4738 if (!acb->is_write) {
4739 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4740 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4741 } else {
4742 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4743 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4744 }
4745
4746 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4747 qemu_bh_schedule(acb->bh);
4748 }
4749
4750 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4751 int64_t sector_num,
4752 QEMUIOVector *qiov,
4753 int nb_sectors,
4754 BdrvRequestFlags flags,
4755 BlockCompletionFunc *cb,
4756 void *opaque,
4757 bool is_write)
4758 {
4759 Coroutine *co;
4760 BlockAIOCBCoroutine *acb;
4761
4762 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4763 acb->req.sector = sector_num;
4764 acb->req.nb_sectors = nb_sectors;
4765 acb->req.qiov = qiov;
4766 acb->req.flags = flags;
4767 acb->is_write = is_write;
4768
4769 co = qemu_coroutine_create(bdrv_co_do_rw);
4770 qemu_coroutine_enter(co, acb);
4771
4772 return &acb->common;
4773 }
4774
4775 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4776 {
4777 BlockAIOCBCoroutine *acb = opaque;
4778 BlockDriverState *bs = acb->common.bs;
4779
4780 acb->req.error = bdrv_co_flush(bs);
4781 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4782 qemu_bh_schedule(acb->bh);
4783 }
4784
4785 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4786 BlockCompletionFunc *cb, void *opaque)
4787 {
4788 trace_bdrv_aio_flush(bs, opaque);
4789
4790 Coroutine *co;
4791 BlockAIOCBCoroutine *acb;
4792
4793 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4794
4795 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4796 qemu_coroutine_enter(co, acb);
4797
4798 return &acb->common;
4799 }
4800
4801 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4802 {
4803 BlockAIOCBCoroutine *acb = opaque;
4804 BlockDriverState *bs = acb->common.bs;
4805
4806 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4807 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4808 qemu_bh_schedule(acb->bh);
4809 }
4810
4811 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4812 int64_t sector_num, int nb_sectors,
4813 BlockCompletionFunc *cb, void *opaque)
4814 {
4815 Coroutine *co;
4816 BlockAIOCBCoroutine *acb;
4817
4818 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4819
4820 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4821 acb->req.sector = sector_num;
4822 acb->req.nb_sectors = nb_sectors;
4823 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4824 qemu_coroutine_enter(co, acb);
4825
4826 return &acb->common;
4827 }
4828
4829 void bdrv_init(void)
4830 {
4831 module_call_init(MODULE_INIT_BLOCK);
4832 }
4833
4834 void bdrv_init_with_whitelist(void)
4835 {
4836 use_bdrv_whitelist = 1;
4837 bdrv_init();
4838 }
4839
4840 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4841 BlockCompletionFunc *cb, void *opaque)
4842 {
4843 BlockAIOCB *acb;
4844
4845 acb = g_slice_alloc(aiocb_info->aiocb_size);
4846 acb->aiocb_info = aiocb_info;
4847 acb->bs = bs;
4848 acb->cb = cb;
4849 acb->opaque = opaque;
4850 acb->refcnt = 1;
4851 return acb;
4852 }
4853
4854 void qemu_aio_ref(void *p)
4855 {
4856 BlockAIOCB *acb = p;
4857 acb->refcnt++;
4858 }
4859
4860 void qemu_aio_unref(void *p)
4861 {
4862 BlockAIOCB *acb = p;
4863 assert(acb->refcnt > 0);
4864 if (--acb->refcnt == 0) {
4865 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4866 }
4867 }
4868
4869 /**************************************************************/
4870 /* Coroutine block device emulation */
4871
4872 typedef struct CoroutineIOCompletion {
4873 Coroutine *coroutine;
4874 int ret;
4875 } CoroutineIOCompletion;
4876
4877 static void bdrv_co_io_em_complete(void *opaque, int ret)
4878 {
4879 CoroutineIOCompletion *co = opaque;
4880
4881 co->ret = ret;
4882 qemu_coroutine_enter(co->coroutine, NULL);
4883 }
4884
4885 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4886 int nb_sectors, QEMUIOVector *iov,
4887 bool is_write)
4888 {
4889 CoroutineIOCompletion co = {
4890 .coroutine = qemu_coroutine_self(),
4891 };
4892 BlockAIOCB *acb;
4893
4894 if (is_write) {
4895 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4896 bdrv_co_io_em_complete, &co);
4897 } else {
4898 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4899 bdrv_co_io_em_complete, &co);
4900 }
4901
4902 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4903 if (!acb) {
4904 return -EIO;
4905 }
4906 qemu_coroutine_yield();
4907
4908 return co.ret;
4909 }
4910
4911 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4912 int64_t sector_num, int nb_sectors,
4913 QEMUIOVector *iov)
4914 {
4915 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4916 }
4917
4918 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4919 int64_t sector_num, int nb_sectors,
4920 QEMUIOVector *iov)
4921 {
4922 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4923 }
4924
4925 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4926 {
4927 RwCo *rwco = opaque;
4928
4929 rwco->ret = bdrv_co_flush(rwco->bs);
4930 }
4931
4932 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4933 {
4934 int ret;
4935
4936 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4937 return 0;
4938 }
4939
4940 /* Write back cached data to the OS even with cache=unsafe */
4941 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4942 if (bs->drv->bdrv_co_flush_to_os) {
4943 ret = bs->drv->bdrv_co_flush_to_os(bs);
4944 if (ret < 0) {
4945 return ret;
4946 }
4947 }
4948
4949 /* But don't actually force it to the disk with cache=unsafe */
4950 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4951 goto flush_parent;
4952 }
4953
4954 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4955 if (bs->drv->bdrv_co_flush_to_disk) {
4956 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4957 } else if (bs->drv->bdrv_aio_flush) {
4958 BlockAIOCB *acb;
4959 CoroutineIOCompletion co = {
4960 .coroutine = qemu_coroutine_self(),
4961 };
4962
4963 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4964 if (acb == NULL) {
4965 ret = -EIO;
4966 } else {
4967 qemu_coroutine_yield();
4968 ret = co.ret;
4969 }
4970 } else {
4971 /*
4972 * Some block drivers always operate in either writethrough or unsafe
4973 * mode, and therefore don't support bdrv_flush. Usually qemu doesn't
4974 * know how the server works (because the behaviour is hardcoded or
4975 * depends on server-side configuration), so we can't ensure that
4976 * everything is safe on disk. Returning an error doesn't work because
4977 * that would break guests even if the server operates in writethrough
4978 * mode.
4979 *
4980 * Let's hope the user knows what they're doing.
4981 */
4982 ret = 0;
4983 }
4984 if (ret < 0) {
4985 return ret;
4986 }
4987
4988 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4989 * in the case of cache=unsafe, so there are no useless flushes.
4990 */
4991 flush_parent:
4992 return bdrv_co_flush(bs->file);
4993 }
4994
4995 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4996 {
4997 Error *local_err = NULL;
4998 int ret;
4999
5000 if (!bs->drv) {
5001 return;
5002 }
5003
5004 if (!(bs->open_flags & BDRV_O_INCOMING)) {
5005 return;
5006 }
5007 bs->open_flags &= ~BDRV_O_INCOMING;
5008
5009 if (bs->drv->bdrv_invalidate_cache) {
5010 bs->drv->bdrv_invalidate_cache(bs, &local_err);
5011 } else if (bs->file) {
5012 bdrv_invalidate_cache(bs->file, &local_err);
5013 }
5014 if (local_err) {
5015 error_propagate(errp, local_err);
5016 return;
5017 }
5018
5019 ret = refresh_total_sectors(bs, bs->total_sectors);
5020 if (ret < 0) {
5021 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5022 return;
5023 }
5024 }
5025
5026 void bdrv_invalidate_cache_all(Error **errp)
5027 {
5028 BlockDriverState *bs;
5029 Error *local_err = NULL;
5030
5031 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5032 AioContext *aio_context = bdrv_get_aio_context(bs);
5033
5034 aio_context_acquire(aio_context);
5035 bdrv_invalidate_cache(bs, &local_err);
5036 aio_context_release(aio_context);
5037 if (local_err) {
5038 error_propagate(errp, local_err);
5039 return;
5040 }
5041 }
5042 }
5043
5044 int bdrv_flush(BlockDriverState *bs)
5045 {
5046 Coroutine *co;
5047 RwCo rwco = {
5048 .bs = bs,
5049 .ret = NOT_DONE,
5050 };
5051
5052 if (qemu_in_coroutine()) {
5053 /* Fast-path if already in coroutine context */
5054 bdrv_flush_co_entry(&rwco);
5055 } else {
5056 AioContext *aio_context = bdrv_get_aio_context(bs);
5057
5058 co = qemu_coroutine_create(bdrv_flush_co_entry);
5059 qemu_coroutine_enter(co, &rwco);
5060 while (rwco.ret == NOT_DONE) {
5061 aio_poll(aio_context, true);
5062 }
5063 }
5064
5065 return rwco.ret;
5066 }
5067
5068 typedef struct DiscardCo {
5069 BlockDriverState *bs;
5070 int64_t sector_num;
5071 int nb_sectors;
5072 int ret;
5073 } DiscardCo;
5074 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5075 {
5076 DiscardCo *rwco = opaque;
5077
5078 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5079 }
5080
5081 /* If no limit is specified in the BlockLimits, use a default
5082 * of 32768 512-byte sectors (16 MiB) per request.
5083 */
5084 #define MAX_DISCARD_DEFAULT 32768
5085
5086 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5087 int nb_sectors)
5088 {
5089 int max_discard;
5090
5091 if (!bs->drv) {
5092 return -ENOMEDIUM;
5093 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5094 return -EIO;
5095 } else if (bs->read_only) {
5096 return -EROFS;
5097 }
5098
5099 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5100
5101 /* Do nothing if disabled. */
5102 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5103 return 0;
5104 }
5105
5106 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5107 return 0;
5108 }
5109
5110 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5111 while (nb_sectors > 0) {
5112 int ret;
5113 int num = nb_sectors;
5114
5115 /* align request */
5116 if (bs->bl.discard_alignment &&
5117 num >= bs->bl.discard_alignment &&
5118 sector_num % bs->bl.discard_alignment) {
5119 if (num > bs->bl.discard_alignment) {
5120 num = bs->bl.discard_alignment;
5121 }
5122 num -= sector_num % bs->bl.discard_alignment;
5123 }
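/* Worked example: with discard_alignment == 8, sector_num == 10 and
 * num == 20, num is first clamped to 8 and then reduced by 10 % 8 == 2,
 * so this iteration discards 6 sectors and the next one starts at the
 * aligned sector 16. */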
5124
5125 /* limit request size */
5126 if (num > max_discard) {
5127 num = max_discard;
5128 }
5129
5130 if (bs->drv->bdrv_co_discard) {
5131 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5132 } else {
5133 BlockAIOCB *acb;
5134 CoroutineIOCompletion co = {
5135 .coroutine = qemu_coroutine_self(),
5136 };
5137
5138 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5139 bdrv_co_io_em_complete, &co);
5140 if (acb == NULL) {
5141 return -EIO;
5142 } else {
5143 qemu_coroutine_yield();
5144 ret = co.ret;
5145 }
5146 }
5147 if (ret && ret != -ENOTSUP) {
5148 return ret;
5149 }
5150
5151 sector_num += num;
5152 nb_sectors -= num;
5153 }
5154 return 0;
5155 }
5156
5157 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5158 {
5159 Coroutine *co;
5160 DiscardCo rwco = {
5161 .bs = bs,
5162 .sector_num = sector_num,
5163 .nb_sectors = nb_sectors,
5164 .ret = NOT_DONE,
5165 };
5166
5167 if (qemu_in_coroutine()) {
5168 /* Fast-path if already in coroutine context */
5169 bdrv_discard_co_entry(&rwco);
5170 } else {
5171 AioContext *aio_context = bdrv_get_aio_context(bs);
5172
5173 co = qemu_coroutine_create(bdrv_discard_co_entry);
5174 qemu_coroutine_enter(co, &rwco);
5175 while (rwco.ret == NOT_DONE) {
5176 aio_poll(aio_context, true);
5177 }
5178 }
5179
5180 return rwco.ret;
5181 }
5182
5183 /**************************************************************/
5184 /* removable device support */
5185
5186 /**
5187 * Return TRUE if the media is present
5188 */
5189 int bdrv_is_inserted(BlockDriverState *bs)
5190 {
5191 BlockDriver *drv = bs->drv;
5192
5193 if (!drv)
5194 return 0;
5195 if (!drv->bdrv_is_inserted)
5196 return 1;
5197 return drv->bdrv_is_inserted(bs);
5198 }
5199
5200 /**
5201 * Return whether the media changed since the last call to this
5202 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5203 */
5204 int bdrv_media_changed(BlockDriverState *bs)
5205 {
5206 BlockDriver *drv = bs->drv;
5207
5208 if (drv && drv->bdrv_media_changed) {
5209 return drv->bdrv_media_changed(bs);
5210 }
5211 return -ENOTSUP;
5212 }
5213
5214 /**
5215 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5216 */
5217 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5218 {
5219 BlockDriver *drv = bs->drv;
5220 const char *device_name;
5221
5222 if (drv && drv->bdrv_eject) {
5223 drv->bdrv_eject(bs, eject_flag);
5224 }
5225
5226 device_name = bdrv_get_device_name(bs);
5227 if (device_name[0] != '\0') {
5228 qapi_event_send_device_tray_moved(device_name,
5229 eject_flag, &error_abort);
5230 }
5231 }
5232
5233 /**
5234 * Lock or unlock the media (if it is locked, the user won't be able
5235 * to eject it manually).
5236 */
5237 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5238 {
5239 BlockDriver *drv = bs->drv;
5240
5241 trace_bdrv_lock_medium(bs, locked);
5242
5243 if (drv && drv->bdrv_lock_medium) {
5244 drv->bdrv_lock_medium(bs, locked);
5245 }
5246 }
5247
5248 /* needed for generic scsi interface */
5249
5250 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5251 {
5252 BlockDriver *drv = bs->drv;
5253
5254 if (drv && drv->bdrv_ioctl)
5255 return drv->bdrv_ioctl(bs, req, buf);
5256 return -ENOTSUP;
5257 }
5258
5259 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5260 unsigned long int req, void *buf,
5261 BlockCompletionFunc *cb, void *opaque)
5262 {
5263 BlockDriver *drv = bs->drv;
5264
5265 if (drv && drv->bdrv_aio_ioctl)
5266 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5267 return NULL;
5268 }
5269
5270 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5271 {
5272 bs->guest_block_size = align;
5273 }
5274
5275 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5276 {
5277 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5278 }
5279
5280 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5281 {
5282 return memset(qemu_blockalign(bs, size), 0, size);
5283 }
5284
5285 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5286 {
5287 size_t align = bdrv_opt_mem_align(bs);
5288
5289 /* Ensure that NULL is never returned on success */
5290 assert(align > 0);
5291 if (size == 0) {
5292 size = align;
5293 }
5294
5295 return qemu_try_memalign(align, size);
5296 }
5297
5298 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5299 {
5300 void *mem = qemu_try_blockalign(bs, size);
5301
5302 if (mem) {
5303 memset(mem, 0, size);
5304 }
5305
5306 return mem;
5307 }
5308
5309 /*
5310 * Check if all memory in this vector is aligned to bdrv_opt_mem_align(bs).
5311 */
5312 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5313 {
5314 int i;
5315 size_t alignment = bdrv_opt_mem_align(bs);
5316
5317 for (i = 0; i < qiov->niov; i++) {
5318 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5319 return false;
5320 }
5321 if (qiov->iov[i].iov_len % alignment) {
5322 return false;
5323 }
5324 }
5325
5326 return true;
5327 }
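
/* Illustrative sketch (hypothetical helper): callers typically use the
 * check above to decide whether a guest-supplied vector can be handed to
 * an O_DIRECT file descriptor or must be bounced through an aligned
 * buffer first (to be freed later with qemu_vfree()). */
#if 0
static void *example_bounce_if_needed(BlockDriverState *bs,
                                      QEMUIOVector *qiov)
{
    if (bdrv_qiov_is_aligned(bs, qiov)) {
        return NULL;                        /* qiov can be used directly */
    }
    return qemu_blockalign(bs, qiov->size); /* copy in/out of this buffer */
}
#endif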
5328
5329 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5330 Error **errp)
5331 {
5332 int64_t bitmap_size;
5333 BdrvDirtyBitmap *bitmap;
5334
5335 assert((granularity & (granularity - 1)) == 0);
5336
5337 granularity >>= BDRV_SECTOR_BITS;
5338 assert(granularity);
5339 bitmap_size = bdrv_nb_sectors(bs);
5340 if (bitmap_size < 0) {
5341 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5342 errno = -bitmap_size;
5343 return NULL;
5344 }
5345 bitmap = g_new0(BdrvDirtyBitmap, 1);
5346 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5347 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5348 return bitmap;
5349 }
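
/* Worked example: a granularity of 65536 bytes becomes 128 sectors after
 * the shift, and ffs(128) - 1 == 7, so each bit of the HBitmap covers
 * 2^7 sectors, i.e. 64 KiB of the device. */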
5350
5351 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5352 {
5353 BdrvDirtyBitmap *bm, *next;
5354 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5355 if (bm == bitmap) {
5356 QLIST_REMOVE(bitmap, list);
5357 hbitmap_free(bitmap->bitmap);
5358 g_free(bitmap);
5359 return;
5360 }
5361 }
5362 }
5363
5364 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5365 {
5366 BdrvDirtyBitmap *bm;
5367 BlockDirtyInfoList *list = NULL;
5368 BlockDirtyInfoList **plist = &list;
5369
5370 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5371 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5372 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5373 info->count = bdrv_get_dirty_count(bs, bm);
5374 info->granularity =
5375 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5376 entry->value = info;
5377 *plist = entry;
5378 plist = &entry->next;
5379 }
5380
5381 return list;
5382 }
5383
5384 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5385 {
5386 if (bitmap) {
5387 return hbitmap_get(bitmap->bitmap, sector);
5388 } else {
5389 return 0;
5390 }
5391 }
5392
5393 void bdrv_dirty_iter_init(BlockDriverState *bs,
5394 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5395 {
5396 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5397 }
5398
5399 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5400 int nr_sectors)
5401 {
5402 BdrvDirtyBitmap *bitmap;
5403 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5404 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5405 }
5406 }
5407
5408 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5409 {
5410 BdrvDirtyBitmap *bitmap;
5411 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5412 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5413 }
5414 }
5415
5416 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5417 {
5418 return hbitmap_count(bitmap->bitmap);
5419 }
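
/* Illustrative sketch (hypothetical helper): walking one bitmap with the
 * iterator initialized above. hbitmap_iter_next() returns the start of
 * the next dirty chunk, or a negative value once the bitmap is exhausted. */
#if 0
static void example_walk_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    HBitmapIter hbi;
    int64_t sector;

    bdrv_dirty_iter_init(bs, bitmap, &hbi);
    while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
        /* "sector" is dirty; process it here */
    }
}
#endif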
5420
5421 /* Get a reference to bs */
5422 void bdrv_ref(BlockDriverState *bs)
5423 {
5424 bs->refcnt++;
5425 }
5426
5427 /* Release a previously grabbed reference to bs.
5428 * If, after releasing, the reference count is zero, the BlockDriverState is
5429 * deleted. */
5430 void bdrv_unref(BlockDriverState *bs)
5431 {
5432 if (!bs) {
5433 return;
5434 }
5435 assert(bs->refcnt > 0);
5436 if (--bs->refcnt == 0) {
5437 bdrv_delete(bs);
5438 }
5439 }
5440
5441 struct BdrvOpBlocker {
5442 Error *reason;
5443 QLIST_ENTRY(BdrvOpBlocker) list;
5444 };
5445
5446 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5447 {
5448 BdrvOpBlocker *blocker;
5449 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5450 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5451 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5452 if (errp) {
5453 error_setg(errp, "Device '%s' is busy: %s",
5454 bdrv_get_device_name(bs),
5455 error_get_pretty(blocker->reason));
5456 }
5457 return true;
5458 }
5459 return false;
5460 }
5461
5462 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5463 {
5464 BdrvOpBlocker *blocker;
5465 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5466
5467 blocker = g_new0(BdrvOpBlocker, 1);
5468 blocker->reason = reason;
5469 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5470 }
5471
5472 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5473 {
5474 BdrvOpBlocker *blocker, *next;
5475 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5476 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5477 if (blocker->reason == reason) {
5478 QLIST_REMOVE(blocker, list);
5479 g_free(blocker);
5480 }
5481 }
5482 }
5483
5484 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5485 {
5486 int i;
5487 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5488 bdrv_op_block(bs, i, reason);
5489 }
5490 }
5491
5492 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5493 {
5494 int i;
5495 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5496 bdrv_op_unblock(bs, i, reason);
5497 }
5498 }
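
/* Usage sketch: a long-running job blocks all operations on its BDS for
 * the job's lifetime, keyed by a single Error object so that the matching
 * unblock removes exactly its own blockers. The snippet below is assumed
 * to live in the job's start and stop paths. */
#if 0
    Error *blocker = NULL;

    error_setg(&blocker, "Device is in use by an example job");
    bdrv_op_block_all(bs, blocker);
    /* ... run the job ... */
    bdrv_op_unblock_all(bs, blocker);
    error_free(blocker);
#endif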
5499
5500 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5501 {
5502 int i;
5503
5504 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5505 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5506 return false;
5507 }
5508 }
5509 return true;
5510 }
5511
5512 void bdrv_iostatus_enable(BlockDriverState *bs)
5513 {
5514 bs->iostatus_enabled = true;
5515 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5516 }
5517
5518 /* The I/O status is only enabled if the drive explicitly
5519 * enables it _and_ the VM is configured to stop on errors */
5520 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5521 {
5522 return (bs->iostatus_enabled &&
5523 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5524 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5525 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5526 }
5527
5528 void bdrv_iostatus_disable(BlockDriverState *bs)
5529 {
5530 bs->iostatus_enabled = false;
5531 }
5532
5533 void bdrv_iostatus_reset(BlockDriverState *bs)
5534 {
5535 if (bdrv_iostatus_is_enabled(bs)) {
5536 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5537 if (bs->job) {
5538 block_job_iostatus_reset(bs->job);
5539 }
5540 }
5541 }
5542
5543 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5544 {
5545 assert(bdrv_iostatus_is_enabled(bs));
5546 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5547 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5548 BLOCK_DEVICE_IO_STATUS_FAILED;
5549 }
5550 }
5551
5552 void bdrv_img_create(const char *filename, const char *fmt,
5553 const char *base_filename, const char *base_fmt,
5554 char *options, uint64_t img_size, int flags,
5555 Error **errp, bool quiet)
5556 {
5557 QemuOptsList *create_opts = NULL;
5558 QemuOpts *opts = NULL;
5559 const char *backing_fmt, *backing_file;
5560 int64_t size;
5561 BlockDriver *drv, *proto_drv;
5562 BlockDriver *backing_drv = NULL;
5563 Error *local_err = NULL;
5564 int ret = 0;
5565
5566 /* Find driver and parse its options */
5567 drv = bdrv_find_format(fmt);
5568 if (!drv) {
5569 error_setg(errp, "Unknown file format '%s'", fmt);
5570 return;
5571 }
5572
5573 proto_drv = bdrv_find_protocol(filename, true);
5574 if (!proto_drv) {
5575 error_setg(errp, "Unknown protocol '%s'", filename);
5576 return;
5577 }
5578
5579 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5580 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5581
5582 /* Create parameter list with default values */
5583 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5584 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5585
5586 /* Parse -o options */
5587 if (options) {
5588 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5589 error_setg(errp, "Invalid options for file format '%s'", fmt);
5590 goto out;
5591 }
5592 }
5593
5594 if (base_filename) {
5595 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5596 error_setg(errp, "Backing file not supported for file format '%s'",
5597 fmt);
5598 goto out;
5599 }
5600 }
5601
5602 if (base_fmt) {
5603 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5604 error_setg(errp, "Backing file format not supported for file "
5605 "format '%s'", fmt);
5606 goto out;
5607 }
5608 }
5609
5610 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5611 if (backing_file) {
5612 if (!strcmp(filename, backing_file)) {
5613 error_setg(errp, "Trying to create an image with the "
5614 "same filename as the backing file");
5615 goto out;
5616 }
5617 }
5618
5619 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5620 if (backing_fmt) {
5621 backing_drv = bdrv_find_format(backing_fmt);
5622 if (!backing_drv) {
5623 error_setg(errp, "Unknown backing file format '%s'",
5624 backing_fmt);
5625 goto out;
5626 }
5627 }
5628
5629 // The size for the image must always be specified, with one exception:
5630 // If we are using a backing file, we can obtain the size from there
5631 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5632 if (size == -1) {
5633 if (backing_file) {
5634 BlockDriverState *bs;
5635 int64_t backing_size;
5636 int back_flags;
5637
5638 /* backing files are always opened read-only */
5639 back_flags =
5640 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5641
5642 bs = NULL;
5643 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5644 backing_drv, &local_err);
5645 if (ret < 0) {
5646 goto out;
5647 }
5648 backing_size = bdrv_getlength(bs);
5649 if (backing_size < 0) {
5650 error_setg_errno(errp, -backing_size, "Could not get size of '%s'",
5651 backing_file);
5652 bdrv_unref(bs);
5653 goto out;
5654 }
5655
5656 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, backing_size);
5657
5658 bdrv_unref(bs);
5659 } else {
5660 error_setg(errp, "Image creation needs a size parameter");
5661 goto out;
5662 }
5663 }
5664
5665 if (!quiet) {
5666 printf("Formatting '%s', fmt=%s ", filename, fmt);
5667 qemu_opts_print(opts);
5668 puts("");
5669 }
5670
5671 ret = bdrv_create(drv, filename, opts, &local_err);
5672
5673 if (ret == -EFBIG) {
5674 /* This is generally a better message than whatever the driver would
5675 * deliver (especially because of the cluster_size_hint), since that
5676 * is most probably not much different from "image too large". */
5677 const char *cluster_size_hint = "";
5678 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5679 cluster_size_hint = " (try using a larger cluster size)";
5680 }
5681 error_setg(errp, "The image size is too large for file format '%s'"
5682 "%s", fmt, cluster_size_hint);
5683 error_free(local_err);
5684 local_err = NULL;
5685 }
5686
5687 out:
5688 qemu_opts_del(opts);
5689 qemu_opts_free(create_opts);
5690 if (local_err) {
5691 error_propagate(errp, local_err);
5692 }
5693 }
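
/* Usage sketch (hypothetical filenames): creating a qcow2 overlay whose
 * size is inherited from its backing file, which is the one case where
 * img_size may be omitted (passed as -1). */
#if 0
    Error *err = NULL;

    bdrv_img_create("overlay.qcow2", "qcow2", "base.qcow2", "qcow2",
                    NULL, (uint64_t)-1, 0, &err, true);
#endif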
5694
5695 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5696 {
5697 return bs->aio_context;
5698 }
5699
5700 void bdrv_detach_aio_context(BlockDriverState *bs)
5701 {
5702 BdrvAioNotifier *baf;
5703
5704 if (!bs->drv) {
5705 return;
5706 }
5707
5708 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5709 baf->detach_aio_context(baf->opaque);
5710 }
5711
5712 if (bs->io_limits_enabled) {
5713 throttle_detach_aio_context(&bs->throttle_state);
5714 }
5715 if (bs->drv->bdrv_detach_aio_context) {
5716 bs->drv->bdrv_detach_aio_context(bs);
5717 }
5718 if (bs->file) {
5719 bdrv_detach_aio_context(bs->file);
5720 }
5721 if (bs->backing_hd) {
5722 bdrv_detach_aio_context(bs->backing_hd);
5723 }
5724
5725 bs->aio_context = NULL;
5726 }
5727
5728 void bdrv_attach_aio_context(BlockDriverState *bs,
5729 AioContext *new_context)
5730 {
5731 BdrvAioNotifier *ban;
5732
5733 if (!bs->drv) {
5734 return;
5735 }
5736
5737 bs->aio_context = new_context;
5738
5739 if (bs->backing_hd) {
5740 bdrv_attach_aio_context(bs->backing_hd, new_context);
5741 }
5742 if (bs->file) {
5743 bdrv_attach_aio_context(bs->file, new_context);
5744 }
5745 if (bs->drv->bdrv_attach_aio_context) {
5746 bs->drv->bdrv_attach_aio_context(bs, new_context);
5747 }
5748 if (bs->io_limits_enabled) {
5749 throttle_attach_aio_context(&bs->throttle_state, new_context);
5750 }
5751
5752 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5753 ban->attached_aio_context(new_context, ban->opaque);
5754 }
5755 }
5756
5757 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5758 {
5759 bdrv_drain_all(); /* ensure there are no in-flight requests */
5760
5761 bdrv_detach_aio_context(bs);
5762
5763 /* This function executes in the old AioContext so acquire the new one in
5764 * case it runs in a different thread.
5765 */
5766 aio_context_acquire(new_context);
5767 bdrv_attach_aio_context(bs, new_context);
5768 aio_context_release(new_context);
5769 }
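
/* Usage sketch: moving a drive into an IOThread's context. "iothread" is
 * a hypothetical, already-created IOThread; iothread_get_aio_context()
 * comes from the iothread API, not from this file. */
#if 0
    bdrv_set_aio_context(bs, iothread_get_aio_context(iothread));
#endif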
5770
5771 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5772 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5773 void (*detach_aio_context)(void *opaque), void *opaque)
5774 {
5775 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5776 *ban = (BdrvAioNotifier){
5777 .attached_aio_context = attached_aio_context,
5778 .detach_aio_context = detach_aio_context,
5779 .opaque = opaque
5780 };
5781
5782 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5783 }
5784
5785 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5786 void (*attached_aio_context)(AioContext *,
5787 void *),
5788 void (*detach_aio_context)(void *),
5789 void *opaque)
5790 {
5791 BdrvAioNotifier *ban, *ban_next;
5792
5793 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5794 if (ban->attached_aio_context == attached_aio_context &&
5795 ban->detach_aio_context == detach_aio_context &&
5796 ban->opaque == opaque)
5797 {
5798 QLIST_REMOVE(ban, list);
5799 g_free(ban);
5800
5801 return;
5802 }
5803 }
5804
5805 abort();
5806 }
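
/* Illustrative sketch (hypothetical callbacks): a device with per-context
 * state registers a pair of notifiers so it can tear its state down before
 * a context switch and rebuild it afterwards. */
#if 0
static void example_attached(AioContext *new_context, void *opaque)
{
    /* e.g. recreate bottom halves and timers in new_context */
}

static void example_detach(void *opaque)
{
    /* e.g. delete bottom halves and timers from the old context */
}

static void example_register(BlockDriverState *bs, void *dev)
{
    bdrv_add_aio_context_notifier(bs, example_attached, example_detach, dev);
}
#endif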
5807
5808 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5809 NotifierWithReturn *notifier)
5810 {
5811 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5812 }
5813
5814 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5815 BlockDriverAmendStatusCB *status_cb)
5816 {
5817 if (!bs->drv->bdrv_amend_options) {
5818 return -ENOTSUP;
5819 }
5820 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5821 }
5822
5823 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5824 * of block filters and by bdrv_is_first_non_filter.
5825 * It is used to test whether the given bs is the candidate, or to recurse
5826 * further into the node graph.
5827 */
5828 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5829 BlockDriverState *candidate)
5830 {
5831 /* return false if basic checks fail */
5832 if (!bs || !bs->drv) {
5833 return false;
5834 }
5835
5836 /* the code reached a non-filter block driver -> check whether the bs is
5837 * the same as the candidate. This is the recursion termination condition.
5838 */
5839 if (!bs->drv->is_filter) {
5840 return bs == candidate;
5841 }
5842 /* Down this path the driver is a block filter driver */
5843
5844 /* If the block filter recursion method is defined use it to recurse down
5845 * the node graph.
5846 */
5847 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5848 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5849 }
5850
5851 /* the driver is a block filter but doesn't allow recursion -> return false
5852 */
5853 return false;
5854 }
5855
5856 /* This function checks whether the candidate is the first non-filter bs
5857 * down its bs chain. Since we don't have pointers to parents, it explores
5858 * all bs chains from the top. Some filters may not pass down the recursion.
5859 */
5860 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5861 {
5862 BlockDriverState *bs;
5863
5864 /* walk down the bs forest recursively */
5865 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5866 bool perm;
5867
5868 /* try to recurse in this top level bs */
5869 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5870
5871 /* candidate is the first non filter */
5872 if (perm) {
5873 return true;
5874 }
5875 }
5876
5877 return false;
5878 }
5879
5880 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5881 {
5882 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5883 AioContext *aio_context;
5884
5885 if (!to_replace_bs) {
5886 error_setg(errp, "Node name '%s' not found", node_name);
5887 return NULL;
5888 }
5889
5890 aio_context = bdrv_get_aio_context(to_replace_bs);
5891 aio_context_acquire(aio_context);
5892
5893 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5894 to_replace_bs = NULL;
5895 goto out;
5896 }
5897
5898 /* We don't want an arbitrary node of the BDS chain to be replaced, only
5899 * the topmost non-filter, in order to prevent data corruption.
5900 * Another benefit is that this test excludes backing files, which are
5901 * blocked by the backing blockers.
5902 */
5903 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5904 error_setg(errp, "Only the topmost non-filter node can be replaced");
5905 to_replace_bs = NULL;
5906 goto out;
5907 }
5908
5909 out:
5910 aio_context_release(aio_context);
5911 return to_replace_bs;
5912 }
5913
5914 void bdrv_io_plug(BlockDriverState *bs)
5915 {
5916 BlockDriver *drv = bs->drv;
5917 if (drv && drv->bdrv_io_plug) {
5918 drv->bdrv_io_plug(bs);
5919 } else if (bs->file) {
5920 bdrv_io_plug(bs->file);
5921 }
5922 }
5923
5924 void bdrv_io_unplug(BlockDriverState *bs)
5925 {
5926 BlockDriver *drv = bs->drv;
5927 if (drv && drv->bdrv_io_unplug) {
5928 drv->bdrv_io_unplug(bs);
5929 } else if (bs->file) {
5930 bdrv_io_unplug(bs->file);
5931 }
5932 }
5933
5934 void bdrv_flush_io_queue(BlockDriverState *bs)
5935 {
5936 BlockDriver *drv = bs->drv;
5937 if (drv && drv->bdrv_flush_io_queue) {
5938 drv->bdrv_flush_io_queue(bs);
5939 } else if (bs->file) {
5940 bdrv_flush_io_queue(bs->file);
5941 }
5942 }
5943
5944 static bool append_open_options(QDict *d, BlockDriverState *bs)
5945 {
5946 const QDictEntry *entry;
5947 bool found_any = false;
5948
5949 for (entry = qdict_first(bs->options); entry;
5950 entry = qdict_next(bs->options, entry))
5951 {
5952 /* Only take options for this level and exclude all non-driver-specific
5953 * options */
5954 if (!strchr(qdict_entry_key(entry), '.') &&
5955 strcmp(qdict_entry_key(entry), "node-name"))
5956 {
5957 qobject_incref(qdict_entry_value(entry));
5958 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5959 found_any = true;
5960 }
5961 }
5962
5963 return found_any;
5964 }
5965
5966 /* Updates the following BDS fields:
5967 * - exact_filename: A filename which may be used for opening a block device
5968 * which (mostly) equals the given BDS (even without any
5969 * other options; so reading and writing must return the same
5970 * results, but caching etc. may be different)
5971 * - full_open_options: Options which, when given when opening a block device
5972 * (without a filename), result in a BDS (mostly)
5973 * equalling the given one
5974 * - filename: If exact_filename is set, it is copied here. Otherwise,
5975 * full_open_options is converted to a JSON object, prefixed with
5976 * "json:" (for use through the JSON pseudo protocol) and put here.
5977 */
5978 void bdrv_refresh_filename(BlockDriverState *bs)
5979 {
5980 BlockDriver *drv = bs->drv;
5981 QDict *opts;
5982
5983 if (!drv) {
5984 return;
5985 }
5986
5987 /* This BDS's file name will most probably depend on its file's name, so
5988 * refresh that first */
5989 if (bs->file) {
5990 bdrv_refresh_filename(bs->file);
5991 }
5992
5993 if (drv->bdrv_refresh_filename) {
5994 /* Obsolete information is of no use here, so drop the old file name
5995 * information before refreshing it */
5996 bs->exact_filename[0] = '\0';
5997 if (bs->full_open_options) {
5998 QDECREF(bs->full_open_options);
5999 bs->full_open_options = NULL;
6000 }
6001
6002 drv->bdrv_refresh_filename(bs);
6003 } else if (bs->file) {
6004 /* Try to reconstruct valid information from the underlying file */
6005 bool has_open_options;
6006
6007 bs->exact_filename[0] = '\0';
6008 if (bs->full_open_options) {
6009 QDECREF(bs->full_open_options);
6010 bs->full_open_options = NULL;
6011 }
6012
6013 opts = qdict_new();
6014 has_open_options = append_open_options(opts, bs);
6015
6016 /* If no specific options have been given for this BDS, the filename of
6017 * the underlying file should suffice for this one as well */
6018 if (bs->file->exact_filename[0] && !has_open_options) {
6019 strcpy(bs->exact_filename, bs->file->exact_filename);
6020 }
6021 /* Reconstructing the full options QDict is simple for most format block
6022 * drivers, as long as the full options are known for the underlying
6023 * file BDS. The full options QDict of that file BDS should somehow
6024 * contain a representation of the filename, therefore the following
6025 * suffices without querying the (exact_)filename of this BDS. */
6026 if (bs->file->full_open_options) {
6027 qdict_put_obj(opts, "driver",
6028 QOBJECT(qstring_from_str(drv->format_name)));
6029 QINCREF(bs->file->full_open_options);
6030 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6031
6032 bs->full_open_options = opts;
6033 } else {
6034 QDECREF(opts);
6035 }
6036 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6037 /* There is no underlying file BDS (at least referenced by BDS.file),
6038 * so the full options QDict should be equal to the options given
6039 * specifically for this block device when it was opened (plus the
6040 * driver specification).
6041 * Because those options don't change, there is no need to update
6042 * full_open_options when it's already set. */
6043
6044 opts = qdict_new();
6045 append_open_options(opts, bs);
6046 qdict_put_obj(opts, "driver",
6047 QOBJECT(qstring_from_str(drv->format_name)));
6048
6049 if (bs->exact_filename[0]) {
6050 /* This may not work for all block protocol drivers (some may
6051 * require this filename to be parsed), but we have to find some
6052 * default solution here, so just include it. If some block driver
6053 * does not support pure options without any filename at all or
6054 * needs some special format of the options QDict, it needs to
6055 * implement the driver-specific bdrv_refresh_filename() function.
6056 */
6057 qdict_put_obj(opts, "filename",
6058 QOBJECT(qstring_from_str(bs->exact_filename)));
6059 }
6060
6061 bs->full_open_options = opts;
6062 }
6063
6064 if (bs->exact_filename[0]) {
6065 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6066 } else if (bs->full_open_options) {
6067 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6068 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6069 qstring_get_str(json));
6070 QDECREF(json);
6071 }
6072 }
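
/* Example of a resulting filename when no exact_filename can be produced
 * (illustrative values): the full options QDict is serialized behind the
 * "json:" pseudo protocol, e.g.
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *         "filename": "/tmp/test.qcow2"}}
 */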
6073
6074 /* The purpose of this accessor function is to allow the device models to
6075 * access the BlockAcctStats structure embedded inside a BlockDriverState
6076 * without being aware of the BlockDriverState structure layout.
6077 * It will go away once the BlockAcctStats structure is moved inside
6078 * the device models.
6079 */
6080 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6081 {
6082 return &bs->stats;
6083 }