/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
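
/*
 * Illustrative examples (hypothetical filenames, following directly from the
 * checks above):
 *   is_windows_drive("c:")                    -> 1  (bare drive letter)
 *   is_windows_drive("\\.\PhysicalDrive0")    -> 1  (device namespace)
 *   is_windows_drive("//./d:")                -> 1
 *   is_windows_drive("c:\image.qcow2")        -> 0  (drive prefix only)
 */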
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);


    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
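
/*
 * Illustrative examples (hypothetical paths): a colon must appear before any
 * path separator for the prefix to count as a protocol:
 *   path_has_protocol("nbd:localhost:10809")    -> 1
 *   path_has_protocol("/var/lib/images/a.img")  -> 0  ('/' found before ':')
 *   path_has_protocol("relative/name.img")      -> 0  (no ':' at all)
 */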

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p) {
            p++;
        } else {
            p = base_path;
        }
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1) {
                p1 = p2;
            }
        }
#endif
        if (p1) {
            p1++;
        } else {
            p1 = base_path;
        }
        if (p1 > p) {
            p = p1;
        }
        len = p - base_path;
        if (len > dest_size - 1) {
            len = dest_size - 1;
        }
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
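
/*
 * Illustrative sketch (hypothetical paths): combining a backing file name
 * with the location of its overlay, as bdrv_get_full_backing_filename()
 * does below:
 *   char dest[PATH_MAX];
 *   path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "base.qcow2");
 *   // dest is now "/images/base.qcow2"
 *   path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "/abs/base.raw");
 *   // absolute names are copied unchanged: "/abs/base.raw"
 */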

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1; /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char *filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
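
/*
 * Illustrative sketch (hypothetical filename and size): creating an image
 * through a specific format driver, mirroring what
 * bdrv_append_temp_snapshot() does below:
 *   BlockDriver *drv = bdrv_find_format("qcow2");
 *   QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0, &error_abort);
 *   qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *   ret = bdrv_create(drv, "/tmp/scratch.qcow2", opts, errp);
 *   qemu_opts_del(opts);
 */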

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
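
/*
 * Illustrative sketch: callers typically pair this with bdrv_create(), as
 * bdrv_append_temp_snapshot() does below:
 *   char *tmp = g_malloc0(PATH_MAX + 1);
 *   if (get_tmp_filename(tmp, PATH_MAX + 1) < 0) {
 *       // handle the error
 *   }
 *   // tmp now names an empty file such as "/var/tmp/vl.Ab12Cd" (hypothetical)
 */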

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-based
     * Linux systems, those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1) {
        len = sizeof(protocol) - 1;
    }
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf contains the image's first @buf_size bytes.
 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 * but can be smaller if the image file is smaller)
 * @filename is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
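
/*
 * Illustrative example: for a buffer that begins with the qcow2 magic bytes
 * "QFI\xfb", the qcow2 driver's probe function reports a high score and is
 * selected. This content-based guessing is exactly why writing a format
 * header into a guest-visible raw image is dangerous (see CVE-2008-2004
 * above): on the next probe the image would be treated as a different format.
 */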

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg) {
        return 0;
    }

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
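
/*
 * Illustrative sketch (hypothetical caller):
 *   int flags = 0;
 *   if (bdrv_parse_discard_flags("unmap", &flags) < 0) {
 *       // report an invalid discard mode
 *   }
 *   // flags now includes BDRV_O_UNMAP
 */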

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
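
/*
 * Illustrative mapping (follows directly from the code above):
 *   "none"/"off"   -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB (bypass host cache,
 *                     writeback)
 *   "directsync"   -> BDRV_O_NOCACHE
 *   "writeback"    -> BDRV_O_CACHE_WB
 *   "unsafe"       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *   "writethrough" -> no flags set (the default)
 */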

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}
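
/*
 * Illustrative example: for a format layer opened with
 * BDRV_O_RDWR | BDRV_O_COPY_ON_READ, bdrv_inherited_flags() hands bs->file
 * BDRV_O_RDWR | BDRV_O_PROTOCOL | BDRV_O_CACHE_WB | BDRV_O_UNMAP
 * (copy-on-read stays on the top layer), while bdrv_backing_flags() drops
 * both BDRV_O_RDWR and BDRV_O_COPY_ON_READ for the read-only backing file.
 */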

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
                            QDict *options, int flags, BlockDriver *drv,
                            Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                       "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to the check in bdrv_commit() */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling
 * bdrv_open_backing_file().
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options
 * QDict. That QDict has to be flattened; therefore, if the BlockdevRef is a
 * QDict itself, all options starting with "${bdref_key}." are considered part
 * of the BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
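
/*
 * Illustrative example (hypothetical options): with bdref_key = "file" and
 * an options QDict built from "file.driver=file,file.filename=/img/test.raw",
 * the "file.*" entries are extracted into image_options and the image is
 * opened from them. Alternatively, a plain "file" key holding an existing
 * node name makes this function reference that node instead of opening a
 * new image.
 */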

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device
 * which should be opened. If specified, neither options nor a filename may be
 * given, nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}


/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
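
/*
 * Illustrative sketch: reopening several devices as one atomic set
 * (bs_a/bs_b and their flags are hypothetical):
 *   BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs_a, flags_a);
 *   queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *   ret = bdrv_reopen_multiple(queue, errp);  // consumes and frees the queue
 */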

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is allocated and used by
 * the block drivers' .bdrv_reopen_prepare() implementations.
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error, errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete. Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
 * Also, NUL-terminate node_name to prevent double removal */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it. Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}
2010
2011 static void bdrv_rebind(BlockDriverState *bs)
2012 {
2013 if (bs->drv && bs->drv->bdrv_rebind) {
2014 bs->drv->bdrv_rebind(bs);
2015 }
2016 }
2017
2018 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2019 BlockDriverState *bs_src)
2020 {
2021 /* move some fields that need to stay attached to the device */
2022
2023 /* dev info */
2024 bs_dest->guest_block_size = bs_src->guest_block_size;
2025 bs_dest->copy_on_read = bs_src->copy_on_read;
2026
2027 bs_dest->enable_write_cache = bs_src->enable_write_cache;
2028
2029 /* i/o throttled req */
2030 memcpy(&bs_dest->throttle_state,
2031 &bs_src->throttle_state,
2032 sizeof(ThrottleState));
2033 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2034 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
2035 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
2036
2037 /* r/w error */
2038 bs_dest->on_read_error = bs_src->on_read_error;
2039 bs_dest->on_write_error = bs_src->on_write_error;
2040
2041 /* i/o status */
2042 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2043 bs_dest->iostatus = bs_src->iostatus;
2044
2045 /* dirty bitmap */
2046 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
2047
2048 /* reference count */
2049 bs_dest->refcnt = bs_src->refcnt;
2050
2051 /* job */
2052 bs_dest->job = bs_src->job;
2053
2054 /* keep the same entry in bdrv_states */
2055 bs_dest->device_list = bs_src->device_list;
2056 bs_dest->blk = bs_src->blk;
2057
2058 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2059 sizeof(bs_dest->op_blockers));
2060 }
2061
2062 /*
2063 * Swap the contents of two BlockDriverStates while they are live,
2064 * keeping required fields on the BlockDriverState that is
2065 * actually attached to a device.
2066 *
2067 * This will modify the BlockDriverState fields, and swap contents
2068 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2069 *
2070 * bs_new must not be attached to a BlockBackend.
2071 *
2072 * This function does not create any image files.
2073 */
2074 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2075 {
2076 BlockDriverState tmp;
2077
2078 /* The code needs to swap the node_name, but simply swapping node_list
2079 * won't work. So first remove the nodes from the graph list, do the swap,
2080 * then insert them back if needed.
2081 */
2082 if (bs_new->node_name[0] != '\0') {
2083 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2084 }
2085 if (bs_old->node_name[0] != '\0') {
2086 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2087 }
2088
2089 /* bs_new must be unattached and shouldn't have anything fancy enabled */
2090 assert(!bs_new->blk);
2091 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2092 assert(bs_new->job == NULL);
2093 assert(bs_new->io_limits_enabled == false);
2094 assert(!throttle_have_timer(&bs_new->throttle_state));
2095
2096 tmp = *bs_new;
2097 *bs_new = *bs_old;
2098 *bs_old = tmp;
2099
2100 /* there are some fields that should not be swapped, move them back */
2101 bdrv_move_feature_fields(&tmp, bs_old);
2102 bdrv_move_feature_fields(bs_old, bs_new);
2103 bdrv_move_feature_fields(bs_new, &tmp);
2104
2105 /* bs_new must remain unattached */
2106 assert(!bs_new->blk);
2107
2108 /* Check a few fields that should remain attached to the device */
2109 assert(bs_new->job == NULL);
2110 assert(bs_new->io_limits_enabled == false);
2111 assert(!throttle_have_timer(&bs_new->throttle_state));
2112
2113 /* insert the nodes back into the graph node list if needed */
2114 if (bs_new->node_name[0] != '\0') {
2115 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2116 }
2117 if (bs_old->node_name[0] != '\0') {
2118 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2119 }
2120
2121 bdrv_rebind(bs_new);
2122 bdrv_rebind(bs_old);
2123 }
2124
2125 /*
2126 * Add new bs contents at the top of an image chain while the chain is
2127 * live, keeping required fields on the top layer.
2128 *
2129 * This will modify the BlockDriverState fields, and swap contents
2130 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2131 *
2132 * bs_new must not be attached to a BlockBackend.
2133 *
2134 * This function does not create any image files.
2135 */
2136 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2137 {
2138 bdrv_swap(bs_new, bs_top);
2139
2140 /* bdrv_swap() has exchanged the contents of bs_new and bs_top, so bs_top
2141 * is the new top; link the old top (now in bs_new) in as its backing file. */
2142 bdrv_set_backing_hd(bs_top, bs_new);
2143 }
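
/* A minimal sketch of how live-snapshot code typically uses bdrv_append();
 * 'overlay_bs' is a hypothetical, freshly opened and still unattached
 * overlay whose on-disk backing file is the current image of 'bs'. */
static void example_install_overlay(BlockDriverState *bs,
                                    BlockDriverState *overlay_bs)
{
    /* After the call the guest device keeps using 'bs', which now holds
     * the overlay contents; the old top is linked in as its backing file. */
    bdrv_append(overlay_bs, bs);
}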
2144
2145 static void bdrv_delete(BlockDriverState *bs)
2146 {
2147 assert(!bs->job);
2148 assert(bdrv_op_blocker_is_empty(bs));
2149 assert(!bs->refcnt);
2150 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2151
2152 bdrv_close(bs);
2153
2154 /* remove from list, if necessary */
2155 bdrv_make_anon(bs);
2156
2157 g_free(bs);
2158 }
2159
2160 /*
2161 * Run consistency checks on an image
2162 *
2163 * Returns 0 if the check could be completed (it doesn't mean that the image is
2164 * free of errors) or -errno when an internal error occurred. The results of the
2165 * check are stored in res.
2166 */
2167 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2168 {
2169 if (bs->drv == NULL) {
2170 return -ENOMEDIUM;
2171 }
2172 if (bs->drv->bdrv_check == NULL) {
2173 return -ENOTSUP;
2174 }
2175
2176 memset(res, 0, sizeof(*res));
2177 return bs->drv->bdrv_check(bs, res, fix);
2178 }
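
/* A minimal sketch of driving bdrv_check(): a return of 0 only means the
 * check ran; corruption is reported through the BdrvCheckResult fields. */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, BDRV_FIX_LEAKS);

    if (ret < 0) {
        return ret;                 /* the check itself could not run */
    }
    if (res.corruptions || res.check_errors) {
        return -EIO;                /* image content needs repair */
    }
    return 0;
}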
2179
2180 #define COMMIT_BUF_SECTORS 2048
2181
2182 /* commit COW file into the raw image */
2183 int bdrv_commit(BlockDriverState *bs)
2184 {
2185 BlockDriver *drv = bs->drv;
2186 int64_t sector, total_sectors, length, backing_length;
2187 int n, ro, open_flags;
2188 int ret = 0;
2189 uint8_t *buf = NULL;
2190 char filename[PATH_MAX];
2191
2192 if (!drv)
2193 return -ENOMEDIUM;
2194
2195 if (!bs->backing_hd) {
2196 return -ENOTSUP;
2197 }
2198
2199 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2200 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2201 return -EBUSY;
2202 }
2203
2204 ro = bs->backing_hd->read_only;
2205 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2206 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2207 open_flags = bs->backing_hd->open_flags;
2208
2209 if (ro) {
2210 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2211 return -EACCES;
2212 }
2213 }
2214
2215 length = bdrv_getlength(bs);
2216 if (length < 0) {
2217 ret = length;
2218 goto ro_cleanup;
2219 }
2220
2221 backing_length = bdrv_getlength(bs->backing_hd);
2222 if (backing_length < 0) {
2223 ret = backing_length;
2224 goto ro_cleanup;
2225 }
2226
2227 /* If our top snapshot is larger than the backing file image,
2228 * grow the backing file image if possible. If not possible,
2229 * we must return an error */
2230 if (length > backing_length) {
2231 ret = bdrv_truncate(bs->backing_hd, length);
2232 if (ret < 0) {
2233 goto ro_cleanup;
2234 }
2235 }
2236
2237 total_sectors = length >> BDRV_SECTOR_BITS;
2238
2239 /* qemu_try_blockalign() for bs will choose an alignment that works for
2240 * bs->backing_hd as well, so no need to compare the alignment manually. */
2241 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2242 if (buf == NULL) {
2243 ret = -ENOMEM;
2244 goto ro_cleanup;
2245 }
2246
2247 for (sector = 0; sector < total_sectors; sector += n) {
2248 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2249 if (ret < 0) {
2250 goto ro_cleanup;
2251 }
2252 if (ret) {
2253 ret = bdrv_read(bs, sector, buf, n);
2254 if (ret < 0) {
2255 goto ro_cleanup;
2256 }
2257
2258 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2259 if (ret < 0) {
2260 goto ro_cleanup;
2261 }
2262 }
2263 }
2264
2265 if (drv->bdrv_make_empty) {
2266 ret = drv->bdrv_make_empty(bs);
2267 if (ret < 0) {
2268 goto ro_cleanup;
2269 }
2270 bdrv_flush(bs);
2271 }
2272
2273 /*
2274 * Make sure all data we wrote to the backing device is actually
2275 * stable on disk.
2276 */
2277 if (bs->backing_hd) {
2278 bdrv_flush(bs->backing_hd);
2279 }
2280
2281 ret = 0;
2282 ro_cleanup:
2283 qemu_vfree(buf);
2284
2285 if (ro) {
2286 /* ignoring error return here */
2287 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2288 }
2289
2290 return ret;
2291 }
2292
2293 int bdrv_commit_all(void)
2294 {
2295 BlockDriverState *bs;
2296
2297 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2298 AioContext *aio_context = bdrv_get_aio_context(bs);
2299
2300 aio_context_acquire(aio_context);
2301 if (bs->drv && bs->backing_hd) {
2302 int ret = bdrv_commit(bs);
2303 if (ret < 0) {
2304 aio_context_release(aio_context);
2305 return ret;
2306 }
2307 }
2308 aio_context_release(aio_context);
2309 }
2310 return 0;
2311 }
2312
2313 /**
2314 * Remove an active request from the tracked requests list
2315 *
2316 * This function should be called when a tracked request is completing.
2317 */
2318 static void tracked_request_end(BdrvTrackedRequest *req)
2319 {
2320 if (req->serialising) {
2321 req->bs->serialising_in_flight--;
2322 }
2323
2324 QLIST_REMOVE(req, list);
2325 qemu_co_queue_restart_all(&req->wait_queue);
2326 }
2327
2328 /**
2329 * Add an active request to the tracked requests list
2330 */
2331 static void tracked_request_begin(BdrvTrackedRequest *req,
2332 BlockDriverState *bs,
2333 int64_t offset,
2334 unsigned int bytes, bool is_write)
2335 {
2336 *req = (BdrvTrackedRequest){
2337 .bs = bs,
2338 .offset = offset,
2339 .bytes = bytes,
2340 .is_write = is_write,
2341 .co = qemu_coroutine_self(),
2342 .serialising = false,
2343 .overlap_offset = offset,
2344 .overlap_bytes = bytes,
2345 };
2346
2347 qemu_co_queue_init(&req->wait_queue);
2348
2349 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2350 }
2351
2352 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2353 {
2354 int64_t overlap_offset = req->offset & ~(align - 1);
2355 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2356 - overlap_offset;
2357
2358 if (!req->serialising) {
2359 req->bs->serialising_in_flight++;
2360 req->serialising = true;
2361 }
2362
2363 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2364 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2365 }
2366
2367 /**
2368 * Round a region to cluster boundaries
2369 */
2370 void bdrv_round_to_clusters(BlockDriverState *bs,
2371 int64_t sector_num, int nb_sectors,
2372 int64_t *cluster_sector_num,
2373 int *cluster_nb_sectors)
2374 {
2375 BlockDriverInfo bdi;
2376
2377 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2378 *cluster_sector_num = sector_num;
2379 *cluster_nb_sectors = nb_sectors;
2380 } else {
2381 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2382 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2383 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2384 nb_sectors, c);
2385 }
2386 }
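
/* A worked example for the rounding above, assuming bdrv_get_info()
 * reports a 64 KiB cluster size (128 sectors): a request for sectors
 * [130, 140) is widened to the cluster-aligned range [128, 256). */
static void example_cluster_rounding(BlockDriverState *bs)
{
    int64_t start;
    int num;

    bdrv_round_to_clusters(bs, 130, 10, &start, &num);
    /* For such an image: start == QEMU_ALIGN_DOWN(130, 128) == 128, and
     * num == QEMU_ALIGN_UP(130 - 128 + 10, 128) == 128. */
}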
2387
2388 static int bdrv_get_cluster_size(BlockDriverState *bs)
2389 {
2390 BlockDriverInfo bdi;
2391 int ret;
2392
2393 ret = bdrv_get_info(bs, &bdi);
2394 if (ret < 0 || bdi.cluster_size == 0) {
2395 return bs->request_alignment;
2396 } else {
2397 return bdi.cluster_size;
2398 }
2399 }
2400
2401 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2402 int64_t offset, unsigned int bytes)
2403 {
2404 /* aaaa bbbb */
2405 if (offset >= req->overlap_offset + req->overlap_bytes) {
2406 return false;
2407 }
2408 /* bbbb aaaa */
2409 if (req->overlap_offset >= offset + bytes) {
2410 return false;
2411 }
2412 return true;
2413 }
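
/* A small sanity sketch of the half-open interval semantics above: a
 * request covering bytes [4096, 8192) overlaps its own last byte, but not
 * an adjacent request that begins exactly at 8192. */
static void example_overlap_semantics(void)
{
    BdrvTrackedRequest req = {
        .overlap_offset = 4096,
        .overlap_bytes  = 4096,     /* covers bytes [4096, 8192) */
    };

    assert(tracked_request_overlaps(&req, 8191, 1));
    assert(!tracked_request_overlaps(&req, 8192, 512));
}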
2414
2415 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2416 {
2417 BlockDriverState *bs = self->bs;
2418 BdrvTrackedRequest *req;
2419 bool retry;
2420 bool waited = false;
2421
2422 if (!bs->serialising_in_flight) {
2423 return false;
2424 }
2425
2426 do {
2427 retry = false;
2428 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2429 if (req == self || (!req->serialising && !self->serialising)) {
2430 continue;
2431 }
2432 if (tracked_request_overlaps(req, self->overlap_offset,
2433 self->overlap_bytes))
2434 {
2435 /* Hitting this means there was a reentrant request, for
2436 * example, a block driver issuing nested requests. This must
2437 * never happen since it means deadlock.
2438 */
2439 assert(qemu_coroutine_self() != req->co);
2440
2441 /* If the request is already (indirectly) waiting for us, or
2442 * will wait for us as soon as it wakes up, then just go on
2443 * (instead of producing a deadlock in the former case). */
2444 if (!req->waiting_for) {
2445 self->waiting_for = req;
2446 qemu_co_queue_wait(&req->wait_queue);
2447 self->waiting_for = NULL;
2448 retry = true;
2449 waited = true;
2450 break;
2451 }
2452 }
2453 }
2454 } while (retry);
2455
2456 return waited;
2457 }
2458
2459 /*
2460 * Return values:
2461 * 0 - success
2462 * -EINVAL - backing format specified, but no file
2463 * -ENOSPC - can't update the backing file because no space is left in the
2464 * image file header
2465 * -ENOTSUP - format driver doesn't support changing the backing file
2466 */
2467 int bdrv_change_backing_file(BlockDriverState *bs,
2468 const char *backing_file, const char *backing_fmt)
2469 {
2470 BlockDriver *drv = bs->drv;
2471 int ret;
2472
2473 /* Backing file format doesn't make sense without a backing file */
2474 if (backing_fmt && !backing_file) {
2475 return -EINVAL;
2476 }
2477
2478 if (drv->bdrv_change_backing_file != NULL) {
2479 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2480 } else {
2481 ret = -ENOTSUP;
2482 }
2483
2484 if (ret == 0) {
2485 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2486 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2487 }
2488 return ret;
2489 }
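
/* A minimal sketch mapping the documented return codes to messages;
 * 'new_base' and the "qcow2" format string are illustrative values only. */
static int example_rebase_onto(BlockDriverState *bs, const char *new_base)
{
    int ret = bdrv_change_backing_file(bs, new_base, "qcow2");

    if (ret == -ENOTSUP) {
        error_report("format driver cannot rewrite its backing file entry");
    } else if (ret == -ENOSPC) {
        error_report("no space in the image header for the new path");
    }
    return ret;
}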
2490
2491 /*
2492 * Finds the image layer in the chain that has 'bs' as its backing file.
2493 *
2494 * active is the current topmost image.
2495 *
2496 * Returns NULL if bs is not found in active's image chain,
2497 * or if active == bs.
2498 *
2499 * Returns the bottommost base image if bs == NULL.
2500 */
2501 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2502 BlockDriverState *bs)
2503 {
2504 while (active && bs != active->backing_hd) {
2505 active = active->backing_hd;
2506 }
2507
2508 return active;
2509 }
2510
2511 /* Given a BDS, searches for the base layer. */
2512 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2513 {
2514 return bdrv_find_overlay(bs, NULL);
2515 }
2516
2517 typedef struct BlkIntermediateStates {
2518 BlockDriverState *bs;
2519 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2520 } BlkIntermediateStates;
2521
2522
2523 /*
2524 * Drops images above 'base' up to and including 'top', and sets the image
2525 * above 'top' to have base as its backing file.
2526 *
2527 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2528 * information in the overlay can be properly updated.
2529 *
2530 * E.g., this will convert the following chain:
2531 * bottom <- base <- intermediate <- top <- active
2532 *
2533 * to
2534 *
2535 * bottom <- base <- active
2536 *
2537 * It is allowed for bottom==base, in which case it converts:
2538 *
2539 * base <- intermediate <- top <- active
2540 *
2541 * to
2542 *
2543 * base <- active
2544 *
2545 * If backing_file_str is non-NULL, it will be used when modifying top's
2546 * overlay image metadata.
2547 *
2548 * Error conditions:
2549 * if active == top, that is considered an error
2550 *
2551 */
2552 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2553 BlockDriverState *base, const char *backing_file_str)
2554 {
2555 BlockDriverState *intermediate;
2556 BlockDriverState *base_bs = NULL;
2557 BlockDriverState *new_top_bs = NULL;
2558 BlkIntermediateStates *intermediate_state, *next;
2559 int ret = -EIO;
2560
2561 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2562 QSIMPLEQ_INIT(&states_to_delete);
2563
2564 if (!top->drv || !base->drv) {
2565 goto exit;
2566 }
2567
2568 new_top_bs = bdrv_find_overlay(active, top);
2569
2570 if (new_top_bs == NULL) {
2571 /* we could not find the image above 'top', this is an error */
2572 goto exit;
2573 }
2574
2575 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2576 * to do, no intermediate images */
2577 if (new_top_bs->backing_hd == base) {
2578 ret = 0;
2579 goto exit;
2580 }
2581
2582 intermediate = top;
2583
2584 /* now we will go down through the list, and add each BDS we find
2585 * into our deletion queue, until we hit the 'base'
2586 */
2587 while (intermediate) {
2588 intermediate_state = g_new0(BlkIntermediateStates, 1);
2589 intermediate_state->bs = intermediate;
2590 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2591
2592 if (intermediate->backing_hd == base) {
2593 base_bs = intermediate->backing_hd;
2594 break;
2595 }
2596 intermediate = intermediate->backing_hd;
2597 }
2598 if (base_bs == NULL) {
2599 /* Something went wrong: we did not end at the base. Safely
2600 * unravel everything, and exit with an error */
2601 goto exit;
2602 }
2603
2604 /* success - we can delete the intermediate states, and link top->base */
2605 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2606 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2607 base_bs->drv ? base_bs->drv->format_name : "");
2608 if (ret) {
2609 goto exit;
2610 }
2611 bdrv_set_backing_hd(new_top_bs, base_bs);
2612
2613 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2614 /* so that bdrv_close() does not recursively close the chain */
2615 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2616 bdrv_unref(intermediate_state->bs);
2617 }
2618 ret = 0;
2619
2620 exit:
2621 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2622 g_free(intermediate_state);
2623 }
2624 return ret;
2625 }
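
/* A minimal sketch matching the diagrams above: collapse every image
 * between 'base' and 'top' out of the chain below 'active'. */
static int example_collapse_chain(BlockDriverState *active,
                                  BlockDriverState *top,
                                  BlockDriverState *base)
{
    /* Passing NULL keeps base's own filename in the overlay metadata */
    return bdrv_drop_intermediate(active, top, base, NULL);
}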
2626
2627
2628 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2629 size_t size)
2630 {
2631 int64_t len;
2632
2633 if (size > INT_MAX) {
2634 return -EIO;
2635 }
2636
2637 if (!bdrv_is_inserted(bs))
2638 return -ENOMEDIUM;
2639
2640 if (bs->growable)
2641 return 0;
2642
2643 len = bdrv_getlength(bs);
2644
2645 if (offset < 0)
2646 return -EIO;
2647
2648 if ((offset > len) || (len - offset < size))
2649 return -EIO;
2650
2651 return 0;
2652 }
2653
2654 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2655 int nb_sectors)
2656 {
2657 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2658 return -EIO;
2659 }
2660
2661 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2662 nb_sectors * BDRV_SECTOR_SIZE);
2663 }
2664
2665 typedef struct RwCo {
2666 BlockDriverState *bs;
2667 int64_t offset;
2668 QEMUIOVector *qiov;
2669 bool is_write;
2670 int ret;
2671 BdrvRequestFlags flags;
2672 } RwCo;
2673
2674 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2675 {
2676 RwCo *rwco = opaque;
2677
2678 if (!rwco->is_write) {
2679 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2680 rwco->qiov->size, rwco->qiov,
2681 rwco->flags);
2682 } else {
2683 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2684 rwco->qiov->size, rwco->qiov,
2685 rwco->flags);
2686 }
2687 }
2688
2689 /*
2690 * Process a vectored synchronous request using coroutines
2691 */
2692 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2693 QEMUIOVector *qiov, bool is_write,
2694 BdrvRequestFlags flags)
2695 {
2696 Coroutine *co;
2697 RwCo rwco = {
2698 .bs = bs,
2699 .offset = offset,
2700 .qiov = qiov,
2701 .is_write = is_write,
2702 .ret = NOT_DONE,
2703 .flags = flags,
2704 };
2705
2706 /**
2707 * In a synchronous call context the vcpu is blocked, so the throttling
2708 * timer will never fire; therefore I/O throttling has to be disabled
2709 * here if it has been enabled.
2710 */
2711 if (bs->io_limits_enabled) {
2712 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2713 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2714 bdrv_io_limits_disable(bs);
2715 }
2716
2717 if (qemu_in_coroutine()) {
2718 /* Fast-path if already in coroutine context */
2719 bdrv_rw_co_entry(&rwco);
2720 } else {
2721 AioContext *aio_context = bdrv_get_aio_context(bs);
2722
2723 co = qemu_coroutine_create(bdrv_rw_co_entry);
2724 qemu_coroutine_enter(co, &rwco);
2725 while (rwco.ret == NOT_DONE) {
2726 aio_poll(aio_context, true);
2727 }
2728 }
2729 return rwco.ret;
2730 }
2731
2732 /*
2733 * Process a synchronous request using coroutines
2734 */
2735 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2736 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2737 {
2738 QEMUIOVector qiov;
2739 struct iovec iov = {
2740 .iov_base = (void *)buf,
2741 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2742 };
2743
2744 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2745 return -EINVAL;
2746 }
2747
2748 qemu_iovec_init_external(&qiov, &iov, 1);
2749 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2750 &qiov, is_write, flags);
2751 }
2752
2753 /* return < 0 if error. See bdrv_write() for the return codes */
2754 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2755 uint8_t *buf, int nb_sectors)
2756 {
2757 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2758 }
2759
2760 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2761 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2762 uint8_t *buf, int nb_sectors)
2763 {
2764 bool enabled;
2765 int ret;
2766
2767 enabled = bs->io_limits_enabled;
2768 bs->io_limits_enabled = false;
2769 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2770 bs->io_limits_enabled = enabled;
2771 return ret;
2772 }
2773
2774 /* Return < 0 if error. Important errors are:
2775 -EIO generic I/O error (may happen for all errors)
2776 -ENOMEDIUM No media inserted.
2777 -EINVAL Invalid sector number or nb_sectors
2778 -EACCES Trying to write to a read-only device
2779 */
2780 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2781 const uint8_t *buf, int nb_sectors)
2782 {
2783 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2784 }
2785
2786 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2787 int nb_sectors, BdrvRequestFlags flags)
2788 {
2789 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2790 BDRV_REQ_ZERO_WRITE | flags);
2791 }
2792
2793 /*
2794 * Completely zero out a block device with the help of bdrv_write_zeroes.
2795 * The operation is sped up by checking the block status and only writing
2796 * zeroes to sectors that do not already read back as zeroes. Optional
2797 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2798 *
2799 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2800 */
2801 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2802 {
2803 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2804 int n;
2805
2806 target_sectors = bdrv_nb_sectors(bs);
2807 if (target_sectors < 0) {
2808 return target_sectors;
2809 }
2810
2811 for (;;) {
2812 nb_sectors = target_sectors - sector_num;
2813 if (nb_sectors <= 0) {
2814 return 0;
2815 }
2816 if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2817 nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2818 }
2819 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2820 if (ret < 0) {
2821 error_report("error getting block status at sector %" PRId64 ": %s",
2822 sector_num, strerror(-ret));
2823 return ret;
2824 }
2825 if (ret & BDRV_BLOCK_ZERO) {
2826 sector_num += n;
2827 continue;
2828 }
2829 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2830 if (ret < 0) {
2831 error_report("error writing zeroes at sector %" PRId64 ": %s",
2832 sector_num, strerror(-ret));
2833 return ret;
2834 }
2835 sector_num += n;
2836 }
2837 }
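
/* A minimal usage sketch: fully zero a device while allowing the driver
 * to discard blocks instead of writing them, where the image permits it. */
static int example_wipe_device(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}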
2838
2839 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2840 {
2841 QEMUIOVector qiov;
2842 struct iovec iov = {
2843 .iov_base = (void *)buf,
2844 .iov_len = bytes,
2845 };
2846 int ret;
2847
2848 if (bytes < 0) {
2849 return -EINVAL;
2850 }
2851
2852 qemu_iovec_init_external(&qiov, &iov, 1);
2853 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2854 if (ret < 0) {
2855 return ret;
2856 }
2857
2858 return bytes;
2859 }
2860
2861 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2862 {
2863 int ret;
2864
2865 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2866 if (ret < 0) {
2867 return ret;
2868 }
2869
2870 return qiov->size;
2871 }
2872
2873 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2874 const void *buf, int bytes)
2875 {
2876 QEMUIOVector qiov;
2877 struct iovec iov = {
2878 .iov_base = (void *) buf,
2879 .iov_len = bytes,
2880 };
2881
2882 if (bytes < 0) {
2883 return -EINVAL;
2884 }
2885
2886 qemu_iovec_init_external(&qiov, &iov, 1);
2887 return bdrv_pwritev(bs, offset, &qiov);
2888 }
2889
2890 /*
2891 * Writes to the file and ensures that no writes are reordered across this
2892 * request (acts as a barrier)
2893 *
2894 * Returns 0 on success, -errno in error cases.
2895 */
2896 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2897 const void *buf, int count)
2898 {
2899 int ret;
2900
2901 ret = bdrv_pwrite(bs, offset, buf, count);
2902 if (ret < 0) {
2903 return ret;
2904 }
2905
2906 /* No flush needed for cache modes that already do it */
2907 if (bs->enable_write_cache) {
2908 bdrv_flush(bs);
2909 }
2910
2911 return 0;
2912 }
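
/* A minimal sketch of the barrier semantics: format drivers use
 * bdrv_pwrite_sync() for metadata that later writes must not overtake.
 * The header offset and magic value here are purely hypothetical. */
static int example_update_header(BlockDriverState *bs)
{
    uint32_t magic = cpu_to_be32(0x51454d55);   /* hypothetical field */

    /* The flush inside bdrv_pwrite_sync() ensures no later write can
     * reach the disk before this header update. */
    return bdrv_pwrite_sync(bs, 0, &magic, sizeof(magic));
}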
2913
2914 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2915 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2916 {
2917 /* Perform I/O through a temporary buffer so that users who scribble over
2918 * their read buffer while the operation is in progress do not end up
2919 * modifying the image file. This is critical for zero-copy guest I/O
2920 * where anything might happen inside guest memory.
2921 */
2922 void *bounce_buffer;
2923
2924 BlockDriver *drv = bs->drv;
2925 struct iovec iov;
2926 QEMUIOVector bounce_qiov;
2927 int64_t cluster_sector_num;
2928 int cluster_nb_sectors;
2929 size_t skip_bytes;
2930 int ret;
2931
2932 /* Cover entire cluster so no additional backing file I/O is required when
2933 * allocating the cluster in the image file.
2934 */
2935 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2936 &cluster_sector_num, &cluster_nb_sectors);
2937
2938 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2939 cluster_sector_num, cluster_nb_sectors);
2940
2941 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2942 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2943 if (bounce_buffer == NULL) {
2944 ret = -ENOMEM;
2945 goto err;
2946 }
2947
2948 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2949
2950 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2951 &bounce_qiov);
2952 if (ret < 0) {
2953 goto err;
2954 }
2955
2956 if (drv->bdrv_co_write_zeroes &&
2957 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2958 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2959 cluster_nb_sectors, 0);
2960 } else {
2961 /* This does not change the data on the disk, so it is not necessary
2962 * to flush even in cache=writethrough mode.
2963 */
2964 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2965 &bounce_qiov);
2966 }
2967
2968 if (ret < 0) {
2969 /* It might be okay to ignore write errors for guest requests. If this
2970 * is a deliberate copy-on-read then we don't want to ignore the error.
2971 * Simply report it in all cases.
2972 */
2973 goto err;
2974 }
2975
2976 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2977 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2978 nb_sectors * BDRV_SECTOR_SIZE);
2979
2980 err:
2981 qemu_vfree(bounce_buffer);
2982 return ret;
2983 }
2984
2985 /*
2986 * Forwards an already correctly aligned request to the BlockDriver. This
2987 * handles copy on read and zeroing after EOF; any other features must be
2988 * implemented by the caller.
2989 */
2990 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2991 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2992 int64_t align, QEMUIOVector *qiov, int flags)
2993 {
2994 BlockDriver *drv = bs->drv;
2995 int ret;
2996
2997 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2998 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2999
3000 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3001 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3002 assert(!qiov || bytes == qiov->size);
3003
3004 /* Handle Copy on Read and associated serialisation */
3005 if (flags & BDRV_REQ_COPY_ON_READ) {
3006 /* If we touch the same cluster it counts as an overlap. This
3007 * guarantees that allocating writes will be serialized and not race
3008 * with each other for the same cluster. For example, in copy-on-read
3009 * it ensures that the CoR read and write operations are atomic and
3010 * guest writes cannot interleave between them. */
3011 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3012 }
3013
3014 wait_serialising_requests(req);
3015
3016 if (flags & BDRV_REQ_COPY_ON_READ) {
3017 int pnum;
3018
3019 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3020 if (ret < 0) {
3021 goto out;
3022 }
3023
3024 if (!ret || pnum != nb_sectors) {
3025 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3026 goto out;
3027 }
3028 }
3029
3030 /* Forward the request to the BlockDriver */
3031 if (!(bs->zero_beyond_eof && bs->growable)) {
3032 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3033 } else {
3034 /* Read zeros after EOF of growable BDSes */
3035 int64_t total_sectors, max_nb_sectors;
3036
3037 total_sectors = bdrv_nb_sectors(bs);
3038 if (total_sectors < 0) {
3039 ret = total_sectors;
3040 goto out;
3041 }
3042
3043 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3044 align >> BDRV_SECTOR_BITS);
3045 if (max_nb_sectors > 0) {
3046 QEMUIOVector local_qiov;
3047 size_t local_sectors;
3048
3049 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3050 local_sectors = MIN(max_nb_sectors, nb_sectors);
3051
3052 qemu_iovec_init(&local_qiov, qiov->niov);
3053 qemu_iovec_concat(&local_qiov, qiov, 0,
3054 local_sectors * BDRV_SECTOR_SIZE);
3055
3056 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3057 &local_qiov);
3058
3059 qemu_iovec_destroy(&local_qiov);
3060 } else {
3061 ret = 0;
3062 }
3063
3064 /* Reading beyond end of file is supposed to produce zeroes */
3065 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3066 uint64_t offset = MAX(0, total_sectors - sector_num);
3067 uint64_t bytes = (sector_num + nb_sectors - offset) *
3068 BDRV_SECTOR_SIZE;
3069 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3070 }
3071 }
3072
3073 out:
3074 return ret;
3075 }
3076
3077 /*
3078 * Handle a read request in coroutine context
3079 */
3080 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3081 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3082 BdrvRequestFlags flags)
3083 {
3084 BlockDriver *drv = bs->drv;
3085 BdrvTrackedRequest req;
3086
3087 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3088 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3089 uint8_t *head_buf = NULL;
3090 uint8_t *tail_buf = NULL;
3091 QEMUIOVector local_qiov;
3092 bool use_local_qiov = false;
3093 int ret;
3094
3095 if (!drv) {
3096 return -ENOMEDIUM;
3097 }
3098 if (bdrv_check_byte_request(bs, offset, bytes)) {
3099 return -EIO;
3100 }
3101
3102 if (bs->copy_on_read) {
3103 flags |= BDRV_REQ_COPY_ON_READ;
3104 }
3105
3106 /* throttling disk I/O */
3107 if (bs->io_limits_enabled) {
3108 bdrv_io_limits_intercept(bs, bytes, false);
3109 }
3110
3111 /* Align read if necessary by padding qiov */
3112 if (offset & (align - 1)) {
3113 head_buf = qemu_blockalign(bs, align);
3114 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3115 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3116 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3117 use_local_qiov = true;
3118
3119 bytes += offset & (align - 1);
3120 offset = offset & ~(align - 1);
3121 }
3122
3123 if ((offset + bytes) & (align - 1)) {
3124 if (!use_local_qiov) {
3125 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3126 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3127 use_local_qiov = true;
3128 }
3129 tail_buf = qemu_blockalign(bs, align);
3130 qemu_iovec_add(&local_qiov, tail_buf,
3131 align - ((offset + bytes) & (align - 1)));
3132
3133 bytes = ROUND_UP(bytes, align);
3134 }
3135
3136 tracked_request_begin(&req, bs, offset, bytes, false);
3137 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3138 use_local_qiov ? &local_qiov : qiov,
3139 flags);
3140 tracked_request_end(&req);
3141
3142 if (use_local_qiov) {
3143 qemu_iovec_destroy(&local_qiov);
3144 qemu_vfree(head_buf);
3145 qemu_vfree(tail_buf);
3146 }
3147
3148 return ret;
3149 }
3150
3151 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3152 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3153 BdrvRequestFlags flags)
3154 {
3155 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3156 return -EINVAL;
3157 }
3158
3159 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3160 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3161 }
3162
3163 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3164 int nb_sectors, QEMUIOVector *qiov)
3165 {
3166 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3167
3168 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3169 }
3170
3171 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3172 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3173 {
3174 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3175
3176 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3177 BDRV_REQ_COPY_ON_READ);
3178 }
3179
3180 /* if no limit is specified in the BlockLimits use a default
3181 * of 32768 512-byte sectors (16 MiB) per request.
3182 */
3183 #define MAX_WRITE_ZEROES_DEFAULT 32768
3184
3185 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3186 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3187 {
3188 BlockDriver *drv = bs->drv;
3189 QEMUIOVector qiov;
3190 struct iovec iov = {0};
3191 int ret = 0;
3192
3193 int max_write_zeroes = bs->bl.max_write_zeroes ?
3194 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3195
3196 while (nb_sectors > 0 && !ret) {
3197 int num = nb_sectors;
3198
3199 /* Align request. Block drivers can expect the "bulk" of the request
3200 * to be aligned.
3201 */
3202 if (bs->bl.write_zeroes_alignment
3203 && num > bs->bl.write_zeroes_alignment) {
3204 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3205 /* Make a small request up to the first aligned sector. */
3206 num = bs->bl.write_zeroes_alignment;
3207 num -= sector_num % bs->bl.write_zeroes_alignment;
3208 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3209 /* Shorten the request to the last aligned sector. num cannot
3210 * underflow because num > bs->bl.write_zeroes_alignment.
3211 */
3212 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3213 }
3214 }
3215
3216 /* limit request size */
3217 if (num > max_write_zeroes) {
3218 num = max_write_zeroes;
3219 }
3220
3221 ret = -ENOTSUP;
3222 /* First try the efficient write zeroes operation */
3223 if (drv->bdrv_co_write_zeroes) {
3224 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3225 }
3226
3227 if (ret == -ENOTSUP) {
3228 /* Fall back to bounce buffer if write zeroes is unsupported */
3229 iov.iov_len = num * BDRV_SECTOR_SIZE;
3230 if (iov.iov_base == NULL) {
3231 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3232 if (iov.iov_base == NULL) {
3233 ret = -ENOMEM;
3234 goto fail;
3235 }
3236 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3237 }
3238 qemu_iovec_init_external(&qiov, &iov, 1);
3239
3240 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3241
3242 /* Keep bounce buffer around if it is big enough for all
3243 * future requests.
3244 */
3245 if (num < max_write_zeroes) {
3246 qemu_vfree(iov.iov_base);
3247 iov.iov_base = NULL;
3248 }
3249 }
3250
3251 sector_num += num;
3252 nb_sectors -= num;
3253 }
3254
3255 fail:
3256 qemu_vfree(iov.iov_base);
3257 return ret;
3258 }
3259
3260 /*
3261 * Forwards an already correctly aligned write request to the BlockDriver.
3262 */
3263 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3264 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3265 QEMUIOVector *qiov, int flags)
3266 {
3267 BlockDriver *drv = bs->drv;
3268 bool waited;
3269 int ret;
3270
3271 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3272 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3273
3274 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3275 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3276 assert(!qiov || bytes == qiov->size);
3277
3278 waited = wait_serialising_requests(req);
3279 assert(!waited || !req->serialising);
3280 assert(req->overlap_offset <= offset);
3281 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3282
3283 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3284
3285 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3286 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3287 qemu_iovec_is_zero(qiov)) {
3288 flags |= BDRV_REQ_ZERO_WRITE;
3289 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3290 flags |= BDRV_REQ_MAY_UNMAP;
3291 }
3292 }
3293
3294 if (ret < 0) {
3295 /* Do nothing, write notifier decided to fail this request */
3296 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3297 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3298 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3299 } else {
3300 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3301 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3302 }
3303 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3304
3305 if (ret == 0 && !bs->enable_write_cache) {
3306 ret = bdrv_co_flush(bs);
3307 }
3308
3309 bdrv_set_dirty(bs, sector_num, nb_sectors);
3310
3311 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3312
3313 if (bs->growable && ret >= 0) {
3314 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3315 }
3316
3317 return ret;
3318 }
3319
3320 /*
3321 * Handle a write request in coroutine context
3322 */
3323 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3324 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3325 BdrvRequestFlags flags)
3326 {
3327 BdrvTrackedRequest req;
3328 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3329 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3330 uint8_t *head_buf = NULL;
3331 uint8_t *tail_buf = NULL;
3332 QEMUIOVector local_qiov;
3333 bool use_local_qiov = false;
3334 int ret;
3335
3336 if (!bs->drv) {
3337 return -ENOMEDIUM;
3338 }
3339 if (bs->read_only) {
3340 return -EACCES;
3341 }
3342 if (bdrv_check_byte_request(bs, offset, bytes)) {
3343 return -EIO;
3344 }
3345
3346 /* throttling disk I/O */
3347 if (bs->io_limits_enabled) {
3348 bdrv_io_limits_intercept(bs, bytes, true);
3349 }
3350
3351 /*
3352 * Align write if necessary by performing a read-modify-write cycle.
3353 * Pad qiov with the read parts and be sure to have a tracked request not
3354 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3355 */
3356 tracked_request_begin(&req, bs, offset, bytes, true);
3357
3358 if (offset & (align - 1)) {
3359 QEMUIOVector head_qiov;
3360 struct iovec head_iov;
3361
3362 mark_request_serialising(&req, align);
3363 wait_serialising_requests(&req);
3364
3365 head_buf = qemu_blockalign(bs, align);
3366 head_iov = (struct iovec) {
3367 .iov_base = head_buf,
3368 .iov_len = align,
3369 };
3370 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3371
3372 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3373 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3374 align, &head_qiov, 0);
3375 if (ret < 0) {
3376 goto fail;
3377 }
3378 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3379
3380 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3381 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3382 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3383 use_local_qiov = true;
3384
3385 bytes += offset & (align - 1);
3386 offset = offset & ~(align - 1);
3387 }
3388
3389 if ((offset + bytes) & (align - 1)) {
3390 QEMUIOVector tail_qiov;
3391 struct iovec tail_iov;
3392 size_t tail_bytes;
3393 bool waited;
3394
3395 mark_request_serialising(&req, align);
3396 waited = wait_serialising_requests(&req);
3397 assert(!waited || !use_local_qiov);
3398
3399 tail_buf = qemu_blockalign(bs, align);
3400 tail_iov = (struct iovec) {
3401 .iov_base = tail_buf,
3402 .iov_len = align,
3403 };
3404 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3405
3406 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3407 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3408 align, &tail_qiov, 0);
3409 if (ret < 0) {
3410 goto fail;
3411 }
3412 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3413
3414 if (!use_local_qiov) {
3415 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3416 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3417 use_local_qiov = true;
3418 }
3419
3420 tail_bytes = (offset + bytes) & (align - 1);
3421 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3422
3423 bytes = ROUND_UP(bytes, align);
3424 }
3425
3426 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3427 use_local_qiov ? &local_qiov : qiov,
3428 flags);
3429
3430 fail:
3431 tracked_request_end(&req);
3432
3433 if (use_local_qiov) {
3434 qemu_iovec_destroy(&local_qiov);
3435 }
3436 qemu_vfree(head_buf);
3437 qemu_vfree(tail_buf);
3438
3439 return ret;
3440 }
3441
3442 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3443 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3444 BdrvRequestFlags flags)
3445 {
3446 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3447 return -EINVAL;
3448 }
3449
3450 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3451 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3452 }
3453
3454 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3455 int nb_sectors, QEMUIOVector *qiov)
3456 {
3457 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3458
3459 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3460 }
3461
3462 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3463 int64_t sector_num, int nb_sectors,
3464 BdrvRequestFlags flags)
3465 {
3466 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3467
3468 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3469 flags &= ~BDRV_REQ_MAY_UNMAP;
3470 }
3471
3472 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3473 BDRV_REQ_ZERO_WRITE | flags);
3474 }
3475
3476 /**
3477 * Truncate file to 'offset' bytes (needed only for file protocols)
3478 */
3479 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3480 {
3481 BlockDriver *drv = bs->drv;
3482 int ret;
3483 if (!drv)
3484 return -ENOMEDIUM;
3485 if (!drv->bdrv_truncate)
3486 return -ENOTSUP;
3487 if (bs->read_only)
3488 return -EACCES;
3489
3490 ret = drv->bdrv_truncate(bs, offset);
3491 if (ret == 0) {
3492 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3493 if (bs->blk) {
3494 blk_dev_resize_cb(bs->blk);
3495 }
3496 }
3497 return ret;
3498 }
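
/* A minimal sketch: grow an image by 'grow_bytes', reusing
 * bdrv_getlength() to compute the new size. */
static int example_grow_image(BlockDriverState *bs, int64_t grow_bytes)
{
    int64_t len = bdrv_getlength(bs);

    if (len < 0) {
        return len;                 /* propagate -errno from getlength */
    }
    return bdrv_truncate(bs, len + grow_bytes);
}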
3499
3500 /**
3501 * Length of an allocated file in bytes. Sparse files are counted by actual
3502 * allocated space. Return < 0 if error or unknown.
3503 */
3504 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3505 {
3506 BlockDriver *drv = bs->drv;
3507 if (!drv) {
3508 return -ENOMEDIUM;
3509 }
3510 if (drv->bdrv_get_allocated_file_size) {
3511 return drv->bdrv_get_allocated_file_size(bs);
3512 }
3513 if (bs->file) {
3514 return bdrv_get_allocated_file_size(bs->file);
3515 }
3516 return -ENOTSUP;
3517 }
3518
3519 /**
3520 * Return number of sectors on success, -errno on error.
3521 */
3522 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3523 {
3524 BlockDriver *drv = bs->drv;
3525
3526 if (!drv)
3527 return -ENOMEDIUM;
3528
3529 if (drv->has_variable_length) {
3530 int ret = refresh_total_sectors(bs, bs->total_sectors);
3531 if (ret < 0) {
3532 return ret;
3533 }
3534 }
3535 return bs->total_sectors;
3536 }
3537
3538 /**
3539 * Return length in bytes on success, -errno on error.
3540 * The length is always a multiple of BDRV_SECTOR_SIZE.
3541 */
3542 int64_t bdrv_getlength(BlockDriverState *bs)
3543 {
3544 int64_t ret = bdrv_nb_sectors(bs);
3545
3546 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3547 }
3548
3549 /* return 0 as number of sectors if no device present or error */
3550 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3551 {
3552 int64_t nb_sectors = bdrv_nb_sectors(bs);
3553
3554 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3555 }
3556
3557 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3558 BlockdevOnError on_write_error)
3559 {
3560 bs->on_read_error = on_read_error;
3561 bs->on_write_error = on_write_error;
3562 }
3563
3564 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3565 {
3566 return is_read ? bs->on_read_error : bs->on_write_error;
3567 }
3568
3569 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3570 {
3571 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3572
3573 switch (on_err) {
3574 case BLOCKDEV_ON_ERROR_ENOSPC:
3575 return (error == ENOSPC) ?
3576 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3577 case BLOCKDEV_ON_ERROR_STOP:
3578 return BLOCK_ERROR_ACTION_STOP;
3579 case BLOCKDEV_ON_ERROR_REPORT:
3580 return BLOCK_ERROR_ACTION_REPORT;
3581 case BLOCKDEV_ON_ERROR_IGNORE:
3582 return BLOCK_ERROR_ACTION_IGNORE;
3583 default:
3584 abort();
3585 }
3586 }
3587
3588 static void send_qmp_error_event(BlockDriverState *bs,
3589 BlockErrorAction action,
3590 bool is_read, int error)
3591 {
3592 IoOperationType optype;
3593
3594 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3595 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3596 bdrv_iostatus_is_enabled(bs),
3597 error == ENOSPC, strerror(error),
3598 &error_abort);
3599 }
3600
3601 /* This is done by device models because, while the block layer knows
3602 * about the error, it does not know whether an operation comes from
3603 * the device or the block layer (from a job, for example).
3604 */
3605 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3606 bool is_read, int error)
3607 {
3608 assert(error >= 0);
3609
3610 if (action == BLOCK_ERROR_ACTION_STOP) {
3611 /* First set the iostatus, so that "info block" returns an iostatus
3612 * that matches the events raised so far (an additional error iostatus
3613 * is fine, but not a lost one).
3614 */
3615 bdrv_iostatus_set_err(bs, error);
3616
3617 /* Then raise the request to stop the VM and the event.
3618 * qemu_system_vmstop_request_prepare has two effects. First,
3619 * it ensures that the STOP event always comes after the
3620 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3621 * can observe the STOP event and do a "cont" before the STOP
3622 * event is issued, the VM will not stop. In this case, vm_start()
3623 * also ensures that the STOP/RESUME pair of events is emitted.
3624 */
3625 qemu_system_vmstop_request_prepare();
3626 send_qmp_error_event(bs, action, is_read, error);
3627 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3628 } else {
3629 send_qmp_error_event(bs, action, is_read, error);
3630 }
3631 }
3632
3633 int bdrv_is_read_only(BlockDriverState *bs)
3634 {
3635 return bs->read_only;
3636 }
3637
3638 int bdrv_is_sg(BlockDriverState *bs)
3639 {
3640 return bs->sg;
3641 }
3642
3643 int bdrv_enable_write_cache(BlockDriverState *bs)
3644 {
3645 return bs->enable_write_cache;
3646 }
3647
3648 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3649 {
3650 bs->enable_write_cache = wce;
3651
3652 /* so a reopen() will preserve wce */
3653 if (wce) {
3654 bs->open_flags |= BDRV_O_CACHE_WB;
3655 } else {
3656 bs->open_flags &= ~BDRV_O_CACHE_WB;
3657 }
3658 }
3659
3660 int bdrv_is_encrypted(BlockDriverState *bs)
3661 {
3662 if (bs->backing_hd && bs->backing_hd->encrypted)
3663 return 1;
3664 return bs->encrypted;
3665 }
3666
3667 int bdrv_key_required(BlockDriverState *bs)
3668 {
3669 BlockDriverState *backing_hd = bs->backing_hd;
3670
3671 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3672 return 1;
3673 return (bs->encrypted && !bs->valid_key);
3674 }
3675
3676 int bdrv_set_key(BlockDriverState *bs, const char *key)
3677 {
3678 int ret;
3679 if (bs->backing_hd && bs->backing_hd->encrypted) {
3680 ret = bdrv_set_key(bs->backing_hd, key);
3681 if (ret < 0)
3682 return ret;
3683 if (!bs->encrypted)
3684 return 0;
3685 }
3686 if (!bs->encrypted) {
3687 return -EINVAL;
3688 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3689 return -ENOMEDIUM;
3690 }
3691 ret = bs->drv->bdrv_set_key(bs, key);
3692 if (ret < 0) {
3693 bs->valid_key = 0;
3694 } else if (!bs->valid_key) {
3695 bs->valid_key = 1;
3696 if (bs->blk) {
3697 /* call the change callback now, we skipped it on open */
3698 blk_dev_change_media_cb(bs->blk, true);
3699 }
3700 }
3701 return ret;
3702 }
3703
3704 const char *bdrv_get_format_name(BlockDriverState *bs)
3705 {
3706 return bs->drv ? bs->drv->format_name : NULL;
3707 }
3708
3709 static int qsort_strcmp(const void *a, const void *b)
3710 {
3711 return strcmp(*(const char *const *)a, *(const char *const *)b);
3712 }
3713
3714 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3715 void *opaque)
3716 {
3717 BlockDriver *drv;
3718 int count = 0;
3719 int i;
3720 const char **formats = NULL;
3721
3722 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3723 if (drv->format_name) {
3724 bool found = false;
3725 int i = count;
3726 while (formats && i && !found) {
3727 found = !strcmp(formats[--i], drv->format_name);
3728 }
3729
3730 if (!found) {
3731 formats = g_renew(const char *, formats, count + 1);
3732 formats[count++] = drv->format_name;
3733 }
3734 }
3735 }
3736
3737 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3738
3739 for (i = 0; i < count; i++) {
3740 it(opaque, formats[i]);
3741 }
3742
3743 g_free(formats);
3744 }
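
/* A minimal sketch of the iterator above: print every known format name,
 * using the opaque pointer to carry the output stream. */
static void example_print_format(void *opaque, const char *name)
{
    fprintf(opaque, "%s\n", name);
}

static void example_list_formats(void)
{
    bdrv_iterate_format(example_print_format, stdout);
}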
3745
3746 /* Find the BlockDriverState attached to the named block backend */
3747 /* TODO convert callers to blk_by_name(), then remove */
3748 BlockDriverState *bdrv_find(const char *name)
3749 {
3750 BlockBackend *blk = blk_by_name(name);
3751
3752 return blk ? blk_bs(blk) : NULL;
3753 }
3754
3755 /* Find a node in the BDS graph by its node name */
3756 BlockDriverState *bdrv_find_node(const char *node_name)
3757 {
3758 BlockDriverState *bs;
3759
3760 assert(node_name);
3761
3762 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3763 if (!strcmp(node_name, bs->node_name)) {
3764 return bs;
3765 }
3766 }
3767 return NULL;
3768 }
3769
3770 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3771 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3772 {
3773 BlockDeviceInfoList *list, *entry;
3774 BlockDriverState *bs;
3775
3776 list = NULL;
3777 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3778 entry = g_malloc0(sizeof(*entry));
3779 entry->value = bdrv_block_device_info(bs);
3780 entry->next = list;
3781 list = entry;
3782 }
3783
3784 return list;
3785 }
3786
3787 BlockDriverState *bdrv_lookup_bs(const char *device,
3788 const char *node_name,
3789 Error **errp)
3790 {
3791 BlockBackend *blk;
3792 BlockDriverState *bs;
3793
3794 if (device) {
3795 blk = blk_by_name(device);
3796
3797 if (blk) {
3798 return blk_bs(blk);
3799 }
3800 }
3801
3802 if (node_name) {
3803 bs = bdrv_find_node(node_name);
3804
3805 if (bs) {
3806 return bs;
3807 }
3808 }
3809
3810 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3811 device ? device : "",
3812 node_name ? node_name : "");
3813 return NULL;
3814 }
3815
3816 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3817 * return false. If either argument is NULL, return false. */
3818 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3819 {
3820 while (top && top != base) {
3821 top = top->backing_hd;
3822 }
3823
3824 return top != NULL;
3825 }
3826
3827 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3828 {
3829 if (!bs) {
3830 return QTAILQ_FIRST(&graph_bdrv_states);
3831 }
3832 return QTAILQ_NEXT(bs, node_list);
3833 }
3834
3835 BlockDriverState *bdrv_next(BlockDriverState *bs)
3836 {
3837 if (!bs) {
3838 return QTAILQ_FIRST(&bdrv_states);
3839 }
3840 return QTAILQ_NEXT(bs, device_list);
3841 }
3842
3843 const char *bdrv_get_node_name(const BlockDriverState *bs)
3844 {
3845 return bs->node_name;
3846 }
3847
3848 /* TODO check what callers really want: bs->node_name or blk_name() */
3849 const char *bdrv_get_device_name(const BlockDriverState *bs)
3850 {
3851 return bs->blk ? blk_name(bs->blk) : "";
3852 }
3853
3854 int bdrv_get_flags(BlockDriverState *bs)
3855 {
3856 return bs->open_flags;
3857 }
3858
3859 int bdrv_flush_all(void)
3860 {
3861 BlockDriverState *bs;
3862 int result = 0;
3863
3864 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3865 AioContext *aio_context = bdrv_get_aio_context(bs);
3866 int ret;
3867
3868 aio_context_acquire(aio_context);
3869 ret = bdrv_flush(bs);
3870 if (ret < 0 && !result) {
3871 result = ret;
3872 }
3873 aio_context_release(aio_context);
3874 }
3875
3876 return result;
3877 }
3878
3879 int bdrv_has_zero_init_1(BlockDriverState *bs)
3880 {
3881 return 1;
3882 }
3883
3884 int bdrv_has_zero_init(BlockDriverState *bs)
3885 {
3886 assert(bs->drv);
3887
3888 /* If BS is a copy on write image, it is initialized to
3889 the contents of the base image, which may not be zeroes. */
3890 if (bs->backing_hd) {
3891 return 0;
3892 }
3893 if (bs->drv->bdrv_has_zero_init) {
3894 return bs->drv->bdrv_has_zero_init(bs);
3895 }
3896
3897 /* safe default */
3898 return 0;
3899 }
3900
3901 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3902 {
3903 BlockDriverInfo bdi;
3904
3905 if (bs->backing_hd) {
3906 return false;
3907 }
3908
3909 if (bdrv_get_info(bs, &bdi) == 0) {
3910 return bdi.unallocated_blocks_are_zero;
3911 }
3912
3913 return false;
3914 }
3915
3916 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3917 {
3918 BlockDriverInfo bdi;
3919
3920 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3921 return false;
3922 }
3923
3924 if (bdrv_get_info(bs, &bdi) == 0) {
3925 return bdi.can_write_zeroes_with_unmap;
3926 }
3927
3928 return false;
3929 }
3930
3931 typedef struct BdrvCoGetBlockStatusData {
3932 BlockDriverState *bs;
3933 BlockDriverState *base;
3934 int64_t sector_num;
3935 int nb_sectors;
3936 int *pnum;
3937 int64_t ret;
3938 bool done;
3939 } BdrvCoGetBlockStatusData;
3940
3941 /*
3942 * Returns the allocation status of the specified sectors.
3943 * Drivers not implementing the functionality are assumed to not support
3944 * backing files, hence all their sectors are reported as allocated.
3945 *
3946 * If 'sector_num' is beyond the end of the disk image the return value is 0
3947 * and 'pnum' is set to 0.
3948 *
3949 * 'pnum' is set to the number of sectors (including and immediately following
3950 * the specified sector) that are known to be in the same
3951 * allocated/unallocated state.
3952 *
3953 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3954 * beyond the end of the disk image it will be clamped.
3955 */
3956 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3957 int64_t sector_num,
3958 int nb_sectors, int *pnum)
3959 {
3960 int64_t total_sectors;
3961 int64_t n;
3962 int64_t ret, ret2;
3963
3964 total_sectors = bdrv_nb_sectors(bs);
3965 if (total_sectors < 0) {
3966 return total_sectors;
3967 }
3968
3969 if (sector_num >= total_sectors) {
3970 *pnum = 0;
3971 return 0;
3972 }
3973
3974 n = total_sectors - sector_num;
3975 if (n < nb_sectors) {
3976 nb_sectors = n;
3977 }
3978
3979 if (!bs->drv->bdrv_co_get_block_status) {
3980 *pnum = nb_sectors;
3981 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3982 if (bs->drv->protocol_name) {
3983 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3984 }
3985 return ret;
3986 }
3987
3988 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3989 if (ret < 0) {
3990 *pnum = 0;
3991 return ret;
3992 }
3993
3994 if (ret & BDRV_BLOCK_RAW) {
3995 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3996 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3997 *pnum, pnum);
3998 }
3999
4000 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4001 ret |= BDRV_BLOCK_ALLOCATED;
4002 }
4003
4004 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4005 if (bdrv_unallocated_blocks_are_zero(bs)) {
4006 ret |= BDRV_BLOCK_ZERO;
4007 } else if (bs->backing_hd) {
4008 BlockDriverState *bs2 = bs->backing_hd;
4009 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4010 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4011 ret |= BDRV_BLOCK_ZERO;
4012 }
4013 }
4014 }
4015
4016 if (bs->file &&
4017 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4018 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4019 int file_pnum;
4020
4021 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4022 *pnum, &file_pnum);
4023 if (ret2 >= 0) {
4024 /* Ignore errors. This is just providing extra information; it
4025 * is useful but not necessary.
4026 */
4027 if (!file_pnum) {
4028 /* !file_pnum indicates an offset at or beyond the EOF; it is
4029 * perfectly valid for the format block driver to point to such
4030 * offsets, so catch it and mark everything as zero */
4031 ret |= BDRV_BLOCK_ZERO;
4032 } else {
4033 /* Limit request to the range reported by the protocol driver */
4034 *pnum = file_pnum;
4035 ret |= (ret2 & BDRV_BLOCK_ZERO);
4036 }
4037 }
4038 }
4039
4040 return ret;
4041 }
4042
4043 /* Coroutine wrapper for bdrv_get_block_status() */
4044 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4045 {
4046 BdrvCoGetBlockStatusData *data = opaque;
4047 BlockDriverState *bs = data->bs;
4048
4049 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4050 data->pnum);
4051 data->done = true;
4052 }
4053
4054 /*
4055 * Synchronous wrapper around bdrv_co_get_block_status().
4056 *
4057 * See bdrv_co_get_block_status() for details.
4058 */
4059 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4060 int nb_sectors, int *pnum)
4061 {
4062 Coroutine *co;
4063 BdrvCoGetBlockStatusData data = {
4064 .bs = bs,
4065 .sector_num = sector_num,
4066 .nb_sectors = nb_sectors,
4067 .pnum = pnum,
4068 .done = false,
4069 };
4070
4071 if (qemu_in_coroutine()) {
4072 /* Fast-path if already in coroutine context */
4073 bdrv_get_block_status_co_entry(&data);
4074 } else {
4075 AioContext *aio_context = bdrv_get_aio_context(bs);
4076
4077 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4078 qemu_coroutine_enter(co, &data);
4079 while (!data.done) {
4080 aio_poll(aio_context, true);
4081 }
4082 }
4083 return data.ret;
4084 }
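
/* A minimal sketch (hypothetical caller, illustrative names) of walking an
 * image's allocation map with the synchronous wrapper above; 'pnum' advances
 * the cursor by whole extents:
 *
 *     int64_t sector = 0, total = bdrv_nb_sectors(bs);
 *     while (sector < total) {
 *         int pnum;
 *         int64_t ret = bdrv_get_block_status(bs, sector,
 *                                             MIN(total - sector, INT_MAX),
 *                                             &pnum);
 *         if (ret < 0 || pnum == 0) {
 *             break;
 *         }
 *         // inspect ret & (BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_ZERO | ...)
 *         sector += pnum;
 *     }
 */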
4085
4086 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4087 int nb_sectors, int *pnum)
4088 {
4089 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4090 if (ret < 0) {
4091 return ret;
4092 }
4093 return !!(ret & BDRV_BLOCK_ALLOCATED);
4094 }
4095
4096 /*
4097 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4098 *
4099 * Return true if the given sector is allocated in any image between
4100 * BASE and TOP (TOP included, BASE excluded). BASE can be NULL to check if
4101 * the given sector is allocated in any image of the chain; return false otherwise.
4102 *
4103 * 'pnum' is set to the number of sectors (including and immediately following
4104 * the specified sector) that are known to be in the same
4105 * allocated/unallocated state.
4106 *
4107 */
4108 int bdrv_is_allocated_above(BlockDriverState *top,
4109 BlockDriverState *base,
4110 int64_t sector_num,
4111 int nb_sectors, int *pnum)
4112 {
4113 BlockDriverState *intermediate;
4114 int ret, n = nb_sectors;
4115
4116 intermediate = top;
4117 while (intermediate && intermediate != base) {
4118 int pnum_inter;
4119 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4120 &pnum_inter);
4121 if (ret < 0) {
4122 return ret;
4123 } else if (ret) {
4124 *pnum = pnum_inter;
4125 return 1;
4126 }
4127
4128 /*
4129 * [sector_num, nb_sectors] is unallocated on top but an intermediate
4130 * image might have
4131 *
4132 * [sector_num+x, nb_sectors-x] allocated.
4133 */
4134 if (n > pnum_inter &&
4135 (intermediate == top ||
4136 sector_num + pnum_inter < intermediate->total_sectors)) {
4137 n = pnum_inter;
4138 }
4139
4140 intermediate = intermediate->backing_hd;
4141 }
4142
4143 *pnum = n;
4144 return 0;
4145 }
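
/* A sketch of how a block-copy loop might use this (hypothetical caller;
 * 'top' and 'base' delimit the sub-chain being committed or streamed):
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector, n, &pnum);
 *     if (ret < 0) {
 *         // error
 *     } else if (ret) {
 *         // [sector, sector + pnum) carries data in the sub-chain: copy it
 *     } else {
 *         // nothing allocated above 'base' here: the range can be skipped
 *     }
 */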
4146
4147 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4148 {
4149 if (bs->backing_hd && bs->backing_hd->encrypted)
4150 return bs->backing_file;
4151 else if (bs->encrypted)
4152 return bs->filename;
4153 else
4154 return NULL;
4155 }
4156
4157 void bdrv_get_backing_filename(BlockDriverState *bs,
4158 char *filename, int filename_size)
4159 {
4160 pstrcpy(filename, filename_size, bs->backing_file);
4161 }
4162
4163 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4164 const uint8_t *buf, int nb_sectors)
4165 {
4166 BlockDriver *drv = bs->drv;
4167 if (!drv)
4168 return -ENOMEDIUM;
4169 if (!drv->bdrv_write_compressed)
4170 return -ENOTSUP;
4171 if (bdrv_check_request(bs, sector_num, nb_sectors))
4172 return -EIO;
4173
4174 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4175
4176 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4177 }
4178
4179 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4180 {
4181 BlockDriver *drv = bs->drv;
4182 if (!drv)
4183 return -ENOMEDIUM;
4184 if (!drv->bdrv_get_info)
4185 return -ENOTSUP;
4186 memset(bdi, 0, sizeof(*bdi));
4187 return drv->bdrv_get_info(bs, bdi);
4188 }
4189
4190 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4191 {
4192 BlockDriver *drv = bs->drv;
4193 if (drv && drv->bdrv_get_specific_info) {
4194 return drv->bdrv_get_specific_info(bs);
4195 }
4196 return NULL;
4197 }
4198
4199 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4200 int64_t pos, int size)
4201 {
4202 QEMUIOVector qiov;
4203 struct iovec iov = {
4204 .iov_base = (void *) buf,
4205 .iov_len = size,
4206 };
4207
4208 qemu_iovec_init_external(&qiov, &iov, 1);
4209 return bdrv_writev_vmstate(bs, &qiov, pos);
4210 }
4211
4212 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4213 {
4214 BlockDriver *drv = bs->drv;
4215
4216 if (!drv) {
4217 return -ENOMEDIUM;
4218 } else if (drv->bdrv_save_vmstate) {
4219 return drv->bdrv_save_vmstate(bs, qiov, pos);
4220 } else if (bs->file) {
4221 return bdrv_writev_vmstate(bs->file, qiov, pos);
4222 }
4223
4224 return -ENOTSUP;
4225 }
4226
4227 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4228 int64_t pos, int size)
4229 {
4230 BlockDriver *drv = bs->drv;
4231 if (!drv)
4232 return -ENOMEDIUM;
4233 if (drv->bdrv_load_vmstate)
4234 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4235 if (bs->file)
4236 return bdrv_load_vmstate(bs->file, buf, pos, size);
4237 return -ENOTSUP;
4238 }
4239
4240 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4241 {
4242 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4243 return;
4244 }
4245
4246 bs->drv->bdrv_debug_event(bs, event);
4247 }
4248
4249 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4250 const char *tag)
4251 {
4252 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4253 bs = bs->file;
4254 }
4255
4256 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4257 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4258 }
4259
4260 return -ENOTSUP;
4261 }
4262
4263 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4264 {
4265 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4266 bs = bs->file;
4267 }
4268
4269 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4270 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4271 }
4272
4273 return -ENOTSUP;
4274 }
4275
4276 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4277 {
4278 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4279 bs = bs->file;
4280 }
4281
4282 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4283 return bs->drv->bdrv_debug_resume(bs, tag);
4284 }
4285
4286 return -ENOTSUP;
4287 }
4288
4289 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4290 {
4291 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4292 bs = bs->file;
4293 }
4294
4295 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4296 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4297 }
4298
4299 return false;
4300 }
4301
4302 int bdrv_is_snapshot(BlockDriverState *bs)
4303 {
4304 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4305 }
4306
4307 /* backing_file can either be relative, or absolute, or a protocol. If it is
4308 * relative, it must be relative to the chain. So, passing in bs->filename
4309 * from a BDS as backing_file should not be done, as that may be relative to
4310 * the CWD rather than the chain. */
4311 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4312 const char *backing_file)
4313 {
4314 char *filename_full = NULL;
4315 char *backing_file_full = NULL;
4316 char *filename_tmp = NULL;
4317 int is_protocol = 0;
4318 BlockDriverState *curr_bs = NULL;
4319 BlockDriverState *retval = NULL;
4320
4321 if (!bs || !bs->drv || !backing_file) {
4322 return NULL;
4323 }
4324
4325 filename_full = g_malloc(PATH_MAX);
4326 backing_file_full = g_malloc(PATH_MAX);
4327 filename_tmp = g_malloc(PATH_MAX);
4328
4329 is_protocol = path_has_protocol(backing_file);
4330
4331 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4332
4333 /* If either of the filename paths is actually a protocol, then
4334 * compare unmodified paths; otherwise make paths relative */
4335 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4336 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4337 retval = curr_bs->backing_hd;
4338 break;
4339 }
4340 } else {
4341 /* If not an absolute filename path, make it relative to the current
4342 * image's filename path */
4343 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4344 backing_file);
4345
4346 /* We are going to compare absolute pathnames */
4347 if (!realpath(filename_tmp, filename_full)) {
4348 continue;
4349 }
4350
4351 /* We need to make sure the backing filename we are comparing against
4352 * is relative to the current image filename (or absolute) */
4353 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4354 curr_bs->backing_file);
4355
4356 if (!realpath(filename_tmp, backing_file_full)) {
4357 continue;
4358 }
4359
4360 if (strcmp(backing_file_full, filename_full) == 0) {
4361 retval = curr_bs->backing_hd;
4362 break;
4363 }
4364 }
4365 }
4366
4367 g_free(filename_full);
4368 g_free(backing_file_full);
4369 g_free(filename_tmp);
4370 return retval;
4371 }
4372
4373 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4374 {
4375 if (!bs->drv) {
4376 return 0;
4377 }
4378
4379 if (!bs->backing_hd) {
4380 return 0;
4381 }
4382
4383 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4384 }
4385
4386 /**************************************************************/
4387 /* async I/Os */
4388
4389 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4390 QEMUIOVector *qiov, int nb_sectors,
4391 BlockCompletionFunc *cb, void *opaque)
4392 {
4393 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4394
4395 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4396 cb, opaque, false);
4397 }
4398
4399 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4400 QEMUIOVector *qiov, int nb_sectors,
4401 BlockCompletionFunc *cb, void *opaque)
4402 {
4403 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4404
4405 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4406 cb, opaque, true);
4407 }
4408
4409 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4410 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4411 BlockCompletionFunc *cb, void *opaque)
4412 {
4413 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4414
4415 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4416 BDRV_REQ_ZERO_WRITE | flags,
4417 cb, opaque, true);
4418 }
4419
4420
4421 typedef struct MultiwriteCB {
4422 int error;
4423 int num_requests;
4424 int num_callbacks;
4425 struct {
4426 BlockCompletionFunc *cb;
4427 void *opaque;
4428 QEMUIOVector *free_qiov;
4429 } callbacks[];
4430 } MultiwriteCB;
4431
4432 static void multiwrite_user_cb(MultiwriteCB *mcb)
4433 {
4434 int i;
4435
4436 for (i = 0; i < mcb->num_callbacks; i++) {
4437 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4438 if (mcb->callbacks[i].free_qiov) {
4439 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4440 }
4441 g_free(mcb->callbacks[i].free_qiov);
4442 }
4443 }
4444
4445 static void multiwrite_cb(void *opaque, int ret)
4446 {
4447 MultiwriteCB *mcb = opaque;
4448
4449 trace_multiwrite_cb(mcb, ret);
4450
4451 if (ret < 0 && !mcb->error) {
4452 mcb->error = ret;
4453 }
4454
4455 mcb->num_requests--;
4456 if (mcb->num_requests == 0) {
4457 multiwrite_user_cb(mcb);
4458 g_free(mcb);
4459 }
4460 }
4461
4462 static int multiwrite_req_compare(const void *a, const void *b)
4463 {
4464 const BlockRequest *req1 = a, *req2 = b;
4465
4466 /*
4467 * Note that we can't simply subtract req2->sector from req1->sector
4468 * here as that could overflow the return value.
4469 */
4470 if (req1->sector > req2->sector) {
4471 return 1;
4472 } else if (req1->sector < req2->sector) {
4473 return -1;
4474 } else {
4475 return 0;
4476 }
4477 }
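
/* Worked example of the overflow noted above (illustrative values): with
 * req1->sector == 0 and req2->sector == 0x100000000 (2^32), a naive
 * 'return req1->sector - req2->sector' truncated to int would yield 0,
 * falsely reporting the requests as equal; the explicit comparisons above
 * avoid that.
 */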
4478
4479 /*
4480 * Takes a bunch of requests and tries to merge them. Returns the number of
4481 * requests that remain after merging.
4482 */
4483 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4484 int num_reqs, MultiwriteCB *mcb)
4485 {
4486 int i, outidx;
4487
4488 // Sort requests by start sector
4489 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4490
4491 // Check if adjacent requests touch the same clusters. If so, combine them;
4492 // only sequential or overlapping requests are merged, so no gaps arise.
4493 outidx = 0;
4494 for (i = 1; i < num_reqs; i++) {
4495 int merge = 0;
4496 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4497
4498 // Handle exactly sequential writes and overlapping writes.
4499 if (reqs[i].sector <= oldreq_last) {
4500 merge = 1;
4501 }
4502
4503 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4504 merge = 0;
4505 }
4506
4507 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4508 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4509 merge = 0;
4510 }
4511
4512 if (merge) {
4513 size_t size;
4514 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4515 qemu_iovec_init(qiov,
4516 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4517
4518 // Add the first request to the merged one. If the requests are
4519 // overlapping, drop the last sectors of the first request.
4520 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4521 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4522
4523 // We should not need to add any zeros between the two requests
4524 assert(reqs[i].sector <= oldreq_last);
4525
4526 // Add the second request
4527 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4528
4529 // Add tail of first request, if necessary
4530 if (qiov->size < reqs[outidx].qiov->size) {
4531 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4532 reqs[outidx].qiov->size - qiov->size);
4533 }
4534
4535 reqs[outidx].nb_sectors = qiov->size >> 9;
4536 reqs[outidx].qiov = qiov;
4537
4538 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4539 } else {
4540 outidx++;
4541 reqs[outidx].sector = reqs[i].sector;
4542 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4543 reqs[outidx].qiov = reqs[i].qiov;
4544 }
4545 }
4546
4547 return outidx + 1;
4548 }
4549
4550 /*
4551 * Submit multiple AIO write requests at once.
4552 *
4553 * On success, the function returns 0 and all requests in the reqs array have
4554 * been submitted. On error, this function returns -1 and any of the
4555 * requests may or may not have been submitted yet. In particular, the
4556 * callback will be called for some of the requests but not for others. The
4557 * caller must check the error field of each BlockRequest to know which
4558 * callbacks to wait for (if error != 0, no callback will be called).
4559 *
4560 * The implementation may modify the contents of the reqs array, e.g. to merge
4561 * requests. However, the fields opaque and error are left unmodified as they
4562 * are used to signal failure for a single request to the caller.
4563 */
4564 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4565 {
4566 MultiwriteCB *mcb;
4567 int i;
4568
4569 /* don't submit writes if we don't have a medium */
4570 if (bs->drv == NULL) {
4571 for (i = 0; i < num_reqs; i++) {
4572 reqs[i].error = -ENOMEDIUM;
4573 }
4574 return -1;
4575 }
4576
4577 if (num_reqs == 0) {
4578 return 0;
4579 }
4580
4581 // Create MultiwriteCB structure
4582 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4583 mcb->num_requests = 0;
4584 mcb->num_callbacks = num_reqs;
4585
4586 for (i = 0; i < num_reqs; i++) {
4587 mcb->callbacks[i].cb = reqs[i].cb;
4588 mcb->callbacks[i].opaque = reqs[i].opaque;
4589 }
4590
4591 // Check for mergeable requests
4592 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4593
4594 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4595
4596 /* Run the aio requests. */
4597 mcb->num_requests = num_reqs;
4598 for (i = 0; i < num_reqs; i++) {
4599 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4600 reqs[i].nb_sectors, reqs[i].flags,
4601 multiwrite_cb, mcb,
4602 true);
4603 }
4604
4605 return 0;
4606 }
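
/* A minimal submission sketch (hypothetical caller; qiov0/qiov1, cb0/cb1 and
 * op0/op1 are assumed to be set up elsewhere). The two requests below are
 * exactly sequential, so multiwrite_merge() would combine them into one:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = cb0, .opaque = op0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = cb1, .opaque = op1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // check reqs[i].error to see which callbacks will still fire
 *     }
 */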
4607
4608 void bdrv_aio_cancel(BlockAIOCB *acb)
4609 {
4610 qemu_aio_ref(acb);
4611 bdrv_aio_cancel_async(acb);
4612 while (acb->refcnt > 1) {
4613 if (acb->aiocb_info->get_aio_context) {
4614 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4615 } else if (acb->bs) {
4616 aio_poll(bdrv_get_aio_context(acb->bs), true);
4617 } else {
4618 abort();
4619 }
4620 }
4621 qemu_aio_unref(acb);
4622 }
4623
4624 /* Async version of aio cancel. The caller is not blocked if the acb implements
4625 * cancel_async; otherwise we do nothing and let the request complete normally.
4626 * In either case the completion callback must be called. */
4627 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4628 {
4629 if (acb->aiocb_info->cancel_async) {
4630 acb->aiocb_info->cancel_async(acb);
4631 }
4632 }
4633
4634 /**************************************************************/
4635 /* async block device emulation */
4636
4637 typedef struct BlockAIOCBSync {
4638 BlockAIOCB common;
4639 QEMUBH *bh;
4640 int ret;
4641 /* vector translation state */
4642 QEMUIOVector *qiov;
4643 uint8_t *bounce;
4644 int is_write;
4645 } BlockAIOCBSync;
4646
4647 static const AIOCBInfo bdrv_em_aiocb_info = {
4648 .aiocb_size = sizeof(BlockAIOCBSync),
4649 };
4650
4651 static void bdrv_aio_bh_cb(void *opaque)
4652 {
4653 BlockAIOCBSync *acb = opaque;
4654
4655 if (!acb->is_write && acb->ret >= 0) {
4656 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4657 }
4658 qemu_vfree(acb->bounce);
4659 acb->common.cb(acb->common.opaque, acb->ret);
4660 qemu_bh_delete(acb->bh);
4661 acb->bh = NULL;
4662 qemu_aio_unref(acb);
4663 }
4664
4665 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4666 int64_t sector_num,
4667 QEMUIOVector *qiov,
4668 int nb_sectors,
4669 BlockCompletionFunc *cb,
4670 void *opaque,
4671 int is_write)
4672
4673 {
4674 BlockAIOCBSync *acb;
4675
4676 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4677 acb->is_write = is_write;
4678 acb->qiov = qiov;
4679 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4680 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4681
4682 if (acb->bounce == NULL) {
4683 acb->ret = -ENOMEM;
4684 } else if (is_write) {
4685 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4686 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4687 } else {
4688 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4689 }
4690
4691 qemu_bh_schedule(acb->bh);
4692
4693 return &acb->common;
4694 }
4695
4696 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4697 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4698 BlockCompletionFunc *cb, void *opaque)
4699 {
4700 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4701 }
4702
4703 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4704 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4705 BlockCompletionFunc *cb, void *opaque)
4706 {
4707 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4708 }
4709
4710
4711 typedef struct BlockAIOCBCoroutine {
4712 BlockAIOCB common;
4713 BlockRequest req;
4714 bool is_write;
4715 bool *done;
4716 QEMUBH* bh;
4717 } BlockAIOCBCoroutine;
4718
4719 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4720 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4721 };
4722
4723 static void bdrv_co_em_bh(void *opaque)
4724 {
4725 BlockAIOCBCoroutine *acb = opaque;
4726
4727 acb->common.cb(acb->common.opaque, acb->req.error);
4728
4729 qemu_bh_delete(acb->bh);
4730 qemu_aio_unref(acb);
4731 }
4732
4733 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4734 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4735 {
4736 BlockAIOCBCoroutine *acb = opaque;
4737 BlockDriverState *bs = acb->common.bs;
4738
4739 if (!acb->is_write) {
4740 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4741 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4742 } else {
4743 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4744 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4745 }
4746
4747 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4748 qemu_bh_schedule(acb->bh);
4749 }
4750
4751 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4752 int64_t sector_num,
4753 QEMUIOVector *qiov,
4754 int nb_sectors,
4755 BdrvRequestFlags flags,
4756 BlockCompletionFunc *cb,
4757 void *opaque,
4758 bool is_write)
4759 {
4760 Coroutine *co;
4761 BlockAIOCBCoroutine *acb;
4762
4763 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4764 acb->req.sector = sector_num;
4765 acb->req.nb_sectors = nb_sectors;
4766 acb->req.qiov = qiov;
4767 acb->req.flags = flags;
4768 acb->is_write = is_write;
4769
4770 co = qemu_coroutine_create(bdrv_co_do_rw);
4771 qemu_coroutine_enter(co, acb);
4772
4773 return &acb->common;
4774 }
4775
4776 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4777 {
4778 BlockAIOCBCoroutine *acb = opaque;
4779 BlockDriverState *bs = acb->common.bs;
4780
4781 acb->req.error = bdrv_co_flush(bs);
4782 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4783 qemu_bh_schedule(acb->bh);
4784 }
4785
4786 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4787 BlockCompletionFunc *cb, void *opaque)
4788 {
4789 trace_bdrv_aio_flush(bs, opaque);
4790
4791 Coroutine *co;
4792 BlockAIOCBCoroutine *acb;
4793
4794 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4795
4796 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4797 qemu_coroutine_enter(co, acb);
4798
4799 return &acb->common;
4800 }
4801
4802 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4803 {
4804 BlockAIOCBCoroutine *acb = opaque;
4805 BlockDriverState *bs = acb->common.bs;
4806
4807 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4808 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4809 qemu_bh_schedule(acb->bh);
4810 }
4811
4812 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4813 int64_t sector_num, int nb_sectors,
4814 BlockCompletionFunc *cb, void *opaque)
4815 {
4816 Coroutine *co;
4817 BlockAIOCBCoroutine *acb;
4818
4819 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4820
4821 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4822 acb->req.sector = sector_num;
4823 acb->req.nb_sectors = nb_sectors;
4824 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4825 qemu_coroutine_enter(co, acb);
4826
4827 return &acb->common;
4828 }
4829
4830 void bdrv_init(void)
4831 {
4832 module_call_init(MODULE_INIT_BLOCK);
4833 }
4834
4835 void bdrv_init_with_whitelist(void)
4836 {
4837 use_bdrv_whitelist = 1;
4838 bdrv_init();
4839 }
4840
4841 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4842 BlockCompletionFunc *cb, void *opaque)
4843 {
4844 BlockAIOCB *acb;
4845
4846 acb = g_slice_alloc(aiocb_info->aiocb_size);
4847 acb->aiocb_info = aiocb_info;
4848 acb->bs = bs;
4849 acb->cb = cb;
4850 acb->opaque = opaque;
4851 acb->refcnt = 1;
4852 return acb;
4853 }
4854
4855 void qemu_aio_ref(void *p)
4856 {
4857 BlockAIOCB *acb = p;
4858 acb->refcnt++;
4859 }
4860
4861 void qemu_aio_unref(void *p)
4862 {
4863 BlockAIOCB *acb = p;
4864 assert(acb->refcnt > 0);
4865 if (--acb->refcnt == 0) {
4866 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4867 }
4868 }
4869
4870 /**************************************************************/
4871 /* Coroutine block device emulation */
4872
4873 typedef struct CoroutineIOCompletion {
4874 Coroutine *coroutine;
4875 int ret;
4876 } CoroutineIOCompletion;
4877
4878 static void bdrv_co_io_em_complete(void *opaque, int ret)
4879 {
4880 CoroutineIOCompletion *co = opaque;
4881
4882 co->ret = ret;
4883 qemu_coroutine_enter(co->coroutine, NULL);
4884 }
4885
4886 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4887 int nb_sectors, QEMUIOVector *iov,
4888 bool is_write)
4889 {
4890 CoroutineIOCompletion co = {
4891 .coroutine = qemu_coroutine_self(),
4892 };
4893 BlockAIOCB *acb;
4894
4895 if (is_write) {
4896 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4897 bdrv_co_io_em_complete, &co);
4898 } else {
4899 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4900 bdrv_co_io_em_complete, &co);
4901 }
4902
4903 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4904 if (!acb) {
4905 return -EIO;
4906 }
4907 qemu_coroutine_yield();
4908
4909 return co.ret;
4910 }
4911
4912 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4913 int64_t sector_num, int nb_sectors,
4914 QEMUIOVector *iov)
4915 {
4916 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4917 }
4918
4919 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4920 int64_t sector_num, int nb_sectors,
4921 QEMUIOVector *iov)
4922 {
4923 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4924 }
4925
4926 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4927 {
4928 RwCo *rwco = opaque;
4929
4930 rwco->ret = bdrv_co_flush(rwco->bs);
4931 }
4932
4933 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4934 {
4935 int ret;
4936
4937 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4938 return 0;
4939 }
4940
4941 /* Write back cached data to the OS even with cache=unsafe */
4942 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4943 if (bs->drv->bdrv_co_flush_to_os) {
4944 ret = bs->drv->bdrv_co_flush_to_os(bs);
4945 if (ret < 0) {
4946 return ret;
4947 }
4948 }
4949
4950 /* But don't actually force it to the disk with cache=unsafe */
4951 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4952 goto flush_parent;
4953 }
4954
4955 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4956 if (bs->drv->bdrv_co_flush_to_disk) {
4957 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4958 } else if (bs->drv->bdrv_aio_flush) {
4959 BlockAIOCB *acb;
4960 CoroutineIOCompletion co = {
4961 .coroutine = qemu_coroutine_self(),
4962 };
4963
4964 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4965 if (acb == NULL) {
4966 ret = -EIO;
4967 } else {
4968 qemu_coroutine_yield();
4969 ret = co.ret;
4970 }
4971 } else {
4972 /*
4973 * Some block drivers always operate in either writethrough or unsafe
4974 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4975 * know how the server works (because the behaviour is hardcoded or
4976 * depends on server-side configuration), so we can't ensure that
4977 * everything is safe on disk. Returning an error doesn't work because
4978 * that would break guests even if the server operates in writethrough
4979 * mode.
4980 *
4981 * Let's hope the user knows what they're doing.
4982 */
4983 ret = 0;
4984 }
4985 if (ret < 0) {
4986 return ret;
4987 }
4988
4989 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4990 * in the case of cache=unsafe, so there are no useless flushes.
4991 */
4992 flush_parent:
4993 return bdrv_co_flush(bs->file);
4994 }
4995
4996 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4997 {
4998 Error *local_err = NULL;
4999 int ret;
5000
5001 if (!bs->drv) {
5002 return;
5003 }
5004
5005 if (!(bs->open_flags & BDRV_O_INCOMING)) {
5006 return;
5007 }
5008 bs->open_flags &= ~BDRV_O_INCOMING;
5009
5010 if (bs->drv->bdrv_invalidate_cache) {
5011 bs->drv->bdrv_invalidate_cache(bs, &local_err);
5012 } else if (bs->file) {
5013 bdrv_invalidate_cache(bs->file, &local_err);
5014 }
5015 if (local_err) {
5016 error_propagate(errp, local_err);
5017 return;
5018 }
5019
5020 ret = refresh_total_sectors(bs, bs->total_sectors);
5021 if (ret < 0) {
5022 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5023 return;
5024 }
5025 }
5026
5027 void bdrv_invalidate_cache_all(Error **errp)
5028 {
5029 BlockDriverState *bs;
5030 Error *local_err = NULL;
5031
5032 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5033 AioContext *aio_context = bdrv_get_aio_context(bs);
5034
5035 aio_context_acquire(aio_context);
5036 bdrv_invalidate_cache(bs, &local_err);
5037 aio_context_release(aio_context);
5038 if (local_err) {
5039 error_propagate(errp, local_err);
5040 return;
5041 }
5042 }
5043 }
5044
5045 int bdrv_flush(BlockDriverState *bs)
5046 {
5047 Coroutine *co;
5048 RwCo rwco = {
5049 .bs = bs,
5050 .ret = NOT_DONE,
5051 };
5052
5053 if (qemu_in_coroutine()) {
5054 /* Fast-path if already in coroutine context */
5055 bdrv_flush_co_entry(&rwco);
5056 } else {
5057 AioContext *aio_context = bdrv_get_aio_context(bs);
5058
5059 co = qemu_coroutine_create(bdrv_flush_co_entry);
5060 qemu_coroutine_enter(co, &rwco);
5061 while (rwco.ret == NOT_DONE) {
5062 aio_poll(aio_context, true);
5063 }
5064 }
5065
5066 return rwco.ret;
5067 }
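
/* The fast-path/poll pattern above is the standard way synchronous wrappers
 * drive a coroutine_fn in this file (bdrv_get_block_status() above and
 * bdrv_discard() below use the same shape). A new wrapper would follow this
 * sketch (hypothetical names):
 *
 *     MyCo data = { .bs = bs, .ret = NOT_DONE };
 *     if (qemu_in_coroutine()) {
 *         my_co_entry(&data);                      // already in a coroutine
 *     } else {
 *         Coroutine *co = qemu_coroutine_create(my_co_entry);
 *         qemu_coroutine_enter(co, &data);
 *         while (data.ret == NOT_DONE) {
 *             aio_poll(bdrv_get_aio_context(bs), true);
 *         }
 *     }
 */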
5068
5069 typedef struct DiscardCo {
5070 BlockDriverState *bs;
5071 int64_t sector_num;
5072 int nb_sectors;
5073 int ret;
5074 } DiscardCo;
5075 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5076 {
5077 DiscardCo *rwco = opaque;
5078
5079 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5080 }
5081
5082 /* If no limit is specified in the BlockLimits, use a default
5083 * of 32768 512-byte sectors (16 MiB) per request.
5084 */
5085 #define MAX_DISCARD_DEFAULT 32768
5086
5087 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5088 int nb_sectors)
5089 {
5090 int max_discard;
5091
5092 if (!bs->drv) {
5093 return -ENOMEDIUM;
5094 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5095 return -EIO;
5096 } else if (bs->read_only) {
5097 return -EROFS;
5098 }
5099
5100 bdrv_reset_dirty(bs, sector_num, nb_sectors);
5101
5102 /* Do nothing if disabled. */
5103 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5104 return 0;
5105 }
5106
5107 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5108 return 0;
5109 }
5110
5111 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5112 while (nb_sectors > 0) {
5113 int ret;
5114 int num = nb_sectors;
5115
5116 /* align request */
5117 if (bs->bl.discard_alignment &&
5118 num >= bs->bl.discard_alignment &&
5119 sector_num % bs->bl.discard_alignment) {
5120 if (num > bs->bl.discard_alignment) {
5121 num = bs->bl.discard_alignment;
5122 }
5123 num -= sector_num % bs->bl.discard_alignment;
5124 }
5125
5126 /* limit request size */
5127 if (num > max_discard) {
5128 num = max_discard;
5129 }
5130
5131 if (bs->drv->bdrv_co_discard) {
5132 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5133 } else {
5134 BlockAIOCB *acb;
5135 CoroutineIOCompletion co = {
5136 .coroutine = qemu_coroutine_self(),
5137 };
5138
5139 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5140 bdrv_co_io_em_complete, &co);
5141 if (acb == NULL) {
5142 return -EIO;
5143 } else {
5144 qemu_coroutine_yield();
5145 ret = co.ret;
5146 }
5147 }
5148 if (ret && ret != -ENOTSUP) {
5149 return ret;
5150 }
5151
5152 sector_num += num;
5153 nb_sectors -= num;
5154 }
5155 return 0;
5156 }
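
/* Worked example of the alignment clamping in the loop above (illustrative
 * numbers): with bl.discard_alignment == 8, sector_num == 5 and a large
 * nb_sectors, num is first capped to 8 and then reduced by 5 % 8 == 5,
 * leaving num == 3; the next iteration therefore starts at sector 8, on an
 * alignment boundary.
 */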
5157
5158 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5159 {
5160 Coroutine *co;
5161 DiscardCo rwco = {
5162 .bs = bs,
5163 .sector_num = sector_num,
5164 .nb_sectors = nb_sectors,
5165 .ret = NOT_DONE,
5166 };
5167
5168 if (qemu_in_coroutine()) {
5169 /* Fast-path if already in coroutine context */
5170 bdrv_discard_co_entry(&rwco);
5171 } else {
5172 AioContext *aio_context = bdrv_get_aio_context(bs);
5173
5174 co = qemu_coroutine_create(bdrv_discard_co_entry);
5175 qemu_coroutine_enter(co, &rwco);
5176 while (rwco.ret == NOT_DONE) {
5177 aio_poll(aio_context, true);
5178 }
5179 }
5180
5181 return rwco.ret;
5182 }
5183
5184 /**************************************************************/
5185 /* removable device support */
5186
5187 /**
5188 * Return TRUE if the media is present
5189 */
5190 int bdrv_is_inserted(BlockDriverState *bs)
5191 {
5192 BlockDriver *drv = bs->drv;
5193
5194 if (!drv)
5195 return 0;
5196 if (!drv->bdrv_is_inserted)
5197 return 1;
5198 return drv->bdrv_is_inserted(bs);
5199 }
5200
5201 /**
5202 * Return whether the media changed since the last call to this
5203 * function, or -ENOTSUP if we don't know. Most drivers don't know.
5204 */
5205 int bdrv_media_changed(BlockDriverState *bs)
5206 {
5207 BlockDriver *drv = bs->drv;
5208
5209 if (drv && drv->bdrv_media_changed) {
5210 return drv->bdrv_media_changed(bs);
5211 }
5212 return -ENOTSUP;
5213 }
5214
5215 /**
5216 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5217 */
5218 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5219 {
5220 BlockDriver *drv = bs->drv;
5221 const char *device_name;
5222
5223 if (drv && drv->bdrv_eject) {
5224 drv->bdrv_eject(bs, eject_flag);
5225 }
5226
5227 device_name = bdrv_get_device_name(bs);
5228 if (device_name[0] != '\0') {
5229 qapi_event_send_device_tray_moved(device_name,
5230 eject_flag, &error_abort);
5231 }
5232 }
5233
5234 /**
5235 * Lock or unlock the media (if it is locked, the user won't be able
5236 * to eject it manually).
5237 */
5238 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5239 {
5240 BlockDriver *drv = bs->drv;
5241
5242 trace_bdrv_lock_medium(bs, locked);
5243
5244 if (drv && drv->bdrv_lock_medium) {
5245 drv->bdrv_lock_medium(bs, locked);
5246 }
5247 }
5248
5249 /* needed for generic scsi interface */
5250
5251 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5252 {
5253 BlockDriver *drv = bs->drv;
5254
5255 if (drv && drv->bdrv_ioctl)
5256 return drv->bdrv_ioctl(bs, req, buf);
5257 return -ENOTSUP;
5258 }
5259
5260 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5261 unsigned long int req, void *buf,
5262 BlockCompletionFunc *cb, void *opaque)
5263 {
5264 BlockDriver *drv = bs->drv;
5265
5266 if (drv && drv->bdrv_aio_ioctl)
5267 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5268 return NULL;
5269 }
5270
5271 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5272 {
5273 bs->guest_block_size = align;
5274 }
5275
5276 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5277 {
5278 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5279 }
5280
5281 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5282 {
5283 return memset(qemu_blockalign(bs, size), 0, size);
5284 }
5285
5286 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5287 {
5288 size_t align = bdrv_opt_mem_align(bs);
5289
5290 /* Ensure that NULL is never returned on success */
5291 assert(align > 0);
5292 if (size == 0) {
5293 size = align;
5294 }
5295
5296 return qemu_try_memalign(align, size);
5297 }
5298
5299 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5300 {
5301 void *mem = qemu_try_blockalign(bs, size);
5302
5303 if (mem) {
5304 memset(mem, 0, size);
5305 }
5306
5307 return mem;
5308 }
5309
5310 /*
5311 * Check if all memory in this vector is sector aligned.
5312 */
5313 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5314 {
5315 int i;
5316 size_t alignment = bdrv_opt_mem_align(bs);
5317
5318 for (i = 0; i < qiov->niov; i++) {
5319 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5320 return false;
5321 }
5322 if (qiov->iov[i].iov_len % alignment) {
5323 return false;
5324 }
5325 }
5326
5327 return true;
5328 }
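
/* A sketch of building a vector that passes this check (hypothetical caller;
 * 'len' is assumed to be a multiple of bdrv_opt_mem_align(bs)):
 *
 *     void *buf = qemu_blockalign(bs, len);
 *     QEMUIOVector qiov;
 *     qemu_iovec_init(&qiov, 1);
 *     qemu_iovec_add(&qiov, buf, len);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *     ...
 *     qemu_iovec_destroy(&qiov);
 *     qemu_vfree(buf);
 */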
5329
5330 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5331 Error **errp)
5332 {
5333 int64_t bitmap_size;
5334 BdrvDirtyBitmap *bitmap;
5335
5336 assert((granularity & (granularity - 1)) == 0);
5337
5338 granularity >>= BDRV_SECTOR_BITS;
5339 assert(granularity);
5340 bitmap_size = bdrv_nb_sectors(bs);
5341 if (bitmap_size < 0) {
5342 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5343 errno = -bitmap_size;
5344 return NULL;
5345 }
5346 bitmap = g_new0(BdrvDirtyBitmap, 1);
5347 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5348 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5349 return bitmap;
5350 }
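
/* A minimal usage sketch (hypothetical caller): granularity is given in
 * bytes and must be a power of two no smaller than BDRV_SECTOR_SIZE, e.g.
 * 64 KiB for a mirroring job:
 *
 *     Error *local_err = NULL;
 *     BdrvDirtyBitmap *bm = bdrv_create_dirty_bitmap(bs, 65536, &local_err);
 *     if (!bm) {
 *         // report local_err
 *     }
 *     ...
 *     bdrv_release_dirty_bitmap(bs, bm);
 */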
5351
5352 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5353 {
5354 BdrvDirtyBitmap *bm, *next;
5355 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5356 if (bm == bitmap) {
5357 QLIST_REMOVE(bitmap, list);
5358 hbitmap_free(bitmap->bitmap);
5359 g_free(bitmap);
5360 return;
5361 }
5362 }
5363 }
5364
5365 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5366 {
5367 BdrvDirtyBitmap *bm;
5368 BlockDirtyInfoList *list = NULL;
5369 BlockDirtyInfoList **plist = &list;
5370
5371 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5372 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5373 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5374 info->count = bdrv_get_dirty_count(bs, bm);
5375 info->granularity =
5376 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5377 entry->value = info;
5378 *plist = entry;
5379 plist = &entry->next;
5380 }
5381
5382 return list;
5383 }
5384
5385 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5386 {
5387 if (bitmap) {
5388 return hbitmap_get(bitmap->bitmap, sector);
5389 } else {
5390 return 0;
5391 }
5392 }
5393
5394 void bdrv_dirty_iter_init(BlockDriverState *bs,
5395 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5396 {
5397 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5398 }
5399
5400 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5401 int nr_sectors)
5402 {
5403 BdrvDirtyBitmap *bitmap;
5404 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5405 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5406 }
5407 }
5408
5409 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5410 {
5411 BdrvDirtyBitmap *bitmap;
5412 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5413 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5414 }
5415 }
5416
5417 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5418 {
5419 return hbitmap_count(bitmap->bitmap);
5420 }
5421
5422 /* Get a reference to bs */
5423 void bdrv_ref(BlockDriverState *bs)
5424 {
5425 bs->refcnt++;
5426 }
5427
5428 /* Release a previously grabbed reference to bs.
5429 * If, after releasing, the reference count is zero, the BlockDriverState is
5430 * deleted. */
5431 void bdrv_unref(BlockDriverState *bs)
5432 {
5433 if (!bs) {
5434 return;
5435 }
5436 assert(bs->refcnt > 0);
5437 if (--bs->refcnt == 0) {
5438 bdrv_delete(bs);
5439 }
5440 }
5441
5442 struct BdrvOpBlocker {
5443 Error *reason;
5444 QLIST_ENTRY(BdrvOpBlocker) list;
5445 };
5446
5447 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5448 {
5449 BdrvOpBlocker *blocker;
5450 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5451 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5452 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5453 if (errp) {
5454 error_setg(errp, "Device '%s' is busy: %s",
5455 bdrv_get_device_name(bs),
5456 error_get_pretty(blocker->reason));
5457 }
5458 return true;
5459 }
5460 return false;
5461 }
5462
5463 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5464 {
5465 BdrvOpBlocker *blocker;
5466 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5467
5468 blocker = g_new0(BdrvOpBlocker, 1);
5469 blocker->reason = reason;
5470 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5471 }
5472
5473 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5474 {
5475 BdrvOpBlocker *blocker, *next;
5476 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5477 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5478 if (blocker->reason == reason) {
5479 QLIST_REMOVE(blocker, list);
5480 g_free(blocker);
5481 }
5482 }
5483 }
5484
5485 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5486 {
5487 int i;
5488 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5489 bdrv_op_block(bs, i, reason);
5490 }
5491 }
5492
5493 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5494 {
5495 int i;
5496 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5497 bdrv_op_unblock(bs, i, reason);
5498 }
5499 }
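
/* A sketch of the usual block/unblock pairing (hypothetical caller; block
 * jobs follow this shape): the same Error object identifies the blocker on
 * both sides.
 *
 *     Error *blocker = NULL;
 *     error_setg(&blocker, "Device is in use by a block job");
 *     bdrv_op_block_all(bs, blocker);
 *     ... long-running operation ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */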
5500
5501 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5502 {
5503 int i;
5504
5505 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5506 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5507 return false;
5508 }
5509 }
5510 return true;
5511 }
5512
5513 void bdrv_iostatus_enable(BlockDriverState *bs)
5514 {
5515 bs->iostatus_enabled = true;
5516 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5517 }
5518
5519 /* The I/O status is only enabled if the drive explicitly
5520 * enables it _and_ the VM is configured to stop on errors */
5521 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5522 {
5523 return (bs->iostatus_enabled &&
5524 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5525 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5526 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5527 }
5528
5529 void bdrv_iostatus_disable(BlockDriverState *bs)
5530 {
5531 bs->iostatus_enabled = false;
5532 }
5533
5534 void bdrv_iostatus_reset(BlockDriverState *bs)
5535 {
5536 if (bdrv_iostatus_is_enabled(bs)) {
5537 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5538 if (bs->job) {
5539 block_job_iostatus_reset(bs->job);
5540 }
5541 }
5542 }
5543
5544 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5545 {
5546 assert(bdrv_iostatus_is_enabled(bs));
5547 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5548 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5549 BLOCK_DEVICE_IO_STATUS_FAILED;
5550 }
5551 }
5552
5553 void bdrv_img_create(const char *filename, const char *fmt,
5554 const char *base_filename, const char *base_fmt,
5555 char *options, uint64_t img_size, int flags,
5556 Error **errp, bool quiet)
5557 {
5558 QemuOptsList *create_opts = NULL;
5559 QemuOpts *opts = NULL;
5560 const char *backing_fmt, *backing_file;
5561 int64_t size;
5562 BlockDriver *drv, *proto_drv;
5563 BlockDriver *backing_drv = NULL;
5564 Error *local_err = NULL;
5565 int ret = 0;
5566
5567 /* Find driver and parse its options */
5568 drv = bdrv_find_format(fmt);
5569 if (!drv) {
5570 error_setg(errp, "Unknown file format '%s'", fmt);
5571 return;
5572 }
5573
5574 proto_drv = bdrv_find_protocol(filename, true);
5575 if (!proto_drv) {
5576 error_setg(errp, "Unknown protocol '%s'", filename);
5577 return;
5578 }
5579
5580 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5581 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5582
5583 /* Create parameter list with default values */
5584 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5585 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5586
5587 /* Parse -o options */
5588 if (options) {
5589 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5590 error_setg(errp, "Invalid options for file format '%s'", fmt);
5591 goto out;
5592 }
5593 }
5594
5595 if (base_filename) {
5596 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5597 error_setg(errp, "Backing file not supported for file format '%s'",
5598 fmt);
5599 goto out;
5600 }
5601 }
5602
5603 if (base_fmt) {
5604 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5605 error_setg(errp, "Backing file format not supported for file "
5606 "format '%s'", fmt);
5607 goto out;
5608 }
5609 }
5610
5611 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5612 if (backing_file) {
5613 if (!strcmp(filename, backing_file)) {
5614 error_setg(errp, "Error: Trying to create an image with the "
5615 "same filename as the backing file");
5616 goto out;
5617 }
5618 }
5619
5620 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5621 if (backing_fmt) {
5622 backing_drv = bdrv_find_format(backing_fmt);
5623 if (!backing_drv) {
5624 error_setg(errp, "Unknown backing file format '%s'",
5625 backing_fmt);
5626 goto out;
5627 }
5628 }
5629
5630 // The size for the image must always be specified, with one exception:
5631 // If we are using a backing file, we can obtain the size from there
5632 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5633 if (size == -1) {
5634 if (backing_file) {
5635 BlockDriverState *bs;
5636 int64_t size;
5637 int back_flags;
5638
5639 /* backing files always opened read-only */
5640 back_flags =
5641 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5642
5643 bs = NULL;
5644 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5645 backing_drv, &local_err);
5646 if (ret < 0) {
5647 goto out;
5648 }
5649 size = bdrv_getlength(bs);
5650 if (size < 0) {
5651 error_setg_errno(errp, -size, "Could not get size of '%s'",
5652 backing_file);
5653 bdrv_unref(bs);
5654 goto out;
5655 }
5656
5657 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5658
5659 bdrv_unref(bs);
5660 } else {
5661 error_setg(errp, "Image creation needs a size parameter");
5662 goto out;
5663 }
5664 }
5665
5666 if (!quiet) {
5667 printf("Formatting '%s', fmt=%s ", filename, fmt);
5668 qemu_opts_print(opts);
5669 puts("");
5670 }
5671
5672 ret = bdrv_create(drv, filename, opts, &local_err);
5673
5674 if (ret == -EFBIG) {
5675 /* This is generally a better message than whatever the driver would
5676 * deliver (especially because of the cluster_size_hint), since that
5677 * is most probably not much different from "image too large". */
5678 const char *cluster_size_hint = "";
5679 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5680 cluster_size_hint = " (try using a larger cluster size)";
5681 }
5682 error_setg(errp, "The image size is too large for file format '%s'"
5683 "%s", fmt, cluster_size_hint);
5684 error_free(local_err);
5685 local_err = NULL;
5686 }
5687
5688 out:
5689 qemu_opts_del(opts);
5690 qemu_opts_free(create_opts);
5691 if (local_err) {
5692 error_propagate(errp, local_err);
5693 }
5694 }
5695
5696 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5697 {
5698 return bs->aio_context;
5699 }
5700
5701 void bdrv_detach_aio_context(BlockDriverState *bs)
5702 {
5703 BdrvAioNotifier *baf;
5704
5705 if (!bs->drv) {
5706 return;
5707 }
5708
5709 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5710 baf->detach_aio_context(baf->opaque);
5711 }
5712
5713 if (bs->io_limits_enabled) {
5714 throttle_detach_aio_context(&bs->throttle_state);
5715 }
5716 if (bs->drv->bdrv_detach_aio_context) {
5717 bs->drv->bdrv_detach_aio_context(bs);
5718 }
5719 if (bs->file) {
5720 bdrv_detach_aio_context(bs->file);
5721 }
5722 if (bs->backing_hd) {
5723 bdrv_detach_aio_context(bs->backing_hd);
5724 }
5725
5726 bs->aio_context = NULL;
5727 }
5728
5729 void bdrv_attach_aio_context(BlockDriverState *bs,
5730 AioContext *new_context)
5731 {
5732 BdrvAioNotifier *ban;
5733
5734 if (!bs->drv) {
5735 return;
5736 }
5737
5738 bs->aio_context = new_context;
5739
5740 if (bs->backing_hd) {
5741 bdrv_attach_aio_context(bs->backing_hd, new_context);
5742 }
5743 if (bs->file) {
5744 bdrv_attach_aio_context(bs->file, new_context);
5745 }
5746 if (bs->drv->bdrv_attach_aio_context) {
5747 bs->drv->bdrv_attach_aio_context(bs, new_context);
5748 }
5749 if (bs->io_limits_enabled) {
5750 throttle_attach_aio_context(&bs->throttle_state, new_context);
5751 }
5752
5753 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5754 ban->attached_aio_context(new_context, ban->opaque);
5755 }
5756 }
5757
5758 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5759 {
5760 bdrv_drain_all(); /* ensure there are no in-flight requests */
5761
5762 bdrv_detach_aio_context(bs);
5763
5764 /* This function executes in the old AioContext so acquire the new one in
5765 * case it runs in a different thread.
5766 */
5767 aio_context_acquire(new_context);
5768 bdrv_attach_aio_context(bs, new_context);
5769 aio_context_release(new_context);
5770 }
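
/* A sketch of moving a BDS into an IOThread's context (hypothetical caller;
 * 'iothread' is assumed to be an initialized IOThread object):
 *
 *     AioContext *ctx = iothread_get_aio_context(iothread);
 *     bdrv_set_aio_context(bs, ctx);
 *
 * After this call, all I/O for bs must be submitted and completed in ctx;
 * the bdrv_drain_all() above guarantees nothing is in flight during the
 * switch.
 */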
5771
5772 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5773 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5774 void (*detach_aio_context)(void *opaque), void *opaque)
5775 {
5776 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5777 *ban = (BdrvAioNotifier){
5778 .attached_aio_context = attached_aio_context,
5779 .detach_aio_context = detach_aio_context,
5780 .opaque = opaque
5781 };
5782
5783 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5784 }
5785
5786 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5787 void (*attached_aio_context)(AioContext *,
5788 void *),
5789 void (*detach_aio_context)(void *),
5790 void *opaque)
5791 {
5792 BdrvAioNotifier *ban, *ban_next;
5793
5794 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5795 if (ban->attached_aio_context == attached_aio_context &&
5796 ban->detach_aio_context == detach_aio_context &&
5797 ban->opaque == opaque)
5798 {
5799 QLIST_REMOVE(ban, list);
5800 g_free(ban);
5801
5802 return;
5803 }
5804 }
5805
5806 abort();
5807 }
5808
5809 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5810 NotifierWithReturn *notifier)
5811 {
5812 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5813 }
5814
5815 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5816 BlockDriverAmendStatusCB *status_cb)
5817 {
5818 if (!bs->drv->bdrv_amend_options) {
5819 return -ENOTSUP;
5820 }
5821 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5822 }
5823
5824 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5825 * of a block filter and by bdrv_is_first_non_filter.
5826 * It is used to test whether the given bs is the candidate or to recurse
5827 * further into the node graph.
5828 */
5829 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5830 BlockDriverState *candidate)
5831 {
5832 /* return false if basic checks fail */
5833 if (!bs || !bs->drv) {
5834 return false;
5835 }
5836
5837 /* the code reached a driver that is not a block filter -> check if the bs
5838 * is the same as the candidate. This is the recursion termination condition.
5839 */
5840 if (!bs->drv->is_filter) {
5841 return bs == candidate;
5842 }
5843 /* Down this path the driver is a block filter driver */
5844
5845 /* If the block filter recursion method is defined, use it to recurse down
5846 * the node graph.
5847 */
5848 if (bs->drv->bdrv_recurse_is_first_non_filter) {
5849 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5850 }
5851
5852 /* the driver is a block filter but does not allow recursion -> return false
5853 */
5854 return false;
5855 }
5856
5857 /* This function checks if the candidate is the first non-filter bs down its
5858 * bs chain. Since we don't have pointers to parents, it explores all bs chains
5859 * from the top. Some filters can choose not to pass down the recursion.
5860 */
5861 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5862 {
5863 BlockDriverState *bs;
5864
5865 /* walk down the bs forest recursively */
5866 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5867 bool perm;
5868
5869 /* try to recurse in this top level bs */
5870 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5871
5872 /* candidate is the first non filter */
5873 if (perm) {
5874 return true;
5875 }
5876 }
5877
5878 return false;
5879 }
5880
5881 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5882 {
5883 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5884 AioContext *aio_context;
5885
5886 if (!to_replace_bs) {
5887 error_setg(errp, "Node name '%s' not found", node_name);
5888 return NULL;
5889 }
5890
5891 aio_context = bdrv_get_aio_context(to_replace_bs);
5892 aio_context_acquire(aio_context);
5893
5894 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5895 to_replace_bs = NULL;
5896 goto out;
5897 }
5898
5899 /* We don't want an arbitrary node of the BDS chain to be replaced, only the
5900 * topmost non-filter, in order to prevent data corruption.
5901 * Another benefit is that this test excludes backing files, which are
5902 * blocked by the backing blockers.
5903 */
5904 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5905 error_setg(errp, "Only the topmost non-filter node can be replaced");
5906 to_replace_bs = NULL;
5907 goto out;
5908 }
5909
5910 out:
5911 aio_context_release(aio_context);
5912 return to_replace_bs;
5913 }
5914
5915 void bdrv_io_plug(BlockDriverState *bs)
5916 {
5917 BlockDriver *drv = bs->drv;
5918 if (drv && drv->bdrv_io_plug) {
5919 drv->bdrv_io_plug(bs);
5920 } else if (bs->file) {
5921 bdrv_io_plug(bs->file);
5922 }
5923 }
5924
5925 void bdrv_io_unplug(BlockDriverState *bs)
5926 {
5927 BlockDriver *drv = bs->drv;
5928 if (drv && drv->bdrv_io_unplug) {
5929 drv->bdrv_io_unplug(bs);
5930 } else if (bs->file) {
5931 bdrv_io_unplug(bs->file);
5932 }
5933 }
5934
5935 void bdrv_flush_io_queue(BlockDriverState *bs)
5936 {
5937 BlockDriver *drv = bs->drv;
5938 if (drv && drv->bdrv_flush_io_queue) {
5939 drv->bdrv_flush_io_queue(bs);
5940 } else if (bs->file) {
5941 bdrv_flush_io_queue(bs->file);
5942 }
5943 }
5944
5945 static bool append_open_options(QDict *d, BlockDriverState *bs)
5946 {
5947 const QDictEntry *entry;
5948 bool found_any = false;
5949
5950 for (entry = qdict_first(bs->options); entry;
5951 entry = qdict_next(bs->options, entry))
5952 {
5953 /* Only take options for this level and exclude all non-driver-specific
5954 * options */
5955 if (!strchr(qdict_entry_key(entry), '.') &&
5956 strcmp(qdict_entry_key(entry), "node-name"))
5957 {
5958 qobject_incref(qdict_entry_value(entry));
5959 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5960 found_any = true;
5961 }
5962 }
5963
5964 return found_any;
5965 }
5966
5967 /* Updates the following BDS fields:
5968 * - exact_filename: A filename which may be used for opening a block device
5969 * which (mostly) equals the given BDS (even without any
5970 * other options; so reading and writing must return the same
5971 * results, but caching etc. may be different)
5972 * - full_open_options: Options which, when given when opening a block device
5973 * (without a filename), result in a BDS (mostly)
5974 * equalling the given one
5975 * - filename: If exact_filename is set, it is copied here. Otherwise,
5976 * full_open_options is converted to a JSON object, prefixed with
5977 * "json:" (for use through the JSON pseudo protocol) and put here.
5978 */
5979 void bdrv_refresh_filename(BlockDriverState *bs)
5980 {
5981 BlockDriver *drv = bs->drv;
5982 QDict *opts;
5983
5984 if (!drv) {
5985 return;
5986 }
5987
5988 /* This BDS's file name will most probably depend on its file's name, so
5989 * refresh that first */
5990 if (bs->file) {
5991 bdrv_refresh_filename(bs->file);
5992 }
5993
5994 if (drv->bdrv_refresh_filename) {
5995 /* Obsolete information is of no use here, so drop the old file name
5996 * information before refreshing it */
5997 bs->exact_filename[0] = '\0';
5998 if (bs->full_open_options) {
5999 QDECREF(bs->full_open_options);
6000 bs->full_open_options = NULL;
6001 }
6002
6003 drv->bdrv_refresh_filename(bs);
6004 } else if (bs->file) {
6005 /* Try to reconstruct valid information from the underlying file */
6006 bool has_open_options;
6007
6008 bs->exact_filename[0] = '\0';
6009 if (bs->full_open_options) {
6010 QDECREF(bs->full_open_options);
6011 bs->full_open_options = NULL;
6012 }
6013
6014 opts = qdict_new();
6015 has_open_options = append_open_options(opts, bs);
6016
6017 /* If no specific options have been given for this BDS, the filename of
6018 * the underlying file should suffice for this one as well */
6019 if (bs->file->exact_filename[0] && !has_open_options) {
6020 strcpy(bs->exact_filename, bs->file->exact_filename);
6021 }
6022 /* Reconstructing the full options QDict is simple for most format block
6023 * drivers, as long as the full options are known for the underlying
6024 * file BDS. The full options QDict of that file BDS should somehow
6025 * contain a representation of the filename, therefore the following
6026 * suffices without querying the (exact_)filename of this BDS. */
6027 if (bs->file->full_open_options) {
6028 qdict_put_obj(opts, "driver",
6029 QOBJECT(qstring_from_str(drv->format_name)));
6030 QINCREF(bs->file->full_open_options);
6031 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6032
6033 bs->full_open_options = opts;
6034 } else {
6035 QDECREF(opts);
6036 }
6037 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6038 /* There is no underlying file BDS (at least referenced by BDS.file),
6039 * so the full options QDict should be equal to the options given
6040 * specifically for this block device when it was opened (plus the
6041 * driver specification).
6042 * Because those options don't change, there is no need to update
6043 * full_open_options when it's already set. */
6044
6045 opts = qdict_new();
6046 append_open_options(opts, bs);
6047 qdict_put_obj(opts, "driver",
6048 QOBJECT(qstring_from_str(drv->format_name)));
6049
6050 if (bs->exact_filename[0]) {
6051 /* This may not work for all block protocol drivers (some may
6052 * require this filename to be parsed), but we have to find some
6053 * default solution here, so just include it. If some block driver
6054 * does not support pure options without any filename at all or
6055 * needs some special format of the options QDict, it needs to
6056 * implement the driver-specific bdrv_refresh_filename() function.
6057 */
6058 qdict_put_obj(opts, "filename",
6059 QOBJECT(qstring_from_str(bs->exact_filename)));
6060 }
6061
6062 bs->full_open_options = opts;
6063 }
6064
6065 if (bs->exact_filename[0]) {
6066 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6067 } else if (bs->full_open_options) {
6068 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6069 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6070 qstring_get_str(json));
6071 QDECREF(json);
6072 }
6073 }
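
/* For illustration (values are hypothetical): a qcow2 BDS on top of a raw
 * POSIX file with no special options might end up with
 *
 *     exact_filename = "/tmp/test.qcow2"
 *     filename       = "/tmp/test.qcow2"
 *
 * while the same image opened with extra runtime options would instead get
 *
 *     filename = "json:{\"driver\": \"qcow2\", \"lazy-refcounts\": true,
 *                 \"file\": {\"driver\": \"file\",
 *                            \"filename\": \"/tmp/test.qcow2\"}}"
 */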
6074
6075 /* The purpose of this accessor function is to allow the device models to
6076 * access the BlockAcctStats structure embedded inside a BlockDriverState
6077 * without being aware of the BlockDriverState structure layout.
6078 * It will go away when the BlockAcctStats structure is moved inside
6079 * the device models.
6080 */
6081 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6082 {
6083 return &bs->stats;
6084 }