/* QEMU block.c — captured from a git-blame view; blame annotations removed. */
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
83c9089e 27#include "monitor/monitor.h"
737e150e
PB
28#include "block/block_int.h"
29#include "block/blockjob.h"
1de7afc9 30#include "qemu/module.h"
7b1b5d19 31#include "qapi/qmp/qjson.h"
9c17d615 32#include "sysemu/sysemu.h"
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
fc01f7e7 38
71e72a19 39#ifdef CONFIG_BSD
7674e7bf
FB
40#include <sys/types.h>
41#include <sys/stat.h>
42#include <sys/ioctl.h>
72cf2d4f 43#include <sys/queue.h>
c5e97233 44#ifndef __DragonFly__
7674e7bf
FB
45#include <sys/disk.h>
46#endif
c5e97233 47#endif
7674e7bf 48
49dc768d
AL
49#ifdef _WIN32
50#include <windows.h>
51#endif
52
e4654d2d
FZ
53struct BdrvDirtyBitmap {
54 HBitmap *bitmap;
55 QLIST_ENTRY(BdrvDirtyBitmap) list;
56};
57
1c9805a3
SH
58#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59
7d4b4ba5 60static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
61static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 63 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
64static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 66 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
67static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
775aa8b6
KW
73static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 75 BdrvRequestFlags flags);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 78 BdrvRequestFlags flags);
b2a61371
SH
79static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
d20d9b7c 83 BdrvRequestFlags flags,
b2a61371
SH
84 BlockDriverCompletionFunc *cb,
85 void *opaque,
8c5873d6 86 bool is_write);
b2a61371 87static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 88static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
dc364f4c
BC
94static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
8a22f02a
SH
97static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
#ifdef _WIN32
/* Return non-zero if @filename begins with a drive letter and ':'. */
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

/* Return non-zero if @filename names a whole drive ("d:") or a device path. */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
#endif

0563e191 123/* throttling disk I/O limits */
cc0681c4
BC
124void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
98f90dba 126{
cc0681c4 127 int i;
98f90dba 128
cc0681c4 129 throttle_config(&bs->throttle_state, cfg);
98f90dba 130
cc0681c4
BC
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 133 }
cc0681c4
BC
134}
135
136/* this function drain all the throttled IOs */
137static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138{
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
142
143 bs->io_limits_enabled = false;
144
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
148 }
149 }
150
151 bs->io_limits_enabled = enabled;
98f90dba 152
cc0681c4 153 return drained;
98f90dba
ZYW
154}
155
cc0681c4 156void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 157{
cc0681c4 158 bs->io_limits_enabled = false;
0563e191 159
cc0681c4
BC
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
163}
164
cc0681c4 165static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 166{
cc0681c4
BC
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
169}
170
cc0681c4 171static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 172{
cc0681c4
BC
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
175}
176
cc0681c4
BC
177/* should be called before bdrv_set_io_limits if a limit is set */
178void bdrv_io_limits_enable(BlockDriverState *bs)
179{
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 QEMU_CLOCK_VIRTUAL,
183 bdrv_throttle_read_timer_cb,
184 bdrv_throttle_write_timer_cb,
185 bs);
186 bs->io_limits_enabled = true;
187}
188
189/* This function makes an IO wait if needed
190 *
191 * @nb_sectors: the number of sectors of the IO
192 * @is_write: is the IO a write
193 */
98f90dba 194static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 195 unsigned int bytes,
cc0681c4 196 bool is_write)
98f90dba 197{
cc0681c4
BC
198 /* does this io must wait */
199 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 200
cc0681c4
BC
201 /* if must wait or any request of this type throttled queue the IO */
202 if (must_wait ||
203 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
205 }
206
cc0681c4 207 /* the IO will be executed, do the accounting */
d5103588
KW
208 throttle_account(&bs->throttle_state, is_write, bytes);
209
98f90dba 210
cc0681c4
BC
211 /* if the next request must wait -> do nothing */
212 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
213 return;
98f90dba
ZYW
214 }
215
cc0681c4
BC
216 /* else queue next request for execution */
217 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
218}
219
339064d5
KW
220size_t bdrv_opt_mem_align(BlockDriverState *bs)
221{
222 if (!bs || !bs->drv) {
223 /* 4k should be on the safe side */
224 return 4096;
225 }
226
227 return bs->bl.opt_mem_alignment;
228}
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *sep;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    /* first character that could end a protocol prefix */
    sep = path + strcspn(path, ":/\\");
#else
    sep = path + strcspn(path, ":/");
#endif

    /* a ':' before any path separator means a protocol prefix */
    return *sep == ':';
}

/* Return non-zero if @path is absolute (drive letters count on Windows). */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return *path == '/';
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *prefix_end, *dir_end;
    int len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* skip an optional "<protocol>:" prefix of base_path */
    prefix_end = strchr(base_path, ':');
    prefix_end = prefix_end ? prefix_end + 1 : base_path;

    /* locate the character just after the last directory separator */
    dir_end = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');
        if (!dir_end || bslash > dir_end) {
            dir_end = bslash;
        }
    }
#endif
    dir_end = dir_end ? dir_end + 1 : base_path;

    /* keep whichever prefix reaches further: protocol or directory */
    if (dir_end > prefix_end) {
        prefix_end = dir_end;
    }

    len = prefix_end - base_path;
    if (len > dest_size - 1) {
        len = dest_size - 1;
    }
    memcpy(dest, base_path, len);
    dest[len] = '\0';
    pstrcat(dest, dest_size, filename);
}

304
dc5a1371
PB
305void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
306{
307 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
308 pstrcpy(dest, sz, bs->backing_file);
309 } else {
310 path_combine(dest, sz, bs->filename, bs->backing_file);
311 }
312}
313
5efa9d5a 314void bdrv_register(BlockDriver *bdrv)
ea2384d3 315{
8c5873d6
SH
316 /* Block drivers without coroutine functions need emulation */
317 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
318 bdrv->bdrv_co_readv = bdrv_co_readv_em;
319 bdrv->bdrv_co_writev = bdrv_co_writev_em;
320
f8c35c1d
SH
321 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
322 * the block driver lacks aio we need to emulate that too.
323 */
f9f05dc5
KW
324 if (!bdrv->bdrv_aio_readv) {
325 /* add AIO emulation layer */
326 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
327 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 328 }
83f64091 329 }
b2e12bc6 330
8a22f02a 331 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 332}
b338082b
FB
333
334/* create a new block device (by default it is empty) */
335BlockDriverState *bdrv_new(const char *device_name)
336{
1b7bdbc1 337 BlockDriverState *bs;
b338082b 338
7267c094 339 bs = g_malloc0(sizeof(BlockDriverState));
e4654d2d 340 QLIST_INIT(&bs->dirty_bitmaps);
b338082b 341 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 342 if (device_name[0] != '\0') {
dc364f4c 343 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
ea2384d3 344 }
28a7282a 345 bdrv_iostatus_disable(bs);
d7d512f6 346 notifier_list_init(&bs->close_notifiers);
d616b224 347 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
348 qemu_co_queue_init(&bs->throttled_reqs[0]);
349 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 350 bs->refcnt = 1;
d7d512f6 351
b338082b
FB
352 return bs;
353}
354
d7d512f6
PB
355void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
356{
357 notifier_list_add(&bs->close_notifiers, notify);
358}
359
ea2384d3
FB
360BlockDriver *bdrv_find_format(const char *format_name)
361{
362 BlockDriver *drv1;
8a22f02a
SH
363 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
364 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 365 return drv1;
8a22f02a 366 }
ea2384d3
FB
367 }
368 return NULL;
369}
370
b64ec4e4 371static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 372{
b64ec4e4
FZ
373 static const char *whitelist_rw[] = {
374 CONFIG_BDRV_RW_WHITELIST
375 };
376 static const char *whitelist_ro[] = {
377 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
378 };
379 const char **p;
380
b64ec4e4 381 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 382 return 1; /* no whitelist, anything goes */
b64ec4e4 383 }
eb852011 384
b64ec4e4 385 for (p = whitelist_rw; *p; p++) {
eb852011
MA
386 if (!strcmp(drv->format_name, *p)) {
387 return 1;
388 }
389 }
b64ec4e4
FZ
390 if (read_only) {
391 for (p = whitelist_ro; *p; p++) {
392 if (!strcmp(drv->format_name, *p)) {
393 return 1;
394 }
395 }
396 }
eb852011
MA
397 return 0;
398}
399
b64ec4e4
FZ
400BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
401 bool read_only)
eb852011
MA
402{
403 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 404 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
405}
406
5b7e1542
ZYW
407typedef struct CreateCo {
408 BlockDriver *drv;
409 char *filename;
410 QEMUOptionParameter *options;
411 int ret;
cc84d90f 412 Error *err;
5b7e1542
ZYW
413} CreateCo;
414
415static void coroutine_fn bdrv_create_co_entry(void *opaque)
416{
cc84d90f
HR
417 Error *local_err = NULL;
418 int ret;
419
5b7e1542
ZYW
420 CreateCo *cco = opaque;
421 assert(cco->drv);
422
cc84d90f 423 ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
84d18f06 424 if (local_err) {
cc84d90f
HR
425 error_propagate(&cco->err, local_err);
426 }
427 cco->ret = ret;
5b7e1542
ZYW
428}
429
0e7e1989 430int bdrv_create(BlockDriver *drv, const char* filename,
cc84d90f 431 QEMUOptionParameter *options, Error **errp)
ea2384d3 432{
5b7e1542
ZYW
433 int ret;
434
435 Coroutine *co;
436 CreateCo cco = {
437 .drv = drv,
438 .filename = g_strdup(filename),
439 .options = options,
440 .ret = NOT_DONE,
cc84d90f 441 .err = NULL,
5b7e1542
ZYW
442 };
443
444 if (!drv->bdrv_create) {
cc84d90f 445 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
446 ret = -ENOTSUP;
447 goto out;
5b7e1542
ZYW
448 }
449
450 if (qemu_in_coroutine()) {
451 /* Fast-path if already in coroutine context */
452 bdrv_create_co_entry(&cco);
453 } else {
454 co = qemu_coroutine_create(bdrv_create_co_entry);
455 qemu_coroutine_enter(co, &cco);
456 while (cco.ret == NOT_DONE) {
457 qemu_aio_wait();
458 }
459 }
460
461 ret = cco.ret;
cc84d90f 462 if (ret < 0) {
84d18f06 463 if (cco.err) {
cc84d90f
HR
464 error_propagate(errp, cco.err);
465 } else {
466 error_setg_errno(errp, -ret, "Could not create image");
467 }
468 }
0e7e1989 469
80168bff
LC
470out:
471 g_free(cco.filename);
5b7e1542 472 return ret;
ea2384d3
FB
473}
474
cc84d90f
HR
475int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
476 Error **errp)
84a12e66
CH
477{
478 BlockDriver *drv;
cc84d90f
HR
479 Error *local_err = NULL;
480 int ret;
84a12e66 481
98289620 482 drv = bdrv_find_protocol(filename, true);
84a12e66 483 if (drv == NULL) {
cc84d90f 484 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 485 return -ENOENT;
84a12e66
CH
486 }
487
cc84d90f 488 ret = bdrv_create(drv, filename, options, &local_err);
84d18f06 489 if (local_err) {
cc84d90f
HR
490 error_propagate(errp, local_err);
491 }
492 return ret;
84a12e66
CH
493}
494
355ef4ac 495int bdrv_refresh_limits(BlockDriverState *bs)
d34682cd
KW
496{
497 BlockDriver *drv = bs->drv;
498
499 memset(&bs->bl, 0, sizeof(bs->bl));
500
466ad822
KW
501 if (!drv) {
502 return 0;
503 }
504
505 /* Take some limits from the children as a default */
506 if (bs->file) {
507 bdrv_refresh_limits(bs->file);
508 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
339064d5
KW
509 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
510 } else {
511 bs->bl.opt_mem_alignment = 512;
466ad822
KW
512 }
513
514 if (bs->backing_hd) {
515 bdrv_refresh_limits(bs->backing_hd);
516 bs->bl.opt_transfer_length =
517 MAX(bs->bl.opt_transfer_length,
518 bs->backing_hd->bl.opt_transfer_length);
339064d5
KW
519 bs->bl.opt_mem_alignment =
520 MAX(bs->bl.opt_mem_alignment,
521 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
522 }
523
524 /* Then let the driver override it */
525 if (drv->bdrv_refresh_limits) {
d34682cd
KW
526 return drv->bdrv_refresh_limits(bs);
527 }
528
529 return 0;
530}
531
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    const char *tmpdir = getenv("TMPDIR");
    int fd;

    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    /* the caller only needs the name; close (and on error remove) the file */
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

84a12e66
CH
567/*
568 * Detect host devices. By convention, /dev/cdrom[N] is always
569 * recognized as a host CDROM.
570 */
571static BlockDriver *find_hdev_driver(const char *filename)
572{
573 int score_max = 0, score;
574 BlockDriver *drv = NULL, *d;
575
576 QLIST_FOREACH(d, &bdrv_drivers, list) {
577 if (d->bdrv_probe_device) {
578 score = d->bdrv_probe_device(filename);
579 if (score > score_max) {
580 score_max = score;
581 drv = d;
582 }
583 }
584 }
585
586 return drv;
587}
588
98289620
KW
589BlockDriver *bdrv_find_protocol(const char *filename,
590 bool allow_protocol_prefix)
83f64091
FB
591{
592 BlockDriver *drv1;
593 char protocol[128];
1cec71e3 594 int len;
83f64091 595 const char *p;
19cb3738 596
66f82cee
KW
597 /* TODO Drivers without bdrv_file_open must be specified explicitly */
598
39508e7a
CH
599 /*
600 * XXX(hch): we really should not let host device detection
601 * override an explicit protocol specification, but moving this
602 * later breaks access to device names with colons in them.
603 * Thanks to the brain-dead persistent naming schemes on udev-
604 * based Linux systems those actually are quite common.
605 */
606 drv1 = find_hdev_driver(filename);
607 if (drv1) {
608 return drv1;
609 }
610
98289620 611 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
39508e7a 612 return bdrv_find_format("file");
84a12e66 613 }
98289620 614
9e0b22f4
SH
615 p = strchr(filename, ':');
616 assert(p != NULL);
1cec71e3
AL
617 len = p - filename;
618 if (len > sizeof(protocol) - 1)
619 len = sizeof(protocol) - 1;
620 memcpy(protocol, filename, len);
621 protocol[len] = '\0';
8a22f02a 622 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 623 if (drv1->protocol_name &&
8a22f02a 624 !strcmp(drv1->protocol_name, protocol)) {
83f64091 625 return drv1;
8a22f02a 626 }
83f64091
FB
627 }
628 return NULL;
629}
630
f500a6d3 631static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 632 BlockDriver **pdrv, Error **errp)
f3a5d3f8 633{
f500a6d3 634 int score, score_max;
f3a5d3f8
CH
635 BlockDriver *drv1, *drv;
636 uint8_t buf[2048];
f500a6d3 637 int ret = 0;
f8ea0b00 638
08a00559 639 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 640 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
c98ac35d
SW
641 drv = bdrv_find_format("raw");
642 if (!drv) {
34b5d2c6 643 error_setg(errp, "Could not find raw image format");
c98ac35d
SW
644 ret = -ENOENT;
645 }
646 *pdrv = drv;
647 return ret;
1a396859 648 }
f8ea0b00 649
83f64091 650 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 651 if (ret < 0) {
34b5d2c6
HR
652 error_setg_errno(errp, -ret, "Could not read image for determining its "
653 "format");
c98ac35d
SW
654 *pdrv = NULL;
655 return ret;
83f64091
FB
656 }
657
ea2384d3 658 score_max = 0;
84a12e66 659 drv = NULL;
8a22f02a 660 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
661 if (drv1->bdrv_probe) {
662 score = drv1->bdrv_probe(buf, ret, filename);
663 if (score > score_max) {
664 score_max = score;
665 drv = drv1;
666 }
0849bf08 667 }
fc01f7e7 668 }
c98ac35d 669 if (!drv) {
34b5d2c6
HR
670 error_setg(errp, "Could not determine image format: No compatible "
671 "driver found");
c98ac35d
SW
672 ret = -ENOENT;
673 }
674 *pdrv = drv;
675 return ret;
ea2384d3
FB
676}
677
51762288
SH
678/**
679 * Set the current 'total_sectors' value
680 */
681static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
682{
683 BlockDriver *drv = bs->drv;
684
396759ad
NB
685 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
686 if (bs->sg)
687 return 0;
688
51762288
SH
689 /* query actual device if possible, otherwise just trust the hint */
690 if (drv->bdrv_getlength) {
691 int64_t length = drv->bdrv_getlength(bs);
692 if (length < 0) {
693 return length;
694 }
7e382003 695 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
696 }
697
698 bs->total_sectors = hint;
699 return 0;
700}
701
9e8f1835
PB
702/**
703 * Set open flags for a given discard mode
704 *
705 * Return 0 on success, -1 if the discard mode was invalid.
706 */
707int bdrv_parse_discard_flags(const char *mode, int *flags)
708{
709 *flags &= ~BDRV_O_UNMAP;
710
711 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
712 /* do nothing */
713 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
714 *flags |= BDRV_O_UNMAP;
715 } else {
716 return -1;
717 }
718
719 return 0;
720}
721
c3993cdc
SH
722/**
723 * Set open flags for a given cache mode
724 *
725 * Return 0 on success, -1 if the cache mode was invalid.
726 */
727int bdrv_parse_cache_flags(const char *mode, int *flags)
728{
729 *flags &= ~BDRV_O_CACHE_MASK;
730
731 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
732 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
733 } else if (!strcmp(mode, "directsync")) {
734 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
735 } else if (!strcmp(mode, "writeback")) {
736 *flags |= BDRV_O_CACHE_WB;
737 } else if (!strcmp(mode, "unsafe")) {
738 *flags |= BDRV_O_CACHE_WB;
739 *flags |= BDRV_O_NO_FLUSH;
740 } else if (!strcmp(mode, "writethrough")) {
741 /* this is the default */
742 } else {
743 return -1;
744 }
745
746 return 0;
747}
748
53fec9d3
SH
749/**
750 * The copy-on-read flag is actually a reference count so multiple users may
751 * use the feature without worrying about clobbering its previous state.
752 * Copy-on-read stays enabled until all users have called to disable it.
753 */
754void bdrv_enable_copy_on_read(BlockDriverState *bs)
755{
756 bs->copy_on_read++;
757}
758
759void bdrv_disable_copy_on_read(BlockDriverState *bs)
760{
761 assert(bs->copy_on_read > 0);
762 bs->copy_on_read--;
763}
764
7b272452
KW
765static int bdrv_open_flags(BlockDriverState *bs, int flags)
766{
767 int open_flags = flags | BDRV_O_CACHE_WB;
768
769 /*
770 * Clear flags that are internal to the block layer before opening the
771 * image.
772 */
773 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
774
775 /*
776 * Snapshots should be writable.
777 */
778 if (bs->is_temporary) {
779 open_flags |= BDRV_O_RDWR;
780 }
781
782 return open_flags;
783}
784
6913c0c2
BC
785static int bdrv_assign_node_name(BlockDriverState *bs,
786 const char *node_name,
787 Error **errp)
788{
789 if (!node_name) {
790 return 0;
791 }
792
793 /* empty string node name is invalid */
794 if (node_name[0] == '\0') {
795 error_setg(errp, "Empty node name");
796 return -EINVAL;
797 }
798
0c5e94ee
BC
799 /* takes care of avoiding namespaces collisions */
800 if (bdrv_find(node_name)) {
801 error_setg(errp, "node-name=%s is conflicting with a device id",
802 node_name);
803 return -EINVAL;
804 }
805
6913c0c2
BC
806 /* takes care of avoiding duplicates node names */
807 if (bdrv_find_node(node_name)) {
808 error_setg(errp, "Duplicate node name");
809 return -EINVAL;
810 }
811
812 /* copy node name into the bs and insert it into the graph list */
813 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
814 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
815
816 return 0;
817}
818
57915332
KW
819/*
820 * Common part for opening disk images and files
b6ad491a
KW
821 *
822 * Removes all processed options from *options.
57915332 823 */
f500a6d3 824static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
34b5d2c6 825 QDict *options, int flags, BlockDriver *drv, Error **errp)
57915332
KW
826{
827 int ret, open_flags;
035fccdf 828 const char *filename;
6913c0c2 829 const char *node_name = NULL;
34b5d2c6 830 Error *local_err = NULL;
57915332
KW
831
832 assert(drv != NULL);
6405875c 833 assert(bs->file == NULL);
707ff828 834 assert(options != NULL && bs->options != options);
57915332 835
45673671
KW
836 if (file != NULL) {
837 filename = file->filename;
838 } else {
839 filename = qdict_get_try_str(options, "filename");
840 }
841
765003db
KW
842 if (drv->bdrv_needs_filename && !filename) {
843 error_setg(errp, "The '%s' block driver requires a file name",
844 drv->format_name);
845 return -EINVAL;
846 }
847
45673671 848 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
28dcee10 849
6913c0c2
BC
850 node_name = qdict_get_try_str(options, "node-name");
851 ret = bdrv_assign_node_name(bs, node_name, errp);
852 if (ret < 0) {
853 return ret;
854 }
855 qdict_del(options, "node-name");
856
5d186eb0
KW
857 /* bdrv_open() with directly using a protocol as drv. This layer is already
858 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
859 * and return immediately. */
860 if (file != NULL && drv->bdrv_file_open) {
861 bdrv_swap(file, bs);
862 return 0;
863 }
864
57915332 865 bs->open_flags = flags;
1b7fd729 866 bs->guest_block_size = 512;
c25f53b0 867 bs->request_alignment = 512;
0d51b4de 868 bs->zero_beyond_eof = true;
b64ec4e4
FZ
869 open_flags = bdrv_open_flags(bs, flags);
870 bs->read_only = !(open_flags & BDRV_O_RDWR);
871
872 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
8f94a6e4
KW
873 error_setg(errp,
874 !bs->read_only && bdrv_is_whitelisted(drv, true)
875 ? "Driver '%s' can only be used for read-only devices"
876 : "Driver '%s' is not whitelisted",
877 drv->format_name);
b64ec4e4
FZ
878 return -ENOTSUP;
879 }
57915332 880
53fec9d3 881 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
0ebd24e0
KW
882 if (flags & BDRV_O_COPY_ON_READ) {
883 if (!bs->read_only) {
884 bdrv_enable_copy_on_read(bs);
885 } else {
886 error_setg(errp, "Can't use copy-on-read on read-only device");
887 return -EINVAL;
888 }
53fec9d3
SH
889 }
890
c2ad1b0c
KW
891 if (filename != NULL) {
892 pstrcpy(bs->filename, sizeof(bs->filename), filename);
893 } else {
894 bs->filename[0] = '\0';
895 }
57915332 896
57915332 897 bs->drv = drv;
7267c094 898 bs->opaque = g_malloc0(drv->instance_size);
57915332 899
03f541bd 900 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e7c63796 901
66f82cee
KW
902 /* Open the image, either directly or using a protocol */
903 if (drv->bdrv_file_open) {
5d186eb0 904 assert(file == NULL);
030be321 905 assert(!drv->bdrv_needs_filename || filename != NULL);
34b5d2c6 906 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
f500a6d3 907 } else {
2af5ef70 908 if (file == NULL) {
34b5d2c6
HR
909 error_setg(errp, "Can't use '%s' as a block driver for the "
910 "protocol level", drv->format_name);
2af5ef70
KW
911 ret = -EINVAL;
912 goto free_and_fail;
913 }
f500a6d3 914 bs->file = file;
34b5d2c6 915 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
66f82cee
KW
916 }
917
57915332 918 if (ret < 0) {
84d18f06 919 if (local_err) {
34b5d2c6 920 error_propagate(errp, local_err);
2fa9aa59
DH
921 } else if (bs->filename[0]) {
922 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
34b5d2c6
HR
923 } else {
924 error_setg_errno(errp, -ret, "Could not open image");
925 }
57915332
KW
926 goto free_and_fail;
927 }
928
51762288
SH
929 ret = refresh_total_sectors(bs, bs->total_sectors);
930 if (ret < 0) {
34b5d2c6 931 error_setg_errno(errp, -ret, "Could not refresh total sector count");
51762288 932 goto free_and_fail;
57915332 933 }
51762288 934
d34682cd 935 bdrv_refresh_limits(bs);
c25f53b0
PB
936 assert(bdrv_opt_mem_align(bs) != 0);
937 assert(bs->request_alignment != 0);
d34682cd 938
57915332
KW
939#ifndef _WIN32
940 if (bs->is_temporary) {
d4cea8df
DH
941 assert(bs->filename[0] != '\0');
942 unlink(bs->filename);
57915332
KW
943 }
944#endif
945 return 0;
946
947free_and_fail:
f500a6d3 948 bs->file = NULL;
7267c094 949 g_free(bs->opaque);
57915332
KW
950 bs->opaque = NULL;
951 bs->drv = NULL;
952 return ret;
953}
954
b6ce07aa
KW
955/*
956 * Opens a file using a protocol (file, host_device, nbd, ...)
787e4a85
KW
957 *
958 * options is a QDict of options to pass to the block drivers, or NULL for an
959 * empty set of options. The reference to the QDict belongs to the block layer
960 * after the call (even on failure), so if the caller intends to reuse the
961 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
b6ce07aa 962 */
2e40134b 963static int bdrv_file_open(BlockDriverState **pbs, const char *filename,
5d12aa63 964 QDict *options, int flags, Error **errp)
ea2384d3 965{
72daa72e 966 BlockDriverState *bs = NULL;
6db95603 967 BlockDriver *drv;
c2ad1b0c 968 const char *drvname;
98289620 969 bool allow_protocol_prefix = false;
34b5d2c6 970 Error *local_err = NULL;
83f64091
FB
971 int ret;
972
707ff828
KW
973 /* NULL means an empty set of options */
974 if (options == NULL) {
975 options = qdict_new();
976 }
977
83f64091 978 bs = bdrv_new("");
707ff828
KW
979 bs->options = options;
980 options = qdict_clone_shallow(options);
981
035fccdf
KW
982 /* Fetch the file name from the options QDict if necessary */
983 if (!filename) {
984 filename = qdict_get_try_str(options, "filename");
985 } else if (filename && !qdict_haskey(options, "filename")) {
986 qdict_put(options, "filename", qstring_from_str(filename));
98289620 987 allow_protocol_prefix = true;
035fccdf 988 } else {
34b5d2c6
HR
989 error_setg(errp, "Can't specify 'file' and 'filename' options at the "
990 "same time");
035fccdf
KW
991 ret = -EINVAL;
992 goto fail;
993 }
994
c2ad1b0c
KW
995 /* Find the right block driver */
996 drvname = qdict_get_try_str(options, "driver");
997 if (drvname) {
8f94a6e4 998 drv = bdrv_find_format(drvname);
34b5d2c6
HR
999 if (!drv) {
1000 error_setg(errp, "Unknown driver '%s'", drvname);
1001 }
c2ad1b0c
KW
1002 qdict_del(options, "driver");
1003 } else if (filename) {
98289620
KW
1004 drv = bdrv_find_protocol(filename, allow_protocol_prefix);
1005 if (!drv) {
34b5d2c6 1006 error_setg(errp, "Unknown protocol");
98289620 1007 }
c2ad1b0c 1008 } else {
34b5d2c6 1009 error_setg(errp, "Must specify either driver or file");
c2ad1b0c
KW
1010 drv = NULL;
1011 }
1012
1013 if (!drv) {
34b5d2c6 1014 /* errp has been set already */
c2ad1b0c
KW
1015 ret = -ENOENT;
1016 goto fail;
1017 }
1018
1019 /* Parse the filename and open it */
1020 if (drv->bdrv_parse_filename && filename) {
6963a30d 1021 drv->bdrv_parse_filename(filename, options, &local_err);
84d18f06 1022 if (local_err) {
34b5d2c6 1023 error_propagate(errp, local_err);
6963a30d
KW
1024 ret = -EINVAL;
1025 goto fail;
1026 }
56d1b4d2 1027 qdict_del(options, "filename");
6963a30d
KW
1028 }
1029
505d7583 1030 if (!drv->bdrv_file_open) {
ddf5636d 1031 ret = bdrv_open(&bs, filename, NULL, options, flags, drv, &local_err);
505d7583
HR
1032 options = NULL;
1033 } else {
1034 ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
1035 }
83f64091 1036 if (ret < 0) {
34b5d2c6 1037 error_propagate(errp, local_err);
707ff828
KW
1038 goto fail;
1039 }
1040
1041 /* Check if any unknown options were used */
505d7583 1042 if (options && (qdict_size(options) != 0)) {
707ff828 1043 const QDictEntry *entry = qdict_first(options);
34b5d2c6
HR
1044 error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
1045 drv->format_name, entry->key);
707ff828
KW
1046 ret = -EINVAL;
1047 goto fail;
3b0d4f61 1048 }
707ff828
KW
1049 QDECREF(options);
1050
71d0770c 1051 bs->growable = 1;
83f64091
FB
1052 *pbs = bs;
1053 return 0;
707ff828
KW
1054
1055fail:
1056 QDECREF(options);
1057 if (!bs->drv) {
1058 QDECREF(bs->options);
1059 }
4f6fd349 1060 bdrv_unref(bs);
707ff828 1061 return ret;
83f64091
FB
1062}
1063
31ca6d07
KW
1064/*
1065 * Opens the backing file for a BlockDriverState if not yet open
1066 *
1067 * options is a QDict of options to pass to the block drivers, or NULL for an
1068 * empty set of options. The reference to the QDict is transferred to this
1069 * function (even on failure), so if the caller intends to reuse the dictionary,
1070 * it needs to use QINCREF() before calling bdrv_file_open.
1071 */
34b5d2c6 1072int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
9156df12
PB
1073{
1074 char backing_filename[PATH_MAX];
1075 int back_flags, ret;
1076 BlockDriver *back_drv = NULL;
34b5d2c6 1077 Error *local_err = NULL;
9156df12
PB
1078
1079 if (bs->backing_hd != NULL) {
31ca6d07 1080 QDECREF(options);
9156df12
PB
1081 return 0;
1082 }
1083
31ca6d07
KW
1084 /* NULL means an empty set of options */
1085 if (options == NULL) {
1086 options = qdict_new();
1087 }
1088
9156df12 1089 bs->open_flags &= ~BDRV_O_NO_BACKING;
1cb6f506
KW
1090 if (qdict_haskey(options, "file.filename")) {
1091 backing_filename[0] = '\0';
1092 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
31ca6d07 1093 QDECREF(options);
9156df12 1094 return 0;
dbecebdd
FZ
1095 } else {
1096 bdrv_get_full_backing_filename(bs, backing_filename,
1097 sizeof(backing_filename));
9156df12
PB
1098 }
1099
9156df12
PB
1100 if (bs->backing_format[0] != '\0') {
1101 back_drv = bdrv_find_format(bs->backing_format);
1102 }
1103
1104 /* backing files always opened read-only */
87a5debd
TL
1105 back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
1106 BDRV_O_COPY_ON_READ);
9156df12 1107
f67503e5
HR
1108 assert(bs->backing_hd == NULL);
1109 ret = bdrv_open(&bs->backing_hd,
ddf5636d 1110 *backing_filename ? backing_filename : NULL, NULL, options,
34b5d2c6 1111 back_flags, back_drv, &local_err);
9156df12 1112 if (ret < 0) {
9156df12
PB
1113 bs->backing_hd = NULL;
1114 bs->open_flags |= BDRV_O_NO_BACKING;
b04b6b6e
FZ
1115 error_setg(errp, "Could not open backing file: %s",
1116 error_get_pretty(local_err));
1117 error_free(local_err);
9156df12
PB
1118 return ret;
1119 }
d80ac658
PF
1120
1121 if (bs->backing_hd->file) {
1122 pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1123 bs->backing_hd->file->filename);
1124 }
1125
d34682cd
KW
1126 /* Recalculate the BlockLimits with the backing file */
1127 bdrv_refresh_limits(bs);
1128
9156df12
PB
1129 return 0;
1130}
1131
da557aac
HR
1132/*
1133 * Opens a disk image whose options are given as BlockdevRef in another block
1134 * device's options.
1135 *
1136 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
1137 * image format auto-detection. If it is false and a filename is given,
1138 * bdrv_open() will be used for auto-detection.
1139 *
1140 * If allow_none is true, no image will be opened if filename is false and no
1141 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1142 *
1143 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1144 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1145 * itself, all options starting with "${bdref_key}." are considered part of the
1146 * BlockdevRef.
1147 *
1148 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1149 *
1150 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1151 */
1152int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1153 QDict *options, const char *bdref_key, int flags,
1154 bool force_raw, bool allow_none, Error **errp)
1155{
1156 QDict *image_options;
1157 int ret;
1158 char *bdref_key_dot;
1159 const char *reference;
1160
f67503e5
HR
1161 assert(pbs);
1162 assert(*pbs == NULL);
1163
da557aac
HR
1164 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1165 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1166 g_free(bdref_key_dot);
1167
1168 reference = qdict_get_try_str(options, bdref_key);
1169 if (!filename && !reference && !qdict_size(image_options)) {
1170 if (allow_none) {
1171 ret = 0;
1172 } else {
1173 error_setg(errp, "A block device must be specified for \"%s\"",
1174 bdref_key);
1175 ret = -EINVAL;
1176 }
1177 goto done;
1178 }
1179
1180 if (filename && !force_raw) {
1181 /* If a filename is given and the block driver should be detected
1182 automatically (instead of using none), use bdrv_open() in order to do
1183 that auto-detection. */
da557aac
HR
1184 if (reference) {
1185 error_setg(errp, "Cannot reference an existing block device while "
1186 "giving a filename");
1187 ret = -EINVAL;
1188 goto done;
1189 }
1190
ddf5636d 1191 ret = bdrv_open(pbs, filename, NULL, image_options, flags, NULL, errp);
da557aac 1192 } else {
2e40134b
HR
1193 ret = bdrv_open(pbs, filename, reference, image_options,
1194 flags | BDRV_O_PROTOCOL, NULL, errp);
da557aac
HR
1195 }
1196
1197done:
1198 qdict_del(options, bdref_key);
1199 return ret;
1200}
1201
b6ce07aa
KW
1202/*
1203 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1204 *
1205 * options is a QDict of options to pass to the block drivers, or NULL for an
1206 * empty set of options. The reference to the QDict belongs to the block layer
1207 * after the call (even on failure), so if the caller intends to reuse the
1208 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1209 *
1210 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1211 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1212 *
1213 * The reference parameter may be used to specify an existing block device which
1214 * should be opened. If specified, neither options nor a filename may be given,
1215 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1216 */
ddf5636d
HR
1217int bdrv_open(BlockDriverState **pbs, const char *filename,
1218 const char *reference, QDict *options, int flags,
1219 BlockDriver *drv, Error **errp)
ea2384d3 1220{
b6ce07aa 1221 int ret;
89c9bc3d
SW
1222 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1223 char tmp_filename[PATH_MAX + 1];
f67503e5 1224 BlockDriverState *file = NULL, *bs;
74fe54f2 1225 const char *drvname;
34b5d2c6 1226 Error *local_err = NULL;
712e7874 1227
f67503e5
HR
1228 assert(pbs);
1229
ddf5636d
HR
1230 if (reference) {
1231 bool options_non_empty = options ? qdict_size(options) : false;
1232 QDECREF(options);
1233
1234 if (*pbs) {
1235 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1236 "another block device");
1237 return -EINVAL;
1238 }
1239
1240 if (filename || options_non_empty) {
1241 error_setg(errp, "Cannot reference an existing block device with "
1242 "additional options or a new filename");
1243 return -EINVAL;
1244 }
1245
1246 bs = bdrv_lookup_bs(reference, reference, errp);
1247 if (!bs) {
1248 return -ENODEV;
1249 }
1250 bdrv_ref(bs);
1251 *pbs = bs;
1252 return 0;
1253 }
1254
5d12aa63
HR
1255 if (flags & BDRV_O_PROTOCOL) {
1256 assert(!drv);
1257 return bdrv_file_open(pbs, filename, options, flags & ~BDRV_O_PROTOCOL,
1258 errp);
1259 }
1260
f67503e5
HR
1261 if (*pbs) {
1262 bs = *pbs;
1263 } else {
1264 bs = bdrv_new("");
1265 }
1266
de9c0cec
KW
1267 /* NULL means an empty set of options */
1268 if (options == NULL) {
1269 options = qdict_new();
1270 }
1271
1272 bs->options = options;
b6ad491a 1273 options = qdict_clone_shallow(options);
de9c0cec
KW
1274
1275 /* For snapshot=on, create a temporary qcow2 overlay */
83f64091 1276 if (flags & BDRV_O_SNAPSHOT) {
ea2384d3
FB
1277 BlockDriverState *bs1;
1278 int64_t total_size;
91a073a9 1279 BlockDriver *bdrv_qcow2;
08b392e1 1280 QEMUOptionParameter *create_options;
9fd3171a 1281 QDict *snapshot_options;
c2ad1b0c 1282
ea2384d3
FB
1283 /* if snapshot, we create a temporary backing file and open it
1284 instead of opening 'filename' directly */
33e3963e 1285
9fd3171a 1286 /* Get the required size from the image */
9fd3171a 1287 QINCREF(options);
f67503e5 1288 bs1 = NULL;
ddf5636d 1289 ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
c9fbb99d 1290 drv, &local_err);
51d7c00c 1291 if (ret < 0) {
de9c0cec 1292 goto fail;
ea2384d3 1293 }
3e82990b 1294 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
7c96d46e 1295
4f6fd349 1296 bdrv_unref(bs1);
3b46e624 1297
9fd3171a 1298 /* Create the temporary image */
eba25057
JM
1299 ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1300 if (ret < 0) {
34b5d2c6 1301 error_setg_errno(errp, -ret, "Could not get temporary filename");
de9c0cec 1302 goto fail;
eba25057 1303 }
7c96d46e 1304
91a073a9 1305 bdrv_qcow2 = bdrv_find_format("qcow2");
08b392e1
KW
1306 create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1307 NULL);
91a073a9 1308
08b392e1 1309 set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
91a073a9 1310
cc84d90f 1311 ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
08b392e1 1312 free_option_parameters(create_options);
51d7c00c 1313 if (ret < 0) {
34b5d2c6 1314 error_setg_errno(errp, -ret, "Could not create temporary overlay "
cc84d90f
HR
1315 "'%s': %s", tmp_filename,
1316 error_get_pretty(local_err));
1317 error_free(local_err);
1318 local_err = NULL;
de9c0cec 1319 goto fail;
ea2384d3 1320 }
91a073a9 1321
9fd3171a
KW
1322 /* Prepare a new options QDict for the temporary file, where user
1323 * options refer to the backing file */
1324 if (filename) {
1325 qdict_put(options, "file.filename", qstring_from_str(filename));
1326 }
1327 if (drv) {
1328 qdict_put(options, "driver", qstring_from_str(drv->format_name));
1329 }
1330
1331 snapshot_options = qdict_new();
1332 qdict_put(snapshot_options, "backing", options);
1333 qdict_flatten(snapshot_options);
1334
1335 bs->options = snapshot_options;
1336 options = qdict_clone_shallow(bs->options);
1337
ea2384d3 1338 filename = tmp_filename;
91a073a9 1339 drv = bdrv_qcow2;
ea2384d3
FB
1340 bs->is_temporary = 1;
1341 }
712e7874 1342
f500a6d3
KW
1343 /* Open image file without format layer */
1344 if (flags & BDRV_O_RDWR) {
1345 flags |= BDRV_O_ALLOW_RDWR;
1346 }
1347
f67503e5 1348 assert(file == NULL);
054963f8
HR
1349 ret = bdrv_open_image(&file, filename, options, "file",
1350 bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
1351 &local_err);
1352 if (ret < 0) {
1353 goto fail;
f500a6d3
KW
1354 }
1355
b6ce07aa 1356 /* Find the right image format driver */
74fe54f2
KW
1357 drvname = qdict_get_try_str(options, "driver");
1358 if (drvname) {
8f94a6e4 1359 drv = bdrv_find_format(drvname);
74fe54f2 1360 qdict_del(options, "driver");
06d22aa3
KW
1361 if (!drv) {
1362 error_setg(errp, "Invalid driver: '%s'", drvname);
1363 ret = -EINVAL;
1364 goto unlink_and_fail;
1365 }
74fe54f2
KW
1366 }
1367
6db95603 1368 if (!drv) {
2a05cbe4
HR
1369 if (file) {
1370 ret = find_image_format(file, filename, &drv, &local_err);
1371 } else {
1372 error_setg(errp, "Must specify either driver or file");
1373 ret = -EINVAL;
1374 goto unlink_and_fail;
1375 }
51d7c00c 1376 }
6987307c 1377
51d7c00c 1378 if (!drv) {
51d7c00c 1379 goto unlink_and_fail;
ea2384d3 1380 }
b6ce07aa
KW
1381
1382 /* Open the image */
34b5d2c6 1383 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
b6ce07aa 1384 if (ret < 0) {
6987307c
CH
1385 goto unlink_and_fail;
1386 }
1387
2a05cbe4 1388 if (file && (bs->file != file)) {
4f6fd349 1389 bdrv_unref(file);
f500a6d3
KW
1390 file = NULL;
1391 }
1392
b6ce07aa 1393 /* If there is a backing file, use it */
9156df12 1394 if ((flags & BDRV_O_NO_BACKING) == 0) {
31ca6d07
KW
1395 QDict *backing_options;
1396
5726d872 1397 qdict_extract_subqdict(options, &backing_options, "backing.");
34b5d2c6 1398 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
b6ce07aa 1399 if (ret < 0) {
b6ad491a 1400 goto close_and_fail;
b6ce07aa 1401 }
b6ce07aa
KW
1402 }
1403
b6ad491a
KW
1404 /* Check if any unknown options were used */
1405 if (qdict_size(options) != 0) {
1406 const QDictEntry *entry = qdict_first(options);
34b5d2c6
HR
1407 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1408 "support the option '%s'", drv->format_name, bs->device_name,
1409 entry->key);
b6ad491a
KW
1410
1411 ret = -EINVAL;
1412 goto close_and_fail;
1413 }
1414 QDECREF(options);
1415
b6ce07aa 1416 if (!bdrv_key_required(bs)) {
7d4b4ba5 1417 bdrv_dev_change_media_cb(bs, true);
b6ce07aa
KW
1418 }
1419
f67503e5 1420 *pbs = bs;
b6ce07aa
KW
1421 return 0;
1422
1423unlink_and_fail:
f500a6d3 1424 if (file != NULL) {
4f6fd349 1425 bdrv_unref(file);
f500a6d3 1426 }
b6ce07aa
KW
1427 if (bs->is_temporary) {
1428 unlink(filename);
1429 }
de9c0cec
KW
1430fail:
1431 QDECREF(bs->options);
b6ad491a 1432 QDECREF(options);
de9c0cec 1433 bs->options = NULL;
f67503e5
HR
1434 if (!*pbs) {
1435 /* If *pbs is NULL, a new BDS has been created in this function and
1436 needs to be freed now. Otherwise, it does not need to be closed,
1437 since it has not really been opened yet. */
1438 bdrv_unref(bs);
1439 }
84d18f06 1440 if (local_err) {
34b5d2c6
HR
1441 error_propagate(errp, local_err);
1442 }
b6ad491a 1443 return ret;
de9c0cec 1444
b6ad491a 1445close_and_fail:
f67503e5
HR
1446 /* See fail path, but now the BDS has to be always closed */
1447 if (*pbs) {
1448 bdrv_close(bs);
1449 } else {
1450 bdrv_unref(bs);
1451 }
b6ad491a 1452 QDECREF(options);
84d18f06 1453 if (local_err) {
34b5d2c6
HR
1454 error_propagate(errp, local_err);
1455 }
b6ce07aa
KW
1456 return ret;
1457}
1458
e971aa12
JC
1459typedef struct BlockReopenQueueEntry {
1460 bool prepared;
1461 BDRVReopenState state;
1462 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1463} BlockReopenQueueEntry;
1464
1465/*
1466 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1467 * reopen of multiple devices.
1468 *
1469 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1470 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1471 * be created and initialized. This newly created BlockReopenQueue should be
1472 * passed back in for subsequent calls that are intended to be of the same
1473 * atomic 'set'.
1474 *
1475 * bs is the BlockDriverState to add to the reopen queue.
1476 *
1477 * flags contains the open flags for the associated bs
1478 *
1479 * returns a pointer to bs_queue, which is either the newly allocated
1480 * bs_queue, or the existing bs_queue being used.
1481 *
1482 */
1483BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1484 BlockDriverState *bs, int flags)
1485{
1486 assert(bs != NULL);
1487
1488 BlockReopenQueueEntry *bs_entry;
1489 if (bs_queue == NULL) {
1490 bs_queue = g_new0(BlockReopenQueue, 1);
1491 QSIMPLEQ_INIT(bs_queue);
1492 }
1493
1494 if (bs->file) {
1495 bdrv_reopen_queue(bs_queue, bs->file, flags);
1496 }
1497
1498 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1499 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1500
1501 bs_entry->state.bs = bs;
1502 bs_entry->state.flags = flags;
1503
1504 return bs_queue;
1505}
1506
1507/*
1508 * Reopen multiple BlockDriverStates atomically & transactionally.
1509 *
1510 * The queue passed in (bs_queue) must have been built up previous
1511 * via bdrv_reopen_queue().
1512 *
1513 * Reopens all BDS specified in the queue, with the appropriate
1514 * flags. All devices are prepared for reopen, and failure of any
1515 * device will cause all device changes to be abandonded, and intermediate
1516 * data cleaned up.
1517 *
1518 * If all devices prepare successfully, then the changes are committed
1519 * to all devices.
1520 *
1521 */
1522int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1523{
1524 int ret = -1;
1525 BlockReopenQueueEntry *bs_entry, *next;
1526 Error *local_err = NULL;
1527
1528 assert(bs_queue != NULL);
1529
1530 bdrv_drain_all();
1531
1532 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1533 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1534 error_propagate(errp, local_err);
1535 goto cleanup;
1536 }
1537 bs_entry->prepared = true;
1538 }
1539
1540 /* If we reach this point, we have success and just need to apply the
1541 * changes
1542 */
1543 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1544 bdrv_reopen_commit(&bs_entry->state);
1545 }
1546
1547 ret = 0;
1548
1549cleanup:
1550 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1551 if (ret && bs_entry->prepared) {
1552 bdrv_reopen_abort(&bs_entry->state);
1553 }
1554 g_free(bs_entry);
1555 }
1556 g_free(bs_queue);
1557 return ret;
1558}
1559
1560
1561/* Reopen a single BlockDriverState with the specified flags. */
1562int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1563{
1564 int ret = -1;
1565 Error *local_err = NULL;
1566 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1567
1568 ret = bdrv_reopen_multiple(queue, &local_err);
1569 if (local_err != NULL) {
1570 error_propagate(errp, local_err);
1571 }
1572 return ret;
1573}
1574
1575
1576/*
1577 * Prepares a BlockDriverState for reopen. All changes are staged in the
1578 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1579 * the block driver layer .bdrv_reopen_prepare()
1580 *
1581 * bs is the BlockDriverState to reopen
1582 * flags are the new open flags
1583 * queue is the reopen queue
1584 *
1585 * Returns 0 on success, non-zero on error. On error errp will be set
1586 * as well.
1587 *
1588 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1589 * It is the responsibility of the caller to then call the abort() or
1590 * commit() for any other BDS that have been left in a prepare() state
1591 *
1592 */
1593int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1594 Error **errp)
1595{
1596 int ret = -1;
1597 Error *local_err = NULL;
1598 BlockDriver *drv;
1599
1600 assert(reopen_state != NULL);
1601 assert(reopen_state->bs->drv != NULL);
1602 drv = reopen_state->bs->drv;
1603
1604 /* if we are to stay read-only, do not allow permission change
1605 * to r/w */
1606 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1607 reopen_state->flags & BDRV_O_RDWR) {
1608 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1609 reopen_state->bs->device_name);
1610 goto error;
1611 }
1612
1613
1614 ret = bdrv_flush(reopen_state->bs);
1615 if (ret) {
1616 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1617 strerror(-ret));
1618 goto error;
1619 }
1620
1621 if (drv->bdrv_reopen_prepare) {
1622 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1623 if (ret) {
1624 if (local_err != NULL) {
1625 error_propagate(errp, local_err);
1626 } else {
d8b6895f
LC
1627 error_setg(errp, "failed while preparing to reopen image '%s'",
1628 reopen_state->bs->filename);
e971aa12
JC
1629 }
1630 goto error;
1631 }
1632 } else {
1633 /* It is currently mandatory to have a bdrv_reopen_prepare()
1634 * handler for each supported drv. */
1635 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1636 drv->format_name, reopen_state->bs->device_name,
1637 "reopening of file");
1638 ret = -1;
1639 goto error;
1640 }
1641
1642 ret = 0;
1643
1644error:
1645 return ret;
1646}
1647
1648/*
1649 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1650 * makes them final by swapping the staging BlockDriverState contents into
1651 * the active BlockDriverState contents.
1652 */
1653void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1654{
1655 BlockDriver *drv;
1656
1657 assert(reopen_state != NULL);
1658 drv = reopen_state->bs->drv;
1659 assert(drv != NULL);
1660
1661 /* If there are any driver level actions to take */
1662 if (drv->bdrv_reopen_commit) {
1663 drv->bdrv_reopen_commit(reopen_state);
1664 }
1665
1666 /* set BDS specific flags now */
1667 reopen_state->bs->open_flags = reopen_state->flags;
1668 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1669 BDRV_O_CACHE_WB);
1670 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac
KW
1671
1672 bdrv_refresh_limits(reopen_state->bs);
e971aa12
JC
1673}
1674
1675/*
1676 * Abort the reopen, and delete and free the staged changes in
1677 * reopen_state
1678 */
1679void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1680{
1681 BlockDriver *drv;
1682
1683 assert(reopen_state != NULL);
1684 drv = reopen_state->bs->drv;
1685 assert(drv != NULL);
1686
1687 if (drv->bdrv_reopen_abort) {
1688 drv->bdrv_reopen_abort(reopen_state);
1689 }
1690}
1691
1692
fc01f7e7
FB
1693void bdrv_close(BlockDriverState *bs)
1694{
3cbc002c
PB
1695 if (bs->job) {
1696 block_job_cancel_sync(bs->job);
1697 }
58fda173
SH
1698 bdrv_drain_all(); /* complete I/O */
1699 bdrv_flush(bs);
1700 bdrv_drain_all(); /* in case flush left pending I/O */
d7d512f6 1701 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1702
3cbc002c 1703 if (bs->drv) {
557df6ac 1704 if (bs->backing_hd) {
4f6fd349 1705 bdrv_unref(bs->backing_hd);
557df6ac
SH
1706 bs->backing_hd = NULL;
1707 }
ea2384d3 1708 bs->drv->bdrv_close(bs);
7267c094 1709 g_free(bs->opaque);
ea2384d3
FB
1710#ifdef _WIN32
1711 if (bs->is_temporary) {
1712 unlink(bs->filename);
1713 }
67b915a5 1714#endif
ea2384d3
FB
1715 bs->opaque = NULL;
1716 bs->drv = NULL;
53fec9d3 1717 bs->copy_on_read = 0;
a275fa42
PB
1718 bs->backing_file[0] = '\0';
1719 bs->backing_format[0] = '\0';
6405875c
PB
1720 bs->total_sectors = 0;
1721 bs->encrypted = 0;
1722 bs->valid_key = 0;
1723 bs->sg = 0;
1724 bs->growable = 0;
0d51b4de 1725 bs->zero_beyond_eof = false;
de9c0cec
KW
1726 QDECREF(bs->options);
1727 bs->options = NULL;
b338082b 1728
66f82cee 1729 if (bs->file != NULL) {
4f6fd349 1730 bdrv_unref(bs->file);
0ac9377d 1731 bs->file = NULL;
66f82cee 1732 }
b338082b 1733 }
98f90dba 1734
9ca11154
PH
1735 bdrv_dev_change_media_cb(bs, false);
1736
98f90dba
ZYW
1737 /*throttling disk I/O limits*/
1738 if (bs->io_limits_enabled) {
1739 bdrv_io_limits_disable(bs);
1740 }
b338082b
FB
1741}
1742
2bc93fed
MK
1743void bdrv_close_all(void)
1744{
1745 BlockDriverState *bs;
1746
dc364f4c 1747 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2bc93fed
MK
1748 bdrv_close(bs);
1749 }
1750}
1751
88266f5a
SH
1752/* Check if any requests are in-flight (including throttled requests) */
1753static bool bdrv_requests_pending(BlockDriverState *bs)
1754{
1755 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1756 return true;
1757 }
cc0681c4
BC
1758 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1759 return true;
1760 }
1761 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1762 return true;
1763 }
1764 if (bs->file && bdrv_requests_pending(bs->file)) {
1765 return true;
1766 }
1767 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1768 return true;
1769 }
1770 return false;
1771}
1772
1773static bool bdrv_requests_pending_all(void)
1774{
1775 BlockDriverState *bs;
dc364f4c 1776 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
88266f5a
SH
1777 if (bdrv_requests_pending(bs)) {
1778 return true;
1779 }
1780 }
1781 return false;
1782}
1783
922453bc
SH
1784/*
1785 * Wait for pending requests to complete across all BlockDriverStates
1786 *
1787 * This function does not flush data to disk, use bdrv_flush_all() for that
1788 * after calling this function.
4c355d53
ZYW
1789 *
1790 * Note that completion of an asynchronous I/O operation can trigger any
1791 * number of other I/O operations on other devices---for example a coroutine
1792 * can be arbitrarily complex and a constant flow of I/O can come until the
1793 * coroutine is complete. Because of this, it is not possible to have a
1794 * function to drain a single device's I/O queue.
922453bc
SH
1795 */
1796void bdrv_drain_all(void)
1797{
88266f5a
SH
1798 /* Always run first iteration so any pending completion BHs run */
1799 bool busy = true;
922453bc
SH
1800 BlockDriverState *bs;
1801
88266f5a 1802 while (busy) {
dc364f4c 1803 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
0b06ef3b 1804 bdrv_start_throttled_reqs(bs);
4c355d53 1805 }
922453bc 1806
88266f5a
SH
1807 busy = bdrv_requests_pending_all();
1808 busy |= aio_poll(qemu_get_aio_context(), busy);
922453bc
SH
1809 }
1810}
1811
dc364f4c
BC
1812/* make a BlockDriverState anonymous by removing from bdrv_state and
1813 * graph_bdrv_state list.
d22b2f41
RH
1814 Also, NULL terminate the device_name to prevent double remove */
1815void bdrv_make_anon(BlockDriverState *bs)
1816{
1817 if (bs->device_name[0] != '\0') {
dc364f4c 1818 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
d22b2f41
RH
1819 }
1820 bs->device_name[0] = '\0';
dc364f4c
BC
1821 if (bs->node_name[0] != '\0') {
1822 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1823 }
1824 bs->node_name[0] = '\0';
d22b2f41
RH
1825}
1826
e023b2e2
PB
1827static void bdrv_rebind(BlockDriverState *bs)
1828{
1829 if (bs->drv && bs->drv->bdrv_rebind) {
1830 bs->drv->bdrv_rebind(bs);
1831 }
1832}
1833
4ddc07ca
PB
1834static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1835 BlockDriverState *bs_src)
8802d1fd 1836{
4ddc07ca
PB
1837 /* move some fields that need to stay attached to the device */
1838 bs_dest->open_flags = bs_src->open_flags;
8802d1fd
JC
1839
1840 /* dev info */
4ddc07ca
PB
1841 bs_dest->dev_ops = bs_src->dev_ops;
1842 bs_dest->dev_opaque = bs_src->dev_opaque;
1843 bs_dest->dev = bs_src->dev;
1b7fd729 1844 bs_dest->guest_block_size = bs_src->guest_block_size;
4ddc07ca 1845 bs_dest->copy_on_read = bs_src->copy_on_read;
8802d1fd 1846
4ddc07ca 1847 bs_dest->enable_write_cache = bs_src->enable_write_cache;
c4a248a1 1848
cc0681c4
BC
1849 /* i/o throttled req */
1850 memcpy(&bs_dest->throttle_state,
1851 &bs_src->throttle_state,
1852 sizeof(ThrottleState));
1853 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
1854 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
4ddc07ca 1855 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
8802d1fd 1856
8802d1fd 1857 /* r/w error */
4ddc07ca
PB
1858 bs_dest->on_read_error = bs_src->on_read_error;
1859 bs_dest->on_write_error = bs_src->on_write_error;
8802d1fd
JC
1860
1861 /* i/o status */
4ddc07ca
PB
1862 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
1863 bs_dest->iostatus = bs_src->iostatus;
8802d1fd 1864
a9fc4408 1865 /* dirty bitmap */
e4654d2d 1866 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
a9fc4408 1867
9fcb0251
FZ
1868 /* reference count */
1869 bs_dest->refcnt = bs_src->refcnt;
1870
a9fc4408 1871 /* job */
4ddc07ca
PB
1872 bs_dest->in_use = bs_src->in_use;
1873 bs_dest->job = bs_src->job;
a9fc4408 1874
8802d1fd 1875 /* keep the same entry in bdrv_states */
4ddc07ca
PB
1876 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1877 bs_src->device_name);
dc364f4c
BC
1878 bs_dest->device_list = bs_src->device_list;
1879
1880 /* keep the same entry in graph_bdrv_states
1881 * We do want to swap name but don't want to swap linked list entries
1882 */
1883 bs_dest->node_list = bs_src->node_list;
4ddc07ca 1884}
8802d1fd 1885
4ddc07ca
PB
1886/*
1887 * Swap bs contents for two image chains while they are live,
1888 * while keeping required fields on the BlockDriverState that is
1889 * actually attached to a device.
1890 *
1891 * This will modify the BlockDriverState fields, and swap contents
1892 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1893 *
1894 * bs_new is required to be anonymous.
1895 *
1896 * This function does not create any image files.
1897 */
1898void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1899{
1900 BlockDriverState tmp;
f6801b83 1901
4ddc07ca
PB
1902 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1903 assert(bs_new->device_name[0] == '\0');
e4654d2d 1904 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
4ddc07ca
PB
1905 assert(bs_new->job == NULL);
1906 assert(bs_new->dev == NULL);
1907 assert(bs_new->in_use == 0);
1908 assert(bs_new->io_limits_enabled == false);
cc0681c4 1909 assert(!throttle_have_timer(&bs_new->throttle_state));
8802d1fd 1910
4ddc07ca
PB
1911 tmp = *bs_new;
1912 *bs_new = *bs_old;
1913 *bs_old = tmp;
a9fc4408 1914
4ddc07ca
PB
1915 /* there are some fields that should not be swapped, move them back */
1916 bdrv_move_feature_fields(&tmp, bs_old);
1917 bdrv_move_feature_fields(bs_old, bs_new);
1918 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 1919
4ddc07ca
PB
1920 /* bs_new shouldn't be in bdrv_states even after the swap! */
1921 assert(bs_new->device_name[0] == '\0');
1922
1923 /* Check a few fields that should remain attached to the device */
1924 assert(bs_new->dev == NULL);
1925 assert(bs_new->job == NULL);
1926 assert(bs_new->in_use == 0);
1927 assert(bs_new->io_limits_enabled == false);
cc0681c4 1928 assert(!throttle_have_timer(&bs_new->throttle_state));
e023b2e2
PB
1929
1930 bdrv_rebind(bs_new);
4ddc07ca
PB
1931 bdrv_rebind(bs_old);
1932}
1933
1934/*
1935 * Add new bs contents at the top of an image chain while the chain is
1936 * live, while keeping required fields on the top layer.
1937 *
1938 * This will modify the BlockDriverState fields, and swap contents
1939 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1940 *
1941 * bs_new is required to be anonymous.
1942 *
1943 * This function does not create any image files.
1944 */
1945void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1946{
1947 bdrv_swap(bs_new, bs_top);
1948
1949 /* The contents of 'tmp' will become bs_top, as we are
1950 * swapping bs_new and bs_top contents. */
1951 bs_top->backing_hd = bs_new;
1952 bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1953 pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1954 bs_new->filename);
1955 pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1956 bs_new->drv ? bs_new->drv->format_name : "");
8802d1fd
JC
1957}
1958
4f6fd349 1959static void bdrv_delete(BlockDriverState *bs)
b338082b 1960{
fa879d62 1961 assert(!bs->dev);
3e914655
PB
1962 assert(!bs->job);
1963 assert(!bs->in_use);
4f6fd349 1964 assert(!bs->refcnt);
e4654d2d 1965 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 1966
e1b5c52e
SH
1967 bdrv_close(bs);
1968
1b7bdbc1 1969 /* remove from list, if necessary */
d22b2f41 1970 bdrv_make_anon(bs);
34c6f050 1971
7267c094 1972 g_free(bs);
fc01f7e7
FB
1973}
1974
fa879d62
MA
1975int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1976/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 1977{
fa879d62 1978 if (bs->dev) {
18846dee
MA
1979 return -EBUSY;
1980 }
fa879d62 1981 bs->dev = dev;
28a7282a 1982 bdrv_iostatus_reset(bs);
18846dee
MA
1983 return 0;
1984}
1985
fa879d62
MA
1986/* TODO qdevified devices don't use this, remove when devices are qdevified */
1987void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 1988{
fa879d62
MA
1989 if (bdrv_attach_dev(bs, dev) < 0) {
1990 abort();
1991 }
1992}
1993
1994void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1995/* TODO change to DeviceState *dev when all users are qdevified */
1996{
1997 assert(bs->dev == dev);
1998 bs->dev = NULL;
0e49de52
MA
1999 bs->dev_ops = NULL;
2000 bs->dev_opaque = NULL;
1b7fd729 2001 bs->guest_block_size = 512;
18846dee
MA
2002}
2003
fa879d62
MA
2004/* TODO change to return DeviceState * when all users are qdevified */
2005void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 2006{
fa879d62 2007 return bs->dev;
18846dee
MA
2008}
2009
0e49de52
MA
2010void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2011 void *opaque)
2012{
2013 bs->dev_ops = ops;
2014 bs->dev_opaque = opaque;
2015}
2016
32c81a4a
PB
2017void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2018 enum MonitorEvent ev,
2019 BlockErrorAction action, bool is_read)
329c0a48
LC
2020{
2021 QObject *data;
2022 const char *action_str;
2023
2024 switch (action) {
2025 case BDRV_ACTION_REPORT:
2026 action_str = "report";
2027 break;
2028 case BDRV_ACTION_IGNORE:
2029 action_str = "ignore";
2030 break;
2031 case BDRV_ACTION_STOP:
2032 action_str = "stop";
2033 break;
2034 default:
2035 abort();
2036 }
2037
2038 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2039 bdrv->device_name,
2040 action_str,
2041 is_read ? "read" : "write");
32c81a4a 2042 monitor_protocol_event(ev, data);
329c0a48
LC
2043
2044 qobject_decref(data);
2045}
2046
6f382ed2
LC
2047static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2048{
2049 QObject *data;
2050
2051 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2052 bdrv_get_device_name(bs), ejected);
2053 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2054
2055 qobject_decref(data);
2056}
2057
7d4b4ba5 2058static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 2059{
145feb17 2060 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 2061 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 2062 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
2063 if (tray_was_closed) {
2064 /* tray open */
2065 bdrv_emit_qmp_eject_event(bs, true);
2066 }
2067 if (load) {
2068 /* tray close */
2069 bdrv_emit_qmp_eject_event(bs, false);
2070 }
145feb17
MA
2071 }
2072}
2073
2c6942fa
MA
2074bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2075{
2076 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2077}
2078
025ccaa7
PB
2079void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2080{
2081 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2082 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2083 }
2084}
2085
e4def80b
MA
2086bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2087{
2088 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2089 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2090 }
2091 return false;
2092}
2093
145feb17
MA
2094static void bdrv_dev_resize_cb(BlockDriverState *bs)
2095{
2096 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2097 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
2098 }
2099}
2100
f107639a
MA
2101bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2102{
2103 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2104 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2105 }
2106 return false;
2107}
2108
e97fc193
AL
2109/*
2110 * Run consistency checks on an image
2111 *
e076f338 2112 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2113 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2114 * check are stored in res.
e97fc193 2115 */
4534ff54 2116int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193
AL
2117{
2118 if (bs->drv->bdrv_check == NULL) {
2119 return -ENOTSUP;
2120 }
2121
e076f338 2122 memset(res, 0, sizeof(*res));
4534ff54 2123 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2124}
2125
8a426614
KW
2126#define COMMIT_BUF_SECTORS 2048
2127
33e3963e
FB
2128/* commit COW file into the raw image */
2129int bdrv_commit(BlockDriverState *bs)
2130{
19cb3738 2131 BlockDriver *drv = bs->drv;
72706ea4 2132 int64_t sector, total_sectors, length, backing_length;
8a426614 2133 int n, ro, open_flags;
0bce597d 2134 int ret = 0;
72706ea4 2135 uint8_t *buf = NULL;
c2cba3d9 2136 char filename[PATH_MAX];
33e3963e 2137
19cb3738
FB
2138 if (!drv)
2139 return -ENOMEDIUM;
4dca4b63
NS
2140
2141 if (!bs->backing_hd) {
2142 return -ENOTSUP;
33e3963e
FB
2143 }
2144
2d3735d3
SH
2145 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2146 return -EBUSY;
2147 }
2148
4dca4b63 2149 ro = bs->backing_hd->read_only;
c2cba3d9
JM
2150 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2151 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
4dca4b63
NS
2152 open_flags = bs->backing_hd->open_flags;
2153
2154 if (ro) {
0bce597d
JC
2155 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2156 return -EACCES;
4dca4b63 2157 }
ea2384d3 2158 }
33e3963e 2159
72706ea4
JC
2160 length = bdrv_getlength(bs);
2161 if (length < 0) {
2162 ret = length;
2163 goto ro_cleanup;
2164 }
2165
2166 backing_length = bdrv_getlength(bs->backing_hd);
2167 if (backing_length < 0) {
2168 ret = backing_length;
2169 goto ro_cleanup;
2170 }
2171
2172 /* If our top snapshot is larger than the backing file image,
2173 * grow the backing file image if possible. If not possible,
2174 * we must return an error */
2175 if (length > backing_length) {
2176 ret = bdrv_truncate(bs->backing_hd, length);
2177 if (ret < 0) {
2178 goto ro_cleanup;
2179 }
2180 }
2181
2182 total_sectors = length >> BDRV_SECTOR_BITS;
7267c094 2183 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
2184
2185 for (sector = 0; sector < total_sectors; sector += n) {
d663640c
PB
2186 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2187 if (ret < 0) {
2188 goto ro_cleanup;
2189 }
2190 if (ret) {
dabfa6cc
KW
2191 ret = bdrv_read(bs, sector, buf, n);
2192 if (ret < 0) {
8a426614
KW
2193 goto ro_cleanup;
2194 }
2195
dabfa6cc
KW
2196 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2197 if (ret < 0) {
8a426614
KW
2198 goto ro_cleanup;
2199 }
ea2384d3 2200 }
33e3963e 2201 }
95389c86 2202
1d44952f
CH
2203 if (drv->bdrv_make_empty) {
2204 ret = drv->bdrv_make_empty(bs);
dabfa6cc
KW
2205 if (ret < 0) {
2206 goto ro_cleanup;
2207 }
1d44952f
CH
2208 bdrv_flush(bs);
2209 }
95389c86 2210
3f5075ae
CH
2211 /*
2212 * Make sure all data we wrote to the backing device is actually
2213 * stable on disk.
2214 */
dabfa6cc 2215 if (bs->backing_hd) {
3f5075ae 2216 bdrv_flush(bs->backing_hd);
dabfa6cc 2217 }
4dca4b63 2218
dabfa6cc 2219 ret = 0;
4dca4b63 2220ro_cleanup:
7267c094 2221 g_free(buf);
4dca4b63
NS
2222
2223 if (ro) {
0bce597d
JC
2224 /* ignoring error return here */
2225 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
2226 }
2227
1d44952f 2228 return ret;
33e3963e
FB
2229}
2230
e8877497 2231int bdrv_commit_all(void)
6ab4b5ab
MA
2232{
2233 BlockDriverState *bs;
2234
dc364f4c 2235 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
272d2d8e
JC
2236 if (bs->drv && bs->backing_hd) {
2237 int ret = bdrv_commit(bs);
2238 if (ret < 0) {
2239 return ret;
2240 }
e8877497 2241 }
6ab4b5ab 2242 }
e8877497 2243 return 0;
6ab4b5ab
MA
2244}
2245
dbffbdcf
SH
2246/**
2247 * Remove an active request from the tracked requests list
2248 *
2249 * This function should be called when a tracked request is completing.
2250 */
2251static void tracked_request_end(BdrvTrackedRequest *req)
2252{
2dbafdc0
KW
2253 if (req->serialising) {
2254 req->bs->serialising_in_flight--;
2255 }
2256
dbffbdcf 2257 QLIST_REMOVE(req, list);
f4658285 2258 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
2259}
2260
2261/**
2262 * Add an active request to the tracked requests list
2263 */
2264static void tracked_request_begin(BdrvTrackedRequest *req,
2265 BlockDriverState *bs,
793ed47a
KW
2266 int64_t offset,
2267 unsigned int bytes, bool is_write)
dbffbdcf
SH
2268{
2269 *req = (BdrvTrackedRequest){
2270 .bs = bs,
2dbafdc0
KW
2271 .offset = offset,
2272 .bytes = bytes,
2273 .is_write = is_write,
2274 .co = qemu_coroutine_self(),
2275 .serialising = false,
7327145f
KW
2276 .overlap_offset = offset,
2277 .overlap_bytes = bytes,
dbffbdcf
SH
2278 };
2279
f4658285
SH
2280 qemu_co_queue_init(&req->wait_queue);
2281
dbffbdcf
SH
2282 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2283}
2284
e96126ff 2285static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2dbafdc0 2286{
7327145f 2287 int64_t overlap_offset = req->offset & ~(align - 1);
e96126ff
KW
2288 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2289 - overlap_offset;
7327145f 2290
2dbafdc0
KW
2291 if (!req->serialising) {
2292 req->bs->serialising_in_flight++;
2293 req->serialising = true;
2294 }
7327145f
KW
2295
2296 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2297 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2dbafdc0
KW
2298}
2299
d83947ac
SH
2300/**
2301 * Round a region to cluster boundaries
2302 */
343bded4
PB
2303void bdrv_round_to_clusters(BlockDriverState *bs,
2304 int64_t sector_num, int nb_sectors,
2305 int64_t *cluster_sector_num,
2306 int *cluster_nb_sectors)
d83947ac
SH
2307{
2308 BlockDriverInfo bdi;
2309
2310 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2311 *cluster_sector_num = sector_num;
2312 *cluster_nb_sectors = nb_sectors;
2313 } else {
2314 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2315 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2316 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2317 nb_sectors, c);
2318 }
2319}
2320
7327145f 2321static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2322{
2323 BlockDriverInfo bdi;
7327145f 2324 int ret;
793ed47a 2325
7327145f
KW
2326 ret = bdrv_get_info(bs, &bdi);
2327 if (ret < 0 || bdi.cluster_size == 0) {
2328 return bs->request_alignment;
793ed47a 2329 } else {
7327145f 2330 return bdi.cluster_size;
793ed47a
KW
2331 }
2332}
2333
f4658285 2334static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2335 int64_t offset, unsigned int bytes)
2336{
d83947ac 2337 /* aaaa bbbb */
7327145f 2338 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2339 return false;
2340 }
2341 /* bbbb aaaa */
7327145f 2342 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2343 return false;
2344 }
2345 return true;
f4658285
SH
2346}
2347
28de2dcd 2348static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
f4658285 2349{
2dbafdc0 2350 BlockDriverState *bs = self->bs;
f4658285
SH
2351 BdrvTrackedRequest *req;
2352 bool retry;
28de2dcd 2353 bool waited = false;
f4658285 2354
2dbafdc0 2355 if (!bs->serialising_in_flight) {
28de2dcd 2356 return false;
2dbafdc0
KW
2357 }
2358
f4658285
SH
2359 do {
2360 retry = false;
2361 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2dbafdc0 2362 if (req == self || (!req->serialising && !self->serialising)) {
65afd211
KW
2363 continue;
2364 }
7327145f
KW
2365 if (tracked_request_overlaps(req, self->overlap_offset,
2366 self->overlap_bytes))
2367 {
5f8b6491
SH
2368 /* Hitting this means there was a reentrant request, for
2369 * example, a block driver issuing nested requests. This must
2370 * never happen since it means deadlock.
2371 */
2372 assert(qemu_coroutine_self() != req->co);
2373
6460440f
KW
2374 /* If the request is already (indirectly) waiting for us, or
2375 * will wait for us as soon as it wakes up, then just go on
2376 * (instead of producing a deadlock in the former case). */
2377 if (!req->waiting_for) {
2378 self->waiting_for = req;
2379 qemu_co_queue_wait(&req->wait_queue);
2380 self->waiting_for = NULL;
2381 retry = true;
28de2dcd 2382 waited = true;
6460440f
KW
2383 break;
2384 }
f4658285
SH
2385 }
2386 }
2387 } while (retry);
28de2dcd
KW
2388
2389 return waited;
f4658285
SH
2390}
2391
756e6736
KW
2392/*
2393 * Return values:
2394 * 0 - success
2395 * -EINVAL - backing format specified, but no file
2396 * -ENOSPC - can't update the backing file because no space is left in the
2397 * image file header
2398 * -ENOTSUP - format driver doesn't support changing the backing file
2399 */
2400int bdrv_change_backing_file(BlockDriverState *bs,
2401 const char *backing_file, const char *backing_fmt)
2402{
2403 BlockDriver *drv = bs->drv;
469ef350 2404 int ret;
756e6736 2405
5f377794
PB
2406 /* Backing file format doesn't make sense without a backing file */
2407 if (backing_fmt && !backing_file) {
2408 return -EINVAL;
2409 }
2410
756e6736 2411 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2412 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2413 } else {
469ef350 2414 ret = -ENOTSUP;
756e6736 2415 }
469ef350
PB
2416
2417 if (ret == 0) {
2418 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2419 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2420 }
2421 return ret;
756e6736
KW
2422}
2423
6ebdcee2
JC
2424/*
2425 * Finds the image layer in the chain that has 'bs' as its backing file.
2426 *
2427 * active is the current topmost image.
2428 *
2429 * Returns NULL if bs is not found in active's image chain,
2430 * or if active == bs.
2431 */
2432BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2433 BlockDriverState *bs)
2434{
2435 BlockDriverState *overlay = NULL;
2436 BlockDriverState *intermediate;
2437
2438 assert(active != NULL);
2439 assert(bs != NULL);
2440
2441 /* if bs is the same as active, then by definition it has no overlay
2442 */
2443 if (active == bs) {
2444 return NULL;
2445 }
2446
2447 intermediate = active;
2448 while (intermediate->backing_hd) {
2449 if (intermediate->backing_hd == bs) {
2450 overlay = intermediate;
2451 break;
2452 }
2453 intermediate = intermediate->backing_hd;
2454 }
2455
2456 return overlay;
2457}
2458
2459typedef struct BlkIntermediateStates {
2460 BlockDriverState *bs;
2461 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2462} BlkIntermediateStates;
2463
2464
2465/*
2466 * Drops images above 'base' up to and including 'top', and sets the image
2467 * above 'top' to have base as its backing file.
2468 *
2469 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2470 * information in 'bs' can be properly updated.
2471 *
2472 * E.g., this will convert the following chain:
2473 * bottom <- base <- intermediate <- top <- active
2474 *
2475 * to
2476 *
2477 * bottom <- base <- active
2478 *
2479 * It is allowed for bottom==base, in which case it converts:
2480 *
2481 * base <- intermediate <- top <- active
2482 *
2483 * to
2484 *
2485 * base <- active
2486 *
2487 * Error conditions:
2488 * if active == top, that is considered an error
2489 *
2490 */
2491int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2492 BlockDriverState *base)
2493{
2494 BlockDriverState *intermediate;
2495 BlockDriverState *base_bs = NULL;
2496 BlockDriverState *new_top_bs = NULL;
2497 BlkIntermediateStates *intermediate_state, *next;
2498 int ret = -EIO;
2499
2500 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2501 QSIMPLEQ_INIT(&states_to_delete);
2502
2503 if (!top->drv || !base->drv) {
2504 goto exit;
2505 }
2506
2507 new_top_bs = bdrv_find_overlay(active, top);
2508
2509 if (new_top_bs == NULL) {
2510 /* we could not find the image above 'top', this is an error */
2511 goto exit;
2512 }
2513
2514 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2515 * to do, no intermediate images */
2516 if (new_top_bs->backing_hd == base) {
2517 ret = 0;
2518 goto exit;
2519 }
2520
2521 intermediate = top;
2522
2523 /* now we will go down through the list, and add each BDS we find
2524 * into our deletion queue, until we hit the 'base'
2525 */
2526 while (intermediate) {
2527 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2528 intermediate_state->bs = intermediate;
2529 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2530
2531 if (intermediate->backing_hd == base) {
2532 base_bs = intermediate->backing_hd;
2533 break;
2534 }
2535 intermediate = intermediate->backing_hd;
2536 }
2537 if (base_bs == NULL) {
2538 /* something went wrong, we did not end at the base. safely
2539 * unravel everything, and exit with error */
2540 goto exit;
2541 }
2542
2543 /* success - we can delete the intermediate states, and link top->base */
2544 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2545 base_bs->drv ? base_bs->drv->format_name : "");
2546 if (ret) {
2547 goto exit;
2548 }
2549 new_top_bs->backing_hd = base_bs;
2550
355ef4ac 2551 bdrv_refresh_limits(new_top_bs);
6ebdcee2
JC
2552
2553 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2554 /* so that bdrv_close() does not recursively close the chain */
2555 intermediate_state->bs->backing_hd = NULL;
4f6fd349 2556 bdrv_unref(intermediate_state->bs);
6ebdcee2
JC
2557 }
2558 ret = 0;
2559
2560exit:
2561 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2562 g_free(intermediate_state);
2563 }
2564 return ret;
2565}
2566
2567
71d0770c
AL
2568static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2569 size_t size)
2570{
2571 int64_t len;
2572
2573 if (!bdrv_is_inserted(bs))
2574 return -ENOMEDIUM;
2575
2576 if (bs->growable)
2577 return 0;
2578
2579 len = bdrv_getlength(bs);
2580
fbb7b4e0
KW
2581 if (offset < 0)
2582 return -EIO;
2583
2584 if ((offset > len) || (len - offset < size))
71d0770c
AL
2585 return -EIO;
2586
2587 return 0;
2588}
2589
2590static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2591 int nb_sectors)
2592{
eb5a3165
JS
2593 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2594 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
2595}
2596
1c9805a3
SH
2597typedef struct RwCo {
2598 BlockDriverState *bs;
775aa8b6 2599 int64_t offset;
1c9805a3
SH
2600 QEMUIOVector *qiov;
2601 bool is_write;
2602 int ret;
4105eaaa 2603 BdrvRequestFlags flags;
1c9805a3
SH
2604} RwCo;
2605
2606static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2607{
1c9805a3 2608 RwCo *rwco = opaque;
ea2384d3 2609
1c9805a3 2610 if (!rwco->is_write) {
775aa8b6
KW
2611 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2612 rwco->qiov->size, rwco->qiov,
4105eaaa 2613 rwco->flags);
775aa8b6
KW
2614 } else {
2615 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2616 rwco->qiov->size, rwco->qiov,
2617 rwco->flags);
1c9805a3
SH
2618 }
2619}
e7a8a783 2620
1c9805a3 2621/*
8d3b1a2d 2622 * Process a vectored synchronous request using coroutines
1c9805a3 2623 */
775aa8b6
KW
2624static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2625 QEMUIOVector *qiov, bool is_write,
2626 BdrvRequestFlags flags)
1c9805a3 2627{
1c9805a3
SH
2628 Coroutine *co;
2629 RwCo rwco = {
2630 .bs = bs,
775aa8b6 2631 .offset = offset,
8d3b1a2d 2632 .qiov = qiov,
1c9805a3
SH
2633 .is_write = is_write,
2634 .ret = NOT_DONE,
4105eaaa 2635 .flags = flags,
1c9805a3 2636 };
e7a8a783 2637
498e386c
ZYW
2638 /**
2639 * In sync call context, when the vcpu is blocked, this throttling timer
2640 * will not fire; so the I/O throttling function has to be disabled here
2641 * if it has been enabled.
2642 */
2643 if (bs->io_limits_enabled) {
2644 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2645 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2646 bdrv_io_limits_disable(bs);
2647 }
2648
1c9805a3
SH
2649 if (qemu_in_coroutine()) {
2650 /* Fast-path if already in coroutine context */
2651 bdrv_rw_co_entry(&rwco);
2652 } else {
2653 co = qemu_coroutine_create(bdrv_rw_co_entry);
2654 qemu_coroutine_enter(co, &rwco);
2655 while (rwco.ret == NOT_DONE) {
2656 qemu_aio_wait();
2657 }
2658 }
2659 return rwco.ret;
2660}
b338082b 2661
8d3b1a2d
KW
2662/*
2663 * Process a synchronous request using coroutines
2664 */
2665static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2666 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2667{
2668 QEMUIOVector qiov;
2669 struct iovec iov = {
2670 .iov_base = (void *)buf,
2671 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2672 };
2673
2674 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2675 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2676 &qiov, is_write, flags);
8d3b1a2d
KW
2677}
2678
1c9805a3
SH
2679/* return < 0 if error. See bdrv_write() for the return codes */
2680int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2681 uint8_t *buf, int nb_sectors)
2682{
4105eaaa 2683 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
fc01f7e7
FB
2684}
2685
07d27a44
MA
2686/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2687int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2688 uint8_t *buf, int nb_sectors)
2689{
2690 bool enabled;
2691 int ret;
2692
2693 enabled = bs->io_limits_enabled;
2694 bs->io_limits_enabled = false;
4e7395e8 2695 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2696 bs->io_limits_enabled = enabled;
2697 return ret;
2698}
2699
5fafdf24 2700/* Return < 0 if error. Important errors are:
19cb3738
FB
2701 -EIO generic I/O error (may happen for all errors)
2702 -ENOMEDIUM No media inserted.
2703 -EINVAL Invalid sector number or nb_sectors
2704 -EACCES Trying to write a read-only device
2705*/
5fafdf24 2706int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2707 const uint8_t *buf, int nb_sectors)
2708{
4105eaaa 2709 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
83f64091
FB
2710}
2711
aa7bfbff
PL
2712int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2713 int nb_sectors, BdrvRequestFlags flags)
4105eaaa
PL
2714{
2715 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
aa7bfbff 2716 BDRV_REQ_ZERO_WRITE | flags);
8d3b1a2d
KW
2717}
2718
d75cbb5e
PL
2719/*
2720 * Completely zero out a block device with the help of bdrv_write_zeroes.
2721 * The operation is sped up by checking the block status and only writing
2722 * zeroes to the device if they currently do not return zeroes. Optional
2723 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2724 *
2725 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2726 */
2727int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2728{
2729 int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2730 int64_t ret, nb_sectors, sector_num = 0;
2731 int n;
2732
2733 for (;;) {
2734 nb_sectors = target_size - sector_num;
2735 if (nb_sectors <= 0) {
2736 return 0;
2737 }
2738 if (nb_sectors > INT_MAX) {
2739 nb_sectors = INT_MAX;
2740 }
2741 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
3d94ce60
PL
2742 if (ret < 0) {
2743 error_report("error getting block status at sector %" PRId64 ": %s",
2744 sector_num, strerror(-ret));
2745 return ret;
2746 }
d75cbb5e
PL
2747 if (ret & BDRV_BLOCK_ZERO) {
2748 sector_num += n;
2749 continue;
2750 }
2751 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2752 if (ret < 0) {
2753 error_report("error writing zeroes at sector %" PRId64 ": %s",
2754 sector_num, strerror(-ret));
2755 return ret;
2756 }
2757 sector_num += n;
2758 }
2759}
2760
a3ef6571 2761int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2762{
a3ef6571
KW
2763 QEMUIOVector qiov;
2764 struct iovec iov = {
2765 .iov_base = (void *)buf,
2766 .iov_len = bytes,
2767 };
9a8c4cce 2768 int ret;
83f64091 2769
a3ef6571
KW
2770 if (bytes < 0) {
2771 return -EINVAL;
83f64091
FB
2772 }
2773
a3ef6571
KW
2774 qemu_iovec_init_external(&qiov, &iov, 1);
2775 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2776 if (ret < 0) {
2777 return ret;
83f64091 2778 }
a3ef6571
KW
2779
2780 return bytes;
83f64091
FB
2781}
2782
8d3b1a2d 2783int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2784{
9a8c4cce 2785 int ret;
83f64091 2786
8407d5d7
KW
2787 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2788 if (ret < 0) {
2789 return ret;
83f64091
FB
2790 }
2791
8d3b1a2d
KW
2792 return qiov->size;
2793}
2794
2795int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2796 const void *buf, int bytes)
8d3b1a2d
KW
2797{
2798 QEMUIOVector qiov;
2799 struct iovec iov = {
2800 .iov_base = (void *) buf,
8407d5d7 2801 .iov_len = bytes,
8d3b1a2d
KW
2802 };
2803
8407d5d7
KW
2804 if (bytes < 0) {
2805 return -EINVAL;
2806 }
2807
8d3b1a2d
KW
2808 qemu_iovec_init_external(&qiov, &iov, 1);
2809 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2810}
83f64091 2811
f08145fe
KW
2812/*
2813 * Writes to the file and ensures that no writes are reordered across this
2814 * request (acts as a barrier)
2815 *
2816 * Returns 0 on success, -errno in error cases.
2817 */
2818int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2819 const void *buf, int count)
2820{
2821 int ret;
2822
2823 ret = bdrv_pwrite(bs, offset, buf, count);
2824 if (ret < 0) {
2825 return ret;
2826 }
2827
f05fa4ad
PB
2828 /* No flush needed for cache modes that already do it */
2829 if (bs->enable_write_cache) {
f08145fe
KW
2830 bdrv_flush(bs);
2831 }
2832
2833 return 0;
2834}
2835
470c0504 2836static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
2837 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2838{
2839 /* Perform I/O through a temporary buffer so that users who scribble over
2840 * their read buffer while the operation is in progress do not end up
2841 * modifying the image file. This is critical for zero-copy guest I/O
2842 * where anything might happen inside guest memory.
2843 */
2844 void *bounce_buffer;
2845
79c053bd 2846 BlockDriver *drv = bs->drv;
ab185921
SH
2847 struct iovec iov;
2848 QEMUIOVector bounce_qiov;
2849 int64_t cluster_sector_num;
2850 int cluster_nb_sectors;
2851 size_t skip_bytes;
2852 int ret;
2853
2854 /* Cover entire cluster so no additional backing file I/O is required when
2855 * allocating cluster in the image file.
2856 */
343bded4
PB
2857 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2858 &cluster_sector_num, &cluster_nb_sectors);
ab185921 2859
470c0504
SH
2860 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2861 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
2862
2863 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2864 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2865 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2866
79c053bd
SH
2867 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2868 &bounce_qiov);
ab185921
SH
2869 if (ret < 0) {
2870 goto err;
2871 }
2872
79c053bd
SH
2873 if (drv->bdrv_co_write_zeroes &&
2874 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589 2875 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
aa7bfbff 2876 cluster_nb_sectors, 0);
79c053bd 2877 } else {
f05fa4ad
PB
2878 /* This does not change the data on the disk, it is not necessary
2879 * to flush even in cache=writethrough mode.
2880 */
79c053bd 2881 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 2882 &bounce_qiov);
79c053bd
SH
2883 }
2884
ab185921
SH
2885 if (ret < 0) {
2886 /* It might be okay to ignore write errors for guest requests. If this
2887 * is a deliberate copy-on-read then we don't want to ignore the error.
2888 * Simply report it in all cases.
2889 */
2890 goto err;
2891 }
2892
2893 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
03396148
MT
2894 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2895 nb_sectors * BDRV_SECTOR_SIZE);
ab185921
SH
2896
2897err:
2898 qemu_vfree(bounce_buffer);
2899 return ret;
2900}
2901
c5fbe571 2902/*
d0c7f642
KW
2903 * Forwards an already correctly aligned request to the BlockDriver. This
2904 * handles copy on read and zeroing after EOF; any other features must be
2905 * implemented by the caller.
c5fbe571 2906 */
d0c7f642 2907static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 2908 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 2909 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
2910{
2911 BlockDriver *drv = bs->drv;
dbffbdcf 2912 int ret;
da1fa91d 2913
d0c7f642
KW
2914 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2915 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 2916
d0c7f642
KW
2917 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2918 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2919
2920 /* Handle Copy on Read and associated serialisation */
470c0504 2921 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
2922 /* If we touch the same cluster it counts as an overlap. This
2923 * guarantees that allocating writes will be serialized and not race
2924 * with each other for the same cluster. For example, in copy-on-read
2925 * it ensures that the CoR read and write operations are atomic and
2926 * guest writes cannot interleave between them. */
2927 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
2928 }
2929
2dbafdc0 2930 wait_serialising_requests(req);
f4658285 2931
470c0504 2932 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
2933 int pnum;
2934
bdad13b9 2935 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
2936 if (ret < 0) {
2937 goto out;
2938 }
2939
2940 if (!ret || pnum != nb_sectors) {
470c0504 2941 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
2942 goto out;
2943 }
2944 }
2945
d0c7f642 2946 /* Forward the request to the BlockDriver */
893a8f62
MK
2947 if (!(bs->zero_beyond_eof && bs->growable)) {
2948 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2949 } else {
2950 /* Read zeros after EOF of growable BDSes */
2951 int64_t len, total_sectors, max_nb_sectors;
2952
2953 len = bdrv_getlength(bs);
2954 if (len < 0) {
2955 ret = len;
2956 goto out;
2957 }
2958
d055a1fe 2959 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
5f5bcd80
KW
2960 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2961 align >> BDRV_SECTOR_BITS);
893a8f62
MK
2962 if (max_nb_sectors > 0) {
2963 ret = drv->bdrv_co_readv(bs, sector_num,
2964 MIN(nb_sectors, max_nb_sectors), qiov);
2965 } else {
2966 ret = 0;
2967 }
2968
2969 /* Reading beyond end of file is supposed to produce zeroes */
2970 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2971 uint64_t offset = MAX(0, total_sectors - sector_num);
2972 uint64_t bytes = (sector_num + nb_sectors - offset) *
2973 BDRV_SECTOR_SIZE;
2974 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2975 }
2976 }
ab185921
SH
2977
2978out:
dbffbdcf 2979 return ret;
da1fa91d
KW
2980}
/*
 * Handle a read request in coroutine context.
 *
 * Byte-granularity entry point for reads: enforces basic request checks,
 * applies I/O throttling, pads the request out to the driver's required
 * alignment, and forwards the aligned request to bdrv_aligned_preadv().
 *
 * @bs:     block driver state to read from
 * @offset: byte offset of the request
 * @bytes:  request length in bytes
 * @qiov:   destination scatter/gather list (must cover @bytes)
 * @flags:  BdrvRequestFlags; BDRV_REQ_COPY_ON_READ is ORed in automatically
 *          when bs->copy_on_read is set
 *
 * Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O; may yield the coroutine until quota is available */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov with a head buffer that
     * covers the bytes between the aligned start and the requested offset */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    /* Likewise pad the tail up to the next alignment boundary */
    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    /* Track the (aligned) request so overlapping writers can serialise */
    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
3055
1b0288ae
KW
3056static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3057 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3058 BdrvRequestFlags flags)
3059{
3060 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3061 return -EINVAL;
3062 }
3063
3064 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3065 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3066}
3067
c5fbe571 3068int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3069 int nb_sectors, QEMUIOVector *qiov)
3070{
c5fbe571 3071 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3072
470c0504
SH
3073 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3074}
3075
3076int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3077 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3078{
3079 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3080
3081 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3082 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3083}
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

/*
 * Write zeroes to a sector range, preferring the driver's efficient
 * bdrv_co_write_zeroes callback and falling back to writing an explicit
 * zero-filled bounce buffer via bdrv_co_writev.
 *
 * The range is processed in chunks: each chunk is trimmed to the driver's
 * write_zeroes_alignment and capped at max_write_zeroes sectors.
 *
 * Returns 0 on success, negative errno on the first failing chunk.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};   /* iov_base == NULL means "no bounce buffer yet" */
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector. num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                /* Lazily allocate and zero the bounce buffer on first use */
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * all future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    /* free(NULL) semantics: safe if no bounce buffer was ever allocated */
    qemu_vfree(iov.iov_base);
    return ret;
}
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 *
 * @req must be a tracked request whose overlap range covers [offset,
 * offset + bytes).  Both offset and bytes must be sector-aligned.
 *
 * Waits for conflicting serialising requests, fires before-write
 * notifiers (which may fail the request), performs the write (or the
 * zero-write fast path), optionally flushes for writethrough mode, and
 * updates dirty bitmaps and size bookkeeping.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* If we had to wait, our own request cannot have been marked
     * serialising in the meantime (serialising requests never wait). */
    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    /* Writethrough cache mode: flush after every successful write */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    /* Track the highest sector ever written (e.g. for image shrinking) */
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    /* Growable files may have been extended by this write */
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
/*
 * Handle a write request in coroutine context.
 *
 * Byte-granularity write entry point: checks the request, throttles it,
 * and if offset/bytes are not aligned to the driver's requirement,
 * performs a read-modify-write (RMW) cycle: the unaligned head and tail
 * are read into bounce buffers, combined with the caller's qiov, and the
 * enlarged aligned request is passed to bdrv_aligned_pwritev().
 *
 * Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        /* Serialise against overlapping requests before reading the head,
         * so nobody modifies it between our read and our write. */
        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        /* If we already built a local qiov for the head, the head pass
         * must have done all necessary waiting already. */
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append only the part of the tail buffer that lies beyond the
         * caller's data */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3332
6601553e
KW
3333static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3334 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3335 BdrvRequestFlags flags)
3336{
3337 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3338 return -EINVAL;
3339 }
3340
3341 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3342 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3343}
3344
c5fbe571
SH
3345int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3346 int nb_sectors, QEMUIOVector *qiov)
3347{
3348 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3349
f08f2dda
SH
3350 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3351}
3352
3353int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3354 int64_t sector_num, int nb_sectors,
3355 BdrvRequestFlags flags)
f08f2dda 3356{
94d6ff21 3357 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3358
d32f35cb
PL
3359 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3360 flags &= ~BDRV_REQ_MAY_UNMAP;
3361 }
3362
f08f2dda 3363 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3364 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3365}
3366
83f64091
FB
3367/**
3368 * Truncate file to 'offset' bytes (needed only for file protocols)
3369 */
3370int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3371{
3372 BlockDriver *drv = bs->drv;
51762288 3373 int ret;
83f64091 3374 if (!drv)
19cb3738 3375 return -ENOMEDIUM;
83f64091
FB
3376 if (!drv->bdrv_truncate)
3377 return -ENOTSUP;
59f2689d
NS
3378 if (bs->read_only)
3379 return -EACCES;
8591675f
MT
3380 if (bdrv_in_use(bs))
3381 return -EBUSY;
51762288
SH
3382 ret = drv->bdrv_truncate(bs, offset);
3383 if (ret == 0) {
3384 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 3385 bdrv_dev_resize_cb(bs);
51762288
SH
3386 }
3387 return ret;
83f64091
FB
3388}
3389
4a1d5e1f
FZ
3390/**
3391 * Length of a allocated file in bytes. Sparse files are counted by actual
3392 * allocated space. Return < 0 if error or unknown.
3393 */
3394int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3395{
3396 BlockDriver *drv = bs->drv;
3397 if (!drv) {
3398 return -ENOMEDIUM;
3399 }
3400 if (drv->bdrv_get_allocated_file_size) {
3401 return drv->bdrv_get_allocated_file_size(bs);
3402 }
3403 if (bs->file) {
3404 return bdrv_get_allocated_file_size(bs->file);
3405 }
3406 return -ENOTSUP;
3407}
3408
83f64091
FB
3409/**
3410 * Length of a file in bytes. Return < 0 if error or unknown.
3411 */
3412int64_t bdrv_getlength(BlockDriverState *bs)
3413{
3414 BlockDriver *drv = bs->drv;
3415 if (!drv)
19cb3738 3416 return -ENOMEDIUM;
51762288 3417
b94a2610
KW
3418 if (drv->has_variable_length) {
3419 int ret = refresh_total_sectors(bs, bs->total_sectors);
3420 if (ret < 0) {
3421 return ret;
46a4e4e6 3422 }
83f64091 3423 }
46a4e4e6 3424 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3425}
3426
19cb3738 3427/* return 0 as number of sectors if no device present or error */
96b8f136 3428void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3429{
19cb3738
FB
3430 int64_t length;
3431 length = bdrv_getlength(bs);
3432 if (length < 0)
3433 length = 0;
3434 else
6ea44308 3435 length = length >> BDRV_SECTOR_BITS;
19cb3738 3436 *nb_sectors_ptr = length;
fc01f7e7 3437}
cf98951b 3438
ff06f5f3
PB
3439void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3440 BlockdevOnError on_write_error)
abd7f68d
MA
3441{
3442 bs->on_read_error = on_read_error;
3443 bs->on_write_error = on_write_error;
3444}
3445
1ceee0d5 3446BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3447{
3448 return is_read ? bs->on_read_error : bs->on_write_error;
3449}
3450
3e1caa5f
PB
3451BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3452{
3453 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3454
3455 switch (on_err) {
3456 case BLOCKDEV_ON_ERROR_ENOSPC:
3457 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3458 case BLOCKDEV_ON_ERROR_STOP:
3459 return BDRV_ACTION_STOP;
3460 case BLOCKDEV_ON_ERROR_REPORT:
3461 return BDRV_ACTION_REPORT;
3462 case BLOCKDEV_ON_ERROR_IGNORE:
3463 return BDRV_ACTION_IGNORE;
3464 default:
3465 abort();
3466 }
3467}
/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 *
 * Emits a QMP BLOCK_IO_ERROR event; for BDRV_ACTION_STOP it also stops
 * the VM and records the errno in the device's iostatus.
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);
    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
    if (action == BDRV_ACTION_STOP) {
        /* Stop the guest first, then remember why we stopped */
        vm_stop(RUN_STATE_IO_ERROR);
        bdrv_iostatus_set_err(bs, error);
    }
}
3483
b338082b
FB
3484int bdrv_is_read_only(BlockDriverState *bs)
3485{
3486 return bs->read_only;
3487}
3488
985a03b0
TS
3489int bdrv_is_sg(BlockDriverState *bs)
3490{
3491 return bs->sg;
3492}
3493
e900a7b7
CH
3494int bdrv_enable_write_cache(BlockDriverState *bs)
3495{
3496 return bs->enable_write_cache;
3497}
3498
425b0148
PB
3499void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3500{
3501 bs->enable_write_cache = wce;
55b110f2
JC
3502
3503 /* so a reopen() will preserve wce */
3504 if (wce) {
3505 bs->open_flags |= BDRV_O_CACHE_WB;
3506 } else {
3507 bs->open_flags &= ~BDRV_O_CACHE_WB;
3508 }
425b0148
PB
3509}
3510
ea2384d3
FB
3511int bdrv_is_encrypted(BlockDriverState *bs)
3512{
3513 if (bs->backing_hd && bs->backing_hd->encrypted)
3514 return 1;
3515 return bs->encrypted;
3516}
3517
c0f4ce77
AL
3518int bdrv_key_required(BlockDriverState *bs)
3519{
3520 BlockDriverState *backing_hd = bs->backing_hd;
3521
3522 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3523 return 1;
3524 return (bs->encrypted && !bs->valid_key);
3525}
3526
ea2384d3
FB
3527int bdrv_set_key(BlockDriverState *bs, const char *key)
3528{
3529 int ret;
3530 if (bs->backing_hd && bs->backing_hd->encrypted) {
3531 ret = bdrv_set_key(bs->backing_hd, key);
3532 if (ret < 0)
3533 return ret;
3534 if (!bs->encrypted)
3535 return 0;
3536 }
fd04a2ae
SH
3537 if (!bs->encrypted) {
3538 return -EINVAL;
3539 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3540 return -ENOMEDIUM;
3541 }
c0f4ce77 3542 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3543 if (ret < 0) {
3544 bs->valid_key = 0;
3545 } else if (!bs->valid_key) {
3546 bs->valid_key = 1;
3547 /* call the change callback now, we skipped it on open */
7d4b4ba5 3548 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 3549 }
c0f4ce77 3550 return ret;
ea2384d3
FB
3551}
3552
f8d6bba1 3553const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3554{
f8d6bba1 3555 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3556}
3557
5fafdf24 3558void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3559 void *opaque)
3560{
3561 BlockDriver *drv;
3562
8a22f02a 3563 QLIST_FOREACH(drv, &bdrv_drivers, list) {
ea2384d3
FB
3564 it(opaque, drv->format_name);
3565 }
3566}
3567
dc364f4c 3568/* This function is to find block backend bs */
b338082b
FB
3569BlockDriverState *bdrv_find(const char *name)
3570{
3571 BlockDriverState *bs;
3572
dc364f4c 3573 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1b7bdbc1 3574 if (!strcmp(name, bs->device_name)) {
b338082b 3575 return bs;
1b7bdbc1 3576 }
b338082b
FB
3577 }
3578 return NULL;
3579}
3580
dc364f4c
BC
3581/* This function is to find a node in the bs graph */
3582BlockDriverState *bdrv_find_node(const char *node_name)
3583{
3584 BlockDriverState *bs;
3585
3586 assert(node_name);
3587
3588 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3589 if (!strcmp(node_name, bs->node_name)) {
3590 return bs;
3591 }
3592 }
3593 return NULL;
3594}
3595
c13163fb
BC
3596/* Put this QMP function here so it can access the static graph_bdrv_states. */
3597BlockDeviceInfoList *bdrv_named_nodes_list(void)
3598{
3599 BlockDeviceInfoList *list, *entry;
3600 BlockDriverState *bs;
3601
3602 list = NULL;
3603 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3604 entry = g_malloc0(sizeof(*entry));
3605 entry->value = bdrv_block_device_info(bs);
3606 entry->next = list;
3607 list = entry;
3608 }
3609
3610 return list;
3611}
3612
12d3ba82
BC
3613BlockDriverState *bdrv_lookup_bs(const char *device,
3614 const char *node_name,
3615 Error **errp)
3616{
3617 BlockDriverState *bs = NULL;
3618
12d3ba82
BC
3619 if (device) {
3620 bs = bdrv_find(device);
3621
dd67fa50
BC
3622 if (bs) {
3623 return bs;
12d3ba82 3624 }
12d3ba82
BC
3625 }
3626
dd67fa50
BC
3627 if (node_name) {
3628 bs = bdrv_find_node(node_name);
12d3ba82 3629
dd67fa50
BC
3630 if (bs) {
3631 return bs;
3632 }
12d3ba82
BC
3633 }
3634
dd67fa50
BC
3635 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3636 device ? device : "",
3637 node_name ? node_name : "");
3638 return NULL;
12d3ba82
BC
3639}
3640
2f399b0a
MA
3641BlockDriverState *bdrv_next(BlockDriverState *bs)
3642{
3643 if (!bs) {
3644 return QTAILQ_FIRST(&bdrv_states);
3645 }
dc364f4c 3646 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3647}
3648
51de9760 3649void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
3650{
3651 BlockDriverState *bs;
3652
dc364f4c 3653 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
51de9760 3654 it(opaque, bs);
81d0912d
FB
3655 }
3656}
3657
ea2384d3
FB
3658const char *bdrv_get_device_name(BlockDriverState *bs)
3659{
3660 return bs->device_name;
3661}
3662
c8433287
MA
3663int bdrv_get_flags(BlockDriverState *bs)
3664{
3665 return bs->open_flags;
3666}
3667
f0f0fdfe 3668int bdrv_flush_all(void)
c6ca28d6
AL
3669{
3670 BlockDriverState *bs;
f0f0fdfe 3671 int result = 0;
c6ca28d6 3672
dc364f4c 3673 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
f0f0fdfe
KW
3674 int ret = bdrv_flush(bs);
3675 if (ret < 0 && !result) {
3676 result = ret;
3677 }
1b7bdbc1 3678 }
f0f0fdfe
KW
3679
3680 return result;
c6ca28d6
AL
3681}
3682
3ac21627
PL
3683int bdrv_has_zero_init_1(BlockDriverState *bs)
3684{
3685 return 1;
3686}
3687
f2feebbd
KW
3688int bdrv_has_zero_init(BlockDriverState *bs)
3689{
3690 assert(bs->drv);
3691
11212d8f
PB
3692 /* If BS is a copy on write image, it is initialized to
3693 the contents of the base image, which may not be zeroes. */
3694 if (bs->backing_hd) {
3695 return 0;
3696 }
336c1c12
KW
3697 if (bs->drv->bdrv_has_zero_init) {
3698 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3699 }
3700
3ac21627
PL
3701 /* safe default */
3702 return 0;
f2feebbd
KW
3703}
3704
4ce78691
PL
3705bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3706{
3707 BlockDriverInfo bdi;
3708
3709 if (bs->backing_hd) {
3710 return false;
3711 }
3712
3713 if (bdrv_get_info(bs, &bdi) == 0) {
3714 return bdi.unallocated_blocks_are_zero;
3715 }
3716
3717 return false;
3718}
3719
3720bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3721{
3722 BlockDriverInfo bdi;
3723
3724 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3725 return false;
3726 }
3727
3728 if (bdrv_get_info(bs, &bdi) == 0) {
3729 return bdi.can_write_zeroes_with_unmap;
3730 }
3731
3732 return false;
3733}
/* Argument/result bundle shuttled between bdrv_get_block_status() and
 * its coroutine entry point bdrv_get_block_status_co_entry(). */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;   /* unused by the status query itself; presumably
                               * for allocation-above queries — TODO confirm */
    int64_t sector_num;       /* first sector of the query */
    int nb_sectors;           /* maximum number of sectors to examine */
    int *pnum;                /* out: sectors in the same state */
    int64_t ret;              /* out: BDRV_BLOCK_* status or -errno */
    bool done;                /* set by the coroutine when ret is valid */
} BdrvCoGetBlockStatusData;
/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * Returns a combination of BDRV_BLOCK_* flags (possibly with an encoded
 * host offset), or a negative errno.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    /* Query entirely past EOF: nothing to report */
    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    /* Clamp the query to the end of the image */
    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    /* Driver cannot report status: treat everything as allocated data,
     * with a valid host offset for raw protocol drivers */
    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    /* BDRV_BLOCK_RAW means "ask the protocol layer at the given offset" */
    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    /* Unallocated and not explicitly zero: see whether zeroes can still
     * be inferred (no backing data, or beyond the backing file's end) */
    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    /* With a known host offset, let the protocol layer refine the zero
     * information (best effort; errors are deliberately ignored) */
    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}
3831
b6b8a333
PB
3832/* Coroutine wrapper for bdrv_get_block_status() */
3833static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 3834{
b6b8a333 3835 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
3836 BlockDriverState *bs = data->bs;
3837
b6b8a333
PB
3838 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3839 data->pnum);
060f51c9
SH
3840 data->done = true;
3841}
3842
3843/*
b6b8a333 3844 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 3845 *
b6b8a333 3846 * See bdrv_co_get_block_status() for details.
060f51c9 3847 */
b6b8a333
PB
3848int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3849 int nb_sectors, int *pnum)
060f51c9 3850{
6aebab14 3851 Coroutine *co;
b6b8a333 3852 BdrvCoGetBlockStatusData data = {
6aebab14
SH
3853 .bs = bs,
3854 .sector_num = sector_num,
3855 .nb_sectors = nb_sectors,
3856 .pnum = pnum,
3857 .done = false,
3858 };
3859
bdad13b9
PB
3860 if (qemu_in_coroutine()) {
3861 /* Fast-path if already in coroutine context */
b6b8a333 3862 bdrv_get_block_status_co_entry(&data);
bdad13b9 3863 } else {
b6b8a333 3864 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
3865 qemu_coroutine_enter(co, &data);
3866 while (!data.done) {
3867 qemu_aio_wait();
3868 }
6aebab14
SH
3869 }
3870 return data.ret;
f58c7b35
TS
3871}
3872
b6b8a333
PB
3873int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3874 int nb_sectors, int *pnum)
3875{
4333bb71
PB
3876 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3877 if (ret < 0) {
3878 return ret;
3879 }
3880 return
3881 (ret & BDRV_BLOCK_DATA) ||
3882 ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
b6b8a333
PB
3883}
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive). BASE can be NULL to check if the given
 * sector is allocated in any image of the chain. Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * Negative return values are errors propagated from bdrv_is_allocated().
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    /* Walk the chain from top towards (but excluding) base */
    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            /* Allocated in this layer: report its run length */
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            /* Shrink the reported run so it stays uniformly unallocated */
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
3935
045df330
AL
3936const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3937{
3938 if (bs->backing_hd && bs->backing_hd->encrypted)
3939 return bs->backing_file;
3940 else if (bs->encrypted)
3941 return bs->filename;
3942 else
3943 return NULL;
3944}
3945
5fafdf24 3946void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
3947 char *filename, int filename_size)
3948{
3574c608 3949 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
3950}
3951
5fafdf24 3952int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
3953 const uint8_t *buf, int nb_sectors)
3954{
3955 BlockDriver *drv = bs->drv;
3956 if (!drv)
19cb3738 3957 return -ENOMEDIUM;
faea38e7
FB
3958 if (!drv->bdrv_write_compressed)
3959 return -ENOTSUP;
fbb7b4e0
KW
3960 if (bdrv_check_request(bs, sector_num, nb_sectors))
3961 return -EIO;
a55eb92c 3962
e4654d2d 3963 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 3964
faea38e7
FB
3965 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3966}
3b46e624 3967
faea38e7
FB
3968int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3969{
3970 BlockDriver *drv = bs->drv;
3971 if (!drv)
19cb3738 3972 return -ENOMEDIUM;
faea38e7
FB
3973 if (!drv->bdrv_get_info)
3974 return -ENOTSUP;
3975 memset(bdi, 0, sizeof(*bdi));
3976 return drv->bdrv_get_info(bs, bdi);
3977}
3978
eae041fe
HR
3979ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3980{
3981 BlockDriver *drv = bs->drv;
3982 if (drv && drv->bdrv_get_specific_info) {
3983 return drv->bdrv_get_specific_info(bs);
3984 }
3985 return NULL;
3986}
3987
45566e9c
CH
3988int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3989 int64_t pos, int size)
cf8074b3
KW
3990{
3991 QEMUIOVector qiov;
3992 struct iovec iov = {
3993 .iov_base = (void *) buf,
3994 .iov_len = size,
3995 };
3996
3997 qemu_iovec_init_external(&qiov, &iov, 1);
3998 return bdrv_writev_vmstate(bs, &qiov, pos);
3999}
4000
4001int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4002{
4003 BlockDriver *drv = bs->drv;
cf8074b3
KW
4004
4005 if (!drv) {
178e08a5 4006 return -ENOMEDIUM;
cf8074b3
KW
4007 } else if (drv->bdrv_save_vmstate) {
4008 return drv->bdrv_save_vmstate(bs, qiov, pos);
4009 } else if (bs->file) {
4010 return bdrv_writev_vmstate(bs->file, qiov, pos);
4011 }
4012
7cdb1f6d 4013 return -ENOTSUP;
178e08a5
AL
4014}
4015
45566e9c
CH
4016int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4017 int64_t pos, int size)
178e08a5
AL
4018{
4019 BlockDriver *drv = bs->drv;
4020 if (!drv)
4021 return -ENOMEDIUM;
7cdb1f6d
MK
4022 if (drv->bdrv_load_vmstate)
4023 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4024 if (bs->file)
4025 return bdrv_load_vmstate(bs->file, buf, pos, size);
4026 return -ENOTSUP;
178e08a5
AL
4027}
4028
8b9b0cc2
KW
4029void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4030{
bf736fe3 4031 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4032 return;
4033 }
4034
bf736fe3 4035 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4036}
4037
4038int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4039 const char *tag)
4040{
4041 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4042 bs = bs->file;
4043 }
4044
4045 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4046 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4047 }
4048
4049 return -ENOTSUP;
4050}
4051
4cc70e93
FZ
4052int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4053{
4054 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4055 bs = bs->file;
4056 }
4057
4058 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4059 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4060 }
4061
4062 return -ENOTSUP;
4063}
4064
41c695c7
KW
4065int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4066{
4067 while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
4068 bs = bs->file;
4069 }
8b9b0cc2 4070
41c695c7
KW
4071 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4072 return bs->drv->bdrv_debug_resume(bs, tag);
4073 }
4074
4075 return -ENOTSUP;
4076}
4077
4078bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4079{
4080 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4081 bs = bs->file;
4082 }
4083
4084 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4085 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4086 }
4087
4088 return false;
8b9b0cc2
KW
4089}
4090
199630b6
BS
/* Return non-zero if @bs was opened with the BDRV_O_SNAPSHOT flag. */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}
4095
b1b1d783
JC
/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
/* Walk the backing chain of @bs and return the first backing_hd whose
 * filename matches @backing_file (protocol paths compared verbatim,
 * plain paths canonicalized with realpath() relative to each image);
 * returns NULL if no match or on missing bs/drv/backing_file. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
                                          const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
4161
f198fd1c
BC
4162int bdrv_get_backing_file_depth(BlockDriverState *bs)
4163{
4164 if (!bs->drv) {
4165 return 0;
4166 }
4167
4168 if (!bs->backing_hd) {
4169 return 0;
4170 }
4171
4172 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4173}
4174
79fac568
JC
4175BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4176{
4177 BlockDriverState *curr_bs = NULL;
4178
4179 if (!bs) {
4180 return NULL;
4181 }
4182
4183 curr_bs = bs;
4184
4185 while (curr_bs->backing_hd) {
4186 curr_bs = curr_bs->backing_hd;
4187 }
4188 return curr_bs;
4189}
4190
/**************************************************************/
/* async I/Os */

/* Submit an asynchronous read of @nb_sectors at @sector_num into @qiov;
 * @cb runs with @opaque when the request completes. */
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

/* Submit an asynchronous write of @nb_sectors at @sector_num from @qiov. */
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

/* Submit an asynchronous zero-write: no qiov, BDRV_REQ_ZERO_WRITE is added
 * to @flags on the common coroutine-backed aio path. */
BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
4224
40b4f539
KW
4225
/* Bookkeeping for a batch of writes submitted via bdrv_aio_multiwrite():
 * the original per-request callbacks all fire once every (possibly merged)
 * submitted request has completed. */
typedef struct MultiwriteCB {
    int error;          /* first error seen among the requests, 0 if none */
    int num_requests;   /* merged requests still in flight */
    int num_callbacks;  /* original (pre-merge) request count */
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;    /* merged qiov to destroy/free, or NULL */
    } callbacks[];
} MultiwriteCB;

/* Invoke every original callback with the batch's common result and free
 * any qiovs that were allocated while merging. */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

/* Per-request completion: record the first error only; when the last
 * in-flight request finishes, report to the users and free the batch. */
static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}
4266
4267static int multiwrite_req_compare(const void *a, const void *b)
4268{
77be4366
CH
4269 const BlockRequest *req1 = a, *req2 = b;
4270
4271 /*
4272 * Note that we can't simply subtract req2->sector from req1->sector
4273 * here as that could overflow the return value.
4274 */
4275 if (req1->sector > req2->sector) {
4276 return 1;
4277 } else if (req1->sector < req2->sector) {
4278 return -1;
4279 } else {
4280 return 0;
4281 }
40b4f539
KW
4282}
4283
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Don't merge if the combined vector would exceed IOV_MAX entries
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Remember the merged qiov so multiwrite_user_cb() can free it
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
4343
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    // Save the original callbacks before merging may rewrite reqs[]
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
4401
/* Cancel an in-flight aio request via its implementation's cancel hook. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}
4406
/**************************************************************/
/* async block device emulation */

/* State for emulating aio on top of a driver's synchronous
 * bdrv_read/bdrv_write: the request runs immediately and completion is
 * delivered later from a bottom half. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

/* Cancel: the I/O already ran synchronously, so just suppress the pending
 * completion bottom half and release the acb. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockDriverAIOCBSync),
    .cancel = bdrv_aio_cancel_em,
};

/* Completion bottom half: copy the bounce buffer back for reads, fire the
 * user callback, then free everything. */
static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

/* Emulate an aio read/write using the driver's synchronous bdrv_read/
 * bdrv_write through a bounce buffer; the callback is deferred to a bottom
 * half so it never runs before this function returns. */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
4475
f141eafe
AL
/* aio read emulated on top of synchronous driver I/O. */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

/* aio write emulated on top of synchronous driver I/O. */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 4489
68485420
KW
4490
4491typedef struct BlockDriverAIOCBCoroutine {
4492 BlockDriverAIOCB common;
4493 BlockRequest req;
4494 bool is_write;
d318aea9 4495 bool *done;
68485420
KW
4496 QEMUBH* bh;
4497} BlockDriverAIOCBCoroutine;
4498
4499static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4500{
d318aea9
KW
4501 BlockDriverAIOCBCoroutine *acb =
4502 container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4503 bool done = false;
4504
4505 acb->done = &done;
4506 while (!done) {
4507 qemu_aio_wait();
4508 }
68485420
KW
4509}
4510
d7331bed 4511static const AIOCBInfo bdrv_em_co_aiocb_info = {
68485420
KW
4512 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
4513 .cancel = bdrv_aio_co_cancel_em,
4514};
4515
35246a68 4516static void bdrv_co_em_bh(void *opaque)
68485420
KW
4517{
4518 BlockDriverAIOCBCoroutine *acb = opaque;
4519
4520 acb->common.cb(acb->common.opaque, acb->req.error);
d318aea9
KW
4521
4522 if (acb->done) {
4523 *acb->done = true;
4524 }
4525
68485420
KW
4526 qemu_bh_delete(acb->bh);
4527 qemu_aio_release(acb);
4528}
4529
b2a61371
SH
4530/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4531static void coroutine_fn bdrv_co_do_rw(void *opaque)
4532{
4533 BlockDriverAIOCBCoroutine *acb = opaque;
4534 BlockDriverState *bs = acb->common.bs;
4535
4536 if (!acb->is_write) {
4537 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4538 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4539 } else {
4540 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4541 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4542 }
4543
35246a68 4544 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2a61371
SH
4545 qemu_bh_schedule(acb->bh);
4546}
4547
68485420
KW
4548static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4549 int64_t sector_num,
4550 QEMUIOVector *qiov,
4551 int nb_sectors,
d20d9b7c 4552 BdrvRequestFlags flags,
68485420
KW
4553 BlockDriverCompletionFunc *cb,
4554 void *opaque,
8c5873d6 4555 bool is_write)
68485420
KW
4556{
4557 Coroutine *co;
4558 BlockDriverAIOCBCoroutine *acb;
4559
d7331bed 4560 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
68485420
KW
4561 acb->req.sector = sector_num;
4562 acb->req.nb_sectors = nb_sectors;
4563 acb->req.qiov = qiov;
d20d9b7c 4564 acb->req.flags = flags;
68485420 4565 acb->is_write = is_write;
d318aea9 4566 acb->done = NULL;
68485420 4567
8c5873d6 4568 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
4569 qemu_coroutine_enter(co, acb);
4570
4571 return &acb->common;
4572}
4573
/* Coroutine entry point for bdrv_aio_flush(): run the flush, then report
 * completion from a bottom half. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

/* Submit an asynchronous flush of @bs; @cb is invoked with @opaque and the
 * flush result when it completes. */
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4600
4265d620
PB
/* Coroutine entry point for bdrv_aio_discard(): run the discard, then
 * report completion from a bottom half. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

/* Submit an asynchronous discard of @nb_sectors at @sector_num. */
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4629
ea2384d3
FB
/* Register all block drivers compiled in as modules. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

/* Like bdrv_init(), but enable the driver whitelist first. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
4640
/* Allocate an aiocb of the size given by @aiocb_info and initialize its
 * common fields; freed with qemu_aio_release(). */
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

/* Free an aiocb obtained from qemu_aio_get(); the size to free is taken
 * from the acb's own aiocb_info. */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
19cb3738 4659
f9f05dc5
KW
/**************************************************************/
/* Coroutine block device emulation */

/* Links an aio completion back to the coroutine waiting for it. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

/* aio completion callback: store the result and re-enter the coroutine
 * that yielded in bdrv_co_io_em() (or a similar waiter). */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

/* Emulate a coroutine read/write on top of the driver's aio interface:
 * submit, yield, and resume with the result when the completion callback
 * re-enters us.  Returns -EIO if the driver failed to create a request. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

/* Coroutine read emulated via the driver's aio interface. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

/* Coroutine write emulated via the driver's aio interface. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
4715
/* Coroutine entry point used by the synchronous bdrv_flush() wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

/* Flush @bs: first let the driver flush its own caches to the OS, then
 * (unless cache=unsafe) force data to the disk, and finally flush the
 * underlying protocol layer (bs->file).  Returns 0 on success or when
 * there is nothing to flush, negative errno on failure. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Fall back to the aio interface: submit and yield until the
         * completion callback re-enters us. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
4785
0f15423c
AL
/* Ask the driver to drop any cached state for @bs, if it has a hook. */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

/* Invalidate caches on every registered BlockDriverState. */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_invalidate_cache(bs);
    }
}

/* Clear the BDRV_O_INCOMING (incoming migration) flag on all devices. */
void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}
4810
07f07615
PB
/* Synchronous wrapper around bdrv_co_flush(): runs the entry function
 * directly when already in coroutine context, otherwise spawns a coroutine
 * and polls the aio loop until it completes. */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
4832
775aa8b6
KW
/* Parameter block for the synchronous bdrv_discard() wrapper. */
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;
/* Coroutine entry point used by bdrv_discard(). */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768
4850
4265d620
PB
4851int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4852 int nb_sectors)
4853{
d51e9fe5
PB
4854 int max_discard;
4855
4265d620
PB
4856 if (!bs->drv) {
4857 return -ENOMEDIUM;
4858 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4859 return -EIO;
4860 } else if (bs->read_only) {
4861 return -EROFS;
df702c9b
PB
4862 }
4863
e4654d2d 4864 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 4865
9e8f1835
PB
4866 /* Do nothing if disabled. */
4867 if (!(bs->open_flags & BDRV_O_UNMAP)) {
4868 return 0;
4869 }
4870
d51e9fe5
PB
4871 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4872 return 0;
4873 }
6f14da52 4874
d51e9fe5
PB
4875 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4876 while (nb_sectors > 0) {
4877 int ret;
4878 int num = nb_sectors;
6f14da52 4879
d51e9fe5
PB
4880 /* align request */
4881 if (bs->bl.discard_alignment &&
4882 num >= bs->bl.discard_alignment &&
4883 sector_num % bs->bl.discard_alignment) {
4884 if (num > bs->bl.discard_alignment) {
4885 num = bs->bl.discard_alignment;
6f14da52 4886 }
d51e9fe5
PB
4887 num -= sector_num % bs->bl.discard_alignment;
4888 }
6f14da52 4889
d51e9fe5
PB
4890 /* limit request size */
4891 if (num > max_discard) {
4892 num = max_discard;
4893 }
6f14da52 4894
d51e9fe5 4895 if (bs->drv->bdrv_co_discard) {
6f14da52 4896 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5
PB
4897 } else {
4898 BlockDriverAIOCB *acb;
4899 CoroutineIOCompletion co = {
4900 .coroutine = qemu_coroutine_self(),
4901 };
4902
4903 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
4904 bdrv_co_io_em_complete, &co);
4905 if (acb == NULL) {
4906 return -EIO;
4907 } else {
4908 qemu_coroutine_yield();
4909 ret = co.ret;
6f14da52 4910 }
6f14da52 4911 }
7ce21016 4912 if (ret && ret != -ENOTSUP) {
d51e9fe5 4913 return ret;
4265d620 4914 }
d51e9fe5
PB
4915
4916 sector_num += num;
4917 nb_sectors -= num;
4265d620 4918 }
d51e9fe5 4919 return 0;
4265d620
PB
4920}
4921
/* Synchronous wrapper around bdrv_co_discard(): runs the entry function
 * directly when already in coroutine context, otherwise spawns a coroutine
 * and polls the aio loop until it completes. */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
4945
19cb3738
FB
4946/**************************************************************/
4947/* removable device support */
4948
4949/**
4950 * Return TRUE if the media is present
4951 */
4952int bdrv_is_inserted(BlockDriverState *bs)
4953{
4954 BlockDriver *drv = bs->drv;
a1aff5bf 4955
19cb3738
FB
4956 if (!drv)
4957 return 0;
4958 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
4959 return 1;
4960 return drv->bdrv_is_inserted(bs);
19cb3738
FB
4961}
4962
/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}
4976
/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    /* Only devices with a name emit the QMP eject event. */
    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}
4992
19cb3738
FB
/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
985a03b0
TS
5007
5008/* needed for generic scsi interface */
5009
5010int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5011{
5012 BlockDriver *drv = bs->drv;
5013
5014 if (drv && drv->bdrv_ioctl)
5015 return drv->bdrv_ioctl(bs, req, buf);
5016 return -ENOTSUP;
5017}
7d780669 5018
221f715d
AL
5019BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5020 unsigned long int req, void *buf,
5021 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 5022{
221f715d 5023 BlockDriver *drv = bs->drv;
7d780669 5024
221f715d
AL
5025 if (drv && drv->bdrv_aio_ioctl)
5026 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5027 return NULL;
7d780669 5028}
e268ca52 5029
/* Record @align as the guest block size for @bs. */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

/* Allocate @size bytes aligned to the optimal memory alignment for @bs. */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
7cd1e32a 5039
c53b1c51
SH
5040/*
5041 * Check if all memory in this vector is sector aligned.
5042 */
5043bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5044{
5045 int i;
339064d5 5046 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5047
5048 for (i = 0; i < qiov->niov; i++) {
339064d5 5049 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5050 return false;
1ff735bd 5051 }
339064d5 5052 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5053 return false;
c53b1c51
SH
5054 }
5055 }
5056
5057 return true;
5058}
5059
/* Create a new dirty bitmap for @bs covering the whole image; @granularity
 * is in bytes (converted to sectors below) and must be a power of two. */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}

/* Unlink @bitmap from @bs's list and free it; no-op if it is not found. */
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

/* Build a QAPI list describing each dirty bitmap of @bs (dirty sector
 * count and granularity in bytes); caller owns the returned list. */
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}
5108
e4654d2d 5109int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5110{
e4654d2d
FZ
5111 if (bitmap) {
5112 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5113 } else {
5114 return 0;
5115 }
5116}
5117
/*
 * Initialize @hbi to iterate over the dirty sectors of @bitmap,
 * starting from sector 0.  @bs is currently unused.
 */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5123
5124void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5125 int nr_sectors)
5126{
e4654d2d
FZ
5127 BdrvDirtyBitmap *bitmap;
5128 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5129 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5130 }
1755da16
PB
5131}
5132
e4654d2d 5133void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5134{
e4654d2d
FZ
5135 BdrvDirtyBitmap *bitmap;
5136 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5137 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5138 }
7cd1e32a 5139}
aaa0eb75 5140
/* Return the number of dirty sectors currently tracked by @bitmap. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
f88e1a42 5145
/* Get a reference to bs.  Pair every call with bdrv_unref(). */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}
5151
5152/* Release a previously grabbed reference to bs.
5153 * If after releasing, reference count is zero, the BlockDriverState is
5154 * deleted. */
5155void bdrv_unref(BlockDriverState *bs)
5156{
5157 assert(bs->refcnt > 0);
5158 if (--bs->refcnt == 0) {
5159 bdrv_delete(bs);
5160 }
5161}
5162
/*
 * Mark @bs as in use (e.g. by a block job) or release that mark.
 * Asserts that the flag actually changes, i.e. no double set/clear.
 */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
5168
/* Return non-zero if bdrv_set_in_use() has marked @bs as in use. */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
5173
/* Turn on I/O status tracking for @bs and reset the status to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5179
5180/* The I/O status is only enabled if the drive explicitly
5181 * enables it _and_ the VM is configured to stop on errors */
5182bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5183{
d6bf279e 5184 return (bs->iostatus_enabled &&
92aa5c6d
PB
5185 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5186 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5187 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5188}
5189
/* Turn off I/O status tracking for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5194
5195void bdrv_iostatus_reset(BlockDriverState *bs)
5196{
5197 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5198 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5199 if (bs->job) {
5200 block_job_iostatus_reset(bs->job);
5201 }
28a7282a
LC
5202 }
5203}
5204
28a7282a
LC
5205void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5206{
3e1caa5f
PB
5207 assert(bdrv_iostatus_is_enabled(bs));
5208 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5209 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5210 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5211 }
5212}
5213
a597e79c
CH
5214void
5215bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5216 enum BlockAcctType type)
5217{
5218 assert(type < BDRV_MAX_IOTYPE);
5219
5220 cookie->bytes = bytes;
c488c7f6 5221 cookie->start_time_ns = get_clock();
a597e79c
CH
5222 cookie->type = type;
5223}
5224
5225void
5226bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5227{
5228 assert(cookie->type < BDRV_MAX_IOTYPE);
5229
5230 bs->nr_bytes[cookie->type] += cookie->bytes;
5231 bs->nr_ops[cookie->type]++;
c488c7f6 5232 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
5233}
5234
d92ada22
LC
5235void bdrv_img_create(const char *filename, const char *fmt,
5236 const char *base_filename, const char *base_fmt,
f382d43a
MR
5237 char *options, uint64_t img_size, int flags,
5238 Error **errp, bool quiet)
f88e1a42
JS
5239{
5240 QEMUOptionParameter *param = NULL, *create_options = NULL;
d220894e 5241 QEMUOptionParameter *backing_fmt, *backing_file, *size;
f88e1a42 5242 BlockDriver *drv, *proto_drv;
96df67d1 5243 BlockDriver *backing_drv = NULL;
cc84d90f 5244 Error *local_err = NULL;
f88e1a42
JS
5245 int ret = 0;
5246
5247 /* Find driver and parse its options */
5248 drv = bdrv_find_format(fmt);
5249 if (!drv) {
71c79813 5250 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5251 return;
f88e1a42
JS
5252 }
5253
98289620 5254 proto_drv = bdrv_find_protocol(filename, true);
f88e1a42 5255 if (!proto_drv) {
71c79813 5256 error_setg(errp, "Unknown protocol '%s'", filename);
d92ada22 5257 return;
f88e1a42
JS
5258 }
5259
5260 create_options = append_option_parameters(create_options,
5261 drv->create_options);
5262 create_options = append_option_parameters(create_options,
5263 proto_drv->create_options);
5264
5265 /* Create parameter list with default values */
5266 param = parse_option_parameters("", create_options, param);
5267
5268 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5269
5270 /* Parse -o options */
5271 if (options) {
5272 param = parse_option_parameters(options, create_options, param);
5273 if (param == NULL) {
71c79813 5274 error_setg(errp, "Invalid options for file format '%s'.", fmt);
f88e1a42
JS
5275 goto out;
5276 }
5277 }
5278
5279 if (base_filename) {
5280 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5281 base_filename)) {
71c79813
LC
5282 error_setg(errp, "Backing file not supported for file format '%s'",
5283 fmt);
f88e1a42
JS
5284 goto out;
5285 }
5286 }
5287
5288 if (base_fmt) {
5289 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
71c79813
LC
5290 error_setg(errp, "Backing file format not supported for file "
5291 "format '%s'", fmt);
f88e1a42
JS
5292 goto out;
5293 }
5294 }
5295
792da93a
JS
5296 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5297 if (backing_file && backing_file->value.s) {
5298 if (!strcmp(filename, backing_file->value.s)) {
71c79813
LC
5299 error_setg(errp, "Error: Trying to create an image with the "
5300 "same filename as the backing file");
792da93a
JS
5301 goto out;
5302 }
5303 }
5304
f88e1a42
JS
5305 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5306 if (backing_fmt && backing_fmt->value.s) {
96df67d1
SH
5307 backing_drv = bdrv_find_format(backing_fmt->value.s);
5308 if (!backing_drv) {
71c79813
LC
5309 error_setg(errp, "Unknown backing file format '%s'",
5310 backing_fmt->value.s);
f88e1a42
JS
5311 goto out;
5312 }
5313 }
5314
5315 // The size for the image must always be specified, with one exception:
5316 // If we are using a backing file, we can obtain the size from there
d220894e
KW
5317 size = get_option_parameter(param, BLOCK_OPT_SIZE);
5318 if (size && size->value.n == -1) {
f88e1a42 5319 if (backing_file && backing_file->value.s) {
66f6b814 5320 BlockDriverState *bs;
f88e1a42 5321 uint64_t size;
f88e1a42 5322 char buf[32];
63090dac
PB
5323 int back_flags;
5324
5325 /* backing files always opened read-only */
5326 back_flags =
5327 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5328
f67503e5 5329 bs = NULL;
ddf5636d 5330 ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
cc84d90f 5331 backing_drv, &local_err);
f88e1a42 5332 if (ret < 0) {
cc84d90f
HR
5333 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5334 backing_file->value.s,
5335 error_get_pretty(local_err));
5336 error_free(local_err);
5337 local_err = NULL;
f88e1a42
JS
5338 goto out;
5339 }
5340 bdrv_get_geometry(bs, &size);
5341 size *= 512;
5342
5343 snprintf(buf, sizeof(buf), "%" PRId64, size);
5344 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
66f6b814
HR
5345
5346 bdrv_unref(bs);
f88e1a42 5347 } else {
71c79813 5348 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5349 goto out;
5350 }
5351 }
5352
f382d43a
MR
5353 if (!quiet) {
5354 printf("Formatting '%s', fmt=%s ", filename, fmt);
5355 print_option_parameters(param);
5356 puts("");
5357 }
cc84d90f
HR
5358 ret = bdrv_create(drv, filename, param, &local_err);
5359 if (ret == -EFBIG) {
5360 /* This is generally a better message than whatever the driver would
5361 * deliver (especially because of the cluster_size_hint), since that
5362 * is most probably not much different from "image too large". */
5363 const char *cluster_size_hint = "";
5364 if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5365 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5366 }
cc84d90f
HR
5367 error_setg(errp, "The image size is too large for file format '%s'"
5368 "%s", fmt, cluster_size_hint);
5369 error_free(local_err);
5370 local_err = NULL;
f88e1a42
JS
5371 }
5372
5373out:
5374 free_option_parameters(create_options);
5375 free_option_parameters(param);
5376
84d18f06 5377 if (local_err) {
cc84d90f
HR
5378 error_propagate(errp, local_err);
5379 }
f88e1a42 5380}
/* Return the AioContext that @bs performs its I/O in. */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}
/* Register @notifier to be invoked before each write to @bs. */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
6f176b48
HR
5393
5394int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5395{
5396 if (bs->drv->bdrv_amend_options == NULL) {
5397 return -ENOTSUP;
5398 }
5399 return bs->drv->bdrv_amend_options(bs, options);
5400}
f6186f49 5401
212a5a8f
BC
5402/* Used to recurse on single child block filters.
5403 * Single child block filter will store their child in bs->file.
5404 */
5405bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5406 BlockDriverState *candidate)
f6186f49 5407{
212a5a8f
BC
5408 if (!bs->drv) {
5409 return false;
5410 }
5411
5412 if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5413 if (bs == candidate) {
5414 return true;
5415 } else {
5416 return false;
5417 }
5418 }
5419
5420 if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5421 return false;
f6186f49
BC
5422 }
5423
212a5a8f
BC
5424 if (!bs->file) {
5425 return false;
5426 }
5427
5428 return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5429}
5430
5431bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5432 BlockDriverState *candidate)
5433{
5434 if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
5435 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5436 }
5437
212a5a8f 5438 return bdrv_generic_is_first_non_filter(bs, candidate);
f6186f49
BC
5439}
5440
212a5a8f
BC
5441/* This function checks if the candidate is the first non filter bs down it's
5442 * bs chain. Since we don't have pointers to parents it explore all bs chains
5443 * from the top. Some filters can choose not to pass down the recursion.
5444 */
5445bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5446{
212a5a8f
BC
5447 BlockDriverState *bs;
5448
5449 /* walk down the bs forest recursively */
5450 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5451 bool perm;
5452
e6dc8a1f 5453 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5454
5455 /* candidate is the first non filter */
5456 if (perm) {
5457 return true;
5458 }
5459 }
5460
5461 return false;
f6186f49 5462}