]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
block: acquire AioContext in bdrv_*_all()
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
83c9089e 27#include "monitor/monitor.h"
737e150e
PB
28#include "block/block_int.h"
29#include "block/blockjob.h"
1de7afc9 30#include "qemu/module.h"
7b1b5d19 31#include "qapi/qmp/qjson.h"
9c17d615 32#include "sysemu/sysemu.h"
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
fc01f7e7 38
71e72a19 39#ifdef CONFIG_BSD
7674e7bf
FB
40#include <sys/types.h>
41#include <sys/stat.h>
42#include <sys/ioctl.h>
72cf2d4f 43#include <sys/queue.h>
c5e97233 44#ifndef __DragonFly__
7674e7bf
FB
45#include <sys/disk.h>
46#endif
c5e97233 47#endif
7674e7bf 48
49dc768d
AL
49#ifdef _WIN32
50#include <windows.h>
51#endif
52
e4654d2d
FZ
/* A dirty bitmap tracked on a BlockDriverState: wraps an HBitmap with the
 * list linkage used by bs->dirty_bitmaps (see QLIST_INIT in bdrv_new()). */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;                    /* the actual per-sector dirty bits */
    QLIST_ENTRY(BdrvDirtyBitmap) list;  /* entry in bs->dirty_bitmaps */
};
57
1c9805a3
SH
58#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59
7d4b4ba5 60static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
61static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 63 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
64static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 66 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
67static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
775aa8b6
KW
73static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 75 BdrvRequestFlags flags);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 78 BdrvRequestFlags flags);
b2a61371
SH
79static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
d20d9b7c 83 BdrvRequestFlags flags,
b2a61371
SH
84 BlockDriverCompletionFunc *cb,
85 void *opaque,
8c5873d6 86 bool is_write);
b2a61371 87static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 88static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
dc364f4c
BC
94static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
8a22f02a
SH
97static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
103#ifdef _WIN32
/* Return non-zero if @filename begins with a drive-letter prefix such as
 * "c:" (any ASCII letter, either case, immediately followed by a colon). */
static int is_windows_drive_prefix(const char *filename)
{
    char c = filename[0];
    int is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');

    /* Check the letter first so we never read filename[1] of "" */
    return is_letter && filename[1] == ':';
}
110
111int is_windows_drive(const char *filename)
112{
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120}
121#endif
122
0563e191 123/* throttling disk I/O limits */
cc0681c4
BC
/* Apply a new throttling configuration to @bs.
 *
 * After installing @cfg, kick one queued request per direction (0 = read,
 * 1 = write) so that requests queued under the old limits get a chance to
 * run under the new ones. */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    /* Restart one throttled request per direction, if any is waiting */
    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}
135
/* Drain all throttled I/O requests on @bs.
 *
 * Temporarily disables io_limits_enabled so that re-entering the throttled
 * request queues does not immediately re-throttle them, then restores the
 * previous setting.  Returns true if at least one request was restarted. */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;  /* remember, restored below */
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        /* Keep kicking until the queue for this direction is empty */
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}
155
cc0681c4 156void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 157{
cc0681c4 158 bs->io_limits_enabled = false;
0563e191 159
cc0681c4
BC
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
163}
164
cc0681c4 165static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 166{
cc0681c4
BC
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
169}
170
cc0681c4 171static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 172{
cc0681c4
BC
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
175}
176
cc0681c4
BC
/* Enable I/O throttling on @bs; should be called before bdrv_set_io_limits
 * if a limit is set.  Initializes the throttle state with the two direction
 * timer callbacks on the virtual clock. */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);  /* enabling twice would leak state */
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
188
/* This function makes an I/O wait if needed
 *
 * @bytes: the number of bytes of the I/O
 * @is_write: is the I/O a write (doubles as index into throttled_reqs)
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* check whether this I/O has to wait */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this I/O too (preserves FIFO ordering behind waiters) */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);


    /* if the next request must wait -> do nothing (the timer will fire) */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
219
339064d5
KW
220size_t bdrv_opt_mem_align(BlockDriverState *bs)
221{
222 if (!bs || !bs->drv) {
223 /* 4k should be on the safe side */
224 return 4096;
225 }
226
227 return bs->bl.opt_mem_alignment;
228}
229
9e0b22f4
SH
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *sep;

#ifdef _WIN32
    /* Drive letters ("c:", "c:\foo") are not protocol prefixes */
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    sep = path + strcspn(path, ":/\\");
#else
    sep = path + strcspn(path, ":/");
#endif

    /* A protocol prefix means a ':' occurs before any path separator */
    return *sep == ':';
}
247
/* Return 1 if @path is absolute. On Windows, drive-letter and device
 * names such as "\\.\d:" also count as absolute. */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return path[0] == '/' || path[0] == '\\';
#else
    return path[0] == '/';
#endif
}
260
83f64091
FB
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *prefix_end, *last_sep;
    int n;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        /* Absolute name: the base path is irrelevant */
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Skip a "protocol:" prefix in the base path, if present */
    prefix_end = strchr(base_path, ':');
    prefix_end = prefix_end ? prefix_end + 1 : base_path;

    /* Find the character just past the final directory separator */
    last_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');
        if (!last_sep || bslash > last_sep) {
            last_sep = bslash;
        }
    }
#endif
    last_sep = last_sep ? last_sep + 1 : base_path;

    /* Keep whichever marker lies further right in base_path */
    if (last_sep > prefix_end) {
        prefix_end = last_sep;
    }

    /* Copy the (possibly truncated) directory part, then the filename */
    n = prefix_end - base_path;
    if (n > dest_size - 1) {
        n = dest_size - 1;
    }
    memcpy(dest, base_path, n);
    dest[n] = '\0';
    pstrcat(dest, dest_size, filename);
}
304
dc5a1371
PB
/* Write the full (combined) backing file name of @bs into @dest.
 *
 * If the backing file name is already absolute or carries a protocol
 * prefix, it is copied verbatim; otherwise it is resolved relative to the
 * image file name with path_combine(). */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}
313
/* Register a block driver, filling in emulation callbacks for any missing
 * I/O entry points, and add it to the global driver list. */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
b338082b
FB
333
/* create a new block device (by default it is empty)
 *
 * @device_name: id for the new device; "" creates an anonymous BDS that is
 *               not inserted into the global bdrv_states list.
 * @errp:        set (and NULL returned) if the id or node-name is taken.
 */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;
    int i;

    /* Reject names already used as a device id or as a node-name */
    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;  /* caller owns the initial reference */

    return bs;
}
369
d7d512f6
PB
/* Register @notify to be called when @bs is closed. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
374
ea2384d3
FB
375BlockDriver *bdrv_find_format(const char *format_name)
376{
377 BlockDriver *drv1;
8a22f02a
SH
378 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
379 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 380 return drv1;
8a22f02a 381 }
ea2384d3
FB
382 }
383 return NULL;
384}
385
b64ec4e4 386static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 387{
b64ec4e4
FZ
388 static const char *whitelist_rw[] = {
389 CONFIG_BDRV_RW_WHITELIST
390 };
391 static const char *whitelist_ro[] = {
392 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
393 };
394 const char **p;
395
b64ec4e4 396 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 397 return 1; /* no whitelist, anything goes */
b64ec4e4 398 }
eb852011 399
b64ec4e4 400 for (p = whitelist_rw; *p; p++) {
eb852011
MA
401 if (!strcmp(drv->format_name, *p)) {
402 return 1;
403 }
404 }
b64ec4e4
FZ
405 if (read_only) {
406 for (p = whitelist_ro; *p; p++) {
407 if (!strcmp(drv->format_name, *p)) {
408 return 1;
409 }
410 }
411 }
eb852011
MA
412 return 0;
413}
414
b64ec4e4
FZ
/* Like bdrv_find_format(), but also require the driver to pass the
 * whitelist check for the given access mode; NULL otherwise. */
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
421
5b7e1542
ZYW
/* Bounce buffer for running a driver's bdrv_create() inside a coroutine:
 * input parameters plus the result/error filled in by the coroutine. */
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;                 /* owned copy, freed by bdrv_create() */
    QEMUOptionParameter *options;
    int ret;                        /* NOT_DONE until the coroutine finishes */
    Error *err;
} CreateCo;
429
/* Coroutine entry point for bdrv_create(): invoke the driver's create
 * callback and publish the result/error through the CreateCo. */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    /* Setting ret (!= NOT_DONE) signals completion to the waiting caller */
    cco->ret = ret;
}
444
/* Create an image with driver @drv at @filename using @options.
 *
 * Runs the driver's create callback in coroutine context: directly if we
 * are already in a coroutine, otherwise in a new coroutine while pumping
 * AIO completions until it finishes.  Returns 0 or a negative errno; @errp
 * carries the detailed error. */
int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),  /* copy: coroutine may outlive caller's view */
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        /* Busy-wait on AIO events until the coroutine stores its result */
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
489
cc84d90f
HR
/* Create an image at @filename using the protocol driver deduced from the
 * filename (e.g. "file", or "nbd" for "nbd:...").  Thin wrapper around
 * bdrv_create(); returns 0 or a negative errno. */
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    /* true: a "<protocol>:" prefix in @filename selects the driver */
    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
509
/* Recompute the BlockLimits of @bs.
 *
 * Limits are first inherited from the children (bs->file and
 * bs->backing_hd, refreshed recursively), then the driver may override
 * them via its bdrv_refresh_limits callback.  Returns 0 or the driver
 * callback's error. */
int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        /* no child to inherit from: fall back to sector alignment */
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        /* Use the stricter (larger) of our and the backing file's limits */
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}
546
eba25057
JM
547/*
548 * Create a uniquely-named empty temporary file.
549 * Return 0 upon success, otherwise a negative errno value.
550 */
551int get_tmp_filename(char *filename, int size)
d5249393 552{
eba25057 553#ifdef _WIN32
3b9f94e1 554 char temp_dir[MAX_PATH];
eba25057
JM
555 /* GetTempFileName requires that its output buffer (4th param)
556 have length MAX_PATH or greater. */
557 assert(size >= MAX_PATH);
558 return (GetTempPath(MAX_PATH, temp_dir)
559 && GetTempFileName(temp_dir, "qem", 0, filename)
560 ? 0 : -GetLastError());
d5249393 561#else
67b915a5 562 int fd;
7ccfb2eb 563 const char *tmpdir;
0badc1ee 564 tmpdir = getenv("TMPDIR");
69bef793
AS
565 if (!tmpdir) {
566 tmpdir = "/var/tmp";
567 }
eba25057
JM
568 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
569 return -EOVERFLOW;
570 }
ea2384d3 571 fd = mkstemp(filename);
fe235a06
DH
572 if (fd < 0) {
573 return -errno;
574 }
575 if (close(fd) != 0) {
576 unlink(filename);
eba25057
JM
577 return -errno;
578 }
579 return 0;
d5249393 580#endif
eba25057 581}
fc01f7e7 582
84a12e66
CH
583/*
584 * Detect host devices. By convention, /dev/cdrom[N] is always
585 * recognized as a host CDROM.
586 */
587static BlockDriver *find_hdev_driver(const char *filename)
588{
589 int score_max = 0, score;
590 BlockDriver *drv = NULL, *d;
591
592 QLIST_FOREACH(d, &bdrv_drivers, list) {
593 if (d->bdrv_probe_device) {
594 score = d->bdrv_probe_device(filename);
595 if (score > score_max) {
596 score_max = score;
597 drv = d;
598 }
599 }
600 }
601
602 return drv;
603}
604
98289620
KW
/* Pick the protocol driver for @filename.
 *
 * Host devices win over everything; otherwise, without a protocol prefix
 * (or when prefixes are disallowed) the "file" driver is used; otherwise
 * the driver whose protocol_name matches the "<protocol>:" prefix.
 * Returns NULL if no driver matches. */
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    /* Extract the "<protocol>" part before the ':' (truncated to fit) */
    p = strchr(filename, ':');
    assert(p != NULL);  /* guaranteed by path_has_protocol() above */
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
646
/* Probe the image format of @bs and store the chosen driver in *pdrv.
 *
 * Reads the first 2048 bytes and lets every registered driver's
 * bdrv_probe() score them; the highest score wins.  Returns 0 on success,
 * negative errno on failure (with *pdrv set to NULL or the raw driver). */
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    /* ret now holds the number of bytes actually read, passed to probes */
    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
693
51762288
SH
/**
 * Set the current 'total_sectors' value
 *
 * @hint: fallback sector count used when the driver cannot report a length.
 * Returns 0 on success or a negative error from the driver's getlength.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        /* round a partial trailing sector up to a whole sector */
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
717
9e8f1835
PB
718/**
719 * Set open flags for a given discard mode
720 *
721 * Return 0 on success, -1 if the discard mode was invalid.
722 */
723int bdrv_parse_discard_flags(const char *mode, int *flags)
724{
725 *flags &= ~BDRV_O_UNMAP;
726
727 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
728 /* do nothing */
729 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
730 *flags |= BDRV_O_UNMAP;
731 } else {
732 return -1;
733 }
734
735 return 0;
736}
737
c3993cdc
SH
738/**
739 * Set open flags for a given cache mode
740 *
741 * Return 0 on success, -1 if the cache mode was invalid.
742 */
743int bdrv_parse_cache_flags(const char *mode, int *flags)
744{
745 *flags &= ~BDRV_O_CACHE_MASK;
746
747 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
748 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
749 } else if (!strcmp(mode, "directsync")) {
750 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
751 } else if (!strcmp(mode, "writeback")) {
752 *flags |= BDRV_O_CACHE_WB;
753 } else if (!strcmp(mode, "unsafe")) {
754 *flags |= BDRV_O_CACHE_WB;
755 *flags |= BDRV_O_NO_FLUSH;
756 } else if (!strcmp(mode, "writethrough")) {
757 /* this is the default */
758 } else {
759 return -1;
760 }
761
762 return 0;
763}
764
53fec9d3
SH
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}
774
/* Drop one reference to the copy-on-read feature (see
 * bdrv_enable_copy_on_read()); must not be called more often than enable. */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
780
b1e6fc08
KW
/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}
790
0b50cc88
KW
/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}
809
317fc44e
KW
/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}
824
7b272452
KW
/* Compute the flags actually passed to the driver's open callback for @bs,
 * derived from the user-requested @flags. */
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
844
636ea370
KW
/* Validate @node_name and, if acceptable, assign it to @bs and insert the
 * BDS into the global graph node list.  A NULL name is a no-op; an empty,
 * conflicting, or duplicate name sets @errp and leaves @bs unchanged. */
static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions with device ids */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}
876
57915332
KW
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 *
 * @file: already-opened protocol-level BDS to stack under @bs, or NULL.
 * Returns 0 on success, negative errno on failure (with @errp set).
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    /* The filename comes from the lower layer if one exists, else options */
    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        /* Distinguish "read-only only" drivers from fully unlisted ones */
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);  /* driver-private state */

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        /* Prefer the driver's detailed error; fall back to errno text */
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    /* undo the partial setup above; @file stays owned by the caller */
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
1006
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 *
 * Returns 0 on success, a negative errno value on failure (with errp set).
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool parse_filename = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        /* Caller-supplied filename only: store it in the QDict and remember
         * that it may still need protocol-specific parsing below. */
        qdict_put(*options, "filename", qstring_from_str(filename));
        parse_filename = true;
    } else {
        /* Both a filename argument and a "filename" option were given. */
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        /* No explicit driver: deduce the protocol from the filename. */
        drv = bdrv_find_protocol(filename, parse_filename);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && parse_filename) {
        /* The driver may split the filename into individual options. */
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        /* Driver has no protocol-level open; go through the full bdrv_open(),
         * which takes ownership of the options QDict (hence *options = NULL).
         */
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}
1096
8d24cce1
FZ
1097void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1098{
1099
826b6ca0
FZ
1100 if (bs->backing_hd) {
1101 assert(bs->backing_blocker);
1102 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1103 } else if (backing_hd) {
1104 error_setg(&bs->backing_blocker,
1105 "device is used as backing hd of '%s'",
1106 bs->device_name);
1107 }
1108
8d24cce1
FZ
1109 bs->backing_hd = backing_hd;
1110 if (!backing_hd) {
826b6ca0
FZ
1111 error_free(bs->backing_blocker);
1112 bs->backing_blocker = NULL;
8d24cce1
FZ
1113 goto out;
1114 }
1115 bs->open_flags &= ~BDRV_O_NO_BACKING;
1116 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1117 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1118 backing_hd->drv ? backing_hd->drv->format_name : "");
826b6ca0
FZ
1119
1120 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1121 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1122 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1123 bs->backing_blocker);
8d24cce1
FZ
1124out:
1125 bdrv_refresh_limits(bs);
1126}
1127
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_file_open.
 *
 * Returns 0 on success (including the "nothing to open" cases), negative
 * errno on failure; errp is set on failure.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    /* Already open: nothing to do, but still consume the options reference
     * as documented above. */
    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        /* Filename comes from the options QDict; keep the local one empty so
         * bdrv_open() below receives NULL as filename. */
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* No backing file configured at all. */
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    backing_hd = bdrv_new("", errp);

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        /* bdrv_open() failed; drop the fresh BDS and remember that opening
         * the backing file is not to be retried. */
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
1189
da557aac
HR
1190/*
1191 * Opens a disk image whose options are given as BlockdevRef in another block
1192 * device's options.
1193 *
da557aac
HR
1194 * If allow_none is true, no image will be opened if filename is false and no
1195 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1196 *
1197 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1198 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1199 * itself, all options starting with "${bdref_key}." are considered part of the
1200 * BlockdevRef.
1201 *
1202 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1203 *
1204 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1205 */
1206int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1207 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1208 bool allow_none, Error **errp)
da557aac
HR
1209{
1210 QDict *image_options;
1211 int ret;
1212 char *bdref_key_dot;
1213 const char *reference;
1214
f67503e5
HR
1215 assert(pbs);
1216 assert(*pbs == NULL);
1217
da557aac
HR
1218 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1219 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1220 g_free(bdref_key_dot);
1221
1222 reference = qdict_get_try_str(options, bdref_key);
1223 if (!filename && !reference && !qdict_size(image_options)) {
1224 if (allow_none) {
1225 ret = 0;
1226 } else {
1227 error_setg(errp, "A block device must be specified for \"%s\"",
1228 bdref_key);
1229 ret = -EINVAL;
1230 }
b20e61e0 1231 QDECREF(image_options);
da557aac
HR
1232 goto done;
1233 }
1234
f7d9fd8c 1235 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1236
1237done:
1238 qdict_del(options, bdref_key);
1239 return ret;
1240}
1241
b1e6fc08 1242void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1243{
1244 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1245 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d
KW
1246 int64_t total_size;
1247 BlockDriver *bdrv_qcow2;
1248 QEMUOptionParameter *create_options;
1249 QDict *snapshot_options;
1250 BlockDriverState *bs_snapshot;
1251 Error *local_err;
1252 int ret;
1253
1254 /* if snapshot, we create a temporary backing file and open it
1255 instead of opening 'filename' directly */
1256
1257 /* Get the required size from the image */
f187743a
KW
1258 total_size = bdrv_getlength(bs);
1259 if (total_size < 0) {
1260 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1261 goto out;
f187743a
KW
1262 }
1263 total_size &= BDRV_SECTOR_MASK;
b998875d
KW
1264
1265 /* Create the temporary image */
1ba4b6a5 1266 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1267 if (ret < 0) {
1268 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1269 goto out;
b998875d
KW
1270 }
1271
1272 bdrv_qcow2 = bdrv_find_format("qcow2");
1273 create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1274 NULL);
1275
1276 set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1277
1278 ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1279 free_option_parameters(create_options);
1280 if (ret < 0) {
1281 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1282 "'%s': %s", tmp_filename,
1283 error_get_pretty(local_err));
1284 error_free(local_err);
1ba4b6a5 1285 goto out;
b998875d
KW
1286 }
1287
1288 /* Prepare a new options QDict for the temporary file */
1289 snapshot_options = qdict_new();
1290 qdict_put(snapshot_options, "file.driver",
1291 qstring_from_str("file"));
1292 qdict_put(snapshot_options, "file.filename",
1293 qstring_from_str(tmp_filename));
1294
98522f63 1295 bs_snapshot = bdrv_new("", &error_abort);
b998875d
KW
1296
1297 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
b1e6fc08 1298 flags, bdrv_qcow2, &local_err);
b998875d
KW
1299 if (ret < 0) {
1300 error_propagate(errp, local_err);
1ba4b6a5 1301 goto out;
b998875d
KW
1302 }
1303
1304 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1305
1306out:
1307 g_free(tmp_filename);
b998875d
KW
1308}
1309
4993f7ea
HR
1310static QDict *parse_json_filename(const char *filename, Error **errp)
1311{
1312 QObject *options_obj;
1313 QDict *options;
1314 int ret;
1315
1316 ret = strstart(filename, "json:", &filename);
1317 assert(ret);
1318
1319 options_obj = qobject_from_json(filename);
1320 if (!options_obj) {
1321 error_setg(errp, "Could not parse the JSON options");
1322 return NULL;
1323 }
1324
1325 if (qobject_type(options_obj) != QTYPE_QDICT) {
1326 qobject_decref(options_obj);
1327 error_setg(errp, "Invalid JSON object given");
1328 return NULL;
1329 }
1330
1331 options = qobject_to_qdict(options_obj);
1332 qdict_flatten(options);
1333
1334 return options;
1335}
1336
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 *
 * Returns 0 on success, negative errno on failure (errp set).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        /* Reference mode: look up an existing BDS; options/filename/*pbs
         * must all be unset. The options reference is still consumed. */
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            ret = -EINVAL;
            goto fail;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(options, json_options, false);
        QDECREF(json_options);
        filename = NULL;
    }

    /* bs->options keeps the full set; the working copy below is consumed as
     * options are recognized, so leftovers can be reported at "done:". */
    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }
    if (flags & BDRV_O_SNAPSHOT) {
        /* Remember the snapshot flags for bdrv_append_temp_snapshot() below;
         * the image itself is opened with backing-file flags. */
        snapshot_flags = bdrv_temp_snapshot_flags(flags);
        flags = bdrv_backing_flags(flags);
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_inherited_flags(flags),
                          true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        if (file) {
            /* Probe the format from the image contents. */
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Drop the extra reference unless the driver adopted 'file' itself. */
    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }


done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
1570
/* One element of a BlockReopenQueue: staged reopen state for a single BDS. */
typedef struct BlockReopenQueueEntry {
    bool prepared;              /* true once bdrv_reopen_prepare() succeeded */
    BDRVReopenState state;      /* staged flags and driver-private state */
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
1576
1577/*
1578 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1579 * reopen of multiple devices.
1580 *
1581 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1582 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1583 * be created and initialized. This newly created BlockReopenQueue should be
1584 * passed back in for subsequent calls that are intended to be of the same
1585 * atomic 'set'.
1586 *
1587 * bs is the BlockDriverState to add to the reopen queue.
1588 *
1589 * flags contains the open flags for the associated bs
1590 *
1591 * returns a pointer to bs_queue, which is either the newly allocated
1592 * bs_queue, or the existing bs_queue being used.
1593 *
1594 */
1595BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1596 BlockDriverState *bs, int flags)
1597{
1598 assert(bs != NULL);
1599
1600 BlockReopenQueueEntry *bs_entry;
1601 if (bs_queue == NULL) {
1602 bs_queue = g_new0(BlockReopenQueue, 1);
1603 QSIMPLEQ_INIT(bs_queue);
1604 }
1605
f1f25a2e
KW
1606 /* bdrv_open() masks this flag out */
1607 flags &= ~BDRV_O_PROTOCOL;
1608
e971aa12 1609 if (bs->file) {
f1f25a2e 1610 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1611 }
1612
1613 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1614 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1615
1616 bs_entry->state.bs = bs;
1617 bs_entry->state.flags = flags;
1618
1619 return bs_queue;
1620}
1621
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previous
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandonded, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 * Frees bs_queue (and its entries) before returning. Returns 0 on success,
 * -1 on failure (errp set).
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    /* Quiesce all I/O before touching any device. */
    bdrv_drain_all();

    /* Phase 1: prepare every entry; abort the whole set on first failure. */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    /* On failure, roll back only the entries that were prepared. */
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
1674
1675
1676/* Reopen a single BlockDriverState with the specified flags. */
1677int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1678{
1679 int ret = -1;
1680 Error *local_err = NULL;
1681 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1682
1683 ret = bdrv_reopen_multiple(queue, &local_err);
1684 if (local_err != NULL) {
1685 error_propagate(errp, local_err);
1686 }
1687 return ret;
1688}
1689
1690
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    /* Flush before staging anything so no dirty data is in flight. */
    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
1762
1763/*
1764 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1765 * makes them final by swapping the staging BlockDriverState contents into
1766 * the active BlockDriverState contents.
1767 */
1768void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1769{
1770 BlockDriver *drv;
1771
1772 assert(reopen_state != NULL);
1773 drv = reopen_state->bs->drv;
1774 assert(drv != NULL);
1775
1776 /* If there are any driver level actions to take */
1777 if (drv->bdrv_reopen_commit) {
1778 drv->bdrv_reopen_commit(reopen_state);
1779 }
1780
1781 /* set BDS specific flags now */
1782 reopen_state->bs->open_flags = reopen_state->flags;
1783 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1784 BDRV_O_CACHE_WB);
1785 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac
KW
1786
1787 bdrv_refresh_limits(reopen_state->bs);
e971aa12
JC
1788}
1789
1790/*
1791 * Abort the reopen, and delete and free the staged changes in
1792 * reopen_state
1793 */
1794void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1795{
1796 BlockDriver *drv;
1797
1798 assert(reopen_state != NULL);
1799 drv = reopen_state->bs->drv;
1800 assert(drv != NULL);
1801
1802 if (drv->bdrv_reopen_abort) {
1803 drv->bdrv_reopen_abort(reopen_state);
1804 }
1805}
1806
1807
/*
 * Close a BlockDriverState: cancel its job, flush and drain all I/O, detach
 * the backing file and protocol layer, and reset the state fields so the BDS
 * can be reused. The BDS itself is not freed (see bdrv_unref).
 */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            /* Detach first so bdrv_unref sees a consistent state. */
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1853
2bc93fed
MK
1854void bdrv_close_all(void)
1855{
1856 BlockDriverState *bs;
1857
dc364f4c 1858 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1859 AioContext *aio_context = bdrv_get_aio_context(bs);
1860
1861 aio_context_acquire(aio_context);
2bc93fed 1862 bdrv_close(bs);
ed78cda3 1863 aio_context_release(aio_context);
2bc93fed
MK
1864 }
1865}
1866
88266f5a
SH
1867/* Check if any requests are in-flight (including throttled requests) */
1868static bool bdrv_requests_pending(BlockDriverState *bs)
1869{
1870 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1871 return true;
1872 }
cc0681c4
BC
1873 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1874 return true;
1875 }
1876 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1877 return true;
1878 }
1879 if (bs->file && bdrv_requests_pending(bs->file)) {
1880 return true;
1881 }
1882 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1883 return true;
1884 }
1885 return false;
1886}
1887
/* True if any registered BlockDriverState has requests in flight
 * (including throttled requests); see bdrv_requests_pending(). */
static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}
1898
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete. Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        /* Kick throttled requests so they can make progress and complete. */
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            bdrv_start_throttled_reqs(bs);
        }

        /* Poll the main AioContext; block only while requests are pending. */
        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}
1926
/* make a BlockDriverState anonymous by removing from bdrv_state and
 * graph_bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /* Only on the device list if it has a non-empty device name. */
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
    }
    bs->device_name[0] = '\0';
    /* Likewise for the node name and the graph list. */
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}
1941
/* Invoke the driver's optional rebind callback on bs.
 * NOTE(review): presumably called after bdrv_swap() so the driver can fix up
 * pointers into its BDS -- confirm against callers outside this chunk. */
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}
1948
/*
 * Copy the fields that must stay attached to the guest device from bs_src to
 * bs_dest. Used by bdrv_swap() to undo the effect of the wholesale struct
 * swap on these fields.
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;
    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
8802d1fd 1995
4ddc07ca
PB
1996/*
1997 * Swap bs contents for two image chains while they are live,
1998 * while keeping required fields on the BlockDriverState that is
1999 * actually attached to a device.
2000 *
2001 * This will modify the BlockDriverState fields, and swap contents
2002 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2003 *
2004 * bs_new is required to be anonymous.
2005 *
2006 * This function does not create any image files.
2007 */
2008void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2009{
2010 BlockDriverState tmp;
f6801b83 2011
90ce8a06
BC
2012 /* The code needs to swap the node_name but simply swapping node_list won't
2013 * work so first remove the nodes from the graph list, do the swap then
2014 * insert them back if needed.
2015 */
2016 if (bs_new->node_name[0] != '\0') {
2017 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2018 }
2019 if (bs_old->node_name[0] != '\0') {
2020 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2021 }
2022
4ddc07ca
PB
2023 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2024 assert(bs_new->device_name[0] == '\0');
e4654d2d 2025 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
4ddc07ca
PB
2026 assert(bs_new->job == NULL);
2027 assert(bs_new->dev == NULL);
4ddc07ca 2028 assert(bs_new->io_limits_enabled == false);
cc0681c4 2029 assert(!throttle_have_timer(&bs_new->throttle_state));
8802d1fd 2030
4ddc07ca
PB
2031 tmp = *bs_new;
2032 *bs_new = *bs_old;
2033 *bs_old = tmp;
a9fc4408 2034
4ddc07ca
PB
2035 /* there are some fields that should not be swapped, move them back */
2036 bdrv_move_feature_fields(&tmp, bs_old);
2037 bdrv_move_feature_fields(bs_old, bs_new);
2038 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 2039
4ddc07ca
PB
2040 /* bs_new shouldn't be in bdrv_states even after the swap! */
2041 assert(bs_new->device_name[0] == '\0');
2042
2043 /* Check a few fields that should remain attached to the device */
2044 assert(bs_new->dev == NULL);
2045 assert(bs_new->job == NULL);
4ddc07ca 2046 assert(bs_new->io_limits_enabled == false);
cc0681c4 2047 assert(!throttle_have_timer(&bs_new->throttle_state));
e023b2e2 2048
90ce8a06
BC
2049 /* insert the nodes back into the graph node list if needed */
2050 if (bs_new->node_name[0] != '\0') {
2051 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2052 }
2053 if (bs_old->node_name[0] != '\0') {
2054 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2055 }
2056
e023b2e2 2057 bdrv_rebind(bs_new);
4ddc07ca
PB
2058 bdrv_rebind(bs_old);
2059}
2060
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
2080
4f6fd349 2081static void bdrv_delete(BlockDriverState *bs)
b338082b 2082{
fa879d62 2083 assert(!bs->dev);
3e914655 2084 assert(!bs->job);
3718d8ab 2085 assert(bdrv_op_blocker_is_empty(bs));
4f6fd349 2086 assert(!bs->refcnt);
e4654d2d 2087 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 2088
e1b5c52e
SH
2089 bdrv_close(bs);
2090
1b7bdbc1 2091 /* remove from list, if necessary */
d22b2f41 2092 bdrv_make_anon(bs);
34c6f050 2093
7267c094 2094 g_free(bs);
fc01f7e7
FB
2095}
2096
fa879d62
MA
2097int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2098/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 2099{
fa879d62 2100 if (bs->dev) {
18846dee
MA
2101 return -EBUSY;
2102 }
fa879d62 2103 bs->dev = dev;
28a7282a 2104 bdrv_iostatus_reset(bs);
18846dee
MA
2105 return 0;
2106}
2107
fa879d62
MA
2108/* TODO qdevified devices don't use this, remove when devices are qdevified */
2109void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 2110{
fa879d62
MA
2111 if (bdrv_attach_dev(bs, dev) < 0) {
2112 abort();
2113 }
2114}
2115
/*
 * Detach the guest device from @bs; @dev must be the currently attached
 * device.  Clears the device callbacks and restores the default guest
 * block size (512).
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->guest_block_size = 512;
}
2125
fa879d62
MA
2126/* TODO change to return DeviceState * when all users are qdevified */
2127void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 2128{
fa879d62 2129 return bs->dev;
18846dee
MA
2130}
2131
0e49de52
MA
2132void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2133 void *opaque)
2134{
2135 bs->dev_ops = ops;
2136 bs->dev_opaque = opaque;
2137}
2138
32c81a4a
PB
2139void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2140 enum MonitorEvent ev,
2141 BlockErrorAction action, bool is_read)
329c0a48
LC
2142{
2143 QObject *data;
2144 const char *action_str;
2145
2146 switch (action) {
2147 case BDRV_ACTION_REPORT:
2148 action_str = "report";
2149 break;
2150 case BDRV_ACTION_IGNORE:
2151 action_str = "ignore";
2152 break;
2153 case BDRV_ACTION_STOP:
2154 action_str = "stop";
2155 break;
2156 default:
2157 abort();
2158 }
2159
2160 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2161 bdrv->device_name,
2162 action_str,
2163 is_read ? "read" : "write");
32c81a4a 2164 monitor_protocol_event(ev, data);
329c0a48
LC
2165
2166 qobject_decref(data);
2167}
2168
6f382ed2
LC
2169static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2170{
2171 QObject *data;
2172
2173 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2174 bdrv_get_device_name(bs), ejected);
2175 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2176
2177 qobject_decref(data);
2178}
2179
7d4b4ba5 2180static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 2181{
145feb17 2182 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 2183 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 2184 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
2185 if (tray_was_closed) {
2186 /* tray open */
2187 bdrv_emit_qmp_eject_event(bs, true);
2188 }
2189 if (load) {
2190 /* tray close */
2191 bdrv_emit_qmp_eject_event(bs, false);
2192 }
145feb17
MA
2193 }
2194}
2195
2c6942fa
MA
2196bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2197{
2198 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2199}
2200
025ccaa7
PB
2201void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2202{
2203 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2204 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2205 }
2206}
2207
e4def80b
MA
2208bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2209{
2210 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2211 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2212 }
2213 return false;
2214}
2215
145feb17
MA
2216static void bdrv_dev_resize_cb(BlockDriverState *bs)
2217{
2218 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2219 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
2220 }
2221}
2222
f107639a
MA
2223bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2224{
2225 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2226 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2227 }
2228 return false;
2229}
2230
e97fc193
AL
2231/*
2232 * Run consistency checks on an image
2233 *
e076f338 2234 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2235 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2236 * check are stored in res.
e97fc193 2237 */
4534ff54 2238int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193
AL
2239{
2240 if (bs->drv->bdrv_check == NULL) {
2241 return -ENOTSUP;
2242 }
2243
e076f338 2244 memset(res, 0, sizeof(*res));
4534ff54 2245 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2246}
2247
8a426614
KW
2248#define COMMIT_BUF_SECTORS 2048
2249
33e3963e
FB
/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    /* Nothing to commit into without a backing file */
    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    /* Refuse if either node is in use by an operation that forbids commit */
    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    /* Temporarily reopen a read-only backing file read-write; it is
     * restored in the cleanup path below. */
    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible. If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    /* Copy only clusters that are allocated in the top image; unallocated
     * ranges already read through to the backing file. */
    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    /* Drop the now-redundant COW data from the top image if the driver
     * supports it. */
    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
2353
/*
 * Commit every open BDS that has a backing file.  Each device's
 * AioContext is acquired around its commit; stops and returns the first
 * error encountered.
 */
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}
2373
dbffbdcf
SH
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 * Wakes up every coroutine waiting for this request to finish.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
2388
/**
 * Add an active request to the tracked requests list
 *
 * Initializes @req for the byte range [offset, offset + bytes) and links
 * it into bs->tracked_requests.  The overlap range initially equals the
 * request range; mark_request_serialising() may widen it later.
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
2412
/*
 * Mark @req as serialising and widen its overlap range to @align
 * boundaries, so overlapping requests (after alignment) will wait for it.
 * @align must be a power of two.
 */
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    /* Only ever grow the overlap range, never shrink it */
    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
2427
d83947ac
SH
/**
 * Round a region to cluster boundaries
 *
 * Expands [sector_num, sector_num + nb_sectors) outward to the image's
 * cluster boundaries.  If the driver reports no cluster size, the region
 * is returned unchanged.
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
2448
7327145f 2449static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2450{
2451 BlockDriverInfo bdi;
7327145f 2452 int ret;
793ed47a 2453
7327145f
KW
2454 ret = bdrv_get_info(bs, &bdi);
2455 if (ret < 0 || bdi.cluster_size == 0) {
2456 return bs->request_alignment;
793ed47a 2457 } else {
7327145f 2458 return bdi.cluster_size;
793ed47a
KW
2459 }
2460}
2461
f4658285 2462static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2463 int64_t offset, unsigned int bytes)
2464{
d83947ac 2465 /* aaaa bbbb */
7327145f 2466 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2467 return false;
2468 }
2469 /* bbbb aaaa */
7327145f 2470 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2471 return false;
2472 }
2473 return true;
f4658285
SH
2474}
2475
/*
 * Wait until no other tracked request overlaps @self's overlap range.
 * Returns true if we actually had to wait at least once (the caller may
 * need to re-check state that could have changed while yielding).
 */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: no serialising request in flight, nothing to wait for */
    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    /* Restart the scan: the list may have changed while we
                     * were yielded. */
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2519
756e6736
KW
/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    /* Keep the in-memory copies in sync with what was written to the image;
     * NULL arguments are recorded as empty strings. */
    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
2551
6ebdcee2
JC
2552/*
2553 * Finds the image layer in the chain that has 'bs' as its backing file.
2554 *
2555 * active is the current topmost image.
2556 *
2557 * Returns NULL if bs is not found in active's image chain,
2558 * or if active == bs.
2559 */
2560BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2561 BlockDriverState *bs)
2562{
2563 BlockDriverState *overlay = NULL;
2564 BlockDriverState *intermediate;
2565
2566 assert(active != NULL);
2567 assert(bs != NULL);
2568
2569 /* if bs is the same as active, then by definition it has no overlay
2570 */
2571 if (active == bs) {
2572 return NULL;
2573 }
2574
2575 intermediate = active;
2576 while (intermediate->backing_hd) {
2577 if (intermediate->backing_hd == bs) {
2578 overlay = intermediate;
2579 break;
2580 }
2581 intermediate = intermediate->backing_hd;
2582 }
2583
2584 return overlay;
2585}
2586
/* Node in the temporary queue of intermediate BDSes that
 * bdrv_drop_intermediate() collects before unlinking and deleting them. */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;
2591
2592
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    /* Update the on-disk backing file reference first; only unlink the
     * in-memory chain once that has succeeded. */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2692
2693
71d0770c
AL
/*
 * Validate a byte-granularity request against the device: a medium must
 * be present and, for non-growable devices, [offset, offset + size) must
 * lie within the current device length.  Returns 0 or -errno.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    /* Cap single requests so later byte counts fit in an int */
    if (size > INT_MAX) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    /* Growable devices accept any in-range offset; no length check */
    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    /* Written to avoid signed overflow of offset + size */
    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}
2719
/*
 * Sector-granularity wrapper around bdrv_check_byte_request().  Rejects
 * negative counts and counts whose byte size would overflow an int.
 */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2730
1c9805a3
SH
/* Parameter/result bundle passed to bdrv_rw_co_entry() when a synchronous
 * read/write is executed inside a coroutine; ret holds NOT_DONE until the
 * coroutine completes. */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;
2739
/* Coroutine entry point for the synchronous rw helpers: dispatch the RwCo
 * request to the coroutine read or write path and store the result. */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}
e7a8a783 2754
/*
 * Process a vectored synchronous request using coroutines
 *
 * Runs the request in the current coroutine if there is one, otherwise
 * spawns a coroutine and polls the BDS's AioContext until it completes.
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* Drive the event loop until the coroutine has set rwco.ret */
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}
b338082b 2797
8d3b1a2d
KW
2798/*
2799 * Process a synchronous request using coroutines
2800 */
2801static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2802 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2803{
2804 QEMUIOVector qiov;
2805 struct iovec iov = {
2806 .iov_base = (void *)buf,
2807 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2808 };
2809
da15ee51
KW
2810 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2811 return -EINVAL;
2812 }
2813
8d3b1a2d 2814 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2815 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2816 &qiov, is_write, flags);
8d3b1a2d
KW
2817}
2818
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}
2825
07d27a44
MA
2826/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2827int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2828 uint8_t *buf, int nb_sectors)
2829{
2830 bool enabled;
2831 int ret;
2832
2833 enabled = bs->io_limits_enabled;
2834 bs->io_limits_enabled = false;
4e7395e8 2835 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2836 bs->io_limits_enabled = enabled;
2837 return ret;
2838}
2839
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}
2851
aa7bfbff
PL
/* Synchronously write zeroes to a sector range; @flags are passed through
 * to the zero-write path in addition to BDRV_REQ_ZERO_WRITE. */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}
2858
d75cbb5e
PL
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }
    target_size /= BDRV_SECTOR_SIZE;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        /* bdrv_get_block_status() takes an int sector count */
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        /* Skip ranges that already read back as zeroes */
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
2906
/*
 * Synchronous byte-granularity read into @buf.  Returns the number of
 * bytes read (== @bytes) on success, or -errno on failure.
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}
2928
/*
 * Synchronous vectored byte-granularity write.  Returns the number of
 * bytes written (qiov->size) on success, or -errno on failure.
 */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}
2940
/*
 * Synchronous byte-granularity write from a single buffer.  Returns the
 * number of bytes written on success, or -errno on failure.
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}
83f64091 2957
f08145fe
KW
2958/*
2959 * Writes to the file and ensures that no writes are reordered across this
2960 * request (acts as a barrier)
2961 *
2962 * Returns 0 on success, -errno in error cases.
2963 */
2964int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2965 const void *buf, int count)
2966{
2967 int ret;
2968
2969 ret = bdrv_pwrite(bs, offset, buf, count);
2970 if (ret < 0) {
2971 return ret;
2972 }
2973
f05fa4ad
PB
2974 /* No flush needed for cache modes that already do it */
2975 if (bs->enable_write_cache) {
f08145fe
KW
2976 bdrv_flush(bs);
2977 }
2978
2979 return 0;
2980}
2981
/*
 * Copy-on-read: read a cluster-aligned superset of the requested range
 * through a bounce buffer, write it back into the top image so future
 * reads are local, then copy the requested slice into @qiov.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* Prefer an efficient zero write when the whole cluster reads as zero */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Copy only the originally requested slice out of the bounce buffer */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
3047
c5fbe571 3048/*
d0c7f642
KW
3049 * Forwards an already correctly aligned request to the BlockDriver. This
3050 * handles copy on read and zeroing after EOF; any other features must be
3051 * implemented by the caller.
c5fbe571 3052 */
d0c7f642 3053static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 3054 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 3055 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
3056{
3057 BlockDriver *drv = bs->drv;
dbffbdcf 3058 int ret;
da1fa91d 3059
d0c7f642
KW
3060 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3061 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 3062
d0c7f642
KW
3063 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3064 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3065
3066 /* Handle Copy on Read and associated serialisation */
470c0504 3067 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
3068 /* If we touch the same cluster it counts as an overlap. This
3069 * guarantees that allocating writes will be serialized and not race
3070 * with each other for the same cluster. For example, in copy-on-read
3071 * it ensures that the CoR read and write operations are atomic and
3072 * guest writes cannot interleave between them. */
3073 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
3074 }
3075
2dbafdc0 3076 wait_serialising_requests(req);
f4658285 3077
470c0504 3078 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
3079 int pnum;
3080
bdad13b9 3081 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
3082 if (ret < 0) {
3083 goto out;
3084 }
3085
3086 if (!ret || pnum != nb_sectors) {
470c0504 3087 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
3088 goto out;
3089 }
3090 }
3091
d0c7f642 3092 /* Forward the request to the BlockDriver */
893a8f62
MK
3093 if (!(bs->zero_beyond_eof && bs->growable)) {
3094 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3095 } else {
3096 /* Read zeros after EOF of growable BDSes */
3097 int64_t len, total_sectors, max_nb_sectors;
3098
3099 len = bdrv_getlength(bs);
3100 if (len < 0) {
3101 ret = len;
3102 goto out;
3103 }
3104
d055a1fe 3105 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
5f5bcd80
KW
3106 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3107 align >> BDRV_SECTOR_BITS);
893a8f62
MK
3108 if (max_nb_sectors > 0) {
3109 ret = drv->bdrv_co_readv(bs, sector_num,
3110 MIN(nb_sectors, max_nb_sectors), qiov);
3111 } else {
3112 ret = 0;
3113 }
3114
3115 /* Reading beyond end of file is supposed to produce zeroes */
3116 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3117 uint64_t offset = MAX(0, total_sectors - sector_num);
3118 uint64_t bytes = (sector_num + nb_sectors - offset) *
3119 BDRV_SECTOR_SIZE;
3120 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3121 }
3122 }
ab185921
SH
3123
3124out:
dbffbdcf 3125 return ret;
da1fa91d
KW
3126}
3127
d0c7f642
KW
3128/*
3129 * Handle a read request in coroutine context
3130 */
1b0288ae
KW
3131static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3132 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
d0c7f642
KW
3133 BdrvRequestFlags flags)
3134{
3135 BlockDriver *drv = bs->drv;
65afd211
KW
3136 BdrvTrackedRequest req;
3137
1b0288ae
KW
3138 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3139 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3140 uint8_t *head_buf = NULL;
3141 uint8_t *tail_buf = NULL;
3142 QEMUIOVector local_qiov;
3143 bool use_local_qiov = false;
d0c7f642
KW
3144 int ret;
3145
3146 if (!drv) {
3147 return -ENOMEDIUM;
3148 }
1b0288ae 3149 if (bdrv_check_byte_request(bs, offset, bytes)) {
d0c7f642
KW
3150 return -EIO;
3151 }
3152
3153 if (bs->copy_on_read) {
3154 flags |= BDRV_REQ_COPY_ON_READ;
3155 }
3156
3157 /* throttling disk I/O */
3158 if (bs->io_limits_enabled) {
d5103588 3159 bdrv_io_limits_intercept(bs, bytes, false);
1b0288ae
KW
3160 }
3161
3162 /* Align read if necessary by padding qiov */
3163 if (offset & (align - 1)) {
3164 head_buf = qemu_blockalign(bs, align);
3165 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3166 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3167 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3168 use_local_qiov = true;
3169
3170 bytes += offset & (align - 1);
3171 offset = offset & ~(align - 1);
3172 }
3173
3174 if ((offset + bytes) & (align - 1)) {
3175 if (!use_local_qiov) {
3176 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3177 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3178 use_local_qiov = true;
3179 }
3180 tail_buf = qemu_blockalign(bs, align);
3181 qemu_iovec_add(&local_qiov, tail_buf,
3182 align - ((offset + bytes) & (align - 1)));
3183
3184 bytes = ROUND_UP(bytes, align);
3185 }
3186
65afd211 3187 tracked_request_begin(&req, bs, offset, bytes, false);
ec746e10 3188 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1b0288ae
KW
3189 use_local_qiov ? &local_qiov : qiov,
3190 flags);
65afd211 3191 tracked_request_end(&req);
1b0288ae
KW
3192
3193 if (use_local_qiov) {
3194 qemu_iovec_destroy(&local_qiov);
3195 qemu_vfree(head_buf);
3196 qemu_vfree(tail_buf);
d0c7f642
KW
3197 }
3198
d0c7f642
KW
3199 return ret;
3200}
3201
1b0288ae
KW
3202static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3203 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3204 BdrvRequestFlags flags)
3205{
3206 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3207 return -EINVAL;
3208 }
3209
3210 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3211 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3212}
3213
c5fbe571 3214int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3215 int nb_sectors, QEMUIOVector *qiov)
3216{
c5fbe571 3217 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3218
470c0504
SH
3219 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3220}
3221
3222int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3223 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3224{
3225 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3226
3227 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3228 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3229}
3230
c31cb707
PL
3231/* if no limit is specified in the BlockLimits use a default
3232 * of 32768 512-byte sectors (16 MiB) per request.
3233 */
3234#define MAX_WRITE_ZEROES_DEFAULT 32768
3235
f08f2dda 3236static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 3237 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
f08f2dda
SH
3238{
3239 BlockDriver *drv = bs->drv;
3240 QEMUIOVector qiov;
c31cb707
PL
3241 struct iovec iov = {0};
3242 int ret = 0;
f08f2dda 3243
c31cb707
PL
3244 int max_write_zeroes = bs->bl.max_write_zeroes ?
3245 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
621f0589 3246
c31cb707
PL
3247 while (nb_sectors > 0 && !ret) {
3248 int num = nb_sectors;
3249
b8d71c09
PB
3250 /* Align request. Block drivers can expect the "bulk" of the request
3251 * to be aligned.
3252 */
3253 if (bs->bl.write_zeroes_alignment
3254 && num > bs->bl.write_zeroes_alignment) {
3255 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3256 /* Make a small request up to the first aligned sector. */
c31cb707 3257 num = bs->bl.write_zeroes_alignment;
b8d71c09
PB
3258 num -= sector_num % bs->bl.write_zeroes_alignment;
3259 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3260 /* Shorten the request to the last aligned sector. num cannot
3261 * underflow because num > bs->bl.write_zeroes_alignment.
3262 */
3263 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
c31cb707 3264 }
621f0589 3265 }
f08f2dda 3266
c31cb707
PL
3267 /* limit request size */
3268 if (num > max_write_zeroes) {
3269 num = max_write_zeroes;
3270 }
3271
3272 ret = -ENOTSUP;
3273 /* First try the efficient write zeroes operation */
3274 if (drv->bdrv_co_write_zeroes) {
3275 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3276 }
3277
3278 if (ret == -ENOTSUP) {
3279 /* Fall back to bounce buffer if write zeroes is unsupported */
3280 iov.iov_len = num * BDRV_SECTOR_SIZE;
3281 if (iov.iov_base == NULL) {
b8d71c09
PB
3282 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3283 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
c31cb707
PL
3284 }
3285 qemu_iovec_init_external(&qiov, &iov, 1);
f08f2dda 3286
c31cb707 3287 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
b8d71c09
PB
3288
3289 /* Keep bounce buffer around if it is big enough for all
3290 * all future requests.
3291 */
3292 if (num < max_write_zeroes) {
3293 qemu_vfree(iov.iov_base);
3294 iov.iov_base = NULL;
3295 }
c31cb707
PL
3296 }
3297
3298 sector_num += num;
3299 nb_sectors -= num;
3300 }
f08f2dda
SH
3301
3302 qemu_vfree(iov.iov_base);
3303 return ret;
3304}
3305
c5fbe571 3306/*
b404f720 3307 * Forwards an already correctly aligned write request to the BlockDriver.
c5fbe571 3308 */
b404f720 3309static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
65afd211
KW
3310 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3311 QEMUIOVector *qiov, int flags)
c5fbe571
SH
3312{
3313 BlockDriver *drv = bs->drv;
28de2dcd 3314 bool waited;
6b7cb247 3315 int ret;
da1fa91d 3316
b404f720
KW
3317 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3318 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
f4658285 3319
b404f720
KW
3320 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3321 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
cc0681c4 3322
28de2dcd
KW
3323 waited = wait_serialising_requests(req);
3324 assert(!waited || !req->serialising);
af91f9a7
KW
3325 assert(req->overlap_offset <= offset);
3326 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
244eadef 3327
65afd211 3328 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
d616b224 3329
465bee1d
PL
3330 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3331 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3332 qemu_iovec_is_zero(qiov)) {
3333 flags |= BDRV_REQ_ZERO_WRITE;
3334 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3335 flags |= BDRV_REQ_MAY_UNMAP;
3336 }
3337 }
3338
d616b224
SH
3339 if (ret < 0) {
3340 /* Do nothing, write notifier decided to fail this request */
3341 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9e1cb96d 3342 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
aa7bfbff 3343 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3344 } else {
9e1cb96d 3345 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
f08f2dda
SH
3346 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3347 }
9e1cb96d 3348 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
6b7cb247 3349
f05fa4ad
PB
3350 if (ret == 0 && !bs->enable_write_cache) {
3351 ret = bdrv_co_flush(bs);
3352 }
3353
e4654d2d 3354 bdrv_set_dirty(bs, sector_num, nb_sectors);
da1fa91d
KW
3355
3356 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3357 bs->wr_highest_sector = sector_num + nb_sectors - 1;
3358 }
df2a6f29
PB
3359 if (bs->growable && ret >= 0) {
3360 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3361 }
da1fa91d 3362
6b7cb247 3363 return ret;
da1fa91d
KW
3364}
3365
b404f720
KW
3366/*
3367 * Handle a write request in coroutine context
3368 */
6601553e
KW
3369static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3370 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
b404f720
KW
3371 BdrvRequestFlags flags)
3372{
65afd211 3373 BdrvTrackedRequest req;
3b8242e0
KW
3374 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3375 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3376 uint8_t *head_buf = NULL;
3377 uint8_t *tail_buf = NULL;
3378 QEMUIOVector local_qiov;
3379 bool use_local_qiov = false;
b404f720
KW
3380 int ret;
3381
3382 if (!bs->drv) {
3383 return -ENOMEDIUM;
3384 }
3385 if (bs->read_only) {
3386 return -EACCES;
3387 }
6601553e 3388 if (bdrv_check_byte_request(bs, offset, bytes)) {
b404f720
KW
3389 return -EIO;
3390 }
3391
b404f720
KW
3392 /* throttling disk I/O */
3393 if (bs->io_limits_enabled) {
d5103588 3394 bdrv_io_limits_intercept(bs, bytes, true);
b404f720
KW
3395 }
3396
3b8242e0
KW
3397 /*
3398 * Align write if necessary by performing a read-modify-write cycle.
3399 * Pad qiov with the read parts and be sure to have a tracked request not
3400 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3401 */
65afd211 3402 tracked_request_begin(&req, bs, offset, bytes, true);
3b8242e0
KW
3403
3404 if (offset & (align - 1)) {
3405 QEMUIOVector head_qiov;
3406 struct iovec head_iov;
3407
3408 mark_request_serialising(&req, align);
3409 wait_serialising_requests(&req);
3410
3411 head_buf = qemu_blockalign(bs, align);
3412 head_iov = (struct iovec) {
3413 .iov_base = head_buf,
3414 .iov_len = align,
3415 };
3416 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3417
9e1cb96d 3418 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3b8242e0
KW
3419 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3420 align, &head_qiov, 0);
3421 if (ret < 0) {
3422 goto fail;
3423 }
9e1cb96d 3424 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3b8242e0
KW
3425
3426 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3427 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3428 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3429 use_local_qiov = true;
3430
3431 bytes += offset & (align - 1);
3432 offset = offset & ~(align - 1);
3433 }
3434
3435 if ((offset + bytes) & (align - 1)) {
3436 QEMUIOVector tail_qiov;
3437 struct iovec tail_iov;
3438 size_t tail_bytes;
28de2dcd 3439 bool waited;
3b8242e0
KW
3440
3441 mark_request_serialising(&req, align);
28de2dcd
KW
3442 waited = wait_serialising_requests(&req);
3443 assert(!waited || !use_local_qiov);
3b8242e0
KW
3444
3445 tail_buf = qemu_blockalign(bs, align);
3446 tail_iov = (struct iovec) {
3447 .iov_base = tail_buf,
3448 .iov_len = align,
3449 };
3450 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3451
9e1cb96d 3452 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3b8242e0
KW
3453 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3454 align, &tail_qiov, 0);
3455 if (ret < 0) {
3456 goto fail;
3457 }
9e1cb96d 3458 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3b8242e0
KW
3459
3460 if (!use_local_qiov) {
3461 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3462 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3463 use_local_qiov = true;
3464 }
3465
3466 tail_bytes = (offset + bytes) & (align - 1);
3467 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3468
3469 bytes = ROUND_UP(bytes, align);
3470 }
3471
3472 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3473 use_local_qiov ? &local_qiov : qiov,
3474 flags);
3475
3476fail:
65afd211 3477 tracked_request_end(&req);
b404f720 3478
3b8242e0
KW
3479 if (use_local_qiov) {
3480 qemu_iovec_destroy(&local_qiov);
3b8242e0 3481 }
99c4a85c
KW
3482 qemu_vfree(head_buf);
3483 qemu_vfree(tail_buf);
3b8242e0 3484
b404f720
KW
3485 return ret;
3486}
3487
6601553e
KW
3488static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3489 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3490 BdrvRequestFlags flags)
3491{
3492 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3493 return -EINVAL;
3494 }
3495
3496 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3497 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3498}
3499
c5fbe571
SH
3500int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3501 int nb_sectors, QEMUIOVector *qiov)
3502{
3503 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3504
f08f2dda
SH
3505 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3506}
3507
3508int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3509 int64_t sector_num, int nb_sectors,
3510 BdrvRequestFlags flags)
f08f2dda 3511{
94d6ff21 3512 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3513
d32f35cb
PL
3514 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3515 flags &= ~BDRV_REQ_MAY_UNMAP;
3516 }
3517
f08f2dda 3518 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3519 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3520}
3521
83f64091
FB
3522/**
3523 * Truncate file to 'offset' bytes (needed only for file protocols)
3524 */
3525int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3526{
3527 BlockDriver *drv = bs->drv;
51762288 3528 int ret;
83f64091 3529 if (!drv)
19cb3738 3530 return -ENOMEDIUM;
83f64091
FB
3531 if (!drv->bdrv_truncate)
3532 return -ENOTSUP;
59f2689d
NS
3533 if (bs->read_only)
3534 return -EACCES;
3718d8ab 3535 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) {
8591675f 3536 return -EBUSY;
3718d8ab 3537 }
51762288
SH
3538 ret = drv->bdrv_truncate(bs, offset);
3539 if (ret == 0) {
3540 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 3541 bdrv_dev_resize_cb(bs);
51762288
SH
3542 }
3543 return ret;
83f64091
FB
3544}
3545
4a1d5e1f
FZ
3546/**
3547 * Length of a allocated file in bytes. Sparse files are counted by actual
3548 * allocated space. Return < 0 if error or unknown.
3549 */
3550int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3551{
3552 BlockDriver *drv = bs->drv;
3553 if (!drv) {
3554 return -ENOMEDIUM;
3555 }
3556 if (drv->bdrv_get_allocated_file_size) {
3557 return drv->bdrv_get_allocated_file_size(bs);
3558 }
3559 if (bs->file) {
3560 return bdrv_get_allocated_file_size(bs->file);
3561 }
3562 return -ENOTSUP;
3563}
3564
83f64091
FB
3565/**
3566 * Length of a file in bytes. Return < 0 if error or unknown.
3567 */
3568int64_t bdrv_getlength(BlockDriverState *bs)
3569{
3570 BlockDriver *drv = bs->drv;
3571 if (!drv)
19cb3738 3572 return -ENOMEDIUM;
51762288 3573
b94a2610
KW
3574 if (drv->has_variable_length) {
3575 int ret = refresh_total_sectors(bs, bs->total_sectors);
3576 if (ret < 0) {
3577 return ret;
46a4e4e6 3578 }
83f64091 3579 }
46a4e4e6 3580 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3581}
3582
19cb3738 3583/* return 0 as number of sectors if no device present or error */
96b8f136 3584void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3585{
19cb3738
FB
3586 int64_t length;
3587 length = bdrv_getlength(bs);
3588 if (length < 0)
3589 length = 0;
3590 else
6ea44308 3591 length = length >> BDRV_SECTOR_BITS;
19cb3738 3592 *nb_sectors_ptr = length;
fc01f7e7 3593}
cf98951b 3594
ff06f5f3
PB
3595void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3596 BlockdevOnError on_write_error)
abd7f68d
MA
3597{
3598 bs->on_read_error = on_read_error;
3599 bs->on_write_error = on_write_error;
3600}
3601
1ceee0d5 3602BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3603{
3604 return is_read ? bs->on_read_error : bs->on_write_error;
3605}
3606
3e1caa5f
PB
3607BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3608{
3609 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3610
3611 switch (on_err) {
3612 case BLOCKDEV_ON_ERROR_ENOSPC:
3613 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3614 case BLOCKDEV_ON_ERROR_STOP:
3615 return BDRV_ACTION_STOP;
3616 case BLOCKDEV_ON_ERROR_REPORT:
3617 return BDRV_ACTION_REPORT;
3618 case BLOCKDEV_ON_ERROR_IGNORE:
3619 return BDRV_ACTION_IGNORE;
3620 default:
3621 abort();
3622 }
3623}
3624
3625/* This is done by device models because, while the block layer knows
3626 * about the error, it does not know whether an operation comes from
3627 * the device or the block layer (from a job, for example).
3628 */
3629void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3630 bool is_read, int error)
3631{
3632 assert(error >= 0);
32c81a4a 3633 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3e1caa5f
PB
3634 if (action == BDRV_ACTION_STOP) {
3635 vm_stop(RUN_STATE_IO_ERROR);
3636 bdrv_iostatus_set_err(bs, error);
3637 }
3638}
3639
b338082b
FB
3640int bdrv_is_read_only(BlockDriverState *bs)
3641{
3642 return bs->read_only;
3643}
3644
985a03b0
TS
3645int bdrv_is_sg(BlockDriverState *bs)
3646{
3647 return bs->sg;
3648}
3649
e900a7b7
CH
3650int bdrv_enable_write_cache(BlockDriverState *bs)
3651{
3652 return bs->enable_write_cache;
3653}
3654
425b0148
PB
3655void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3656{
3657 bs->enable_write_cache = wce;
55b110f2
JC
3658
3659 /* so a reopen() will preserve wce */
3660 if (wce) {
3661 bs->open_flags |= BDRV_O_CACHE_WB;
3662 } else {
3663 bs->open_flags &= ~BDRV_O_CACHE_WB;
3664 }
425b0148
PB
3665}
3666
ea2384d3
FB
3667int bdrv_is_encrypted(BlockDriverState *bs)
3668{
3669 if (bs->backing_hd && bs->backing_hd->encrypted)
3670 return 1;
3671 return bs->encrypted;
3672}
3673
c0f4ce77
AL
3674int bdrv_key_required(BlockDriverState *bs)
3675{
3676 BlockDriverState *backing_hd = bs->backing_hd;
3677
3678 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3679 return 1;
3680 return (bs->encrypted && !bs->valid_key);
3681}
3682
ea2384d3
FB
3683int bdrv_set_key(BlockDriverState *bs, const char *key)
3684{
3685 int ret;
3686 if (bs->backing_hd && bs->backing_hd->encrypted) {
3687 ret = bdrv_set_key(bs->backing_hd, key);
3688 if (ret < 0)
3689 return ret;
3690 if (!bs->encrypted)
3691 return 0;
3692 }
fd04a2ae
SH
3693 if (!bs->encrypted) {
3694 return -EINVAL;
3695 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3696 return -ENOMEDIUM;
3697 }
c0f4ce77 3698 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3699 if (ret < 0) {
3700 bs->valid_key = 0;
3701 } else if (!bs->valid_key) {
3702 bs->valid_key = 1;
3703 /* call the change callback now, we skipped it on open */
7d4b4ba5 3704 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 3705 }
c0f4ce77 3706 return ret;
ea2384d3
FB
3707}
3708
f8d6bba1 3709const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3710{
f8d6bba1 3711 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3712}
3713
5fafdf24 3714void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3715 void *opaque)
3716{
3717 BlockDriver *drv;
e855e4fb
JC
3718 int count = 0;
3719 const char **formats = NULL;
ea2384d3 3720
8a22f02a 3721 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3722 if (drv->format_name) {
3723 bool found = false;
3724 int i = count;
3725 while (formats && i && !found) {
3726 found = !strcmp(formats[--i], drv->format_name);
3727 }
3728
3729 if (!found) {
3730 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3731 formats[count++] = drv->format_name;
3732 it(opaque, drv->format_name);
3733 }
3734 }
ea2384d3 3735 }
e855e4fb 3736 g_free(formats);
ea2384d3
FB
3737}
3738
dc364f4c 3739/* This function is to find block backend bs */
b338082b
FB
3740BlockDriverState *bdrv_find(const char *name)
3741{
3742 BlockDriverState *bs;
3743
dc364f4c 3744 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1b7bdbc1 3745 if (!strcmp(name, bs->device_name)) {
b338082b 3746 return bs;
1b7bdbc1 3747 }
b338082b
FB
3748 }
3749 return NULL;
3750}
3751
dc364f4c
BC
3752/* This function is to find a node in the bs graph */
3753BlockDriverState *bdrv_find_node(const char *node_name)
3754{
3755 BlockDriverState *bs;
3756
3757 assert(node_name);
3758
3759 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3760 if (!strcmp(node_name, bs->node_name)) {
3761 return bs;
3762 }
3763 }
3764 return NULL;
3765}
3766
c13163fb
BC
3767/* Put this QMP function here so it can access the static graph_bdrv_states. */
3768BlockDeviceInfoList *bdrv_named_nodes_list(void)
3769{
3770 BlockDeviceInfoList *list, *entry;
3771 BlockDriverState *bs;
3772
3773 list = NULL;
3774 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3775 entry = g_malloc0(sizeof(*entry));
3776 entry->value = bdrv_block_device_info(bs);
3777 entry->next = list;
3778 list = entry;
3779 }
3780
3781 return list;
3782}
3783
12d3ba82
BC
3784BlockDriverState *bdrv_lookup_bs(const char *device,
3785 const char *node_name,
3786 Error **errp)
3787{
3788 BlockDriverState *bs = NULL;
3789
12d3ba82
BC
3790 if (device) {
3791 bs = bdrv_find(device);
3792
dd67fa50
BC
3793 if (bs) {
3794 return bs;
12d3ba82 3795 }
12d3ba82
BC
3796 }
3797
dd67fa50
BC
3798 if (node_name) {
3799 bs = bdrv_find_node(node_name);
12d3ba82 3800
dd67fa50
BC
3801 if (bs) {
3802 return bs;
3803 }
12d3ba82
BC
3804 }
3805
dd67fa50
BC
3806 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3807 device ? device : "",
3808 node_name ? node_name : "");
3809 return NULL;
12d3ba82
BC
3810}
3811
2f399b0a
MA
3812BlockDriverState *bdrv_next(BlockDriverState *bs)
3813{
3814 if (!bs) {
3815 return QTAILQ_FIRST(&bdrv_states);
3816 }
dc364f4c 3817 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3818}
3819
51de9760 3820void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
3821{
3822 BlockDriverState *bs;
3823
dc364f4c 3824 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
51de9760 3825 it(opaque, bs);
81d0912d
FB
3826 }
3827}
3828
ea2384d3
FB
3829const char *bdrv_get_device_name(BlockDriverState *bs)
3830{
3831 return bs->device_name;
3832}
3833
c8433287
MA
3834int bdrv_get_flags(BlockDriverState *bs)
3835{
3836 return bs->open_flags;
3837}
3838
f0f0fdfe 3839int bdrv_flush_all(void)
c6ca28d6
AL
3840{
3841 BlockDriverState *bs;
f0f0fdfe 3842 int result = 0;
c6ca28d6 3843
dc364f4c 3844 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
3845 AioContext *aio_context = bdrv_get_aio_context(bs);
3846 int ret;
3847
3848 aio_context_acquire(aio_context);
3849 ret = bdrv_flush(bs);
f0f0fdfe
KW
3850 if (ret < 0 && !result) {
3851 result = ret;
3852 }
ed78cda3 3853 aio_context_release(aio_context);
1b7bdbc1 3854 }
f0f0fdfe
KW
3855
3856 return result;
c6ca28d6
AL
3857}
3858
3ac21627
PL
3859int bdrv_has_zero_init_1(BlockDriverState *bs)
3860{
3861 return 1;
3862}
3863
f2feebbd
KW
3864int bdrv_has_zero_init(BlockDriverState *bs)
3865{
3866 assert(bs->drv);
3867
11212d8f
PB
3868 /* If BS is a copy on write image, it is initialized to
3869 the contents of the base image, which may not be zeroes. */
3870 if (bs->backing_hd) {
3871 return 0;
3872 }
336c1c12
KW
3873 if (bs->drv->bdrv_has_zero_init) {
3874 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3875 }
3876
3ac21627
PL
3877 /* safe default */
3878 return 0;
f2feebbd
KW
3879}
3880
4ce78691
PL
3881bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3882{
3883 BlockDriverInfo bdi;
3884
3885 if (bs->backing_hd) {
3886 return false;
3887 }
3888
3889 if (bdrv_get_info(bs, &bdi) == 0) {
3890 return bdi.unallocated_blocks_are_zero;
3891 }
3892
3893 return false;
3894}
3895
3896bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3897{
3898 BlockDriverInfo bdi;
3899
3900 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3901 return false;
3902 }
3903
3904 if (bdrv_get_info(bs, &bdi) == 0) {
3905 return bdi.can_write_zeroes_with_unmap;
3906 }
3907
3908 return false;
3909}
3910
b6b8a333 3911typedef struct BdrvCoGetBlockStatusData {
376ae3f1 3912 BlockDriverState *bs;
b35b2bba 3913 BlockDriverState *base;
376ae3f1
SH
3914 int64_t sector_num;
3915 int nb_sectors;
3916 int *pnum;
b6b8a333 3917 int64_t ret;
376ae3f1 3918 bool done;
b6b8a333 3919} BdrvCoGetBlockStatusData;
376ae3f1 3920
f58c7b35
TS
3921/*
3922 * Returns true iff the specified sector is present in the disk image. Drivers
3923 * not implementing the functionality are assumed to not support backing files,
3924 * hence all their sectors are reported as allocated.
3925 *
bd9533e3
SH
3926 * If 'sector_num' is beyond the end of the disk image the return value is 0
3927 * and 'pnum' is set to 0.
3928 *
f58c7b35
TS
3929 * 'pnum' is set to the number of sectors (including and immediately following
3930 * the specified sector) that are known to be in the same
3931 * allocated/unallocated state.
3932 *
bd9533e3
SH
3933 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3934 * beyond the end of the disk image it will be clamped.
f58c7b35 3935 */
b6b8a333
PB
3936static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3937 int64_t sector_num,
3938 int nb_sectors, int *pnum)
f58c7b35 3939{
617ccb46 3940 int64_t length;
bd9533e3 3941 int64_t n;
5daa74a6 3942 int64_t ret, ret2;
bd9533e3 3943
617ccb46
PB
3944 length = bdrv_getlength(bs);
3945 if (length < 0) {
3946 return length;
3947 }
3948
3949 if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
bd9533e3
SH
3950 *pnum = 0;
3951 return 0;
3952 }
3953
3954 n = bs->total_sectors - sector_num;
3955 if (n < nb_sectors) {
3956 nb_sectors = n;
3957 }
3958
b6b8a333 3959 if (!bs->drv->bdrv_co_get_block_status) {
bd9533e3 3960 *pnum = nb_sectors;
e88ae226 3961 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
918e92d7
PB
3962 if (bs->drv->protocol_name) {
3963 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3964 }
3965 return ret;
f58c7b35 3966 }
6aebab14 3967
415b5b01
PB
3968 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3969 if (ret < 0) {
3e0a233d 3970 *pnum = 0;
415b5b01
PB
3971 return ret;
3972 }
3973
92bc50a5
PL
3974 if (ret & BDRV_BLOCK_RAW) {
3975 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3976 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3977 *pnum, pnum);
3978 }
3979
e88ae226
KW
3980 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3981 ret |= BDRV_BLOCK_ALLOCATED;
3982 }
3983
c3d86884
PL
3984 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3985 if (bdrv_unallocated_blocks_are_zero(bs)) {
f0ad5712 3986 ret |= BDRV_BLOCK_ZERO;
1f9db224 3987 } else if (bs->backing_hd) {
f0ad5712
PB
3988 BlockDriverState *bs2 = bs->backing_hd;
3989 int64_t length2 = bdrv_getlength(bs2);
3990 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3991 ret |= BDRV_BLOCK_ZERO;
3992 }
3993 }
415b5b01 3994 }
5daa74a6
PB
3995
3996 if (bs->file &&
3997 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3998 (ret & BDRV_BLOCK_OFFSET_VALID)) {
3999 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4000 *pnum, pnum);
4001 if (ret2 >= 0) {
4002 /* Ignore errors. This is just providing extra information, it
4003 * is useful but not necessary.
4004 */
4005 ret |= (ret2 & BDRV_BLOCK_ZERO);
4006 }
4007 }
4008
415b5b01 4009 return ret;
060f51c9
SH
4010}
4011
b6b8a333
PB
4012/* Coroutine wrapper for bdrv_get_block_status() */
4013static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4014{
b6b8a333 4015 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4016 BlockDriverState *bs = data->bs;
4017
b6b8a333
PB
4018 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4019 data->pnum);
060f51c9
SH
4020 data->done = true;
4021}
4022
/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Kick off the request in a fresh coroutine and drive the BDS's
         * AioContext until the coroutine reports completion via data.done. */
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}
4054
b6b8a333
PB
4055int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4056 int nb_sectors, int *pnum)
4057{
4333bb71
PB
4058 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4059 if (ret < 0) {
4060 return ret;
4061 }
e88ae226 4062 return (ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4063}
4064
188a7bbf
PB
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    /* Walk the chain from TOP towards BASE, stopping at the first layer
     * that has the sector allocated. */
    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            /* Allocated in this layer: report its run length. */
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        /* Shrink the reported unallocated run, but only clamp to this
         * layer's answer if the range does not extend past its end
         * (reads past EOF of an intermediate layer see the backing file). */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    /* Unallocated in every layer of the range. */
    *pnum = n;
    return 0;
}
4115
045df330
AL
4116const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4117{
4118 if (bs->backing_hd && bs->backing_hd->encrypted)
4119 return bs->backing_file;
4120 else if (bs->encrypted)
4121 return bs->filename;
4122 else
4123 return NULL;
4124}
4125
/* Copy the name of bs's backing file into filename, bounded by
 * filename_size (truncating copy via pstrcpy). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
4131
5fafdf24 4132int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4133 const uint8_t *buf, int nb_sectors)
4134{
4135 BlockDriver *drv = bs->drv;
4136 if (!drv)
19cb3738 4137 return -ENOMEDIUM;
faea38e7
FB
4138 if (!drv->bdrv_write_compressed)
4139 return -ENOTSUP;
fbb7b4e0
KW
4140 if (bdrv_check_request(bs, sector_num, nb_sectors))
4141 return -EIO;
a55eb92c 4142
e4654d2d 4143 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4144
faea38e7
FB
4145 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4146}
3b46e624 4147
faea38e7
FB
4148int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4149{
4150 BlockDriver *drv = bs->drv;
4151 if (!drv)
19cb3738 4152 return -ENOMEDIUM;
faea38e7
FB
4153 if (!drv->bdrv_get_info)
4154 return -ENOTSUP;
4155 memset(bdi, 0, sizeof(*bdi));
4156 return drv->bdrv_get_info(bs, bdi);
4157}
4158
eae041fe
HR
4159ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4160{
4161 BlockDriver *drv = bs->drv;
4162 if (drv && drv->bdrv_get_specific_info) {
4163 return drv->bdrv_get_specific_info(bs);
4164 }
4165 return NULL;
4166}
4167
45566e9c
CH
4168int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4169 int64_t pos, int size)
cf8074b3
KW
4170{
4171 QEMUIOVector qiov;
4172 struct iovec iov = {
4173 .iov_base = (void *) buf,
4174 .iov_len = size,
4175 };
4176
4177 qemu_iovec_init_external(&qiov, &iov, 1);
4178 return bdrv_writev_vmstate(bs, &qiov, pos);
4179}
4180
4181int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4182{
4183 BlockDriver *drv = bs->drv;
cf8074b3
KW
4184
4185 if (!drv) {
178e08a5 4186 return -ENOMEDIUM;
cf8074b3
KW
4187 } else if (drv->bdrv_save_vmstate) {
4188 return drv->bdrv_save_vmstate(bs, qiov, pos);
4189 } else if (bs->file) {
4190 return bdrv_writev_vmstate(bs->file, qiov, pos);
4191 }
4192
7cdb1f6d 4193 return -ENOTSUP;
178e08a5
AL
4194}
4195
45566e9c
CH
4196int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4197 int64_t pos, int size)
178e08a5
AL
4198{
4199 BlockDriver *drv = bs->drv;
4200 if (!drv)
4201 return -ENOMEDIUM;
7cdb1f6d
MK
4202 if (drv->bdrv_load_vmstate)
4203 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4204 if (bs->file)
4205 return bdrv_load_vmstate(bs->file, buf, pos, size);
4206 return -ENOTSUP;
178e08a5
AL
4207}
4208
8b9b0cc2
KW
4209void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4210{
bf736fe3 4211 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4212 return;
4213 }
4214
bf736fe3 4215 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4216}
4217
4218int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4219 const char *tag)
4220{
4221 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4222 bs = bs->file;
4223 }
4224
4225 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4226 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4227 }
4228
4229 return -ENOTSUP;
4230}
4231
4cc70e93
FZ
4232int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4233{
4234 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4235 bs = bs->file;
4236 }
4237
4238 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4239 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4240 }
4241
4242 return -ENOTSUP;
4243}
4244
41c695c7
KW
4245int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4246{
938789ea 4247 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4248 bs = bs->file;
4249 }
8b9b0cc2 4250
41c695c7
KW
4251 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4252 return bs->drv->bdrv_debug_resume(bs, tag);
4253 }
4254
4255 return -ENOTSUP;
4256}
4257
4258bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4259{
4260 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4261 bs = bs->file;
4262 }
4263
4264 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4265 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4266 }
4267
4268 return false;
8b9b0cc2
KW
4269}
4270
199630b6
BS
4271int bdrv_is_snapshot(BlockDriverState *bs)
4272{
4273 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4274}
4275
b1b1d783
JC
/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    /* Walk down the backing chain, comparing each layer's backing_file
     * against the requested name. */
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    /* NULL when no layer in the chain matched */
    return retval;
}
4341
f198fd1c
BC
4342int bdrv_get_backing_file_depth(BlockDriverState *bs)
4343{
4344 if (!bs->drv) {
4345 return 0;
4346 }
4347
4348 if (!bs->backing_hd) {
4349 return 0;
4350 }
4351
4352 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4353}
4354
79fac568
JC
4355BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4356{
4357 BlockDriverState *curr_bs = NULL;
4358
4359 if (!bs) {
4360 return NULL;
4361 }
4362
4363 curr_bs = bs;
4364
4365 while (curr_bs->backing_hd) {
4366 curr_bs = curr_bs->backing_hd;
4367 }
4368 return curr_bs;
4369}
4370
ea2384d3 4371/**************************************************************/
83f64091 4372/* async I/Os */
ea2384d3 4373
/* Submit an asynchronous vectored read; cb(opaque, ret) is invoked when the
 * request completes. */
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    /* flags = 0, is_write = false */
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}
4383
f141eafe
AL
/* Submit an asynchronous vectored write; cb(opaque, ret) is invoked when the
 * request completes. */
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    /* flags = 0, is_write = true */
    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}
4393
d5ef94d4
PB
/* Submit an asynchronous zero-write of nb_sectors at sector_num; no data
 * buffer is needed (qiov is NULL, BDRV_REQ_ZERO_WRITE is set). */
BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
4404
40b4f539
KW
4405
/* Tracking state for one bdrv_aio_multiwrite() submission. */
typedef struct MultiwriteCB {
    int error;            /* first error seen among the merged requests */
    int num_requests;     /* merged requests still in flight */
    int num_callbacks;    /* original (pre-merge) request count */
    struct {
        BlockDriverCompletionFunc *cb;  /* caller's per-request callback */
        void *opaque;
        QEMUIOVector *free_qiov;        /* qiov allocated by merging, freed
                                         * after the callbacks run */
    } callbacks[];        /* flexible array, one slot per original request */
} MultiwriteCB;
4416
4417static void multiwrite_user_cb(MultiwriteCB *mcb)
4418{
4419 int i;
4420
4421 for (i = 0; i < mcb->num_callbacks; i++) {
4422 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4423 if (mcb->callbacks[i].free_qiov) {
4424 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4425 }
7267c094 4426 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4427 }
4428}
4429
4430static void multiwrite_cb(void *opaque, int ret)
4431{
4432 MultiwriteCB *mcb = opaque;
4433
6d519a5f
SH
4434 trace_multiwrite_cb(mcb, ret);
4435
cb6d3ca0 4436 if (ret < 0 && !mcb->error) {
40b4f539 4437 mcb->error = ret;
40b4f539
KW
4438 }
4439
4440 mcb->num_requests--;
4441 if (mcb->num_requests == 0) {
de189a1b 4442 multiwrite_user_cb(mcb);
7267c094 4443 g_free(mcb);
40b4f539
KW
4444 }
4445}
4446
4447static int multiwrite_req_compare(const void *a, const void *b)
4448{
77be4366
CH
4449 const BlockRequest *req1 = a, *req2 = b;
4450
4451 /*
4452 * Note that we can't simply subtract req2->sector from req1->sector
4453 * here as that could overflow the return value.
4454 */
4455 if (req1->sector > req2->sector) {
4456 return 1;
4457 } else if (req1->sector < req2->sector) {
4458 return -1;
4459 } else {
4460 return 0;
4461 }
40b4f539
KW
4462}
4463
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Never exceed the host's iovec limit for the combined request
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            // (the sort above guarantees reqs[i] starts within the merged one)
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Remember the merged qiov so multiwrite_user_cb() can free it
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            // Not mergeable: start a new output slot
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
4523
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    /* Stash the original callbacks before merging may rewrite reqs[] */
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests.  num_requests is set to the full count before
     * submitting so that completions arriving during this loop cannot drop
     * the counter to zero (and free mcb) prematurely. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
4581
83f64091 4582void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 4583{
d7331bed 4584 acb->aiocb_info->cancel(acb);
83f64091
FB
4585}
4586
4587/**************************************************************/
4588/* async block device emulation */
4589
c16b5a2c
CH
4590typedef struct BlockDriverAIOCBSync {
4591 BlockDriverAIOCB common;
4592 QEMUBH *bh;
4593 int ret;
4594 /* vector translation state */
4595 QEMUIOVector *qiov;
4596 uint8_t *bounce;
4597 int is_write;
4598} BlockDriverAIOCBSync;
4599
4600static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4601{
b666d239
KW
4602 BlockDriverAIOCBSync *acb =
4603 container_of(blockacb, BlockDriverAIOCBSync, common);
6a7ad299 4604 qemu_bh_delete(acb->bh);
36afc451 4605 acb->bh = NULL;
c16b5a2c
CH
4606 qemu_aio_release(acb);
4607}
4608
d7331bed 4609static const AIOCBInfo bdrv_em_aiocb_info = {
c16b5a2c
CH
4610 .aiocb_size = sizeof(BlockDriverAIOCBSync),
4611 .cancel = bdrv_aio_cancel_em,
4612};
4613
ce1a14dc 4614static void bdrv_aio_bh_cb(void *opaque)
83f64091 4615{
ce1a14dc 4616 BlockDriverAIOCBSync *acb = opaque;
f141eafe 4617
f141eafe 4618 if (!acb->is_write)
03396148 4619 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
ceb42de8 4620 qemu_vfree(acb->bounce);
ce1a14dc 4621 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 4622 qemu_bh_delete(acb->bh);
36afc451 4623 acb->bh = NULL;
ce1a14dc 4624 qemu_aio_release(acb);
83f64091 4625}
beac80cd 4626
f141eafe
AL
4627static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4628 int64_t sector_num,
4629 QEMUIOVector *qiov,
4630 int nb_sectors,
4631 BlockDriverCompletionFunc *cb,
4632 void *opaque,
4633 int is_write)
4634
83f64091 4635{
ce1a14dc 4636 BlockDriverAIOCBSync *acb;
ce1a14dc 4637
d7331bed 4638 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
f141eafe
AL
4639 acb->is_write = is_write;
4640 acb->qiov = qiov;
e268ca52 4641 acb->bounce = qemu_blockalign(bs, qiov->size);
2572b37a 4642 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
f141eafe
AL
4643
4644 if (is_write) {
d5e6b161 4645 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
1ed20acf 4646 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 4647 } else {
1ed20acf 4648 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
4649 }
4650
ce1a14dc 4651 qemu_bh_schedule(acb->bh);
f141eafe 4652
ce1a14dc 4653 return &acb->common;
beac80cd
FB
4654}
4655
f141eafe
AL
4656static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4657 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 4658 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 4659{
f141eafe
AL
4660 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4661}
83f64091 4662
f141eafe
AL
4663static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4664 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4665 BlockDriverCompletionFunc *cb, void *opaque)
4666{
4667 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 4668}
beac80cd 4669
68485420
KW
4670
4671typedef struct BlockDriverAIOCBCoroutine {
4672 BlockDriverAIOCB common;
4673 BlockRequest req;
4674 bool is_write;
d318aea9 4675 bool *done;
68485420
KW
4676 QEMUBH* bh;
4677} BlockDriverAIOCBCoroutine;
4678
4679static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4680{
2572b37a 4681 AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
d318aea9
KW
4682 BlockDriverAIOCBCoroutine *acb =
4683 container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4684 bool done = false;
4685
4686 acb->done = &done;
4687 while (!done) {
2572b37a 4688 aio_poll(aio_context, true);
d318aea9 4689 }
68485420
KW
4690}
4691
d7331bed 4692static const AIOCBInfo bdrv_em_co_aiocb_info = {
68485420
KW
4693 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
4694 .cancel = bdrv_aio_co_cancel_em,
4695};
4696
35246a68 4697static void bdrv_co_em_bh(void *opaque)
68485420
KW
4698{
4699 BlockDriverAIOCBCoroutine *acb = opaque;
4700
4701 acb->common.cb(acb->common.opaque, acb->req.error);
d318aea9
KW
4702
4703 if (acb->done) {
4704 *acb->done = true;
4705 }
4706
68485420
KW
4707 qemu_bh_delete(acb->bh);
4708 qemu_aio_release(acb);
4709}
4710
b2a61371
SH
4711/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4712static void coroutine_fn bdrv_co_do_rw(void *opaque)
4713{
4714 BlockDriverAIOCBCoroutine *acb = opaque;
4715 BlockDriverState *bs = acb->common.bs;
4716
4717 if (!acb->is_write) {
4718 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4719 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4720 } else {
4721 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4722 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4723 }
4724
2572b37a 4725 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2a61371
SH
4726 qemu_bh_schedule(acb->bh);
4727}
4728
68485420
KW
4729static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4730 int64_t sector_num,
4731 QEMUIOVector *qiov,
4732 int nb_sectors,
d20d9b7c 4733 BdrvRequestFlags flags,
68485420
KW
4734 BlockDriverCompletionFunc *cb,
4735 void *opaque,
8c5873d6 4736 bool is_write)
68485420
KW
4737{
4738 Coroutine *co;
4739 BlockDriverAIOCBCoroutine *acb;
4740
d7331bed 4741 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
68485420
KW
4742 acb->req.sector = sector_num;
4743 acb->req.nb_sectors = nb_sectors;
4744 acb->req.qiov = qiov;
d20d9b7c 4745 acb->req.flags = flags;
68485420 4746 acb->is_write = is_write;
d318aea9 4747 acb->done = NULL;
68485420 4748
8c5873d6 4749 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
4750 qemu_coroutine_enter(co, acb);
4751
4752 return &acb->common;
4753}
4754
07f07615 4755static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4756{
07f07615
PB
4757 BlockDriverAIOCBCoroutine *acb = opaque;
4758 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4759
07f07615 4760 acb->req.error = bdrv_co_flush(bs);
2572b37a 4761 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2e12bc6 4762 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
4763}
4764
07f07615 4765BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
4766 BlockDriverCompletionFunc *cb, void *opaque)
4767{
07f07615 4768 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4769
07f07615
PB
4770 Coroutine *co;
4771 BlockDriverAIOCBCoroutine *acb;
016f5cf6 4772
d7331bed 4773 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
d318aea9
KW
4774 acb->done = NULL;
4775
07f07615
PB
4776 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4777 qemu_coroutine_enter(co, acb);
016f5cf6 4778
016f5cf6
AG
4779 return &acb->common;
4780}
4781
4265d620
PB
4782static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4783{
4784 BlockDriverAIOCBCoroutine *acb = opaque;
4785 BlockDriverState *bs = acb->common.bs;
4786
4787 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2572b37a 4788 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4265d620
PB
4789 qemu_bh_schedule(acb->bh);
4790}
4791
4792BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4793 int64_t sector_num, int nb_sectors,
4794 BlockDriverCompletionFunc *cb, void *opaque)
4795{
4796 Coroutine *co;
4797 BlockDriverAIOCBCoroutine *acb;
4798
4799 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4800
d7331bed 4801 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4265d620
PB
4802 acb->req.sector = sector_num;
4803 acb->req.nb_sectors = nb_sectors;
d318aea9 4804 acb->done = NULL;
4265d620
PB
4805 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4806 qemu_coroutine_enter(co, acb);
4807
4808 return &acb->common;
4809}
4810
ea2384d3
FB
4811void bdrv_init(void)
4812{
5efa9d5a 4813 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 4814}
ce1a14dc 4815
eb852011
MA
4816void bdrv_init_with_whitelist(void)
4817{
4818 use_bdrv_whitelist = 1;
4819 bdrv_init();
4820}
4821
d7331bed 4822void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
c16b5a2c 4823 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 4824{
ce1a14dc
PB
4825 BlockDriverAIOCB *acb;
4826
d7331bed
SH
4827 acb = g_slice_alloc(aiocb_info->aiocb_size);
4828 acb->aiocb_info = aiocb_info;
ce1a14dc
PB
4829 acb->bs = bs;
4830 acb->cb = cb;
4831 acb->opaque = opaque;
4832 return acb;
4833}
4834
4835void qemu_aio_release(void *p)
4836{
d37c975f 4837 BlockDriverAIOCB *acb = p;
d7331bed 4838 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
ce1a14dc 4839}
19cb3738 4840
f9f05dc5
KW
4841/**************************************************************/
4842/* Coroutine block device emulation */
4843
4844typedef struct CoroutineIOCompletion {
4845 Coroutine *coroutine;
4846 int ret;
4847} CoroutineIOCompletion;
4848
4849static void bdrv_co_io_em_complete(void *opaque, int ret)
4850{
4851 CoroutineIOCompletion *co = opaque;
4852
4853 co->ret = ret;
4854 qemu_coroutine_enter(co->coroutine, NULL);
4855}
4856
4857static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4858 int nb_sectors, QEMUIOVector *iov,
4859 bool is_write)
4860{
4861 CoroutineIOCompletion co = {
4862 .coroutine = qemu_coroutine_self(),
4863 };
4864 BlockDriverAIOCB *acb;
4865
4866 if (is_write) {
a652d160
SH
4867 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4868 bdrv_co_io_em_complete, &co);
f9f05dc5 4869 } else {
a652d160
SH
4870 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4871 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
4872 }
4873
59370aaa 4874 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
4875 if (!acb) {
4876 return -EIO;
4877 }
4878 qemu_coroutine_yield();
4879
4880 return co.ret;
4881}
4882
4883static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4884 int64_t sector_num, int nb_sectors,
4885 QEMUIOVector *iov)
4886{
4887 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4888}
4889
4890static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4891 int64_t sector_num, int nb_sectors,
4892 QEMUIOVector *iov)
4893{
4894 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4895}
4896
07f07615 4897static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 4898{
07f07615
PB
4899 RwCo *rwco = opaque;
4900
4901 rwco->ret = bdrv_co_flush(rwco->bs);
4902}
4903
4904int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4905{
eb489bb1
KW
4906 int ret;
4907
29cdb251 4908 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 4909 return 0;
eb489bb1
KW
4910 }
4911
ca716364 4912 /* Write back cached data to the OS even with cache=unsafe */
bf736fe3 4913 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
eb489bb1
KW
4914 if (bs->drv->bdrv_co_flush_to_os) {
4915 ret = bs->drv->bdrv_co_flush_to_os(bs);
4916 if (ret < 0) {
4917 return ret;
4918 }
4919 }
4920
ca716364
KW
4921 /* But don't actually force it to the disk with cache=unsafe */
4922 if (bs->open_flags & BDRV_O_NO_FLUSH) {
d4c82329 4923 goto flush_parent;
ca716364
KW
4924 }
4925
bf736fe3 4926 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
eb489bb1 4927 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 4928 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
4929 } else if (bs->drv->bdrv_aio_flush) {
4930 BlockDriverAIOCB *acb;
4931 CoroutineIOCompletion co = {
4932 .coroutine = qemu_coroutine_self(),
4933 };
4934
4935 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4936 if (acb == NULL) {
29cdb251 4937 ret = -EIO;
07f07615
PB
4938 } else {
4939 qemu_coroutine_yield();
29cdb251 4940 ret = co.ret;
07f07615 4941 }
07f07615
PB
4942 } else {
4943 /*
4944 * Some block drivers always operate in either writethrough or unsafe
4945 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4946 * know how the server works (because the behaviour is hardcoded or
4947 * depends on server-side configuration), so we can't ensure that
4948 * everything is safe on disk. Returning an error doesn't work because
4949 * that would break guests even if the server operates in writethrough
4950 * mode.
4951 *
4952 * Let's hope the user knows what he's doing.
4953 */
29cdb251 4954 ret = 0;
07f07615 4955 }
29cdb251
PB
4956 if (ret < 0) {
4957 return ret;
4958 }
4959
4960 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4961 * in the case of cache=unsafe, so there are no useless flushes.
4962 */
d4c82329 4963flush_parent:
29cdb251 4964 return bdrv_co_flush(bs->file);
07f07615
PB
4965}
4966
5a8a30db 4967void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
0f15423c 4968{
5a8a30db
KW
4969 Error *local_err = NULL;
4970 int ret;
4971
3456a8d1
KW
4972 if (!bs->drv) {
4973 return;
4974 }
4975
4976 if (bs->drv->bdrv_invalidate_cache) {
5a8a30db 4977 bs->drv->bdrv_invalidate_cache(bs, &local_err);
3456a8d1 4978 } else if (bs->file) {
5a8a30db
KW
4979 bdrv_invalidate_cache(bs->file, &local_err);
4980 }
4981 if (local_err) {
4982 error_propagate(errp, local_err);
4983 return;
0f15423c 4984 }
3456a8d1 4985
5a8a30db
KW
4986 ret = refresh_total_sectors(bs, bs->total_sectors);
4987 if (ret < 0) {
4988 error_setg_errno(errp, -ret, "Could not refresh total sector count");
4989 return;
4990 }
0f15423c
AL
4991}
4992
5a8a30db 4993void bdrv_invalidate_cache_all(Error **errp)
0f15423c
AL
4994{
4995 BlockDriverState *bs;
5a8a30db 4996 Error *local_err = NULL;
0f15423c 4997
dc364f4c 4998 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
4999 AioContext *aio_context = bdrv_get_aio_context(bs);
5000
5001 aio_context_acquire(aio_context);
5a8a30db 5002 bdrv_invalidate_cache(bs, &local_err);
ed78cda3 5003 aio_context_release(aio_context);
5a8a30db
KW
5004 if (local_err) {
5005 error_propagate(errp, local_err);
5006 return;
5007 }
0f15423c
AL
5008 }
5009}
5010
07789269
BC
5011void bdrv_clear_incoming_migration_all(void)
5012{
5013 BlockDriverState *bs;
5014
dc364f4c 5015 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5016 AioContext *aio_context = bdrv_get_aio_context(bs);
5017
5018 aio_context_acquire(aio_context);
07789269 5019 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
ed78cda3 5020 aio_context_release(aio_context);
07789269
BC
5021 }
5022}
5023
07f07615
PB
5024int bdrv_flush(BlockDriverState *bs)
5025{
5026 Coroutine *co;
5027 RwCo rwco = {
5028 .bs = bs,
5029 .ret = NOT_DONE,
e7a8a783 5030 };
e7a8a783 5031
07f07615
PB
5032 if (qemu_in_coroutine()) {
5033 /* Fast-path if already in coroutine context */
5034 bdrv_flush_co_entry(&rwco);
5035 } else {
2572b37a
SH
5036 AioContext *aio_context = bdrv_get_aio_context(bs);
5037
07f07615
PB
5038 co = qemu_coroutine_create(bdrv_flush_co_entry);
5039 qemu_coroutine_enter(co, &rwco);
5040 while (rwco.ret == NOT_DONE) {
2572b37a 5041 aio_poll(aio_context, true);
07f07615 5042 }
e7a8a783 5043 }
07f07615
PB
5044
5045 return rwco.ret;
e7a8a783
KW
5046}
5047
/* Argument bundle shared between bdrv_discard() and its coroutine. */
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;
/* Coroutine entry point: forwards to bdrv_co_discard() and stores the
 * result so the synchronous caller can poll for completion. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
5060
6f14da52
PL
5061/* if no limit is specified in the BlockLimits use a default
5062 * of 32768 512-byte sectors (16 MiB) per request.
5063 */
5064#define MAX_DISCARD_DEFAULT 32768
5065
4265d620
PB
5066int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5067 int nb_sectors)
5068{
d51e9fe5
PB
5069 int max_discard;
5070
4265d620
PB
5071 if (!bs->drv) {
5072 return -ENOMEDIUM;
5073 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5074 return -EIO;
5075 } else if (bs->read_only) {
5076 return -EROFS;
df702c9b
PB
5077 }
5078
e4654d2d 5079 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5080
9e8f1835
PB
5081 /* Do nothing if disabled. */
5082 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5083 return 0;
5084 }
5085
d51e9fe5
PB
5086 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5087 return 0;
5088 }
6f14da52 5089
d51e9fe5
PB
5090 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5091 while (nb_sectors > 0) {
5092 int ret;
5093 int num = nb_sectors;
6f14da52 5094
d51e9fe5
PB
5095 /* align request */
5096 if (bs->bl.discard_alignment &&
5097 num >= bs->bl.discard_alignment &&
5098 sector_num % bs->bl.discard_alignment) {
5099 if (num > bs->bl.discard_alignment) {
5100 num = bs->bl.discard_alignment;
6f14da52 5101 }
d51e9fe5
PB
5102 num -= sector_num % bs->bl.discard_alignment;
5103 }
6f14da52 5104
d51e9fe5
PB
5105 /* limit request size */
5106 if (num > max_discard) {
5107 num = max_discard;
5108 }
6f14da52 5109
d51e9fe5 5110 if (bs->drv->bdrv_co_discard) {
6f14da52 5111 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5
PB
5112 } else {
5113 BlockDriverAIOCB *acb;
5114 CoroutineIOCompletion co = {
5115 .coroutine = qemu_coroutine_self(),
5116 };
5117
5118 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5119 bdrv_co_io_em_complete, &co);
5120 if (acb == NULL) {
5121 return -EIO;
5122 } else {
5123 qemu_coroutine_yield();
5124 ret = co.ret;
6f14da52 5125 }
6f14da52 5126 }
7ce21016 5127 if (ret && ret != -ENOTSUP) {
d51e9fe5 5128 return ret;
4265d620 5129 }
d51e9fe5
PB
5130
5131 sector_num += num;
5132 nb_sectors -= num;
4265d620 5133 }
d51e9fe5 5134 return 0;
4265d620
PB
5135}
5136
5137int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5138{
5139 Coroutine *co;
775aa8b6 5140 DiscardCo rwco = {
4265d620
PB
5141 .bs = bs,
5142 .sector_num = sector_num,
5143 .nb_sectors = nb_sectors,
5144 .ret = NOT_DONE,
5145 };
5146
5147 if (qemu_in_coroutine()) {
5148 /* Fast-path if already in coroutine context */
5149 bdrv_discard_co_entry(&rwco);
5150 } else {
2572b37a
SH
5151 AioContext *aio_context = bdrv_get_aio_context(bs);
5152
4265d620
PB
5153 co = qemu_coroutine_create(bdrv_discard_co_entry);
5154 qemu_coroutine_enter(co, &rwco);
5155 while (rwco.ret == NOT_DONE) {
2572b37a 5156 aio_poll(aio_context, true);
4265d620
PB
5157 }
5158 }
5159
5160 return rwco.ret;
5161}
5162
19cb3738
FB
5163/**************************************************************/
5164/* removable device support */
5165
5166/**
5167 * Return TRUE if the media is present
5168 */
5169int bdrv_is_inserted(BlockDriverState *bs)
5170{
5171 BlockDriver *drv = bs->drv;
a1aff5bf 5172
19cb3738
FB
5173 if (!drv)
5174 return 0;
5175 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5176 return 1;
5177 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5178}
5179
5180/**
8e49ca46
MA
5181 * Return whether the media changed since the last call to this
5182 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5183 */
5184int bdrv_media_changed(BlockDriverState *bs)
5185{
5186 BlockDriver *drv = bs->drv;
19cb3738 5187
8e49ca46
MA
5188 if (drv && drv->bdrv_media_changed) {
5189 return drv->bdrv_media_changed(bs);
5190 }
5191 return -ENOTSUP;
19cb3738
FB
5192}
5193
5194/**
5195 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5196 */
f36f3949 5197void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
5198{
5199 BlockDriver *drv = bs->drv;
19cb3738 5200
822e1cd1
MA
5201 if (drv && drv->bdrv_eject) {
5202 drv->bdrv_eject(bs, eject_flag);
19cb3738 5203 }
6f382ed2
LC
5204
5205 if (bs->device_name[0] != '\0') {
5206 bdrv_emit_qmp_eject_event(bs, eject_flag);
5207 }
19cb3738
FB
5208}
5209
19cb3738
FB
5210/**
5211 * Lock or unlock the media (if it is locked, the user won't be able
5212 * to eject it manually).
5213 */
025e849a 5214void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
5215{
5216 BlockDriver *drv = bs->drv;
5217
025e849a 5218 trace_bdrv_lock_medium(bs, locked);
b8c6d095 5219
025e849a
MA
5220 if (drv && drv->bdrv_lock_medium) {
5221 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
5222 }
5223}
985a03b0
TS
5224
5225/* needed for generic scsi interface */
5226
5227int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5228{
5229 BlockDriver *drv = bs->drv;
5230
5231 if (drv && drv->bdrv_ioctl)
5232 return drv->bdrv_ioctl(bs, req, buf);
5233 return -ENOTSUP;
5234}
7d780669 5235
221f715d
AL
5236BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5237 unsigned long int req, void *buf,
5238 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 5239{
221f715d 5240 BlockDriver *drv = bs->drv;
7d780669 5241
221f715d
AL
5242 if (drv && drv->bdrv_aio_ioctl)
5243 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5244 return NULL;
7d780669 5245}
e268ca52 5246
/* Record the guest's block size; stored in bs->guest_block_size for use
 * elsewhere (presumably request alignment — confirm against callers). */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
7cd1e32a 5251
/* Allocate @size bytes aligned to the optimal memory alignment for @bs
 * (see bdrv_opt_mem_align()). Caller frees with qemu_vfree(). */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
7cd1e32a 5256
c53b1c51
SH
5257/*
5258 * Check if all memory in this vector is sector aligned.
5259 */
5260bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5261{
5262 int i;
339064d5 5263 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5264
5265 for (i = 0; i < qiov->niov; i++) {
339064d5 5266 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5267 return false;
1ff735bd 5268 }
339064d5 5269 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5270 return false;
c53b1c51
SH
5271 }
5272 }
5273
5274 return true;
5275}
5276
/* Create a dirty bitmap tracking @bs at @granularity bytes per bit
 * (must be a power of two, at least one sector).
 *
 * Returns the new bitmap, linked into bs->dirty_bitmaps, or NULL with
 * @errp set if the device length cannot be determined.  On failure
 * errno is also set — NOTE(review): unusual side channel, presumably
 * for callers that only check errno; verify before removing. */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    /* granularity must be a power of two */
    assert((granularity & (granularity - 1)) == 0);

    /* convert bytes -> sectors; must remain non-zero */
    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_getlength(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap_size >>= BDRV_SECTOR_BITS;
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    /* ffs(granularity) - 1 == log2(granularity) since it is a power of two */
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
5299
5300void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5301{
5302 BdrvDirtyBitmap *bm, *next;
5303 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5304 if (bm == bitmap) {
5305 QLIST_REMOVE(bitmap, list);
5306 hbitmap_free(bitmap->bitmap);
5307 g_free(bitmap);
5308 return;
a55eb92c 5309 }
7cd1e32a
LS
5310 }
5311}
5312
21b56835
FZ
5313BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5314{
5315 BdrvDirtyBitmap *bm;
5316 BlockDirtyInfoList *list = NULL;
5317 BlockDirtyInfoList **plist = &list;
5318
5319 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5320 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5321 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5322 info->count = bdrv_get_dirty_count(bs, bm);
5323 info->granularity =
5324 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5325 entry->value = info;
5326 *plist = entry;
5327 plist = &entry->next;
5328 }
5329
5330 return list;
5331}
5332
e4654d2d 5333int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5334{
e4654d2d
FZ
5335 if (bitmap) {
5336 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5337 } else {
5338 return 0;
5339 }
5340}
5341
/* Initialize @hbi to iterate over the dirty bits of @bitmap, starting
 * from sector 0. */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5347
5348void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5349 int nr_sectors)
5350{
e4654d2d
FZ
5351 BdrvDirtyBitmap *bitmap;
5352 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5353 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5354 }
1755da16
PB
5355}
5356
e4654d2d 5357void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5358{
e4654d2d
FZ
5359 BdrvDirtyBitmap *bitmap;
5360 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5361 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5362 }
7cd1e32a 5363}
aaa0eb75 5364
/* Return the number of dirty sectors recorded in @bitmap. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
f88e1a42 5369
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    /* plain increment — NOTE(review): no locking here, presumably all
     * ref/unref happen in the main loop context; verify before use
     * from other threads */
    bs->refcnt++;
}
5375
/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}
5386
/* One reason why a category of operations is blocked on a BDS; a list
 * of these hangs off bs->op_blockers[op].  The Error is owned by the
 * code that installed the blocker. */
struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};
5391
5392bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5393{
5394 BdrvOpBlocker *blocker;
5395 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5396 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5397 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5398 if (errp) {
5399 error_setg(errp, "Device '%s' is busy: %s",
5400 bs->device_name, error_get_pretty(blocker->reason));
5401 }
5402 return true;
5403 }
5404 return false;
5405}
5406
5407void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5408{
5409 BdrvOpBlocker *blocker;
5410 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5411
5412 blocker = g_malloc0(sizeof(BdrvOpBlocker));
5413 blocker->reason = reason;
5414 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5415}
5416
5417void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5418{
5419 BdrvOpBlocker *blocker, *next;
5420 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5421 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5422 if (blocker->reason == reason) {
5423 QLIST_REMOVE(blocker, list);
5424 g_free(blocker);
5425 }
5426 }
5427}
5428
5429void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5430{
5431 int i;
5432 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5433 bdrv_op_block(bs, i, reason);
5434 }
5435}
5436
5437void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5438{
5439 int i;
5440 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5441 bdrv_op_unblock(bs, i, reason);
5442 }
5443}
5444
5445bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5446{
5447 int i;
5448
5449 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5450 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5451 return false;
5452 }
5453 }
5454 return true;
5455}
5456
/* Turn on I/O status tracking for @bs and reset the status to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5462
5463/* The I/O status is only enabled if the drive explicitly
5464 * enables it _and_ the VM is configured to stop on errors */
5465bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5466{
d6bf279e 5467 return (bs->iostatus_enabled &&
92aa5c6d
PB
5468 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5469 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5470 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5471}
5472
/* Stop tracking I/O status for @bs (the stored status is left as-is). */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5477
5478void bdrv_iostatus_reset(BlockDriverState *bs)
5479{
5480 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5481 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5482 if (bs->job) {
5483 block_job_iostatus_reset(bs->job);
5484 }
28a7282a
LC
5485 }
5486}
5487
28a7282a
LC
5488void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5489{
3e1caa5f
PB
5490 assert(bdrv_iostatus_is_enabled(bs));
5491 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5492 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5493 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5494 }
5495}
5496
a597e79c
CH
5497void
5498bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5499 enum BlockAcctType type)
5500{
5501 assert(type < BDRV_MAX_IOTYPE);
5502
5503 cookie->bytes = bytes;
c488c7f6 5504 cookie->start_time_ns = get_clock();
a597e79c
CH
5505 cookie->type = type;
5506}
5507
5508void
5509bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5510{
5511 assert(cookie->type < BDRV_MAX_IOTYPE);
5512
5513 bs->nr_bytes[cookie->type] += cookie->bytes;
5514 bs->nr_ops[cookie->type]++;
c488c7f6 5515 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
5516}
5517
/* Create a disk image.
 *
 * @filename/@fmt:        target image and its format driver.
 * @base_filename/@base_fmt: optional backing file and its format;
 *                        rejected when the format does not support them.
 * @options:              "-o" style option string, parsed against the
 *                        combined format+protocol create options.
 * @img_size:             image size; when left at its default (-1) and a
 *                        backing file is given, the size is taken from
 *                        the backing file instead.
 * @errp:                 set on any failure; nothing is returned.
 * @quiet:                suppress the "Formatting ..." console message.
 */
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    /* format and protocol create options are merged into one list */
    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    /* explicit -b/-F arguments override nothing: they fail if the
     * format has no such create option at all */
    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            /* NOTE(review): this inner 'size' shadows the outer
             * QEMUOptionParameter *size above — intentional but easy to
             * misread */
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            /* geometry is in 512-byte sectors; convert to bytes */
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    /* any error not already translated above is propagated verbatim */
    if (local_err) {
        error_propagate(errp, local_err);
    }
}
/* Return the AioContext that @bs runs in.
 *
 * Today this is always the main loop context; callers nevertheless
 * acquire/release it so they keep working once BDSes can be moved to
 * other threads' contexts. */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}
/* Register @notifier to be invoked before each write request on @bs. */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
6f176b48
HR
5676
5677int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5678{
5679 if (bs->drv->bdrv_amend_options == NULL) {
5680 return -ENOTSUP;
5681 }
5682 return bs->drv->bdrv_amend_options(bs, options);
5683}
f6186f49 5684
b5042a36
BC
5685/* This function will be called by the bdrv_recurse_is_first_non_filter method
5686 * of block filter and by bdrv_is_first_non_filter.
5687 * It is used to test if the given bs is the candidate or recurse more in the
5688 * node graph.
212a5a8f 5689 */
b5042a36 5690bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 5691 BlockDriverState *candidate)
f6186f49 5692{
b5042a36
BC
5693 /* return false if basic checks fails */
5694 if (!bs || !bs->drv) {
212a5a8f 5695 return false;
f6186f49
BC
5696 }
5697
b5042a36
BC
5698 /* the code reached a non block filter driver -> check if the bs is
5699 * the same as the candidate. It's the recursion termination condition.
5700 */
5701 if (!bs->drv->is_filter) {
5702 return bs == candidate;
212a5a8f 5703 }
b5042a36 5704 /* Down this path the driver is a block filter driver */
212a5a8f 5705
b5042a36
BC
5706 /* If the block filter recursion method is defined use it to recurse down
5707 * the node graph.
5708 */
5709 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 5710 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5711 }
5712
b5042a36
BC
5713 /* the driver is a block filter but don't allow to recurse -> return false
5714 */
5715 return false;
f6186f49
BC
5716}
5717
212a5a8f
BC
5718/* This function checks if the candidate is the first non filter bs down it's
5719 * bs chain. Since we don't have pointers to parents it explore all bs chains
5720 * from the top. Some filters can choose not to pass down the recursion.
5721 */
5722bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5723{
212a5a8f
BC
5724 BlockDriverState *bs;
5725
5726 /* walk down the bs forest recursively */
5727 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5728 bool perm;
5729
b5042a36 5730 /* try to recurse in this top level bs */
e6dc8a1f 5731 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5732
5733 /* candidate is the first non filter */
5734 if (perm) {
5735 return true;
5736 }
5737 }
5738
5739 return false;
f6186f49 5740}