]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
block: JSON filenames and relative backing files
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
737e150e
PB
27#include "block/block_int.h"
28#include "block/blockjob.h"
1de7afc9 29#include "qemu/module.h"
7b1b5d19 30#include "qapi/qmp/qjson.h"
bfb197e0 31#include "sysemu/block-backend.h"
9c17d615 32#include "sysemu/sysemu.h"
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
a5ee7bd4 38#include "qapi-event.h"
fc01f7e7 39
71e72a19 40#ifdef CONFIG_BSD
7674e7bf
FB
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <sys/ioctl.h>
72cf2d4f 44#include <sys/queue.h>
c5e97233 45#ifndef __DragonFly__
7674e7bf
FB
46#include <sys/disk.h>
47#endif
c5e97233 48#endif
7674e7bf 49
49dc768d
AL
50#ifdef _WIN32
51#include <windows.h>
52#endif
53
e4654d2d
FZ
54struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
57};
58
1c9805a3
SH
59#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60
7c84b1b8 61static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
f141eafe 62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 63 BlockCompletionFunc *cb, void *opaque);
7c84b1b8 64static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
f141eafe 65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 66 BlockCompletionFunc *cb, void *opaque);
f9f05dc5
KW
67static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
775aa8b6
KW
73static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 75 BdrvRequestFlags flags);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 78 BdrvRequestFlags flags);
7c84b1b8
MA
79static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
097310b5 84 BlockCompletionFunc *cb,
7c84b1b8
MA
85 void *opaque,
86 bool is_write);
b2a61371 87static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 88static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
dc364f4c
BC
94static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
8a22f02a
SH
97static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
103#ifdef _WIN32
/* Return non-zero if @filename begins with a drive-letter prefix ("c:", "Z:"). */
static int is_windows_drive_prefix(const char *filename)
{
    char first = filename[0];
    int is_letter = (first >= 'a' && first <= 'z') ||
                    (first >= 'A' && first <= 'Z');

    return is_letter && filename[1] == ':';
}
110
111int is_windows_drive(const char *filename)
112{
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120}
121#endif
122
0563e191 123/* throttling disk I/O limits */
cc0681c4
BC
124void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
98f90dba 126{
cc0681c4 127 int i;
98f90dba 128
cc0681c4 129 throttle_config(&bs->throttle_state, cfg);
98f90dba 130
cc0681c4
BC
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 133 }
cc0681c4
BC
134}
135
136/* this function drain all the throttled IOs */
137static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138{
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
142
143 bs->io_limits_enabled = false;
144
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
148 }
149 }
150
151 bs->io_limits_enabled = enabled;
98f90dba 152
cc0681c4 153 return drained;
98f90dba
ZYW
154}
155
cc0681c4 156void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 157{
cc0681c4 158 bs->io_limits_enabled = false;
0563e191 159
cc0681c4
BC
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
163}
164
cc0681c4 165static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 166{
cc0681c4
BC
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
169}
170
cc0681c4 171static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 172{
cc0681c4
BC
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
175}
176
cc0681c4
BC
177/* should be called before bdrv_set_io_limits if a limit is set */
178void bdrv_io_limits_enable(BlockDriverState *bs)
179{
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
13af91eb 182 bdrv_get_aio_context(bs),
cc0681c4
BC
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
188}
189
190/* This function makes an IO wait if needed
191 *
192 * @nb_sectors: the number of sectors of the IO
193 * @is_write: is the IO a write
194 */
98f90dba 195static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 196 unsigned int bytes,
cc0681c4 197 bool is_write)
98f90dba 198{
cc0681c4
BC
199 /* does this io must wait */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 201
cc0681c4
BC
202 /* if must wait or any request of this type throttled queue the IO */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
206 }
207
cc0681c4 208 /* the IO will be executed, do the accounting */
d5103588
KW
209 throttle_account(&bs->throttle_state, is_write, bytes);
210
98f90dba 211
cc0681c4
BC
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
98f90dba
ZYW
215 }
216
cc0681c4
BC
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
219}
220
339064d5
KW
221size_t bdrv_opt_mem_align(BlockDriverState *bs)
222{
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
226 }
227
228 return bs->bl.opt_mem_alignment;
229}
230
9e0b22f4 231/* check if the path starts with "<protocol>:" */
5c98415b 232int path_has_protocol(const char *path)
9e0b22f4 233{
947995c0
PB
234 const char *p;
235
9e0b22f4
SH
236#ifdef _WIN32
237 if (is_windows_drive(path) ||
238 is_windows_drive_prefix(path)) {
239 return 0;
240 }
947995c0
PB
241 p = path + strcspn(path, ":/\\");
242#else
243 p = path + strcspn(path, ":/");
9e0b22f4
SH
244#endif
245
947995c0 246 return *p == ':';
9e0b22f4
SH
247}
248
83f64091 249int path_is_absolute(const char *path)
3b0d4f61 250{
21664424
FB
251#ifdef _WIN32
252 /* specific case for names like: "\\.\d:" */
f53f4da9 253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
21664424 254 return 1;
f53f4da9
PB
255 }
256 return (*path == '/' || *path == '\\');
3b9f94e1 257#else
f53f4da9 258 return (*path == '/');
3b9f94e1 259#endif
3b0d4f61
FB
260}
261
83f64091
FB
/*
 * Combine @base_path and @filename into @dest (at most @dest_size bytes).
 *
 * If @filename is absolute it is simply copied; otherwise it is appended
 * to the directory part of @base_path. A protocol prefix ("proto:") in
 * @base_path is preserved. URLs are supported.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *dir_end, *last_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* start the directory prefix just after any "proto:" part */
    dir_end = strchr(base_path, ':');
    if (dir_end) {
        dir_end++;
    } else {
        dir_end = base_path;
    }

    /* find the last path separator in base_path */
    last_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!last_sep || backslash > last_sep) {
            last_sep = backslash;
        }
    }
#endif
    if (last_sep) {
        last_sep++;
    } else {
        last_sep = base_path;
    }

    /* keep whichever prefix is longer: "proto:" or ".../dir/" */
    if (last_sep > dir_end) {
        dir_end = last_sep;
    }

    prefix_len = dir_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
305
0a82855a
HR
306void bdrv_get_full_backing_filename_from_filename(const char *backed,
307 const char *backing,
9f07429e
HR
308 char *dest, size_t sz,
309 Error **errp)
dc5a1371 310{
9f07429e
HR
311 if (backing[0] == '\0' || path_has_protocol(backing) ||
312 path_is_absolute(backing))
313 {
0a82855a 314 pstrcpy(dest, sz, backing);
9f07429e
HR
315 } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
316 error_setg(errp, "Cannot use relative backing file names for '%s'",
317 backed);
dc5a1371 318 } else {
0a82855a 319 path_combine(dest, sz, backed, backing);
dc5a1371
PB
320 }
321}
322
9f07429e
HR
323void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
324 Error **errp)
0a82855a 325{
9f07429e
HR
326 char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;
327
328 bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
329 dest, sz, errp);
0a82855a
HR
330}
331
5efa9d5a 332void bdrv_register(BlockDriver *bdrv)
ea2384d3 333{
8c5873d6
SH
334 /* Block drivers without coroutine functions need emulation */
335 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
336 bdrv->bdrv_co_readv = bdrv_co_readv_em;
337 bdrv->bdrv_co_writev = bdrv_co_writev_em;
338
f8c35c1d
SH
339 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
340 * the block driver lacks aio we need to emulate that too.
341 */
f9f05dc5
KW
342 if (!bdrv->bdrv_aio_readv) {
343 /* add AIO emulation layer */
344 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
345 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 346 }
83f64091 347 }
b2e12bc6 348
8a22f02a 349 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 350}
b338082b 351
7f06d47e 352BlockDriverState *bdrv_new_root(void)
b338082b 353{
7f06d47e 354 BlockDriverState *bs = bdrv_new();
e4e9986b 355
e4e9986b 356 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
e4e9986b
MA
357 return bs;
358}
359
360BlockDriverState *bdrv_new(void)
361{
362 BlockDriverState *bs;
363 int i;
364
5839e53b 365 bs = g_new0(BlockDriverState, 1);
e4654d2d 366 QLIST_INIT(&bs->dirty_bitmaps);
fbe40ff7
FZ
367 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
368 QLIST_INIT(&bs->op_blockers[i]);
369 }
28a7282a 370 bdrv_iostatus_disable(bs);
d7d512f6 371 notifier_list_init(&bs->close_notifiers);
d616b224 372 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
373 qemu_co_queue_init(&bs->throttled_reqs[0]);
374 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 375 bs->refcnt = 1;
dcd04228 376 bs->aio_context = qemu_get_aio_context();
d7d512f6 377
b338082b
FB
378 return bs;
379}
380
d7d512f6
PB
381void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
382{
383 notifier_list_add(&bs->close_notifiers, notify);
384}
385
ea2384d3
FB
386BlockDriver *bdrv_find_format(const char *format_name)
387{
388 BlockDriver *drv1;
8a22f02a
SH
389 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
390 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 391 return drv1;
8a22f02a 392 }
ea2384d3
FB
393 }
394 return NULL;
395}
396
b64ec4e4 397static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 398{
b64ec4e4
FZ
399 static const char *whitelist_rw[] = {
400 CONFIG_BDRV_RW_WHITELIST
401 };
402 static const char *whitelist_ro[] = {
403 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
404 };
405 const char **p;
406
b64ec4e4 407 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 408 return 1; /* no whitelist, anything goes */
b64ec4e4 409 }
eb852011 410
b64ec4e4 411 for (p = whitelist_rw; *p; p++) {
eb852011
MA
412 if (!strcmp(drv->format_name, *p)) {
413 return 1;
414 }
415 }
b64ec4e4
FZ
416 if (read_only) {
417 for (p = whitelist_ro; *p; p++) {
418 if (!strcmp(drv->format_name, *p)) {
419 return 1;
420 }
421 }
422 }
eb852011
MA
423 return 0;
424}
425
b64ec4e4
FZ
426BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
427 bool read_only)
eb852011
MA
428{
429 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 430 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
431}
432
5b7e1542
ZYW
433typedef struct CreateCo {
434 BlockDriver *drv;
435 char *filename;
83d0521a 436 QemuOpts *opts;
5b7e1542 437 int ret;
cc84d90f 438 Error *err;
5b7e1542
ZYW
439} CreateCo;
440
441static void coroutine_fn bdrv_create_co_entry(void *opaque)
442{
cc84d90f
HR
443 Error *local_err = NULL;
444 int ret;
445
5b7e1542
ZYW
446 CreateCo *cco = opaque;
447 assert(cco->drv);
448
c282e1fd 449 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
84d18f06 450 if (local_err) {
cc84d90f
HR
451 error_propagate(&cco->err, local_err);
452 }
453 cco->ret = ret;
5b7e1542
ZYW
454}
455
0e7e1989 456int bdrv_create(BlockDriver *drv, const char* filename,
83d0521a 457 QemuOpts *opts, Error **errp)
ea2384d3 458{
5b7e1542
ZYW
459 int ret;
460
461 Coroutine *co;
462 CreateCo cco = {
463 .drv = drv,
464 .filename = g_strdup(filename),
83d0521a 465 .opts = opts,
5b7e1542 466 .ret = NOT_DONE,
cc84d90f 467 .err = NULL,
5b7e1542
ZYW
468 };
469
c282e1fd 470 if (!drv->bdrv_create) {
cc84d90f 471 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
472 ret = -ENOTSUP;
473 goto out;
5b7e1542
ZYW
474 }
475
476 if (qemu_in_coroutine()) {
477 /* Fast-path if already in coroutine context */
478 bdrv_create_co_entry(&cco);
479 } else {
480 co = qemu_coroutine_create(bdrv_create_co_entry);
481 qemu_coroutine_enter(co, &cco);
482 while (cco.ret == NOT_DONE) {
b47ec2c4 483 aio_poll(qemu_get_aio_context(), true);
5b7e1542
ZYW
484 }
485 }
486
487 ret = cco.ret;
cc84d90f 488 if (ret < 0) {
84d18f06 489 if (cco.err) {
cc84d90f
HR
490 error_propagate(errp, cco.err);
491 } else {
492 error_setg_errno(errp, -ret, "Could not create image");
493 }
494 }
0e7e1989 495
80168bff
LC
496out:
497 g_free(cco.filename);
5b7e1542 498 return ret;
ea2384d3
FB
499}
500
c282e1fd 501int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
84a12e66
CH
502{
503 BlockDriver *drv;
cc84d90f
HR
504 Error *local_err = NULL;
505 int ret;
84a12e66 506
98289620 507 drv = bdrv_find_protocol(filename, true);
84a12e66 508 if (drv == NULL) {
cc84d90f 509 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 510 return -ENOENT;
84a12e66
CH
511 }
512
c282e1fd 513 ret = bdrv_create(drv, filename, opts, &local_err);
84d18f06 514 if (local_err) {
cc84d90f
HR
515 error_propagate(errp, local_err);
516 }
517 return ret;
84a12e66
CH
518}
519
3baca891 520void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
d34682cd
KW
521{
522 BlockDriver *drv = bs->drv;
3baca891 523 Error *local_err = NULL;
d34682cd
KW
524
525 memset(&bs->bl, 0, sizeof(bs->bl));
526
466ad822 527 if (!drv) {
3baca891 528 return;
466ad822
KW
529 }
530
531 /* Take some limits from the children as a default */
532 if (bs->file) {
3baca891
KW
533 bdrv_refresh_limits(bs->file, &local_err);
534 if (local_err) {
535 error_propagate(errp, local_err);
536 return;
537 }
466ad822 538 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
2647fab5 539 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
339064d5
KW
540 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
541 } else {
542 bs->bl.opt_mem_alignment = 512;
466ad822
KW
543 }
544
545 if (bs->backing_hd) {
3baca891
KW
546 bdrv_refresh_limits(bs->backing_hd, &local_err);
547 if (local_err) {
548 error_propagate(errp, local_err);
549 return;
550 }
466ad822
KW
551 bs->bl.opt_transfer_length =
552 MAX(bs->bl.opt_transfer_length,
553 bs->backing_hd->bl.opt_transfer_length);
2647fab5
PL
554 bs->bl.max_transfer_length =
555 MIN_NON_ZERO(bs->bl.max_transfer_length,
556 bs->backing_hd->bl.max_transfer_length);
339064d5
KW
557 bs->bl.opt_mem_alignment =
558 MAX(bs->bl.opt_mem_alignment,
559 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
560 }
561
562 /* Then let the driver override it */
563 if (drv->bdrv_refresh_limits) {
3baca891 564 drv->bdrv_refresh_limits(bs, errp);
d34682cd 565 }
d34682cd
KW
566}
567
eba25057
JM
/*
 * Create a uniquely-named empty temporary file and store its name in
 * @filename (a buffer of @size bytes).
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    const char *tmpdir = getenv("TMPDIR");
    int fd;

    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    /* snprintf returns the would-be length; >= size means truncation */
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
fc01f7e7 603
84a12e66
CH
604/*
605 * Detect host devices. By convention, /dev/cdrom[N] is always
606 * recognized as a host CDROM.
607 */
608static BlockDriver *find_hdev_driver(const char *filename)
609{
610 int score_max = 0, score;
611 BlockDriver *drv = NULL, *d;
612
613 QLIST_FOREACH(d, &bdrv_drivers, list) {
614 if (d->bdrv_probe_device) {
615 score = d->bdrv_probe_device(filename);
616 if (score > score_max) {
617 score_max = score;
618 drv = d;
619 }
620 }
621 }
622
623 return drv;
624}
625
98289620
KW
626BlockDriver *bdrv_find_protocol(const char *filename,
627 bool allow_protocol_prefix)
83f64091
FB
628{
629 BlockDriver *drv1;
630 char protocol[128];
1cec71e3 631 int len;
83f64091 632 const char *p;
19cb3738 633
66f82cee
KW
634 /* TODO Drivers without bdrv_file_open must be specified explicitly */
635
39508e7a
CH
636 /*
637 * XXX(hch): we really should not let host device detection
638 * override an explicit protocol specification, but moving this
639 * later breaks access to device names with colons in them.
640 * Thanks to the brain-dead persistent naming schemes on udev-
641 * based Linux systems those actually are quite common.
642 */
643 drv1 = find_hdev_driver(filename);
644 if (drv1) {
645 return drv1;
646 }
647
98289620 648 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
ef810437 649 return &bdrv_file;
84a12e66 650 }
98289620 651
9e0b22f4
SH
652 p = strchr(filename, ':');
653 assert(p != NULL);
1cec71e3
AL
654 len = p - filename;
655 if (len > sizeof(protocol) - 1)
656 len = sizeof(protocol) - 1;
657 memcpy(protocol, filename, len);
658 protocol[len] = '\0';
8a22f02a 659 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 660 if (drv1->protocol_name &&
8a22f02a 661 !strcmp(drv1->protocol_name, protocol)) {
83f64091 662 return drv1;
8a22f02a 663 }
83f64091
FB
664 }
665 return NULL;
666}
667
c6684249
MA
668/*
669 * Guess image format by probing its contents.
670 * This is not a good idea when your image is raw (CVE-2008-2004), but
671 * we do it anyway for backward compatibility.
672 *
673 * @buf contains the image's first @buf_size bytes.
7cddd372
KW
674 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
675 * but can be smaller if the image file is smaller)
c6684249
MA
676 * @filename is its filename.
677 *
678 * For all block drivers, call the bdrv_probe() method to get its
679 * probing score.
680 * Return the first block driver with the highest probing score.
681 */
38f3ef57
KW
682BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
683 const char *filename)
c6684249
MA
684{
685 int score_max = 0, score;
686 BlockDriver *drv = NULL, *d;
687
688 QLIST_FOREACH(d, &bdrv_drivers, list) {
689 if (d->bdrv_probe) {
690 score = d->bdrv_probe(buf, buf_size, filename);
691 if (score > score_max) {
692 score_max = score;
693 drv = d;
694 }
695 }
696 }
697
698 return drv;
699}
700
f500a6d3 701static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 702 BlockDriver **pdrv, Error **errp)
f3a5d3f8 703{
c6684249 704 BlockDriver *drv;
7cddd372 705 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
f500a6d3 706 int ret = 0;
f8ea0b00 707
08a00559 708 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 709 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
ef810437 710 *pdrv = &bdrv_raw;
c98ac35d 711 return ret;
1a396859 712 }
f8ea0b00 713
83f64091 714 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 715 if (ret < 0) {
34b5d2c6
HR
716 error_setg_errno(errp, -ret, "Could not read image for determining its "
717 "format");
c98ac35d
SW
718 *pdrv = NULL;
719 return ret;
83f64091
FB
720 }
721
c6684249 722 drv = bdrv_probe_all(buf, ret, filename);
c98ac35d 723 if (!drv) {
34b5d2c6
HR
724 error_setg(errp, "Could not determine image format: No compatible "
725 "driver found");
c98ac35d
SW
726 ret = -ENOENT;
727 }
728 *pdrv = drv;
729 return ret;
ea2384d3
FB
730}
731
51762288
SH
732/**
733 * Set the current 'total_sectors' value
65a9bb25 734 * Return 0 on success, -errno on error.
51762288
SH
735 */
736static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
737{
738 BlockDriver *drv = bs->drv;
739
396759ad
NB
740 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
741 if (bs->sg)
742 return 0;
743
51762288
SH
744 /* query actual device if possible, otherwise just trust the hint */
745 if (drv->bdrv_getlength) {
746 int64_t length = drv->bdrv_getlength(bs);
747 if (length < 0) {
748 return length;
749 }
7e382003 750 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
751 }
752
753 bs->total_sectors = hint;
754 return 0;
755}
756
9e8f1835
PB
757/**
758 * Set open flags for a given discard mode
759 *
760 * Return 0 on success, -1 if the discard mode was invalid.
761 */
762int bdrv_parse_discard_flags(const char *mode, int *flags)
763{
764 *flags &= ~BDRV_O_UNMAP;
765
766 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
767 /* do nothing */
768 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
769 *flags |= BDRV_O_UNMAP;
770 } else {
771 return -1;
772 }
773
774 return 0;
775}
776
c3993cdc
SH
777/**
778 * Set open flags for a given cache mode
779 *
780 * Return 0 on success, -1 if the cache mode was invalid.
781 */
782int bdrv_parse_cache_flags(const char *mode, int *flags)
783{
784 *flags &= ~BDRV_O_CACHE_MASK;
785
786 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
787 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
788 } else if (!strcmp(mode, "directsync")) {
789 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
790 } else if (!strcmp(mode, "writeback")) {
791 *flags |= BDRV_O_CACHE_WB;
792 } else if (!strcmp(mode, "unsafe")) {
793 *flags |= BDRV_O_CACHE_WB;
794 *flags |= BDRV_O_NO_FLUSH;
795 } else if (!strcmp(mode, "writethrough")) {
796 /* this is the default */
797 } else {
798 return -1;
799 }
800
801 return 0;
802}
803
53fec9d3
SH
804/**
805 * The copy-on-read flag is actually a reference count so multiple users may
806 * use the feature without worrying about clobbering its previous state.
807 * Copy-on-read stays enabled until all users have called to disable it.
808 */
809void bdrv_enable_copy_on_read(BlockDriverState *bs)
810{
811 bs->copy_on_read++;
812}
813
814void bdrv_disable_copy_on_read(BlockDriverState *bs)
815{
816 assert(bs->copy_on_read > 0);
817 bs->copy_on_read--;
818}
819
b1e6fc08
KW
820/*
821 * Returns the flags that a temporary snapshot should get, based on the
822 * originally requested flags (the originally requested image will have flags
823 * like a backing file)
824 */
825static int bdrv_temp_snapshot_flags(int flags)
826{
827 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
828}
829
0b50cc88
KW
830/*
831 * Returns the flags that bs->file should get, based on the given flags for
832 * the parent BDS
833 */
834static int bdrv_inherited_flags(int flags)
835{
836 /* Enable protocol handling, disable format probing for bs->file */
837 flags |= BDRV_O_PROTOCOL;
838
839 /* Our block drivers take care to send flushes and respect unmap policy,
840 * so we can enable both unconditionally on lower layers. */
841 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
842
0b50cc88 843 /* Clear flags that only apply to the top layer */
5669b44d 844 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
845
846 return flags;
847}
848
317fc44e
KW
849/*
850 * Returns the flags that bs->backing_hd should get, based on the given flags
851 * for the parent BDS
852 */
853static int bdrv_backing_flags(int flags)
854{
855 /* backing files always opened read-only */
856 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
857
858 /* snapshot=on is handled on the top layer */
8bfea15d 859 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
860
861 return flags;
862}
863
7b272452
KW
864static int bdrv_open_flags(BlockDriverState *bs, int flags)
865{
866 int open_flags = flags | BDRV_O_CACHE_WB;
867
868 /*
869 * Clear flags that are internal to the block layer before opening the
870 * image.
871 */
20cca275 872 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
7b272452
KW
873
874 /*
875 * Snapshots should be writable.
876 */
8bfea15d 877 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
878 open_flags |= BDRV_O_RDWR;
879 }
880
881 return open_flags;
882}
883
636ea370
KW
884static void bdrv_assign_node_name(BlockDriverState *bs,
885 const char *node_name,
886 Error **errp)
6913c0c2
BC
887{
888 if (!node_name) {
636ea370 889 return;
6913c0c2
BC
890 }
891
9aebf3b8 892 /* Check for empty string or invalid characters */
f5bebbbb 893 if (!id_wellformed(node_name)) {
9aebf3b8 894 error_setg(errp, "Invalid node name");
636ea370 895 return;
6913c0c2
BC
896 }
897
0c5e94ee 898 /* takes care of avoiding namespaces collisions */
7f06d47e 899 if (blk_by_name(node_name)) {
0c5e94ee
BC
900 error_setg(errp, "node-name=%s is conflicting with a device id",
901 node_name);
636ea370 902 return;
0c5e94ee
BC
903 }
904
6913c0c2
BC
905 /* takes care of avoiding duplicates node names */
906 if (bdrv_find_node(node_name)) {
907 error_setg(errp, "Duplicate node name");
636ea370 908 return;
6913c0c2
BC
909 }
910
911 /* copy node name into the bs and insert it into the graph list */
912 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
913 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
914}
915
57915332
KW
916/*
917 * Common part for opening disk images and files
b6ad491a
KW
918 *
919 * Removes all processed options from *options.
57915332 920 */
f500a6d3 921static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
34b5d2c6 922 QDict *options, int flags, BlockDriver *drv, Error **errp)
57915332
KW
923{
924 int ret, open_flags;
035fccdf 925 const char *filename;
6913c0c2 926 const char *node_name = NULL;
34b5d2c6 927 Error *local_err = NULL;
57915332
KW
928
929 assert(drv != NULL);
6405875c 930 assert(bs->file == NULL);
707ff828 931 assert(options != NULL && bs->options != options);
57915332 932
45673671
KW
933 if (file != NULL) {
934 filename = file->filename;
935 } else {
936 filename = qdict_get_try_str(options, "filename");
937 }
938
765003db
KW
939 if (drv->bdrv_needs_filename && !filename) {
940 error_setg(errp, "The '%s' block driver requires a file name",
941 drv->format_name);
942 return -EINVAL;
943 }
944
45673671 945 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
28dcee10 946
6913c0c2 947 node_name = qdict_get_try_str(options, "node-name");
636ea370 948 bdrv_assign_node_name(bs, node_name, &local_err);
0fb6395c 949 if (local_err) {
636ea370
KW
950 error_propagate(errp, local_err);
951 return -EINVAL;
6913c0c2
BC
952 }
953 qdict_del(options, "node-name");
954
5d186eb0
KW
955 /* bdrv_open() with directly using a protocol as drv. This layer is already
956 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
957 * and return immediately. */
958 if (file != NULL && drv->bdrv_file_open) {
959 bdrv_swap(file, bs);
960 return 0;
961 }
962
57915332 963 bs->open_flags = flags;
1b7fd729 964 bs->guest_block_size = 512;
c25f53b0 965 bs->request_alignment = 512;
0d51b4de 966 bs->zero_beyond_eof = true;
b64ec4e4
FZ
967 open_flags = bdrv_open_flags(bs, flags);
968 bs->read_only = !(open_flags & BDRV_O_RDWR);
20cca275 969 bs->growable = !!(flags & BDRV_O_PROTOCOL);
b64ec4e4
FZ
970
971 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
8f94a6e4
KW
972 error_setg(errp,
973 !bs->read_only && bdrv_is_whitelisted(drv, true)
974 ? "Driver '%s' can only be used for read-only devices"
975 : "Driver '%s' is not whitelisted",
976 drv->format_name);
b64ec4e4
FZ
977 return -ENOTSUP;
978 }
57915332 979
53fec9d3 980 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
0ebd24e0
KW
981 if (flags & BDRV_O_COPY_ON_READ) {
982 if (!bs->read_only) {
983 bdrv_enable_copy_on_read(bs);
984 } else {
985 error_setg(errp, "Can't use copy-on-read on read-only device");
986 return -EINVAL;
987 }
53fec9d3
SH
988 }
989
c2ad1b0c
KW
990 if (filename != NULL) {
991 pstrcpy(bs->filename, sizeof(bs->filename), filename);
992 } else {
993 bs->filename[0] = '\0';
994 }
91af7014 995 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
57915332 996
57915332 997 bs->drv = drv;
7267c094 998 bs->opaque = g_malloc0(drv->instance_size);
57915332 999
03f541bd 1000 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e7c63796 1001
66f82cee
KW
1002 /* Open the image, either directly or using a protocol */
1003 if (drv->bdrv_file_open) {
5d186eb0 1004 assert(file == NULL);
030be321 1005 assert(!drv->bdrv_needs_filename || filename != NULL);
34b5d2c6 1006 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
f500a6d3 1007 } else {
2af5ef70 1008 if (file == NULL) {
34b5d2c6
HR
1009 error_setg(errp, "Can't use '%s' as a block driver for the "
1010 "protocol level", drv->format_name);
2af5ef70
KW
1011 ret = -EINVAL;
1012 goto free_and_fail;
1013 }
f500a6d3 1014 bs->file = file;
34b5d2c6 1015 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
66f82cee
KW
1016 }
1017
57915332 1018 if (ret < 0) {
84d18f06 1019 if (local_err) {
34b5d2c6 1020 error_propagate(errp, local_err);
2fa9aa59
DH
1021 } else if (bs->filename[0]) {
1022 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
34b5d2c6
HR
1023 } else {
1024 error_setg_errno(errp, -ret, "Could not open image");
1025 }
57915332
KW
1026 goto free_and_fail;
1027 }
1028
51762288
SH
1029 ret = refresh_total_sectors(bs, bs->total_sectors);
1030 if (ret < 0) {
34b5d2c6 1031 error_setg_errno(errp, -ret, "Could not refresh total sector count");
51762288 1032 goto free_and_fail;
57915332 1033 }
51762288 1034
3baca891
KW
1035 bdrv_refresh_limits(bs, &local_err);
1036 if (local_err) {
1037 error_propagate(errp, local_err);
1038 ret = -EINVAL;
1039 goto free_and_fail;
1040 }
1041
c25f53b0 1042 assert(bdrv_opt_mem_align(bs) != 0);
47ea2de2 1043 assert((bs->request_alignment != 0) || bs->sg);
57915332
KW
1044 return 0;
1045
1046free_and_fail:
f500a6d3 1047 bs->file = NULL;
7267c094 1048 g_free(bs->opaque);
57915332
KW
1049 bs->opaque = NULL;
1050 bs->drv = NULL;
1051 return ret;
1052}
1053
5e5c4f63
KW
/*
 * Parses a "json:..." pseudo-protocol filename into a flattened options QDict.
 *
 * @filename must begin with the "json:" prefix; the remainder is parsed as
 * JSON and must be a JSON object. On success returns a new QDict (the caller
 * owns the reference); on failure sets @errp and returns NULL.
 */
static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    /* Callers only invoke this for "json:" filenames; strip the prefix so
     * that only the JSON payload remains. */
    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    /* Only a JSON object (dictionary) is meaningful as a set of options */
    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    /* Collapse nested dicts/lists into dotted keys; the block layer expects
     * a flat options dictionary */
    qdict_flatten(options);

    return options;
}
1080
b6ce07aa 1081/*
f54120ff
KW
1082 * Fills in default options for opening images and converts the legacy
1083 * filename/flags pair to option QDict entries.
b6ce07aa 1084 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol: merge its contents into *options and drop
     * the filename, since all information is now carried by the QDict */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            /* Remember that the filename came from the legacy argument, so
             * the driver may interpret it below */
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        /* An explicit driver argument and a "driver" option are exclusive */
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                /* Probe the protocol driver from the filename prefix */
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    /* For the protocol level the driver must be resolved by now; format
     * probing happens later in bdrv_open() */
    assert(drv || !protocol);

    /* Driver-specific filename parsing (only for legacy-style filenames) */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* The parsed result fully replaces the filename unless the driver
         * still needs the raw string */
        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
1173
8d24cce1
FZ
/*
 * Replaces the backing file of @bs with @backing_hd (which may be NULL to
 * detach it), updating bs->backing_file/backing_format and the op blockers
 * on the backing BDS. Does not take or drop references; callers manage the
 * refcount of the old and new backing BDS.
 */
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{

    if (bs->backing_hd) {
        /* Release the blockers installed on the previous backing file */
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        /* First backing file: create the blocker error used for blocking */
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        /* Detached: drop the blocker and clear the filename fields below
         * is skipped intentionally; only limits are refreshed */
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}
1204
31ca6d07
KW
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling
 * bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    /* Nothing to do if a backing file is already attached */
    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        /* The filename comes from the options QDict instead */
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* No backing file configured at all: silently succeed */
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
                                       &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            QDECREF(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    /* Use the recorded backing format unless the caller chose a driver */
    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        /* Remember the failure so we don't retry on every access */
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
1279
da557aac
HR
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options
 * QDict. That QDict has to be flattened; therefore, if the BlockdevRef is a
 * QDict itself, all options starting with "${bdref_key}." are considered
 * part of the BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    /* Pull all "${bdref_key}." sub-options out of @options */
    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        /* Nothing specifies an image at all */
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        /* image_options is unused on this path; release it */
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    /* The BlockdevRef key is consumed in any case */
    qdict_del(options, bdref_key);
    return ret;
}
1331
6b8aeca5 1332int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1333{
1334 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1335 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d 1336 int64_t total_size;
83d0521a 1337 QemuOpts *opts = NULL;
b998875d
KW
1338 QDict *snapshot_options;
1339 BlockDriverState *bs_snapshot;
1340 Error *local_err;
1341 int ret;
1342
1343 /* if snapshot, we create a temporary backing file and open it
1344 instead of opening 'filename' directly */
1345
1346 /* Get the required size from the image */
f187743a
KW
1347 total_size = bdrv_getlength(bs);
1348 if (total_size < 0) {
6b8aeca5 1349 ret = total_size;
f187743a 1350 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1351 goto out;
f187743a 1352 }
b998875d
KW
1353
1354 /* Create the temporary image */
1ba4b6a5 1355 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1356 if (ret < 0) {
1357 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1358 goto out;
b998875d
KW
1359 }
1360
ef810437 1361 opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
c282e1fd 1362 &error_abort);
83d0521a 1363 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
ef810437 1364 ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
83d0521a 1365 qemu_opts_del(opts);
b998875d
KW
1366 if (ret < 0) {
1367 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1368 "'%s': %s", tmp_filename,
1369 error_get_pretty(local_err));
1370 error_free(local_err);
1ba4b6a5 1371 goto out;
b998875d
KW
1372 }
1373
1374 /* Prepare a new options QDict for the temporary file */
1375 snapshot_options = qdict_new();
1376 qdict_put(snapshot_options, "file.driver",
1377 qstring_from_str("file"));
1378 qdict_put(snapshot_options, "file.filename",
1379 qstring_from_str(tmp_filename));
1380
e4e9986b 1381 bs_snapshot = bdrv_new();
b998875d
KW
1382
1383 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
ef810437 1384 flags, &bdrv_qcow2, &local_err);
b998875d
KW
1385 if (ret < 0) {
1386 error_propagate(errp, local_err);
1ba4b6a5 1387 goto out;
b998875d
KW
1388 }
1389
1390 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1391
1392out:
1393 g_free(tmp_filename);
6b8aeca5 1394 return ret;
b998875d
KW
1395}
1396
b6ce07aa
KW
1397/*
1398 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1399 *
1400 * options is a QDict of options to pass to the block drivers, or NULL for an
1401 * empty set of options. The reference to the QDict belongs to the block layer
1402 * after the call (even on failure), so if the caller intends to reuse the
1403 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1404 *
1405 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1406 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1407 *
1408 * The reference parameter may be used to specify an existing block device which
1409 * should be opened. If specified, neither options nor a filename may be given,
1410 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1411 */
ddf5636d
HR
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    /* A reference names an existing BDS; no new device is opened */
    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    /* Reuse the caller-supplied BDS or create a fresh one */
    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* Normalize legacy filename/flags/drv into the options QDict */
    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    /* bs->options keeps the original dict; work on a shallow copy so
     * consumed keys can be deleted without losing the originals */
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            /* Remember the snapshot flags for the overlay created later */
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* bdrv_open_common() may have swapped file into bs->file */
    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
1617
e971aa12
JC
/* One entry of a BlockReopenQueue: the staged reopen state for a single BDS */
typedef struct BlockReopenQueueEntry {
    bool prepared;         /* true once bdrv_reopen_prepare() has succeeded */
    BDRVReopenState state; /* staged new flags and driver-private state */
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
1623
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
1642BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1643 BlockDriverState *bs, int flags)
1644{
1645 assert(bs != NULL);
1646
1647 BlockReopenQueueEntry *bs_entry;
1648 if (bs_queue == NULL) {
1649 bs_queue = g_new0(BlockReopenQueue, 1);
1650 QSIMPLEQ_INIT(bs_queue);
1651 }
1652
f1f25a2e
KW
1653 /* bdrv_open() masks this flag out */
1654 flags &= ~BDRV_O_PROTOCOL;
1655
e971aa12 1656 if (bs->file) {
f1f25a2e 1657 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1658 }
1659
1660 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1661 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1662
1663 bs_entry->state.bs = bs;
1664 bs_entry->state.flags = flags;
1665
1666 return bs_queue;
1667}
1668
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    /* No requests may be in flight while flags change */
    bdrv_drain_all();

    /* Phase 1: prepare every entry; stop at the first failure */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    /* On failure, abort only the entries that were prepared; free all */
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
1721
1722
1723/* Reopen a single BlockDriverState with the specified flags. */
1724int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1725{
1726 int ret = -1;
1727 Error *local_err = NULL;
1728 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1729
1730 ret = bdrv_reopen_multiple(queue, &local_err);
1731 if (local_err != NULL) {
1732 error_propagate(errp, local_err);
1733 }
1734 return ret;
1735}
1736
1737
1738/*
1739 * Prepares a BlockDriverState for reopen. All changes are staged in the
1740 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1741 * the block driver layer .bdrv_reopen_prepare()
1742 *
1743 * bs is the BlockDriverState to reopen
1744 * flags are the new open flags
1745 * queue is the reopen queue
1746 *
1747 * Returns 0 on success, non-zero on error. On error errp will be set
1748 * as well.
1749 *
1750 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1751 * It is the responsibility of the caller to then call the abort() or
1752 * commit() for any other BDS that have been left in a prepare() state
1753 *
1754 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }


    /* Flush cached data before the flags (e.g. cache mode) change */
    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                /* Driver failed without giving a reason: report generically */
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
1809
1810/*
1811 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1812 * makes them final by swapping the staging BlockDriverState contents into
1813 * the active BlockDriverState contents.
1814 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    /* Limits may depend on the new flags (e.g. cache mode) */
    bdrv_refresh_limits(reopen_state->bs, NULL);
}
1836
1837/*
1838 * Abort the reopen, and delete and free the staged changes in
1839 * reopen_state
1840 */
1841void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1842{
1843 BlockDriver *drv;
1844
1845 assert(reopen_state != NULL);
1846 drv = reopen_state->bs->drv;
1847 assert(drv != NULL);
1848
1849 if (drv->bdrv_reopen_abort) {
1850 drv->bdrv_reopen_abort(reopen_state);
1851 }
1852}
1853
1854
fc01f7e7
FB
/*
 * Closes @bs: cancels its job, drains and flushes pending I/O, notifies the
 * close notifiers, tears down the backing chain and driver state, and resets
 * the BDS fields so it can be reopened later. Does not free @bs itself.
 */
void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            /* Detach first so the blocker bookkeeping runs, then drop our
             * reference */
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        /* Reset per-open state so a later reopen starts clean */
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}
1911
2bc93fed
MK
1912void bdrv_close_all(void)
1913{
1914 BlockDriverState *bs;
1915
dc364f4c 1916 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1917 AioContext *aio_context = bdrv_get_aio_context(bs);
1918
1919 aio_context_acquire(aio_context);
2bc93fed 1920 bdrv_close(bs);
ed78cda3 1921 aio_context_release(aio_context);
2bc93fed
MK
1922 }
1923}
1924
88266f5a
SH
1925/* Check if any requests are in-flight (including throttled requests) */
1926static bool bdrv_requests_pending(BlockDriverState *bs)
1927{
1928 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1929 return true;
1930 }
cc0681c4
BC
1931 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1932 return true;
1933 }
1934 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1935 return true;
1936 }
1937 if (bs->file && bdrv_requests_pending(bs->file)) {
1938 return true;
1939 }
1940 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1941 return true;
1942 }
1943 return false;
1944}
1945
5b98db0a
SH
1946static bool bdrv_drain_one(BlockDriverState *bs)
1947{
1948 bool bs_busy;
1949
1950 bdrv_flush_io_queue(bs);
1951 bdrv_start_throttled_reqs(bs);
1952 bs_busy = bdrv_requests_pending(bs);
1953 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1954 return bs_busy;
1955}
1956
1957/*
1958 * Wait for pending requests to complete on a single BlockDriverState subtree
1959 *
1960 * See the warning in bdrv_drain_all(). This function can only be called if
1961 * you are sure nothing can generate I/O because you have op blockers
1962 * installed.
1963 *
1964 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
1965 * AioContext.
1966 */
1967void bdrv_drain(BlockDriverState *bs)
1968{
1969 while (bdrv_drain_one(bs)) {
1970 /* Keep iterating */
1971 }
1972}
1973
922453bc
SH
1974/*
1975 * Wait for pending requests to complete across all BlockDriverStates
1976 *
1977 * This function does not flush data to disk, use bdrv_flush_all() for that
1978 * after calling this function.
4c355d53
ZYW
1979 *
1980 * Note that completion of an asynchronous I/O operation can trigger any
1981 * number of other I/O operations on other devices---for example a coroutine
1982 * can be arbitrarily complex and a constant flow of I/O can come until the
1983 * coroutine is complete. Because of this, it is not possible to have a
1984 * function to drain a single device's I/O queue.
922453bc
SH
1985 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    /* Keep sweeping all devices until one full pass finds no pending work */
    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            /* Each BDS must be drained under its own AioContext lock */
            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }
}
2004
dc364f4c
BC
/* make a BlockDriverState anonymous by removing from bdrv_state and
 * graph_bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it. Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    /* A non-empty node_name means the BDS was inserted into
     * graph_bdrv_states; remove it and clear the name so a second call
     * is a no-op. */
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}
2026
e023b2e2
PB
2027static void bdrv_rebind(BlockDriverState *bs)
2028{
2029 if (bs->drv && bs->drv->bdrv_rebind) {
2030 bs->drv->bdrv_rebind(bs);
2031 }
2032}
2033
4ddc07ca
PB
/* Copy from @bs_src to @bs_dest the fields that must stay attached to the
 * guest device rather than follow the image contents in bdrv_swap(). */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    /* the BlockBackend (if any) stays with the device side */
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
8802d1fd 2077
4ddc07ca
PB
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* Whole-struct swap: after this, device-side fields are on the wrong
     * BDS and are moved back below. */
    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    /* let the drivers fix up any pointers into the swapped structs */
    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
2140
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    /* After the swap, bs_top holds the new image's contents; the old top
     * (now in bs_new) becomes its backing file. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
2160
/* Free a BlockDriverState.  The caller must have dropped every reference
 * (refcnt == 0) and detached jobs, op blockers and dirty bitmaps. */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
2175
e97fc193
AL
2176/*
2177 * Run consistency checks on an image
2178 *
e076f338 2179 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2180 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2181 * check are stored in res.
e97fc193 2182 */
4534ff54 2183int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193 2184{
908bcd54
HR
2185 if (bs->drv == NULL) {
2186 return -ENOMEDIUM;
2187 }
e97fc193
AL
2188 if (bs->drv->bdrv_check == NULL) {
2189 return -ENOTSUP;
2190 }
2191
e076f338 2192 memset(res, 0, sizeof(*res));
4534ff54 2193 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2194}
2195
8a426614
KW
/* Sectors copied per iteration of the bdrv_commit() loop (1 MiB with
 * 512-byte sectors). */
#define COMMIT_BUF_SECTORS 2048
2197
33e3963e
FB
/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    /* Refuse to commit while an op blocker owns either image */
    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    /* NOTE(review): filename is saved but not referenced again in this
     * function as visible here — confirm whether it is still needed. */
    open_flags = bs->backing_hd->open_flags;

    /* Temporarily reopen a read-only backing file read-write */
    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    /* Copy every range allocated in the top image down into the backing
     * file, COMMIT_BUF_SECTORS at a time. */
    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    /* Drop the now-committed data from the top image if the driver can */
    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
2308
e8877497 2309int bdrv_commit_all(void)
6ab4b5ab
MA
2310{
2311 BlockDriverState *bs;
2312
dc364f4c 2313 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
2314 AioContext *aio_context = bdrv_get_aio_context(bs);
2315
2316 aio_context_acquire(aio_context);
272d2d8e
JC
2317 if (bs->drv && bs->backing_hd) {
2318 int ret = bdrv_commit(bs);
2319 if (ret < 0) {
ed78cda3 2320 aio_context_release(aio_context);
272d2d8e
JC
2321 return ret;
2322 }
e8877497 2323 }
ed78cda3 2324 aio_context_release(aio_context);
6ab4b5ab 2325 }
e8877497 2326 return 0;
6ab4b5ab
MA
2327}
2328
dbffbdcf
SH
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    /* A serialising request counted itself in serialising_in_flight on
     * mark_request_serialising(); undo that here. */
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    /* Wake everyone blocked on this request in wait_serialising_requests() */
    qemu_co_queue_restart_all(&req->wait_queue);
}
2343
/**
 * Add an active request to the tracked requests list
 *
 * @req is initialised in place: the overlap range starts out equal to the
 * request itself and may be widened later by mark_request_serialising().
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
2367
/* Mark @req as serialising and widen its overlap range to @align-aligned
 * boundaries.  Overlapping requests will then block on it in
 * wait_serialising_requests(). */
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    /* @align is assumed to be a power of two (mask arithmetic below) */
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    /* Only ever grow the overlap range, never shrink it */
    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
2382
d83947ac
SH
2383/**
2384 * Round a region to cluster boundaries
2385 */
343bded4
PB
2386void bdrv_round_to_clusters(BlockDriverState *bs,
2387 int64_t sector_num, int nb_sectors,
2388 int64_t *cluster_sector_num,
2389 int *cluster_nb_sectors)
d83947ac
SH
2390{
2391 BlockDriverInfo bdi;
2392
2393 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2394 *cluster_sector_num = sector_num;
2395 *cluster_nb_sectors = nb_sectors;
2396 } else {
2397 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2398 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2399 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2400 nb_sectors, c);
2401 }
2402}
2403
7327145f 2404static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2405{
2406 BlockDriverInfo bdi;
7327145f 2407 int ret;
793ed47a 2408
7327145f
KW
2409 ret = bdrv_get_info(bs, &bdi);
2410 if (ret < 0 || bdi.cluster_size == 0) {
2411 return bs->request_alignment;
793ed47a 2412 } else {
7327145f 2413 return bdi.cluster_size;
793ed47a
KW
2414 }
2415}
2416
f4658285 2417static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2418 int64_t offset, unsigned int bytes)
2419{
d83947ac 2420 /* aaaa bbbb */
7327145f 2421 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2422 return false;
2423 }
2424 /* bbbb aaaa */
7327145f 2425 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2426 return false;
2427 }
2428 return true;
f4658285
SH
2429}
2430
/* Block until no tracked request overlapping @self remains in flight.
 *
 * Returns true if this coroutine actually had to wait (so other requests
 * may have run in the meantime), false if it could continue immediately.
 */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: no serialising request anywhere on this BDS */
    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* Only pairs where at least one side is serialising matter */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    /* The list may have changed while we slept: rescan */
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2474
756e6736
KW
2475/*
2476 * Return values:
2477 * 0 - success
2478 * -EINVAL - backing format specified, but no file
2479 * -ENOSPC - can't update the backing file because no space is left in the
2480 * image file header
2481 * -ENOTSUP - format driver doesn't support changing the backing file
2482 */
2483int bdrv_change_backing_file(BlockDriverState *bs,
2484 const char *backing_file, const char *backing_fmt)
2485{
2486 BlockDriver *drv = bs->drv;
469ef350 2487 int ret;
756e6736 2488
5f377794
PB
2489 /* Backing file format doesn't make sense without a backing file */
2490 if (backing_fmt && !backing_file) {
2491 return -EINVAL;
2492 }
2493
756e6736 2494 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2495 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2496 } else {
469ef350 2497 ret = -ENOTSUP;
756e6736 2498 }
469ef350
PB
2499
2500 if (ret == 0) {
2501 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2502 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2503 }
2504 return ret;
756e6736
KW
2505}
2506
6ebdcee2
JC
2507/*
2508 * Finds the image layer in the chain that has 'bs' as its backing file.
2509 *
2510 * active is the current topmost image.
2511 *
2512 * Returns NULL if bs is not found in active's image chain,
2513 * or if active == bs.
4caf0fcd
JC
2514 *
2515 * Returns the bottommost base image if bs == NULL.
6ebdcee2
JC
2516 */
2517BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2518 BlockDriverState *bs)
2519{
4caf0fcd
JC
2520 while (active && bs != active->backing_hd) {
2521 active = active->backing_hd;
6ebdcee2
JC
2522 }
2523
4caf0fcd
JC
2524 return active;
2525}
6ebdcee2 2526
4caf0fcd
JC
/* Given a BDS, searches for the base layer. */
/* (bdrv_find_overlay() with bs == NULL returns the bottommost image.) */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    return bdrv_find_overlay(bs, NULL);
}
2532
/* Queue entry used by bdrv_drop_intermediate() to remember the layers that
 * are to be dropped from the chain. */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;
2537
2538
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base, const char *backing_file_str)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;   /* default error for all the "chain is broken" cases */

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_new0(BlkIntermediateStates, 1);
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    /* Update the overlay's metadata first; only touch the graph if that
     * succeeded. */
    backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
    ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    /* The queue entries themselves are freed on every path */
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2642
2643
71d0770c
AL
2644static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2645 size_t size)
2646{
2647 int64_t len;
2648
1dd3a447
KW
2649 if (size > INT_MAX) {
2650 return -EIO;
2651 }
2652
71d0770c
AL
2653 if (!bdrv_is_inserted(bs))
2654 return -ENOMEDIUM;
2655
2656 if (bs->growable)
2657 return 0;
2658
2659 len = bdrv_getlength(bs);
2660
fbb7b4e0
KW
2661 if (offset < 0)
2662 return -EIO;
2663
2664 if ((offset > len) || (len - offset < size))
71d0770c
AL
2665 return -EIO;
2666
2667 return 0;
2668}
2669
/* Validate a sector-granularity request by delegating to the byte-based
 * check.  Returns 0 if acceptable, a negative errno otherwise. */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    /* Reject negative counts and byte sizes that would overflow an int */
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2680
1c9805a3
SH
/* Parameter bundle for bdrv_rw_co_entry(): carries the request into the
 * coroutine and the result (ret stays NOT_DONE until completion) out. */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;
2689
2690static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2691{
1c9805a3 2692 RwCo *rwco = opaque;
ea2384d3 2693
1c9805a3 2694 if (!rwco->is_write) {
775aa8b6
KW
2695 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2696 rwco->qiov->size, rwco->qiov,
4105eaaa 2697 rwco->flags);
775aa8b6
KW
2698 } else {
2699 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2700 rwco->qiov->size, rwco->qiov,
2701 rwco->flags);
1c9805a3
SH
2702 }
2703}
e7a8a783 2704
/*
 * Process a vectored synchronous request using coroutines
 *
 * Runs the request in a coroutine (or directly, if already in coroutine
 * context) and polls the AioContext until it completes.
 */
static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* Busy-wait on the AioContext until the coroutine finishes */
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }
    return rwco.ret;
}
b338082b 2747
8d3b1a2d
KW
/*
 * Process a synchronous request using coroutines
 *
 * Wraps a single linear buffer into a QEMUIOVector and delegates to
 * bdrv_prwv_co().
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    /* Reject negative counts and byte sizes that would overflow an int */
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}
2768
1c9805a3
SH
/* return < 0 if error. See bdrv_write() for the return codes */
/* Synchronous sector read, implemented on top of the coroutine path. */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}
2775
07d27a44
MA
2776/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2777int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2778 uint8_t *buf, int nb_sectors)
2779{
2780 bool enabled;
2781 int ret;
2782
2783 enabled = bs->io_limits_enabled;
2784 bs->io_limits_enabled = false;
4e7395e8 2785 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2786 bs->io_limits_enabled = enabled;
2787 return ret;
2788}
2789
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
/* Synchronous sector write, implemented on top of the coroutine path. */
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}
2801
aa7bfbff
PL
/* Synchronously write zeroes to a sector range; @flags are passed through
 * in addition to BDRV_REQ_ZERO_WRITE.  Return codes as for bdrv_write(). */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}
2808
d75cbb5e
PL
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = target_sectors - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        /* Clamp the chunk to what the int-based APIs below can express */
        if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
            nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            /* Range already reads back as zeroes: skip it */
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
2854
/* Synchronous byte-granularity read.  Returns @bytes on success or a
 * negative errno on failure. */
int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };
    int ret;

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return bytes;
}
2876
8d3b1a2d 2877int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2878{
9a8c4cce 2879 int ret;
83f64091 2880
8407d5d7
KW
2881 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2882 if (ret < 0) {
2883 return ret;
83f64091
FB
2884 }
2885
8d3b1a2d
KW
2886 return qiov->size;
2887}
2888
/* Synchronous byte-granularity write.  Returns @bytes on success
 * (via bdrv_pwritev()) or a negative errno on failure. */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(bs, offset, &qiov);
}
83f64091 2905
f08145fe
KW
2906/*
2907 * Writes to the file and ensures that no writes are reordered across this
2908 * request (acts as a barrier)
2909 *
2910 * Returns 0 on success, -errno in error cases.
2911 */
2912int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2913 const void *buf, int count)
2914{
2915 int ret;
2916
2917 ret = bdrv_pwrite(bs, offset, buf, count);
2918 if (ret < 0) {
2919 return ret;
2920 }
2921
f05fa4ad
PB
2922 /* No flush needed for cache modes that already do it */
2923 if (bs->enable_write_cache) {
f08145fe
KW
2924 bdrv_flush(bs);
2925 }
2926
2927 return 0;
2928}
2929
/*
 * Copy-on-read implementation: read whole clusters through a bounce buffer
 * and write them back into the image so the data becomes allocated in this
 * layer, then hand the requested sub-range to the caller.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        /* Whole cluster is zero: record that cheaply rather than writing
         * the bounce buffer out */
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests. If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Extract the sub-range the caller actually asked for */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
3000
c5fbe571 3001/*
d0c7f642
KW
3002 * Forwards an already correctly aligned request to the BlockDriver. This
3003 * handles copy on read and zeroing after EOF; any other features must be
3004 * implemented by the caller.
c5fbe571 3005 */
d0c7f642 3006static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 3007 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 3008 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
3009{
3010 BlockDriver *drv = bs->drv;
dbffbdcf 3011 int ret;
da1fa91d 3012
d0c7f642
KW
3013 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3014 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 3015
d0c7f642
KW
3016 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3017 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3018 assert(!qiov || bytes == qiov->size);
d0c7f642
KW
3019
3020 /* Handle Copy on Read and associated serialisation */
470c0504 3021 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
3022 /* If we touch the same cluster it counts as an overlap. This
3023 * guarantees that allocating writes will be serialized and not race
3024 * with each other for the same cluster. For example, in copy-on-read
3025 * it ensures that the CoR read and write operations are atomic and
3026 * guest writes cannot interleave between them. */
3027 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
3028 }
3029
2dbafdc0 3030 wait_serialising_requests(req);
f4658285 3031
470c0504 3032 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
3033 int pnum;
3034
bdad13b9 3035 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
3036 if (ret < 0) {
3037 goto out;
3038 }
3039
3040 if (!ret || pnum != nb_sectors) {
470c0504 3041 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
3042 goto out;
3043 }
3044 }
3045
d0c7f642 3046 /* Forward the request to the BlockDriver */
893a8f62
MK
3047 if (!(bs->zero_beyond_eof && bs->growable)) {
3048 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3049 } else {
3050 /* Read zeros after EOF of growable BDSes */
4049082c 3051 int64_t total_sectors, max_nb_sectors;
893a8f62 3052
4049082c
MA
3053 total_sectors = bdrv_nb_sectors(bs);
3054 if (total_sectors < 0) {
3055 ret = total_sectors;
893a8f62
MK
3056 goto out;
3057 }
3058
5f5bcd80
KW
3059 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3060 align >> BDRV_SECTOR_BITS);
e012b78c
PB
3061 if (nb_sectors < max_nb_sectors) {
3062 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3063 } else if (max_nb_sectors > 0) {
33f461e0 3064 QEMUIOVector local_qiov;
33f461e0
KW
3065
3066 qemu_iovec_init(&local_qiov, qiov->niov);
3067 qemu_iovec_concat(&local_qiov, qiov, 0,
e012b78c 3068 max_nb_sectors * BDRV_SECTOR_SIZE);
33f461e0 3069
e012b78c 3070 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
33f461e0
KW
3071 &local_qiov);
3072
3073 qemu_iovec_destroy(&local_qiov);
893a8f62
MK
3074 } else {
3075 ret = 0;
3076 }
3077
3078 /* Reading beyond end of file is supposed to produce zeroes */
3079 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3080 uint64_t offset = MAX(0, total_sectors - sector_num);
3081 uint64_t bytes = (sector_num + nb_sectors - offset) *
3082 BDRV_SECTOR_SIZE;
3083 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3084 }
3085 }
ab185921
SH
3086
3087out:
dbffbdcf 3088 return ret;
da1fa91d
KW
3089}
3090
d0c7f642
KW
3091/*
3092 * Handle a read request in coroutine context
3093 */
1b0288ae
KW
3094static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3095 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
d0c7f642
KW
3096 BdrvRequestFlags flags)
3097{
3098 BlockDriver *drv = bs->drv;
65afd211
KW
3099 BdrvTrackedRequest req;
3100
1b0288ae
KW
3101 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3102 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3103 uint8_t *head_buf = NULL;
3104 uint8_t *tail_buf = NULL;
3105 QEMUIOVector local_qiov;
3106 bool use_local_qiov = false;
d0c7f642
KW
3107 int ret;
3108
3109 if (!drv) {
3110 return -ENOMEDIUM;
3111 }
1b0288ae 3112 if (bdrv_check_byte_request(bs, offset, bytes)) {
d0c7f642
KW
3113 return -EIO;
3114 }
3115
3116 if (bs->copy_on_read) {
3117 flags |= BDRV_REQ_COPY_ON_READ;
3118 }
3119
3120 /* throttling disk I/O */
3121 if (bs->io_limits_enabled) {
d5103588 3122 bdrv_io_limits_intercept(bs, bytes, false);
1b0288ae
KW
3123 }
3124
3125 /* Align read if necessary by padding qiov */
3126 if (offset & (align - 1)) {
3127 head_buf = qemu_blockalign(bs, align);
3128 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3129 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3130 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3131 use_local_qiov = true;
3132
3133 bytes += offset & (align - 1);
3134 offset = offset & ~(align - 1);
3135 }
3136
3137 if ((offset + bytes) & (align - 1)) {
3138 if (!use_local_qiov) {
3139 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3140 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3141 use_local_qiov = true;
3142 }
3143 tail_buf = qemu_blockalign(bs, align);
3144 qemu_iovec_add(&local_qiov, tail_buf,
3145 align - ((offset + bytes) & (align - 1)));
3146
3147 bytes = ROUND_UP(bytes, align);
3148 }
3149
65afd211 3150 tracked_request_begin(&req, bs, offset, bytes, false);
ec746e10 3151 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1b0288ae
KW
3152 use_local_qiov ? &local_qiov : qiov,
3153 flags);
65afd211 3154 tracked_request_end(&req);
1b0288ae
KW
3155
3156 if (use_local_qiov) {
3157 qemu_iovec_destroy(&local_qiov);
3158 qemu_vfree(head_buf);
3159 qemu_vfree(tail_buf);
d0c7f642
KW
3160 }
3161
d0c7f642
KW
3162 return ret;
3163}
3164
1b0288ae
KW
3165static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3166 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3167 BdrvRequestFlags flags)
3168{
3169 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3170 return -EINVAL;
3171 }
3172
3173 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3174 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3175}
3176
c5fbe571 3177int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3178 int nb_sectors, QEMUIOVector *qiov)
3179{
c5fbe571 3180 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3181
470c0504
SH
3182 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3183}
3184
3185int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3186 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3187{
3188 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3189
3190 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3191 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3192}
3193
c31cb707
PL
3194/* if no limit is specified in the BlockLimits use a default
3195 * of 32768 512-byte sectors (16 MiB) per request.
3196 */
3197#define MAX_WRITE_ZEROES_DEFAULT 32768
3198
f08f2dda 3199static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 3200 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
f08f2dda
SH
3201{
3202 BlockDriver *drv = bs->drv;
3203 QEMUIOVector qiov;
c31cb707
PL
3204 struct iovec iov = {0};
3205 int ret = 0;
f08f2dda 3206
c31cb707
PL
3207 int max_write_zeroes = bs->bl.max_write_zeroes ?
3208 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
621f0589 3209
c31cb707
PL
3210 while (nb_sectors > 0 && !ret) {
3211 int num = nb_sectors;
3212
b8d71c09
PB
3213 /* Align request. Block drivers can expect the "bulk" of the request
3214 * to be aligned.
3215 */
3216 if (bs->bl.write_zeroes_alignment
3217 && num > bs->bl.write_zeroes_alignment) {
3218 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3219 /* Make a small request up to the first aligned sector. */
c31cb707 3220 num = bs->bl.write_zeroes_alignment;
b8d71c09
PB
3221 num -= sector_num % bs->bl.write_zeroes_alignment;
3222 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3223 /* Shorten the request to the last aligned sector. num cannot
3224 * underflow because num > bs->bl.write_zeroes_alignment.
3225 */
3226 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
c31cb707 3227 }
621f0589 3228 }
f08f2dda 3229
c31cb707
PL
3230 /* limit request size */
3231 if (num > max_write_zeroes) {
3232 num = max_write_zeroes;
3233 }
3234
3235 ret = -ENOTSUP;
3236 /* First try the efficient write zeroes operation */
3237 if (drv->bdrv_co_write_zeroes) {
3238 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3239 }
3240
3241 if (ret == -ENOTSUP) {
3242 /* Fall back to bounce buffer if write zeroes is unsupported */
3243 iov.iov_len = num * BDRV_SECTOR_SIZE;
3244 if (iov.iov_base == NULL) {
857d4f46
KW
3245 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3246 if (iov.iov_base == NULL) {
3247 ret = -ENOMEM;
3248 goto fail;
3249 }
b8d71c09 3250 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
c31cb707
PL
3251 }
3252 qemu_iovec_init_external(&qiov, &iov, 1);
f08f2dda 3253
c31cb707 3254 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
b8d71c09
PB
3255
3256 /* Keep bounce buffer around if it is big enough for all
3257 * all future requests.
3258 */
3259 if (num < max_write_zeroes) {
3260 qemu_vfree(iov.iov_base);
3261 iov.iov_base = NULL;
3262 }
c31cb707
PL
3263 }
3264
3265 sector_num += num;
3266 nb_sectors -= num;
3267 }
f08f2dda 3268
857d4f46 3269fail:
f08f2dda
SH
3270 qemu_vfree(iov.iov_base);
3271 return ret;
3272}
3273
c5fbe571 3274/*
b404f720 3275 * Forwards an already correctly aligned write request to the BlockDriver.
c5fbe571 3276 */
b404f720 3277static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
65afd211
KW
3278 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3279 QEMUIOVector *qiov, int flags)
c5fbe571
SH
3280{
3281 BlockDriver *drv = bs->drv;
28de2dcd 3282 bool waited;
6b7cb247 3283 int ret;
da1fa91d 3284
b404f720
KW
3285 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3286 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
f4658285 3287
b404f720
KW
3288 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3289 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3290 assert(!qiov || bytes == qiov->size);
cc0681c4 3291
28de2dcd
KW
3292 waited = wait_serialising_requests(req);
3293 assert(!waited || !req->serialising);
af91f9a7
KW
3294 assert(req->overlap_offset <= offset);
3295 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
244eadef 3296
65afd211 3297 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
d616b224 3298
465bee1d
PL
3299 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3300 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3301 qemu_iovec_is_zero(qiov)) {
3302 flags |= BDRV_REQ_ZERO_WRITE;
3303 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3304 flags |= BDRV_REQ_MAY_UNMAP;
3305 }
3306 }
3307
d616b224
SH
3308 if (ret < 0) {
3309 /* Do nothing, write notifier decided to fail this request */
3310 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9e1cb96d 3311 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
aa7bfbff 3312 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3313 } else {
9e1cb96d 3314 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
f08f2dda
SH
3315 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3316 }
9e1cb96d 3317 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
6b7cb247 3318
f05fa4ad
PB
3319 if (ret == 0 && !bs->enable_write_cache) {
3320 ret = bdrv_co_flush(bs);
3321 }
3322
e4654d2d 3323 bdrv_set_dirty(bs, sector_num, nb_sectors);
da1fa91d 3324
5366d0c8 3325 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
5e5a94b6 3326
df2a6f29
PB
3327 if (bs->growable && ret >= 0) {
3328 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3329 }
da1fa91d 3330
6b7cb247 3331 return ret;
da1fa91d
KW
3332}
3333
b404f720
KW
3334/*
3335 * Handle a write request in coroutine context
3336 */
6601553e
KW
3337static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3338 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
b404f720
KW
3339 BdrvRequestFlags flags)
3340{
65afd211 3341 BdrvTrackedRequest req;
3b8242e0
KW
3342 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3343 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3344 uint8_t *head_buf = NULL;
3345 uint8_t *tail_buf = NULL;
3346 QEMUIOVector local_qiov;
3347 bool use_local_qiov = false;
b404f720
KW
3348 int ret;
3349
3350 if (!bs->drv) {
3351 return -ENOMEDIUM;
3352 }
3353 if (bs->read_only) {
3354 return -EACCES;
3355 }
6601553e 3356 if (bdrv_check_byte_request(bs, offset, bytes)) {
b404f720
KW
3357 return -EIO;
3358 }
3359
b404f720
KW
3360 /* throttling disk I/O */
3361 if (bs->io_limits_enabled) {
d5103588 3362 bdrv_io_limits_intercept(bs, bytes, true);
b404f720
KW
3363 }
3364
3b8242e0
KW
3365 /*
3366 * Align write if necessary by performing a read-modify-write cycle.
3367 * Pad qiov with the read parts and be sure to have a tracked request not
3368 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3369 */
65afd211 3370 tracked_request_begin(&req, bs, offset, bytes, true);
3b8242e0
KW
3371
3372 if (offset & (align - 1)) {
3373 QEMUIOVector head_qiov;
3374 struct iovec head_iov;
3375
3376 mark_request_serialising(&req, align);
3377 wait_serialising_requests(&req);
3378
3379 head_buf = qemu_blockalign(bs, align);
3380 head_iov = (struct iovec) {
3381 .iov_base = head_buf,
3382 .iov_len = align,
3383 };
3384 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3385
9e1cb96d 3386 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3b8242e0
KW
3387 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3388 align, &head_qiov, 0);
3389 if (ret < 0) {
3390 goto fail;
3391 }
9e1cb96d 3392 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3b8242e0
KW
3393
3394 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3395 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3396 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3397 use_local_qiov = true;
3398
3399 bytes += offset & (align - 1);
3400 offset = offset & ~(align - 1);
3401 }
3402
3403 if ((offset + bytes) & (align - 1)) {
3404 QEMUIOVector tail_qiov;
3405 struct iovec tail_iov;
3406 size_t tail_bytes;
28de2dcd 3407 bool waited;
3b8242e0
KW
3408
3409 mark_request_serialising(&req, align);
28de2dcd
KW
3410 waited = wait_serialising_requests(&req);
3411 assert(!waited || !use_local_qiov);
3b8242e0
KW
3412
3413 tail_buf = qemu_blockalign(bs, align);
3414 tail_iov = (struct iovec) {
3415 .iov_base = tail_buf,
3416 .iov_len = align,
3417 };
3418 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3419
9e1cb96d 3420 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3b8242e0
KW
3421 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3422 align, &tail_qiov, 0);
3423 if (ret < 0) {
3424 goto fail;
3425 }
9e1cb96d 3426 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3b8242e0
KW
3427
3428 if (!use_local_qiov) {
3429 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3430 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3431 use_local_qiov = true;
3432 }
3433
3434 tail_bytes = (offset + bytes) & (align - 1);
3435 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3436
3437 bytes = ROUND_UP(bytes, align);
3438 }
3439
3440 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3441 use_local_qiov ? &local_qiov : qiov,
3442 flags);
3443
3444fail:
65afd211 3445 tracked_request_end(&req);
b404f720 3446
3b8242e0
KW
3447 if (use_local_qiov) {
3448 qemu_iovec_destroy(&local_qiov);
3b8242e0 3449 }
99c4a85c
KW
3450 qemu_vfree(head_buf);
3451 qemu_vfree(tail_buf);
3b8242e0 3452
b404f720
KW
3453 return ret;
3454}
3455
6601553e
KW
3456static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3457 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3458 BdrvRequestFlags flags)
3459{
3460 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3461 return -EINVAL;
3462 }
3463
3464 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3465 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3466}
3467
c5fbe571
SH
3468int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3469 int nb_sectors, QEMUIOVector *qiov)
3470{
3471 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3472
f08f2dda
SH
3473 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3474}
3475
3476int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3477 int64_t sector_num, int nb_sectors,
3478 BdrvRequestFlags flags)
f08f2dda 3479{
94d6ff21 3480 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3481
d32f35cb
PL
3482 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3483 flags &= ~BDRV_REQ_MAY_UNMAP;
3484 }
3485
f08f2dda 3486 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3487 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3488}
3489
83f64091
FB
3490/**
3491 * Truncate file to 'offset' bytes (needed only for file protocols)
3492 */
3493int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3494{
3495 BlockDriver *drv = bs->drv;
51762288 3496 int ret;
83f64091 3497 if (!drv)
19cb3738 3498 return -ENOMEDIUM;
83f64091
FB
3499 if (!drv->bdrv_truncate)
3500 return -ENOTSUP;
59f2689d
NS
3501 if (bs->read_only)
3502 return -EACCES;
9c75e168 3503
51762288
SH
3504 ret = drv->bdrv_truncate(bs, offset);
3505 if (ret == 0) {
3506 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
a7f53e26
MA
3507 if (bs->blk) {
3508 blk_dev_resize_cb(bs->blk);
3509 }
51762288
SH
3510 }
3511 return ret;
83f64091
FB
3512}
3513
4a1d5e1f
FZ
3514/**
3515 * Length of a allocated file in bytes. Sparse files are counted by actual
3516 * allocated space. Return < 0 if error or unknown.
3517 */
3518int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3519{
3520 BlockDriver *drv = bs->drv;
3521 if (!drv) {
3522 return -ENOMEDIUM;
3523 }
3524 if (drv->bdrv_get_allocated_file_size) {
3525 return drv->bdrv_get_allocated_file_size(bs);
3526 }
3527 if (bs->file) {
3528 return bdrv_get_allocated_file_size(bs->file);
3529 }
3530 return -ENOTSUP;
3531}
3532
83f64091 3533/**
65a9bb25 3534 * Return number of sectors on success, -errno on error.
83f64091 3535 */
65a9bb25 3536int64_t bdrv_nb_sectors(BlockDriverState *bs)
83f64091
FB
3537{
3538 BlockDriver *drv = bs->drv;
65a9bb25 3539
83f64091 3540 if (!drv)
19cb3738 3541 return -ENOMEDIUM;
51762288 3542
b94a2610
KW
3543 if (drv->has_variable_length) {
3544 int ret = refresh_total_sectors(bs, bs->total_sectors);
3545 if (ret < 0) {
3546 return ret;
46a4e4e6 3547 }
83f64091 3548 }
65a9bb25
MA
3549 return bs->total_sectors;
3550}
3551
3552/**
3553 * Return length in bytes on success, -errno on error.
3554 * The length is always a multiple of BDRV_SECTOR_SIZE.
3555 */
3556int64_t bdrv_getlength(BlockDriverState *bs)
3557{
3558 int64_t ret = bdrv_nb_sectors(bs);
3559
3560 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3561}
3562
19cb3738 3563/* return 0 as number of sectors if no device present or error */
96b8f136 3564void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3565{
65a9bb25
MA
3566 int64_t nb_sectors = bdrv_nb_sectors(bs);
3567
3568 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
fc01f7e7 3569}
cf98951b 3570
ff06f5f3
PB
3571void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3572 BlockdevOnError on_write_error)
abd7f68d
MA
3573{
3574 bs->on_read_error = on_read_error;
3575 bs->on_write_error = on_write_error;
3576}
3577
1ceee0d5 3578BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3579{
3580 return is_read ? bs->on_read_error : bs->on_write_error;
3581}
3582
3e1caa5f
PB
3583BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3584{
3585 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3586
3587 switch (on_err) {
3588 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3589 return (error == ENOSPC) ?
3590 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3591 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3592 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3593 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3594 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3595 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3596 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3597 default:
3598 abort();
3599 }
3600}
3601
c7c2ff0c
LC
3602static void send_qmp_error_event(BlockDriverState *bs,
3603 BlockErrorAction action,
3604 bool is_read, int error)
3605{
573742a5 3606 IoOperationType optype;
c7c2ff0c 3607
573742a5
PM
3608 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3609 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
c7c2ff0c 3610 bdrv_iostatus_is_enabled(bs),
624ff573
LC
3611 error == ENOSPC, strerror(error),
3612 &error_abort);
c7c2ff0c
LC
3613}
3614
3e1caa5f
PB
3615/* This is done by device models because, while the block layer knows
3616 * about the error, it does not know whether an operation comes from
3617 * the device or the block layer (from a job, for example).
3618 */
3619void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3620 bool is_read, int error)
3621{
3622 assert(error >= 0);
2bd3bce8 3623
a589569f 3624 if (action == BLOCK_ERROR_ACTION_STOP) {
2bd3bce8
PB
3625 /* First set the iostatus, so that "info block" returns an iostatus
3626 * that matches the events raised so far (an additional error iostatus
3627 * is fine, but not a lost one).
3628 */
3e1caa5f 3629 bdrv_iostatus_set_err(bs, error);
2bd3bce8
PB
3630
3631 /* Then raise the request to stop the VM and the event.
3632 * qemu_system_vmstop_request_prepare has two effects. First,
3633 * it ensures that the STOP event always comes after the
3634 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3635 * can observe the STOP event and do a "cont" before the STOP
3636 * event is issued, the VM will not stop. In this case, vm_start()
3637 * also ensures that the STOP/RESUME pair of events is emitted.
3638 */
3639 qemu_system_vmstop_request_prepare();
c7c2ff0c 3640 send_qmp_error_event(bs, action, is_read, error);
2bd3bce8
PB
3641 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3642 } else {
c7c2ff0c 3643 send_qmp_error_event(bs, action, is_read, error);
3e1caa5f
PB
3644 }
3645}
3646
b338082b
FB
3647int bdrv_is_read_only(BlockDriverState *bs)
3648{
3649 return bs->read_only;
3650}
3651
985a03b0
TS
3652int bdrv_is_sg(BlockDriverState *bs)
3653{
3654 return bs->sg;
3655}
3656
e900a7b7
CH
3657int bdrv_enable_write_cache(BlockDriverState *bs)
3658{
3659 return bs->enable_write_cache;
3660}
3661
425b0148
PB
3662void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3663{
3664 bs->enable_write_cache = wce;
55b110f2
JC
3665
3666 /* so a reopen() will preserve wce */
3667 if (wce) {
3668 bs->open_flags |= BDRV_O_CACHE_WB;
3669 } else {
3670 bs->open_flags &= ~BDRV_O_CACHE_WB;
3671 }
425b0148
PB
3672}
3673
ea2384d3
FB
3674int bdrv_is_encrypted(BlockDriverState *bs)
3675{
3676 if (bs->backing_hd && bs->backing_hd->encrypted)
3677 return 1;
3678 return bs->encrypted;
3679}
3680
c0f4ce77
AL
3681int bdrv_key_required(BlockDriverState *bs)
3682{
3683 BlockDriverState *backing_hd = bs->backing_hd;
3684
3685 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3686 return 1;
3687 return (bs->encrypted && !bs->valid_key);
3688}
3689
ea2384d3
FB
3690int bdrv_set_key(BlockDriverState *bs, const char *key)
3691{
3692 int ret;
3693 if (bs->backing_hd && bs->backing_hd->encrypted) {
3694 ret = bdrv_set_key(bs->backing_hd, key);
3695 if (ret < 0)
3696 return ret;
3697 if (!bs->encrypted)
3698 return 0;
3699 }
fd04a2ae
SH
3700 if (!bs->encrypted) {
3701 return -EINVAL;
3702 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3703 return -ENOMEDIUM;
3704 }
c0f4ce77 3705 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3706 if (ret < 0) {
3707 bs->valid_key = 0;
3708 } else if (!bs->valid_key) {
3709 bs->valid_key = 1;
a7f53e26
MA
3710 if (bs->blk) {
3711 /* call the change callback now, we skipped it on open */
3712 blk_dev_change_media_cb(bs->blk, true);
3713 }
bb5fc20f 3714 }
c0f4ce77 3715 return ret;
ea2384d3
FB
3716}
3717
f8d6bba1 3718const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3719{
f8d6bba1 3720 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3721}
3722
/* qsort() comparator that treats both elements as C strings. */
static int qsort_strcmp(const void *lhs, const void *rhs)
{
    return strcmp(lhs, rhs);
}
3727
5fafdf24 3728void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3729 void *opaque)
3730{
3731 BlockDriver *drv;
e855e4fb 3732 int count = 0;
ada42401 3733 int i;
e855e4fb 3734 const char **formats = NULL;
ea2384d3 3735
8a22f02a 3736 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3737 if (drv->format_name) {
3738 bool found = false;
3739 int i = count;
3740 while (formats && i && !found) {
3741 found = !strcmp(formats[--i], drv->format_name);
3742 }
3743
3744 if (!found) {
5839e53b 3745 formats = g_renew(const char *, formats, count + 1);
e855e4fb 3746 formats[count++] = drv->format_name;
e855e4fb
JC
3747 }
3748 }
ea2384d3 3749 }
ada42401
SH
3750
3751 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3752
3753 for (i = 0; i < count; i++) {
3754 it(opaque, formats[i]);
3755 }
3756
e855e4fb 3757 g_free(formats);
ea2384d3
FB
3758}
3759
dc364f4c 3760/* This function is to find block backend bs */
7f06d47e 3761/* TODO convert callers to blk_by_name(), then remove */
b338082b
FB
3762BlockDriverState *bdrv_find(const char *name)
3763{
7f06d47e 3764 BlockBackend *blk = blk_by_name(name);
b338082b 3765
7f06d47e 3766 return blk ? blk_bs(blk) : NULL;
b338082b
FB
3767}
3768
dc364f4c
BC
3769/* This function is to find a node in the bs graph */
3770BlockDriverState *bdrv_find_node(const char *node_name)
3771{
3772 BlockDriverState *bs;
3773
3774 assert(node_name);
3775
3776 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3777 if (!strcmp(node_name, bs->node_name)) {
3778 return bs;
3779 }
3780 }
3781 return NULL;
3782}
3783
c13163fb
BC
3784/* Put this QMP function here so it can access the static graph_bdrv_states. */
3785BlockDeviceInfoList *bdrv_named_nodes_list(void)
3786{
3787 BlockDeviceInfoList *list, *entry;
3788 BlockDriverState *bs;
3789
3790 list = NULL;
3791 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3792 entry = g_malloc0(sizeof(*entry));
3793 entry->value = bdrv_block_device_info(bs);
3794 entry->next = list;
3795 list = entry;
3796 }
3797
3798 return list;
3799}
3800
12d3ba82
BC
3801BlockDriverState *bdrv_lookup_bs(const char *device,
3802 const char *node_name,
3803 Error **errp)
3804{
7f06d47e
MA
3805 BlockBackend *blk;
3806 BlockDriverState *bs;
12d3ba82 3807
12d3ba82 3808 if (device) {
7f06d47e 3809 blk = blk_by_name(device);
12d3ba82 3810
7f06d47e
MA
3811 if (blk) {
3812 return blk_bs(blk);
12d3ba82 3813 }
12d3ba82
BC
3814 }
3815
dd67fa50
BC
3816 if (node_name) {
3817 bs = bdrv_find_node(node_name);
12d3ba82 3818
dd67fa50
BC
3819 if (bs) {
3820 return bs;
3821 }
12d3ba82
BC
3822 }
3823
dd67fa50
BC
3824 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3825 device ? device : "",
3826 node_name ? node_name : "");
3827 return NULL;
12d3ba82
BC
3828}
3829
5a6684d2
JC
3830/* If 'base' is in the same chain as 'top', return true. Otherwise,
3831 * return false. If either argument is NULL, return false. */
3832bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3833{
3834 while (top && top != base) {
3835 top = top->backing_hd;
3836 }
3837
3838 return top != NULL;
3839}
3840
04df765a
FZ
3841BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3842{
3843 if (!bs) {
3844 return QTAILQ_FIRST(&graph_bdrv_states);
3845 }
3846 return QTAILQ_NEXT(bs, node_list);
3847}
3848
2f399b0a
MA
3849BlockDriverState *bdrv_next(BlockDriverState *bs)
3850{
3851 if (!bs) {
3852 return QTAILQ_FIRST(&bdrv_states);
3853 }
dc364f4c 3854 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3855}
3856
20a9e77d
FZ
3857const char *bdrv_get_node_name(const BlockDriverState *bs)
3858{
3859 return bs->node_name;
3860}
3861
7f06d47e 3862/* TODO check what callers really want: bs->node_name or blk_name() */
bfb197e0 3863const char *bdrv_get_device_name(const BlockDriverState *bs)
ea2384d3 3864{
bfb197e0 3865 return bs->blk ? blk_name(bs->blk) : "";
ea2384d3
FB
3866}
3867
c8433287
MA
3868int bdrv_get_flags(BlockDriverState *bs)
3869{
3870 return bs->open_flags;
3871}
3872
f0f0fdfe 3873int bdrv_flush_all(void)
c6ca28d6
AL
3874{
3875 BlockDriverState *bs;
f0f0fdfe 3876 int result = 0;
c6ca28d6 3877
dc364f4c 3878 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
3879 AioContext *aio_context = bdrv_get_aio_context(bs);
3880 int ret;
3881
3882 aio_context_acquire(aio_context);
3883 ret = bdrv_flush(bs);
f0f0fdfe
KW
3884 if (ret < 0 && !result) {
3885 result = ret;
3886 }
ed78cda3 3887 aio_context_release(aio_context);
1b7bdbc1 3888 }
f0f0fdfe
KW
3889
3890 return result;
c6ca28d6
AL
3891}
3892
3ac21627
PL
3893int bdrv_has_zero_init_1(BlockDriverState *bs)
3894{
3895 return 1;
3896}
3897
f2feebbd
KW
3898int bdrv_has_zero_init(BlockDriverState *bs)
3899{
3900 assert(bs->drv);
3901
11212d8f
PB
3902 /* If BS is a copy on write image, it is initialized to
3903 the contents of the base image, which may not be zeroes. */
3904 if (bs->backing_hd) {
3905 return 0;
3906 }
336c1c12
KW
3907 if (bs->drv->bdrv_has_zero_init) {
3908 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3909 }
3910
3ac21627
PL
3911 /* safe default */
3912 return 0;
f2feebbd
KW
3913}
3914
4ce78691
PL
3915bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3916{
3917 BlockDriverInfo bdi;
3918
3919 if (bs->backing_hd) {
3920 return false;
3921 }
3922
3923 if (bdrv_get_info(bs, &bdi) == 0) {
3924 return bdi.unallocated_blocks_are_zero;
3925 }
3926
3927 return false;
3928}
3929
3930bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3931{
3932 BlockDriverInfo bdi;
3933
3934 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3935 return false;
3936 }
3937
3938 if (bdrv_get_info(bs, &bdi) == 0) {
3939 return bdi.can_write_zeroes_with_unmap;
3940 }
3941
3942 return false;
3943}
3944
b6b8a333 3945typedef struct BdrvCoGetBlockStatusData {
376ae3f1 3946 BlockDriverState *bs;
b35b2bba 3947 BlockDriverState *base;
376ae3f1
SH
3948 int64_t sector_num;
3949 int nb_sectors;
3950 int *pnum;
b6b8a333 3951 int64_t ret;
376ae3f1 3952 bool done;
b6b8a333 3953} BdrvCoGetBlockStatusData;
376ae3f1 3954
f58c7b35 3955/*
705be728
FZ
3956 * Returns the allocation status of the specified sectors.
3957 * Drivers not implementing the functionality are assumed to not support
3958 * backing files, hence all their sectors are reported as allocated.
f58c7b35 3959 *
bd9533e3
SH
3960 * If 'sector_num' is beyond the end of the disk image the return value is 0
3961 * and 'pnum' is set to 0.
3962 *
f58c7b35
TS
3963 * 'pnum' is set to the number of sectors (including and immediately following
3964 * the specified sector) that are known to be in the same
3965 * allocated/unallocated state.
3966 *
bd9533e3
SH
3967 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3968 * beyond the end of the disk image it will be clamped.
f58c7b35 3969 */
b6b8a333
PB
3970static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3971 int64_t sector_num,
3972 int nb_sectors, int *pnum)
f58c7b35 3973{
30a7f2fc 3974 int64_t total_sectors;
bd9533e3 3975 int64_t n;
5daa74a6 3976 int64_t ret, ret2;
bd9533e3 3977
30a7f2fc
MA
3978 total_sectors = bdrv_nb_sectors(bs);
3979 if (total_sectors < 0) {
3980 return total_sectors;
617ccb46
PB
3981 }
3982
30a7f2fc 3983 if (sector_num >= total_sectors) {
bd9533e3
SH
3984 *pnum = 0;
3985 return 0;
3986 }
3987
30a7f2fc 3988 n = total_sectors - sector_num;
bd9533e3
SH
3989 if (n < nb_sectors) {
3990 nb_sectors = n;
3991 }
3992
b6b8a333 3993 if (!bs->drv->bdrv_co_get_block_status) {
bd9533e3 3994 *pnum = nb_sectors;
e88ae226 3995 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
918e92d7
PB
3996 if (bs->drv->protocol_name) {
3997 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3998 }
3999 return ret;
f58c7b35 4000 }
6aebab14 4001
415b5b01
PB
4002 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4003 if (ret < 0) {
3e0a233d 4004 *pnum = 0;
415b5b01
PB
4005 return ret;
4006 }
4007
92bc50a5
PL
4008 if (ret & BDRV_BLOCK_RAW) {
4009 assert(ret & BDRV_BLOCK_OFFSET_VALID);
4010 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4011 *pnum, pnum);
4012 }
4013
e88ae226
KW
4014 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4015 ret |= BDRV_BLOCK_ALLOCATED;
4016 }
4017
c3d86884
PL
4018 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4019 if (bdrv_unallocated_blocks_are_zero(bs)) {
f0ad5712 4020 ret |= BDRV_BLOCK_ZERO;
1f9db224 4021 } else if (bs->backing_hd) {
f0ad5712 4022 BlockDriverState *bs2 = bs->backing_hd;
30a7f2fc
MA
4023 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4024 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
f0ad5712
PB
4025 ret |= BDRV_BLOCK_ZERO;
4026 }
4027 }
415b5b01 4028 }
5daa74a6
PB
4029
4030 if (bs->file &&
4031 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4032 (ret & BDRV_BLOCK_OFFSET_VALID)) {
59c9a95f
HR
4033 int file_pnum;
4034
5daa74a6 4035 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
59c9a95f 4036 *pnum, &file_pnum);
5daa74a6
PB
4037 if (ret2 >= 0) {
4038 /* Ignore errors. This is just providing extra information, it
4039 * is useful but not necessary.
4040 */
59c9a95f
HR
4041 if (!file_pnum) {
4042 /* !file_pnum indicates an offset at or beyond the EOF; it is
4043 * perfectly valid for the format block driver to point to such
4044 * offsets, so catch it and mark everything as zero */
4045 ret |= BDRV_BLOCK_ZERO;
4046 } else {
4047 /* Limit request to the range reported by the protocol driver */
4048 *pnum = file_pnum;
4049 ret |= (ret2 & BDRV_BLOCK_ZERO);
4050 }
5daa74a6
PB
4051 }
4052 }
4053
415b5b01 4054 return ret;
060f51c9
SH
4055}
4056
b6b8a333
PB
4057/* Coroutine wrapper for bdrv_get_block_status() */
4058static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4059{
b6b8a333 4060 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4061 BlockDriverState *bs = data->bs;
4062
b6b8a333
PB
4063 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4064 data->pnum);
060f51c9
SH
4065 data->done = true;
4066}
4067
4068/*
b6b8a333 4069 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 4070 *
b6b8a333 4071 * See bdrv_co_get_block_status() for details.
060f51c9 4072 */
b6b8a333
PB
4073int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4074 int nb_sectors, int *pnum)
060f51c9 4075{
6aebab14 4076 Coroutine *co;
b6b8a333 4077 BdrvCoGetBlockStatusData data = {
6aebab14
SH
4078 .bs = bs,
4079 .sector_num = sector_num,
4080 .nb_sectors = nb_sectors,
4081 .pnum = pnum,
4082 .done = false,
4083 };
4084
bdad13b9
PB
4085 if (qemu_in_coroutine()) {
4086 /* Fast-path if already in coroutine context */
b6b8a333 4087 bdrv_get_block_status_co_entry(&data);
bdad13b9 4088 } else {
2572b37a
SH
4089 AioContext *aio_context = bdrv_get_aio_context(bs);
4090
b6b8a333 4091 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
4092 qemu_coroutine_enter(co, &data);
4093 while (!data.done) {
2572b37a 4094 aio_poll(aio_context, true);
bdad13b9 4095 }
6aebab14
SH
4096 }
4097 return data.ret;
f58c7b35
TS
4098}
4099
b6b8a333
PB
4100int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4101 int nb_sectors, int *pnum)
4102{
4333bb71
PB
4103 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4104 if (ret < 0) {
4105 return ret;
4106 }
01fb2705 4107 return !!(ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4108}
4109
188a7bbf
PB
4110/*
4111 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4112 *
4113 * Return true if the given sector is allocated in any image between
4114 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4115 * sector is allocated in any image of the chain. Return false otherwise.
4116 *
4117 * 'pnum' is set to the number of sectors (including and immediately following
4118 * the specified sector) that are known to be in the same
4119 * allocated/unallocated state.
4120 *
4121 */
4f578637
PB
4122int bdrv_is_allocated_above(BlockDriverState *top,
4123 BlockDriverState *base,
4124 int64_t sector_num,
4125 int nb_sectors, int *pnum)
188a7bbf
PB
4126{
4127 BlockDriverState *intermediate;
4128 int ret, n = nb_sectors;
4129
4130 intermediate = top;
4131 while (intermediate && intermediate != base) {
4132 int pnum_inter;
bdad13b9
PB
4133 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4134 &pnum_inter);
188a7bbf
PB
4135 if (ret < 0) {
4136 return ret;
4137 } else if (ret) {
4138 *pnum = pnum_inter;
4139 return 1;
4140 }
4141
4142 /*
4143 * [sector_num, nb_sectors] is unallocated on top but intermediate
4144 * might have
4145 *
4146 * [sector_num+x, nr_sectors] allocated.
4147 */
63ba17d3
VI
4148 if (n > pnum_inter &&
4149 (intermediate == top ||
4150 sector_num + pnum_inter < intermediate->total_sectors)) {
188a7bbf
PB
4151 n = pnum_inter;
4152 }
4153
4154 intermediate = intermediate->backing_hd;
4155 }
4156
4157 *pnum = n;
4158 return 0;
4159}
4160
045df330
AL
4161const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4162{
4163 if (bs->backing_hd && bs->backing_hd->encrypted)
4164 return bs->backing_file;
4165 else if (bs->encrypted)
4166 return bs->filename;
4167 else
4168 return NULL;
4169}
4170
5fafdf24 4171void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
4172 char *filename, int filename_size)
4173{
3574c608 4174 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
4175}
4176
5fafdf24 4177int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4178 const uint8_t *buf, int nb_sectors)
4179{
4180 BlockDriver *drv = bs->drv;
4181 if (!drv)
19cb3738 4182 return -ENOMEDIUM;
faea38e7
FB
4183 if (!drv->bdrv_write_compressed)
4184 return -ENOTSUP;
fbb7b4e0
KW
4185 if (bdrv_check_request(bs, sector_num, nb_sectors))
4186 return -EIO;
a55eb92c 4187
e4654d2d 4188 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4189
faea38e7
FB
4190 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4191}
3b46e624 4192
faea38e7
FB
4193int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4194{
4195 BlockDriver *drv = bs->drv;
4196 if (!drv)
19cb3738 4197 return -ENOMEDIUM;
faea38e7
FB
4198 if (!drv->bdrv_get_info)
4199 return -ENOTSUP;
4200 memset(bdi, 0, sizeof(*bdi));
4201 return drv->bdrv_get_info(bs, bdi);
4202}
4203
eae041fe
HR
4204ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4205{
4206 BlockDriver *drv = bs->drv;
4207 if (drv && drv->bdrv_get_specific_info) {
4208 return drv->bdrv_get_specific_info(bs);
4209 }
4210 return NULL;
4211}
4212
45566e9c
CH
4213int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4214 int64_t pos, int size)
cf8074b3
KW
4215{
4216 QEMUIOVector qiov;
4217 struct iovec iov = {
4218 .iov_base = (void *) buf,
4219 .iov_len = size,
4220 };
4221
4222 qemu_iovec_init_external(&qiov, &iov, 1);
4223 return bdrv_writev_vmstate(bs, &qiov, pos);
4224}
4225
4226int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4227{
4228 BlockDriver *drv = bs->drv;
cf8074b3
KW
4229
4230 if (!drv) {
178e08a5 4231 return -ENOMEDIUM;
cf8074b3
KW
4232 } else if (drv->bdrv_save_vmstate) {
4233 return drv->bdrv_save_vmstate(bs, qiov, pos);
4234 } else if (bs->file) {
4235 return bdrv_writev_vmstate(bs->file, qiov, pos);
4236 }
4237
7cdb1f6d 4238 return -ENOTSUP;
178e08a5
AL
4239}
4240
45566e9c
CH
4241int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4242 int64_t pos, int size)
178e08a5
AL
4243{
4244 BlockDriver *drv = bs->drv;
4245 if (!drv)
4246 return -ENOMEDIUM;
7cdb1f6d
MK
4247 if (drv->bdrv_load_vmstate)
4248 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4249 if (bs->file)
4250 return bdrv_load_vmstate(bs->file, buf, pos, size);
4251 return -ENOTSUP;
178e08a5
AL
4252}
4253
8b9b0cc2
KW
4254void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4255{
bf736fe3 4256 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4257 return;
4258 }
4259
bf736fe3 4260 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4261}
4262
4263int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4264 const char *tag)
4265{
4266 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4267 bs = bs->file;
4268 }
4269
4270 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4271 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4272 }
4273
4274 return -ENOTSUP;
4275}
4276
4cc70e93
FZ
4277int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4278{
4279 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4280 bs = bs->file;
4281 }
4282
4283 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4284 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4285 }
4286
4287 return -ENOTSUP;
4288}
4289
41c695c7
KW
4290int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4291{
938789ea 4292 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4293 bs = bs->file;
4294 }
8b9b0cc2 4295
41c695c7
KW
4296 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4297 return bs->drv->bdrv_debug_resume(bs, tag);
4298 }
4299
4300 return -ENOTSUP;
4301}
4302
4303bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4304{
4305 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4306 bs = bs->file;
4307 }
4308
4309 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4310 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4311 }
4312
4313 return false;
8b9b0cc2
KW
4314}
4315
199630b6
BS
4316int bdrv_is_snapshot(BlockDriverState *bs)
4317{
4318 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4319}
4320
b1b1d783
JC
4321/* backing_file can either be relative, or absolute, or a protocol. If it is
4322 * relative, it must be relative to the chain. So, passing in bs->filename
4323 * from a BDS as backing_file should not be done, as that may be relative to
4324 * the CWD rather than the chain. */
e8a6bb9c
MT
4325BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4326 const char *backing_file)
4327{
b1b1d783
JC
4328 char *filename_full = NULL;
4329 char *backing_file_full = NULL;
4330 char *filename_tmp = NULL;
4331 int is_protocol = 0;
4332 BlockDriverState *curr_bs = NULL;
4333 BlockDriverState *retval = NULL;
4334
4335 if (!bs || !bs->drv || !backing_file) {
e8a6bb9c
MT
4336 return NULL;
4337 }
4338
b1b1d783
JC
4339 filename_full = g_malloc(PATH_MAX);
4340 backing_file_full = g_malloc(PATH_MAX);
4341 filename_tmp = g_malloc(PATH_MAX);
4342
4343 is_protocol = path_has_protocol(backing_file);
4344
4345 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4346
4347 /* If either of the filename paths is actually a protocol, then
4348 * compare unmodified paths; otherwise make paths relative */
4349 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4350 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4351 retval = curr_bs->backing_hd;
4352 break;
4353 }
e8a6bb9c 4354 } else {
b1b1d783
JC
4355 /* If not an absolute filename path, make it relative to the current
4356 * image's filename path */
4357 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4358 backing_file);
4359
4360 /* We are going to compare absolute pathnames */
4361 if (!realpath(filename_tmp, filename_full)) {
4362 continue;
4363 }
4364
4365 /* We need to make sure the backing filename we are comparing against
4366 * is relative to the current image filename (or absolute) */
4367 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4368 curr_bs->backing_file);
4369
4370 if (!realpath(filename_tmp, backing_file_full)) {
4371 continue;
4372 }
4373
4374 if (strcmp(backing_file_full, filename_full) == 0) {
4375 retval = curr_bs->backing_hd;
4376 break;
4377 }
e8a6bb9c
MT
4378 }
4379 }
4380
b1b1d783
JC
4381 g_free(filename_full);
4382 g_free(backing_file_full);
4383 g_free(filename_tmp);
4384 return retval;
e8a6bb9c
MT
4385}
4386
f198fd1c
BC
4387int bdrv_get_backing_file_depth(BlockDriverState *bs)
4388{
4389 if (!bs->drv) {
4390 return 0;
4391 }
4392
4393 if (!bs->backing_hd) {
4394 return 0;
4395 }
4396
4397 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4398}
4399
ea2384d3 4400/**************************************************************/
83f64091 4401/* async I/Os */
ea2384d3 4402
7c84b1b8
MA
4403BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4404 QEMUIOVector *qiov, int nb_sectors,
097310b5 4405 BlockCompletionFunc *cb, void *opaque)
83f64091 4406{
bbf0a440
SH
4407 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4408
d20d9b7c 4409 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4410 cb, opaque, false);
ea2384d3
FB
4411}
4412
7c84b1b8
MA
4413BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4414 QEMUIOVector *qiov, int nb_sectors,
097310b5 4415 BlockCompletionFunc *cb, void *opaque)
ea2384d3 4416{
bbf0a440
SH
4417 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4418
d20d9b7c 4419 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4420 cb, opaque, true);
83f64091
FB
4421}
4422
7c84b1b8 4423BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
d5ef94d4 4424 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
097310b5 4425 BlockCompletionFunc *cb, void *opaque)
d5ef94d4
PB
4426{
4427 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4428
4429 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4430 BDRV_REQ_ZERO_WRITE | flags,
4431 cb, opaque, true);
4432}
4433
40b4f539
KW
4434
4435typedef struct MultiwriteCB {
4436 int error;
4437 int num_requests;
4438 int num_callbacks;
4439 struct {
097310b5 4440 BlockCompletionFunc *cb;
40b4f539
KW
4441 void *opaque;
4442 QEMUIOVector *free_qiov;
40b4f539
KW
4443 } callbacks[];
4444} MultiwriteCB;
4445
4446static void multiwrite_user_cb(MultiwriteCB *mcb)
4447{
4448 int i;
4449
4450 for (i = 0; i < mcb->num_callbacks; i++) {
4451 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4452 if (mcb->callbacks[i].free_qiov) {
4453 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4454 }
7267c094 4455 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4456 }
4457}
4458
4459static void multiwrite_cb(void *opaque, int ret)
4460{
4461 MultiwriteCB *mcb = opaque;
4462
6d519a5f
SH
4463 trace_multiwrite_cb(mcb, ret);
4464
cb6d3ca0 4465 if (ret < 0 && !mcb->error) {
40b4f539 4466 mcb->error = ret;
40b4f539
KW
4467 }
4468
4469 mcb->num_requests--;
4470 if (mcb->num_requests == 0) {
de189a1b 4471 multiwrite_user_cb(mcb);
7267c094 4472 g_free(mcb);
40b4f539
KW
4473 }
4474}
4475
4476static int multiwrite_req_compare(const void *a, const void *b)
4477{
77be4366
CH
4478 const BlockRequest *req1 = a, *req2 = b;
4479
4480 /*
4481 * Note that we can't simply subtract req2->sector from req1->sector
4482 * here as that could overflow the return value.
4483 */
4484 if (req1->sector > req2->sector) {
4485 return 1;
4486 } else if (req1->sector < req2->sector) {
4487 return -1;
4488 } else {
4489 return 0;
4490 }
40b4f539
KW
4491}
4492
4493/*
4494 * Takes a bunch of requests and tries to merge them. Returns the number of
4495 * requests that remain after merging.
4496 */
4497static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4498 int num_reqs, MultiwriteCB *mcb)
4499{
4500 int i, outidx;
4501
4502 // Sort requests by start sector
4503 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4504
4505 // Check if adjacent requests touch the same clusters. If so, combine them,
4506 // filling up gaps with zero sectors.
4507 outidx = 0;
4508 for (i = 1; i < num_reqs; i++) {
4509 int merge = 0;
4510 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4511
b6a127a1 4512 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
4513 if (reqs[i].sector <= oldreq_last) {
4514 merge = 1;
4515 }
4516
e2a305fb
CH
4517 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4518 merge = 0;
4519 }
4520
6c5a42ac
PL
4521 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4522 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4523 merge = 0;
4524 }
4525
40b4f539
KW
4526 if (merge) {
4527 size_t size;
7267c094 4528 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
4529 qemu_iovec_init(qiov,
4530 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4531
4532 // Add the first request to the merged one. If the requests are
4533 // overlapping, drop the last sectors of the first request.
4534 size = (reqs[i].sector - reqs[outidx].sector) << 9;
1b093c48 4535 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
40b4f539 4536
b6a127a1
PB
4537 // We should need to add any zeros between the two requests
4538 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
4539
4540 // Add the second request
1b093c48 4541 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
40b4f539 4542
391827eb
SH
4543 // Add tail of first request, if necessary
4544 if (qiov->size < reqs[outidx].qiov->size) {
4545 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4546 reqs[outidx].qiov->size - qiov->size);
4547 }
4548
cbf1dff2 4549 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
4550 reqs[outidx].qiov = qiov;
4551
4552 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4553 } else {
4554 outidx++;
4555 reqs[outidx].sector = reqs[i].sector;
4556 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4557 reqs[outidx].qiov = reqs[i].qiov;
4558 }
4559 }
4560
4561 return outidx + 1;
4562}
4563
4564/*
4565 * Submit multiple AIO write requests at once.
4566 *
4567 * On success, the function returns 0 and all requests in the reqs array have
4568 * been submitted. In error case this function returns -1, and any of the
4569 * requests may or may not be submitted yet. In particular, this means that the
4570 * callback will be called for some of the requests, for others it won't. The
4571 * caller must check the error field of the BlockRequest to wait for the right
4572 * callbacks (if error != 0, no callback will be called).
4573 *
4574 * The implementation may modify the contents of the reqs array, e.g. to merge
4575 * requests. However, the fields opaque and error are left unmodified as they
4576 * are used to signal failure for a single request to the caller.
4577 */
4578int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4579{
40b4f539
KW
4580 MultiwriteCB *mcb;
4581 int i;
4582
301db7c2
RH
4583 /* don't submit writes if we don't have a medium */
4584 if (bs->drv == NULL) {
4585 for (i = 0; i < num_reqs; i++) {
4586 reqs[i].error = -ENOMEDIUM;
4587 }
4588 return -1;
4589 }
4590
40b4f539
KW
4591 if (num_reqs == 0) {
4592 return 0;
4593 }
4594
4595 // Create MultiwriteCB structure
7267c094 4596 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
4597 mcb->num_requests = 0;
4598 mcb->num_callbacks = num_reqs;
4599
4600 for (i = 0; i < num_reqs; i++) {
4601 mcb->callbacks[i].cb = reqs[i].cb;
4602 mcb->callbacks[i].opaque = reqs[i].opaque;
4603 }
4604
4605 // Check for mergable requests
4606 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4607
6d519a5f
SH
4608 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4609
df9309fb
PB
4610 /* Run the aio requests. */
4611 mcb->num_requests = num_reqs;
40b4f539 4612 for (i = 0; i < num_reqs; i++) {
d20d9b7c
PB
4613 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4614 reqs[i].nb_sectors, reqs[i].flags,
4615 multiwrite_cb, mcb,
4616 true);
40b4f539
KW
4617 }
4618
4619 return 0;
40b4f539
KW
4620}
4621
7c84b1b8 4622void bdrv_aio_cancel(BlockAIOCB *acb)
83f64091 4623{
ca5fd113
FZ
4624 qemu_aio_ref(acb);
4625 bdrv_aio_cancel_async(acb);
4626 while (acb->refcnt > 1) {
4627 if (acb->aiocb_info->get_aio_context) {
4628 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4629 } else if (acb->bs) {
4630 aio_poll(bdrv_get_aio_context(acb->bs), true);
4631 } else {
4632 abort();
02c50efe 4633 }
02c50efe 4634 }
8007429a 4635 qemu_aio_unref(acb);
02c50efe
FZ
4636}
4637
4638/* Async version of aio cancel. The caller is not blocked if the acb implements
4639 * cancel_async, otherwise we do nothing and let the request normally complete.
4640 * In either case the completion callback must be called. */
7c84b1b8 4641void bdrv_aio_cancel_async(BlockAIOCB *acb)
02c50efe
FZ
4642{
4643 if (acb->aiocb_info->cancel_async) {
4644 acb->aiocb_info->cancel_async(acb);
4645 }
83f64091
FB
4646}
4647
4648/**************************************************************/
4649/* async block device emulation */
4650
7c84b1b8
MA
4651typedef struct BlockAIOCBSync {
4652 BlockAIOCB common;
c16b5a2c
CH
4653 QEMUBH *bh;
4654 int ret;
4655 /* vector translation state */
4656 QEMUIOVector *qiov;
4657 uint8_t *bounce;
4658 int is_write;
7c84b1b8 4659} BlockAIOCBSync;
c16b5a2c 4660
d7331bed 4661static const AIOCBInfo bdrv_em_aiocb_info = {
7c84b1b8 4662 .aiocb_size = sizeof(BlockAIOCBSync),
c16b5a2c
CH
4663};
4664
ce1a14dc 4665static void bdrv_aio_bh_cb(void *opaque)
83f64091 4666{
7c84b1b8 4667 BlockAIOCBSync *acb = opaque;
f141eafe 4668
857d4f46 4669 if (!acb->is_write && acb->ret >= 0) {
03396148 4670 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
857d4f46 4671 }
ceb42de8 4672 qemu_vfree(acb->bounce);
ce1a14dc 4673 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 4674 qemu_bh_delete(acb->bh);
36afc451 4675 acb->bh = NULL;
8007429a 4676 qemu_aio_unref(acb);
83f64091 4677}
beac80cd 4678
7c84b1b8
MA
4679static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4680 int64_t sector_num,
4681 QEMUIOVector *qiov,
4682 int nb_sectors,
097310b5 4683 BlockCompletionFunc *cb,
7c84b1b8
MA
4684 void *opaque,
4685 int is_write)
f141eafe 4686
83f64091 4687{
7c84b1b8 4688 BlockAIOCBSync *acb;
ce1a14dc 4689
d7331bed 4690 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
f141eafe
AL
4691 acb->is_write = is_write;
4692 acb->qiov = qiov;
857d4f46 4693 acb->bounce = qemu_try_blockalign(bs, qiov->size);
2572b37a 4694 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
f141eafe 4695
857d4f46
KW
4696 if (acb->bounce == NULL) {
4697 acb->ret = -ENOMEM;
4698 } else if (is_write) {
d5e6b161 4699 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
1ed20acf 4700 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 4701 } else {
1ed20acf 4702 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
4703 }
4704
ce1a14dc 4705 qemu_bh_schedule(acb->bh);
f141eafe 4706
ce1a14dc 4707 return &acb->common;
beac80cd
FB
4708}
4709
7c84b1b8 4710static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
f141eafe 4711 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 4712 BlockCompletionFunc *cb, void *opaque)
beac80cd 4713{
f141eafe
AL
4714 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4715}
83f64091 4716
7c84b1b8 4717static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
f141eafe 4718 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 4719 BlockCompletionFunc *cb, void *opaque)
f141eafe
AL
4720{
4721 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 4722}
beac80cd 4723
68485420 4724
7c84b1b8
MA
4725typedef struct BlockAIOCBCoroutine {
4726 BlockAIOCB common;
68485420
KW
4727 BlockRequest req;
4728 bool is_write;
d318aea9 4729 bool *done;
68485420 4730 QEMUBH* bh;
7c84b1b8 4731} BlockAIOCBCoroutine;
68485420 4732
d7331bed 4733static const AIOCBInfo bdrv_em_co_aiocb_info = {
7c84b1b8 4734 .aiocb_size = sizeof(BlockAIOCBCoroutine),
68485420
KW
4735};
4736
35246a68 4737static void bdrv_co_em_bh(void *opaque)
68485420 4738{
7c84b1b8 4739 BlockAIOCBCoroutine *acb = opaque;
68485420
KW
4740
4741 acb->common.cb(acb->common.opaque, acb->req.error);
d318aea9 4742
68485420 4743 qemu_bh_delete(acb->bh);
8007429a 4744 qemu_aio_unref(acb);
68485420
KW
4745}
4746
b2a61371
SH
4747/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4748static void coroutine_fn bdrv_co_do_rw(void *opaque)
4749{
7c84b1b8 4750 BlockAIOCBCoroutine *acb = opaque;
b2a61371
SH
4751 BlockDriverState *bs = acb->common.bs;
4752
4753 if (!acb->is_write) {
4754 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4755 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4756 } else {
4757 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4758 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4759 }
4760
2572b37a 4761 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2a61371
SH
4762 qemu_bh_schedule(acb->bh);
4763}
4764
7c84b1b8
MA
4765static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4766 int64_t sector_num,
4767 QEMUIOVector *qiov,
4768 int nb_sectors,
4769 BdrvRequestFlags flags,
097310b5 4770 BlockCompletionFunc *cb,
7c84b1b8
MA
4771 void *opaque,
4772 bool is_write)
68485420
KW
4773{
4774 Coroutine *co;
7c84b1b8 4775 BlockAIOCBCoroutine *acb;
68485420 4776
d7331bed 4777 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
68485420
KW
4778 acb->req.sector = sector_num;
4779 acb->req.nb_sectors = nb_sectors;
4780 acb->req.qiov = qiov;
d20d9b7c 4781 acb->req.flags = flags;
68485420
KW
4782 acb->is_write = is_write;
4783
8c5873d6 4784 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
4785 qemu_coroutine_enter(co, acb);
4786
4787 return &acb->common;
4788}
4789
07f07615 4790static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4791{
7c84b1b8 4792 BlockAIOCBCoroutine *acb = opaque;
07f07615 4793 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4794
07f07615 4795 acb->req.error = bdrv_co_flush(bs);
2572b37a 4796 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2e12bc6 4797 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
4798}
4799
7c84b1b8 4800BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
097310b5 4801 BlockCompletionFunc *cb, void *opaque)
016f5cf6 4802{
07f07615 4803 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4804
07f07615 4805 Coroutine *co;
7c84b1b8 4806 BlockAIOCBCoroutine *acb;
016f5cf6 4807
d7331bed 4808 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
d318aea9 4809
07f07615
PB
4810 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4811 qemu_coroutine_enter(co, acb);
016f5cf6 4812
016f5cf6
AG
4813 return &acb->common;
4814}
4815
4265d620
PB
4816static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4817{
7c84b1b8 4818 BlockAIOCBCoroutine *acb = opaque;
4265d620
PB
4819 BlockDriverState *bs = acb->common.bs;
4820
4821 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2572b37a 4822 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4265d620
PB
4823 qemu_bh_schedule(acb->bh);
4824}
4825
7c84b1b8 4826BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4265d620 4827 int64_t sector_num, int nb_sectors,
097310b5 4828 BlockCompletionFunc *cb, void *opaque)
4265d620
PB
4829{
4830 Coroutine *co;
7c84b1b8 4831 BlockAIOCBCoroutine *acb;
4265d620
PB
4832
4833 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4834
d7331bed 4835 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4265d620
PB
4836 acb->req.sector = sector_num;
4837 acb->req.nb_sectors = nb_sectors;
4838 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4839 qemu_coroutine_enter(co, acb);
4840
4841 return &acb->common;
4842}
4843
ea2384d3
FB
4844void bdrv_init(void)
4845{
5efa9d5a 4846 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 4847}
ce1a14dc 4848
eb852011
MA
4849void bdrv_init_with_whitelist(void)
4850{
4851 use_bdrv_whitelist = 1;
4852 bdrv_init();
4853}
4854
d7331bed 4855void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
097310b5 4856 BlockCompletionFunc *cb, void *opaque)
ce1a14dc 4857{
7c84b1b8 4858 BlockAIOCB *acb;
ce1a14dc 4859
d7331bed
SH
4860 acb = g_slice_alloc(aiocb_info->aiocb_size);
4861 acb->aiocb_info = aiocb_info;
ce1a14dc
PB
4862 acb->bs = bs;
4863 acb->cb = cb;
4864 acb->opaque = opaque;
f197fe2b 4865 acb->refcnt = 1;
ce1a14dc
PB
4866 return acb;
4867}
4868
f197fe2b
FZ
4869void qemu_aio_ref(void *p)
4870{
7c84b1b8 4871 BlockAIOCB *acb = p;
f197fe2b
FZ
4872 acb->refcnt++;
4873}
4874
8007429a 4875void qemu_aio_unref(void *p)
ce1a14dc 4876{
7c84b1b8 4877 BlockAIOCB *acb = p;
f197fe2b
FZ
4878 assert(acb->refcnt > 0);
4879 if (--acb->refcnt == 0) {
4880 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4881 }
ce1a14dc 4882}
19cb3738 4883
f9f05dc5
KW
4884/**************************************************************/
4885/* Coroutine block device emulation */
4886
4887typedef struct CoroutineIOCompletion {
4888 Coroutine *coroutine;
4889 int ret;
4890} CoroutineIOCompletion;
4891
4892static void bdrv_co_io_em_complete(void *opaque, int ret)
4893{
4894 CoroutineIOCompletion *co = opaque;
4895
4896 co->ret = ret;
4897 qemu_coroutine_enter(co->coroutine, NULL);
4898}
4899
4900static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4901 int nb_sectors, QEMUIOVector *iov,
4902 bool is_write)
4903{
4904 CoroutineIOCompletion co = {
4905 .coroutine = qemu_coroutine_self(),
4906 };
7c84b1b8 4907 BlockAIOCB *acb;
f9f05dc5
KW
4908
4909 if (is_write) {
a652d160
SH
4910 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4911 bdrv_co_io_em_complete, &co);
f9f05dc5 4912 } else {
a652d160
SH
4913 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4914 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
4915 }
4916
59370aaa 4917 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
4918 if (!acb) {
4919 return -EIO;
4920 }
4921 qemu_coroutine_yield();
4922
4923 return co.ret;
4924}
4925
4926static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4927 int64_t sector_num, int nb_sectors,
4928 QEMUIOVector *iov)
4929{
4930 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4931}
4932
4933static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4934 int64_t sector_num, int nb_sectors,
4935 QEMUIOVector *iov)
4936{
4937 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4938}
4939
07f07615 4940static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 4941{
07f07615
PB
4942 RwCo *rwco = opaque;
4943
4944 rwco->ret = bdrv_co_flush(rwco->bs);
4945}
4946
/*
 * Flush cached data of @bs to stable storage.  Must run in coroutine
 * context.
 *
 * Flushing proceeds in stages: format-level caches to the OS
 * (bdrv_co_flush_to_os), then OS caches to disk (unless BDRV_O_NO_FLUSH,
 * i.e. cache=unsafe), then recursively the protocol layer (bs->file).
 *
 * Returns 0 on success (including the no-op cases: NULL @bs, no medium,
 * read-only), or a negative errno.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        /* Preferred: native coroutine flush */
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Fall back to emulating the flush on top of the AIO callback */
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
5009
/*
 * Drop/refresh any metadata cached by @bs from its image file.  Only acts
 * on BDSes opened with BDRV_O_INCOMING (the flag is cleared here), i.e.
 * images that may have been modified externally before we take ownership.
 * On failure, @errp is set.
 */
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    /* One-shot: nothing to do unless the image is still "incoming" */
    if (!(bs->open_flags & BDRV_O_INCOMING)) {
        return;
    }
    bs->open_flags &= ~BDRV_O_INCOMING;

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        /* No driver hook: invalidate the protocol layer instead */
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    /* The image size may have changed behind our back */
    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}
5040
5a8a30db 5041void bdrv_invalidate_cache_all(Error **errp)
0f15423c
AL
5042{
5043 BlockDriverState *bs;
5a8a30db 5044 Error *local_err = NULL;
0f15423c 5045
dc364f4c 5046 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5047 AioContext *aio_context = bdrv_get_aio_context(bs);
5048
5049 aio_context_acquire(aio_context);
5a8a30db 5050 bdrv_invalidate_cache(bs, &local_err);
ed78cda3 5051 aio_context_release(aio_context);
5a8a30db
KW
5052 if (local_err) {
5053 error_propagate(errp, local_err);
5054 return;
5055 }
0f15423c
AL
5056 }
5057}
5058
07f07615
PB
5059int bdrv_flush(BlockDriverState *bs)
5060{
5061 Coroutine *co;
5062 RwCo rwco = {
5063 .bs = bs,
5064 .ret = NOT_DONE,
e7a8a783 5065 };
e7a8a783 5066
07f07615
PB
5067 if (qemu_in_coroutine()) {
5068 /* Fast-path if already in coroutine context */
5069 bdrv_flush_co_entry(&rwco);
5070 } else {
2572b37a
SH
5071 AioContext *aio_context = bdrv_get_aio_context(bs);
5072
07f07615
PB
5073 co = qemu_coroutine_create(bdrv_flush_co_entry);
5074 qemu_coroutine_enter(co, &rwco);
5075 while (rwco.ret == NOT_DONE) {
2572b37a 5076 aio_poll(aio_context, true);
07f07615 5077 }
e7a8a783 5078 }
07f07615
PB
5079
5080 return rwco.ret;
e7a8a783
KW
5081}
5082
/* Parameter block passed from bdrv_discard() to its coroutine entry. */
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;    /* NOT_DONE until the coroutine completes */
} DiscardCo;

/* Coroutine entry point for bdrv_discard(). */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
5095
6f14da52
PL
5096/* if no limit is specified in the BlockLimits use a default
5097 * of 32768 512-byte sectors (16 MiB) per request.
5098 */
5099#define MAX_DISCARD_DEFAULT 32768
5100
4265d620
PB
5101int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5102 int nb_sectors)
5103{
d51e9fe5
PB
5104 int max_discard;
5105
4265d620
PB
5106 if (!bs->drv) {
5107 return -ENOMEDIUM;
5108 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5109 return -EIO;
5110 } else if (bs->read_only) {
5111 return -EROFS;
df702c9b
PB
5112 }
5113
e4654d2d 5114 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5115
9e8f1835
PB
5116 /* Do nothing if disabled. */
5117 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5118 return 0;
5119 }
5120
d51e9fe5
PB
5121 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5122 return 0;
5123 }
6f14da52 5124
d51e9fe5
PB
5125 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5126 while (nb_sectors > 0) {
5127 int ret;
5128 int num = nb_sectors;
6f14da52 5129
d51e9fe5
PB
5130 /* align request */
5131 if (bs->bl.discard_alignment &&
5132 num >= bs->bl.discard_alignment &&
5133 sector_num % bs->bl.discard_alignment) {
5134 if (num > bs->bl.discard_alignment) {
5135 num = bs->bl.discard_alignment;
6f14da52 5136 }
d51e9fe5
PB
5137 num -= sector_num % bs->bl.discard_alignment;
5138 }
6f14da52 5139
d51e9fe5
PB
5140 /* limit request size */
5141 if (num > max_discard) {
5142 num = max_discard;
5143 }
6f14da52 5144
d51e9fe5 5145 if (bs->drv->bdrv_co_discard) {
6f14da52 5146 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5 5147 } else {
7c84b1b8 5148 BlockAIOCB *acb;
d51e9fe5
PB
5149 CoroutineIOCompletion co = {
5150 .coroutine = qemu_coroutine_self(),
5151 };
5152
5153 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5154 bdrv_co_io_em_complete, &co);
5155 if (acb == NULL) {
5156 return -EIO;
5157 } else {
5158 qemu_coroutine_yield();
5159 ret = co.ret;
6f14da52 5160 }
6f14da52 5161 }
7ce21016 5162 if (ret && ret != -ENOTSUP) {
d51e9fe5 5163 return ret;
4265d620 5164 }
d51e9fe5
PB
5165
5166 sector_num += num;
5167 nb_sectors -= num;
4265d620 5168 }
d51e9fe5 5169 return 0;
4265d620
PB
5170}
5171
5172int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5173{
5174 Coroutine *co;
775aa8b6 5175 DiscardCo rwco = {
4265d620
PB
5176 .bs = bs,
5177 .sector_num = sector_num,
5178 .nb_sectors = nb_sectors,
5179 .ret = NOT_DONE,
5180 };
5181
5182 if (qemu_in_coroutine()) {
5183 /* Fast-path if already in coroutine context */
5184 bdrv_discard_co_entry(&rwco);
5185 } else {
2572b37a
SH
5186 AioContext *aio_context = bdrv_get_aio_context(bs);
5187
4265d620
PB
5188 co = qemu_coroutine_create(bdrv_discard_co_entry);
5189 qemu_coroutine_enter(co, &rwco);
5190 while (rwco.ret == NOT_DONE) {
2572b37a 5191 aio_poll(aio_context, true);
4265d620
PB
5192 }
5193 }
5194
5195 return rwco.ret;
5196}
5197
19cb3738
FB
5198/**************************************************************/
5199/* removable device support */
5200
5201/**
5202 * Return TRUE if the media is present
5203 */
5204int bdrv_is_inserted(BlockDriverState *bs)
5205{
5206 BlockDriver *drv = bs->drv;
a1aff5bf 5207
19cb3738
FB
5208 if (!drv)
5209 return 0;
5210 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5211 return 1;
5212 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5213}
5214
5215/**
8e49ca46
MA
5216 * Return whether the media changed since the last call to this
5217 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5218 */
5219int bdrv_media_changed(BlockDriverState *bs)
5220{
5221 BlockDriver *drv = bs->drv;
19cb3738 5222
8e49ca46
MA
5223 if (drv && drv->bdrv_media_changed) {
5224 return drv->bdrv_media_changed(bs);
5225 }
5226 return -ENOTSUP;
19cb3738
FB
5227}
5228
5229/**
5230 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5231 */
f36f3949 5232void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
5233{
5234 BlockDriver *drv = bs->drv;
bfb197e0 5235 const char *device_name;
19cb3738 5236
822e1cd1
MA
5237 if (drv && drv->bdrv_eject) {
5238 drv->bdrv_eject(bs, eject_flag);
19cb3738 5239 }
6f382ed2 5240
bfb197e0
MA
5241 device_name = bdrv_get_device_name(bs);
5242 if (device_name[0] != '\0') {
5243 qapi_event_send_device_tray_moved(device_name,
a5ee7bd4 5244 eject_flag, &error_abort);
6f382ed2 5245 }
19cb3738
FB
5246}
5247
19cb3738
FB
5248/**
5249 * Lock or unlock the media (if it is locked, the user won't be able
5250 * to eject it manually).
5251 */
025e849a 5252void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
5253{
5254 BlockDriver *drv = bs->drv;
5255
025e849a 5256 trace_bdrv_lock_medium(bs, locked);
b8c6d095 5257
025e849a
MA
5258 if (drv && drv->bdrv_lock_medium) {
5259 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
5260 }
5261}
985a03b0
TS
5262
5263/* needed for generic scsi interface */
5264
5265int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5266{
5267 BlockDriver *drv = bs->drv;
5268
5269 if (drv && drv->bdrv_ioctl)
5270 return drv->bdrv_ioctl(bs, req, buf);
5271 return -ENOTSUP;
5272}
7d780669 5273
7c84b1b8 5274BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
221f715d 5275 unsigned long int req, void *buf,
097310b5 5276 BlockCompletionFunc *cb, void *opaque)
7d780669 5277{
221f715d 5278 BlockDriver *drv = bs->drv;
7d780669 5279
221f715d
AL
5280 if (drv && drv->bdrv_aio_ioctl)
5281 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5282 return NULL;
7d780669 5283}

/* Record the guest device's block size in bs->guest_block_size.
 * NOTE(review): consumers of guest_block_size are outside this chunk —
 * presumably an alignment hint; confirm before relying on it. */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

/* Allocate @size bytes aligned to the optimal I/O memory alignment of @bs
 * (bdrv_opt_mem_align()).  Failure behaviour is qemu_memalign()'s —
 * NOTE(review): presumably aborts on OOM; confirm. */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
7cd1e32a 5294
9ebd8448
HR
5295void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5296{
5297 return memset(qemu_blockalign(bs, size), 0, size);
5298}
5299
7d2a35cc
KW
5300void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5301{
5302 size_t align = bdrv_opt_mem_align(bs);
5303
5304 /* Ensure that NULL is never returned on success */
5305 assert(align > 0);
5306 if (size == 0) {
5307 size = align;
5308 }
5309
5310 return qemu_try_memalign(align, size);
5311}
5312
9ebd8448
HR
5313void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5314{
5315 void *mem = qemu_try_blockalign(bs, size);
5316
5317 if (mem) {
5318 memset(mem, 0, size);
5319 }
5320
5321 return mem;
5322}
5323
c53b1c51
SH
5324/*
5325 * Check if all memory in this vector is sector aligned.
5326 */
5327bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5328{
5329 int i;
339064d5 5330 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5331
5332 for (i = 0; i < qiov->niov; i++) {
339064d5 5333 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5334 return false;
1ff735bd 5335 }
339064d5 5336 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5337 return false;
c53b1c51
SH
5338 }
5339 }
5340
5341 return true;
5342}
5343
/*
 * Create a dirty bitmap covering the whole of @bs and register it on
 * bs->dirty_bitmaps.
 *
 * @granularity: bytes tracked per bitmap bit; must be a power of two and
 *               at least BDRV_SECTOR_SIZE (enforced by the asserts below).
 * Returns the new bitmap, or NULL with @errp set if the device length
 * cannot be determined.
 */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    /* Convert bytes-per-bit to sectors-per-bit; hbitmap wants it as a
     * log2 shift, hence ffs() - 1 on the power-of-two value. */
    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_nb_sectors(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap = g_new0(BdrvDirtyBitmap, 1);
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
5365
5366void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5367{
5368 BdrvDirtyBitmap *bm, *next;
5369 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5370 if (bm == bitmap) {
5371 QLIST_REMOVE(bitmap, list);
5372 hbitmap_free(bitmap->bitmap);
5373 g_free(bitmap);
5374 return;
a55eb92c 5375 }
7cd1e32a
LS
5376 }
5377}
5378
21b56835
FZ
5379BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5380{
5381 BdrvDirtyBitmap *bm;
5382 BlockDirtyInfoList *list = NULL;
5383 BlockDirtyInfoList **plist = &list;
5384
5385 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5839e53b
MA
5386 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5387 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
21b56835
FZ
5388 info->count = bdrv_get_dirty_count(bs, bm);
5389 info->granularity =
5390 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5391 entry->value = info;
5392 *plist = entry;
5393 plist = &entry->next;
5394 }
5395
5396 return list;
5397}
5398
e4654d2d 5399int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5400{
e4654d2d
FZ
5401 if (bitmap) {
5402 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5403 } else {
5404 return 0;
5405 }
5406}
5407
/* Initialize @hbi to iterate over the dirty sectors of @bitmap starting
 * from sector 0. */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5413
5414void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5415 int nr_sectors)
5416{
e4654d2d
FZ
5417 BdrvDirtyBitmap *bitmap;
5418 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5419 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5420 }
1755da16
PB
5421}
5422
e4654d2d 5423void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5424{
e4654d2d
FZ
5425 BdrvDirtyBitmap *bitmap;
5426 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5427 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5428 }
7cd1e32a 5429}
aaa0eb75 5430
/* Return the number of sectors currently marked dirty in @bitmap. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
f88e1a42 5435
/* Get a reference to bs (increments bs->refcnt; paired with bdrv_unref()) */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}
5441
5442/* Release a previously grabbed reference to bs.
5443 * If after releasing, reference count is zero, the BlockDriverState is
5444 * deleted. */
5445void bdrv_unref(BlockDriverState *bs)
5446{
9a4d5ca6
JC
5447 if (!bs) {
5448 return;
5449 }
9fcb0251
FZ
5450 assert(bs->refcnt > 0);
5451 if (--bs->refcnt == 0) {
5452 bdrv_delete(bs);
5453 }
5454}
5455
/* One reason why an operation category is blocked on a BDS; linked into
 * the per-category bs->op_blockers[] lists. */
struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};
5460
5461bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5462{
5463 BdrvOpBlocker *blocker;
5464 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5465 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5466 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5467 if (errp) {
5468 error_setg(errp, "Device '%s' is busy: %s",
bfb197e0
MA
5469 bdrv_get_device_name(bs),
5470 error_get_pretty(blocker->reason));
fbe40ff7
FZ
5471 }
5472 return true;
5473 }
5474 return false;
5475}
5476
5477void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5478{
5479 BdrvOpBlocker *blocker;
5480 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5481
5839e53b 5482 blocker = g_new0(BdrvOpBlocker, 1);
fbe40ff7
FZ
5483 blocker->reason = reason;
5484 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5485}
5486
5487void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5488{
5489 BdrvOpBlocker *blocker, *next;
5490 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5491 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5492 if (blocker->reason == reason) {
5493 QLIST_REMOVE(blocker, list);
5494 g_free(blocker);
5495 }
5496 }
5497}
5498
5499void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5500{
5501 int i;
5502 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5503 bdrv_op_block(bs, i, reason);
5504 }
5505}
5506
5507void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5508{
5509 int i;
5510 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5511 bdrv_op_unblock(bs, i, reason);
5512 }
5513}
5514
5515bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5516{
5517 int i;
5518
5519 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5520 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5521 return false;
5522 }
5523 }
5524 return true;
5525}
5526
/* Enable I/O status tracking on @bs and reset the status to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5532
5533/* The I/O status is only enabled if the drive explicitly
5534 * enables it _and_ the VM is configured to stop on errors */
5535bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5536{
d6bf279e 5537 return (bs->iostatus_enabled &&
92aa5c6d
PB
5538 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5539 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5540 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5541}
5542
/* Disable I/O status tracking on @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5547
5548void bdrv_iostatus_reset(BlockDriverState *bs)
5549{
5550 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5551 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5552 if (bs->job) {
5553 block_job_iostatus_reset(bs->job);
5554 }
28a7282a
LC
5555 }
5556}
5557
28a7282a
LC
5558void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5559{
3e1caa5f
PB
5560 assert(bdrv_iostatus_is_enabled(bs));
5561 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5562 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5563 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5564 }
5565}
5566
d92ada22
LC
5567void bdrv_img_create(const char *filename, const char *fmt,
5568 const char *base_filename, const char *base_fmt,
f382d43a
MR
5569 char *options, uint64_t img_size, int flags,
5570 Error **errp, bool quiet)
f88e1a42 5571{
83d0521a
CL
5572 QemuOptsList *create_opts = NULL;
5573 QemuOpts *opts = NULL;
5574 const char *backing_fmt, *backing_file;
5575 int64_t size;
f88e1a42 5576 BlockDriver *drv, *proto_drv;
96df67d1 5577 BlockDriver *backing_drv = NULL;
cc84d90f 5578 Error *local_err = NULL;
f88e1a42
JS
5579 int ret = 0;
5580
5581 /* Find driver and parse its options */
5582 drv = bdrv_find_format(fmt);
5583 if (!drv) {
71c79813 5584 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5585 return;
f88e1a42
JS
5586 }
5587
98289620 5588 proto_drv = bdrv_find_protocol(filename, true);
f88e1a42 5589 if (!proto_drv) {
71c79813 5590 error_setg(errp, "Unknown protocol '%s'", filename);
d92ada22 5591 return;
f88e1a42
JS
5592 }
5593
c6149724
HR
5594 if (!drv->create_opts) {
5595 error_setg(errp, "Format driver '%s' does not support image creation",
5596 drv->format_name);
5597 return;
5598 }
5599
5600 if (!proto_drv->create_opts) {
5601 error_setg(errp, "Protocol driver '%s' does not support image creation",
5602 proto_drv->format_name);
5603 return;
5604 }
5605
c282e1fd
CL
5606 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5607 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
f88e1a42
JS
5608
5609 /* Create parameter list with default values */
83d0521a
CL
5610 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5611 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
f88e1a42
JS
5612
5613 /* Parse -o options */
5614 if (options) {
83d0521a
CL
5615 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5616 error_setg(errp, "Invalid options for file format '%s'", fmt);
f88e1a42
JS
5617 goto out;
5618 }
5619 }
5620
5621 if (base_filename) {
83d0521a 5622 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
71c79813
LC
5623 error_setg(errp, "Backing file not supported for file format '%s'",
5624 fmt);
f88e1a42
JS
5625 goto out;
5626 }
5627 }
5628
5629 if (base_fmt) {
83d0521a 5630 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
71c79813
LC
5631 error_setg(errp, "Backing file format not supported for file "
5632 "format '%s'", fmt);
f88e1a42
JS
5633 goto out;
5634 }
5635 }
5636
83d0521a
CL
5637 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5638 if (backing_file) {
5639 if (!strcmp(filename, backing_file)) {
71c79813
LC
5640 error_setg(errp, "Error: Trying to create an image with the "
5641 "same filename as the backing file");
792da93a
JS
5642 goto out;
5643 }
5644 }
5645
83d0521a
CL
5646 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5647 if (backing_fmt) {
5648 backing_drv = bdrv_find_format(backing_fmt);
96df67d1 5649 if (!backing_drv) {
71c79813 5650 error_setg(errp, "Unknown backing file format '%s'",
83d0521a 5651 backing_fmt);
f88e1a42
JS
5652 goto out;
5653 }
5654 }
5655
5656 // The size for the image must always be specified, with one exception:
5657 // If we are using a backing file, we can obtain the size from there
83d0521a
CL
5658 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5659 if (size == -1) {
5660 if (backing_file) {
66f6b814 5661 BlockDriverState *bs;
52bf1e72 5662 int64_t size;
63090dac
PB
5663 int back_flags;
5664
5665 /* backing files always opened read-only */
5666 back_flags =
5667 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5668
f67503e5 5669 bs = NULL;
83d0521a 5670 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
cc84d90f 5671 backing_drv, &local_err);
f88e1a42 5672 if (ret < 0) {
f88e1a42
JS
5673 goto out;
5674 }
52bf1e72
MA
5675 size = bdrv_getlength(bs);
5676 if (size < 0) {
5677 error_setg_errno(errp, -size, "Could not get size of '%s'",
5678 backing_file);
5679 bdrv_unref(bs);
5680 goto out;
5681 }
f88e1a42 5682
83d0521a 5683 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
66f6b814
HR
5684
5685 bdrv_unref(bs);
f88e1a42 5686 } else {
71c79813 5687 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5688 goto out;
5689 }
5690 }
5691
f382d43a 5692 if (!quiet) {
43c5d8f8
FZ
5693 printf("Formatting '%s', fmt=%s", filename, fmt);
5694 qemu_opts_print(opts, " ");
f382d43a
MR
5695 puts("");
5696 }
83d0521a 5697
c282e1fd 5698 ret = bdrv_create(drv, filename, opts, &local_err);
83d0521a 5699
cc84d90f
HR
5700 if (ret == -EFBIG) {
5701 /* This is generally a better message than whatever the driver would
5702 * deliver (especially because of the cluster_size_hint), since that
5703 * is most probably not much different from "image too large". */
5704 const char *cluster_size_hint = "";
83d0521a 5705 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
cc84d90f 5706 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5707 }
cc84d90f
HR
5708 error_setg(errp, "The image size is too large for file format '%s'"
5709 "%s", fmt, cluster_size_hint);
5710 error_free(local_err);
5711 local_err = NULL;
f88e1a42
JS
5712 }
5713
5714out:
83d0521a
CL
5715 qemu_opts_del(opts);
5716 qemu_opts_free(create_opts);
84d18f06 5717 if (local_err) {
cc84d90f
HR
5718 error_propagate(errp, local_err);
5719 }
f88e1a42 5720}

/* Return the AioContext that currently processes I/O for @bs. */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}
5726
/*
 * Detach @bs from its current AioContext.  Order matters and mirrors
 * bdrv_attach_aio_context() in reverse: user notifiers first, then
 * throttling, then the driver, then the file/backing children.  Afterwards
 * bs->aio_context is NULL until bdrv_attach_aio_context() is called.
 */
void bdrv_detach_aio_context(BlockDriverState *bs)
{
    BdrvAioNotifier *baf;

    if (!bs->drv) {
        return;
    }

    /* Let registered listeners release their per-context resources */
    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
        baf->detach_aio_context(baf->opaque);
    }

    /* Throttling timers are bound to the context and must be detached */
    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}
5754
/*
 * Attach @bs to @new_context.  Children are attached first, then the
 * driver and throttling, and registered notifiers run last — the reverse
 * of bdrv_detach_aio_context().
 */
void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    BdrvAioNotifier *ban;

    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    /* Re-arm throttling timers in the new context */
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }

    /* Finally tell listeners the new context is ready for use */
    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
        ban->attached_aio_context(new_context, ban->opaque);
    }
}
5783
/*
 * Move @bs to @new_context.  All in-flight requests are drained first, so
 * the switch happens with no I/O pending.
 */
void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
d616b224 5797
33384421
HR
5798void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5799 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5800 void (*detach_aio_context)(void *opaque), void *opaque)
5801{
5802 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5803 *ban = (BdrvAioNotifier){
5804 .attached_aio_context = attached_aio_context,
5805 .detach_aio_context = detach_aio_context,
5806 .opaque = opaque
5807 };
5808
5809 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5810}
5811
5812void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5813 void (*attached_aio_context)(AioContext *,
5814 void *),
5815 void (*detach_aio_context)(void *),
5816 void *opaque)
5817{
5818 BdrvAioNotifier *ban, *ban_next;
5819
5820 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5821 if (ban->attached_aio_context == attached_aio_context &&
5822 ban->detach_aio_context == detach_aio_context &&
5823 ban->opaque == opaque)
5824 {
5825 QLIST_REMOVE(ban, list);
5826 g_free(ban);
5827
5828 return;
5829 }
5830 }
5831
5832 abort();
5833}
5834
/* Register @notifier to be called before each write request on @bs. */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
6f176b48 5840
77485434
HR
5841int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5842 BlockDriverAmendStatusCB *status_cb)
6f176b48 5843{
c282e1fd 5844 if (!bs->drv->bdrv_amend_options) {
6f176b48
HR
5845 return -ENOTSUP;
5846 }
77485434 5847 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
6f176b48 5848}
f6186f49 5849
b5042a36
BC
5850/* This function will be called by the bdrv_recurse_is_first_non_filter method
5851 * of block filter and by bdrv_is_first_non_filter.
5852 * It is used to test if the given bs is the candidate or recurse more in the
5853 * node graph.
212a5a8f 5854 */
b5042a36 5855bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 5856 BlockDriverState *candidate)
f6186f49 5857{
b5042a36
BC
5858 /* return false if basic checks fails */
5859 if (!bs || !bs->drv) {
212a5a8f 5860 return false;
f6186f49
BC
5861 }
5862
b5042a36
BC
5863 /* the code reached a non block filter driver -> check if the bs is
5864 * the same as the candidate. It's the recursion termination condition.
5865 */
5866 if (!bs->drv->is_filter) {
5867 return bs == candidate;
212a5a8f 5868 }
b5042a36 5869 /* Down this path the driver is a block filter driver */
212a5a8f 5870
b5042a36
BC
5871 /* If the block filter recursion method is defined use it to recurse down
5872 * the node graph.
5873 */
5874 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 5875 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5876 }
5877
b5042a36
BC
5878 /* the driver is a block filter but don't allow to recurse -> return false
5879 */
5880 return false;
f6186f49
BC
5881}
5882
212a5a8f
BC
5883/* This function checks if the candidate is the first non filter bs down it's
5884 * bs chain. Since we don't have pointers to parents it explore all bs chains
5885 * from the top. Some filters can choose not to pass down the recursion.
5886 */
5887bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5888{
212a5a8f
BC
5889 BlockDriverState *bs;
5890
5891 /* walk down the bs forest recursively */
5892 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5893 bool perm;
5894
b5042a36 5895 /* try to recurse in this top level bs */
e6dc8a1f 5896 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5897
5898 /* candidate is the first non filter */
5899 if (perm) {
5900 return true;
5901 }
5902 }
5903
5904 return false;
f6186f49 5905}

/*
 * Resolve @node_name and verify that the node may safely be replaced: it
 * must exist, must not carry a BLOCK_OP_TYPE_REPLACE blocker, and must be
 * the top-most non-filter of its chain.  Returns the node, or NULL with
 * @errp set.  The node's AioContext is acquired only for the duration of
 * the checks.
 */
BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    AioContext *aio_context;

    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    aio_context = bdrv_get_aio_context(to_replace_bs);
    aio_context_acquire(aio_context);

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        to_replace_bs = NULL;
        goto out;
    }

    /* We don't want arbitrary node of the BDS chain to be replaced only the top
     * most non filter in order to prevent data corruption.
     * Another benefit is that this tests exclude backing files which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        to_replace_bs = NULL;
        goto out;
    }

out:
    aio_context_release(aio_context);
    return to_replace_bs;
}
448ad91d
ML
5940
5941void bdrv_io_plug(BlockDriverState *bs)
5942{
5943 BlockDriver *drv = bs->drv;
5944 if (drv && drv->bdrv_io_plug) {
5945 drv->bdrv_io_plug(bs);
5946 } else if (bs->file) {
5947 bdrv_io_plug(bs->file);
5948 }
5949}
5950
5951void bdrv_io_unplug(BlockDriverState *bs)
5952{
5953 BlockDriver *drv = bs->drv;
5954 if (drv && drv->bdrv_io_unplug) {
5955 drv->bdrv_io_unplug(bs);
5956 } else if (bs->file) {
5957 bdrv_io_unplug(bs->file);
5958 }
5959}
5960
5961void bdrv_flush_io_queue(BlockDriverState *bs)
5962{
5963 BlockDriver *drv = bs->drv;
5964 if (drv && drv->bdrv_flush_io_queue) {
5965 drv->bdrv_flush_io_queue(bs);
5966 } else if (bs->file) {
5967 bdrv_flush_io_queue(bs->file);
5968 }
5969}
91af7014
HR
5970
5971static bool append_open_options(QDict *d, BlockDriverState *bs)
5972{
5973 const QDictEntry *entry;
5974 bool found_any = false;
5975
5976 for (entry = qdict_first(bs->options); entry;
5977 entry = qdict_next(bs->options, entry))
5978 {
5979 /* Only take options for this level and exclude all non-driver-specific
5980 * options */
5981 if (!strchr(qdict_entry_key(entry), '.') &&
5982 strcmp(qdict_entry_key(entry), "node-name"))
5983 {
5984 qobject_incref(qdict_entry_value(entry));
5985 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5986 found_any = true;
5987 }
5988 }
5989
5990 return found_any;
5991}
5992
/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the same
 *                    results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here. Otherwise,
 *              full_open_options is converted to a JSON object, prefixed with
 *              "json:" (for use through the JSON pseudo protocol) and put here.
 *
 * Recurses depth-first into bs->file, then reconstructs this node's
 * information via one of three mutually exclusive strategies:
 * driver-provided hook, derivation from the underlying file, or reuse of
 * the original open options.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    /* Nothing to refresh on a node without a driver (e.g. not yet opened) */
    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
         * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            /* Release our reference before the driver installs new options */
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        /* The driver knows best how to describe itself */
        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            /* NOTE(review): plain strcpy assumes both exact_filename arrays
             * have the same size — appears to hold for BDS, but confirm */
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }
        /* Reconstructing the full options QDict is simple for most format block
         * drivers, as long as the full options are known for the underlying
         * file BDS. The full options QDict of that file BDS should somehow
         * contain a representation of the filename, therefore the following
         * suffices without querying the (exact_)filename of this BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            /* opts takes its own reference to the child's options */
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            bs->full_open_options = opts;
        } else {
            /* Could not build a complete description; discard the partial
             * QDict instead of installing misleading information */
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    /* Finally derive the user-visible filename: prefer the exact filename,
     * otherwise fall back to the "json:{...}" pseudo-protocol form */
    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}
5366d0c8
BC
6100
6101/* This accessor function purpose is to allow the device models to access the
6102 * BlockAcctStats structure embedded inside a BlockDriverState without being
6103 * aware of the BlockDriverState structure layout.
6104 * It will go away when the BlockAcctStats structure will be moved inside
6105 * the device models.
6106 */
6107BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6108{
6109 return &bs->stats;
6110}