]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
block: Eliminate DriveInfo member bdrv, use blk_by_legacy_dinfo()
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
737e150e
PB
27#include "block/block_int.h"
28#include "block/blockjob.h"
1de7afc9 29#include "qemu/module.h"
7b1b5d19 30#include "qapi/qmp/qjson.h"
bfb197e0 31#include "sysemu/block-backend.h"
9c17d615 32#include "sysemu/sysemu.h"
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
a5ee7bd4 38#include "qapi-event.h"
fc01f7e7 39
71e72a19 40#ifdef CONFIG_BSD
7674e7bf
FB
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <sys/ioctl.h>
72cf2d4f 44#include <sys/queue.h>
c5e97233 45#ifndef __DragonFly__
7674e7bf
FB
46#include <sys/disk.h>
47#endif
c5e97233 48#endif
7674e7bf 49
49dc768d
AL
50#ifdef _WIN32
51#include <windows.h>
52#endif
53
e4654d2d
FZ
54struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
57};
58
1c9805a3
SH
59#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60
2a87151f
SH
61#define COROUTINE_POOL_RESERVATION 64 /* number of coroutines to reserve */
62
7d4b4ba5 63static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
64static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 66 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
67static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
68 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 69 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
70static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
73static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
74 int64_t sector_num, int nb_sectors,
75 QEMUIOVector *iov);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 78 BdrvRequestFlags flags);
775aa8b6
KW
79static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
80 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 81 BdrvRequestFlags flags);
b2a61371
SH
82static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
83 int64_t sector_num,
84 QEMUIOVector *qiov,
85 int nb_sectors,
d20d9b7c 86 BdrvRequestFlags flags,
b2a61371
SH
87 BlockDriverCompletionFunc *cb,
88 void *opaque,
8c5873d6 89 bool is_write);
b2a61371 90static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 91static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 92 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 93
1b7bdbc1
SH
94static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 96
dc364f4c
BC
97static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
98 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
99
8a22f02a
SH
100static QLIST_HEAD(, BlockDriver) bdrv_drivers =
101 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 102
eb852011
MA
103/* If non-zero, use only whitelisted block drivers */
104static int use_bdrv_whitelist;
105
9e0b22f4
SH
106#ifdef _WIN32
/*
 * Return 1 if @filename starts with an ASCII letter immediately followed
 * by ':' (e.g. "c:"), 0 otherwise.  The second character is only examined
 * when the first one is a letter.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char first = filename[0];
    const int lower = (first >= 'a' && first <= 'z');
    const int upper = (first >= 'A' && first <= 'Z');

    return (lower || upper) && filename[1] == ':';
}
113
114int is_windows_drive(const char *filename)
115{
116 if (is_windows_drive_prefix(filename) &&
117 filename[2] == '\0')
118 return 1;
119 if (strstart(filename, "\\\\.\\", NULL) ||
120 strstart(filename, "//./", NULL))
121 return 1;
122 return 0;
123}
124#endif
125
0563e191 126/* throttling disk I/O limits */
cc0681c4
BC
127void bdrv_set_io_limits(BlockDriverState *bs,
128 ThrottleConfig *cfg)
98f90dba 129{
cc0681c4 130 int i;
98f90dba 131
cc0681c4 132 throttle_config(&bs->throttle_state, cfg);
98f90dba 133
cc0681c4
BC
134 for (i = 0; i < 2; i++) {
135 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 136 }
cc0681c4
BC
137}
138
139/* this function drain all the throttled IOs */
140static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
141{
142 bool drained = false;
143 bool enabled = bs->io_limits_enabled;
144 int i;
145
146 bs->io_limits_enabled = false;
147
148 for (i = 0; i < 2; i++) {
149 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
150 drained = true;
151 }
152 }
153
154 bs->io_limits_enabled = enabled;
98f90dba 155
cc0681c4 156 return drained;
98f90dba
ZYW
157}
158
cc0681c4 159void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 160{
cc0681c4 161 bs->io_limits_enabled = false;
0563e191 162
cc0681c4
BC
163 bdrv_start_throttled_reqs(bs);
164
165 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
166}
167
cc0681c4 168static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 169{
cc0681c4
BC
170 BlockDriverState *bs = opaque;
171 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
172}
173
cc0681c4 174static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 175{
cc0681c4
BC
176 BlockDriverState *bs = opaque;
177 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
178}
179
cc0681c4
BC
180/* should be called before bdrv_set_io_limits if a limit is set */
181void bdrv_io_limits_enable(BlockDriverState *bs)
182{
183 assert(!bs->io_limits_enabled);
184 throttle_init(&bs->throttle_state,
13af91eb 185 bdrv_get_aio_context(bs),
cc0681c4
BC
186 QEMU_CLOCK_VIRTUAL,
187 bdrv_throttle_read_timer_cb,
188 bdrv_throttle_write_timer_cb,
189 bs);
190 bs->io_limits_enabled = true;
191}
192
193/* This function makes an IO wait if needed
194 *
195 * @nb_sectors: the number of sectors of the IO
196 * @is_write: is the IO a write
197 */
98f90dba 198static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 199 unsigned int bytes,
cc0681c4 200 bool is_write)
98f90dba 201{
cc0681c4
BC
202 /* does this io must wait */
203 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 204
cc0681c4
BC
205 /* if must wait or any request of this type throttled queue the IO */
206 if (must_wait ||
207 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
208 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
209 }
210
cc0681c4 211 /* the IO will be executed, do the accounting */
d5103588
KW
212 throttle_account(&bs->throttle_state, is_write, bytes);
213
98f90dba 214
cc0681c4
BC
215 /* if the next request must wait -> do nothing */
216 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
217 return;
98f90dba
ZYW
218 }
219
cc0681c4
BC
220 /* else queue next request for execution */
221 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
222}
223
339064d5
KW
224size_t bdrv_opt_mem_align(BlockDriverState *bs)
225{
226 if (!bs || !bs->drv) {
227 /* 4k should be on the safe side */
228 return 4096;
229 }
230
231 return bs->bl.opt_mem_alignment;
232}
233
9e0b22f4
SH
/*
 * Return 1 if @path starts with "<protocol>:", i.e. the first character
 * out of the separator set is a ':' rather than a path separator.
 * On Windows, drive specifications never count as a protocol and '\\' is
 * treated as a path separator too.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    return path[strcspn(path, stop_chars)] == ':';
}
251
/*
 * Return 1 if @path is absolute, 0 otherwise.
 *
 * A path is absolute when it starts with '/'.  On Windows a leading '\\',
 * a drive prefix, or a name such as "\\.\d:" also qualifies.
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (*path == '\\') {
        return 1;
    }
#endif
    return *path == '/';
}
264
83f64091
FB
/*
 * Build in @dest (at most @dest_size bytes including the terminator) the
 * path of @filename interpreted relative to @base_path.
 *
 * If @filename is absolute it is copied unchanged.  Otherwise the result
 * is the leading part of @base_path, up to and including whichever comes
 * last of its final path separator and its first ':', followed by
 * @filename.  URLs are supported.  Nothing is written when
 * @dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' (protocol part), if any */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last path separator, if any */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;

    /* copy the (possibly truncated) prefix, then append the file name */
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
308
dc5a1371
PB
309void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
310{
311 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
312 pstrcpy(dest, sz, bs->backing_file);
313 } else {
314 path_combine(dest, sz, bs->filename, bs->backing_file);
315 }
316}
317
5efa9d5a 318void bdrv_register(BlockDriver *bdrv)
ea2384d3 319{
8c5873d6
SH
320 /* Block drivers without coroutine functions need emulation */
321 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
322 bdrv->bdrv_co_readv = bdrv_co_readv_em;
323 bdrv->bdrv_co_writev = bdrv_co_writev_em;
324
f8c35c1d
SH
325 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
326 * the block driver lacks aio we need to emulate that too.
327 */
f9f05dc5
KW
328 if (!bdrv->bdrv_aio_readv) {
329 /* add AIO emulation layer */
330 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
331 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 332 }
83f64091 333 }
b2e12bc6 334
8a22f02a 335 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 336}
b338082b 337
7f06d47e 338BlockDriverState *bdrv_new_root(void)
b338082b 339{
7f06d47e 340 BlockDriverState *bs = bdrv_new();
e4e9986b 341
e4e9986b 342 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
e4e9986b
MA
343 return bs;
344}
345
346BlockDriverState *bdrv_new(void)
347{
348 BlockDriverState *bs;
349 int i;
350
5839e53b 351 bs = g_new0(BlockDriverState, 1);
e4654d2d 352 QLIST_INIT(&bs->dirty_bitmaps);
fbe40ff7
FZ
353 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
354 QLIST_INIT(&bs->op_blockers[i]);
355 }
28a7282a 356 bdrv_iostatus_disable(bs);
d7d512f6 357 notifier_list_init(&bs->close_notifiers);
d616b224 358 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
359 qemu_co_queue_init(&bs->throttled_reqs[0]);
360 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 361 bs->refcnt = 1;
dcd04228 362 bs->aio_context = qemu_get_aio_context();
d7d512f6 363
b338082b
FB
364 return bs;
365}
366
d7d512f6
PB
367void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
368{
369 notifier_list_add(&bs->close_notifiers, notify);
370}
371
ea2384d3
FB
372BlockDriver *bdrv_find_format(const char *format_name)
373{
374 BlockDriver *drv1;
8a22f02a
SH
375 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
376 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 377 return drv1;
8a22f02a 378 }
ea2384d3
FB
379 }
380 return NULL;
381}
382
b64ec4e4 383static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 384{
b64ec4e4
FZ
385 static const char *whitelist_rw[] = {
386 CONFIG_BDRV_RW_WHITELIST
387 };
388 static const char *whitelist_ro[] = {
389 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
390 };
391 const char **p;
392
b64ec4e4 393 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 394 return 1; /* no whitelist, anything goes */
b64ec4e4 395 }
eb852011 396
b64ec4e4 397 for (p = whitelist_rw; *p; p++) {
eb852011
MA
398 if (!strcmp(drv->format_name, *p)) {
399 return 1;
400 }
401 }
b64ec4e4
FZ
402 if (read_only) {
403 for (p = whitelist_ro; *p; p++) {
404 if (!strcmp(drv->format_name, *p)) {
405 return 1;
406 }
407 }
408 }
eb852011
MA
409 return 0;
410}
411
b64ec4e4
FZ
412BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
413 bool read_only)
eb852011
MA
414{
415 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 416 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
417}
418
5b7e1542
ZYW
419typedef struct CreateCo {
420 BlockDriver *drv;
421 char *filename;
83d0521a 422 QemuOpts *opts;
5b7e1542 423 int ret;
cc84d90f 424 Error *err;
5b7e1542
ZYW
425} CreateCo;
426
427static void coroutine_fn bdrv_create_co_entry(void *opaque)
428{
cc84d90f
HR
429 Error *local_err = NULL;
430 int ret;
431
5b7e1542
ZYW
432 CreateCo *cco = opaque;
433 assert(cco->drv);
434
c282e1fd 435 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
84d18f06 436 if (local_err) {
cc84d90f
HR
437 error_propagate(&cco->err, local_err);
438 }
439 cco->ret = ret;
5b7e1542
ZYW
440}
441
0e7e1989 442int bdrv_create(BlockDriver *drv, const char* filename,
83d0521a 443 QemuOpts *opts, Error **errp)
ea2384d3 444{
5b7e1542
ZYW
445 int ret;
446
447 Coroutine *co;
448 CreateCo cco = {
449 .drv = drv,
450 .filename = g_strdup(filename),
83d0521a 451 .opts = opts,
5b7e1542 452 .ret = NOT_DONE,
cc84d90f 453 .err = NULL,
5b7e1542
ZYW
454 };
455
c282e1fd 456 if (!drv->bdrv_create) {
cc84d90f 457 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
458 ret = -ENOTSUP;
459 goto out;
5b7e1542
ZYW
460 }
461
462 if (qemu_in_coroutine()) {
463 /* Fast-path if already in coroutine context */
464 bdrv_create_co_entry(&cco);
465 } else {
466 co = qemu_coroutine_create(bdrv_create_co_entry);
467 qemu_coroutine_enter(co, &cco);
468 while (cco.ret == NOT_DONE) {
b47ec2c4 469 aio_poll(qemu_get_aio_context(), true);
5b7e1542
ZYW
470 }
471 }
472
473 ret = cco.ret;
cc84d90f 474 if (ret < 0) {
84d18f06 475 if (cco.err) {
cc84d90f
HR
476 error_propagate(errp, cco.err);
477 } else {
478 error_setg_errno(errp, -ret, "Could not create image");
479 }
480 }
0e7e1989 481
80168bff
LC
482out:
483 g_free(cco.filename);
5b7e1542 484 return ret;
ea2384d3
FB
485}
486
c282e1fd 487int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
84a12e66
CH
488{
489 BlockDriver *drv;
cc84d90f
HR
490 Error *local_err = NULL;
491 int ret;
84a12e66 492
98289620 493 drv = bdrv_find_protocol(filename, true);
84a12e66 494 if (drv == NULL) {
cc84d90f 495 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 496 return -ENOENT;
84a12e66
CH
497 }
498
c282e1fd 499 ret = bdrv_create(drv, filename, opts, &local_err);
84d18f06 500 if (local_err) {
cc84d90f
HR
501 error_propagate(errp, local_err);
502 }
503 return ret;
84a12e66
CH
504}
505
3baca891 506void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
d34682cd
KW
507{
508 BlockDriver *drv = bs->drv;
3baca891 509 Error *local_err = NULL;
d34682cd
KW
510
511 memset(&bs->bl, 0, sizeof(bs->bl));
512
466ad822 513 if (!drv) {
3baca891 514 return;
466ad822
KW
515 }
516
517 /* Take some limits from the children as a default */
518 if (bs->file) {
3baca891
KW
519 bdrv_refresh_limits(bs->file, &local_err);
520 if (local_err) {
521 error_propagate(errp, local_err);
522 return;
523 }
466ad822 524 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
339064d5
KW
525 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
526 } else {
527 bs->bl.opt_mem_alignment = 512;
466ad822
KW
528 }
529
530 if (bs->backing_hd) {
3baca891
KW
531 bdrv_refresh_limits(bs->backing_hd, &local_err);
532 if (local_err) {
533 error_propagate(errp, local_err);
534 return;
535 }
466ad822
KW
536 bs->bl.opt_transfer_length =
537 MAX(bs->bl.opt_transfer_length,
538 bs->backing_hd->bl.opt_transfer_length);
339064d5
KW
539 bs->bl.opt_mem_alignment =
540 MAX(bs->bl.opt_mem_alignment,
541 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
542 }
543
544 /* Then let the driver override it */
545 if (drv->bdrv_refresh_limits) {
3baca891 546 drv->bdrv_refresh_limits(bs, errp);
d34682cd 547 }
d34682cd
KW
548}
549
eba25057
JM
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: output buffer receiving the name of the created file
 * @size:     size of @filename in bytes
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows the
 * negated GetLastError() code).  -EOVERFLOW is returned when the name does
 * not fit into @size bytes.
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /var/tmp when
 * TMPDIR is unset.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    const char *tmpdir;
    int fd;
    int len;

    /*
     * A non-positive size cannot hold any name.  Reject it up front: a
     * negative value would otherwise turn into a huge size_t in snprintf().
     */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }

    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }

    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so capture close()'s error first */
        int close_errno = errno;

        unlink(filename);
        return -close_errno;
    }

    return 0;
#endif
}
fc01f7e7 585
84a12e66
CH
586/*
587 * Detect host devices. By convention, /dev/cdrom[N] is always
588 * recognized as a host CDROM.
589 */
590static BlockDriver *find_hdev_driver(const char *filename)
591{
592 int score_max = 0, score;
593 BlockDriver *drv = NULL, *d;
594
595 QLIST_FOREACH(d, &bdrv_drivers, list) {
596 if (d->bdrv_probe_device) {
597 score = d->bdrv_probe_device(filename);
598 if (score > score_max) {
599 score_max = score;
600 drv = d;
601 }
602 }
603 }
604
605 return drv;
606}
607
98289620
KW
608BlockDriver *bdrv_find_protocol(const char *filename,
609 bool allow_protocol_prefix)
83f64091
FB
610{
611 BlockDriver *drv1;
612 char protocol[128];
1cec71e3 613 int len;
83f64091 614 const char *p;
19cb3738 615
66f82cee
KW
616 /* TODO Drivers without bdrv_file_open must be specified explicitly */
617
39508e7a
CH
618 /*
619 * XXX(hch): we really should not let host device detection
620 * override an explicit protocol specification, but moving this
621 * later breaks access to device names with colons in them.
622 * Thanks to the brain-dead persistent naming schemes on udev-
623 * based Linux systems those actually are quite common.
624 */
625 drv1 = find_hdev_driver(filename);
626 if (drv1) {
627 return drv1;
628 }
629
98289620 630 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
39508e7a 631 return bdrv_find_format("file");
84a12e66 632 }
98289620 633
9e0b22f4
SH
634 p = strchr(filename, ':');
635 assert(p != NULL);
1cec71e3
AL
636 len = p - filename;
637 if (len > sizeof(protocol) - 1)
638 len = sizeof(protocol) - 1;
639 memcpy(protocol, filename, len);
640 protocol[len] = '\0';
8a22f02a 641 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 642 if (drv1->protocol_name &&
8a22f02a 643 !strcmp(drv1->protocol_name, protocol)) {
83f64091 644 return drv1;
8a22f02a 645 }
83f64091
FB
646 }
647 return NULL;
648}
649
f500a6d3 650static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 651 BlockDriver **pdrv, Error **errp)
f3a5d3f8 652{
f500a6d3 653 int score, score_max;
f3a5d3f8
CH
654 BlockDriver *drv1, *drv;
655 uint8_t buf[2048];
f500a6d3 656 int ret = 0;
f8ea0b00 657
08a00559 658 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 659 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
c98ac35d
SW
660 drv = bdrv_find_format("raw");
661 if (!drv) {
34b5d2c6 662 error_setg(errp, "Could not find raw image format");
c98ac35d
SW
663 ret = -ENOENT;
664 }
665 *pdrv = drv;
666 return ret;
1a396859 667 }
f8ea0b00 668
83f64091 669 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 670 if (ret < 0) {
34b5d2c6
HR
671 error_setg_errno(errp, -ret, "Could not read image for determining its "
672 "format");
c98ac35d
SW
673 *pdrv = NULL;
674 return ret;
83f64091
FB
675 }
676
ea2384d3 677 score_max = 0;
84a12e66 678 drv = NULL;
8a22f02a 679 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
680 if (drv1->bdrv_probe) {
681 score = drv1->bdrv_probe(buf, ret, filename);
682 if (score > score_max) {
683 score_max = score;
684 drv = drv1;
685 }
0849bf08 686 }
fc01f7e7 687 }
c98ac35d 688 if (!drv) {
34b5d2c6
HR
689 error_setg(errp, "Could not determine image format: No compatible "
690 "driver found");
c98ac35d
SW
691 ret = -ENOENT;
692 }
693 *pdrv = drv;
694 return ret;
ea2384d3
FB
695}
696
51762288
SH
697/**
698 * Set the current 'total_sectors' value
65a9bb25 699 * Return 0 on success, -errno on error.
51762288
SH
700 */
701static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
702{
703 BlockDriver *drv = bs->drv;
704
396759ad
NB
705 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
706 if (bs->sg)
707 return 0;
708
51762288
SH
709 /* query actual device if possible, otherwise just trust the hint */
710 if (drv->bdrv_getlength) {
711 int64_t length = drv->bdrv_getlength(bs);
712 if (length < 0) {
713 return length;
714 }
7e382003 715 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
716 }
717
718 bs->total_sectors = hint;
719 return 0;
720}
721
9e8f1835
PB
722/**
723 * Set open flags for a given discard mode
724 *
725 * Return 0 on success, -1 if the discard mode was invalid.
726 */
727int bdrv_parse_discard_flags(const char *mode, int *flags)
728{
729 *flags &= ~BDRV_O_UNMAP;
730
731 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
732 /* do nothing */
733 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
734 *flags |= BDRV_O_UNMAP;
735 } else {
736 return -1;
737 }
738
739 return 0;
740}
741
c3993cdc
SH
742/**
743 * Set open flags for a given cache mode
744 *
745 * Return 0 on success, -1 if the cache mode was invalid.
746 */
747int bdrv_parse_cache_flags(const char *mode, int *flags)
748{
749 *flags &= ~BDRV_O_CACHE_MASK;
750
751 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
752 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
753 } else if (!strcmp(mode, "directsync")) {
754 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
755 } else if (!strcmp(mode, "writeback")) {
756 *flags |= BDRV_O_CACHE_WB;
757 } else if (!strcmp(mode, "unsafe")) {
758 *flags |= BDRV_O_CACHE_WB;
759 *flags |= BDRV_O_NO_FLUSH;
760 } else if (!strcmp(mode, "writethrough")) {
761 /* this is the default */
762 } else {
763 return -1;
764 }
765
766 return 0;
767}
768
53fec9d3
SH
769/**
770 * The copy-on-read flag is actually a reference count so multiple users may
771 * use the feature without worrying about clobbering its previous state.
772 * Copy-on-read stays enabled until all users have called to disable it.
773 */
774void bdrv_enable_copy_on_read(BlockDriverState *bs)
775{
776 bs->copy_on_read++;
777}
778
779void bdrv_disable_copy_on_read(BlockDriverState *bs)
780{
781 assert(bs->copy_on_read > 0);
782 bs->copy_on_read--;
783}
784
b1e6fc08
KW
785/*
786 * Returns the flags that a temporary snapshot should get, based on the
787 * originally requested flags (the originally requested image will have flags
788 * like a backing file)
789 */
790static int bdrv_temp_snapshot_flags(int flags)
791{
792 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
793}
794
0b50cc88
KW
795/*
796 * Returns the flags that bs->file should get, based on the given flags for
797 * the parent BDS
798 */
799static int bdrv_inherited_flags(int flags)
800{
801 /* Enable protocol handling, disable format probing for bs->file */
802 flags |= BDRV_O_PROTOCOL;
803
804 /* Our block drivers take care to send flushes and respect unmap policy,
805 * so we can enable both unconditionally on lower layers. */
806 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
807
0b50cc88 808 /* Clear flags that only apply to the top layer */
5669b44d 809 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
810
811 return flags;
812}
813
317fc44e
KW
814/*
815 * Returns the flags that bs->backing_hd should get, based on the given flags
816 * for the parent BDS
817 */
818static int bdrv_backing_flags(int flags)
819{
820 /* backing files always opened read-only */
821 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
822
823 /* snapshot=on is handled on the top layer */
8bfea15d 824 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
825
826 return flags;
827}
828
7b272452
KW
829static int bdrv_open_flags(BlockDriverState *bs, int flags)
830{
831 int open_flags = flags | BDRV_O_CACHE_WB;
832
833 /*
834 * Clear flags that are internal to the block layer before opening the
835 * image.
836 */
20cca275 837 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
7b272452
KW
838
839 /*
840 * Snapshots should be writable.
841 */
8bfea15d 842 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
843 open_flags |= BDRV_O_RDWR;
844 }
845
846 return open_flags;
847}
848
636ea370
KW
849static void bdrv_assign_node_name(BlockDriverState *bs,
850 const char *node_name,
851 Error **errp)
6913c0c2
BC
852{
853 if (!node_name) {
636ea370 854 return;
6913c0c2
BC
855 }
856
9aebf3b8 857 /* Check for empty string or invalid characters */
f5bebbbb 858 if (!id_wellformed(node_name)) {
9aebf3b8 859 error_setg(errp, "Invalid node name");
636ea370 860 return;
6913c0c2
BC
861 }
862
0c5e94ee 863 /* takes care of avoiding namespaces collisions */
7f06d47e 864 if (blk_by_name(node_name)) {
0c5e94ee
BC
865 error_setg(errp, "node-name=%s is conflicting with a device id",
866 node_name);
636ea370 867 return;
0c5e94ee
BC
868 }
869
6913c0c2
BC
870 /* takes care of avoiding duplicates node names */
871 if (bdrv_find_node(node_name)) {
872 error_setg(errp, "Duplicate node name");
636ea370 873 return;
6913c0c2
BC
874 }
875
876 /* copy node name into the bs and insert it into the graph list */
877 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
878 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
879}
880
57915332
KW
881/*
882 * Common part for opening disk images and files
b6ad491a
KW
883 *
884 * Removes all processed options from *options.
57915332 885 */
f500a6d3 886static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
34b5d2c6 887 QDict *options, int flags, BlockDriver *drv, Error **errp)
57915332
KW
888{
889 int ret, open_flags;
035fccdf 890 const char *filename;
6913c0c2 891 const char *node_name = NULL;
34b5d2c6 892 Error *local_err = NULL;
57915332
KW
893
894 assert(drv != NULL);
6405875c 895 assert(bs->file == NULL);
707ff828 896 assert(options != NULL && bs->options != options);
57915332 897
45673671
KW
898 if (file != NULL) {
899 filename = file->filename;
900 } else {
901 filename = qdict_get_try_str(options, "filename");
902 }
903
765003db
KW
904 if (drv->bdrv_needs_filename && !filename) {
905 error_setg(errp, "The '%s' block driver requires a file name",
906 drv->format_name);
907 return -EINVAL;
908 }
909
45673671 910 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
28dcee10 911
6913c0c2 912 node_name = qdict_get_try_str(options, "node-name");
636ea370 913 bdrv_assign_node_name(bs, node_name, &local_err);
0fb6395c 914 if (local_err) {
636ea370
KW
915 error_propagate(errp, local_err);
916 return -EINVAL;
6913c0c2
BC
917 }
918 qdict_del(options, "node-name");
919
5d186eb0
KW
920 /* bdrv_open() with directly using a protocol as drv. This layer is already
921 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
922 * and return immediately. */
923 if (file != NULL && drv->bdrv_file_open) {
924 bdrv_swap(file, bs);
925 return 0;
926 }
927
57915332 928 bs->open_flags = flags;
1b7fd729 929 bs->guest_block_size = 512;
c25f53b0 930 bs->request_alignment = 512;
0d51b4de 931 bs->zero_beyond_eof = true;
b64ec4e4
FZ
932 open_flags = bdrv_open_flags(bs, flags);
933 bs->read_only = !(open_flags & BDRV_O_RDWR);
20cca275 934 bs->growable = !!(flags & BDRV_O_PROTOCOL);
b64ec4e4
FZ
935
936 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
8f94a6e4
KW
937 error_setg(errp,
938 !bs->read_only && bdrv_is_whitelisted(drv, true)
939 ? "Driver '%s' can only be used for read-only devices"
940 : "Driver '%s' is not whitelisted",
941 drv->format_name);
b64ec4e4
FZ
942 return -ENOTSUP;
943 }
57915332 944
53fec9d3 945 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
0ebd24e0
KW
946 if (flags & BDRV_O_COPY_ON_READ) {
947 if (!bs->read_only) {
948 bdrv_enable_copy_on_read(bs);
949 } else {
950 error_setg(errp, "Can't use copy-on-read on read-only device");
951 return -EINVAL;
952 }
53fec9d3
SH
953 }
954
c2ad1b0c
KW
955 if (filename != NULL) {
956 pstrcpy(bs->filename, sizeof(bs->filename), filename);
957 } else {
958 bs->filename[0] = '\0';
959 }
91af7014 960 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
57915332 961
57915332 962 bs->drv = drv;
7267c094 963 bs->opaque = g_malloc0(drv->instance_size);
57915332 964
03f541bd 965 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e7c63796 966
66f82cee
KW
967 /* Open the image, either directly or using a protocol */
968 if (drv->bdrv_file_open) {
5d186eb0 969 assert(file == NULL);
030be321 970 assert(!drv->bdrv_needs_filename || filename != NULL);
34b5d2c6 971 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
f500a6d3 972 } else {
2af5ef70 973 if (file == NULL) {
34b5d2c6
HR
974 error_setg(errp, "Can't use '%s' as a block driver for the "
975 "protocol level", drv->format_name);
2af5ef70
KW
976 ret = -EINVAL;
977 goto free_and_fail;
978 }
f500a6d3 979 bs->file = file;
34b5d2c6 980 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
66f82cee
KW
981 }
982
57915332 983 if (ret < 0) {
84d18f06 984 if (local_err) {
34b5d2c6 985 error_propagate(errp, local_err);
2fa9aa59
DH
986 } else if (bs->filename[0]) {
987 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
34b5d2c6
HR
988 } else {
989 error_setg_errno(errp, -ret, "Could not open image");
990 }
57915332
KW
991 goto free_and_fail;
992 }
993
51762288
SH
994 ret = refresh_total_sectors(bs, bs->total_sectors);
995 if (ret < 0) {
34b5d2c6 996 error_setg_errno(errp, -ret, "Could not refresh total sector count");
51762288 997 goto free_and_fail;
57915332 998 }
51762288 999
3baca891
KW
1000 bdrv_refresh_limits(bs, &local_err);
1001 if (local_err) {
1002 error_propagate(errp, local_err);
1003 ret = -EINVAL;
1004 goto free_and_fail;
1005 }
1006
c25f53b0 1007 assert(bdrv_opt_mem_align(bs) != 0);
47ea2de2 1008 assert((bs->request_alignment != 0) || bs->sg);
57915332
KW
1009 return 0;
1010
1011free_and_fail:
f500a6d3 1012 bs->file = NULL;
7267c094 1013 g_free(bs->opaque);
57915332
KW
1014 bs->opaque = NULL;
1015 bs->drv = NULL;
1016 return ret;
1017}
1018
5e5c4f63
KW
1019static QDict *parse_json_filename(const char *filename, Error **errp)
1020{
1021 QObject *options_obj;
1022 QDict *options;
1023 int ret;
1024
1025 ret = strstart(filename, "json:", &filename);
1026 assert(ret);
1027
1028 options_obj = qobject_from_json(filename);
1029 if (!options_obj) {
1030 error_setg(errp, "Could not parse the JSON options");
1031 return NULL;
1032 }
1033
1034 if (qobject_type(options_obj) != QTYPE_QDICT) {
1035 qobject_decref(options_obj);
1036 error_setg(errp, "Invalid JSON object given");
1037 return NULL;
1038 }
1039
1040 options = qobject_to_qdict(options_obj);
1041 qdict_flatten(options);
1042
1043 return options;
1044}
1045
b6ce07aa 1046/*
f54120ff
KW
1047 * Fills in default options for opening images and converts the legacy
1048 * filename/flags pair to option QDict entries.
b6ce07aa 1049 */
5e5c4f63 1050static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
17b005f1 1051 BlockDriver *drv, Error **errp)
ea2384d3 1052{
5e5c4f63 1053 const char *filename = *pfilename;
c2ad1b0c 1054 const char *drvname;
462f5bcf 1055 bool protocol = flags & BDRV_O_PROTOCOL;
e3fa4bfa 1056 bool parse_filename = false;
34b5d2c6 1057 Error *local_err = NULL;
83f64091 1058
5e5c4f63
KW
1059 /* Parse json: pseudo-protocol */
1060 if (filename && g_str_has_prefix(filename, "json:")) {
1061 QDict *json_options = parse_json_filename(filename, &local_err);
1062 if (local_err) {
1063 error_propagate(errp, local_err);
1064 return -EINVAL;
1065 }
1066
1067 /* Options given in the filename have lower priority than options
1068 * specified directly */
1069 qdict_join(*options, json_options, false);
1070 QDECREF(json_options);
1071 *pfilename = filename = NULL;
1072 }
1073
035fccdf 1074 /* Fetch the file name from the options QDict if necessary */
17b005f1 1075 if (protocol && filename) {
f54120ff
KW
1076 if (!qdict_haskey(*options, "filename")) {
1077 qdict_put(*options, "filename", qstring_from_str(filename));
1078 parse_filename = true;
1079 } else {
1080 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1081 "the same time");
1082 return -EINVAL;
1083 }
035fccdf
KW
1084 }
1085
c2ad1b0c 1086 /* Find the right block driver */
f54120ff 1087 filename = qdict_get_try_str(*options, "filename");
5acd9d81 1088 drvname = qdict_get_try_str(*options, "driver");
f54120ff 1089
17b005f1
KW
1090 if (drv) {
1091 if (drvname) {
1092 error_setg(errp, "Driver specified twice");
1093 return -EINVAL;
1094 }
1095 drvname = drv->format_name;
1096 qdict_put(*options, "driver", qstring_from_str(drvname));
1097 } else {
1098 if (!drvname && protocol) {
1099 if (filename) {
1100 drv = bdrv_find_protocol(filename, parse_filename);
1101 if (!drv) {
1102 error_setg(errp, "Unknown protocol");
1103 return -EINVAL;
1104 }
1105
1106 drvname = drv->format_name;
1107 qdict_put(*options, "driver", qstring_from_str(drvname));
1108 } else {
1109 error_setg(errp, "Must specify either driver or file");
f54120ff
KW
1110 return -EINVAL;
1111 }
17b005f1
KW
1112 } else if (drvname) {
1113 drv = bdrv_find_format(drvname);
1114 if (!drv) {
1115 error_setg(errp, "Unknown driver '%s'", drvname);
1116 return -ENOENT;
1117 }
98289620 1118 }
c2ad1b0c
KW
1119 }
1120
17b005f1 1121 assert(drv || !protocol);
c2ad1b0c 1122
f54120ff 1123 /* Driver-specific filename parsing */
17b005f1 1124 if (drv && drv->bdrv_parse_filename && parse_filename) {
5acd9d81 1125 drv->bdrv_parse_filename(filename, *options, &local_err);
84d18f06 1126 if (local_err) {
34b5d2c6 1127 error_propagate(errp, local_err);
f54120ff 1128 return -EINVAL;
6963a30d 1129 }
cd5d031e
HR
1130
1131 if (!drv->bdrv_needs_filename) {
1132 qdict_del(*options, "filename");
cd5d031e 1133 }
6963a30d
KW
1134 }
1135
f54120ff
KW
1136 return 0;
1137}
1138
8d24cce1
FZ
1139void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1140{
1141
826b6ca0
FZ
1142 if (bs->backing_hd) {
1143 assert(bs->backing_blocker);
1144 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1145 } else if (backing_hd) {
1146 error_setg(&bs->backing_blocker,
1147 "device is used as backing hd of '%s'",
bfb197e0 1148 bdrv_get_device_name(bs));
826b6ca0
FZ
1149 }
1150
8d24cce1
FZ
1151 bs->backing_hd = backing_hd;
1152 if (!backing_hd) {
826b6ca0
FZ
1153 error_free(bs->backing_blocker);
1154 bs->backing_blocker = NULL;
8d24cce1
FZ
1155 goto out;
1156 }
1157 bs->open_flags &= ~BDRV_O_NO_BACKING;
1158 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1159 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1160 backing_hd->drv ? backing_hd->drv->format_name : "");
826b6ca0
FZ
1161
1162 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1163 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1164 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1165 bs->backing_blocker);
8d24cce1 1166out:
3baca891 1167 bdrv_refresh_limits(bs, NULL);
8d24cce1
FZ
1168}
1169
31ca6d07
KW
1170/*
1171 * Opens the backing file for a BlockDriverState if not yet open
1172 *
1173 * options is a QDict of options to pass to the block drivers, or NULL for an
1174 * empty set of options. The reference to the QDict is transferred to this
1175 * function (even on failure), so if the caller intends to reuse the dictionary,
1176 * it needs to use QINCREF() before calling bdrv_file_open.
1177 */
34b5d2c6 1178int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
9156df12 1179{
1ba4b6a5 1180 char *backing_filename = g_malloc0(PATH_MAX);
317fc44e 1181 int ret = 0;
9156df12 1182 BlockDriver *back_drv = NULL;
8d24cce1 1183 BlockDriverState *backing_hd;
34b5d2c6 1184 Error *local_err = NULL;
9156df12
PB
1185
1186 if (bs->backing_hd != NULL) {
31ca6d07 1187 QDECREF(options);
1ba4b6a5 1188 goto free_exit;
9156df12
PB
1189 }
1190
31ca6d07
KW
1191 /* NULL means an empty set of options */
1192 if (options == NULL) {
1193 options = qdict_new();
1194 }
1195
9156df12 1196 bs->open_flags &= ~BDRV_O_NO_BACKING;
1cb6f506
KW
1197 if (qdict_haskey(options, "file.filename")) {
1198 backing_filename[0] = '\0';
1199 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
31ca6d07 1200 QDECREF(options);
1ba4b6a5 1201 goto free_exit;
dbecebdd 1202 } else {
1ba4b6a5 1203 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
9156df12
PB
1204 }
1205
8ee79e70
KW
1206 if (!bs->drv || !bs->drv->supports_backing) {
1207 ret = -EINVAL;
1208 error_setg(errp, "Driver doesn't support backing files");
1209 QDECREF(options);
1210 goto free_exit;
1211 }
1212
e4e9986b 1213 backing_hd = bdrv_new();
8d24cce1 1214
9156df12
PB
1215 if (bs->backing_format[0] != '\0') {
1216 back_drv = bdrv_find_format(bs->backing_format);
1217 }
1218
f67503e5 1219 assert(bs->backing_hd == NULL);
8d24cce1 1220 ret = bdrv_open(&backing_hd,
ddf5636d 1221 *backing_filename ? backing_filename : NULL, NULL, options,
317fc44e 1222 bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
9156df12 1223 if (ret < 0) {
8d24cce1
FZ
1224 bdrv_unref(backing_hd);
1225 backing_hd = NULL;
9156df12 1226 bs->open_flags |= BDRV_O_NO_BACKING;
b04b6b6e
FZ
1227 error_setg(errp, "Could not open backing file: %s",
1228 error_get_pretty(local_err));
1229 error_free(local_err);
1ba4b6a5 1230 goto free_exit;
9156df12 1231 }
8d24cce1 1232 bdrv_set_backing_hd(bs, backing_hd);
d80ac658 1233
1ba4b6a5
BC
1234free_exit:
1235 g_free(backing_filename);
1236 return ret;
9156df12
PB
1237}
1238
da557aac
HR
1239/*
1240 * Opens a disk image whose options are given as BlockdevRef in another block
1241 * device's options.
1242 *
da557aac
HR
1243 * If allow_none is true, no image will be opened if filename is false and no
1244 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1245 *
1246 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1247 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1248 * itself, all options starting with "${bdref_key}." are considered part of the
1249 * BlockdevRef.
1250 *
1251 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1252 *
1253 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1254 */
1255int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1256 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1257 bool allow_none, Error **errp)
da557aac
HR
1258{
1259 QDict *image_options;
1260 int ret;
1261 char *bdref_key_dot;
1262 const char *reference;
1263
f67503e5
HR
1264 assert(pbs);
1265 assert(*pbs == NULL);
1266
da557aac
HR
1267 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1268 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1269 g_free(bdref_key_dot);
1270
1271 reference = qdict_get_try_str(options, bdref_key);
1272 if (!filename && !reference && !qdict_size(image_options)) {
1273 if (allow_none) {
1274 ret = 0;
1275 } else {
1276 error_setg(errp, "A block device must be specified for \"%s\"",
1277 bdref_key);
1278 ret = -EINVAL;
1279 }
b20e61e0 1280 QDECREF(image_options);
da557aac
HR
1281 goto done;
1282 }
1283
f7d9fd8c 1284 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1285
1286done:
1287 qdict_del(options, bdref_key);
1288 return ret;
1289}
1290
6b8aeca5 1291int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1292{
1293 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1294 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d
KW
1295 int64_t total_size;
1296 BlockDriver *bdrv_qcow2;
83d0521a 1297 QemuOpts *opts = NULL;
b998875d
KW
1298 QDict *snapshot_options;
1299 BlockDriverState *bs_snapshot;
1300 Error *local_err;
1301 int ret;
1302
1303 /* if snapshot, we create a temporary backing file and open it
1304 instead of opening 'filename' directly */
1305
1306 /* Get the required size from the image */
f187743a
KW
1307 total_size = bdrv_getlength(bs);
1308 if (total_size < 0) {
6b8aeca5 1309 ret = total_size;
f187743a 1310 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1311 goto out;
f187743a 1312 }
b998875d
KW
1313
1314 /* Create the temporary image */
1ba4b6a5 1315 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1316 if (ret < 0) {
1317 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1318 goto out;
b998875d
KW
1319 }
1320
1321 bdrv_qcow2 = bdrv_find_format("qcow2");
c282e1fd
CL
1322 opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1323 &error_abort);
83d0521a 1324 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
c282e1fd 1325 ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
83d0521a 1326 qemu_opts_del(opts);
b998875d
KW
1327 if (ret < 0) {
1328 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1329 "'%s': %s", tmp_filename,
1330 error_get_pretty(local_err));
1331 error_free(local_err);
1ba4b6a5 1332 goto out;
b998875d
KW
1333 }
1334
1335 /* Prepare a new options QDict for the temporary file */
1336 snapshot_options = qdict_new();
1337 qdict_put(snapshot_options, "file.driver",
1338 qstring_from_str("file"));
1339 qdict_put(snapshot_options, "file.filename",
1340 qstring_from_str(tmp_filename));
1341
e4e9986b 1342 bs_snapshot = bdrv_new();
b998875d
KW
1343
1344 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
b1e6fc08 1345 flags, bdrv_qcow2, &local_err);
b998875d
KW
1346 if (ret < 0) {
1347 error_propagate(errp, local_err);
1ba4b6a5 1348 goto out;
b998875d
KW
1349 }
1350
1351 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1352
1353out:
1354 g_free(tmp_filename);
6b8aeca5 1355 return ret;
b998875d
KW
1356}
1357
b6ce07aa
KW
1358/*
1359 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1360 *
1361 * options is a QDict of options to pass to the block drivers, or NULL for an
1362 * empty set of options. The reference to the QDict belongs to the block layer
1363 * after the call (even on failure), so if the caller intends to reuse the
1364 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1365 *
1366 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1367 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1368 *
1369 * The reference parameter may be used to specify an existing block device which
1370 * should be opened. If specified, neither options nor a filename may be given,
1371 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1372 */
ddf5636d
HR
1373int bdrv_open(BlockDriverState **pbs, const char *filename,
1374 const char *reference, QDict *options, int flags,
1375 BlockDriver *drv, Error **errp)
ea2384d3 1376{
b6ce07aa 1377 int ret;
f67503e5 1378 BlockDriverState *file = NULL, *bs;
74fe54f2 1379 const char *drvname;
34b5d2c6 1380 Error *local_err = NULL;
b1e6fc08 1381 int snapshot_flags = 0;
712e7874 1382
f67503e5
HR
1383 assert(pbs);
1384
ddf5636d
HR
1385 if (reference) {
1386 bool options_non_empty = options ? qdict_size(options) : false;
1387 QDECREF(options);
1388
1389 if (*pbs) {
1390 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1391 "another block device");
1392 return -EINVAL;
1393 }
1394
1395 if (filename || options_non_empty) {
1396 error_setg(errp, "Cannot reference an existing block device with "
1397 "additional options or a new filename");
1398 return -EINVAL;
1399 }
1400
1401 bs = bdrv_lookup_bs(reference, reference, errp);
1402 if (!bs) {
1403 return -ENODEV;
1404 }
1405 bdrv_ref(bs);
1406 *pbs = bs;
1407 return 0;
1408 }
1409
f67503e5
HR
1410 if (*pbs) {
1411 bs = *pbs;
1412 } else {
e4e9986b 1413 bs = bdrv_new();
f67503e5
HR
1414 }
1415
de9c0cec
KW
1416 /* NULL means an empty set of options */
1417 if (options == NULL) {
1418 options = qdict_new();
1419 }
1420
17b005f1 1421 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
462f5bcf
KW
1422 if (local_err) {
1423 goto fail;
1424 }
1425
76c591b0
KW
1426 /* Find the right image format driver */
1427 drv = NULL;
1428 drvname = qdict_get_try_str(options, "driver");
1429 if (drvname) {
1430 drv = bdrv_find_format(drvname);
1431 qdict_del(options, "driver");
1432 if (!drv) {
1433 error_setg(errp, "Unknown driver: '%s'", drvname);
1434 ret = -EINVAL;
1435 goto fail;
1436 }
1437 }
1438
1439 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1440 if (drv && !drv->bdrv_file_open) {
1441 /* If the user explicitly wants a format driver here, we'll need to add
1442 * another layer for the protocol in bs->file */
1443 flags &= ~BDRV_O_PROTOCOL;
1444 }
1445
de9c0cec 1446 bs->options = options;
b6ad491a 1447 options = qdict_clone_shallow(options);
de9c0cec 1448
f500a6d3 1449 /* Open image file without format layer */
f4788adc
KW
1450 if ((flags & BDRV_O_PROTOCOL) == 0) {
1451 if (flags & BDRV_O_RDWR) {
1452 flags |= BDRV_O_ALLOW_RDWR;
1453 }
1454 if (flags & BDRV_O_SNAPSHOT) {
1455 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1456 flags = bdrv_backing_flags(flags);
1457 }
f500a6d3 1458
f4788adc
KW
1459 assert(file == NULL);
1460 ret = bdrv_open_image(&file, filename, options, "file",
1461 bdrv_inherited_flags(flags),
1462 true, &local_err);
1463 if (ret < 0) {
1464 goto fail;
1465 }
f500a6d3
KW
1466 }
1467
76c591b0
KW
1468 /* Image format probing */
1469 if (!drv && file) {
17b005f1
KW
1470 ret = find_image_format(file, filename, &drv, &local_err);
1471 if (ret < 0) {
8bfea15d 1472 goto fail;
2a05cbe4 1473 }
76c591b0 1474 } else if (!drv) {
17b005f1
KW
1475 error_setg(errp, "Must specify either driver or file");
1476 ret = -EINVAL;
8bfea15d 1477 goto fail;
ea2384d3 1478 }
b6ce07aa
KW
1479
1480 /* Open the image */
34b5d2c6 1481 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
b6ce07aa 1482 if (ret < 0) {
8bfea15d 1483 goto fail;
6987307c
CH
1484 }
1485
2a05cbe4 1486 if (file && (bs->file != file)) {
4f6fd349 1487 bdrv_unref(file);
f500a6d3
KW
1488 file = NULL;
1489 }
1490
b6ce07aa 1491 /* If there is a backing file, use it */
9156df12 1492 if ((flags & BDRV_O_NO_BACKING) == 0) {
31ca6d07
KW
1493 QDict *backing_options;
1494
5726d872 1495 qdict_extract_subqdict(options, &backing_options, "backing.");
34b5d2c6 1496 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
b6ce07aa 1497 if (ret < 0) {
b6ad491a 1498 goto close_and_fail;
b6ce07aa 1499 }
b6ce07aa
KW
1500 }
1501
91af7014
HR
1502 bdrv_refresh_filename(bs);
1503
b998875d
KW
1504 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1505 * temporary snapshot afterwards. */
b1e6fc08 1506 if (snapshot_flags) {
6b8aeca5 1507 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
b998875d 1508 if (local_err) {
b998875d
KW
1509 goto close_and_fail;
1510 }
1511 }
1512
b6ad491a 1513 /* Check if any unknown options were used */
5acd9d81 1514 if (options && (qdict_size(options) != 0)) {
b6ad491a 1515 const QDictEntry *entry = qdict_first(options);
5acd9d81
HR
1516 if (flags & BDRV_O_PROTOCOL) {
1517 error_setg(errp, "Block protocol '%s' doesn't support the option "
1518 "'%s'", drv->format_name, entry->key);
1519 } else {
1520 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1521 "support the option '%s'", drv->format_name,
bfb197e0 1522 bdrv_get_device_name(bs), entry->key);
5acd9d81 1523 }
b6ad491a
KW
1524
1525 ret = -EINVAL;
1526 goto close_and_fail;
1527 }
b6ad491a 1528
b6ce07aa 1529 if (!bdrv_key_required(bs)) {
7d4b4ba5 1530 bdrv_dev_change_media_cb(bs, true);
c3adb58f
MA
1531 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1532 && !runstate_check(RUN_STATE_INMIGRATE)
1533 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1534 error_setg(errp,
1535 "Guest must be stopped for opening of encrypted image");
1536 ret = -EBUSY;
1537 goto close_and_fail;
b6ce07aa
KW
1538 }
1539
c3adb58f 1540 QDECREF(options);
f67503e5 1541 *pbs = bs;
b6ce07aa
KW
1542 return 0;
1543
8bfea15d 1544fail:
f500a6d3 1545 if (file != NULL) {
4f6fd349 1546 bdrv_unref(file);
f500a6d3 1547 }
de9c0cec 1548 QDECREF(bs->options);
b6ad491a 1549 QDECREF(options);
de9c0cec 1550 bs->options = NULL;
f67503e5
HR
1551 if (!*pbs) {
1552 /* If *pbs is NULL, a new BDS has been created in this function and
1553 needs to be freed now. Otherwise, it does not need to be closed,
1554 since it has not really been opened yet. */
1555 bdrv_unref(bs);
1556 }
84d18f06 1557 if (local_err) {
34b5d2c6
HR
1558 error_propagate(errp, local_err);
1559 }
b6ad491a 1560 return ret;
de9c0cec 1561
b6ad491a 1562close_and_fail:
f67503e5
HR
1563 /* See fail path, but now the BDS has to be always closed */
1564 if (*pbs) {
1565 bdrv_close(bs);
1566 } else {
1567 bdrv_unref(bs);
1568 }
b6ad491a 1569 QDECREF(options);
84d18f06 1570 if (local_err) {
34b5d2c6
HR
1571 error_propagate(errp, local_err);
1572 }
b6ce07aa
KW
1573 return ret;
1574}
1575
e971aa12
JC
1576typedef struct BlockReopenQueueEntry {
1577 bool prepared;
1578 BDRVReopenState state;
1579 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1580} BlockReopenQueueEntry;
1581
1582/*
1583 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1584 * reopen of multiple devices.
1585 *
1586 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1587 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1588 * be created and initialized. This newly created BlockReopenQueue should be
1589 * passed back in for subsequent calls that are intended to be of the same
1590 * atomic 'set'.
1591 *
1592 * bs is the BlockDriverState to add to the reopen queue.
1593 *
1594 * flags contains the open flags for the associated bs
1595 *
1596 * returns a pointer to bs_queue, which is either the newly allocated
1597 * bs_queue, or the existing bs_queue being used.
1598 *
1599 */
1600BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1601 BlockDriverState *bs, int flags)
1602{
1603 assert(bs != NULL);
1604
1605 BlockReopenQueueEntry *bs_entry;
1606 if (bs_queue == NULL) {
1607 bs_queue = g_new0(BlockReopenQueue, 1);
1608 QSIMPLEQ_INIT(bs_queue);
1609 }
1610
f1f25a2e
KW
1611 /* bdrv_open() masks this flag out */
1612 flags &= ~BDRV_O_PROTOCOL;
1613
e971aa12 1614 if (bs->file) {
f1f25a2e 1615 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1616 }
1617
1618 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1619 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1620
1621 bs_entry->state.bs = bs;
1622 bs_entry->state.flags = flags;
1623
1624 return bs_queue;
1625}
1626
1627/*
1628 * Reopen multiple BlockDriverStates atomically & transactionally.
1629 *
1630 * The queue passed in (bs_queue) must have been built up previous
1631 * via bdrv_reopen_queue().
1632 *
1633 * Reopens all BDS specified in the queue, with the appropriate
1634 * flags. All devices are prepared for reopen, and failure of any
1635 * device will cause all device changes to be abandonded, and intermediate
1636 * data cleaned up.
1637 *
1638 * If all devices prepare successfully, then the changes are committed
1639 * to all devices.
1640 *
1641 */
1642int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1643{
1644 int ret = -1;
1645 BlockReopenQueueEntry *bs_entry, *next;
1646 Error *local_err = NULL;
1647
1648 assert(bs_queue != NULL);
1649
1650 bdrv_drain_all();
1651
1652 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1653 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1654 error_propagate(errp, local_err);
1655 goto cleanup;
1656 }
1657 bs_entry->prepared = true;
1658 }
1659
1660 /* If we reach this point, we have success and just need to apply the
1661 * changes
1662 */
1663 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1664 bdrv_reopen_commit(&bs_entry->state);
1665 }
1666
1667 ret = 0;
1668
1669cleanup:
1670 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1671 if (ret && bs_entry->prepared) {
1672 bdrv_reopen_abort(&bs_entry->state);
1673 }
1674 g_free(bs_entry);
1675 }
1676 g_free(bs_queue);
1677 return ret;
1678}
1679
1680
1681/* Reopen a single BlockDriverState with the specified flags. */
1682int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1683{
1684 int ret = -1;
1685 Error *local_err = NULL;
1686 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1687
1688 ret = bdrv_reopen_multiple(queue, &local_err);
1689 if (local_err != NULL) {
1690 error_propagate(errp, local_err);
1691 }
1692 return ret;
1693}
1694
1695
1696/*
1697 * Prepares a BlockDriverState for reopen. All changes are staged in the
1698 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1699 * the block driver layer .bdrv_reopen_prepare()
1700 *
1701 * bs is the BlockDriverState to reopen
1702 * flags are the new open flags
1703 * queue is the reopen queue
1704 *
1705 * Returns 0 on success, non-zero on error. On error errp will be set
1706 * as well.
1707 *
1708 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1709 * It is the responsibility of the caller to then call the abort() or
1710 * commit() for any other BDS that have been left in a prepare() state
1711 *
1712 */
1713int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1714 Error **errp)
1715{
1716 int ret = -1;
1717 Error *local_err = NULL;
1718 BlockDriver *drv;
1719
1720 assert(reopen_state != NULL);
1721 assert(reopen_state->bs->drv != NULL);
1722 drv = reopen_state->bs->drv;
1723
1724 /* if we are to stay read-only, do not allow permission change
1725 * to r/w */
1726 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1727 reopen_state->flags & BDRV_O_RDWR) {
1728 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
bfb197e0 1729 bdrv_get_device_name(reopen_state->bs));
e971aa12
JC
1730 goto error;
1731 }
1732
1733
1734 ret = bdrv_flush(reopen_state->bs);
1735 if (ret) {
1736 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1737 strerror(-ret));
1738 goto error;
1739 }
1740
1741 if (drv->bdrv_reopen_prepare) {
1742 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1743 if (ret) {
1744 if (local_err != NULL) {
1745 error_propagate(errp, local_err);
1746 } else {
d8b6895f
LC
1747 error_setg(errp, "failed while preparing to reopen image '%s'",
1748 reopen_state->bs->filename);
e971aa12
JC
1749 }
1750 goto error;
1751 }
1752 } else {
1753 /* It is currently mandatory to have a bdrv_reopen_prepare()
1754 * handler for each supported drv. */
1755 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
bfb197e0 1756 drv->format_name, bdrv_get_device_name(reopen_state->bs),
e971aa12
JC
1757 "reopening of file");
1758 ret = -1;
1759 goto error;
1760 }
1761
1762 ret = 0;
1763
1764error:
1765 return ret;
1766}
1767
1768/*
1769 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1770 * makes them final by swapping the staging BlockDriverState contents into
1771 * the active BlockDriverState contents.
1772 */
1773void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1774{
1775 BlockDriver *drv;
1776
1777 assert(reopen_state != NULL);
1778 drv = reopen_state->bs->drv;
1779 assert(drv != NULL);
1780
1781 /* If there are any driver level actions to take */
1782 if (drv->bdrv_reopen_commit) {
1783 drv->bdrv_reopen_commit(reopen_state);
1784 }
1785
1786 /* set BDS specific flags now */
1787 reopen_state->bs->open_flags = reopen_state->flags;
1788 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1789 BDRV_O_CACHE_WB);
1790 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac 1791
3baca891 1792 bdrv_refresh_limits(reopen_state->bs, NULL);
e971aa12
JC
1793}
1794
1795/*
1796 * Abort the reopen, and delete and free the staged changes in
1797 * reopen_state
1798 */
1799void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1800{
1801 BlockDriver *drv;
1802
1803 assert(reopen_state != NULL);
1804 drv = reopen_state->bs->drv;
1805 assert(drv != NULL);
1806
1807 if (drv->bdrv_reopen_abort) {
1808 drv->bdrv_reopen_abort(reopen_state);
1809 }
1810}
1811
1812
fc01f7e7
FB
1813void bdrv_close(BlockDriverState *bs)
1814{
33384421
HR
1815 BdrvAioNotifier *ban, *ban_next;
1816
3cbc002c
PB
1817 if (bs->job) {
1818 block_job_cancel_sync(bs->job);
1819 }
58fda173
SH
1820 bdrv_drain_all(); /* complete I/O */
1821 bdrv_flush(bs);
1822 bdrv_drain_all(); /* in case flush left pending I/O */
d7d512f6 1823 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1824
3cbc002c 1825 if (bs->drv) {
557df6ac 1826 if (bs->backing_hd) {
826b6ca0
FZ
1827 BlockDriverState *backing_hd = bs->backing_hd;
1828 bdrv_set_backing_hd(bs, NULL);
1829 bdrv_unref(backing_hd);
557df6ac 1830 }
ea2384d3 1831 bs->drv->bdrv_close(bs);
7267c094 1832 g_free(bs->opaque);
ea2384d3
FB
1833 bs->opaque = NULL;
1834 bs->drv = NULL;
53fec9d3 1835 bs->copy_on_read = 0;
a275fa42
PB
1836 bs->backing_file[0] = '\0';
1837 bs->backing_format[0] = '\0';
6405875c
PB
1838 bs->total_sectors = 0;
1839 bs->encrypted = 0;
1840 bs->valid_key = 0;
1841 bs->sg = 0;
1842 bs->growable = 0;
0d51b4de 1843 bs->zero_beyond_eof = false;
de9c0cec
KW
1844 QDECREF(bs->options);
1845 bs->options = NULL;
91af7014
HR
1846 QDECREF(bs->full_open_options);
1847 bs->full_open_options = NULL;
b338082b 1848
66f82cee 1849 if (bs->file != NULL) {
4f6fd349 1850 bdrv_unref(bs->file);
0ac9377d 1851 bs->file = NULL;
66f82cee 1852 }
b338082b 1853 }
98f90dba 1854
9ca11154
PH
1855 bdrv_dev_change_media_cb(bs, false);
1856
98f90dba
ZYW
1857 /*throttling disk I/O limits*/
1858 if (bs->io_limits_enabled) {
1859 bdrv_io_limits_disable(bs);
1860 }
33384421
HR
1861
1862 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1863 g_free(ban);
1864 }
1865 QLIST_INIT(&bs->aio_notifiers);
b338082b
FB
1866}
1867
2bc93fed
MK
1868void bdrv_close_all(void)
1869{
1870 BlockDriverState *bs;
1871
dc364f4c 1872 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1873 AioContext *aio_context = bdrv_get_aio_context(bs);
1874
1875 aio_context_acquire(aio_context);
2bc93fed 1876 bdrv_close(bs);
ed78cda3 1877 aio_context_release(aio_context);
2bc93fed
MK
1878 }
1879}
1880
88266f5a
SH
1881/* Check if any requests are in-flight (including throttled requests) */
1882static bool bdrv_requests_pending(BlockDriverState *bs)
1883{
1884 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1885 return true;
1886 }
cc0681c4
BC
1887 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1888 return true;
1889 }
1890 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1891 return true;
1892 }
1893 if (bs->file && bdrv_requests_pending(bs->file)) {
1894 return true;
1895 }
1896 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1897 return true;
1898 }
1899 return false;
1900}
1901
922453bc
SH
1902/*
1903 * Wait for pending requests to complete across all BlockDriverStates
1904 *
1905 * This function does not flush data to disk, use bdrv_flush_all() for that
1906 * after calling this function.
4c355d53
ZYW
1907 *
1908 * Note that completion of an asynchronous I/O operation can trigger any
1909 * number of other I/O operations on other devices---for example a coroutine
1910 * can be arbitrarily complex and a constant flow of I/O can come until the
1911 * coroutine is complete. Because of this, it is not possible to have a
1912 * function to drain a single device's I/O queue.
922453bc
SH
1913 */
1914void bdrv_drain_all(void)
1915{
88266f5a
SH
1916 /* Always run first iteration so any pending completion BHs run */
1917 bool busy = true;
922453bc
SH
1918 BlockDriverState *bs;
1919
88266f5a 1920 while (busy) {
9b536adc
SH
1921 busy = false;
1922
dc364f4c 1923 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
9b536adc
SH
1924 AioContext *aio_context = bdrv_get_aio_context(bs);
1925 bool bs_busy;
1926
1927 aio_context_acquire(aio_context);
448ad91d 1928 bdrv_flush_io_queue(bs);
0b06ef3b 1929 bdrv_start_throttled_reqs(bs);
9b536adc
SH
1930 bs_busy = bdrv_requests_pending(bs);
1931 bs_busy |= aio_poll(aio_context, bs_busy);
1932 aio_context_release(aio_context);
922453bc 1933
9b536adc
SH
1934 busy |= bs_busy;
1935 }
922453bc
SH
1936 }
1937}
1938
dc364f4c
BC
1939/* make a BlockDriverState anonymous by removing from bdrv_state and
1940 * graph_bdrv_state list.
d22b2f41
RH
1941 Also, NULL terminate the device_name to prevent double remove */
1942void bdrv_make_anon(BlockDriverState *bs)
1943{
bfb197e0
MA
1944 /*
1945 * Take care to remove bs from bdrv_states only when it's actually
1946 * in it. Note that bs->device_list.tqe_prev is initially null,
1947 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
1948 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1949 * resetting it to null on remove.
1950 */
1951 if (bs->device_list.tqe_prev) {
dc364f4c 1952 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
bfb197e0 1953 bs->device_list.tqe_prev = NULL;
d22b2f41 1954 }
dc364f4c
BC
1955 if (bs->node_name[0] != '\0') {
1956 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1957 }
1958 bs->node_name[0] = '\0';
d22b2f41
RH
1959}
1960
e023b2e2
PB
1961static void bdrv_rebind(BlockDriverState *bs)
1962{
1963 if (bs->drv && bs->drv->bdrv_rebind) {
1964 bs->drv->bdrv_rebind(bs);
1965 }
1966}
1967
4ddc07ca
PB
1968static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1969 BlockDriverState *bs_src)
8802d1fd 1970{
4ddc07ca 1971 /* move some fields that need to stay attached to the device */
8802d1fd
JC
1972
1973 /* dev info */
4ddc07ca
PB
1974 bs_dest->dev_ops = bs_src->dev_ops;
1975 bs_dest->dev_opaque = bs_src->dev_opaque;
1976 bs_dest->dev = bs_src->dev;
1b7fd729 1977 bs_dest->guest_block_size = bs_src->guest_block_size;
4ddc07ca 1978 bs_dest->copy_on_read = bs_src->copy_on_read;
8802d1fd 1979
4ddc07ca 1980 bs_dest->enable_write_cache = bs_src->enable_write_cache;
c4a248a1 1981
cc0681c4
BC
1982 /* i/o throttled req */
1983 memcpy(&bs_dest->throttle_state,
1984 &bs_src->throttle_state,
1985 sizeof(ThrottleState));
1986 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
1987 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
4ddc07ca 1988 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
8802d1fd 1989
8802d1fd 1990 /* r/w error */
4ddc07ca
PB
1991 bs_dest->on_read_error = bs_src->on_read_error;
1992 bs_dest->on_write_error = bs_src->on_write_error;
8802d1fd
JC
1993
1994 /* i/o status */
4ddc07ca
PB
1995 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
1996 bs_dest->iostatus = bs_src->iostatus;
8802d1fd 1997
a9fc4408 1998 /* dirty bitmap */
e4654d2d 1999 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
a9fc4408 2000
9fcb0251
FZ
2001 /* reference count */
2002 bs_dest->refcnt = bs_src->refcnt;
2003
a9fc4408 2004 /* job */
4ddc07ca 2005 bs_dest->job = bs_src->job;
a9fc4408 2006
8802d1fd 2007 /* keep the same entry in bdrv_states */
dc364f4c 2008 bs_dest->device_list = bs_src->device_list;
7e7d56d9
MA
2009 bs_dest->blk = bs_src->blk;
2010
fbe40ff7
FZ
2011 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2012 sizeof(bs_dest->op_blockers));
4ddc07ca 2013}
8802d1fd 2014
4ddc07ca
PB
2015/*
2016 * Swap bs contents for two image chains while they are live,
2017 * while keeping required fields on the BlockDriverState that is
2018 * actually attached to a device.
2019 *
2020 * This will modify the BlockDriverState fields, and swap contents
2021 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2022 *
bfb197e0 2023 * bs_new must not be attached to a BlockBackend.
4ddc07ca
PB
2024 *
2025 * This function does not create any image files.
2026 */
2027void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2028{
2029 BlockDriverState tmp;
f6801b83 2030
90ce8a06
BC
2031 /* The code needs to swap the node_name but simply swapping node_list won't
2032 * work so first remove the nodes from the graph list, do the swap then
2033 * insert them back if needed.
2034 */
2035 if (bs_new->node_name[0] != '\0') {
2036 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2037 }
2038 if (bs_old->node_name[0] != '\0') {
2039 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2040 }
2041
bfb197e0 2042 /* bs_new must be unattached and shouldn't have anything fancy enabled */
7e7d56d9 2043 assert(!bs_new->blk);
e4654d2d 2044 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
4ddc07ca
PB
2045 assert(bs_new->job == NULL);
2046 assert(bs_new->dev == NULL);
4ddc07ca 2047 assert(bs_new->io_limits_enabled == false);
cc0681c4 2048 assert(!throttle_have_timer(&bs_new->throttle_state));
8802d1fd 2049
4ddc07ca
PB
2050 tmp = *bs_new;
2051 *bs_new = *bs_old;
2052 *bs_old = tmp;
a9fc4408 2053
4ddc07ca
PB
2054 /* there are some fields that should not be swapped, move them back */
2055 bdrv_move_feature_fields(&tmp, bs_old);
2056 bdrv_move_feature_fields(bs_old, bs_new);
2057 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 2058
bfb197e0 2059 /* bs_new must remain unattached */
7e7d56d9 2060 assert(!bs_new->blk);
4ddc07ca
PB
2061
2062 /* Check a few fields that should remain attached to the device */
2063 assert(bs_new->dev == NULL);
2064 assert(bs_new->job == NULL);
4ddc07ca 2065 assert(bs_new->io_limits_enabled == false);
cc0681c4 2066 assert(!throttle_have_timer(&bs_new->throttle_state));
e023b2e2 2067
90ce8a06
BC
2068 /* insert the nodes back into the graph node list if needed */
2069 if (bs_new->node_name[0] != '\0') {
2070 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2071 }
2072 if (bs_old->node_name[0] != '\0') {
2073 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2074 }
2075
e023b2e2 2076 bdrv_rebind(bs_new);
4ddc07ca
PB
2077 bdrv_rebind(bs_old);
2078}
2079
2080/*
2081 * Add new bs contents at the top of an image chain while the chain is
2082 * live, while keeping required fields on the top layer.
2083 *
2084 * This will modify the BlockDriverState fields, and swap contents
2085 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2086 *
bfb197e0 2087 * bs_new must not be attached to a BlockBackend.
4ddc07ca
PB
2088 *
2089 * This function does not create any image files.
2090 */
2091void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2092{
2093 bdrv_swap(bs_new, bs_top);
2094
2095 /* The contents of 'tmp' will become bs_top, as we are
2096 * swapping bs_new and bs_top contents. */
8d24cce1 2097 bdrv_set_backing_hd(bs_top, bs_new);
8802d1fd
JC
2098}
2099
4f6fd349 2100static void bdrv_delete(BlockDriverState *bs)
b338082b 2101{
fa879d62 2102 assert(!bs->dev);
3e914655 2103 assert(!bs->job);
3718d8ab 2104 assert(bdrv_op_blocker_is_empty(bs));
4f6fd349 2105 assert(!bs->refcnt);
e4654d2d 2106 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 2107
e1b5c52e
SH
2108 bdrv_close(bs);
2109
1b7bdbc1 2110 /* remove from list, if necessary */
d22b2f41 2111 bdrv_make_anon(bs);
34c6f050 2112
7267c094 2113 g_free(bs);
fc01f7e7
FB
2114}
2115
fa879d62
MA
2116int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2117/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 2118{
fa879d62 2119 if (bs->dev) {
18846dee
MA
2120 return -EBUSY;
2121 }
fa879d62 2122 bs->dev = dev;
28a7282a 2123 bdrv_iostatus_reset(bs);
2a87151f
SH
2124
2125 /* We're expecting I/O from the device so bump up coroutine pool size */
2126 qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION);
18846dee
MA
2127 return 0;
2128}
2129
fa879d62
MA
2130/* TODO qdevified devices don't use this, remove when devices are qdevified */
2131void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 2132{
fa879d62
MA
2133 if (bdrv_attach_dev(bs, dev) < 0) {
2134 abort();
2135 }
2136}
2137
2138void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2139/* TODO change to DeviceState *dev when all users are qdevified */
2140{
2141 assert(bs->dev == dev);
2142 bs->dev = NULL;
0e49de52
MA
2143 bs->dev_ops = NULL;
2144 bs->dev_opaque = NULL;
1b7fd729 2145 bs->guest_block_size = 512;
2a87151f 2146 qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION);
18846dee
MA
2147}
2148
fa879d62
MA
2149/* TODO change to return DeviceState * when all users are qdevified */
2150void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 2151{
fa879d62 2152 return bs->dev;
18846dee
MA
2153}
2154
0e49de52
MA
2155void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2156 void *opaque)
2157{
2158 bs->dev_ops = ops;
2159 bs->dev_opaque = opaque;
2160}
2161
7d4b4ba5 2162static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 2163{
145feb17 2164 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 2165 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 2166 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
2167 if (tray_was_closed) {
2168 /* tray open */
a5ee7bd4
WX
2169 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2170 true, &error_abort);
6f382ed2
LC
2171 }
2172 if (load) {
2173 /* tray close */
a5ee7bd4
WX
2174 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2175 false, &error_abort);
6f382ed2 2176 }
145feb17
MA
2177 }
2178}
2179
2c6942fa
MA
2180bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2181{
2182 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2183}
2184
025ccaa7
PB
2185void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2186{
2187 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2188 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2189 }
2190}
2191
e4def80b
MA
2192bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2193{
2194 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2195 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2196 }
2197 return false;
2198}
2199
145feb17
MA
2200static void bdrv_dev_resize_cb(BlockDriverState *bs)
2201{
2202 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2203 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
2204 }
2205}
2206
f107639a
MA
2207bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2208{
2209 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2210 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2211 }
2212 return false;
2213}
2214
e97fc193
AL
2215/*
2216 * Run consistency checks on an image
2217 *
e076f338 2218 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2219 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2220 * check are stored in res.
e97fc193 2221 */
4534ff54 2222int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193 2223{
908bcd54
HR
2224 if (bs->drv == NULL) {
2225 return -ENOMEDIUM;
2226 }
e97fc193
AL
2227 if (bs->drv->bdrv_check == NULL) {
2228 return -ENOTSUP;
2229 }
2230
e076f338 2231 memset(res, 0, sizeof(*res));
4534ff54 2232 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2233}
2234
8a426614
KW
2235#define COMMIT_BUF_SECTORS 2048
2236
33e3963e
FB
2237/* commit COW file into the raw image */
2238int bdrv_commit(BlockDriverState *bs)
2239{
19cb3738 2240 BlockDriver *drv = bs->drv;
72706ea4 2241 int64_t sector, total_sectors, length, backing_length;
8a426614 2242 int n, ro, open_flags;
0bce597d 2243 int ret = 0;
72706ea4 2244 uint8_t *buf = NULL;
c2cba3d9 2245 char filename[PATH_MAX];
33e3963e 2246
19cb3738
FB
2247 if (!drv)
2248 return -ENOMEDIUM;
6bb45158 2249
4dca4b63
NS
2250 if (!bs->backing_hd) {
2251 return -ENOTSUP;
33e3963e
FB
2252 }
2253
3718d8ab
FZ
2254 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2255 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2d3735d3
SH
2256 return -EBUSY;
2257 }
2258
4dca4b63 2259 ro = bs->backing_hd->read_only;
c2cba3d9
JM
2260 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2261 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
4dca4b63
NS
2262 open_flags = bs->backing_hd->open_flags;
2263
2264 if (ro) {
0bce597d
JC
2265 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2266 return -EACCES;
4dca4b63 2267 }
ea2384d3 2268 }
33e3963e 2269
72706ea4
JC
2270 length = bdrv_getlength(bs);
2271 if (length < 0) {
2272 ret = length;
2273 goto ro_cleanup;
2274 }
2275
2276 backing_length = bdrv_getlength(bs->backing_hd);
2277 if (backing_length < 0) {
2278 ret = backing_length;
2279 goto ro_cleanup;
2280 }
2281
2282 /* If our top snapshot is larger than the backing file image,
2283 * grow the backing file image if possible. If not possible,
2284 * we must return an error */
2285 if (length > backing_length) {
2286 ret = bdrv_truncate(bs->backing_hd, length);
2287 if (ret < 0) {
2288 goto ro_cleanup;
2289 }
2290 }
2291
2292 total_sectors = length >> BDRV_SECTOR_BITS;
857d4f46
KW
2293
2294 /* qemu_try_blockalign() for bs will choose an alignment that works for
2295 * bs->backing_hd as well, so no need to compare the alignment manually. */
2296 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2297 if (buf == NULL) {
2298 ret = -ENOMEM;
2299 goto ro_cleanup;
2300 }
8a426614
KW
2301
2302 for (sector = 0; sector < total_sectors; sector += n) {
d663640c
PB
2303 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2304 if (ret < 0) {
2305 goto ro_cleanup;
2306 }
2307 if (ret) {
dabfa6cc
KW
2308 ret = bdrv_read(bs, sector, buf, n);
2309 if (ret < 0) {
8a426614
KW
2310 goto ro_cleanup;
2311 }
2312
dabfa6cc
KW
2313 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2314 if (ret < 0) {
8a426614
KW
2315 goto ro_cleanup;
2316 }
ea2384d3 2317 }
33e3963e 2318 }
95389c86 2319
1d44952f
CH
2320 if (drv->bdrv_make_empty) {
2321 ret = drv->bdrv_make_empty(bs);
dabfa6cc
KW
2322 if (ret < 0) {
2323 goto ro_cleanup;
2324 }
1d44952f
CH
2325 bdrv_flush(bs);
2326 }
95389c86 2327
3f5075ae
CH
2328 /*
2329 * Make sure all data we wrote to the backing device is actually
2330 * stable on disk.
2331 */
dabfa6cc 2332 if (bs->backing_hd) {
3f5075ae 2333 bdrv_flush(bs->backing_hd);
dabfa6cc 2334 }
4dca4b63 2335
dabfa6cc 2336 ret = 0;
4dca4b63 2337ro_cleanup:
857d4f46 2338 qemu_vfree(buf);
4dca4b63
NS
2339
2340 if (ro) {
0bce597d
JC
2341 /* ignoring error return here */
2342 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
2343 }
2344
1d44952f 2345 return ret;
33e3963e
FB
2346}
2347
e8877497 2348int bdrv_commit_all(void)
6ab4b5ab
MA
2349{
2350 BlockDriverState *bs;
2351
dc364f4c 2352 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
2353 AioContext *aio_context = bdrv_get_aio_context(bs);
2354
2355 aio_context_acquire(aio_context);
272d2d8e
JC
2356 if (bs->drv && bs->backing_hd) {
2357 int ret = bdrv_commit(bs);
2358 if (ret < 0) {
ed78cda3 2359 aio_context_release(aio_context);
272d2d8e
JC
2360 return ret;
2361 }
e8877497 2362 }
ed78cda3 2363 aio_context_release(aio_context);
6ab4b5ab 2364 }
e8877497 2365 return 0;
6ab4b5ab
MA
2366}
2367
dbffbdcf
SH
2368/**
2369 * Remove an active request from the tracked requests list
2370 *
2371 * This function should be called when a tracked request is completing.
2372 */
2373static void tracked_request_end(BdrvTrackedRequest *req)
2374{
2dbafdc0
KW
2375 if (req->serialising) {
2376 req->bs->serialising_in_flight--;
2377 }
2378
dbffbdcf 2379 QLIST_REMOVE(req, list);
f4658285 2380 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
2381}
2382
2383/**
2384 * Add an active request to the tracked requests list
2385 */
2386static void tracked_request_begin(BdrvTrackedRequest *req,
2387 BlockDriverState *bs,
793ed47a
KW
2388 int64_t offset,
2389 unsigned int bytes, bool is_write)
dbffbdcf
SH
2390{
2391 *req = (BdrvTrackedRequest){
2392 .bs = bs,
2dbafdc0
KW
2393 .offset = offset,
2394 .bytes = bytes,
2395 .is_write = is_write,
2396 .co = qemu_coroutine_self(),
2397 .serialising = false,
7327145f
KW
2398 .overlap_offset = offset,
2399 .overlap_bytes = bytes,
dbffbdcf
SH
2400 };
2401
f4658285
SH
2402 qemu_co_queue_init(&req->wait_queue);
2403
dbffbdcf
SH
2404 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2405}
2406
e96126ff 2407static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2dbafdc0 2408{
7327145f 2409 int64_t overlap_offset = req->offset & ~(align - 1);
e96126ff
KW
2410 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2411 - overlap_offset;
7327145f 2412
2dbafdc0
KW
2413 if (!req->serialising) {
2414 req->bs->serialising_in_flight++;
2415 req->serialising = true;
2416 }
7327145f
KW
2417
2418 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2419 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2dbafdc0
KW
2420}
2421
d83947ac
SH
2422/**
2423 * Round a region to cluster boundaries
2424 */
343bded4
PB
2425void bdrv_round_to_clusters(BlockDriverState *bs,
2426 int64_t sector_num, int nb_sectors,
2427 int64_t *cluster_sector_num,
2428 int *cluster_nb_sectors)
d83947ac
SH
2429{
2430 BlockDriverInfo bdi;
2431
2432 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2433 *cluster_sector_num = sector_num;
2434 *cluster_nb_sectors = nb_sectors;
2435 } else {
2436 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2437 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2438 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2439 nb_sectors, c);
2440 }
2441}
2442
7327145f 2443static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2444{
2445 BlockDriverInfo bdi;
7327145f 2446 int ret;
793ed47a 2447
7327145f
KW
2448 ret = bdrv_get_info(bs, &bdi);
2449 if (ret < 0 || bdi.cluster_size == 0) {
2450 return bs->request_alignment;
793ed47a 2451 } else {
7327145f 2452 return bdi.cluster_size;
793ed47a
KW
2453 }
2454}
2455
f4658285 2456static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2457 int64_t offset, unsigned int bytes)
2458{
d83947ac 2459 /* aaaa bbbb */
7327145f 2460 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2461 return false;
2462 }
2463 /* bbbb aaaa */
7327145f 2464 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2465 return false;
2466 }
2467 return true;
f4658285
SH
2468}
2469
28de2dcd 2470static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
f4658285 2471{
2dbafdc0 2472 BlockDriverState *bs = self->bs;
f4658285
SH
2473 BdrvTrackedRequest *req;
2474 bool retry;
28de2dcd 2475 bool waited = false;
f4658285 2476
2dbafdc0 2477 if (!bs->serialising_in_flight) {
28de2dcd 2478 return false;
2dbafdc0
KW
2479 }
2480
f4658285
SH
2481 do {
2482 retry = false;
2483 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2dbafdc0 2484 if (req == self || (!req->serialising && !self->serialising)) {
65afd211
KW
2485 continue;
2486 }
7327145f
KW
2487 if (tracked_request_overlaps(req, self->overlap_offset,
2488 self->overlap_bytes))
2489 {
5f8b6491
SH
2490 /* Hitting this means there was a reentrant request, for
2491 * example, a block driver issuing nested requests. This must
2492 * never happen since it means deadlock.
2493 */
2494 assert(qemu_coroutine_self() != req->co);
2495
6460440f
KW
2496 /* If the request is already (indirectly) waiting for us, or
2497 * will wait for us as soon as it wakes up, then just go on
2498 * (instead of producing a deadlock in the former case). */
2499 if (!req->waiting_for) {
2500 self->waiting_for = req;
2501 qemu_co_queue_wait(&req->wait_queue);
2502 self->waiting_for = NULL;
2503 retry = true;
28de2dcd 2504 waited = true;
6460440f
KW
2505 break;
2506 }
f4658285
SH
2507 }
2508 }
2509 } while (retry);
28de2dcd
KW
2510
2511 return waited;
f4658285
SH
2512}
2513
756e6736
KW
2514/*
2515 * Return values:
2516 * 0 - success
2517 * -EINVAL - backing format specified, but no file
2518 * -ENOSPC - can't update the backing file because no space is left in the
2519 * image file header
2520 * -ENOTSUP - format driver doesn't support changing the backing file
2521 */
2522int bdrv_change_backing_file(BlockDriverState *bs,
2523 const char *backing_file, const char *backing_fmt)
2524{
2525 BlockDriver *drv = bs->drv;
469ef350 2526 int ret;
756e6736 2527
5f377794
PB
2528 /* Backing file format doesn't make sense without a backing file */
2529 if (backing_fmt && !backing_file) {
2530 return -EINVAL;
2531 }
2532
756e6736 2533 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2534 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2535 } else {
469ef350 2536 ret = -ENOTSUP;
756e6736 2537 }
469ef350
PB
2538
2539 if (ret == 0) {
2540 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2541 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2542 }
2543 return ret;
756e6736
KW
2544}
2545
6ebdcee2
JC
2546/*
2547 * Finds the image layer in the chain that has 'bs' as its backing file.
2548 *
2549 * active is the current topmost image.
2550 *
2551 * Returns NULL if bs is not found in active's image chain,
2552 * or if active == bs.
4caf0fcd
JC
2553 *
2554 * Returns the bottommost base image if bs == NULL.
6ebdcee2
JC
2555 */
2556BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2557 BlockDriverState *bs)
2558{
4caf0fcd
JC
2559 while (active && bs != active->backing_hd) {
2560 active = active->backing_hd;
6ebdcee2
JC
2561 }
2562
4caf0fcd
JC
2563 return active;
2564}
6ebdcee2 2565
4caf0fcd
JC
2566/* Given a BDS, searches for the base layer. */
2567BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2568{
2569 return bdrv_find_overlay(bs, NULL);
6ebdcee2
JC
2570}
2571
2572typedef struct BlkIntermediateStates {
2573 BlockDriverState *bs;
2574 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2575} BlkIntermediateStates;
2576
2577
2578/*
2579 * Drops images above 'base' up to and including 'top', and sets the image
2580 * above 'top' to have base as its backing file.
2581 *
2582 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2583 * information in 'bs' can be properly updated.
2584 *
2585 * E.g., this will convert the following chain:
2586 * bottom <- base <- intermediate <- top <- active
2587 *
2588 * to
2589 *
2590 * bottom <- base <- active
2591 *
2592 * It is allowed for bottom==base, in which case it converts:
2593 *
2594 * base <- intermediate <- top <- active
2595 *
2596 * to
2597 *
2598 * base <- active
2599 *
54e26900
JC
2600 * If backing_file_str is non-NULL, it will be used when modifying top's
2601 * overlay image metadata.
2602 *
6ebdcee2
JC
2603 * Error conditions:
2604 * if active == top, that is considered an error
2605 *
2606 */
2607int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
54e26900 2608 BlockDriverState *base, const char *backing_file_str)
6ebdcee2
JC
2609{
2610 BlockDriverState *intermediate;
2611 BlockDriverState *base_bs = NULL;
2612 BlockDriverState *new_top_bs = NULL;
2613 BlkIntermediateStates *intermediate_state, *next;
2614 int ret = -EIO;
2615
2616 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2617 QSIMPLEQ_INIT(&states_to_delete);
2618
2619 if (!top->drv || !base->drv) {
2620 goto exit;
2621 }
2622
2623 new_top_bs = bdrv_find_overlay(active, top);
2624
2625 if (new_top_bs == NULL) {
2626 /* we could not find the image above 'top', this is an error */
2627 goto exit;
2628 }
2629
2630 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2631 * to do, no intermediate images */
2632 if (new_top_bs->backing_hd == base) {
2633 ret = 0;
2634 goto exit;
2635 }
2636
2637 intermediate = top;
2638
2639 /* now we will go down through the list, and add each BDS we find
2640 * into our deletion queue, until we hit the 'base'
2641 */
2642 while (intermediate) {
5839e53b 2643 intermediate_state = g_new0(BlkIntermediateStates, 1);
6ebdcee2
JC
2644 intermediate_state->bs = intermediate;
2645 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2646
2647 if (intermediate->backing_hd == base) {
2648 base_bs = intermediate->backing_hd;
2649 break;
2650 }
2651 intermediate = intermediate->backing_hd;
2652 }
2653 if (base_bs == NULL) {
2654 /* something went wrong, we did not end at the base. safely
2655 * unravel everything, and exit with error */
2656 goto exit;
2657 }
2658
2659 /* success - we can delete the intermediate states, and link top->base */
54e26900
JC
2660 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2661 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
6ebdcee2
JC
2662 base_bs->drv ? base_bs->drv->format_name : "");
2663 if (ret) {
2664 goto exit;
2665 }
920beae1 2666 bdrv_set_backing_hd(new_top_bs, base_bs);
6ebdcee2
JC
2667
2668 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2669 /* so that bdrv_close() does not recursively close the chain */
920beae1 2670 bdrv_set_backing_hd(intermediate_state->bs, NULL);
4f6fd349 2671 bdrv_unref(intermediate_state->bs);
6ebdcee2
JC
2672 }
2673 ret = 0;
2674
2675exit:
2676 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2677 g_free(intermediate_state);
2678 }
2679 return ret;
2680}
2681
2682
71d0770c
AL
2683static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2684 size_t size)
2685{
2686 int64_t len;
2687
1dd3a447
KW
2688 if (size > INT_MAX) {
2689 return -EIO;
2690 }
2691
71d0770c
AL
2692 if (!bdrv_is_inserted(bs))
2693 return -ENOMEDIUM;
2694
2695 if (bs->growable)
2696 return 0;
2697
2698 len = bdrv_getlength(bs);
2699
fbb7b4e0
KW
2700 if (offset < 0)
2701 return -EIO;
2702
2703 if ((offset > len) || (len - offset < size))
71d0770c
AL
2704 return -EIO;
2705
2706 return 0;
2707}
2708
2709static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2710 int nb_sectors)
2711{
54db38a4 2712 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
8f4754ed
KW
2713 return -EIO;
2714 }
2715
eb5a3165
JS
2716 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2717 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
2718}
2719
1c9805a3
SH
2720typedef struct RwCo {
2721 BlockDriverState *bs;
775aa8b6 2722 int64_t offset;
1c9805a3
SH
2723 QEMUIOVector *qiov;
2724 bool is_write;
2725 int ret;
4105eaaa 2726 BdrvRequestFlags flags;
1c9805a3
SH
2727} RwCo;
2728
2729static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2730{
1c9805a3 2731 RwCo *rwco = opaque;
ea2384d3 2732
1c9805a3 2733 if (!rwco->is_write) {
775aa8b6
KW
2734 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2735 rwco->qiov->size, rwco->qiov,
4105eaaa 2736 rwco->flags);
775aa8b6
KW
2737 } else {
2738 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2739 rwco->qiov->size, rwco->qiov,
2740 rwco->flags);
1c9805a3
SH
2741 }
2742}
e7a8a783 2743
1c9805a3 2744/*
8d3b1a2d 2745 * Process a vectored synchronous request using coroutines
1c9805a3 2746 */
775aa8b6
KW
2747static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2748 QEMUIOVector *qiov, bool is_write,
2749 BdrvRequestFlags flags)
1c9805a3 2750{
1c9805a3
SH
2751 Coroutine *co;
2752 RwCo rwco = {
2753 .bs = bs,
775aa8b6 2754 .offset = offset,
8d3b1a2d 2755 .qiov = qiov,
1c9805a3
SH
2756 .is_write = is_write,
2757 .ret = NOT_DONE,
4105eaaa 2758 .flags = flags,
1c9805a3 2759 };
e7a8a783 2760
498e386c
ZYW
2761 /**
2762 * In sync call context, when the vcpu is blocked, this throttling timer
2763 * will not fire; so the I/O throttling function has to be disabled here
2764 * if it has been enabled.
2765 */
2766 if (bs->io_limits_enabled) {
2767 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2768 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2769 bdrv_io_limits_disable(bs);
2770 }
2771
1c9805a3
SH
2772 if (qemu_in_coroutine()) {
2773 /* Fast-path if already in coroutine context */
2774 bdrv_rw_co_entry(&rwco);
2775 } else {
2572b37a
SH
2776 AioContext *aio_context = bdrv_get_aio_context(bs);
2777
1c9805a3
SH
2778 co = qemu_coroutine_create(bdrv_rw_co_entry);
2779 qemu_coroutine_enter(co, &rwco);
2780 while (rwco.ret == NOT_DONE) {
2572b37a 2781 aio_poll(aio_context, true);
1c9805a3
SH
2782 }
2783 }
2784 return rwco.ret;
2785}
b338082b 2786
8d3b1a2d
KW
2787/*
2788 * Process a synchronous request using coroutines
2789 */
2790static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2791 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2792{
2793 QEMUIOVector qiov;
2794 struct iovec iov = {
2795 .iov_base = (void *)buf,
2796 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2797 };
2798
da15ee51
KW
2799 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2800 return -EINVAL;
2801 }
2802
8d3b1a2d 2803 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2804 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2805 &qiov, is_write, flags);
8d3b1a2d
KW
2806}
2807
1c9805a3
SH
2808/* return < 0 if error. See bdrv_write() for the return codes */
2809int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2810 uint8_t *buf, int nb_sectors)
2811{
4105eaaa 2812 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
fc01f7e7
FB
2813}
2814
07d27a44
MA
2815/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2816int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2817 uint8_t *buf, int nb_sectors)
2818{
2819 bool enabled;
2820 int ret;
2821
2822 enabled = bs->io_limits_enabled;
2823 bs->io_limits_enabled = false;
4e7395e8 2824 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2825 bs->io_limits_enabled = enabled;
2826 return ret;
2827}
2828
5fafdf24 2829/* Return < 0 if error. Important errors are:
19cb3738
FB
2830 -EIO generic I/O error (may happen for all errors)
2831 -ENOMEDIUM No media inserted.
2832 -EINVAL Invalid sector number or nb_sectors
2833 -EACCES Trying to write a read-only device
2834*/
5fafdf24 2835int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2836 const uint8_t *buf, int nb_sectors)
2837{
4105eaaa 2838 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
83f64091
FB
2839}
2840
aa7bfbff
PL
2841int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2842 int nb_sectors, BdrvRequestFlags flags)
4105eaaa
PL
2843{
2844 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
aa7bfbff 2845 BDRV_REQ_ZERO_WRITE | flags);
8d3b1a2d
KW
2846}
2847
d75cbb5e
PL
2848/*
2849 * Completely zero out a block device with the help of bdrv_write_zeroes.
2850 * The operation is sped up by checking the block status and only writing
2851 * zeroes to the device if they currently do not return zeroes. Optional
2852 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2853 *
2854 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2855 */
2856int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2857{
d32f7c10 2858 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
d75cbb5e
PL
2859 int n;
2860
d32f7c10
MA
2861 target_sectors = bdrv_nb_sectors(bs);
2862 if (target_sectors < 0) {
2863 return target_sectors;
9ce10c0b 2864 }
9ce10c0b 2865
d75cbb5e 2866 for (;;) {
d32f7c10 2867 nb_sectors = target_sectors - sector_num;
d75cbb5e
PL
2868 if (nb_sectors <= 0) {
2869 return 0;
2870 }
2871 if (nb_sectors > INT_MAX) {
2872 nb_sectors = INT_MAX;
2873 }
2874 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
3d94ce60
PL
2875 if (ret < 0) {
2876 error_report("error getting block status at sector %" PRId64 ": %s",
2877 sector_num, strerror(-ret));
2878 return ret;
2879 }
d75cbb5e
PL
2880 if (ret & BDRV_BLOCK_ZERO) {
2881 sector_num += n;
2882 continue;
2883 }
2884 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2885 if (ret < 0) {
2886 error_report("error writing zeroes at sector %" PRId64 ": %s",
2887 sector_num, strerror(-ret));
2888 return ret;
2889 }
2890 sector_num += n;
2891 }
2892}
2893
a3ef6571 2894int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2895{
a3ef6571
KW
2896 QEMUIOVector qiov;
2897 struct iovec iov = {
2898 .iov_base = (void *)buf,
2899 .iov_len = bytes,
2900 };
9a8c4cce 2901 int ret;
83f64091 2902
a3ef6571
KW
2903 if (bytes < 0) {
2904 return -EINVAL;
83f64091
FB
2905 }
2906
a3ef6571
KW
2907 qemu_iovec_init_external(&qiov, &iov, 1);
2908 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2909 if (ret < 0) {
2910 return ret;
83f64091 2911 }
a3ef6571
KW
2912
2913 return bytes;
83f64091
FB
2914}
2915
8d3b1a2d 2916int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2917{
9a8c4cce 2918 int ret;
83f64091 2919
8407d5d7
KW
2920 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2921 if (ret < 0) {
2922 return ret;
83f64091
FB
2923 }
2924
8d3b1a2d
KW
2925 return qiov->size;
2926}
2927
2928int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2929 const void *buf, int bytes)
8d3b1a2d
KW
2930{
2931 QEMUIOVector qiov;
2932 struct iovec iov = {
2933 .iov_base = (void *) buf,
8407d5d7 2934 .iov_len = bytes,
8d3b1a2d
KW
2935 };
2936
8407d5d7
KW
2937 if (bytes < 0) {
2938 return -EINVAL;
2939 }
2940
8d3b1a2d
KW
2941 qemu_iovec_init_external(&qiov, &iov, 1);
2942 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2943}
83f64091 2944
f08145fe
KW
2945/*
2946 * Writes to the file and ensures that no writes are reordered across this
2947 * request (acts as a barrier)
2948 *
2949 * Returns 0 on success, -errno in error cases.
2950 */
2951int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2952 const void *buf, int count)
2953{
2954 int ret;
2955
2956 ret = bdrv_pwrite(bs, offset, buf, count);
2957 if (ret < 0) {
2958 return ret;
2959 }
2960
f05fa4ad
PB
2961 /* No flush needed for cache modes that already do it */
2962 if (bs->enable_write_cache) {
f08145fe
KW
2963 bdrv_flush(bs);
2964 }
2965
2966 return 0;
2967}
2968
/*
 * Serve a read with copy-on-read semantics.
 *
 * The sector range is widened to whole clusters, read from the driver into a
 * bounce buffer, written back through the driver (as a zero write when the
 * data is all zeroes and the driver provides bdrv_co_write_zeroes), and
 * finally the originally requested sectors are copied into 'qiov'.
 *
 * Returns the result of the write-back on success, -ENOMEM if the bounce
 * buffer cannot be allocated, or the negative value returned by the failing
 * driver read/write.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    /* Read the whole cluster range into the bounce buffer. */
    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Hand back only the part of the cluster range the caller asked for. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

    /* The label is also reached on success; it only releases the buffer. */
err:
    qemu_vfree(bounce_buffer);
    return ret;
}
3039
c5fbe571 3040/*
d0c7f642
KW
3041 * Forwards an already correctly aligned request to the BlockDriver. This
3042 * handles copy on read and zeroing after EOF; any other features must be
3043 * implemented by the caller.
c5fbe571 3044 */
d0c7f642 3045static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 3046 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 3047 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
3048{
3049 BlockDriver *drv = bs->drv;
dbffbdcf 3050 int ret;
da1fa91d 3051
d0c7f642
KW
3052 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3053 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 3054
d0c7f642
KW
3055 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3056 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3057 assert(!qiov || bytes == qiov->size);
d0c7f642
KW
3058
3059 /* Handle Copy on Read and associated serialisation */
470c0504 3060 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
3061 /* If we touch the same cluster it counts as an overlap. This
3062 * guarantees that allocating writes will be serialized and not race
3063 * with each other for the same cluster. For example, in copy-on-read
3064 * it ensures that the CoR read and write operations are atomic and
3065 * guest writes cannot interleave between them. */
3066 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
3067 }
3068
2dbafdc0 3069 wait_serialising_requests(req);
f4658285 3070
470c0504 3071 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
3072 int pnum;
3073
bdad13b9 3074 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
3075 if (ret < 0) {
3076 goto out;
3077 }
3078
3079 if (!ret || pnum != nb_sectors) {
470c0504 3080 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
3081 goto out;
3082 }
3083 }
3084
d0c7f642 3085 /* Forward the request to the BlockDriver */
893a8f62
MK
3086 if (!(bs->zero_beyond_eof && bs->growable)) {
3087 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3088 } else {
3089 /* Read zeros after EOF of growable BDSes */
4049082c 3090 int64_t total_sectors, max_nb_sectors;
893a8f62 3091
4049082c
MA
3092 total_sectors = bdrv_nb_sectors(bs);
3093 if (total_sectors < 0) {
3094 ret = total_sectors;
893a8f62
MK
3095 goto out;
3096 }
3097
5f5bcd80
KW
3098 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3099 align >> BDRV_SECTOR_BITS);
893a8f62 3100 if (max_nb_sectors > 0) {
33f461e0
KW
3101 QEMUIOVector local_qiov;
3102 size_t local_sectors;
3103
3104 max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
3105 local_sectors = MIN(max_nb_sectors, nb_sectors);
3106
3107 qemu_iovec_init(&local_qiov, qiov->niov);
3108 qemu_iovec_concat(&local_qiov, qiov, 0,
3109 local_sectors * BDRV_SECTOR_SIZE);
3110
3111 ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3112 &local_qiov);
3113
3114 qemu_iovec_destroy(&local_qiov);
893a8f62
MK
3115 } else {
3116 ret = 0;
3117 }
3118
3119 /* Reading beyond end of file is supposed to produce zeroes */
3120 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3121 uint64_t offset = MAX(0, total_sectors - sector_num);
3122 uint64_t bytes = (sector_num + nb_sectors - offset) *
3123 BDRV_SECTOR_SIZE;
3124 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3125 }
3126 }
ab185921
SH
3127
3128out:
dbffbdcf 3129 return ret;
da1fa91d
KW
3130}
3131
d0c7f642
KW
/*
 * Handle a read request in coroutine context
 *
 * Byte-granularity entry point: validates the request, applies I/O
 * throttling, pads an unaligned request with scratch head/tail buffers so
 * that it meets the alignment computed below, and forwards it to
 * bdrv_aligned_preadv() under a tracked request.
 *
 * Returns -ENOMEDIUM if no driver is attached, -EIO if
 * bdrv_check_byte_request() rejects the range, otherwise the result of
 * bdrv_aligned_preadv().
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        /* Unaligned start: prepend a scratch buffer for the leading bytes. */
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        /* Unaligned end: append a scratch buffer for the trailing bytes. */
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    /* The scratch buffers are only ever allocated together with local_qiov. */
    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
3205
1b0288ae
KW
3206static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3207 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3208 BdrvRequestFlags flags)
3209{
3210 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3211 return -EINVAL;
3212 }
3213
3214 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3215 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3216}
3217
c5fbe571 3218int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3219 int nb_sectors, QEMUIOVector *qiov)
3220{
c5fbe571 3221 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3222
470c0504
SH
3223 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3224}
3225
3226int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3227 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3228{
3229 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3230
3231 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3232 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3233}
3234
c31cb707
PL
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

/*
 * Write zeroes to 'nb_sectors' sectors starting at 'sector_num'.
 *
 * The range is processed in chunks that respect the driver's
 * write_zeroes_alignment and max_write_zeroes limits.  Each chunk first
 * tries the driver's bdrv_co_write_zeroes callback; if that is missing or
 * returns -ENOTSUP, a zero-filled bounce buffer is written with
 * bdrv_co_writev instead.
 *
 * Returns 0 on success, -ENOMEM if the bounce buffer cannot be allocated,
 * or the first non-zero value returned by the driver.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    /* Stops at the first chunk that leaves ret non-zero. */
    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector.  */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for
             * all future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    /* Reached on success as well; releases any bounce buffer still held. */
fail:
    qemu_vfree(iov.iov_base);
    return ret;
}
3314
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 *
 * 'offset' and 'bytes' must be multiples of BDRV_SECTOR_SIZE, and when
 * 'qiov' is non-NULL 'bytes' must equal qiov->size (all asserted).  The
 * request must already lie inside the overlap range recorded in 'req'.
 *
 * Before-write notifiers run first and may fail the request.  Depending on
 * bs->detect_zeroes, an all-zero payload is turned into a zero write.  After
 * the driver call the function flushes when the write cache is disabled,
 * marks the range dirty, updates the accounting, and grows
 * bs->total_sectors for growable devices.
 *
 * Returns the notifier/driver/flush result (negative errno on failure).
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    waited = wait_serialising_requests(req);
    /* A request already marked serialising is expected not to wait here. */
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    /* Zero detection: only if enabled, not already a zero write, and the
     * driver has an efficient zero-write callback.
     */
    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    /* With the write cache disabled every successful write is flushed. */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    /* Dirty tracking and accounting are updated regardless of 'ret'. */
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);

    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
3374
b404f720
KW
/*
 * Handle a write request in coroutine context
 *
 * Byte-granularity entry point: validates the request, applies I/O
 * throttling, turns an unaligned request into an aligned one with a
 * read-modify-write cycle, and forwards it to bdrv_aligned_pwritev().
 *
 * Returns -ENOMEDIUM if no driver is attached, -EACCES if the device is
 * read-only, -EIO if bdrv_check_byte_request() rejects the range, otherwise
 * the result of the RMW reads or of bdrv_aligned_pwritev().
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        /* Read the aligned block containing the start of the request. */
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        /* Prepend the bytes that precede the caller's data in that block. */
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        /* If the head path already ran, no further waiting is expected. */
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        /* Read the aligned block containing the end of the request. */
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append the bytes that follow the caller's data in that block. */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

    /* Reached on success as well; common cleanup for all paths. */
fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    /* Freed unconditionally: a buffer may exist even if local_qiov was
     * never set up because the RMW read failed.
     */
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3496
6601553e
KW
3497static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3498 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3499 BdrvRequestFlags flags)
3500{
3501 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3502 return -EINVAL;
3503 }
3504
3505 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3506 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3507}
3508
c5fbe571
SH
3509int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3510 int nb_sectors, QEMUIOVector *qiov)
3511{
3512 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3513
f08f2dda
SH
3514 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3515}
3516
3517int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3518 int64_t sector_num, int nb_sectors,
3519 BdrvRequestFlags flags)
f08f2dda 3520{
94d6ff21 3521 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3522
d32f35cb
PL
3523 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3524 flags &= ~BDRV_REQ_MAY_UNMAP;
3525 }
3526
f08f2dda 3527 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3528 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3529}
3530
83f64091
FB
3531/**
3532 * Truncate file to 'offset' bytes (needed only for file protocols)
3533 */
3534int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3535{
3536 BlockDriver *drv = bs->drv;
51762288 3537 int ret;
83f64091 3538 if (!drv)
19cb3738 3539 return -ENOMEDIUM;
83f64091
FB
3540 if (!drv->bdrv_truncate)
3541 return -ENOTSUP;
59f2689d
NS
3542 if (bs->read_only)
3543 return -EACCES;
9c75e168 3544
51762288
SH
3545 ret = drv->bdrv_truncate(bs, offset);
3546 if (ret == 0) {
3547 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 3548 bdrv_dev_resize_cb(bs);
51762288
SH
3549 }
3550 return ret;
83f64091
FB
3551}
3552
4a1d5e1f
FZ
3553/**
3554 * Length of a allocated file in bytes. Sparse files are counted by actual
3555 * allocated space. Return < 0 if error or unknown.
3556 */
3557int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3558{
3559 BlockDriver *drv = bs->drv;
3560 if (!drv) {
3561 return -ENOMEDIUM;
3562 }
3563 if (drv->bdrv_get_allocated_file_size) {
3564 return drv->bdrv_get_allocated_file_size(bs);
3565 }
3566 if (bs->file) {
3567 return bdrv_get_allocated_file_size(bs->file);
3568 }
3569 return -ENOTSUP;
3570}
3571
83f64091 3572/**
65a9bb25 3573 * Return number of sectors on success, -errno on error.
83f64091 3574 */
65a9bb25 3575int64_t bdrv_nb_sectors(BlockDriverState *bs)
83f64091
FB
3576{
3577 BlockDriver *drv = bs->drv;
65a9bb25 3578
83f64091 3579 if (!drv)
19cb3738 3580 return -ENOMEDIUM;
51762288 3581
b94a2610
KW
3582 if (drv->has_variable_length) {
3583 int ret = refresh_total_sectors(bs, bs->total_sectors);
3584 if (ret < 0) {
3585 return ret;
46a4e4e6 3586 }
83f64091 3587 }
65a9bb25
MA
3588 return bs->total_sectors;
3589}
3590
3591/**
3592 * Return length in bytes on success, -errno on error.
3593 * The length is always a multiple of BDRV_SECTOR_SIZE.
3594 */
3595int64_t bdrv_getlength(BlockDriverState *bs)
3596{
3597 int64_t ret = bdrv_nb_sectors(bs);
3598
3599 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3600}
3601
19cb3738 3602/* return 0 as number of sectors if no device present or error */
96b8f136 3603void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3604{
65a9bb25
MA
3605 int64_t nb_sectors = bdrv_nb_sectors(bs);
3606
3607 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
fc01f7e7 3608}
cf98951b 3609
ff06f5f3
PB
3610void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3611 BlockdevOnError on_write_error)
abd7f68d
MA
3612{
3613 bs->on_read_error = on_read_error;
3614 bs->on_write_error = on_write_error;
3615}
3616
1ceee0d5 3617BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3618{
3619 return is_read ? bs->on_read_error : bs->on_write_error;
3620}
3621
3e1caa5f
PB
3622BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3623{
3624 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3625
3626 switch (on_err) {
3627 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3628 return (error == ENOSPC) ?
3629 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3630 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3631 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3632 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3633 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3634 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3635 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3636 default:
3637 abort();
3638 }
3639}
3640
c7c2ff0c
LC
3641static void send_qmp_error_event(BlockDriverState *bs,
3642 BlockErrorAction action,
3643 bool is_read, int error)
3644{
3645 BlockErrorAction ac;
3646
3647 ac = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3648 qapi_event_send_block_io_error(bdrv_get_device_name(bs), ac, action,
3649 bdrv_iostatus_is_enabled(bs),
624ff573
LC
3650 error == ENOSPC, strerror(error),
3651 &error_abort);
c7c2ff0c
LC
3652}
3653
3e1caa5f
PB
3654/* This is done by device models because, while the block layer knows
3655 * about the error, it does not know whether an operation comes from
3656 * the device or the block layer (from a job, for example).
3657 */
3658void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3659 bool is_read, int error)
3660{
3661 assert(error >= 0);
2bd3bce8 3662
a589569f 3663 if (action == BLOCK_ERROR_ACTION_STOP) {
2bd3bce8
PB
3664 /* First set the iostatus, so that "info block" returns an iostatus
3665 * that matches the events raised so far (an additional error iostatus
3666 * is fine, but not a lost one).
3667 */
3e1caa5f 3668 bdrv_iostatus_set_err(bs, error);
2bd3bce8
PB
3669
3670 /* Then raise the request to stop the VM and the event.
3671 * qemu_system_vmstop_request_prepare has two effects. First,
3672 * it ensures that the STOP event always comes after the
3673 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3674 * can observe the STOP event and do a "cont" before the STOP
3675 * event is issued, the VM will not stop. In this case, vm_start()
3676 * also ensures that the STOP/RESUME pair of events is emitted.
3677 */
3678 qemu_system_vmstop_request_prepare();
c7c2ff0c 3679 send_qmp_error_event(bs, action, is_read, error);
2bd3bce8
PB
3680 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3681 } else {
c7c2ff0c 3682 send_qmp_error_event(bs, action, is_read, error);
3e1caa5f
PB
3683 }
3684}
3685
b338082b
FB
3686int bdrv_is_read_only(BlockDriverState *bs)
3687{
3688 return bs->read_only;
3689}
3690
985a03b0
TS
3691int bdrv_is_sg(BlockDriverState *bs)
3692{
3693 return bs->sg;
3694}
3695
e900a7b7
CH
3696int bdrv_enable_write_cache(BlockDriverState *bs)
3697{
3698 return bs->enable_write_cache;
3699}
3700
425b0148
PB
3701void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3702{
3703 bs->enable_write_cache = wce;
55b110f2
JC
3704
3705 /* so a reopen() will preserve wce */
3706 if (wce) {
3707 bs->open_flags |= BDRV_O_CACHE_WB;
3708 } else {
3709 bs->open_flags &= ~BDRV_O_CACHE_WB;
3710 }
425b0148
PB
3711}
3712
ea2384d3
FB
3713int bdrv_is_encrypted(BlockDriverState *bs)
3714{
3715 if (bs->backing_hd && bs->backing_hd->encrypted)
3716 return 1;
3717 return bs->encrypted;
3718}
3719
c0f4ce77
AL
3720int bdrv_key_required(BlockDriverState *bs)
3721{
3722 BlockDriverState *backing_hd = bs->backing_hd;
3723
3724 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3725 return 1;
3726 return (bs->encrypted && !bs->valid_key);
3727}
3728
ea2384d3
FB
3729int bdrv_set_key(BlockDriverState *bs, const char *key)
3730{
3731 int ret;
3732 if (bs->backing_hd && bs->backing_hd->encrypted) {
3733 ret = bdrv_set_key(bs->backing_hd, key);
3734 if (ret < 0)
3735 return ret;
3736 if (!bs->encrypted)
3737 return 0;
3738 }
fd04a2ae
SH
3739 if (!bs->encrypted) {
3740 return -EINVAL;
3741 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3742 return -ENOMEDIUM;
3743 }
c0f4ce77 3744 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3745 if (ret < 0) {
3746 bs->valid_key = 0;
3747 } else if (!bs->valid_key) {
3748 bs->valid_key = 1;
3749 /* call the change callback now, we skipped it on open */
7d4b4ba5 3750 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 3751 }
c0f4ce77 3752 return ret;
ea2384d3
FB
3753}
3754
f8d6bba1 3755const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3756{
f8d6bba1 3757 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3758}
3759
ada42401
SH
/*
 * qsort() comparator for an array of C strings.
 *
 * qsort() passes pointers to the array elements, so 'a' and 'b' point at
 * 'const char *' slots and must be dereferenced once before comparing.
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *const *lhs = a;
    const char *const *rhs = b;

    return strcmp(*lhs, *rhs);
}
3764
5fafdf24 3765void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3766 void *opaque)
3767{
3768 BlockDriver *drv;
e855e4fb 3769 int count = 0;
ada42401 3770 int i;
e855e4fb 3771 const char **formats = NULL;
ea2384d3 3772
8a22f02a 3773 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3774 if (drv->format_name) {
3775 bool found = false;
3776 int i = count;
3777 while (formats && i && !found) {
3778 found = !strcmp(formats[--i], drv->format_name);
3779 }
3780
3781 if (!found) {
5839e53b 3782 formats = g_renew(const char *, formats, count + 1);
e855e4fb 3783 formats[count++] = drv->format_name;
e855e4fb
JC
3784 }
3785 }
ea2384d3 3786 }
ada42401
SH
3787
3788 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3789
3790 for (i = 0; i < count; i++) {
3791 it(opaque, formats[i]);
3792 }
3793
e855e4fb 3794 g_free(formats);
ea2384d3
FB
3795}
3796
dc364f4c 3797/* This function is to find block backend bs */
7f06d47e 3798/* TODO convert callers to blk_by_name(), then remove */
b338082b
FB
3799BlockDriverState *bdrv_find(const char *name)
3800{
7f06d47e 3801 BlockBackend *blk = blk_by_name(name);
b338082b 3802
7f06d47e 3803 return blk ? blk_bs(blk) : NULL;
b338082b
FB
3804}
3805
dc364f4c
BC
3806/* This function is to find a node in the bs graph */
3807BlockDriverState *bdrv_find_node(const char *node_name)
3808{
3809 BlockDriverState *bs;
3810
3811 assert(node_name);
3812
3813 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3814 if (!strcmp(node_name, bs->node_name)) {
3815 return bs;
3816 }
3817 }
3818 return NULL;
3819}
3820
c13163fb
BC
3821/* Put this QMP function here so it can access the static graph_bdrv_states. */
3822BlockDeviceInfoList *bdrv_named_nodes_list(void)
3823{
3824 BlockDeviceInfoList *list, *entry;
3825 BlockDriverState *bs;
3826
3827 list = NULL;
3828 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3829 entry = g_malloc0(sizeof(*entry));
3830 entry->value = bdrv_block_device_info(bs);
3831 entry->next = list;
3832 list = entry;
3833 }
3834
3835 return list;
3836}
3837
12d3ba82
BC
3838BlockDriverState *bdrv_lookup_bs(const char *device,
3839 const char *node_name,
3840 Error **errp)
3841{
7f06d47e
MA
3842 BlockBackend *blk;
3843 BlockDriverState *bs;
12d3ba82 3844
12d3ba82 3845 if (device) {
7f06d47e 3846 blk = blk_by_name(device);
12d3ba82 3847
7f06d47e
MA
3848 if (blk) {
3849 return blk_bs(blk);
12d3ba82 3850 }
12d3ba82
BC
3851 }
3852
dd67fa50
BC
3853 if (node_name) {
3854 bs = bdrv_find_node(node_name);
12d3ba82 3855
dd67fa50
BC
3856 if (bs) {
3857 return bs;
3858 }
12d3ba82
BC
3859 }
3860
dd67fa50
BC
3861 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3862 device ? device : "",
3863 node_name ? node_name : "");
3864 return NULL;
12d3ba82
BC
3865}
3866
5a6684d2
JC
3867/* If 'base' is in the same chain as 'top', return true. Otherwise,
3868 * return false. If either argument is NULL, return false. */
3869bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3870{
3871 while (top && top != base) {
3872 top = top->backing_hd;
3873 }
3874
3875 return top != NULL;
3876}
3877
2f399b0a
MA
3878BlockDriverState *bdrv_next(BlockDriverState *bs)
3879{
3880 if (!bs) {
3881 return QTAILQ_FIRST(&bdrv_states);
3882 }
dc364f4c 3883 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3884}
3885
7f06d47e 3886/* TODO check what callers really want: bs->node_name or blk_name() */
bfb197e0 3887const char *bdrv_get_device_name(const BlockDriverState *bs)
ea2384d3 3888{
bfb197e0 3889 return bs->blk ? blk_name(bs->blk) : "";
ea2384d3
FB
3890}
3891
c8433287
MA
3892int bdrv_get_flags(BlockDriverState *bs)
3893{
3894 return bs->open_flags;
3895}
3896
f0f0fdfe 3897int bdrv_flush_all(void)
c6ca28d6
AL
3898{
3899 BlockDriverState *bs;
f0f0fdfe 3900 int result = 0;
c6ca28d6 3901
dc364f4c 3902 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
3903 AioContext *aio_context = bdrv_get_aio_context(bs);
3904 int ret;
3905
3906 aio_context_acquire(aio_context);
3907 ret = bdrv_flush(bs);
f0f0fdfe
KW
3908 if (ret < 0 && !result) {
3909 result = ret;
3910 }
ed78cda3 3911 aio_context_release(aio_context);
1b7bdbc1 3912 }
f0f0fdfe
KW
3913
3914 return result;
c6ca28d6
AL
3915}
3916
3ac21627
PL
3917int bdrv_has_zero_init_1(BlockDriverState *bs)
3918{
3919 return 1;
3920}
3921
f2feebbd
KW
3922int bdrv_has_zero_init(BlockDriverState *bs)
3923{
3924 assert(bs->drv);
3925
11212d8f
PB
3926 /* If BS is a copy on write image, it is initialized to
3927 the contents of the base image, which may not be zeroes. */
3928 if (bs->backing_hd) {
3929 return 0;
3930 }
336c1c12
KW
3931 if (bs->drv->bdrv_has_zero_init) {
3932 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3933 }
3934
3ac21627
PL
3935 /* safe default */
3936 return 0;
f2feebbd
KW
3937}
3938
4ce78691
PL
3939bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3940{
3941 BlockDriverInfo bdi;
3942
3943 if (bs->backing_hd) {
3944 return false;
3945 }
3946
3947 if (bdrv_get_info(bs, &bdi) == 0) {
3948 return bdi.unallocated_blocks_are_zero;
3949 }
3950
3951 return false;
3952}
3953
3954bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3955{
3956 BlockDriverInfo bdi;
3957
3958 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3959 return false;
3960 }
3961
3962 if (bdrv_get_info(bs, &bdi) == 0) {
3963 return bdi.can_write_zeroes_with_unmap;
3964 }
3965
3966 return false;
3967}
3968
/*
 * Argument/result bundle for a block-status query.
 * NOTE(review): presumably handed to a coroutine entry point that fills in
 * 'ret' and 'done'; the users of this struct are outside this excerpt, so
 * confirm against them.
 */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;       /* device being queried */
    BlockDriverState *base;     /* presumably where a chain walk stops - TODO confirm */
    int64_t sector_num;         /* first sector of the query */
    int nb_sectors;             /* number of sectors to examine */
    int *pnum;                  /* out: sector count reported back */
    int64_t ret;                /* result of the query */
    bool done;                  /* presumably set on completion - TODO confirm */
} BdrvCoGetBlockStatusData;
376ae3f1 3978
f58c7b35
TS
3979/*
3980 * Returns true iff the specified sector is present in the disk image. Drivers
3981 * not implementing the functionality are assumed to not support backing files,
3982 * hence all their sectors are reported as allocated.
3983 *
bd9533e3
SH
3984 * If 'sector_num' is beyond the end of the disk image the return value is 0
3985 * and 'pnum' is set to 0.
3986 *
f58c7b35
TS
3987 * 'pnum' is set to the number of sectors (including and immediately following
3988 * the specified sector) that are known to be in the same
3989 * allocated/unallocated state.
3990 *
bd9533e3
SH
3991 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3992 * beyond the end of the disk image it will be clamped.
f58c7b35 3993 */
b6b8a333
PB
3994static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3995 int64_t sector_num,
3996 int nb_sectors, int *pnum)
f58c7b35 3997{
30a7f2fc 3998 int64_t total_sectors;
bd9533e3 3999 int64_t n;
5daa74a6 4000 int64_t ret, ret2;
bd9533e3 4001
30a7f2fc
MA
4002 total_sectors = bdrv_nb_sectors(bs);
4003 if (total_sectors < 0) {
4004 return total_sectors;
617ccb46
PB
4005 }
4006
30a7f2fc 4007 if (sector_num >= total_sectors) {
bd9533e3
SH
4008 *pnum = 0;
4009 return 0;
4010 }
4011
30a7f2fc 4012 n = total_sectors - sector_num;
bd9533e3
SH
4013 if (n < nb_sectors) {
4014 nb_sectors = n;
4015 }
4016
b6b8a333 4017 if (!bs->drv->bdrv_co_get_block_status) {
bd9533e3 4018 *pnum = nb_sectors;
e88ae226 4019 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
918e92d7
PB
4020 if (bs->drv->protocol_name) {
4021 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4022 }
4023 return ret;
f58c7b35 4024 }
6aebab14 4025
415b5b01
PB
4026 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4027 if (ret < 0) {
3e0a233d 4028 *pnum = 0;
415b5b01
PB
4029 return ret;
4030 }
4031
92bc50a5
PL
4032 if (ret & BDRV_BLOCK_RAW) {
4033 assert(ret & BDRV_BLOCK_OFFSET_VALID);
4034 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4035 *pnum, pnum);
4036 }
4037
e88ae226
KW
4038 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4039 ret |= BDRV_BLOCK_ALLOCATED;
4040 }
4041
c3d86884
PL
4042 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4043 if (bdrv_unallocated_blocks_are_zero(bs)) {
f0ad5712 4044 ret |= BDRV_BLOCK_ZERO;
1f9db224 4045 } else if (bs->backing_hd) {
f0ad5712 4046 BlockDriverState *bs2 = bs->backing_hd;
30a7f2fc
MA
4047 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4048 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
f0ad5712
PB
4049 ret |= BDRV_BLOCK_ZERO;
4050 }
4051 }
415b5b01 4052 }
5daa74a6
PB
4053
4054 if (bs->file &&
4055 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4056 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4057 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4058 *pnum, pnum);
4059 if (ret2 >= 0) {
4060 /* Ignore errors. This is just providing extra information, it
4061 * is useful but not necessary.
4062 */
4063 ret |= (ret2 & BDRV_BLOCK_ZERO);
4064 }
4065 }
4066
415b5b01 4067 return ret;
060f51c9
SH
4068}
4069
b6b8a333
PB
4070/* Coroutine wrapper for bdrv_get_block_status() */
4071static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4072{
b6b8a333 4073 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4074 BlockDriverState *bs = data->bs;
4075
b6b8a333
PB
4076 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4077 data->pnum);
060f51c9
SH
4078 data->done = true;
4079}
4080
4081/*
b6b8a333 4082 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 4083 *
b6b8a333 4084 * See bdrv_co_get_block_status() for details.
060f51c9 4085 */
b6b8a333
PB
4086int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4087 int nb_sectors, int *pnum)
060f51c9 4088{
6aebab14 4089 Coroutine *co;
b6b8a333 4090 BdrvCoGetBlockStatusData data = {
6aebab14
SH
4091 .bs = bs,
4092 .sector_num = sector_num,
4093 .nb_sectors = nb_sectors,
4094 .pnum = pnum,
4095 .done = false,
4096 };
4097
bdad13b9
PB
4098 if (qemu_in_coroutine()) {
4099 /* Fast-path if already in coroutine context */
b6b8a333 4100 bdrv_get_block_status_co_entry(&data);
bdad13b9 4101 } else {
2572b37a
SH
4102 AioContext *aio_context = bdrv_get_aio_context(bs);
4103
b6b8a333 4104 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
4105 qemu_coroutine_enter(co, &data);
4106 while (!data.done) {
2572b37a 4107 aio_poll(aio_context, true);
bdad13b9 4108 }
6aebab14
SH
4109 }
4110 return data.ret;
f58c7b35
TS
4111}
4112
b6b8a333
PB
4113int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4114 int nb_sectors, int *pnum)
4115{
4333bb71
PB
4116 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4117 if (ret < 0) {
4118 return ret;
4119 }
01fb2705 4120 return !!(ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4121}
4122
188a7bbf
PB
4123/*
4124 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4125 *
4126 * Return true if the given sector is allocated in any image between
4127 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4128 * sector is allocated in any image of the chain. Return false otherwise.
4129 *
4130 * 'pnum' is set to the number of sectors (including and immediately following
4131 * the specified sector) that are known to be in the same
4132 * allocated/unallocated state.
4133 *
4134 */
4f578637
PB
4135int bdrv_is_allocated_above(BlockDriverState *top,
4136 BlockDriverState *base,
4137 int64_t sector_num,
4138 int nb_sectors, int *pnum)
188a7bbf
PB
4139{
4140 BlockDriverState *intermediate;
4141 int ret, n = nb_sectors;
4142
4143 intermediate = top;
4144 while (intermediate && intermediate != base) {
4145 int pnum_inter;
bdad13b9
PB
4146 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4147 &pnum_inter);
188a7bbf
PB
4148 if (ret < 0) {
4149 return ret;
4150 } else if (ret) {
4151 *pnum = pnum_inter;
4152 return 1;
4153 }
4154
4155 /*
4156 * [sector_num, nb_sectors] is unallocated on top but intermediate
4157 * might have
4158 *
4159 * [sector_num+x, nr_sectors] allocated.
4160 */
63ba17d3
VI
4161 if (n > pnum_inter &&
4162 (intermediate == top ||
4163 sector_num + pnum_inter < intermediate->total_sectors)) {
188a7bbf
PB
4164 n = pnum_inter;
4165 }
4166
4167 intermediate = intermediate->backing_hd;
4168 }
4169
4170 *pnum = n;
4171 return 0;
4172}
4173
045df330
AL
4174const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4175{
4176 if (bs->backing_hd && bs->backing_hd->encrypted)
4177 return bs->backing_file;
4178 else if (bs->encrypted)
4179 return bs->filename;
4180 else
4181 return NULL;
4182}
4183
5fafdf24 4184void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
4185 char *filename, int filename_size)
4186{
3574c608 4187 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
4188}
4189
5fafdf24 4190int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4191 const uint8_t *buf, int nb_sectors)
4192{
4193 BlockDriver *drv = bs->drv;
4194 if (!drv)
19cb3738 4195 return -ENOMEDIUM;
faea38e7
FB
4196 if (!drv->bdrv_write_compressed)
4197 return -ENOTSUP;
fbb7b4e0
KW
4198 if (bdrv_check_request(bs, sector_num, nb_sectors))
4199 return -EIO;
a55eb92c 4200
e4654d2d 4201 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4202
faea38e7
FB
4203 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4204}
3b46e624 4205
faea38e7
FB
4206int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4207{
4208 BlockDriver *drv = bs->drv;
4209 if (!drv)
19cb3738 4210 return -ENOMEDIUM;
faea38e7
FB
4211 if (!drv->bdrv_get_info)
4212 return -ENOTSUP;
4213 memset(bdi, 0, sizeof(*bdi));
4214 return drv->bdrv_get_info(bs, bdi);
4215}
4216
eae041fe
HR
4217ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4218{
4219 BlockDriver *drv = bs->drv;
4220 if (drv && drv->bdrv_get_specific_info) {
4221 return drv->bdrv_get_specific_info(bs);
4222 }
4223 return NULL;
4224}
4225
45566e9c
CH
4226int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4227 int64_t pos, int size)
cf8074b3
KW
4228{
4229 QEMUIOVector qiov;
4230 struct iovec iov = {
4231 .iov_base = (void *) buf,
4232 .iov_len = size,
4233 };
4234
4235 qemu_iovec_init_external(&qiov, &iov, 1);
4236 return bdrv_writev_vmstate(bs, &qiov, pos);
4237}
4238
4239int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4240{
4241 BlockDriver *drv = bs->drv;
cf8074b3
KW
4242
4243 if (!drv) {
178e08a5 4244 return -ENOMEDIUM;
cf8074b3
KW
4245 } else if (drv->bdrv_save_vmstate) {
4246 return drv->bdrv_save_vmstate(bs, qiov, pos);
4247 } else if (bs->file) {
4248 return bdrv_writev_vmstate(bs->file, qiov, pos);
4249 }
4250
7cdb1f6d 4251 return -ENOTSUP;
178e08a5
AL
4252}
4253
45566e9c
CH
4254int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4255 int64_t pos, int size)
178e08a5
AL
4256{
4257 BlockDriver *drv = bs->drv;
4258 if (!drv)
4259 return -ENOMEDIUM;
7cdb1f6d
MK
4260 if (drv->bdrv_load_vmstate)
4261 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4262 if (bs->file)
4263 return bdrv_load_vmstate(bs->file, buf, pos, size);
4264 return -ENOTSUP;
178e08a5
AL
4265}
4266
8b9b0cc2
KW
4267void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4268{
bf736fe3 4269 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4270 return;
4271 }
4272
bf736fe3 4273 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4274}
4275
4276int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4277 const char *tag)
4278{
4279 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4280 bs = bs->file;
4281 }
4282
4283 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4284 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4285 }
4286
4287 return -ENOTSUP;
4288}
4289
4cc70e93
FZ
4290int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4291{
4292 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4293 bs = bs->file;
4294 }
4295
4296 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4297 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4298 }
4299
4300 return -ENOTSUP;
4301}
4302
41c695c7
KW
4303int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4304{
938789ea 4305 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4306 bs = bs->file;
4307 }
8b9b0cc2 4308
41c695c7
KW
4309 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4310 return bs->drv->bdrv_debug_resume(bs, tag);
4311 }
4312
4313 return -ENOTSUP;
4314}
4315
4316bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4317{
4318 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4319 bs = bs->file;
4320 }
4321
4322 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4323 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4324 }
4325
4326 return false;
8b9b0cc2
KW
4327}
4328
199630b6
BS
4329int bdrv_is_snapshot(BlockDriverState *bs)
4330{
4331 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4332}
4333
b1b1d783
JC
4334/* backing_file can either be relative, or absolute, or a protocol. If it is
4335 * relative, it must be relative to the chain. So, passing in bs->filename
4336 * from a BDS as backing_file should not be done, as that may be relative to
4337 * the CWD rather than the chain. */
e8a6bb9c
MT
4338BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4339 const char *backing_file)
4340{
b1b1d783
JC
4341 char *filename_full = NULL;
4342 char *backing_file_full = NULL;
4343 char *filename_tmp = NULL;
4344 int is_protocol = 0;
4345 BlockDriverState *curr_bs = NULL;
4346 BlockDriverState *retval = NULL;
4347
4348 if (!bs || !bs->drv || !backing_file) {
e8a6bb9c
MT
4349 return NULL;
4350 }
4351
b1b1d783
JC
4352 filename_full = g_malloc(PATH_MAX);
4353 backing_file_full = g_malloc(PATH_MAX);
4354 filename_tmp = g_malloc(PATH_MAX);
4355
4356 is_protocol = path_has_protocol(backing_file);
4357
4358 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4359
4360 /* If either of the filename paths is actually a protocol, then
4361 * compare unmodified paths; otherwise make paths relative */
4362 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4363 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4364 retval = curr_bs->backing_hd;
4365 break;
4366 }
e8a6bb9c 4367 } else {
b1b1d783
JC
4368 /* If not an absolute filename path, make it relative to the current
4369 * image's filename path */
4370 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4371 backing_file);
4372
4373 /* We are going to compare absolute pathnames */
4374 if (!realpath(filename_tmp, filename_full)) {
4375 continue;
4376 }
4377
4378 /* We need to make sure the backing filename we are comparing against
4379 * is relative to the current image filename (or absolute) */
4380 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4381 curr_bs->backing_file);
4382
4383 if (!realpath(filename_tmp, backing_file_full)) {
4384 continue;
4385 }
4386
4387 if (strcmp(backing_file_full, filename_full) == 0) {
4388 retval = curr_bs->backing_hd;
4389 break;
4390 }
e8a6bb9c
MT
4391 }
4392 }
4393
b1b1d783
JC
4394 g_free(filename_full);
4395 g_free(backing_file_full);
4396 g_free(filename_tmp);
4397 return retval;
e8a6bb9c
MT
4398}
4399
f198fd1c
BC
4400int bdrv_get_backing_file_depth(BlockDriverState *bs)
4401{
4402 if (!bs->drv) {
4403 return 0;
4404 }
4405
4406 if (!bs->backing_hd) {
4407 return 0;
4408 }
4409
4410 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4411}
4412
ea2384d3 4413/**************************************************************/
83f64091 4414/* async I/Os */
ea2384d3 4415
3b69e4b9 4416BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 4417 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 4418 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 4419{
bbf0a440
SH
4420 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4421
d20d9b7c 4422 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4423 cb, opaque, false);
ea2384d3
FB
4424}
4425
f141eafe
AL
4426BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4427 QEMUIOVector *qiov, int nb_sectors,
4428 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 4429{
bbf0a440
SH
4430 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4431
d20d9b7c 4432 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4433 cb, opaque, true);
83f64091
FB
4434}
4435
d5ef94d4
PB
4436BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4437 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4438 BlockDriverCompletionFunc *cb, void *opaque)
4439{
4440 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4441
4442 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4443 BDRV_REQ_ZERO_WRITE | flags,
4444 cb, opaque, true);
4445}
4446
40b4f539
KW
4447
4448typedef struct MultiwriteCB {
4449 int error;
4450 int num_requests;
4451 int num_callbacks;
4452 struct {
4453 BlockDriverCompletionFunc *cb;
4454 void *opaque;
4455 QEMUIOVector *free_qiov;
40b4f539
KW
4456 } callbacks[];
4457} MultiwriteCB;
4458
4459static void multiwrite_user_cb(MultiwriteCB *mcb)
4460{
4461 int i;
4462
4463 for (i = 0; i < mcb->num_callbacks; i++) {
4464 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4465 if (mcb->callbacks[i].free_qiov) {
4466 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4467 }
7267c094 4468 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4469 }
4470}
4471
4472static void multiwrite_cb(void *opaque, int ret)
4473{
4474 MultiwriteCB *mcb = opaque;
4475
6d519a5f
SH
4476 trace_multiwrite_cb(mcb, ret);
4477
cb6d3ca0 4478 if (ret < 0 && !mcb->error) {
40b4f539 4479 mcb->error = ret;
40b4f539
KW
4480 }
4481
4482 mcb->num_requests--;
4483 if (mcb->num_requests == 0) {
de189a1b 4484 multiwrite_user_cb(mcb);
7267c094 4485 g_free(mcb);
40b4f539
KW
4486 }
4487}
4488
4489static int multiwrite_req_compare(const void *a, const void *b)
4490{
77be4366
CH
4491 const BlockRequest *req1 = a, *req2 = b;
4492
4493 /*
4494 * Note that we can't simply subtract req2->sector from req1->sector
4495 * here as that could overflow the return value.
4496 */
4497 if (req1->sector > req2->sector) {
4498 return 1;
4499 } else if (req1->sector < req2->sector) {
4500 return -1;
4501 } else {
4502 return 0;
4503 }
40b4f539
KW
4504}
4505
4506/*
4507 * Takes a bunch of requests and tries to merge them. Returns the number of
4508 * requests that remain after merging.
4509 */
4510static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4511 int num_reqs, MultiwriteCB *mcb)
4512{
4513 int i, outidx;
4514
4515 // Sort requests by start sector
4516 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4517
4518 // Check if adjacent requests touch the same clusters. If so, combine them,
4519 // filling up gaps with zero sectors.
4520 outidx = 0;
4521 for (i = 1; i < num_reqs; i++) {
4522 int merge = 0;
4523 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4524
b6a127a1 4525 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
4526 if (reqs[i].sector <= oldreq_last) {
4527 merge = 1;
4528 }
4529
e2a305fb
CH
4530 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4531 merge = 0;
4532 }
4533
40b4f539
KW
4534 if (merge) {
4535 size_t size;
7267c094 4536 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
4537 qemu_iovec_init(qiov,
4538 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4539
4540 // Add the first request to the merged one. If the requests are
4541 // overlapping, drop the last sectors of the first request.
4542 size = (reqs[i].sector - reqs[outidx].sector) << 9;
1b093c48 4543 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
40b4f539 4544
b6a127a1
PB
4545 // We should need to add any zeros between the two requests
4546 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
4547
4548 // Add the second request
1b093c48 4549 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
40b4f539 4550
391827eb
SH
4551 // Add tail of first request, if necessary
4552 if (qiov->size < reqs[outidx].qiov->size) {
4553 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4554 reqs[outidx].qiov->size - qiov->size);
4555 }
4556
cbf1dff2 4557 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
4558 reqs[outidx].qiov = qiov;
4559
4560 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4561 } else {
4562 outidx++;
4563 reqs[outidx].sector = reqs[i].sector;
4564 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4565 reqs[outidx].qiov = reqs[i].qiov;
4566 }
4567 }
4568
4569 return outidx + 1;
4570}
4571
4572/*
4573 * Submit multiple AIO write requests at once.
4574 *
4575 * On success, the function returns 0 and all requests in the reqs array have
4576 * been submitted. In error case this function returns -1, and any of the
4577 * requests may or may not be submitted yet. In particular, this means that the
4578 * callback will be called for some of the requests, for others it won't. The
4579 * caller must check the error field of the BlockRequest to wait for the right
4580 * callbacks (if error != 0, no callback will be called).
4581 *
4582 * The implementation may modify the contents of the reqs array, e.g. to merge
4583 * requests. However, the fields opaque and error are left unmodified as they
4584 * are used to signal failure for a single request to the caller.
4585 */
4586int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4587{
40b4f539
KW
4588 MultiwriteCB *mcb;
4589 int i;
4590
301db7c2
RH
4591 /* don't submit writes if we don't have a medium */
4592 if (bs->drv == NULL) {
4593 for (i = 0; i < num_reqs; i++) {
4594 reqs[i].error = -ENOMEDIUM;
4595 }
4596 return -1;
4597 }
4598
40b4f539
KW
4599 if (num_reqs == 0) {
4600 return 0;
4601 }
4602
4603 // Create MultiwriteCB structure
7267c094 4604 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
4605 mcb->num_requests = 0;
4606 mcb->num_callbacks = num_reqs;
4607
4608 for (i = 0; i < num_reqs; i++) {
4609 mcb->callbacks[i].cb = reqs[i].cb;
4610 mcb->callbacks[i].opaque = reqs[i].opaque;
4611 }
4612
4613 // Check for mergable requests
4614 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4615
6d519a5f
SH
4616 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4617
df9309fb
PB
4618 /* Run the aio requests. */
4619 mcb->num_requests = num_reqs;
40b4f539 4620 for (i = 0; i < num_reqs; i++) {
d20d9b7c
PB
4621 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4622 reqs[i].nb_sectors, reqs[i].flags,
4623 multiwrite_cb, mcb,
4624 true);
40b4f539
KW
4625 }
4626
4627 return 0;
40b4f539
KW
4628}
4629
83f64091 4630void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 4631{
ca5fd113
FZ
4632 qemu_aio_ref(acb);
4633 bdrv_aio_cancel_async(acb);
4634 while (acb->refcnt > 1) {
4635 if (acb->aiocb_info->get_aio_context) {
4636 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4637 } else if (acb->bs) {
4638 aio_poll(bdrv_get_aio_context(acb->bs), true);
4639 } else {
4640 abort();
02c50efe 4641 }
02c50efe 4642 }
8007429a 4643 qemu_aio_unref(acb);
02c50efe
FZ
4644}
4645
4646/* Async version of aio cancel. The caller is not blocked if the acb implements
4647 * cancel_async, otherwise we do nothing and let the request normally complete.
4648 * In either case the completion callback must be called. */
4649void bdrv_aio_cancel_async(BlockDriverAIOCB *acb)
4650{
4651 if (acb->aiocb_info->cancel_async) {
4652 acb->aiocb_info->cancel_async(acb);
4653 }
83f64091
FB
4654}
4655
4656/**************************************************************/
4657/* async block device emulation */
4658
c16b5a2c
CH
4659typedef struct BlockDriverAIOCBSync {
4660 BlockDriverAIOCB common;
4661 QEMUBH *bh;
4662 int ret;
4663 /* vector translation state */
4664 QEMUIOVector *qiov;
4665 uint8_t *bounce;
4666 int is_write;
4667} BlockDriverAIOCBSync;
4668
d7331bed 4669static const AIOCBInfo bdrv_em_aiocb_info = {
c16b5a2c 4670 .aiocb_size = sizeof(BlockDriverAIOCBSync),
c16b5a2c
CH
4671};
4672
ce1a14dc 4673static void bdrv_aio_bh_cb(void *opaque)
83f64091 4674{
ce1a14dc 4675 BlockDriverAIOCBSync *acb = opaque;
f141eafe 4676
857d4f46 4677 if (!acb->is_write && acb->ret >= 0) {
03396148 4678 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
857d4f46 4679 }
ceb42de8 4680 qemu_vfree(acb->bounce);
ce1a14dc 4681 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 4682 qemu_bh_delete(acb->bh);
36afc451 4683 acb->bh = NULL;
8007429a 4684 qemu_aio_unref(acb);
83f64091 4685}
beac80cd 4686
f141eafe
AL
4687static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4688 int64_t sector_num,
4689 QEMUIOVector *qiov,
4690 int nb_sectors,
4691 BlockDriverCompletionFunc *cb,
4692 void *opaque,
4693 int is_write)
4694
83f64091 4695{
ce1a14dc 4696 BlockDriverAIOCBSync *acb;
ce1a14dc 4697
d7331bed 4698 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
f141eafe
AL
4699 acb->is_write = is_write;
4700 acb->qiov = qiov;
857d4f46 4701 acb->bounce = qemu_try_blockalign(bs, qiov->size);
2572b37a 4702 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
f141eafe 4703
857d4f46
KW
4704 if (acb->bounce == NULL) {
4705 acb->ret = -ENOMEM;
4706 } else if (is_write) {
d5e6b161 4707 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
1ed20acf 4708 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 4709 } else {
1ed20acf 4710 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
4711 }
4712
ce1a14dc 4713 qemu_bh_schedule(acb->bh);
f141eafe 4714
ce1a14dc 4715 return &acb->common;
beac80cd
FB
4716}
4717
f141eafe
AL
4718static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4719 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 4720 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 4721{
f141eafe
AL
4722 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4723}
83f64091 4724
f141eafe
AL
4725static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4726 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4727 BlockDriverCompletionFunc *cb, void *opaque)
4728{
4729 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 4730}
beac80cd 4731
68485420
KW
4732
4733typedef struct BlockDriverAIOCBCoroutine {
4734 BlockDriverAIOCB common;
4735 BlockRequest req;
4736 bool is_write;
d318aea9 4737 bool *done;
68485420
KW
4738 QEMUBH* bh;
4739} BlockDriverAIOCBCoroutine;
4740
d7331bed 4741static const AIOCBInfo bdrv_em_co_aiocb_info = {
68485420 4742 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
68485420
KW
4743};
4744
35246a68 4745static void bdrv_co_em_bh(void *opaque)
68485420
KW
4746{
4747 BlockDriverAIOCBCoroutine *acb = opaque;
4748
4749 acb->common.cb(acb->common.opaque, acb->req.error);
d318aea9 4750
68485420 4751 qemu_bh_delete(acb->bh);
8007429a 4752 qemu_aio_unref(acb);
68485420
KW
4753}
4754
b2a61371
SH
4755/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4756static void coroutine_fn bdrv_co_do_rw(void *opaque)
4757{
4758 BlockDriverAIOCBCoroutine *acb = opaque;
4759 BlockDriverState *bs = acb->common.bs;
4760
4761 if (!acb->is_write) {
4762 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4763 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4764 } else {
4765 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4766 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4767 }
4768
2572b37a 4769 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2a61371
SH
4770 qemu_bh_schedule(acb->bh);
4771}
4772
68485420
KW
4773static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4774 int64_t sector_num,
4775 QEMUIOVector *qiov,
4776 int nb_sectors,
d20d9b7c 4777 BdrvRequestFlags flags,
68485420
KW
4778 BlockDriverCompletionFunc *cb,
4779 void *opaque,
8c5873d6 4780 bool is_write)
68485420
KW
4781{
4782 Coroutine *co;
4783 BlockDriverAIOCBCoroutine *acb;
4784
d7331bed 4785 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
68485420
KW
4786 acb->req.sector = sector_num;
4787 acb->req.nb_sectors = nb_sectors;
4788 acb->req.qiov = qiov;
d20d9b7c 4789 acb->req.flags = flags;
68485420
KW
4790 acb->is_write = is_write;
4791
8c5873d6 4792 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
4793 qemu_coroutine_enter(co, acb);
4794
4795 return &acb->common;
4796}
4797
07f07615 4798static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4799{
07f07615
PB
4800 BlockDriverAIOCBCoroutine *acb = opaque;
4801 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4802
07f07615 4803 acb->req.error = bdrv_co_flush(bs);
2572b37a 4804 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2e12bc6 4805 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
4806}
4807
07f07615 4808BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
4809 BlockDriverCompletionFunc *cb, void *opaque)
4810{
07f07615 4811 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4812
07f07615
PB
4813 Coroutine *co;
4814 BlockDriverAIOCBCoroutine *acb;
016f5cf6 4815
d7331bed 4816 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
d318aea9 4817
07f07615
PB
4818 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4819 qemu_coroutine_enter(co, acb);
016f5cf6 4820
016f5cf6
AG
4821 return &acb->common;
4822}
4823
4265d620
PB
4824static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4825{
4826 BlockDriverAIOCBCoroutine *acb = opaque;
4827 BlockDriverState *bs = acb->common.bs;
4828
4829 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2572b37a 4830 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4265d620
PB
4831 qemu_bh_schedule(acb->bh);
4832}
4833
4834BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4835 int64_t sector_num, int nb_sectors,
4836 BlockDriverCompletionFunc *cb, void *opaque)
4837{
4838 Coroutine *co;
4839 BlockDriverAIOCBCoroutine *acb;
4840
4841 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4842
d7331bed 4843 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4265d620
PB
4844 acb->req.sector = sector_num;
4845 acb->req.nb_sectors = nb_sectors;
4846 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4847 qemu_coroutine_enter(co, acb);
4848
4849 return &acb->common;
4850}
4851
ea2384d3
FB
4852void bdrv_init(void)
4853{
5efa9d5a 4854 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 4855}
ce1a14dc 4856
eb852011
MA
4857void bdrv_init_with_whitelist(void)
4858{
4859 use_bdrv_whitelist = 1;
4860 bdrv_init();
4861}
4862
d7331bed 4863void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
c16b5a2c 4864 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 4865{
ce1a14dc
PB
4866 BlockDriverAIOCB *acb;
4867
d7331bed
SH
4868 acb = g_slice_alloc(aiocb_info->aiocb_size);
4869 acb->aiocb_info = aiocb_info;
ce1a14dc
PB
4870 acb->bs = bs;
4871 acb->cb = cb;
4872 acb->opaque = opaque;
f197fe2b 4873 acb->refcnt = 1;
ce1a14dc
PB
4874 return acb;
4875}
4876
f197fe2b
FZ
4877void qemu_aio_ref(void *p)
4878{
4879 BlockDriverAIOCB *acb = p;
4880 acb->refcnt++;
4881}
4882
8007429a 4883void qemu_aio_unref(void *p)
ce1a14dc 4884{
d37c975f 4885 BlockDriverAIOCB *acb = p;
f197fe2b
FZ
4886 assert(acb->refcnt > 0);
4887 if (--acb->refcnt == 0) {
4888 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4889 }
ce1a14dc 4890}
19cb3738 4891
f9f05dc5
KW
4892/**************************************************************/
4893/* Coroutine block device emulation */
4894
4895typedef struct CoroutineIOCompletion {
4896 Coroutine *coroutine;
4897 int ret;
4898} CoroutineIOCompletion;
4899
4900static void bdrv_co_io_em_complete(void *opaque, int ret)
4901{
4902 CoroutineIOCompletion *co = opaque;
4903
4904 co->ret = ret;
4905 qemu_coroutine_enter(co->coroutine, NULL);
4906}
4907
4908static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4909 int nb_sectors, QEMUIOVector *iov,
4910 bool is_write)
4911{
4912 CoroutineIOCompletion co = {
4913 .coroutine = qemu_coroutine_self(),
4914 };
4915 BlockDriverAIOCB *acb;
4916
4917 if (is_write) {
a652d160
SH
4918 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4919 bdrv_co_io_em_complete, &co);
f9f05dc5 4920 } else {
a652d160
SH
4921 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4922 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
4923 }
4924
59370aaa 4925 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
4926 if (!acb) {
4927 return -EIO;
4928 }
4929 qemu_coroutine_yield();
4930
4931 return co.ret;
4932}
4933
4934static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4935 int64_t sector_num, int nb_sectors,
4936 QEMUIOVector *iov)
4937{
4938 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4939}
4940
4941static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4942 int64_t sector_num, int nb_sectors,
4943 QEMUIOVector *iov)
4944{
4945 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4946}
4947
07f07615 4948static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 4949{
07f07615
PB
4950 RwCo *rwco = opaque;
4951
4952 rwco->ret = bdrv_co_flush(rwco->bs);
4953}
4954
4955int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4956{
eb489bb1
KW
4957 int ret;
4958
29cdb251 4959 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 4960 return 0;
eb489bb1
KW
4961 }
4962
ca716364 4963 /* Write back cached data to the OS even with cache=unsafe */
bf736fe3 4964 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
eb489bb1
KW
4965 if (bs->drv->bdrv_co_flush_to_os) {
4966 ret = bs->drv->bdrv_co_flush_to_os(bs);
4967 if (ret < 0) {
4968 return ret;
4969 }
4970 }
4971
ca716364
KW
4972 /* But don't actually force it to the disk with cache=unsafe */
4973 if (bs->open_flags & BDRV_O_NO_FLUSH) {
d4c82329 4974 goto flush_parent;
ca716364
KW
4975 }
4976
bf736fe3 4977 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
eb489bb1 4978 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 4979 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
4980 } else if (bs->drv->bdrv_aio_flush) {
4981 BlockDriverAIOCB *acb;
4982 CoroutineIOCompletion co = {
4983 .coroutine = qemu_coroutine_self(),
4984 };
4985
4986 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4987 if (acb == NULL) {
29cdb251 4988 ret = -EIO;
07f07615
PB
4989 } else {
4990 qemu_coroutine_yield();
29cdb251 4991 ret = co.ret;
07f07615 4992 }
07f07615
PB
4993 } else {
4994 /*
4995 * Some block drivers always operate in either writethrough or unsafe
4996 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4997 * know how the server works (because the behaviour is hardcoded or
4998 * depends on server-side configuration), so we can't ensure that
4999 * everything is safe on disk. Returning an error doesn't work because
5000 * that would break guests even if the server operates in writethrough
5001 * mode.
5002 *
5003 * Let's hope the user knows what he's doing.
5004 */
29cdb251 5005 ret = 0;
07f07615 5006 }
29cdb251
PB
5007 if (ret < 0) {
5008 return ret;
5009 }
5010
5011 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
5012 * in the case of cache=unsafe, so there are no useless flushes.
5013 */
d4c82329 5014flush_parent:
29cdb251 5015 return bdrv_co_flush(bs->file);
07f07615
PB
5016}
5017
5a8a30db 5018void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
0f15423c 5019{
5a8a30db
KW
5020 Error *local_err = NULL;
5021 int ret;
5022
3456a8d1
KW
5023 if (!bs->drv) {
5024 return;
5025 }
5026
7ea2d269
AK
5027 if (!(bs->open_flags & BDRV_O_INCOMING)) {
5028 return;
5029 }
5030 bs->open_flags &= ~BDRV_O_INCOMING;
5031
3456a8d1 5032 if (bs->drv->bdrv_invalidate_cache) {
5a8a30db 5033 bs->drv->bdrv_invalidate_cache(bs, &local_err);
3456a8d1 5034 } else if (bs->file) {
5a8a30db
KW
5035 bdrv_invalidate_cache(bs->file, &local_err);
5036 }
5037 if (local_err) {
5038 error_propagate(errp, local_err);
5039 return;
0f15423c 5040 }
3456a8d1 5041
5a8a30db
KW
5042 ret = refresh_total_sectors(bs, bs->total_sectors);
5043 if (ret < 0) {
5044 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5045 return;
5046 }
0f15423c
AL
5047}
5048
5a8a30db 5049void bdrv_invalidate_cache_all(Error **errp)
0f15423c
AL
5050{
5051 BlockDriverState *bs;
5a8a30db 5052 Error *local_err = NULL;
0f15423c 5053
dc364f4c 5054 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5055 AioContext *aio_context = bdrv_get_aio_context(bs);
5056
5057 aio_context_acquire(aio_context);
5a8a30db 5058 bdrv_invalidate_cache(bs, &local_err);
ed78cda3 5059 aio_context_release(aio_context);
5a8a30db
KW
5060 if (local_err) {
5061 error_propagate(errp, local_err);
5062 return;
5063 }
0f15423c
AL
5064 }
5065}
5066
07f07615
PB
5067int bdrv_flush(BlockDriverState *bs)
5068{
5069 Coroutine *co;
5070 RwCo rwco = {
5071 .bs = bs,
5072 .ret = NOT_DONE,
e7a8a783 5073 };
e7a8a783 5074
07f07615
PB
5075 if (qemu_in_coroutine()) {
5076 /* Fast-path if already in coroutine context */
5077 bdrv_flush_co_entry(&rwco);
5078 } else {
2572b37a
SH
5079 AioContext *aio_context = bdrv_get_aio_context(bs);
5080
07f07615
PB
5081 co = qemu_coroutine_create(bdrv_flush_co_entry);
5082 qemu_coroutine_enter(co, &rwco);
5083 while (rwco.ret == NOT_DONE) {
2572b37a 5084 aio_poll(aio_context, true);
07f07615 5085 }
e7a8a783 5086 }
07f07615
PB
5087
5088 return rwco.ret;
e7a8a783
KW
5089}
5090
775aa8b6
KW
5091typedef struct DiscardCo {
5092 BlockDriverState *bs;
5093 int64_t sector_num;
5094 int nb_sectors;
5095 int ret;
5096} DiscardCo;
4265d620
PB
5097static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5098{
775aa8b6 5099 DiscardCo *rwco = opaque;
4265d620
PB
5100
5101 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5102}
5103
6f14da52
PL
5104/* if no limit is specified in the BlockLimits use a default
5105 * of 32768 512-byte sectors (16 MiB) per request.
5106 */
5107#define MAX_DISCARD_DEFAULT 32768
5108
4265d620
PB
5109int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5110 int nb_sectors)
5111{
d51e9fe5
PB
5112 int max_discard;
5113
4265d620
PB
5114 if (!bs->drv) {
5115 return -ENOMEDIUM;
5116 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5117 return -EIO;
5118 } else if (bs->read_only) {
5119 return -EROFS;
df702c9b
PB
5120 }
5121
e4654d2d 5122 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5123
9e8f1835
PB
5124 /* Do nothing if disabled. */
5125 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5126 return 0;
5127 }
5128
d51e9fe5
PB
5129 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5130 return 0;
5131 }
6f14da52 5132
d51e9fe5
PB
5133 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5134 while (nb_sectors > 0) {
5135 int ret;
5136 int num = nb_sectors;
6f14da52 5137
d51e9fe5
PB
5138 /* align request */
5139 if (bs->bl.discard_alignment &&
5140 num >= bs->bl.discard_alignment &&
5141 sector_num % bs->bl.discard_alignment) {
5142 if (num > bs->bl.discard_alignment) {
5143 num = bs->bl.discard_alignment;
6f14da52 5144 }
d51e9fe5
PB
5145 num -= sector_num % bs->bl.discard_alignment;
5146 }
6f14da52 5147
d51e9fe5
PB
5148 /* limit request size */
5149 if (num > max_discard) {
5150 num = max_discard;
5151 }
6f14da52 5152
d51e9fe5 5153 if (bs->drv->bdrv_co_discard) {
6f14da52 5154 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5
PB
5155 } else {
5156 BlockDriverAIOCB *acb;
5157 CoroutineIOCompletion co = {
5158 .coroutine = qemu_coroutine_self(),
5159 };
5160
5161 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5162 bdrv_co_io_em_complete, &co);
5163 if (acb == NULL) {
5164 return -EIO;
5165 } else {
5166 qemu_coroutine_yield();
5167 ret = co.ret;
6f14da52 5168 }
6f14da52 5169 }
7ce21016 5170 if (ret && ret != -ENOTSUP) {
d51e9fe5 5171 return ret;
4265d620 5172 }
d51e9fe5
PB
5173
5174 sector_num += num;
5175 nb_sectors -= num;
4265d620 5176 }
d51e9fe5 5177 return 0;
4265d620
PB
5178}
5179
5180int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5181{
5182 Coroutine *co;
775aa8b6 5183 DiscardCo rwco = {
4265d620
PB
5184 .bs = bs,
5185 .sector_num = sector_num,
5186 .nb_sectors = nb_sectors,
5187 .ret = NOT_DONE,
5188 };
5189
5190 if (qemu_in_coroutine()) {
5191 /* Fast-path if already in coroutine context */
5192 bdrv_discard_co_entry(&rwco);
5193 } else {
2572b37a
SH
5194 AioContext *aio_context = bdrv_get_aio_context(bs);
5195
4265d620
PB
5196 co = qemu_coroutine_create(bdrv_discard_co_entry);
5197 qemu_coroutine_enter(co, &rwco);
5198 while (rwco.ret == NOT_DONE) {
2572b37a 5199 aio_poll(aio_context, true);
4265d620
PB
5200 }
5201 }
5202
5203 return rwco.ret;
5204}
5205
19cb3738
FB
5206/**************************************************************/
5207/* removable device support */
5208
5209/**
5210 * Return TRUE if the media is present
5211 */
5212int bdrv_is_inserted(BlockDriverState *bs)
5213{
5214 BlockDriver *drv = bs->drv;
a1aff5bf 5215
19cb3738
FB
5216 if (!drv)
5217 return 0;
5218 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5219 return 1;
5220 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5221}
5222
5223/**
8e49ca46
MA
5224 * Return whether the media changed since the last call to this
5225 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5226 */
5227int bdrv_media_changed(BlockDriverState *bs)
5228{
5229 BlockDriver *drv = bs->drv;
19cb3738 5230
8e49ca46
MA
5231 if (drv && drv->bdrv_media_changed) {
5232 return drv->bdrv_media_changed(bs);
5233 }
5234 return -ENOTSUP;
19cb3738
FB
5235}
5236
5237/**
5238 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5239 */
f36f3949 5240void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
5241{
5242 BlockDriver *drv = bs->drv;
bfb197e0 5243 const char *device_name;
19cb3738 5244
822e1cd1
MA
5245 if (drv && drv->bdrv_eject) {
5246 drv->bdrv_eject(bs, eject_flag);
19cb3738 5247 }
6f382ed2 5248
bfb197e0
MA
5249 device_name = bdrv_get_device_name(bs);
5250 if (device_name[0] != '\0') {
5251 qapi_event_send_device_tray_moved(device_name,
a5ee7bd4 5252 eject_flag, &error_abort);
6f382ed2 5253 }
19cb3738
FB
5254}
5255
19cb3738
FB
5256/**
5257 * Lock or unlock the media (if it is locked, the user won't be able
5258 * to eject it manually).
5259 */
025e849a 5260void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
5261{
5262 BlockDriver *drv = bs->drv;
5263
025e849a 5264 trace_bdrv_lock_medium(bs, locked);
b8c6d095 5265
025e849a
MA
5266 if (drv && drv->bdrv_lock_medium) {
5267 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
5268 }
5269}
985a03b0
TS
5270
5271/* needed for generic scsi interface */
5272
5273int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5274{
5275 BlockDriver *drv = bs->drv;
5276
5277 if (drv && drv->bdrv_ioctl)
5278 return drv->bdrv_ioctl(bs, req, buf);
5279 return -ENOTSUP;
5280}
7d780669 5281
221f715d
AL
5282BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5283 unsigned long int req, void *buf,
5284 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 5285{
221f715d 5286 BlockDriver *drv = bs->drv;
7d780669 5287
221f715d
AL
5288 if (drv && drv->bdrv_aio_ioctl)
5289 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5290 return NULL;
7d780669 5291}
e268ca52 5292
1b7fd729 5293void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
7b6f9300 5294{
1b7fd729 5295 bs->guest_block_size = align;
7b6f9300 5296}
7cd1e32a 5297
e268ca52
AL
5298void *qemu_blockalign(BlockDriverState *bs, size_t size)
5299{
339064d5 5300 return qemu_memalign(bdrv_opt_mem_align(bs), size);
e268ca52 5301}
7cd1e32a 5302
7d2a35cc
KW
5303void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5304{
5305 size_t align = bdrv_opt_mem_align(bs);
5306
5307 /* Ensure that NULL is never returned on success */
5308 assert(align > 0);
5309 if (size == 0) {
5310 size = align;
5311 }
5312
5313 return qemu_try_memalign(align, size);
5314}
5315
c53b1c51
SH
5316/*
5317 * Check if all memory in this vector is sector aligned.
5318 */
5319bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5320{
5321 int i;
339064d5 5322 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5323
5324 for (i = 0; i < qiov->niov; i++) {
339064d5 5325 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5326 return false;
1ff735bd 5327 }
339064d5 5328 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5329 return false;
c53b1c51
SH
5330 }
5331 }
5332
5333 return true;
5334}
5335
b8afb520
FZ
5336BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5337 Error **errp)
7cd1e32a
LS
5338{
5339 int64_t bitmap_size;
e4654d2d 5340 BdrvDirtyBitmap *bitmap;
a55eb92c 5341
50717e94
PB
5342 assert((granularity & (granularity - 1)) == 0);
5343
e4654d2d
FZ
5344 granularity >>= BDRV_SECTOR_BITS;
5345 assert(granularity);
57322b78 5346 bitmap_size = bdrv_nb_sectors(bs);
b8afb520
FZ
5347 if (bitmap_size < 0) {
5348 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5349 errno = -bitmap_size;
5350 return NULL;
5351 }
5839e53b 5352 bitmap = g_new0(BdrvDirtyBitmap, 1);
e4654d2d
FZ
5353 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5354 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5355 return bitmap;
5356}
5357
5358void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5359{
5360 BdrvDirtyBitmap *bm, *next;
5361 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5362 if (bm == bitmap) {
5363 QLIST_REMOVE(bitmap, list);
5364 hbitmap_free(bitmap->bitmap);
5365 g_free(bitmap);
5366 return;
a55eb92c 5367 }
7cd1e32a
LS
5368 }
5369}
5370
21b56835
FZ
5371BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5372{
5373 BdrvDirtyBitmap *bm;
5374 BlockDirtyInfoList *list = NULL;
5375 BlockDirtyInfoList **plist = &list;
5376
5377 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5839e53b
MA
5378 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5379 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
21b56835
FZ
5380 info->count = bdrv_get_dirty_count(bs, bm);
5381 info->granularity =
5382 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5383 entry->value = info;
5384 *plist = entry;
5385 plist = &entry->next;
5386 }
5387
5388 return list;
5389}
5390
e4654d2d 5391int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5392{
e4654d2d
FZ
5393 if (bitmap) {
5394 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5395 } else {
5396 return 0;
5397 }
5398}
5399
e4654d2d
FZ
5400void bdrv_dirty_iter_init(BlockDriverState *bs,
5401 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
1755da16 5402{
e4654d2d 5403 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
1755da16
PB
5404}
5405
5406void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5407 int nr_sectors)
5408{
e4654d2d
FZ
5409 BdrvDirtyBitmap *bitmap;
5410 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5411 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5412 }
1755da16
PB
5413}
5414
e4654d2d 5415void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5416{
e4654d2d
FZ
5417 BdrvDirtyBitmap *bitmap;
5418 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5419 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5420 }
7cd1e32a 5421}
aaa0eb75 5422
e4654d2d 5423int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
aaa0eb75 5424{
e4654d2d 5425 return hbitmap_count(bitmap->bitmap);
aaa0eb75 5426}
f88e1a42 5427
9fcb0251
FZ
5428/* Get a reference to bs */
5429void bdrv_ref(BlockDriverState *bs)
5430{
5431 bs->refcnt++;
5432}
5433
5434/* Release a previously grabbed reference to bs.
5435 * If after releasing, reference count is zero, the BlockDriverState is
5436 * deleted. */
5437void bdrv_unref(BlockDriverState *bs)
5438{
9a4d5ca6
JC
5439 if (!bs) {
5440 return;
5441 }
9fcb0251
FZ
5442 assert(bs->refcnt > 0);
5443 if (--bs->refcnt == 0) {
5444 bdrv_delete(bs);
5445 }
5446}
5447
fbe40ff7
FZ
5448struct BdrvOpBlocker {
5449 Error *reason;
5450 QLIST_ENTRY(BdrvOpBlocker) list;
5451};
5452
5453bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5454{
5455 BdrvOpBlocker *blocker;
5456 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5457 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5458 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5459 if (errp) {
5460 error_setg(errp, "Device '%s' is busy: %s",
bfb197e0
MA
5461 bdrv_get_device_name(bs),
5462 error_get_pretty(blocker->reason));
fbe40ff7
FZ
5463 }
5464 return true;
5465 }
5466 return false;
5467}
5468
5469void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5470{
5471 BdrvOpBlocker *blocker;
5472 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5473
5839e53b 5474 blocker = g_new0(BdrvOpBlocker, 1);
fbe40ff7
FZ
5475 blocker->reason = reason;
5476 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5477}
5478
5479void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5480{
5481 BdrvOpBlocker *blocker, *next;
5482 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5483 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5484 if (blocker->reason == reason) {
5485 QLIST_REMOVE(blocker, list);
5486 g_free(blocker);
5487 }
5488 }
5489}
5490
5491void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5492{
5493 int i;
5494 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5495 bdrv_op_block(bs, i, reason);
5496 }
5497}
5498
5499void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5500{
5501 int i;
5502 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5503 bdrv_op_unblock(bs, i, reason);
5504 }
5505}
5506
5507bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5508{
5509 int i;
5510
5511 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5512 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5513 return false;
5514 }
5515 }
5516 return true;
5517}
5518
28a7282a
LC
5519void bdrv_iostatus_enable(BlockDriverState *bs)
5520{
d6bf279e 5521 bs->iostatus_enabled = true;
58e21ef5 5522 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
5523}
5524
5525/* The I/O status is only enabled if the drive explicitly
5526 * enables it _and_ the VM is configured to stop on errors */
5527bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5528{
d6bf279e 5529 return (bs->iostatus_enabled &&
92aa5c6d
PB
5530 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5531 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5532 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5533}
5534
5535void bdrv_iostatus_disable(BlockDriverState *bs)
5536{
d6bf279e 5537 bs->iostatus_enabled = false;
28a7282a
LC
5538}
5539
5540void bdrv_iostatus_reset(BlockDriverState *bs)
5541{
5542 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5543 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5544 if (bs->job) {
5545 block_job_iostatus_reset(bs->job);
5546 }
28a7282a
LC
5547 }
5548}
5549
28a7282a
LC
5550void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5551{
3e1caa5f
PB
5552 assert(bdrv_iostatus_is_enabled(bs));
5553 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5554 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5555 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5556 }
5557}
5558
d92ada22
LC
5559void bdrv_img_create(const char *filename, const char *fmt,
5560 const char *base_filename, const char *base_fmt,
f382d43a
MR
5561 char *options, uint64_t img_size, int flags,
5562 Error **errp, bool quiet)
f88e1a42 5563{
83d0521a
CL
5564 QemuOptsList *create_opts = NULL;
5565 QemuOpts *opts = NULL;
5566 const char *backing_fmt, *backing_file;
5567 int64_t size;
f88e1a42 5568 BlockDriver *drv, *proto_drv;
96df67d1 5569 BlockDriver *backing_drv = NULL;
cc84d90f 5570 Error *local_err = NULL;
f88e1a42
JS
5571 int ret = 0;
5572
5573 /* Find driver and parse its options */
5574 drv = bdrv_find_format(fmt);
5575 if (!drv) {
71c79813 5576 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5577 return;
f88e1a42
JS
5578 }
5579
98289620 5580 proto_drv = bdrv_find_protocol(filename, true);
f88e1a42 5581 if (!proto_drv) {
71c79813 5582 error_setg(errp, "Unknown protocol '%s'", filename);
d92ada22 5583 return;
f88e1a42
JS
5584 }
5585
c282e1fd
CL
5586 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5587 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
f88e1a42
JS
5588
5589 /* Create parameter list with default values */
83d0521a
CL
5590 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5591 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
f88e1a42
JS
5592
5593 /* Parse -o options */
5594 if (options) {
83d0521a
CL
5595 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5596 error_setg(errp, "Invalid options for file format '%s'", fmt);
f88e1a42
JS
5597 goto out;
5598 }
5599 }
5600
5601 if (base_filename) {
83d0521a 5602 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
71c79813
LC
5603 error_setg(errp, "Backing file not supported for file format '%s'",
5604 fmt);
f88e1a42
JS
5605 goto out;
5606 }
5607 }
5608
5609 if (base_fmt) {
83d0521a 5610 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
71c79813
LC
5611 error_setg(errp, "Backing file format not supported for file "
5612 "format '%s'", fmt);
f88e1a42
JS
5613 goto out;
5614 }
5615 }
5616
83d0521a
CL
5617 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5618 if (backing_file) {
5619 if (!strcmp(filename, backing_file)) {
71c79813
LC
5620 error_setg(errp, "Error: Trying to create an image with the "
5621 "same filename as the backing file");
792da93a
JS
5622 goto out;
5623 }
5624 }
5625
83d0521a
CL
5626 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5627 if (backing_fmt) {
5628 backing_drv = bdrv_find_format(backing_fmt);
96df67d1 5629 if (!backing_drv) {
71c79813 5630 error_setg(errp, "Unknown backing file format '%s'",
83d0521a 5631 backing_fmt);
f88e1a42
JS
5632 goto out;
5633 }
5634 }
5635
5636 // The size for the image must always be specified, with one exception:
5637 // If we are using a backing file, we can obtain the size from there
83d0521a
CL
5638 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5639 if (size == -1) {
5640 if (backing_file) {
66f6b814 5641 BlockDriverState *bs;
52bf1e72 5642 int64_t size;
63090dac
PB
5643 int back_flags;
5644
5645 /* backing files always opened read-only */
5646 back_flags =
5647 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5648
f67503e5 5649 bs = NULL;
83d0521a 5650 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
cc84d90f 5651 backing_drv, &local_err);
f88e1a42 5652 if (ret < 0) {
cc84d90f 5653 error_setg_errno(errp, -ret, "Could not open '%s': %s",
83d0521a 5654 backing_file,
cc84d90f
HR
5655 error_get_pretty(local_err));
5656 error_free(local_err);
5657 local_err = NULL;
f88e1a42
JS
5658 goto out;
5659 }
52bf1e72
MA
5660 size = bdrv_getlength(bs);
5661 if (size < 0) {
5662 error_setg_errno(errp, -size, "Could not get size of '%s'",
5663 backing_file);
5664 bdrv_unref(bs);
5665 goto out;
5666 }
f88e1a42 5667
83d0521a 5668 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
66f6b814
HR
5669
5670 bdrv_unref(bs);
f88e1a42 5671 } else {
71c79813 5672 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5673 goto out;
5674 }
5675 }
5676
f382d43a
MR
5677 if (!quiet) {
5678 printf("Formatting '%s', fmt=%s ", filename, fmt);
83d0521a 5679 qemu_opts_print(opts);
f382d43a
MR
5680 puts("");
5681 }
83d0521a 5682
c282e1fd 5683 ret = bdrv_create(drv, filename, opts, &local_err);
83d0521a 5684
cc84d90f
HR
5685 if (ret == -EFBIG) {
5686 /* This is generally a better message than whatever the driver would
5687 * deliver (especially because of the cluster_size_hint), since that
5688 * is most probably not much different from "image too large". */
5689 const char *cluster_size_hint = "";
83d0521a 5690 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
cc84d90f 5691 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5692 }
cc84d90f
HR
5693 error_setg(errp, "The image size is too large for file format '%s'"
5694 "%s", fmt, cluster_size_hint);
5695 error_free(local_err);
5696 local_err = NULL;
f88e1a42
JS
5697 }
5698
5699out:
83d0521a
CL
5700 qemu_opts_del(opts);
5701 qemu_opts_free(create_opts);
84d18f06 5702 if (local_err) {
cc84d90f
HR
5703 error_propagate(errp, local_err);
5704 }
f88e1a42 5705}
85d126f3
SH
5706
5707AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5708{
dcd04228
SH
5709 return bs->aio_context;
5710}
5711
5712void bdrv_detach_aio_context(BlockDriverState *bs)
5713{
33384421
HR
5714 BdrvAioNotifier *baf;
5715
dcd04228
SH
5716 if (!bs->drv) {
5717 return;
5718 }
5719
33384421
HR
5720 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5721 baf->detach_aio_context(baf->opaque);
5722 }
5723
13af91eb
SH
5724 if (bs->io_limits_enabled) {
5725 throttle_detach_aio_context(&bs->throttle_state);
5726 }
dcd04228
SH
5727 if (bs->drv->bdrv_detach_aio_context) {
5728 bs->drv->bdrv_detach_aio_context(bs);
5729 }
5730 if (bs->file) {
5731 bdrv_detach_aio_context(bs->file);
5732 }
5733 if (bs->backing_hd) {
5734 bdrv_detach_aio_context(bs->backing_hd);
5735 }
5736
5737 bs->aio_context = NULL;
5738}
5739
5740void bdrv_attach_aio_context(BlockDriverState *bs,
5741 AioContext *new_context)
5742{
33384421
HR
5743 BdrvAioNotifier *ban;
5744
dcd04228
SH
5745 if (!bs->drv) {
5746 return;
5747 }
5748
5749 bs->aio_context = new_context;
5750
5751 if (bs->backing_hd) {
5752 bdrv_attach_aio_context(bs->backing_hd, new_context);
5753 }
5754 if (bs->file) {
5755 bdrv_attach_aio_context(bs->file, new_context);
5756 }
5757 if (bs->drv->bdrv_attach_aio_context) {
5758 bs->drv->bdrv_attach_aio_context(bs, new_context);
5759 }
13af91eb
SH
5760 if (bs->io_limits_enabled) {
5761 throttle_attach_aio_context(&bs->throttle_state, new_context);
5762 }
33384421
HR
5763
5764 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5765 ban->attached_aio_context(new_context, ban->opaque);
5766 }
dcd04228
SH
5767}
5768
5769void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5770{
5771 bdrv_drain_all(); /* ensure there are no in-flight requests */
5772
5773 bdrv_detach_aio_context(bs);
5774
5775 /* This function executes in the old AioContext so acquire the new one in
5776 * case it runs in a different thread.
5777 */
5778 aio_context_acquire(new_context);
5779 bdrv_attach_aio_context(bs, new_context);
5780 aio_context_release(new_context);
85d126f3 5781}
d616b224 5782
33384421
HR
5783void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5784 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5785 void (*detach_aio_context)(void *opaque), void *opaque)
5786{
5787 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5788 *ban = (BdrvAioNotifier){
5789 .attached_aio_context = attached_aio_context,
5790 .detach_aio_context = detach_aio_context,
5791 .opaque = opaque
5792 };
5793
5794 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5795}
5796
5797void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5798 void (*attached_aio_context)(AioContext *,
5799 void *),
5800 void (*detach_aio_context)(void *),
5801 void *opaque)
5802{
5803 BdrvAioNotifier *ban, *ban_next;
5804
5805 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5806 if (ban->attached_aio_context == attached_aio_context &&
5807 ban->detach_aio_context == detach_aio_context &&
5808 ban->opaque == opaque)
5809 {
5810 QLIST_REMOVE(ban, list);
5811 g_free(ban);
5812
5813 return;
5814 }
5815 }
5816
5817 abort();
5818}
5819
d616b224
SH
5820void bdrv_add_before_write_notifier(BlockDriverState *bs,
5821 NotifierWithReturn *notifier)
5822{
5823 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5824}
6f176b48 5825
c282e1fd 5826int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
6f176b48 5827{
c282e1fd 5828 if (!bs->drv->bdrv_amend_options) {
6f176b48
HR
5829 return -ENOTSUP;
5830 }
c282e1fd 5831 return bs->drv->bdrv_amend_options(bs, opts);
6f176b48 5832}
f6186f49 5833
b5042a36
BC
5834/* This function will be called by the bdrv_recurse_is_first_non_filter method
5835 * of block filter and by bdrv_is_first_non_filter.
5836 * It is used to test if the given bs is the candidate or recurse more in the
5837 * node graph.
212a5a8f 5838 */
b5042a36 5839bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 5840 BlockDriverState *candidate)
f6186f49 5841{
b5042a36
BC
5842 /* return false if basic checks fails */
5843 if (!bs || !bs->drv) {
212a5a8f 5844 return false;
f6186f49
BC
5845 }
5846
b5042a36
BC
5847 /* the code reached a non block filter driver -> check if the bs is
5848 * the same as the candidate. It's the recursion termination condition.
5849 */
5850 if (!bs->drv->is_filter) {
5851 return bs == candidate;
212a5a8f 5852 }
b5042a36 5853 /* Down this path the driver is a block filter driver */
212a5a8f 5854
b5042a36
BC
5855 /* If the block filter recursion method is defined use it to recurse down
5856 * the node graph.
5857 */
5858 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 5859 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5860 }
5861
b5042a36
BC
5862 /* the driver is a block filter but don't allow to recurse -> return false
5863 */
5864 return false;
f6186f49
BC
5865}
5866
212a5a8f
BC
5867/* This function checks if the candidate is the first non filter bs down it's
5868 * bs chain. Since we don't have pointers to parents it explore all bs chains
5869 * from the top. Some filters can choose not to pass down the recursion.
5870 */
5871bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5872{
212a5a8f
BC
5873 BlockDriverState *bs;
5874
5875 /* walk down the bs forest recursively */
5876 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5877 bool perm;
5878
b5042a36 5879 /* try to recurse in this top level bs */
e6dc8a1f 5880 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5881
5882 /* candidate is the first non filter */
5883 if (perm) {
5884 return true;
5885 }
5886 }
5887
5888 return false;
f6186f49 5889}
09158f00
BC
5890
5891BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5892{
5893 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5894 if (!to_replace_bs) {
5895 error_setg(errp, "Node name '%s' not found", node_name);
5896 return NULL;
5897 }
5898
5899 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5900 return NULL;
5901 }
5902
5903 /* We don't want arbitrary node of the BDS chain to be replaced only the top
5904 * most non filter in order to prevent data corruption.
5905 * Another benefit is that this tests exclude backing files which are
5906 * blocked by the backing blockers.
5907 */
5908 if (!bdrv_is_first_non_filter(to_replace_bs)) {
5909 error_setg(errp, "Only top most non filter can be replaced");
5910 return NULL;
5911 }
5912
5913 return to_replace_bs;
5914}
448ad91d
ML
5915
5916void bdrv_io_plug(BlockDriverState *bs)
5917{
5918 BlockDriver *drv = bs->drv;
5919 if (drv && drv->bdrv_io_plug) {
5920 drv->bdrv_io_plug(bs);
5921 } else if (bs->file) {
5922 bdrv_io_plug(bs->file);
5923 }
5924}
5925
5926void bdrv_io_unplug(BlockDriverState *bs)
5927{
5928 BlockDriver *drv = bs->drv;
5929 if (drv && drv->bdrv_io_unplug) {
5930 drv->bdrv_io_unplug(bs);
5931 } else if (bs->file) {
5932 bdrv_io_unplug(bs->file);
5933 }
5934}
5935
5936void bdrv_flush_io_queue(BlockDriverState *bs)
5937{
5938 BlockDriver *drv = bs->drv;
5939 if (drv && drv->bdrv_flush_io_queue) {
5940 drv->bdrv_flush_io_queue(bs);
5941 } else if (bs->file) {
5942 bdrv_flush_io_queue(bs->file);
5943 }
5944}
91af7014
HR
5945
5946static bool append_open_options(QDict *d, BlockDriverState *bs)
5947{
5948 const QDictEntry *entry;
5949 bool found_any = false;
5950
5951 for (entry = qdict_first(bs->options); entry;
5952 entry = qdict_next(bs->options, entry))
5953 {
5954 /* Only take options for this level and exclude all non-driver-specific
5955 * options */
5956 if (!strchr(qdict_entry_key(entry), '.') &&
5957 strcmp(qdict_entry_key(entry), "node-name"))
5958 {
5959 qobject_incref(qdict_entry_value(entry));
5960 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5961 found_any = true;
5962 }
5963 }
5964
5965 return found_any;
5966}
5967
5968/* Updates the following BDS fields:
5969 * - exact_filename: A filename which may be used for opening a block device
5970 * which (mostly) equals the given BDS (even without any
5971 * other options; so reading and writing must return the same
5972 * results, but caching etc. may be different)
5973 * - full_open_options: Options which, when given when opening a block device
5974 * (without a filename), result in a BDS (mostly)
5975 * equalling the given one
5976 * - filename: If exact_filename is set, it is copied here. Otherwise,
5977 * full_open_options is converted to a JSON object, prefixed with
5978 * "json:" (for use through the JSON pseudo protocol) and put here.
5979 */
5980void bdrv_refresh_filename(BlockDriverState *bs)
5981{
5982 BlockDriver *drv = bs->drv;
5983 QDict *opts;
5984
5985 if (!drv) {
5986 return;
5987 }
5988
5989 /* This BDS's file name will most probably depend on its file's name, so
5990 * refresh that first */
5991 if (bs->file) {
5992 bdrv_refresh_filename(bs->file);
5993 }
5994
5995 if (drv->bdrv_refresh_filename) {
5996 /* Obsolete information is of no use here, so drop the old file name
5997 * information before refreshing it */
5998 bs->exact_filename[0] = '\0';
5999 if (bs->full_open_options) {
6000 QDECREF(bs->full_open_options);
6001 bs->full_open_options = NULL;
6002 }
6003
6004 drv->bdrv_refresh_filename(bs);
6005 } else if (bs->file) {
6006 /* Try to reconstruct valid information from the underlying file */
6007 bool has_open_options;
6008
6009 bs->exact_filename[0] = '\0';
6010 if (bs->full_open_options) {
6011 QDECREF(bs->full_open_options);
6012 bs->full_open_options = NULL;
6013 }
6014
6015 opts = qdict_new();
6016 has_open_options = append_open_options(opts, bs);
6017
6018 /* If no specific options have been given for this BDS, the filename of
6019 * the underlying file should suffice for this one as well */
6020 if (bs->file->exact_filename[0] && !has_open_options) {
6021 strcpy(bs->exact_filename, bs->file->exact_filename);
6022 }
6023 /* Reconstructing the full options QDict is simple for most format block
6024 * drivers, as long as the full options are known for the underlying
6025 * file BDS. The full options QDict of that file BDS should somehow
6026 * contain a representation of the filename, therefore the following
6027 * suffices without querying the (exact_)filename of this BDS. */
6028 if (bs->file->full_open_options) {
6029 qdict_put_obj(opts, "driver",
6030 QOBJECT(qstring_from_str(drv->format_name)));
6031 QINCREF(bs->file->full_open_options);
6032 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6033
6034 bs->full_open_options = opts;
6035 } else {
6036 QDECREF(opts);
6037 }
6038 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6039 /* There is no underlying file BDS (at least referenced by BDS.file),
6040 * so the full options QDict should be equal to the options given
6041 * specifically for this block device when it was opened (plus the
6042 * driver specification).
6043 * Because those options don't change, there is no need to update
6044 * full_open_options when it's already set. */
6045
6046 opts = qdict_new();
6047 append_open_options(opts, bs);
6048 qdict_put_obj(opts, "driver",
6049 QOBJECT(qstring_from_str(drv->format_name)));
6050
6051 if (bs->exact_filename[0]) {
6052 /* This may not work for all block protocol drivers (some may
6053 * require this filename to be parsed), but we have to find some
6054 * default solution here, so just include it. If some block driver
6055 * does not support pure options without any filename at all or
6056 * needs some special format of the options QDict, it needs to
6057 * implement the driver-specific bdrv_refresh_filename() function.
6058 */
6059 qdict_put_obj(opts, "filename",
6060 QOBJECT(qstring_from_str(bs->exact_filename)));
6061 }
6062
6063 bs->full_open_options = opts;
6064 }
6065
6066 if (bs->exact_filename[0]) {
6067 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6068 } else if (bs->full_open_options) {
6069 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6070 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6071 qstring_get_str(json));
6072 QDECREF(json);
6073 }
6074}
5366d0c8
BC
6075
6076/* This accessor function purpose is to allow the device models to access the
6077 * BlockAcctStats structure embedded inside a BlockDriverState without being
6078 * aware of the BlockDriverState structure layout.
6079 * It will go away when the BlockAcctStats structure will be moved inside
6080 * the device models.
6081 */
6082BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6083{
6084 return &bs->stats;
6085}