]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
rbd: Add support for bdrv_invalidate_cache
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
737e150e
PB
27#include "block/block_int.h"
28#include "block/blockjob.h"
1de7afc9 29#include "qemu/module.h"
7b1b5d19 30#include "qapi/qmp/qjson.h"
bfb197e0 31#include "sysemu/block-backend.h"
9c17d615 32#include "sysemu/sysemu.h"
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
a5ee7bd4 38#include "qapi-event.h"
fc01f7e7 39
71e72a19 40#ifdef CONFIG_BSD
7674e7bf
FB
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <sys/ioctl.h>
72cf2d4f 44#include <sys/queue.h>
c5e97233 45#ifndef __DragonFly__
7674e7bf
FB
46#include <sys/disk.h>
47#endif
c5e97233 48#endif
7674e7bf 49
49dc768d
AL
50#ifdef _WIN32
51#include <windows.h>
52#endif
53
e4654d2d
FZ
54struct BdrvDirtyBitmap {
55 HBitmap *bitmap;
56 QLIST_ENTRY(BdrvDirtyBitmap) list;
57};
58
1c9805a3
SH
59#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60
7c84b1b8 61static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
f141eafe 62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 63 BlockCompletionFunc *cb, void *opaque);
7c84b1b8 64static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
f141eafe 65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 66 BlockCompletionFunc *cb, void *opaque);
f9f05dc5
KW
67static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
775aa8b6
KW
73static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 75 BdrvRequestFlags flags);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 78 BdrvRequestFlags flags);
7c84b1b8
MA
79static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
83 BdrvRequestFlags flags,
097310b5 84 BlockCompletionFunc *cb,
7c84b1b8
MA
85 void *opaque,
86 bool is_write);
b2a61371 87static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 88static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
dc364f4c
BC
94static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
8a22f02a
SH
97static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
103#ifdef _WIN32
104static int is_windows_drive_prefix(const char *filename)
105{
106 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108 filename[1] == ':');
109}
110
111int is_windows_drive(const char *filename)
112{
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120}
121#endif
122
0563e191 123/* throttling disk I/O limits */
cc0681c4
BC
124void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
98f90dba 126{
cc0681c4 127 int i;
98f90dba 128
cc0681c4 129 throttle_config(&bs->throttle_state, cfg);
98f90dba 130
cc0681c4
BC
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 133 }
cc0681c4
BC
134}
135
136/* this function drain all the throttled IOs */
137static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138{
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
142
143 bs->io_limits_enabled = false;
144
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
148 }
149 }
150
151 bs->io_limits_enabled = enabled;
98f90dba 152
cc0681c4 153 return drained;
98f90dba
ZYW
154}
155
cc0681c4 156void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 157{
cc0681c4 158 bs->io_limits_enabled = false;
0563e191 159
cc0681c4
BC
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
163}
164
cc0681c4 165static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 166{
cc0681c4
BC
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
169}
170
cc0681c4 171static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 172{
cc0681c4
BC
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
175}
176
cc0681c4
BC
177/* should be called before bdrv_set_io_limits if a limit is set */
178void bdrv_io_limits_enable(BlockDriverState *bs)
179{
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
13af91eb 182 bdrv_get_aio_context(bs),
cc0681c4
BC
183 QEMU_CLOCK_VIRTUAL,
184 bdrv_throttle_read_timer_cb,
185 bdrv_throttle_write_timer_cb,
186 bs);
187 bs->io_limits_enabled = true;
188}
189
190/* This function makes an IO wait if needed
191 *
192 * @nb_sectors: the number of sectors of the IO
193 * @is_write: is the IO a write
194 */
98f90dba 195static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 196 unsigned int bytes,
cc0681c4 197 bool is_write)
98f90dba 198{
cc0681c4
BC
199 /* does this io must wait */
200 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 201
cc0681c4
BC
202 /* if must wait or any request of this type throttled queue the IO */
203 if (must_wait ||
204 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
205 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
206 }
207
cc0681c4 208 /* the IO will be executed, do the accounting */
d5103588
KW
209 throttle_account(&bs->throttle_state, is_write, bytes);
210
98f90dba 211
cc0681c4
BC
212 /* if the next request must wait -> do nothing */
213 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
214 return;
98f90dba
ZYW
215 }
216
cc0681c4
BC
217 /* else queue next request for execution */
218 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
219}
220
339064d5
KW
221size_t bdrv_opt_mem_align(BlockDriverState *bs)
222{
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
226 }
227
228 return bs->bl.opt_mem_alignment;
229}
230
9e0b22f4
SH
231/* check if the path starts with "<protocol>:" */
232static int path_has_protocol(const char *path)
233{
947995c0
PB
234 const char *p;
235
9e0b22f4
SH
236#ifdef _WIN32
237 if (is_windows_drive(path) ||
238 is_windows_drive_prefix(path)) {
239 return 0;
240 }
947995c0
PB
241 p = path + strcspn(path, ":/\\");
242#else
243 p = path + strcspn(path, ":/");
9e0b22f4
SH
244#endif
245
947995c0 246 return *p == ':';
9e0b22f4
SH
247}
248
83f64091 249int path_is_absolute(const char *path)
3b0d4f61 250{
21664424
FB
251#ifdef _WIN32
252 /* specific case for names like: "\\.\d:" */
f53f4da9 253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
21664424 254 return 1;
f53f4da9
PB
255 }
256 return (*path == '/' || *path == '\\');
3b9f94e1 257#else
f53f4da9 258 return (*path == '/');
3b9f94e1 259#endif
3b0d4f61
FB
260}
261
83f64091
FB
262/* if filename is absolute, just copy it to dest. Otherwise, build a
263 path to it by considering it is relative to base_path. URL are
264 supported. */
265void path_combine(char *dest, int dest_size,
266 const char *base_path,
267 const char *filename)
3b0d4f61 268{
83f64091
FB
269 const char *p, *p1;
270 int len;
271
272 if (dest_size <= 0)
273 return;
274 if (path_is_absolute(filename)) {
275 pstrcpy(dest, dest_size, filename);
276 } else {
277 p = strchr(base_path, ':');
278 if (p)
279 p++;
280 else
281 p = base_path;
3b9f94e1
FB
282 p1 = strrchr(base_path, '/');
283#ifdef _WIN32
284 {
285 const char *p2;
286 p2 = strrchr(base_path, '\\');
287 if (!p1 || p2 > p1)
288 p1 = p2;
289 }
290#endif
83f64091
FB
291 if (p1)
292 p1++;
293 else
294 p1 = base_path;
295 if (p1 > p)
296 p = p1;
297 len = p - base_path;
298 if (len > dest_size - 1)
299 len = dest_size - 1;
300 memcpy(dest, base_path, len);
301 dest[len] = '\0';
302 pstrcat(dest, dest_size, filename);
3b0d4f61 303 }
3b0d4f61
FB
304}
305
dc5a1371
PB
306void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
307{
308 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
309 pstrcpy(dest, sz, bs->backing_file);
310 } else {
311 path_combine(dest, sz, bs->filename, bs->backing_file);
312 }
313}
314
5efa9d5a 315void bdrv_register(BlockDriver *bdrv)
ea2384d3 316{
8c5873d6
SH
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
321
f8c35c1d
SH
322 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
324 */
f9f05dc5
KW
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 329 }
83f64091 330 }
b2e12bc6 331
8a22f02a 332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 333}
b338082b 334
7f06d47e 335BlockDriverState *bdrv_new_root(void)
b338082b 336{
7f06d47e 337 BlockDriverState *bs = bdrv_new();
e4e9986b 338
e4e9986b 339 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
e4e9986b
MA
340 return bs;
341}
342
343BlockDriverState *bdrv_new(void)
344{
345 BlockDriverState *bs;
346 int i;
347
5839e53b 348 bs = g_new0(BlockDriverState, 1);
e4654d2d 349 QLIST_INIT(&bs->dirty_bitmaps);
fbe40ff7
FZ
350 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
351 QLIST_INIT(&bs->op_blockers[i]);
352 }
28a7282a 353 bdrv_iostatus_disable(bs);
d7d512f6 354 notifier_list_init(&bs->close_notifiers);
d616b224 355 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
356 qemu_co_queue_init(&bs->throttled_reqs[0]);
357 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 358 bs->refcnt = 1;
dcd04228 359 bs->aio_context = qemu_get_aio_context();
d7d512f6 360
b338082b
FB
361 return bs;
362}
363
d7d512f6
PB
364void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
365{
366 notifier_list_add(&bs->close_notifiers, notify);
367}
368
ea2384d3
FB
369BlockDriver *bdrv_find_format(const char *format_name)
370{
371 BlockDriver *drv1;
8a22f02a
SH
372 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
373 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 374 return drv1;
8a22f02a 375 }
ea2384d3
FB
376 }
377 return NULL;
378}
379
b64ec4e4 380static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 381{
b64ec4e4
FZ
382 static const char *whitelist_rw[] = {
383 CONFIG_BDRV_RW_WHITELIST
384 };
385 static const char *whitelist_ro[] = {
386 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
387 };
388 const char **p;
389
b64ec4e4 390 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 391 return 1; /* no whitelist, anything goes */
b64ec4e4 392 }
eb852011 393
b64ec4e4 394 for (p = whitelist_rw; *p; p++) {
eb852011
MA
395 if (!strcmp(drv->format_name, *p)) {
396 return 1;
397 }
398 }
b64ec4e4
FZ
399 if (read_only) {
400 for (p = whitelist_ro; *p; p++) {
401 if (!strcmp(drv->format_name, *p)) {
402 return 1;
403 }
404 }
405 }
eb852011
MA
406 return 0;
407}
408
b64ec4e4
FZ
409BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
410 bool read_only)
eb852011
MA
411{
412 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 413 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
414}
415
5b7e1542
ZYW
416typedef struct CreateCo {
417 BlockDriver *drv;
418 char *filename;
83d0521a 419 QemuOpts *opts;
5b7e1542 420 int ret;
cc84d90f 421 Error *err;
5b7e1542
ZYW
422} CreateCo;
423
424static void coroutine_fn bdrv_create_co_entry(void *opaque)
425{
cc84d90f
HR
426 Error *local_err = NULL;
427 int ret;
428
5b7e1542
ZYW
429 CreateCo *cco = opaque;
430 assert(cco->drv);
431
c282e1fd 432 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
84d18f06 433 if (local_err) {
cc84d90f
HR
434 error_propagate(&cco->err, local_err);
435 }
436 cco->ret = ret;
5b7e1542
ZYW
437}
438
0e7e1989 439int bdrv_create(BlockDriver *drv, const char* filename,
83d0521a 440 QemuOpts *opts, Error **errp)
ea2384d3 441{
5b7e1542
ZYW
442 int ret;
443
444 Coroutine *co;
445 CreateCo cco = {
446 .drv = drv,
447 .filename = g_strdup(filename),
83d0521a 448 .opts = opts,
5b7e1542 449 .ret = NOT_DONE,
cc84d90f 450 .err = NULL,
5b7e1542
ZYW
451 };
452
c282e1fd 453 if (!drv->bdrv_create) {
cc84d90f 454 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
455 ret = -ENOTSUP;
456 goto out;
5b7e1542
ZYW
457 }
458
459 if (qemu_in_coroutine()) {
460 /* Fast-path if already in coroutine context */
461 bdrv_create_co_entry(&cco);
462 } else {
463 co = qemu_coroutine_create(bdrv_create_co_entry);
464 qemu_coroutine_enter(co, &cco);
465 while (cco.ret == NOT_DONE) {
b47ec2c4 466 aio_poll(qemu_get_aio_context(), true);
5b7e1542
ZYW
467 }
468 }
469
470 ret = cco.ret;
cc84d90f 471 if (ret < 0) {
84d18f06 472 if (cco.err) {
cc84d90f
HR
473 error_propagate(errp, cco.err);
474 } else {
475 error_setg_errno(errp, -ret, "Could not create image");
476 }
477 }
0e7e1989 478
80168bff
LC
479out:
480 g_free(cco.filename);
5b7e1542 481 return ret;
ea2384d3
FB
482}
483
c282e1fd 484int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
84a12e66
CH
485{
486 BlockDriver *drv;
cc84d90f
HR
487 Error *local_err = NULL;
488 int ret;
84a12e66 489
98289620 490 drv = bdrv_find_protocol(filename, true);
84a12e66 491 if (drv == NULL) {
cc84d90f 492 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 493 return -ENOENT;
84a12e66
CH
494 }
495
c282e1fd 496 ret = bdrv_create(drv, filename, opts, &local_err);
84d18f06 497 if (local_err) {
cc84d90f
HR
498 error_propagate(errp, local_err);
499 }
500 return ret;
84a12e66
CH
501}
502
3baca891 503void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
d34682cd
KW
504{
505 BlockDriver *drv = bs->drv;
3baca891 506 Error *local_err = NULL;
d34682cd
KW
507
508 memset(&bs->bl, 0, sizeof(bs->bl));
509
466ad822 510 if (!drv) {
3baca891 511 return;
466ad822
KW
512 }
513
514 /* Take some limits from the children as a default */
515 if (bs->file) {
3baca891
KW
516 bdrv_refresh_limits(bs->file, &local_err);
517 if (local_err) {
518 error_propagate(errp, local_err);
519 return;
520 }
466ad822 521 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
2647fab5 522 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
339064d5
KW
523 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
524 } else {
525 bs->bl.opt_mem_alignment = 512;
466ad822
KW
526 }
527
528 if (bs->backing_hd) {
3baca891
KW
529 bdrv_refresh_limits(bs->backing_hd, &local_err);
530 if (local_err) {
531 error_propagate(errp, local_err);
532 return;
533 }
466ad822
KW
534 bs->bl.opt_transfer_length =
535 MAX(bs->bl.opt_transfer_length,
536 bs->backing_hd->bl.opt_transfer_length);
2647fab5
PL
537 bs->bl.max_transfer_length =
538 MIN_NON_ZERO(bs->bl.max_transfer_length,
539 bs->backing_hd->bl.max_transfer_length);
339064d5
KW
540 bs->bl.opt_mem_alignment =
541 MAX(bs->bl.opt_mem_alignment,
542 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
543 }
544
545 /* Then let the driver override it */
546 if (drv->bdrv_refresh_limits) {
3baca891 547 drv->bdrv_refresh_limits(bs, errp);
d34682cd 548 }
d34682cd
KW
549}
550
eba25057
JM
551/*
552 * Create a uniquely-named empty temporary file.
553 * Return 0 upon success, otherwise a negative errno value.
554 */
555int get_tmp_filename(char *filename, int size)
d5249393 556{
eba25057 557#ifdef _WIN32
3b9f94e1 558 char temp_dir[MAX_PATH];
eba25057
JM
559 /* GetTempFileName requires that its output buffer (4th param)
560 have length MAX_PATH or greater. */
561 assert(size >= MAX_PATH);
562 return (GetTempPath(MAX_PATH, temp_dir)
563 && GetTempFileName(temp_dir, "qem", 0, filename)
564 ? 0 : -GetLastError());
d5249393 565#else
67b915a5 566 int fd;
7ccfb2eb 567 const char *tmpdir;
0badc1ee 568 tmpdir = getenv("TMPDIR");
69bef793
AS
569 if (!tmpdir) {
570 tmpdir = "/var/tmp";
571 }
eba25057
JM
572 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
573 return -EOVERFLOW;
574 }
ea2384d3 575 fd = mkstemp(filename);
fe235a06
DH
576 if (fd < 0) {
577 return -errno;
578 }
579 if (close(fd) != 0) {
580 unlink(filename);
eba25057
JM
581 return -errno;
582 }
583 return 0;
d5249393 584#endif
eba25057 585}
fc01f7e7 586
84a12e66
CH
587/*
588 * Detect host devices. By convention, /dev/cdrom[N] is always
589 * recognized as a host CDROM.
590 */
591static BlockDriver *find_hdev_driver(const char *filename)
592{
593 int score_max = 0, score;
594 BlockDriver *drv = NULL, *d;
595
596 QLIST_FOREACH(d, &bdrv_drivers, list) {
597 if (d->bdrv_probe_device) {
598 score = d->bdrv_probe_device(filename);
599 if (score > score_max) {
600 score_max = score;
601 drv = d;
602 }
603 }
604 }
605
606 return drv;
607}
608
98289620
KW
609BlockDriver *bdrv_find_protocol(const char *filename,
610 bool allow_protocol_prefix)
83f64091
FB
611{
612 BlockDriver *drv1;
613 char protocol[128];
1cec71e3 614 int len;
83f64091 615 const char *p;
19cb3738 616
66f82cee
KW
617 /* TODO Drivers without bdrv_file_open must be specified explicitly */
618
39508e7a
CH
619 /*
620 * XXX(hch): we really should not let host device detection
621 * override an explicit protocol specification, but moving this
622 * later breaks access to device names with colons in them.
623 * Thanks to the brain-dead persistent naming schemes on udev-
624 * based Linux systems those actually are quite common.
625 */
626 drv1 = find_hdev_driver(filename);
627 if (drv1) {
628 return drv1;
629 }
630
98289620 631 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
39508e7a 632 return bdrv_find_format("file");
84a12e66 633 }
98289620 634
9e0b22f4
SH
635 p = strchr(filename, ':');
636 assert(p != NULL);
1cec71e3
AL
637 len = p - filename;
638 if (len > sizeof(protocol) - 1)
639 len = sizeof(protocol) - 1;
640 memcpy(protocol, filename, len);
641 protocol[len] = '\0';
8a22f02a 642 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 643 if (drv1->protocol_name &&
8a22f02a 644 !strcmp(drv1->protocol_name, protocol)) {
83f64091 645 return drv1;
8a22f02a 646 }
83f64091
FB
647 }
648 return NULL;
649}
650
f500a6d3 651static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 652 BlockDriver **pdrv, Error **errp)
f3a5d3f8 653{
f500a6d3 654 int score, score_max;
f3a5d3f8
CH
655 BlockDriver *drv1, *drv;
656 uint8_t buf[2048];
f500a6d3 657 int ret = 0;
f8ea0b00 658
08a00559 659 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 660 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
c98ac35d
SW
661 drv = bdrv_find_format("raw");
662 if (!drv) {
34b5d2c6 663 error_setg(errp, "Could not find raw image format");
c98ac35d
SW
664 ret = -ENOENT;
665 }
666 *pdrv = drv;
667 return ret;
1a396859 668 }
f8ea0b00 669
83f64091 670 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 671 if (ret < 0) {
34b5d2c6
HR
672 error_setg_errno(errp, -ret, "Could not read image for determining its "
673 "format");
c98ac35d
SW
674 *pdrv = NULL;
675 return ret;
83f64091
FB
676 }
677
ea2384d3 678 score_max = 0;
84a12e66 679 drv = NULL;
8a22f02a 680 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
681 if (drv1->bdrv_probe) {
682 score = drv1->bdrv_probe(buf, ret, filename);
683 if (score > score_max) {
684 score_max = score;
685 drv = drv1;
686 }
0849bf08 687 }
fc01f7e7 688 }
c98ac35d 689 if (!drv) {
34b5d2c6
HR
690 error_setg(errp, "Could not determine image format: No compatible "
691 "driver found");
c98ac35d
SW
692 ret = -ENOENT;
693 }
694 *pdrv = drv;
695 return ret;
ea2384d3
FB
696}
697
51762288
SH
698/**
699 * Set the current 'total_sectors' value
65a9bb25 700 * Return 0 on success, -errno on error.
51762288
SH
701 */
702static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
703{
704 BlockDriver *drv = bs->drv;
705
396759ad
NB
706 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
707 if (bs->sg)
708 return 0;
709
51762288
SH
710 /* query actual device if possible, otherwise just trust the hint */
711 if (drv->bdrv_getlength) {
712 int64_t length = drv->bdrv_getlength(bs);
713 if (length < 0) {
714 return length;
715 }
7e382003 716 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
717 }
718
719 bs->total_sectors = hint;
720 return 0;
721}
722
9e8f1835
PB
723/**
724 * Set open flags for a given discard mode
725 *
726 * Return 0 on success, -1 if the discard mode was invalid.
727 */
728int bdrv_parse_discard_flags(const char *mode, int *flags)
729{
730 *flags &= ~BDRV_O_UNMAP;
731
732 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
733 /* do nothing */
734 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
735 *flags |= BDRV_O_UNMAP;
736 } else {
737 return -1;
738 }
739
740 return 0;
741}
742
c3993cdc
SH
743/**
744 * Set open flags for a given cache mode
745 *
746 * Return 0 on success, -1 if the cache mode was invalid.
747 */
748int bdrv_parse_cache_flags(const char *mode, int *flags)
749{
750 *flags &= ~BDRV_O_CACHE_MASK;
751
752 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
753 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
754 } else if (!strcmp(mode, "directsync")) {
755 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
756 } else if (!strcmp(mode, "writeback")) {
757 *flags |= BDRV_O_CACHE_WB;
758 } else if (!strcmp(mode, "unsafe")) {
759 *flags |= BDRV_O_CACHE_WB;
760 *flags |= BDRV_O_NO_FLUSH;
761 } else if (!strcmp(mode, "writethrough")) {
762 /* this is the default */
763 } else {
764 return -1;
765 }
766
767 return 0;
768}
769
53fec9d3
SH
770/**
771 * The copy-on-read flag is actually a reference count so multiple users may
772 * use the feature without worrying about clobbering its previous state.
773 * Copy-on-read stays enabled until all users have called to disable it.
774 */
775void bdrv_enable_copy_on_read(BlockDriverState *bs)
776{
777 bs->copy_on_read++;
778}
779
780void bdrv_disable_copy_on_read(BlockDriverState *bs)
781{
782 assert(bs->copy_on_read > 0);
783 bs->copy_on_read--;
784}
785
b1e6fc08
KW
786/*
787 * Returns the flags that a temporary snapshot should get, based on the
788 * originally requested flags (the originally requested image will have flags
789 * like a backing file)
790 */
791static int bdrv_temp_snapshot_flags(int flags)
792{
793 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
794}
795
0b50cc88
KW
796/*
797 * Returns the flags that bs->file should get, based on the given flags for
798 * the parent BDS
799 */
800static int bdrv_inherited_flags(int flags)
801{
802 /* Enable protocol handling, disable format probing for bs->file */
803 flags |= BDRV_O_PROTOCOL;
804
805 /* Our block drivers take care to send flushes and respect unmap policy,
806 * so we can enable both unconditionally on lower layers. */
807 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
808
0b50cc88 809 /* Clear flags that only apply to the top layer */
5669b44d 810 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
811
812 return flags;
813}
814
317fc44e
KW
815/*
816 * Returns the flags that bs->backing_hd should get, based on the given flags
817 * for the parent BDS
818 */
819static int bdrv_backing_flags(int flags)
820{
821 /* backing files always opened read-only */
822 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
823
824 /* snapshot=on is handled on the top layer */
8bfea15d 825 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
826
827 return flags;
828}
829
7b272452
KW
830static int bdrv_open_flags(BlockDriverState *bs, int flags)
831{
832 int open_flags = flags | BDRV_O_CACHE_WB;
833
834 /*
835 * Clear flags that are internal to the block layer before opening the
836 * image.
837 */
20cca275 838 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
7b272452
KW
839
840 /*
841 * Snapshots should be writable.
842 */
8bfea15d 843 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
844 open_flags |= BDRV_O_RDWR;
845 }
846
847 return open_flags;
848}
849
636ea370
KW
850static void bdrv_assign_node_name(BlockDriverState *bs,
851 const char *node_name,
852 Error **errp)
6913c0c2
BC
853{
854 if (!node_name) {
636ea370 855 return;
6913c0c2
BC
856 }
857
9aebf3b8 858 /* Check for empty string or invalid characters */
f5bebbbb 859 if (!id_wellformed(node_name)) {
9aebf3b8 860 error_setg(errp, "Invalid node name");
636ea370 861 return;
6913c0c2
BC
862 }
863
0c5e94ee 864 /* takes care of avoiding namespaces collisions */
7f06d47e 865 if (blk_by_name(node_name)) {
0c5e94ee
BC
866 error_setg(errp, "node-name=%s is conflicting with a device id",
867 node_name);
636ea370 868 return;
0c5e94ee
BC
869 }
870
6913c0c2
BC
871 /* takes care of avoiding duplicates node names */
872 if (bdrv_find_node(node_name)) {
873 error_setg(errp, "Duplicate node name");
636ea370 874 return;
6913c0c2
BC
875 }
876
877 /* copy node name into the bs and insert it into the graph list */
878 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
879 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
880}
881
57915332
KW
882/*
883 * Common part for opening disk images and files
b6ad491a
KW
884 *
885 * Removes all processed options from *options.
57915332 886 */
f500a6d3 887static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
34b5d2c6 888 QDict *options, int flags, BlockDriver *drv, Error **errp)
57915332
KW
889{
890 int ret, open_flags;
035fccdf 891 const char *filename;
6913c0c2 892 const char *node_name = NULL;
34b5d2c6 893 Error *local_err = NULL;
57915332
KW
894
895 assert(drv != NULL);
6405875c 896 assert(bs->file == NULL);
707ff828 897 assert(options != NULL && bs->options != options);
57915332 898
45673671
KW
899 if (file != NULL) {
900 filename = file->filename;
901 } else {
902 filename = qdict_get_try_str(options, "filename");
903 }
904
765003db
KW
905 if (drv->bdrv_needs_filename && !filename) {
906 error_setg(errp, "The '%s' block driver requires a file name",
907 drv->format_name);
908 return -EINVAL;
909 }
910
45673671 911 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
28dcee10 912
6913c0c2 913 node_name = qdict_get_try_str(options, "node-name");
636ea370 914 bdrv_assign_node_name(bs, node_name, &local_err);
0fb6395c 915 if (local_err) {
636ea370
KW
916 error_propagate(errp, local_err);
917 return -EINVAL;
6913c0c2
BC
918 }
919 qdict_del(options, "node-name");
920
5d186eb0
KW
921 /* bdrv_open() with directly using a protocol as drv. This layer is already
922 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
923 * and return immediately. */
924 if (file != NULL && drv->bdrv_file_open) {
925 bdrv_swap(file, bs);
926 return 0;
927 }
928
57915332 929 bs->open_flags = flags;
1b7fd729 930 bs->guest_block_size = 512;
c25f53b0 931 bs->request_alignment = 512;
0d51b4de 932 bs->zero_beyond_eof = true;
b64ec4e4
FZ
933 open_flags = bdrv_open_flags(bs, flags);
934 bs->read_only = !(open_flags & BDRV_O_RDWR);
20cca275 935 bs->growable = !!(flags & BDRV_O_PROTOCOL);
b64ec4e4
FZ
936
937 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
8f94a6e4
KW
938 error_setg(errp,
939 !bs->read_only && bdrv_is_whitelisted(drv, true)
940 ? "Driver '%s' can only be used for read-only devices"
941 : "Driver '%s' is not whitelisted",
942 drv->format_name);
b64ec4e4
FZ
943 return -ENOTSUP;
944 }
57915332 945
53fec9d3 946 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
0ebd24e0
KW
947 if (flags & BDRV_O_COPY_ON_READ) {
948 if (!bs->read_only) {
949 bdrv_enable_copy_on_read(bs);
950 } else {
951 error_setg(errp, "Can't use copy-on-read on read-only device");
952 return -EINVAL;
953 }
53fec9d3
SH
954 }
955
c2ad1b0c
KW
956 if (filename != NULL) {
957 pstrcpy(bs->filename, sizeof(bs->filename), filename);
958 } else {
959 bs->filename[0] = '\0';
960 }
91af7014 961 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
57915332 962
57915332 963 bs->drv = drv;
7267c094 964 bs->opaque = g_malloc0(drv->instance_size);
57915332 965
03f541bd 966 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e7c63796 967
66f82cee
KW
968 /* Open the image, either directly or using a protocol */
969 if (drv->bdrv_file_open) {
5d186eb0 970 assert(file == NULL);
030be321 971 assert(!drv->bdrv_needs_filename || filename != NULL);
34b5d2c6 972 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
f500a6d3 973 } else {
2af5ef70 974 if (file == NULL) {
34b5d2c6
HR
975 error_setg(errp, "Can't use '%s' as a block driver for the "
976 "protocol level", drv->format_name);
2af5ef70
KW
977 ret = -EINVAL;
978 goto free_and_fail;
979 }
f500a6d3 980 bs->file = file;
34b5d2c6 981 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
66f82cee
KW
982 }
983
57915332 984 if (ret < 0) {
84d18f06 985 if (local_err) {
34b5d2c6 986 error_propagate(errp, local_err);
2fa9aa59
DH
987 } else if (bs->filename[0]) {
988 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
34b5d2c6
HR
989 } else {
990 error_setg_errno(errp, -ret, "Could not open image");
991 }
57915332
KW
992 goto free_and_fail;
993 }
994
51762288
SH
995 ret = refresh_total_sectors(bs, bs->total_sectors);
996 if (ret < 0) {
34b5d2c6 997 error_setg_errno(errp, -ret, "Could not refresh total sector count");
51762288 998 goto free_and_fail;
57915332 999 }
51762288 1000
3baca891
KW
1001 bdrv_refresh_limits(bs, &local_err);
1002 if (local_err) {
1003 error_propagate(errp, local_err);
1004 ret = -EINVAL;
1005 goto free_and_fail;
1006 }
1007
c25f53b0 1008 assert(bdrv_opt_mem_align(bs) != 0);
47ea2de2 1009 assert((bs->request_alignment != 0) || bs->sg);
57915332
KW
1010 return 0;
1011
1012free_and_fail:
f500a6d3 1013 bs->file = NULL;
7267c094 1014 g_free(bs->opaque);
57915332
KW
1015 bs->opaque = NULL;
1016 bs->drv = NULL;
1017 return ret;
1018}
1019
5e5c4f63
KW
1020static QDict *parse_json_filename(const char *filename, Error **errp)
1021{
1022 QObject *options_obj;
1023 QDict *options;
1024 int ret;
1025
1026 ret = strstart(filename, "json:", &filename);
1027 assert(ret);
1028
1029 options_obj = qobject_from_json(filename);
1030 if (!options_obj) {
1031 error_setg(errp, "Could not parse the JSON options");
1032 return NULL;
1033 }
1034
1035 if (qobject_type(options_obj) != QTYPE_QDICT) {
1036 qobject_decref(options_obj);
1037 error_setg(errp, "Invalid JSON object given");
1038 return NULL;
1039 }
1040
1041 options = qobject_to_qdict(options_obj);
1042 qdict_flatten(options);
1043
1044 return options;
1045}
1046
b6ce07aa 1047/*
f54120ff
KW
1048 * Fills in default options for opening images and converts the legacy
1049 * filename/flags pair to option QDict entries.
b6ce07aa 1050 */
5e5c4f63 1051static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
17b005f1 1052 BlockDriver *drv, Error **errp)
ea2384d3 1053{
5e5c4f63 1054 const char *filename = *pfilename;
c2ad1b0c 1055 const char *drvname;
462f5bcf 1056 bool protocol = flags & BDRV_O_PROTOCOL;
e3fa4bfa 1057 bool parse_filename = false;
34b5d2c6 1058 Error *local_err = NULL;
83f64091 1059
5e5c4f63
KW
1060 /* Parse json: pseudo-protocol */
1061 if (filename && g_str_has_prefix(filename, "json:")) {
1062 QDict *json_options = parse_json_filename(filename, &local_err);
1063 if (local_err) {
1064 error_propagate(errp, local_err);
1065 return -EINVAL;
1066 }
1067
1068 /* Options given in the filename have lower priority than options
1069 * specified directly */
1070 qdict_join(*options, json_options, false);
1071 QDECREF(json_options);
1072 *pfilename = filename = NULL;
1073 }
1074
035fccdf 1075 /* Fetch the file name from the options QDict if necessary */
17b005f1 1076 if (protocol && filename) {
f54120ff
KW
1077 if (!qdict_haskey(*options, "filename")) {
1078 qdict_put(*options, "filename", qstring_from_str(filename));
1079 parse_filename = true;
1080 } else {
1081 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1082 "the same time");
1083 return -EINVAL;
1084 }
035fccdf
KW
1085 }
1086
c2ad1b0c 1087 /* Find the right block driver */
f54120ff 1088 filename = qdict_get_try_str(*options, "filename");
5acd9d81 1089 drvname = qdict_get_try_str(*options, "driver");
f54120ff 1090
17b005f1
KW
1091 if (drv) {
1092 if (drvname) {
1093 error_setg(errp, "Driver specified twice");
1094 return -EINVAL;
1095 }
1096 drvname = drv->format_name;
1097 qdict_put(*options, "driver", qstring_from_str(drvname));
1098 } else {
1099 if (!drvname && protocol) {
1100 if (filename) {
1101 drv = bdrv_find_protocol(filename, parse_filename);
1102 if (!drv) {
1103 error_setg(errp, "Unknown protocol");
1104 return -EINVAL;
1105 }
1106
1107 drvname = drv->format_name;
1108 qdict_put(*options, "driver", qstring_from_str(drvname));
1109 } else {
1110 error_setg(errp, "Must specify either driver or file");
f54120ff
KW
1111 return -EINVAL;
1112 }
17b005f1
KW
1113 } else if (drvname) {
1114 drv = bdrv_find_format(drvname);
1115 if (!drv) {
1116 error_setg(errp, "Unknown driver '%s'", drvname);
1117 return -ENOENT;
1118 }
98289620 1119 }
c2ad1b0c
KW
1120 }
1121
17b005f1 1122 assert(drv || !protocol);
c2ad1b0c 1123
f54120ff 1124 /* Driver-specific filename parsing */
17b005f1 1125 if (drv && drv->bdrv_parse_filename && parse_filename) {
5acd9d81 1126 drv->bdrv_parse_filename(filename, *options, &local_err);
84d18f06 1127 if (local_err) {
34b5d2c6 1128 error_propagate(errp, local_err);
f54120ff 1129 return -EINVAL;
6963a30d 1130 }
cd5d031e
HR
1131
1132 if (!drv->bdrv_needs_filename) {
1133 qdict_del(*options, "filename");
cd5d031e 1134 }
6963a30d
KW
1135 }
1136
f54120ff
KW
1137 return 0;
1138}
1139
8d24cce1
FZ
1140void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1141{
1142
826b6ca0
FZ
1143 if (bs->backing_hd) {
1144 assert(bs->backing_blocker);
1145 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1146 } else if (backing_hd) {
1147 error_setg(&bs->backing_blocker,
1148 "device is used as backing hd of '%s'",
bfb197e0 1149 bdrv_get_device_name(bs));
826b6ca0
FZ
1150 }
1151
8d24cce1
FZ
1152 bs->backing_hd = backing_hd;
1153 if (!backing_hd) {
826b6ca0
FZ
1154 error_free(bs->backing_blocker);
1155 bs->backing_blocker = NULL;
8d24cce1
FZ
1156 goto out;
1157 }
1158 bs->open_flags &= ~BDRV_O_NO_BACKING;
1159 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1160 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1161 backing_hd->drv ? backing_hd->drv->format_name : "");
826b6ca0
FZ
1162
1163 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1164 /* Otherwise we won't be able to commit due to check in bdrv_commit */
1165 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1166 bs->backing_blocker);
8d24cce1 1167out:
3baca891 1168 bdrv_refresh_limits(bs, NULL);
8d24cce1
FZ
1169}
1170
31ca6d07
KW
1171/*
1172 * Opens the backing file for a BlockDriverState if not yet open
1173 *
1174 * options is a QDict of options to pass to the block drivers, or NULL for an
1175 * empty set of options. The reference to the QDict is transferred to this
1176 * function (even on failure), so if the caller intends to reuse the dictionary,
1177 * it needs to use QINCREF() before calling bdrv_file_open.
1178 */
34b5d2c6 1179int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
9156df12 1180{
1ba4b6a5 1181 char *backing_filename = g_malloc0(PATH_MAX);
317fc44e 1182 int ret = 0;
9156df12 1183 BlockDriver *back_drv = NULL;
8d24cce1 1184 BlockDriverState *backing_hd;
34b5d2c6 1185 Error *local_err = NULL;
9156df12
PB
1186
1187 if (bs->backing_hd != NULL) {
31ca6d07 1188 QDECREF(options);
1ba4b6a5 1189 goto free_exit;
9156df12
PB
1190 }
1191
31ca6d07
KW
1192 /* NULL means an empty set of options */
1193 if (options == NULL) {
1194 options = qdict_new();
1195 }
1196
9156df12 1197 bs->open_flags &= ~BDRV_O_NO_BACKING;
1cb6f506
KW
1198 if (qdict_haskey(options, "file.filename")) {
1199 backing_filename[0] = '\0';
1200 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
31ca6d07 1201 QDECREF(options);
1ba4b6a5 1202 goto free_exit;
dbecebdd 1203 } else {
1ba4b6a5 1204 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
9156df12
PB
1205 }
1206
8ee79e70
KW
1207 if (!bs->drv || !bs->drv->supports_backing) {
1208 ret = -EINVAL;
1209 error_setg(errp, "Driver doesn't support backing files");
1210 QDECREF(options);
1211 goto free_exit;
1212 }
1213
e4e9986b 1214 backing_hd = bdrv_new();
8d24cce1 1215
9156df12
PB
1216 if (bs->backing_format[0] != '\0') {
1217 back_drv = bdrv_find_format(bs->backing_format);
1218 }
1219
f67503e5 1220 assert(bs->backing_hd == NULL);
8d24cce1 1221 ret = bdrv_open(&backing_hd,
ddf5636d 1222 *backing_filename ? backing_filename : NULL, NULL, options,
317fc44e 1223 bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
9156df12 1224 if (ret < 0) {
8d24cce1
FZ
1225 bdrv_unref(backing_hd);
1226 backing_hd = NULL;
9156df12 1227 bs->open_flags |= BDRV_O_NO_BACKING;
b04b6b6e
FZ
1228 error_setg(errp, "Could not open backing file: %s",
1229 error_get_pretty(local_err));
1230 error_free(local_err);
1ba4b6a5 1231 goto free_exit;
9156df12 1232 }
8d24cce1 1233 bdrv_set_backing_hd(bs, backing_hd);
d80ac658 1234
1ba4b6a5
BC
1235free_exit:
1236 g_free(backing_filename);
1237 return ret;
9156df12
PB
1238}
1239
da557aac
HR
1240/*
1241 * Opens a disk image whose options are given as BlockdevRef in another block
1242 * device's options.
1243 *
da557aac
HR
1244 * If allow_none is true, no image will be opened if filename is false and no
1245 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1246 *
1247 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1248 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1249 * itself, all options starting with "${bdref_key}." are considered part of the
1250 * BlockdevRef.
1251 *
1252 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1253 *
1254 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1255 */
1256int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1257 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1258 bool allow_none, Error **errp)
da557aac
HR
1259{
1260 QDict *image_options;
1261 int ret;
1262 char *bdref_key_dot;
1263 const char *reference;
1264
f67503e5
HR
1265 assert(pbs);
1266 assert(*pbs == NULL);
1267
da557aac
HR
1268 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1269 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1270 g_free(bdref_key_dot);
1271
1272 reference = qdict_get_try_str(options, bdref_key);
1273 if (!filename && !reference && !qdict_size(image_options)) {
1274 if (allow_none) {
1275 ret = 0;
1276 } else {
1277 error_setg(errp, "A block device must be specified for \"%s\"",
1278 bdref_key);
1279 ret = -EINVAL;
1280 }
b20e61e0 1281 QDECREF(image_options);
da557aac
HR
1282 goto done;
1283 }
1284
f7d9fd8c 1285 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1286
1287done:
1288 qdict_del(options, bdref_key);
1289 return ret;
1290}
1291
6b8aeca5 1292int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1293{
1294 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1295 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d
KW
1296 int64_t total_size;
1297 BlockDriver *bdrv_qcow2;
83d0521a 1298 QemuOpts *opts = NULL;
b998875d
KW
1299 QDict *snapshot_options;
1300 BlockDriverState *bs_snapshot;
1301 Error *local_err;
1302 int ret;
1303
1304 /* if snapshot, we create a temporary backing file and open it
1305 instead of opening 'filename' directly */
1306
1307 /* Get the required size from the image */
f187743a
KW
1308 total_size = bdrv_getlength(bs);
1309 if (total_size < 0) {
6b8aeca5 1310 ret = total_size;
f187743a 1311 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1312 goto out;
f187743a 1313 }
b998875d
KW
1314
1315 /* Create the temporary image */
1ba4b6a5 1316 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1317 if (ret < 0) {
1318 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1319 goto out;
b998875d
KW
1320 }
1321
1322 bdrv_qcow2 = bdrv_find_format("qcow2");
c282e1fd
CL
1323 opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1324 &error_abort);
83d0521a 1325 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
c282e1fd 1326 ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
83d0521a 1327 qemu_opts_del(opts);
b998875d
KW
1328 if (ret < 0) {
1329 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1330 "'%s': %s", tmp_filename,
1331 error_get_pretty(local_err));
1332 error_free(local_err);
1ba4b6a5 1333 goto out;
b998875d
KW
1334 }
1335
1336 /* Prepare a new options QDict for the temporary file */
1337 snapshot_options = qdict_new();
1338 qdict_put(snapshot_options, "file.driver",
1339 qstring_from_str("file"));
1340 qdict_put(snapshot_options, "file.filename",
1341 qstring_from_str(tmp_filename));
1342
e4e9986b 1343 bs_snapshot = bdrv_new();
b998875d
KW
1344
1345 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
b1e6fc08 1346 flags, bdrv_qcow2, &local_err);
b998875d
KW
1347 if (ret < 0) {
1348 error_propagate(errp, local_err);
1ba4b6a5 1349 goto out;
b998875d
KW
1350 }
1351
1352 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1353
1354out:
1355 g_free(tmp_filename);
6b8aeca5 1356 return ret;
b998875d
KW
1357}
1358
b6ce07aa
KW
1359/*
1360 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1361 *
1362 * options is a QDict of options to pass to the block drivers, or NULL for an
1363 * empty set of options. The reference to the QDict belongs to the block layer
1364 * after the call (even on failure), so if the caller intends to reuse the
1365 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1366 *
1367 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1368 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1369 *
1370 * The reference parameter may be used to specify an existing block device which
1371 * should be opened. If specified, neither options nor a filename may be given,
1372 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1373 */
ddf5636d
HR
1374int bdrv_open(BlockDriverState **pbs, const char *filename,
1375 const char *reference, QDict *options, int flags,
1376 BlockDriver *drv, Error **errp)
ea2384d3 1377{
b6ce07aa 1378 int ret;
f67503e5 1379 BlockDriverState *file = NULL, *bs;
74fe54f2 1380 const char *drvname;
34b5d2c6 1381 Error *local_err = NULL;
b1e6fc08 1382 int snapshot_flags = 0;
712e7874 1383
f67503e5
HR
1384 assert(pbs);
1385
ddf5636d
HR
1386 if (reference) {
1387 bool options_non_empty = options ? qdict_size(options) : false;
1388 QDECREF(options);
1389
1390 if (*pbs) {
1391 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1392 "another block device");
1393 return -EINVAL;
1394 }
1395
1396 if (filename || options_non_empty) {
1397 error_setg(errp, "Cannot reference an existing block device with "
1398 "additional options or a new filename");
1399 return -EINVAL;
1400 }
1401
1402 bs = bdrv_lookup_bs(reference, reference, errp);
1403 if (!bs) {
1404 return -ENODEV;
1405 }
1406 bdrv_ref(bs);
1407 *pbs = bs;
1408 return 0;
1409 }
1410
f67503e5
HR
1411 if (*pbs) {
1412 bs = *pbs;
1413 } else {
e4e9986b 1414 bs = bdrv_new();
f67503e5
HR
1415 }
1416
de9c0cec
KW
1417 /* NULL means an empty set of options */
1418 if (options == NULL) {
1419 options = qdict_new();
1420 }
1421
17b005f1 1422 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
462f5bcf
KW
1423 if (local_err) {
1424 goto fail;
1425 }
1426
76c591b0
KW
1427 /* Find the right image format driver */
1428 drv = NULL;
1429 drvname = qdict_get_try_str(options, "driver");
1430 if (drvname) {
1431 drv = bdrv_find_format(drvname);
1432 qdict_del(options, "driver");
1433 if (!drv) {
1434 error_setg(errp, "Unknown driver: '%s'", drvname);
1435 ret = -EINVAL;
1436 goto fail;
1437 }
1438 }
1439
1440 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1441 if (drv && !drv->bdrv_file_open) {
1442 /* If the user explicitly wants a format driver here, we'll need to add
1443 * another layer for the protocol in bs->file */
1444 flags &= ~BDRV_O_PROTOCOL;
1445 }
1446
de9c0cec 1447 bs->options = options;
b6ad491a 1448 options = qdict_clone_shallow(options);
de9c0cec 1449
f500a6d3 1450 /* Open image file without format layer */
f4788adc
KW
1451 if ((flags & BDRV_O_PROTOCOL) == 0) {
1452 if (flags & BDRV_O_RDWR) {
1453 flags |= BDRV_O_ALLOW_RDWR;
1454 }
1455 if (flags & BDRV_O_SNAPSHOT) {
1456 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1457 flags = bdrv_backing_flags(flags);
1458 }
f500a6d3 1459
f4788adc
KW
1460 assert(file == NULL);
1461 ret = bdrv_open_image(&file, filename, options, "file",
1462 bdrv_inherited_flags(flags),
1463 true, &local_err);
1464 if (ret < 0) {
1465 goto fail;
1466 }
f500a6d3
KW
1467 }
1468
76c591b0
KW
1469 /* Image format probing */
1470 if (!drv && file) {
17b005f1
KW
1471 ret = find_image_format(file, filename, &drv, &local_err);
1472 if (ret < 0) {
8bfea15d 1473 goto fail;
2a05cbe4 1474 }
76c591b0 1475 } else if (!drv) {
17b005f1
KW
1476 error_setg(errp, "Must specify either driver or file");
1477 ret = -EINVAL;
8bfea15d 1478 goto fail;
ea2384d3 1479 }
b6ce07aa
KW
1480
1481 /* Open the image */
34b5d2c6 1482 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
b6ce07aa 1483 if (ret < 0) {
8bfea15d 1484 goto fail;
6987307c
CH
1485 }
1486
2a05cbe4 1487 if (file && (bs->file != file)) {
4f6fd349 1488 bdrv_unref(file);
f500a6d3
KW
1489 file = NULL;
1490 }
1491
b6ce07aa 1492 /* If there is a backing file, use it */
9156df12 1493 if ((flags & BDRV_O_NO_BACKING) == 0) {
31ca6d07
KW
1494 QDict *backing_options;
1495
5726d872 1496 qdict_extract_subqdict(options, &backing_options, "backing.");
34b5d2c6 1497 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
b6ce07aa 1498 if (ret < 0) {
b6ad491a 1499 goto close_and_fail;
b6ce07aa 1500 }
b6ce07aa
KW
1501 }
1502
91af7014
HR
1503 bdrv_refresh_filename(bs);
1504
b998875d
KW
1505 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1506 * temporary snapshot afterwards. */
b1e6fc08 1507 if (snapshot_flags) {
6b8aeca5 1508 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
b998875d 1509 if (local_err) {
b998875d
KW
1510 goto close_and_fail;
1511 }
1512 }
1513
b6ad491a 1514 /* Check if any unknown options were used */
5acd9d81 1515 if (options && (qdict_size(options) != 0)) {
b6ad491a 1516 const QDictEntry *entry = qdict_first(options);
5acd9d81
HR
1517 if (flags & BDRV_O_PROTOCOL) {
1518 error_setg(errp, "Block protocol '%s' doesn't support the option "
1519 "'%s'", drv->format_name, entry->key);
1520 } else {
1521 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1522 "support the option '%s'", drv->format_name,
bfb197e0 1523 bdrv_get_device_name(bs), entry->key);
5acd9d81 1524 }
b6ad491a
KW
1525
1526 ret = -EINVAL;
1527 goto close_and_fail;
1528 }
b6ad491a 1529
b6ce07aa 1530 if (!bdrv_key_required(bs)) {
a7f53e26
MA
1531 if (bs->blk) {
1532 blk_dev_change_media_cb(bs->blk, true);
1533 }
c3adb58f
MA
1534 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1535 && !runstate_check(RUN_STATE_INMIGRATE)
1536 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1537 error_setg(errp,
1538 "Guest must be stopped for opening of encrypted image");
1539 ret = -EBUSY;
1540 goto close_and_fail;
b6ce07aa
KW
1541 }
1542
c3adb58f 1543 QDECREF(options);
f67503e5 1544 *pbs = bs;
b6ce07aa
KW
1545 return 0;
1546
8bfea15d 1547fail:
f500a6d3 1548 if (file != NULL) {
4f6fd349 1549 bdrv_unref(file);
f500a6d3 1550 }
de9c0cec 1551 QDECREF(bs->options);
b6ad491a 1552 QDECREF(options);
de9c0cec 1553 bs->options = NULL;
f67503e5
HR
1554 if (!*pbs) {
1555 /* If *pbs is NULL, a new BDS has been created in this function and
1556 needs to be freed now. Otherwise, it does not need to be closed,
1557 since it has not really been opened yet. */
1558 bdrv_unref(bs);
1559 }
84d18f06 1560 if (local_err) {
34b5d2c6
HR
1561 error_propagate(errp, local_err);
1562 }
b6ad491a 1563 return ret;
de9c0cec 1564
b6ad491a 1565close_and_fail:
f67503e5
HR
1566 /* See fail path, but now the BDS has to be always closed */
1567 if (*pbs) {
1568 bdrv_close(bs);
1569 } else {
1570 bdrv_unref(bs);
1571 }
b6ad491a 1572 QDECREF(options);
84d18f06 1573 if (local_err) {
34b5d2c6
HR
1574 error_propagate(errp, local_err);
1575 }
b6ce07aa
KW
1576 return ret;
1577}
1578
e971aa12
JC
1579typedef struct BlockReopenQueueEntry {
1580 bool prepared;
1581 BDRVReopenState state;
1582 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1583} BlockReopenQueueEntry;
1584
1585/*
1586 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1587 * reopen of multiple devices.
1588 *
1589 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1590 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1591 * be created and initialized. This newly created BlockReopenQueue should be
1592 * passed back in for subsequent calls that are intended to be of the same
1593 * atomic 'set'.
1594 *
1595 * bs is the BlockDriverState to add to the reopen queue.
1596 *
1597 * flags contains the open flags for the associated bs
1598 *
1599 * returns a pointer to bs_queue, which is either the newly allocated
1600 * bs_queue, or the existing bs_queue being used.
1601 *
1602 */
1603BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1604 BlockDriverState *bs, int flags)
1605{
1606 assert(bs != NULL);
1607
1608 BlockReopenQueueEntry *bs_entry;
1609 if (bs_queue == NULL) {
1610 bs_queue = g_new0(BlockReopenQueue, 1);
1611 QSIMPLEQ_INIT(bs_queue);
1612 }
1613
f1f25a2e
KW
1614 /* bdrv_open() masks this flag out */
1615 flags &= ~BDRV_O_PROTOCOL;
1616
e971aa12 1617 if (bs->file) {
f1f25a2e 1618 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1619 }
1620
1621 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1622 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1623
1624 bs_entry->state.bs = bs;
1625 bs_entry->state.flags = flags;
1626
1627 return bs_queue;
1628}
1629
1630/*
1631 * Reopen multiple BlockDriverStates atomically & transactionally.
1632 *
1633 * The queue passed in (bs_queue) must have been built up previous
1634 * via bdrv_reopen_queue().
1635 *
1636 * Reopens all BDS specified in the queue, with the appropriate
1637 * flags. All devices are prepared for reopen, and failure of any
1638 * device will cause all device changes to be abandonded, and intermediate
1639 * data cleaned up.
1640 *
1641 * If all devices prepare successfully, then the changes are committed
1642 * to all devices.
1643 *
1644 */
1645int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1646{
1647 int ret = -1;
1648 BlockReopenQueueEntry *bs_entry, *next;
1649 Error *local_err = NULL;
1650
1651 assert(bs_queue != NULL);
1652
1653 bdrv_drain_all();
1654
1655 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1656 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1657 error_propagate(errp, local_err);
1658 goto cleanup;
1659 }
1660 bs_entry->prepared = true;
1661 }
1662
1663 /* If we reach this point, we have success and just need to apply the
1664 * changes
1665 */
1666 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1667 bdrv_reopen_commit(&bs_entry->state);
1668 }
1669
1670 ret = 0;
1671
1672cleanup:
1673 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1674 if (ret && bs_entry->prepared) {
1675 bdrv_reopen_abort(&bs_entry->state);
1676 }
1677 g_free(bs_entry);
1678 }
1679 g_free(bs_queue);
1680 return ret;
1681}
1682
1683
1684/* Reopen a single BlockDriverState with the specified flags. */
1685int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1686{
1687 int ret = -1;
1688 Error *local_err = NULL;
1689 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1690
1691 ret = bdrv_reopen_multiple(queue, &local_err);
1692 if (local_err != NULL) {
1693 error_propagate(errp, local_err);
1694 }
1695 return ret;
1696}
1697
1698
1699/*
1700 * Prepares a BlockDriverState for reopen. All changes are staged in the
1701 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1702 * the block driver layer .bdrv_reopen_prepare()
1703 *
1704 * bs is the BlockDriverState to reopen
1705 * flags are the new open flags
1706 * queue is the reopen queue
1707 *
1708 * Returns 0 on success, non-zero on error. On error errp will be set
1709 * as well.
1710 *
1711 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1712 * It is the responsibility of the caller to then call the abort() or
1713 * commit() for any other BDS that have been left in a prepare() state
1714 *
1715 */
1716int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1717 Error **errp)
1718{
1719 int ret = -1;
1720 Error *local_err = NULL;
1721 BlockDriver *drv;
1722
1723 assert(reopen_state != NULL);
1724 assert(reopen_state->bs->drv != NULL);
1725 drv = reopen_state->bs->drv;
1726
1727 /* if we are to stay read-only, do not allow permission change
1728 * to r/w */
1729 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1730 reopen_state->flags & BDRV_O_RDWR) {
1731 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
bfb197e0 1732 bdrv_get_device_name(reopen_state->bs));
e971aa12
JC
1733 goto error;
1734 }
1735
1736
1737 ret = bdrv_flush(reopen_state->bs);
1738 if (ret) {
1739 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1740 strerror(-ret));
1741 goto error;
1742 }
1743
1744 if (drv->bdrv_reopen_prepare) {
1745 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1746 if (ret) {
1747 if (local_err != NULL) {
1748 error_propagate(errp, local_err);
1749 } else {
d8b6895f
LC
1750 error_setg(errp, "failed while preparing to reopen image '%s'",
1751 reopen_state->bs->filename);
e971aa12
JC
1752 }
1753 goto error;
1754 }
1755 } else {
1756 /* It is currently mandatory to have a bdrv_reopen_prepare()
1757 * handler for each supported drv. */
1758 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
bfb197e0 1759 drv->format_name, bdrv_get_device_name(reopen_state->bs),
e971aa12
JC
1760 "reopening of file");
1761 ret = -1;
1762 goto error;
1763 }
1764
1765 ret = 0;
1766
1767error:
1768 return ret;
1769}
1770
1771/*
1772 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1773 * makes them final by swapping the staging BlockDriverState contents into
1774 * the active BlockDriverState contents.
1775 */
1776void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1777{
1778 BlockDriver *drv;
1779
1780 assert(reopen_state != NULL);
1781 drv = reopen_state->bs->drv;
1782 assert(drv != NULL);
1783
1784 /* If there are any driver level actions to take */
1785 if (drv->bdrv_reopen_commit) {
1786 drv->bdrv_reopen_commit(reopen_state);
1787 }
1788
1789 /* set BDS specific flags now */
1790 reopen_state->bs->open_flags = reopen_state->flags;
1791 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1792 BDRV_O_CACHE_WB);
1793 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac 1794
3baca891 1795 bdrv_refresh_limits(reopen_state->bs, NULL);
e971aa12
JC
1796}
1797
1798/*
1799 * Abort the reopen, and delete and free the staged changes in
1800 * reopen_state
1801 */
1802void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1803{
1804 BlockDriver *drv;
1805
1806 assert(reopen_state != NULL);
1807 drv = reopen_state->bs->drv;
1808 assert(drv != NULL);
1809
1810 if (drv->bdrv_reopen_abort) {
1811 drv->bdrv_reopen_abort(reopen_state);
1812 }
1813}
1814
1815
fc01f7e7
FB
1816void bdrv_close(BlockDriverState *bs)
1817{
33384421
HR
1818 BdrvAioNotifier *ban, *ban_next;
1819
3cbc002c
PB
1820 if (bs->job) {
1821 block_job_cancel_sync(bs->job);
1822 }
58fda173
SH
1823 bdrv_drain_all(); /* complete I/O */
1824 bdrv_flush(bs);
1825 bdrv_drain_all(); /* in case flush left pending I/O */
d7d512f6 1826 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1827
3cbc002c 1828 if (bs->drv) {
557df6ac 1829 if (bs->backing_hd) {
826b6ca0
FZ
1830 BlockDriverState *backing_hd = bs->backing_hd;
1831 bdrv_set_backing_hd(bs, NULL);
1832 bdrv_unref(backing_hd);
557df6ac 1833 }
ea2384d3 1834 bs->drv->bdrv_close(bs);
7267c094 1835 g_free(bs->opaque);
ea2384d3
FB
1836 bs->opaque = NULL;
1837 bs->drv = NULL;
53fec9d3 1838 bs->copy_on_read = 0;
a275fa42
PB
1839 bs->backing_file[0] = '\0';
1840 bs->backing_format[0] = '\0';
6405875c
PB
1841 bs->total_sectors = 0;
1842 bs->encrypted = 0;
1843 bs->valid_key = 0;
1844 bs->sg = 0;
1845 bs->growable = 0;
0d51b4de 1846 bs->zero_beyond_eof = false;
de9c0cec
KW
1847 QDECREF(bs->options);
1848 bs->options = NULL;
91af7014
HR
1849 QDECREF(bs->full_open_options);
1850 bs->full_open_options = NULL;
b338082b 1851
66f82cee 1852 if (bs->file != NULL) {
4f6fd349 1853 bdrv_unref(bs->file);
0ac9377d 1854 bs->file = NULL;
66f82cee 1855 }
b338082b 1856 }
98f90dba 1857
a7f53e26
MA
1858 if (bs->blk) {
1859 blk_dev_change_media_cb(bs->blk, false);
1860 }
9ca11154 1861
98f90dba
ZYW
1862 /*throttling disk I/O limits*/
1863 if (bs->io_limits_enabled) {
1864 bdrv_io_limits_disable(bs);
1865 }
33384421
HR
1866
1867 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1868 g_free(ban);
1869 }
1870 QLIST_INIT(&bs->aio_notifiers);
b338082b
FB
1871}
1872
2bc93fed
MK
1873void bdrv_close_all(void)
1874{
1875 BlockDriverState *bs;
1876
dc364f4c 1877 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1878 AioContext *aio_context = bdrv_get_aio_context(bs);
1879
1880 aio_context_acquire(aio_context);
2bc93fed 1881 bdrv_close(bs);
ed78cda3 1882 aio_context_release(aio_context);
2bc93fed
MK
1883 }
1884}
1885
88266f5a
SH
1886/* Check if any requests are in-flight (including throttled requests) */
1887static bool bdrv_requests_pending(BlockDriverState *bs)
1888{
1889 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1890 return true;
1891 }
cc0681c4
BC
1892 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1893 return true;
1894 }
1895 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1896 return true;
1897 }
1898 if (bs->file && bdrv_requests_pending(bs->file)) {
1899 return true;
1900 }
1901 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1902 return true;
1903 }
1904 return false;
1905}
1906
922453bc
SH
1907/*
1908 * Wait for pending requests to complete across all BlockDriverStates
1909 *
1910 * This function does not flush data to disk, use bdrv_flush_all() for that
1911 * after calling this function.
4c355d53
ZYW
1912 *
1913 * Note that completion of an asynchronous I/O operation can trigger any
1914 * number of other I/O operations on other devices---for example a coroutine
1915 * can be arbitrarily complex and a constant flow of I/O can come until the
1916 * coroutine is complete. Because of this, it is not possible to have a
1917 * function to drain a single device's I/O queue.
922453bc
SH
1918 */
1919void bdrv_drain_all(void)
1920{
88266f5a
SH
1921 /* Always run first iteration so any pending completion BHs run */
1922 bool busy = true;
922453bc
SH
1923 BlockDriverState *bs;
1924
88266f5a 1925 while (busy) {
9b536adc
SH
1926 busy = false;
1927
dc364f4c 1928 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
9b536adc
SH
1929 AioContext *aio_context = bdrv_get_aio_context(bs);
1930 bool bs_busy;
1931
1932 aio_context_acquire(aio_context);
448ad91d 1933 bdrv_flush_io_queue(bs);
0b06ef3b 1934 bdrv_start_throttled_reqs(bs);
9b536adc
SH
1935 bs_busy = bdrv_requests_pending(bs);
1936 bs_busy |= aio_poll(aio_context, bs_busy);
1937 aio_context_release(aio_context);
922453bc 1938
9b536adc
SH
1939 busy |= bs_busy;
1940 }
922453bc
SH
1941 }
1942}
1943
dc364f4c
BC
1944/* make a BlockDriverState anonymous by removing from bdrv_state and
1945 * graph_bdrv_state list.
d22b2f41
RH
1946 Also, NULL terminate the device_name to prevent double remove */
1947void bdrv_make_anon(BlockDriverState *bs)
1948{
bfb197e0
MA
1949 /*
1950 * Take care to remove bs from bdrv_states only when it's actually
1951 * in it. Note that bs->device_list.tqe_prev is initially null,
1952 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
1953 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1954 * resetting it to null on remove.
1955 */
1956 if (bs->device_list.tqe_prev) {
dc364f4c 1957 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
bfb197e0 1958 bs->device_list.tqe_prev = NULL;
d22b2f41 1959 }
dc364f4c
BC
1960 if (bs->node_name[0] != '\0') {
1961 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1962 }
1963 bs->node_name[0] = '\0';
d22b2f41
RH
1964}
1965
e023b2e2
PB
1966static void bdrv_rebind(BlockDriverState *bs)
1967{
1968 if (bs->drv && bs->drv->bdrv_rebind) {
1969 bs->drv->bdrv_rebind(bs);
1970 }
1971}
1972
4ddc07ca
PB
/* Copy the fields that logically belong to the attached device (rather than
 * to the image contents) from bs_src into bs_dest.  Used by bdrv_swap() to
 * move these fields back after the wholesale struct swap. */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
8802d1fd 2016
4ddc07ca
PB
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* Wholesale struct swap; device-bound fields are fixed up below. */
    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    /* let the drivers adjust any pointers into the swapped structs */
    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
2079
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents.
     * After the swap, bs_new holds what used to be the top image, so it
     * becomes the backing file of the (new) top. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
2099
/* Final teardown of a BlockDriverState.  Callers must have dropped every
 * reference (refcnt == 0) and detached jobs/blockers beforehand; reached
 * via bdrv_unref(), never called directly. */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
2114
e97fc193
AL
2115/*
2116 * Run consistency checks on an image
2117 *
e076f338 2118 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2119 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2120 * check are stored in res.
e97fc193 2121 */
4534ff54 2122int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193 2123{
908bcd54
HR
2124 if (bs->drv == NULL) {
2125 return -ENOMEDIUM;
2126 }
e97fc193
AL
2127 if (bs->drv->bdrv_check == NULL) {
2128 return -ENOTSUP;
2129 }
2130
e076f338 2131 memset(res, 0, sizeof(*res));
4534ff54 2132 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2133}
2134
8a426614
KW
2135#define COMMIT_BUF_SECTORS 2048
2136
33e3963e
FB
/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    /* Nothing to commit without a backing file. */
    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
        return -EBUSY;
    }

    /* Remember whether the backing file was read-only so it can be
     * restored after the commit (see ro_cleanup below). */
    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags =  bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    /* Copy only the clusters allocated in the top image; unallocated
     * ranges are already backed by the backing file. */
    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
2247
/* Commit every device's COW file into its backing file.  Stops and returns
 * the first error; later devices are then left uncommitted. */
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        /* Each BDS may live in its own AioContext; take its lock while
         * operating on it. */
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}
2267
dbffbdcf
SH
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    /* Keep the serialising counter balanced with mark_request_serialising(). */
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    /* Wake every request that was blocked waiting on this one. */
    qemu_co_queue_restart_all(&req->wait_queue);
}
2282
/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    /* Designated initializer zeroes all remaining fields (waiting_for etc.).
     * overlap_* start equal to the request range; they may only grow via
     * mark_request_serialising(). */
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
2306
/* Mark a tracked request as serialising and widen its overlap range to
 * 'align' boundaries so other requests touching the same aligned region
 * (e.g. the same cluster) are forced to wait.  'align' must be a power
 * of two. */
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    /* Count each request at most once, even if marked repeatedly. */
    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    /* Only ever grow the overlap window, never shrink it. */
    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
2321
d83947ac
SH
2322/**
2323 * Round a region to cluster boundaries
2324 */
343bded4
PB
2325void bdrv_round_to_clusters(BlockDriverState *bs,
2326 int64_t sector_num, int nb_sectors,
2327 int64_t *cluster_sector_num,
2328 int *cluster_nb_sectors)
d83947ac
SH
2329{
2330 BlockDriverInfo bdi;
2331
2332 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2333 *cluster_sector_num = sector_num;
2334 *cluster_nb_sectors = nb_sectors;
2335 } else {
2336 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2337 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2338 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2339 nb_sectors, c);
2340 }
2341}
2342
7327145f 2343static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2344{
2345 BlockDriverInfo bdi;
7327145f 2346 int ret;
793ed47a 2347
7327145f
KW
2348 ret = bdrv_get_info(bs, &bdi);
2349 if (ret < 0 || bdi.cluster_size == 0) {
2350 return bs->request_alignment;
793ed47a 2351 } else {
7327145f 2352 return bdi.cluster_size;
793ed47a
KW
2353 }
2354}
2355
f4658285 2356static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2357 int64_t offset, unsigned int bytes)
2358{
d83947ac 2359 /* aaaa bbbb */
7327145f 2360 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2361 return false;
2362 }
2363 /* bbbb aaaa */
7327145f 2364 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2365 return false;
2366 }
2367 return true;
f4658285
SH
2368}
2369
/* Block the calling coroutine until no serialising request overlaps 'self'.
 * Returns true if we actually had to wait at least once.  Must be called
 * in coroutine context. */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: no serialising requests at all on this BDS. */
    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* Only wait when at least one side is serialising. */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    /* The list may have changed while we slept; rescan. */
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2413
756e6736
KW
2414/*
2415 * Return values:
2416 * 0 - success
2417 * -EINVAL - backing format specified, but no file
2418 * -ENOSPC - can't update the backing file because no space is left in the
2419 * image file header
2420 * -ENOTSUP - format driver doesn't support changing the backing file
2421 */
2422int bdrv_change_backing_file(BlockDriverState *bs,
2423 const char *backing_file, const char *backing_fmt)
2424{
2425 BlockDriver *drv = bs->drv;
469ef350 2426 int ret;
756e6736 2427
5f377794
PB
2428 /* Backing file format doesn't make sense without a backing file */
2429 if (backing_fmt && !backing_file) {
2430 return -EINVAL;
2431 }
2432
756e6736 2433 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2434 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2435 } else {
469ef350 2436 ret = -ENOTSUP;
756e6736 2437 }
469ef350
PB
2438
2439 if (ret == 0) {
2440 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2441 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2442 }
2443 return ret;
756e6736
KW
2444}
2445
6ebdcee2
JC
2446/*
2447 * Finds the image layer in the chain that has 'bs' as its backing file.
2448 *
2449 * active is the current topmost image.
2450 *
2451 * Returns NULL if bs is not found in active's image chain,
2452 * or if active == bs.
4caf0fcd
JC
2453 *
2454 * Returns the bottommost base image if bs == NULL.
6ebdcee2
JC
2455 */
2456BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2457 BlockDriverState *bs)
2458{
4caf0fcd
JC
2459 while (active && bs != active->backing_hd) {
2460 active = active->backing_hd;
6ebdcee2
JC
2461 }
2462
4caf0fcd
JC
2463 return active;
2464}
6ebdcee2 2465
4caf0fcd
JC
/* Given a BDS, searches for the base layer. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    /* bs == NULL makes bdrv_find_overlay() walk to the bottommost image. */
    return bdrv_find_overlay(bs, NULL);
}
2471
/* Queue entry used by bdrv_drop_intermediate() to remember the BDSes that
 * will be dropped from the chain. */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;


/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base, const char *backing_file_str)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_new0(BlkIntermediateStates, 1);
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    /* Only rewrite the overlay metadata once we know the walk succeeded. */
    backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
    ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    /* Free the bookkeeping entries whether we succeeded or bailed early. */
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2581
2582
71d0770c
AL
2583static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2584 size_t size)
2585{
2586 int64_t len;
2587
1dd3a447
KW
2588 if (size > INT_MAX) {
2589 return -EIO;
2590 }
2591
71d0770c
AL
2592 if (!bdrv_is_inserted(bs))
2593 return -ENOMEDIUM;
2594
2595 if (bs->growable)
2596 return 0;
2597
2598 len = bdrv_getlength(bs);
2599
fbb7b4e0
KW
2600 if (offset < 0)
2601 return -EIO;
2602
2603 if ((offset > len) || (len - offset < size))
71d0770c
AL
2604 return -EIO;
2605
2606 return 0;
2607}
2608
/* Validate a sector-granularity request; converts to bytes and defers to
 * bdrv_check_byte_request(). */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    /* Reject negative counts and counts whose byte size would overflow int. */
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2619
1c9805a3
SH
/* Parameter/result bundle passed to the coroutine that carries out a
 * synchronous read or write on behalf of bdrv_prwv_co(). */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;            /* result; NOT_DONE until the coroutine finishes */
    BdrvRequestFlags flags;
} RwCo;

/* Coroutine entry point: dispatch the request described by 'opaque' (an
 * RwCo) to the appropriate aligned read/write implementation. */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
                                      rwco->qiov->size, rwco->qiov,
                                      rwco->flags);
    } else {
        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
                                       rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
    }
}
e7a8a783 2643
1c9805a3 2644/*
8d3b1a2d 2645 * Process a vectored synchronous request using coroutines
1c9805a3 2646 */
775aa8b6
KW
2647static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2648 QEMUIOVector *qiov, bool is_write,
2649 BdrvRequestFlags flags)
1c9805a3 2650{
1c9805a3
SH
2651 Coroutine *co;
2652 RwCo rwco = {
2653 .bs = bs,
775aa8b6 2654 .offset = offset,
8d3b1a2d 2655 .qiov = qiov,
1c9805a3
SH
2656 .is_write = is_write,
2657 .ret = NOT_DONE,
4105eaaa 2658 .flags = flags,
1c9805a3 2659 };
e7a8a783 2660
498e386c
ZYW
2661 /**
2662 * In sync call context, when the vcpu is blocked, this throttling timer
2663 * will not fire; so the I/O throttling function has to be disabled here
2664 * if it has been enabled.
2665 */
2666 if (bs->io_limits_enabled) {
2667 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2668 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2669 bdrv_io_limits_disable(bs);
2670 }
2671
1c9805a3
SH
2672 if (qemu_in_coroutine()) {
2673 /* Fast-path if already in coroutine context */
2674 bdrv_rw_co_entry(&rwco);
2675 } else {
2572b37a
SH
2676 AioContext *aio_context = bdrv_get_aio_context(bs);
2677
1c9805a3
SH
2678 co = qemu_coroutine_create(bdrv_rw_co_entry);
2679 qemu_coroutine_enter(co, &rwco);
2680 while (rwco.ret == NOT_DONE) {
2572b37a 2681 aio_poll(aio_context, true);
1c9805a3
SH
2682 }
2683 }
2684 return rwco.ret;
2685}
b338082b 2686
8d3b1a2d
KW
/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    /* Wrap the flat buffer in a single-element iovec for the vectored path. */
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}
2707
1c9805a3
SH
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}
2714
07d27a44
MA
2715/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2716int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2717 uint8_t *buf, int nb_sectors)
2718{
2719 bool enabled;
2720 int ret;
2721
2722 enabled = bs->io_limits_enabled;
2723 bs->io_limits_enabled = false;
4e7395e8 2724 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2725 bs->io_limits_enabled = enabled;
2726 return ret;
2727}
2728
5fafdf24 2729/* Return < 0 if error. Important errors are:
19cb3738
FB
2730 -EIO generic I/O error (may happen for all errors)
2731 -ENOMEDIUM No media inserted.
2732 -EINVAL Invalid sector number or nb_sectors
2733 -EACCES Trying to write a read-only device
2734*/
5fafdf24 2735int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2736 const uint8_t *buf, int nb_sectors)
2737{
4105eaaa 2738 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
83f64091
FB
2739}
2740
aa7bfbff
PL
/* Write zeroes to nb_sectors starting at sector_num; 'flags' (e.g.
 * BDRV_REQ_MAY_UNMAP) are forwarded on top of BDRV_REQ_ZERO_WRITE.
 * Return codes as for bdrv_write(). */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    /* NULL buffer: the zero-write path never reads payload data. */
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}
2747
d75cbb5e
PL
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = target_sectors - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        /* bdrv_get_block_status() takes an int sector count. */
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        /* Skip ranges that already read back as zeroes. */
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
2793
a3ef6571 2794int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2795{
a3ef6571
KW
2796 QEMUIOVector qiov;
2797 struct iovec iov = {
2798 .iov_base = (void *)buf,
2799 .iov_len = bytes,
2800 };
9a8c4cce 2801 int ret;
83f64091 2802
a3ef6571
KW
2803 if (bytes < 0) {
2804 return -EINVAL;
83f64091
FB
2805 }
2806
a3ef6571
KW
2807 qemu_iovec_init_external(&qiov, &iov, 1);
2808 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2809 if (ret < 0) {
2810 return ret;
83f64091 2811 }
a3ef6571
KW
2812
2813 return bytes;
83f64091
FB
2814}
2815
/* Synchronously write the contents of qiov at byte offset 'offset'.
 * Returns the number of bytes written on success, -errno on failure. */
int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}
2827
2828int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2829 const void *buf, int bytes)
8d3b1a2d
KW
2830{
2831 QEMUIOVector qiov;
2832 struct iovec iov = {
2833 .iov_base = (void *) buf,
8407d5d7 2834 .iov_len = bytes,
8d3b1a2d
KW
2835 };
2836
8407d5d7
KW
2837 if (bytes < 0) {
2838 return -EINVAL;
2839 }
2840
8d3b1a2d
KW
2841 qemu_iovec_init_external(&qiov, &iov, 1);
2842 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2843}
83f64091 2844
f08145fe
KW
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}
2868
/* Copy-on-read implementation: read whole clusters through a bounce buffer,
 * write them into the top image, then copy the requested slice into the
 * caller's qiov.  Must be called in coroutine context. */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* Prefer the driver's zero-write path when the cluster is all zeroes. */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Hand back only the originally requested slice of the cluster. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
2939
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    /* Caller guarantees sector alignment of both offset and length. */
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        /* Take the CoR path unless the whole range is already allocated
         * in the top image. */
        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t total_sectors, max_nb_sectors;

        total_sectors = bdrv_nb_sectors(bs);
        if (total_sectors < 0) {
            ret = total_sectors;
            goto out;
        }

        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
            QEMUIOVector local_qiov;
            size_t local_sectors;

            max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
            local_sectors = MIN(max_nb_sectors, nb_sectors);

            /* Read only the part that lies before EOF through the driver;
             * a shortened copy of qiov covers exactly that prefix. */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, 0,
                              local_sectors * BDRV_SECTOR_SIZE);

            ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
                                     &local_qiov);

            qemu_iovec_destroy(&local_qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}
3031
/*
 * Handle a read request in coroutine context.
 *
 * Applies copy-on-read promotion and I/O throttling, aligns the request to
 * the device's request_alignment by padding the qiov with bounce buffers for
 * an unaligned head/tail, and forwards it to bdrv_aligned_preadv() under a
 * tracked request.  Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* A device opened with copy_on_read promotes every read */
    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        /* Unaligned head: prepend a bounce buffer covering the gap before
         * the caller's data */
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        /* Unaligned tail: append a bounce buffer up to the next alignment
         * boundary */
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    /* Track the aligned request so overlapping writes serialise against it */
    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
3105
1b0288ae
KW
3106static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3107 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3108 BdrvRequestFlags flags)
3109{
3110 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3111 return -EINVAL;
3112 }
3113
3114 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3115 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3116}
3117
c5fbe571 3118int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3119 int nb_sectors, QEMUIOVector *qiov)
3120{
c5fbe571 3121 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3122
470c0504
SH
3123 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3124}
3125
3126int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3127 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3128{
3129 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3130
3131 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3132 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3133}
3134
c31cb707
PL
3135/* if no limit is specified in the BlockLimits use a default
3136 * of 32768 512-byte sectors (16 MiB) per request.
3137 */
3138#define MAX_WRITE_ZEROES_DEFAULT 32768
3139
/*
 * Write zeroes to [sector_num, sector_num + nb_sectors).
 *
 * Splits the range into chunks honouring the driver's write_zeroes_alignment
 * and max_write_zeroes limits, tries the driver's efficient
 * bdrv_co_write_zeroes callback first, and falls back to writing a
 * zero-filled bounce buffer when the driver reports -ENOTSUP.
 * Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    /* Use the driver-advertised limit, or the 16 MiB default */
    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector. num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                /* Lazily allocate (and zero) the bounce buffer on first use */
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    /* qemu_vfree(NULL) is a no-op, so this is safe on every path */
    qemu_vfree(iov.iov_base);
    return ret;
}
3214
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 *
 * Runs the before-write notifiers, optionally converts an all-zero payload
 * into a write-zeroes operation (zero detection), issues the write, and
 * performs the writethrough flush and dirty-bitmap/accounting bookkeeping.
 * Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    /* Caller guarantees sector alignment and a matching qiov size */
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* If we had to wait, a serialising request may have grown req; it must
     * not itself be serialising in that case */
    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    /* Zero detection: turn an all-zero payload into a write-zeroes op */
    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    /* Writethrough cache mode: flush after every successful write */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);

    /* A growable BDS may have been extended by this write */
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
3274
/*
 * Handle a write request in coroutine context.
 *
 * Checks preconditions and throttling, then aligns the request to the
 * device's request_alignment with a read-modify-write cycle: an unaligned
 * head and/or tail is read into a bounce buffer via bdrv_aligned_preadv()
 * and merged with the caller's data before bdrv_aligned_pwritev() writes
 * the whole aligned range.  Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        /* RMW must be serialised against overlapping requests */
        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base = head_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        /* Prepend the read-back head bytes to the caller's data */
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        /* If we had to wait here, the head RMW (which already waited) must
         * not have run, otherwise its data could be stale */
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base = tail_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append only the read-back bytes past the caller's data */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3396
6601553e
KW
3397static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3398 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3399 BdrvRequestFlags flags)
3400{
3401 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3402 return -EINVAL;
3403 }
3404
3405 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3406 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3407}
3408
c5fbe571
SH
3409int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3410 int nb_sectors, QEMUIOVector *qiov)
3411{
3412 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3413
f08f2dda
SH
3414 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3415}
3416
3417int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3418 int64_t sector_num, int nb_sectors,
3419 BdrvRequestFlags flags)
f08f2dda 3420{
94d6ff21 3421 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3422
d32f35cb
PL
3423 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3424 flags &= ~BDRV_REQ_MAY_UNMAP;
3425 }
3426
f08f2dda 3427 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3428 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3429}
3430
83f64091
FB
3431/**
3432 * Truncate file to 'offset' bytes (needed only for file protocols)
3433 */
3434int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3435{
3436 BlockDriver *drv = bs->drv;
51762288 3437 int ret;
83f64091 3438 if (!drv)
19cb3738 3439 return -ENOMEDIUM;
83f64091
FB
3440 if (!drv->bdrv_truncate)
3441 return -ENOTSUP;
59f2689d
NS
3442 if (bs->read_only)
3443 return -EACCES;
9c75e168 3444
51762288
SH
3445 ret = drv->bdrv_truncate(bs, offset);
3446 if (ret == 0) {
3447 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
a7f53e26
MA
3448 if (bs->blk) {
3449 blk_dev_resize_cb(bs->blk);
3450 }
51762288
SH
3451 }
3452 return ret;
83f64091
FB
3453}
3454
4a1d5e1f
FZ
3455/**
3456 * Length of a allocated file in bytes. Sparse files are counted by actual
3457 * allocated space. Return < 0 if error or unknown.
3458 */
3459int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3460{
3461 BlockDriver *drv = bs->drv;
3462 if (!drv) {
3463 return -ENOMEDIUM;
3464 }
3465 if (drv->bdrv_get_allocated_file_size) {
3466 return drv->bdrv_get_allocated_file_size(bs);
3467 }
3468 if (bs->file) {
3469 return bdrv_get_allocated_file_size(bs->file);
3470 }
3471 return -ENOTSUP;
3472}
3473
83f64091 3474/**
65a9bb25 3475 * Return number of sectors on success, -errno on error.
83f64091 3476 */
65a9bb25 3477int64_t bdrv_nb_sectors(BlockDriverState *bs)
83f64091
FB
3478{
3479 BlockDriver *drv = bs->drv;
65a9bb25 3480
83f64091 3481 if (!drv)
19cb3738 3482 return -ENOMEDIUM;
51762288 3483
b94a2610
KW
3484 if (drv->has_variable_length) {
3485 int ret = refresh_total_sectors(bs, bs->total_sectors);
3486 if (ret < 0) {
3487 return ret;
46a4e4e6 3488 }
83f64091 3489 }
65a9bb25
MA
3490 return bs->total_sectors;
3491}
3492
3493/**
3494 * Return length in bytes on success, -errno on error.
3495 * The length is always a multiple of BDRV_SECTOR_SIZE.
3496 */
3497int64_t bdrv_getlength(BlockDriverState *bs)
3498{
3499 int64_t ret = bdrv_nb_sectors(bs);
3500
3501 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3502}
3503
19cb3738 3504/* return 0 as number of sectors if no device present or error */
96b8f136 3505void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3506{
65a9bb25
MA
3507 int64_t nb_sectors = bdrv_nb_sectors(bs);
3508
3509 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
fc01f7e7 3510}
cf98951b 3511
ff06f5f3
PB
3512void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3513 BlockdevOnError on_write_error)
abd7f68d
MA
3514{
3515 bs->on_read_error = on_read_error;
3516 bs->on_write_error = on_write_error;
3517}
3518
1ceee0d5 3519BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3520{
3521 return is_read ? bs->on_read_error : bs->on_write_error;
3522}
3523
3e1caa5f
PB
3524BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3525{
3526 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3527
3528 switch (on_err) {
3529 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3530 return (error == ENOSPC) ?
3531 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3532 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3533 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3534 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3535 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3536 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3537 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3538 default:
3539 abort();
3540 }
3541}
3542
c7c2ff0c
LC
3543static void send_qmp_error_event(BlockDriverState *bs,
3544 BlockErrorAction action,
3545 bool is_read, int error)
3546{
3547 BlockErrorAction ac;
3548
3549 ac = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3550 qapi_event_send_block_io_error(bdrv_get_device_name(bs), ac, action,
3551 bdrv_iostatus_is_enabled(bs),
624ff573
LC
3552 error == ENOSPC, strerror(error),
3553 &error_abort);
c7c2ff0c
LC
3554}
3555
/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    /* Callers pass a positive errno value */
    assert(error >= 0);

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* First set the iostatus, so that "info block" returns an iostatus
         * that matches the events raised so far (an additional error iostatus
         * is fine, but not a lost one).
         */
        bdrv_iostatus_set_err(bs, error);

        /* Then raise the request to stop the VM and the event.
         * qemu_system_vmstop_request_prepare has two effects. First,
         * it ensures that the STOP event always comes after the
         * BLOCK_IO_ERROR event. Second, it ensures that even if management
         * can observe the STOP event and do a "cont" before the STOP
         * event is issued, the VM will not stop. In this case, vm_start()
         * also ensures that the STOP/RESUME pair of events is emitted.
         */
        qemu_system_vmstop_request_prepare();
        send_qmp_error_event(bs, action, is_read, error);
        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
    } else {
        /* REPORT/IGNORE: only the event is emitted, the VM keeps running */
        send_qmp_error_event(bs, action, is_read, error);
    }
}
3587
b338082b
FB
3588int bdrv_is_read_only(BlockDriverState *bs)
3589{
3590 return bs->read_only;
3591}
3592
985a03b0
TS
3593int bdrv_is_sg(BlockDriverState *bs)
3594{
3595 return bs->sg;
3596}
3597
e900a7b7
CH
3598int bdrv_enable_write_cache(BlockDriverState *bs)
3599{
3600 return bs->enable_write_cache;
3601}
3602
425b0148
PB
3603void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3604{
3605 bs->enable_write_cache = wce;
55b110f2
JC
3606
3607 /* so a reopen() will preserve wce */
3608 if (wce) {
3609 bs->open_flags |= BDRV_O_CACHE_WB;
3610 } else {
3611 bs->open_flags &= ~BDRV_O_CACHE_WB;
3612 }
425b0148
PB
3613}
3614
ea2384d3
FB
3615int bdrv_is_encrypted(BlockDriverState *bs)
3616{
3617 if (bs->backing_hd && bs->backing_hd->encrypted)
3618 return 1;
3619 return bs->encrypted;
3620}
3621
c0f4ce77
AL
3622int bdrv_key_required(BlockDriverState *bs)
3623{
3624 BlockDriverState *backing_hd = bs->backing_hd;
3625
3626 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3627 return 1;
3628 return (bs->encrypted && !bs->valid_key);
3629}
3630
ea2384d3
FB
3631int bdrv_set_key(BlockDriverState *bs, const char *key)
3632{
3633 int ret;
3634 if (bs->backing_hd && bs->backing_hd->encrypted) {
3635 ret = bdrv_set_key(bs->backing_hd, key);
3636 if (ret < 0)
3637 return ret;
3638 if (!bs->encrypted)
3639 return 0;
3640 }
fd04a2ae
SH
3641 if (!bs->encrypted) {
3642 return -EINVAL;
3643 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3644 return -ENOMEDIUM;
3645 }
c0f4ce77 3646 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3647 if (ret < 0) {
3648 bs->valid_key = 0;
3649 } else if (!bs->valid_key) {
3650 bs->valid_key = 1;
a7f53e26
MA
3651 if (bs->blk) {
3652 /* call the change callback now, we skipped it on open */
3653 blk_dev_change_media_cb(bs->blk, true);
3654 }
bb5fc20f 3655 }
c0f4ce77 3656 return ret;
ea2384d3
FB
3657}
3658
f8d6bba1 3659const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3660{
f8d6bba1 3661 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3662}
3663
/*
 * qsort() comparator for an array of strings (const char *).
 *
 * Fix: qsort() passes pointers to the array *elements*, i.e. const char **
 * here.  The old code handed those element addresses straight to strcmp(),
 * so it compared the pointer bytes themselves rather than the strings and
 * the resulting order was effectively arbitrary.  Dereference first.
 */
static int qsort_strcmp(const void *a, const void *b)
{
    return strcmp(*(char *const *)a, *(char *const *)b);
}
3668
5fafdf24 3669void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3670 void *opaque)
3671{
3672 BlockDriver *drv;
e855e4fb 3673 int count = 0;
ada42401 3674 int i;
e855e4fb 3675 const char **formats = NULL;
ea2384d3 3676
8a22f02a 3677 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3678 if (drv->format_name) {
3679 bool found = false;
3680 int i = count;
3681 while (formats && i && !found) {
3682 found = !strcmp(formats[--i], drv->format_name);
3683 }
3684
3685 if (!found) {
5839e53b 3686 formats = g_renew(const char *, formats, count + 1);
e855e4fb 3687 formats[count++] = drv->format_name;
e855e4fb
JC
3688 }
3689 }
ea2384d3 3690 }
ada42401
SH
3691
3692 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3693
3694 for (i = 0; i < count; i++) {
3695 it(opaque, formats[i]);
3696 }
3697
e855e4fb 3698 g_free(formats);
ea2384d3
FB
3699}
3700
dc364f4c 3701/* This function is to find block backend bs */
7f06d47e 3702/* TODO convert callers to blk_by_name(), then remove */
b338082b
FB
3703BlockDriverState *bdrv_find(const char *name)
3704{
7f06d47e 3705 BlockBackend *blk = blk_by_name(name);
b338082b 3706
7f06d47e 3707 return blk ? blk_bs(blk) : NULL;
b338082b
FB
3708}
3709
dc364f4c
BC
3710/* This function is to find a node in the bs graph */
3711BlockDriverState *bdrv_find_node(const char *node_name)
3712{
3713 BlockDriverState *bs;
3714
3715 assert(node_name);
3716
3717 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3718 if (!strcmp(node_name, bs->node_name)) {
3719 return bs;
3720 }
3721 }
3722 return NULL;
3723}
3724
c13163fb
BC
3725/* Put this QMP function here so it can access the static graph_bdrv_states. */
3726BlockDeviceInfoList *bdrv_named_nodes_list(void)
3727{
3728 BlockDeviceInfoList *list, *entry;
3729 BlockDriverState *bs;
3730
3731 list = NULL;
3732 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3733 entry = g_malloc0(sizeof(*entry));
3734 entry->value = bdrv_block_device_info(bs);
3735 entry->next = list;
3736 list = entry;
3737 }
3738
3739 return list;
3740}
3741
12d3ba82
BC
3742BlockDriverState *bdrv_lookup_bs(const char *device,
3743 const char *node_name,
3744 Error **errp)
3745{
7f06d47e
MA
3746 BlockBackend *blk;
3747 BlockDriverState *bs;
12d3ba82 3748
12d3ba82 3749 if (device) {
7f06d47e 3750 blk = blk_by_name(device);
12d3ba82 3751
7f06d47e
MA
3752 if (blk) {
3753 return blk_bs(blk);
12d3ba82 3754 }
12d3ba82
BC
3755 }
3756
dd67fa50
BC
3757 if (node_name) {
3758 bs = bdrv_find_node(node_name);
12d3ba82 3759
dd67fa50
BC
3760 if (bs) {
3761 return bs;
3762 }
12d3ba82
BC
3763 }
3764
dd67fa50
BC
3765 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3766 device ? device : "",
3767 node_name ? node_name : "");
3768 return NULL;
12d3ba82
BC
3769}
3770
5a6684d2
JC
3771/* If 'base' is in the same chain as 'top', return true. Otherwise,
3772 * return false. If either argument is NULL, return false. */
3773bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3774{
3775 while (top && top != base) {
3776 top = top->backing_hd;
3777 }
3778
3779 return top != NULL;
3780}
3781
2f399b0a
MA
3782BlockDriverState *bdrv_next(BlockDriverState *bs)
3783{
3784 if (!bs) {
3785 return QTAILQ_FIRST(&bdrv_states);
3786 }
dc364f4c 3787 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3788}
3789
7f06d47e 3790/* TODO check what callers really want: bs->node_name or blk_name() */
bfb197e0 3791const char *bdrv_get_device_name(const BlockDriverState *bs)
ea2384d3 3792{
bfb197e0 3793 return bs->blk ? blk_name(bs->blk) : "";
ea2384d3
FB
3794}
3795
c8433287
MA
3796int bdrv_get_flags(BlockDriverState *bs)
3797{
3798 return bs->open_flags;
3799}
3800
f0f0fdfe 3801int bdrv_flush_all(void)
c6ca28d6
AL
3802{
3803 BlockDriverState *bs;
f0f0fdfe 3804 int result = 0;
c6ca28d6 3805
dc364f4c 3806 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
3807 AioContext *aio_context = bdrv_get_aio_context(bs);
3808 int ret;
3809
3810 aio_context_acquire(aio_context);
3811 ret = bdrv_flush(bs);
f0f0fdfe
KW
3812 if (ret < 0 && !result) {
3813 result = ret;
3814 }
ed78cda3 3815 aio_context_release(aio_context);
1b7bdbc1 3816 }
f0f0fdfe
KW
3817
3818 return result;
c6ca28d6
AL
3819}
3820
3ac21627
PL
3821int bdrv_has_zero_init_1(BlockDriverState *bs)
3822{
3823 return 1;
3824}
3825
f2feebbd
KW
3826int bdrv_has_zero_init(BlockDriverState *bs)
3827{
3828 assert(bs->drv);
3829
11212d8f
PB
3830 /* If BS is a copy on write image, it is initialized to
3831 the contents of the base image, which may not be zeroes. */
3832 if (bs->backing_hd) {
3833 return 0;
3834 }
336c1c12
KW
3835 if (bs->drv->bdrv_has_zero_init) {
3836 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3837 }
3838
3ac21627
PL
3839 /* safe default */
3840 return 0;
f2feebbd
KW
3841}
3842
4ce78691
PL
3843bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3844{
3845 BlockDriverInfo bdi;
3846
3847 if (bs->backing_hd) {
3848 return false;
3849 }
3850
3851 if (bdrv_get_info(bs, &bdi) == 0) {
3852 return bdi.unallocated_blocks_are_zero;
3853 }
3854
3855 return false;
3856}
3857
3858bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3859{
3860 BlockDriverInfo bdi;
3861
3862 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3863 return false;
3864 }
3865
3866 if (bdrv_get_info(bs, &bdi) == 0) {
3867 return bdi.can_write_zeroes_with_unmap;
3868 }
3869
3870 return false;
3871}
3872
/* Argument/result bundle passed from the synchronous
 * bdrv_get_block_status() wrapper to its coroutine entry point. */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;   /* NOTE(review): not used by the wrappers in
                               * this hunk — presumably for an "is allocated
                               * above base" variant; confirm before use */
    int64_t sector_num;
    int nb_sectors;
    int *pnum;                /* out: sectors in the same state */
    int64_t ret;              /* out: BDRV_BLOCK_* flags or -errno */
    bool done;                /* out: set once the coroutine has finished */
} BdrvCoGetBlockStatusData;
376ae3f1 3882
/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * On success the return value is a bitmask of BDRV_BLOCK_* flags (possibly
 * including an offset when BDRV_BLOCK_OFFSET_VALID is set); on failure it is
 * a negative errno.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    /* Clamp the query to the end of the image */
    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    /* Drivers without the callback are assumed fully allocated */
    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    /* RAW means "same data lives at this offset of bs->file": recurse */
    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    }

    /* Unallocated range: it may still be known to read as zeroes */
    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            /* Beyond the end of the backing file reads return zeroes */
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    /* Refine the answer by asking the protocol layer about the mapped
     * offset */
    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int file_pnum;

        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, &file_pnum);
        if (ret2 >= 0) {
            /* Ignore errors. This is just providing extra information, it
             * is useful but not necessary.
             */
            if (!file_pnum) {
                /* !file_pnum indicates an offset at or beyond the EOF; it is
                 * perfectly valid for the format block driver to point to such
                 * offsets, so catch it and mark everything as zero */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

    return ret;
}
3984
b6b8a333
PB
/* Coroutine wrapper for bdrv_get_block_status().
 *
 * Unpacks the BdrvCoGetBlockStatusData bundle, runs the coroutine-only
 * bdrv_co_get_block_status() and flags completion so the synchronous
 * caller's aio_poll() loop can terminate. */
static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
{
    BdrvCoGetBlockStatusData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
                                         data->pnum);
    data->done = true;   /* observed by the polling loop in the sync wrapper */
}
3995
/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 *
 * If called outside coroutine context, a coroutine is spawned and the
 * calling thread polls the BDS's AioContext until it completes.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    /* Bundle the arguments for the coroutine entry point */
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            /* Drive pending I/O until the coroutine signals completion */
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}
4027
b6b8a333
PB
/* Return 1 if the sectors are allocated in this image (any BDRV_BLOCK_DATA
 * or BDRV_BLOCK_ZERO status), 0 if not, or a negative errno from
 * bdrv_get_block_status().  *pnum receives the run length as usual. */
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors, int *pnum)
{
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        return ret;
    }
    /* Collapse the status bitmap down to a boolean allocation flag */
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}
4037
188a7bbf
PB
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise,
 * or a negative errno on failure.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;   /* n tracks the shortest unallocated run seen */

    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            /* Allocated somewhere in the chain: report that layer's run */
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            /* Shrink the run so it stays uniformly unallocated */
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
4088
045df330
AL
4089const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4090{
4091 if (bs->backing_hd && bs->backing_hd->encrypted)
4092 return bs->backing_file;
4093 else if (bs->encrypted)
4094 return bs->filename;
4095 else
4096 return NULL;
4097}
4098
/* Copy the (possibly empty) backing filename into the caller-supplied
 * buffer, truncating to filename_size (always NUL-terminated). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
4104
5fafdf24 4105int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4106 const uint8_t *buf, int nb_sectors)
4107{
4108 BlockDriver *drv = bs->drv;
4109 if (!drv)
19cb3738 4110 return -ENOMEDIUM;
faea38e7
FB
4111 if (!drv->bdrv_write_compressed)
4112 return -ENOTSUP;
fbb7b4e0
KW
4113 if (bdrv_check_request(bs, sector_num, nb_sectors))
4114 return -EIO;
a55eb92c 4115
e4654d2d 4116 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4117
faea38e7
FB
4118 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4119}
3b46e624 4120
faea38e7
FB
4121int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4122{
4123 BlockDriver *drv = bs->drv;
4124 if (!drv)
19cb3738 4125 return -ENOMEDIUM;
faea38e7
FB
4126 if (!drv->bdrv_get_info)
4127 return -ENOTSUP;
4128 memset(bdi, 0, sizeof(*bdi));
4129 return drv->bdrv_get_info(bs, bdi);
4130}
4131
eae041fe
HR
/* Return driver-specific image information, or NULL when the driver does
 * not implement the hook.  Ownership of the returned object passes to the
 * caller. */
ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}
4140
45566e9c
CH
/* Buffer-based convenience wrapper: wrap (buf, size) in a single-element
 * QEMUIOVector and forward to bdrv_writev_vmstate(). */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,   /* cast away const for the iovec API */
        .iov_len = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}
4153
/* Write VM state data at offset pos.  Prefers the driver's own hook and
 * otherwise recurses into bs->file; -ENOTSUP if no layer supports it. */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        /* Delegate to the protocol layer below */
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}
4168
45566e9c
CH
4169int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4170 int64_t pos, int size)
178e08a5
AL
4171{
4172 BlockDriver *drv = bs->drv;
4173 if (!drv)
4174 return -ENOMEDIUM;
7cdb1f6d
MK
4175 if (drv->bdrv_load_vmstate)
4176 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4177 if (bs->file)
4178 return bdrv_load_vmstate(bs->file, buf, pos, size);
4179 return -ENOTSUP;
178e08a5
AL
4180}
4181
8b9b0cc2
KW
/* Fire a blkdebug event on this BDS; silently ignored when there is no
 * driver or the driver has no debug-event hook. */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}
4190
/* Install a blkdebug breakpoint, walking down the bs->file chain to find
 * the first layer (blkdebug) that implements the hook. */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}
4204
4cc70e93
FZ
/* Remove a previously installed blkdebug breakpoint; same chain-walking
 * strategy as bdrv_debug_breakpoint(). */
int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}
4217
41c695c7
KW
/* Resume a request suspended on a blkdebug tag; walks down the bs->file
 * chain to the layer implementing the hook. */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}
4230
/* Return true if a request is currently suspended on the given blkdebug
 * tag anywhere down the bs->file chain; false also when unsupported. */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}
4243
199630b6
BS
4244int bdrv_is_snapshot(BlockDriverState *bs)
4245{
4246 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4247}
4248
b1b1d783
JC
/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain.
 *
 * Returns the BDS in bs's backing chain whose backing file matches
 * backing_file, or NULL if none matches (or on bad arguments). */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
                                          const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    /* Scratch buffers for canonicalised path comparison */
    filename_full = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
4314
f198fd1c
BC
4315int bdrv_get_backing_file_depth(BlockDriverState *bs)
4316{
4317 if (!bs->drv) {
4318 return 0;
4319 }
4320
4321 if (!bs->backing_hd) {
4322 return 0;
4323 }
4324
4325 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4326}
4327
ea2384d3 4328/**************************************************************/
83f64091 4329/* async I/Os */
ea2384d3 4330
7c84b1b8
MA
/* Submit an asynchronous vectored read; cb(opaque, ret) runs on
 * completion.  Thin tracing wrapper over the coroutine-based engine. */
BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                           QEMUIOVector *qiov, int nb_sectors,
                           BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}
4340
7c84b1b8
MA
/* Submit an asynchronous vectored write; counterpart of bdrv_aio_readv(). */
BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                            QEMUIOVector *qiov, int nb_sectors,
                            BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}
4350
/* Asynchronously write zeroes: a write with no qiov and the
 * BDRV_REQ_ZERO_WRITE flag (plus any caller-supplied flags) set. */
BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
4361
40b4f539
KW
4362
/* Shared completion state for one bdrv_aio_multiwrite() batch.  The single
 * struct fans back out to the per-request callbacks once all merged
 * requests have completed. */
typedef struct MultiwriteCB {
    int error;              /* first error seen, reported to every callback */
    int num_requests;       /* merged requests still in flight */
    int num_callbacks;      /* original (pre-merge) request count */
    struct {
        BlockCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;  /* merged qiov to destroy, or NULL */
    } callbacks[];          /* flexible array, one slot per original request */
} MultiwriteCB;
4373
/* Invoke every original caller's completion callback with the batch result
 * and release any qiovs that were allocated during request merging. */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);   /* g_free(NULL) is a no-op */
    }
}
4386
/* Per-request completion for a multiwrite batch: record the first error,
 * and when the last in-flight request finishes, deliver the user callbacks
 * and free the batch state. */
static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;   /* keep only the first error */
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}
4403
4404static int multiwrite_req_compare(const void *a, const void *b)
4405{
77be4366
CH
4406 const BlockRequest *req1 = a, *req2 = b;
4407
4408 /*
4409 * Note that we can't simply subtract req2->sector from req1->sector
4410 * here as that could overflow the return value.
4411 */
4412 if (req1->sector > req2->sector) {
4413 return 1;
4414 } else if (req1->sector < req2->sector) {
4415 return -1;
4416 } else {
4417 return 0;
4418 }
40b4f539
KW
4419}
4420
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 *
 * Requests are merged when they are sequential or overlapping, unless the
 * combined iovec would exceed IOV_MAX or the driver's max transfer length.
 * Merged qiovs are recorded in mcb so multiwrite_user_cb() can free them.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Don't exceed the per-request iovec limit
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        // Respect the driver's maximum transfer length, if any
        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Remember the merged qiov so it gets destroyed on completion
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
4491
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    // Record the original callbacks before merging rearranges reqs
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
4549
/* Synchronously cancel an AIOCB: request asynchronous cancellation, then
 * poll the request's AioContext until only our own reference remains
 * (i.e. the request has completed one way or the other). */
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);   /* keep acb alive while we poll */
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();   /* no way to find the context to poll */
        }
    }
    qemu_aio_unref(acb);
}
4565
/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}
4575
4576/**************************************************************/
4577/* async block device emulation */
4578
7c84b1b8
MA
/* AIOCB for the bounce-buffer emulation of AIO on top of synchronous
 * bdrv_read/bdrv_write drivers (see bdrv_aio_rw_vector()). */
typedef struct BlockAIOCBSync {
    BlockAIOCB common;
    QEMUBH *bh;          /* defers completion callback to the event loop */
    int ret;             /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov;  /* caller's scatter/gather list */
    uint8_t *bounce;     /* linear bounce buffer, qiov->size bytes */
    int is_write;
} BlockAIOCBSync;

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBSync),
};
4592
/* Bottom half completing an emulated AIO request: copy read data out of
 * the bounce buffer, invoke the user callback, and tear down the acb. */
static void bdrv_aio_bh_cb(void *opaque)
{
    BlockAIOCBSync *acb = opaque;

    if (!acb->is_write && acb->ret >= 0) {
        /* Successful read: scatter the bounce buffer into the caller's qiov */
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    }
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_unref(acb);
}
beac80cd 4606
7c84b1b8
MA
/* Emulate vectored AIO on top of a driver's synchronous bdrv_read/bdrv_write
 * hooks: the transfer happens synchronously through a bounce buffer and the
 * completion callback is deferred via a bottom half. */
static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque,
                                      int is_write)

{
    BlockAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_try_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (acb->bounce == NULL) {
        acb->ret = -ENOMEM;   /* report allocation failure via the callback */
    } else if (is_write) {
        /* Gather the qiov into the bounce buffer, then write synchronously */
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    /* Deliver completion from the event loop, as real AIO would */
    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
4637
/* Emulated-AIO read entry point registered for sync-only drivers. */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
83f64091 4644
/* Emulated-AIO write entry point registered for sync-only drivers. */
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 4651
68485420 4652
7c84b1b8
MA
/* AIOCB backing the coroutine-based AIO emulation (bdrv_co_aio_rw_vector
 * and the flush/discard AIO entry points). */
typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;   /* request parameters and, on completion, req.error */
    bool is_write;
    bool *done;
    QEMUBH* bh;         /* defers the completion callback to the event loop */
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
};
4664
/* Bottom half delivering the completion callback for a coroutine-backed
 * AIOCB, then releasing it. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    qemu_bh_delete(acb->bh);
    qemu_aio_unref(acb);
}
4674
b2a61371
SH
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    /* Hand the result back to the caller from a bottom half */
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4692
7c84b1b8
MA
/* Build a coroutine-backed AIOCB for a read or write (flags may include
 * BDRV_REQ_ZERO_WRITE etc.) and kick off the bdrv_co_do_rw coroutine.
 * Completion is reported through cb(opaque, error) via a bottom half. */
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4717
/* Coroutine entry point for bdrv_aio_flush(): run the flush and schedule
 * the completion bottom half. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4727
7c84b1b8 4728BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
097310b5 4729 BlockCompletionFunc *cb, void *opaque)
016f5cf6 4730{
07f07615 4731 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4732
07f07615 4733 Coroutine *co;
7c84b1b8 4734 BlockAIOCBCoroutine *acb;
016f5cf6 4735
d7331bed 4736 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
d318aea9 4737
07f07615
PB
4738 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4739 qemu_coroutine_enter(co, acb);
016f5cf6 4740
016f5cf6
AG
4741 return &acb->common;
4742}
4743
4265d620
PB
/* Coroutine entry point for bdrv_aio_discard(): run the discard and
 * schedule the completion bottom half. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4753
/* Submit an asynchronous discard of nb_sectors starting at sector_num;
 * completion is delivered through cb(opaque, ret). */
BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4771
ea2384d3
FB
/* Register all built-in block drivers (runs MODULE_INIT_BLOCK hooks). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
ce1a14dc 4776
eb852011
MA
/* Like bdrv_init(), but restrict driver use to the configured whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
4782
/* Allocate and initialise an AIOCB of the size given by aiocb_info.
 * The returned acb starts with a single reference (see qemu_aio_unref). */
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}
4796
f197fe2b
FZ
/* Take an additional reference on an AIOCB. */
void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}
4802
/* Drop a reference on an AIOCB, freeing it when the count reaches zero. */
void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}
19cb3738 4811
f9f05dc5
KW
4812/**************************************************************/
4813/* Coroutine block device emulation */
4814
/* Rendezvous between an AIO completion callback and the waiting coroutine:
 * the callback stores the result and re-enters the coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to resume on completion */
    int ret;                /* AIO result, valid after resumption */
} CoroutineIOCompletion;
4819
/* AIO completion callback that wakes the coroutine parked in
 * bdrv_co_io_em() (or bdrv_co_flush), handing it the result. */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
4827
/* Emulate coroutine I/O on top of a driver's AIO interface: submit the
 * request via bdrv_aio_readv/writev and yield until the completion
 * callback re-enters us with the result. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;   /* submission failed; no callback will come */
    }
    qemu_coroutine_yield();   /* resumed by bdrv_co_io_em_complete() */

    return co.ret;
}
4853
/* Coroutine read entry point registered for AIO-only drivers. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
4860
/* Coroutine write entry point registered for AIO-only drivers. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
4867
/* Coroutine wrapper for bdrv_co_flush(), used by the synchronous
 * bdrv_flush(). */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
4874
/* Flush a BDS and everything beneath it.
 *
 * Order of operations: flush the format layer's internal caches to the OS
 * (always, even with cache=unsafe), then force data to disk via the
 * driver's coroutine or AIO flush hook (skipped for BDRV_O_NO_FLUSH),
 * and finally recurse into bs->file so the protocol layer is flushed too.
 * Returns 0 on success or a negative errno. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;   /* nothing to flush */
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* AIO-only driver: submit the flush and yield until it completes */
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
4937
/* Drop cached metadata after incoming migration so the image is re-read
 * from disk.  Only acts on BDS opened with BDRV_O_INCOMING; clears that
 * flag, invokes the driver hook (or recurses into bs->file) and refreshes
 * the total sector count.  Errors are reported through errp. */
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    if (!(bs->open_flags & BDRV_O_INCOMING)) {
        return;   /* cache is already authoritative */
    }
    bs->open_flags &= ~BDRV_O_INCOMING;

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    /* The image may have been resized while we weren't looking */
    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}
4968
/* Invalidate caches of all registered BDS, taking each one's AioContext
 * around the call.  Stops and propagates the first error encountered. */
void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_invalidate_cache(bs, &local_err);
        aio_context_release(aio_context);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}
4986
/*
 * Synchronous wrapper around bdrv_co_flush().
 *
 * If already running in coroutine context the flush is performed
 * directly; otherwise a coroutine is spawned and the caller's
 * AioContext is polled until it completes.  Returns 0 on success or a
 * negative errno value.
 */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* Drive the event loop until the coroutine stores a result */
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
5010
/* Parameter/result bundle passed to bdrv_discard_co_entry() */
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;    /* stays NOT_DONE until the coroutine finishes */
} DiscardCo;
4265d620
PB
5017static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5018{
775aa8b6 5019 DiscardCo *rwco = opaque;
4265d620
PB
5020
5021 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5022}
5023
/* If no limit is specified in the BlockLimits, use a default of
 * 32768 512-byte sectors (16 MiB) per discard request.
 */
#define MAX_DISCARD_DEFAULT 32768
5028
4265d620
PB
5029int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5030 int nb_sectors)
5031{
d51e9fe5
PB
5032 int max_discard;
5033
4265d620
PB
5034 if (!bs->drv) {
5035 return -ENOMEDIUM;
5036 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5037 return -EIO;
5038 } else if (bs->read_only) {
5039 return -EROFS;
df702c9b
PB
5040 }
5041
e4654d2d 5042 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5043
9e8f1835
PB
5044 /* Do nothing if disabled. */
5045 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5046 return 0;
5047 }
5048
d51e9fe5
PB
5049 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5050 return 0;
5051 }
6f14da52 5052
d51e9fe5
PB
5053 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5054 while (nb_sectors > 0) {
5055 int ret;
5056 int num = nb_sectors;
6f14da52 5057
d51e9fe5
PB
5058 /* align request */
5059 if (bs->bl.discard_alignment &&
5060 num >= bs->bl.discard_alignment &&
5061 sector_num % bs->bl.discard_alignment) {
5062 if (num > bs->bl.discard_alignment) {
5063 num = bs->bl.discard_alignment;
6f14da52 5064 }
d51e9fe5
PB
5065 num -= sector_num % bs->bl.discard_alignment;
5066 }
6f14da52 5067
d51e9fe5
PB
5068 /* limit request size */
5069 if (num > max_discard) {
5070 num = max_discard;
5071 }
6f14da52 5072
d51e9fe5 5073 if (bs->drv->bdrv_co_discard) {
6f14da52 5074 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5 5075 } else {
7c84b1b8 5076 BlockAIOCB *acb;
d51e9fe5
PB
5077 CoroutineIOCompletion co = {
5078 .coroutine = qemu_coroutine_self(),
5079 };
5080
5081 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5082 bdrv_co_io_em_complete, &co);
5083 if (acb == NULL) {
5084 return -EIO;
5085 } else {
5086 qemu_coroutine_yield();
5087 ret = co.ret;
6f14da52 5088 }
6f14da52 5089 }
7ce21016 5090 if (ret && ret != -ENOTSUP) {
d51e9fe5 5091 return ret;
4265d620 5092 }
d51e9fe5
PB
5093
5094 sector_num += num;
5095 nb_sectors -= num;
4265d620 5096 }
d51e9fe5 5097 return 0;
4265d620
PB
5098}
5099
/*
 * Synchronous wrapper around bdrv_co_discard(); see bdrv_flush() for
 * the fast-path/poll pattern.  Returns 0 on success or a negative
 * errno value.
 */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
5125
19cb3738
FB
5126/**************************************************************/
5127/* removable device support */
5128
5129/**
5130 * Return TRUE if the media is present
5131 */
5132int bdrv_is_inserted(BlockDriverState *bs)
5133{
5134 BlockDriver *drv = bs->drv;
a1aff5bf 5135
19cb3738
FB
5136 if (!drv)
5137 return 0;
5138 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5139 return 1;
5140 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5141}
5142
5143/**
8e49ca46
MA
5144 * Return whether the media changed since the last call to this
5145 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5146 */
5147int bdrv_media_changed(BlockDriverState *bs)
5148{
5149 BlockDriver *drv = bs->drv;
19cb3738 5150
8e49ca46
MA
5151 if (drv && drv->bdrv_media_changed) {
5152 return drv->bdrv_media_changed(bs);
5153 }
5154 return -ENOTSUP;
19cb3738
FB
5155}
5156
5157/**
5158 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5159 */
f36f3949 5160void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
5161{
5162 BlockDriver *drv = bs->drv;
bfb197e0 5163 const char *device_name;
19cb3738 5164
822e1cd1
MA
5165 if (drv && drv->bdrv_eject) {
5166 drv->bdrv_eject(bs, eject_flag);
19cb3738 5167 }
6f382ed2 5168
bfb197e0
MA
5169 device_name = bdrv_get_device_name(bs);
5170 if (device_name[0] != '\0') {
5171 qapi_event_send_device_tray_moved(device_name,
a5ee7bd4 5172 eject_flag, &error_abort);
6f382ed2 5173 }
19cb3738
FB
5174}
5175
19cb3738
FB
5176/**
5177 * Lock or unlock the media (if it is locked, the user won't be able
5178 * to eject it manually).
5179 */
025e849a 5180void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
5181{
5182 BlockDriver *drv = bs->drv;
5183
025e849a 5184 trace_bdrv_lock_medium(bs, locked);
b8c6d095 5185
025e849a
MA
5186 if (drv && drv->bdrv_lock_medium) {
5187 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
5188 }
5189}
985a03b0
TS
5190
5191/* needed for generic scsi interface */
5192
5193int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5194{
5195 BlockDriver *drv = bs->drv;
5196
5197 if (drv && drv->bdrv_ioctl)
5198 return drv->bdrv_ioctl(bs, req, buf);
5199 return -ENOTSUP;
5200}
7d780669 5201
7c84b1b8 5202BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
221f715d 5203 unsigned long int req, void *buf,
097310b5 5204 BlockCompletionFunc *cb, void *opaque)
7d780669 5205{
221f715d 5206 BlockDriver *drv = bs->drv;
7d780669 5207
221f715d
AL
5208 if (drv && drv->bdrv_aio_ioctl)
5209 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5210 return NULL;
7d780669 5211}
e268ca52 5212
/* Record the guest device's block size for alignment bookkeeping */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
7cd1e32a 5217
/* Allocate a buffer aligned for I/O on @bs; aborts on OOM */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
7cd1e32a 5222
9ebd8448
HR
5223void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5224{
5225 return memset(qemu_blockalign(bs, size), 0, size);
5226}
5227
/* Allocate an I/O-aligned buffer for @bs; returns NULL on OOM instead
 * of aborting */
void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        /* A zero-byte request would legitimately return NULL; round it
         * up so success and failure remain distinguishable */
        size = align;
    }

    return qemu_try_memalign(align, size);
}
5240
9ebd8448
HR
5241void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5242{
5243 void *mem = qemu_try_blockalign(bs, size);
5244
5245 if (mem) {
5246 memset(mem, 0, size);
5247 }
5248
5249 return mem;
5250}
5251
c53b1c51
SH
5252/*
5253 * Check if all memory in this vector is sector aligned.
5254 */
5255bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5256{
5257 int i;
339064d5 5258 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5259
5260 for (i = 0; i < qiov->niov; i++) {
339064d5 5261 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5262 return false;
1ff735bd 5263 }
339064d5 5264 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5265 return false;
c53b1c51
SH
5266 }
5267 }
5268
5269 return true;
5270}
5271
/*
 * Create a dirty bitmap tracking @bs at the given @granularity (bytes,
 * must be a power of two and at least one sector).  On failure reports
 * through @errp, sets errno, and returns NULL.  The bitmap is linked
 * into bs->dirty_bitmaps and must be freed with
 * bdrv_release_dirty_bitmap().
 */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    /* Convert byte granularity to sectors */
    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_nb_sectors(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap = g_new0(BdrvDirtyBitmap, 1);
    /* ffs(granularity) - 1 == log2(granularity) for a power of two */
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
5293
5294void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5295{
5296 BdrvDirtyBitmap *bm, *next;
5297 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5298 if (bm == bitmap) {
5299 QLIST_REMOVE(bitmap, list);
5300 hbitmap_free(bitmap->bitmap);
5301 g_free(bitmap);
5302 return;
a55eb92c 5303 }
7cd1e32a
LS
5304 }
5305}
5306
21b56835
FZ
5307BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5308{
5309 BdrvDirtyBitmap *bm;
5310 BlockDirtyInfoList *list = NULL;
5311 BlockDirtyInfoList **plist = &list;
5312
5313 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5839e53b
MA
5314 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5315 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
21b56835
FZ
5316 info->count = bdrv_get_dirty_count(bs, bm);
5317 info->granularity =
5318 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5319 entry->value = info;
5320 *plist = entry;
5321 plist = &entry->next;
5322 }
5323
5324 return list;
5325}
5326
e4654d2d 5327int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5328{
e4654d2d
FZ
5329 if (bitmap) {
5330 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5331 } else {
5332 return 0;
5333 }
5334}
5335
/* Initialize @hbi to iterate @bitmap's dirty sectors from the start */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5341
5342void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5343 int nr_sectors)
5344{
e4654d2d
FZ
5345 BdrvDirtyBitmap *bitmap;
5346 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5347 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5348 }
1755da16
PB
5349}
5350
e4654d2d 5351void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5352{
e4654d2d
FZ
5353 BdrvDirtyBitmap *bitmap;
5354 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5355 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5356 }
7cd1e32a 5357}
aaa0eb75 5358
/* Number of dirty sectors recorded in @bitmap */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
f88e1a42 5363
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}
5369
5370/* Release a previously grabbed reference to bs.
5371 * If after releasing, reference count is zero, the BlockDriverState is
5372 * deleted. */
5373void bdrv_unref(BlockDriverState *bs)
5374{
9a4d5ca6
JC
5375 if (!bs) {
5376 return;
5377 }
9fcb0251
FZ
5378 assert(bs->refcnt > 0);
5379 if (--bs->refcnt == 0) {
5380 bdrv_delete(bs);
5381 }
5382}
5383
/* One entry on a per-operation blocker list; @reason explains to the
 * user why the operation is currently forbidden */
struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};
5388
/*
 * Return true if operation @op is currently blocked on @bs.  When
 * blocked and @errp is non-NULL, an error naming the device and the
 * first blocker's reason is reported.
 */
bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
        blocker = QLIST_FIRST(&bs->op_blockers[op]);
        if (errp) {
            error_setg(errp, "Device '%s' is busy: %s",
                       bdrv_get_device_name(bs),
                       error_get_pretty(blocker->reason));
        }
        return true;
    }
    return false;
}
5404
/* Block operation @op on @bs; @reason is kept by reference and used as
 * the match key for bdrv_op_unblock() */
void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);

    blocker = g_new0(BdrvOpBlocker, 1);
    blocker->reason = reason;
    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
}
5414
5415void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5416{
5417 BdrvOpBlocker *blocker, *next;
5418 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5419 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5420 if (blocker->reason == reason) {
5421 QLIST_REMOVE(blocker, list);
5422 g_free(blocker);
5423 }
5424 }
5425}
5426
5427void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5428{
5429 int i;
5430 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5431 bdrv_op_block(bs, i, reason);
5432 }
5433}
5434
5435void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5436{
5437 int i;
5438 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5439 bdrv_op_unblock(bs, i, reason);
5440 }
5441}
5442
5443bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5444{
5445 int i;
5446
5447 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5448 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5449 return false;
5450 }
5451 }
5452 return true;
5453}
5454
/* Enable I/O status tracking for @bs and reset it to OK */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5460
/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}
5470
/* Disable I/O status tracking for @bs */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5475
/* Reset the I/O status of @bs (and its block job, if any) back to OK */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}
5485
/* Record an I/O error in the device's status; only the first error is
 * kept (later ones do not overwrite it) */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        /* ENOSPC gets its own status so management can react to it */
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
5494
d92ada22
LC
5495void bdrv_img_create(const char *filename, const char *fmt,
5496 const char *base_filename, const char *base_fmt,
f382d43a
MR
5497 char *options, uint64_t img_size, int flags,
5498 Error **errp, bool quiet)
f88e1a42 5499{
83d0521a
CL
5500 QemuOptsList *create_opts = NULL;
5501 QemuOpts *opts = NULL;
5502 const char *backing_fmt, *backing_file;
5503 int64_t size;
f88e1a42 5504 BlockDriver *drv, *proto_drv;
96df67d1 5505 BlockDriver *backing_drv = NULL;
cc84d90f 5506 Error *local_err = NULL;
f88e1a42
JS
5507 int ret = 0;
5508
5509 /* Find driver and parse its options */
5510 drv = bdrv_find_format(fmt);
5511 if (!drv) {
71c79813 5512 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5513 return;
f88e1a42
JS
5514 }
5515
98289620 5516 proto_drv = bdrv_find_protocol(filename, true);
f88e1a42 5517 if (!proto_drv) {
71c79813 5518 error_setg(errp, "Unknown protocol '%s'", filename);
d92ada22 5519 return;
f88e1a42
JS
5520 }
5521
c282e1fd
CL
5522 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5523 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
f88e1a42
JS
5524
5525 /* Create parameter list with default values */
83d0521a
CL
5526 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5527 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
f88e1a42
JS
5528
5529 /* Parse -o options */
5530 if (options) {
83d0521a
CL
5531 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5532 error_setg(errp, "Invalid options for file format '%s'", fmt);
f88e1a42
JS
5533 goto out;
5534 }
5535 }
5536
5537 if (base_filename) {
83d0521a 5538 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
71c79813
LC
5539 error_setg(errp, "Backing file not supported for file format '%s'",
5540 fmt);
f88e1a42
JS
5541 goto out;
5542 }
5543 }
5544
5545 if (base_fmt) {
83d0521a 5546 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
71c79813
LC
5547 error_setg(errp, "Backing file format not supported for file "
5548 "format '%s'", fmt);
f88e1a42
JS
5549 goto out;
5550 }
5551 }
5552
83d0521a
CL
5553 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5554 if (backing_file) {
5555 if (!strcmp(filename, backing_file)) {
71c79813
LC
5556 error_setg(errp, "Error: Trying to create an image with the "
5557 "same filename as the backing file");
792da93a
JS
5558 goto out;
5559 }
5560 }
5561
83d0521a
CL
5562 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5563 if (backing_fmt) {
5564 backing_drv = bdrv_find_format(backing_fmt);
96df67d1 5565 if (!backing_drv) {
71c79813 5566 error_setg(errp, "Unknown backing file format '%s'",
83d0521a 5567 backing_fmt);
f88e1a42
JS
5568 goto out;
5569 }
5570 }
5571
5572 // The size for the image must always be specified, with one exception:
5573 // If we are using a backing file, we can obtain the size from there
83d0521a
CL
5574 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5575 if (size == -1) {
5576 if (backing_file) {
66f6b814 5577 BlockDriverState *bs;
52bf1e72 5578 int64_t size;
63090dac
PB
5579 int back_flags;
5580
5581 /* backing files always opened read-only */
5582 back_flags =
5583 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5584
f67503e5 5585 bs = NULL;
83d0521a 5586 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
cc84d90f 5587 backing_drv, &local_err);
f88e1a42 5588 if (ret < 0) {
cc84d90f 5589 error_setg_errno(errp, -ret, "Could not open '%s': %s",
83d0521a 5590 backing_file,
cc84d90f
HR
5591 error_get_pretty(local_err));
5592 error_free(local_err);
5593 local_err = NULL;
f88e1a42
JS
5594 goto out;
5595 }
52bf1e72
MA
5596 size = bdrv_getlength(bs);
5597 if (size < 0) {
5598 error_setg_errno(errp, -size, "Could not get size of '%s'",
5599 backing_file);
5600 bdrv_unref(bs);
5601 goto out;
5602 }
f88e1a42 5603
83d0521a 5604 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
66f6b814
HR
5605
5606 bdrv_unref(bs);
f88e1a42 5607 } else {
71c79813 5608 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5609 goto out;
5610 }
5611 }
5612
f382d43a
MR
5613 if (!quiet) {
5614 printf("Formatting '%s', fmt=%s ", filename, fmt);
83d0521a 5615 qemu_opts_print(opts);
f382d43a
MR
5616 puts("");
5617 }
83d0521a 5618
c282e1fd 5619 ret = bdrv_create(drv, filename, opts, &local_err);
83d0521a 5620
cc84d90f
HR
5621 if (ret == -EFBIG) {
5622 /* This is generally a better message than whatever the driver would
5623 * deliver (especially because of the cluster_size_hint), since that
5624 * is most probably not much different from "image too large". */
5625 const char *cluster_size_hint = "";
83d0521a 5626 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
cc84d90f 5627 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5628 }
cc84d90f
HR
5629 error_setg(errp, "The image size is too large for file format '%s'"
5630 "%s", fmt, cluster_size_hint);
5631 error_free(local_err);
5632 local_err = NULL;
f88e1a42
JS
5633 }
5634
5635out:
83d0521a
CL
5636 qemu_opts_del(opts);
5637 qemu_opts_free(create_opts);
84d18f06 5638 if (local_err) {
cc84d90f
HR
5639 error_propagate(errp, local_err);
5640 }
f88e1a42 5641}
/* Return the AioContext this BDS currently runs in */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}
5647
/*
 * Detach @bs (and its file/backing children) from its AioContext.
 * Listeners are notified first, while the old context is still valid;
 * afterwards bs->aio_context is NULL until bdrv_attach_aio_context().
 */
void bdrv_detach_aio_context(BlockDriverState *bs)
{
    BdrvAioNotifier *baf;

    if (!bs->drv) {
        return;
    }

    /* Notify listeners before tearing anything down */
    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
        baf->detach_aio_context(baf->opaque);
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}
5675
/*
 * Attach @bs (and its file/backing children) to @new_context.  Runs the
 * steps in the reverse order of bdrv_detach_aio_context(): children and
 * driver first, registered notifiers last.
 */
void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    BdrvAioNotifier *ban;

    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }

    /* Notify listeners only once the BDS is fully operational again */
    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
        ban->attached_aio_context(new_context, ban->opaque);
    }
}
5704
/* Move @bs to @new_context, draining all in-flight requests first */
void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
d616b224 5718
33384421
HR
5719void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5720 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5721 void (*detach_aio_context)(void *opaque), void *opaque)
5722{
5723 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5724 *ban = (BdrvAioNotifier){
5725 .attached_aio_context = attached_aio_context,
5726 .detach_aio_context = detach_aio_context,
5727 .opaque = opaque
5728 };
5729
5730 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5731}
5732
/*
 * Unregister a notifier previously added with
 * bdrv_add_aio_context_notifier().  All three values must match the
 * original registration; aborts if no matching entry exists (caller
 * bug).
 */
void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*attached_aio_context)(AioContext *,
                                                                   void *),
                                      void (*detach_aio_context)(void *),
                                      void *opaque)
{
    BdrvAioNotifier *ban, *ban_next;

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        if (ban->attached_aio_context == attached_aio_context &&
            ban->detach_aio_context == detach_aio_context &&
            ban->opaque == opaque)
        {
            QLIST_REMOVE(ban, list);
            g_free(ban);

            return;
        }
    }

    abort();
}
5755
/* Register @notifier to be invoked before every write to @bs */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
6f176b48 5761
c282e1fd 5762int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
6f176b48 5763{
c282e1fd 5764 if (!bs->drv->bdrv_amend_options) {
6f176b48
HR
5765 return -ENOTSUP;
5766 }
c282e1fd 5767 return bs->drv->bdrv_amend_options(bs, opts);
6f176b48 5768}
f6186f49 5769
/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test if the given bs is the candidate or to recurse further
 * into the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non block filter driver -> check if the bs is
     * the same as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but doesn't allow recursion -> the
     * candidate cannot be found down this chain */
    return false;
}
5802
212a5a8f
BC
5803/* This function checks if the candidate is the first non filter bs down it's
5804 * bs chain. Since we don't have pointers to parents it explore all bs chains
5805 * from the top. Some filters can choose not to pass down the recursion.
5806 */
5807bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5808{
212a5a8f
BC
5809 BlockDriverState *bs;
5810
5811 /* walk down the bs forest recursively */
5812 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5813 bool perm;
5814
b5042a36 5815 /* try to recurse in this top level bs */
e6dc8a1f 5816 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5817
5818 /* candidate is the first non filter */
5819 if (perm) {
5820 return true;
5821 }
5822 }
5823
5824 return false;
f6186f49 5825}
/*
 * Look up @node_name and validate it may be replaced (used by
 * drive-mirror's 'replaces' option).  Returns the BDS on success, or
 * NULL with @errp set when the node is unknown, blocked, or not the
 * top-most non-filter node.
 */
BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        return NULL;
    }

    /* We don't want arbitrary node of the BDS chain to be replaced only the top
     * most non filter in order to prevent data corruption.
     * Another benefit is that this tests exclude backing files which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        return NULL;
    }

    return to_replace_bs;
}
448ad91d
ML
5851
5852void bdrv_io_plug(BlockDriverState *bs)
5853{
5854 BlockDriver *drv = bs->drv;
5855 if (drv && drv->bdrv_io_plug) {
5856 drv->bdrv_io_plug(bs);
5857 } else if (bs->file) {
5858 bdrv_io_plug(bs->file);
5859 }
5860}
5861
5862void bdrv_io_unplug(BlockDriverState *bs)
5863{
5864 BlockDriver *drv = bs->drv;
5865 if (drv && drv->bdrv_io_unplug) {
5866 drv->bdrv_io_unplug(bs);
5867 } else if (bs->file) {
5868 bdrv_io_unplug(bs->file);
5869 }
5870}
5871
5872void bdrv_flush_io_queue(BlockDriverState *bs)
5873{
5874 BlockDriver *drv = bs->drv;
5875 if (drv && drv->bdrv_flush_io_queue) {
5876 drv->bdrv_flush_io_queue(bs);
5877 } else if (bs->file) {
5878 bdrv_flush_io_queue(bs->file);
5879 }
5880}
/*
 * Copy this BDS's level of driver-specific options from bs->options
 * into @d (each value's refcount is bumped, so @d owns its references).
 * Returns true if at least one option was copied.
 */
static bool append_open_options(QDict *d, BlockDriverState *bs)
{
    const QDictEntry *entry;
    bool found_any = false;

    for (entry = qdict_first(bs->options); entry;
         entry = qdict_next(bs->options, entry))
    {
        /* Only take options for this level and exclude all non-driver-specific
         * options (dotted keys belong to child nodes) */
        if (!strchr(qdict_entry_key(entry), '.') &&
            strcmp(qdict_entry_key(entry), "node-name"))
        {
            qobject_incref(qdict_entry_value(entry));
            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
            found_any = true;
        }
    }

    return found_any;
}
5903
/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the same
 *                    results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here. Otherwise,
 *              full_open_options is converted to a JSON object, prefixed with
 *              "json:" (for use through the JSON pseudo protocol) and put here.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
         * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }
        /* Reconstructing the full options QDict is simple for most format block
         * drivers, as long as the full options are known for the underlying
         * file BDS. The full options QDict of that file BDS should somehow
         * contain a representation of the filename, therefore the following
         * suffices without querying the (exact_)filename of this BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            /* ownership of opts moves to the BDS */
            bs->full_open_options = opts;
        } else {
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        /* No plain filename available; expose the options via the "json:"
         * pseudo-protocol instead */
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}
/* This accessor function purpose is to allow the device models to access the
 * BlockAcctStats structure embedded inside a BlockDriverState without being
 * aware of the BlockDriverState structure layout.
 * It will go away when the BlockAcctStats structure will be moved inside
 * the device models.
 */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
{
    return &bs->stats;
}