]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
block: Move bdrv_fill_options() call to bdrv_open()
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
737e150e
PB
27#include "block/block_int.h"
28#include "block/blockjob.h"
1de7afc9 29#include "qemu/module.h"
7b1b5d19 30#include "qapi/qmp/qjson.h"
9c17d615 31#include "sysemu/sysemu.h"
1de7afc9 32#include "qemu/notify.h"
737e150e 33#include "block/coroutine.h"
c13163fb 34#include "block/qapi.h"
b2023818 35#include "qmp-commands.h"
1de7afc9 36#include "qemu/timer.h"
a5ee7bd4 37#include "qapi-event.h"
fc01f7e7 38
71e72a19 39#ifdef CONFIG_BSD
7674e7bf
FB
40#include <sys/types.h>
41#include <sys/stat.h>
42#include <sys/ioctl.h>
72cf2d4f 43#include <sys/queue.h>
c5e97233 44#ifndef __DragonFly__
7674e7bf
FB
45#include <sys/disk.h>
46#endif
c5e97233 47#endif
7674e7bf 48
49dc768d
AL
49#ifdef _WIN32
50#include <windows.h>
51#endif
52
e4654d2d
FZ
/* Tracks which parts of a BlockDriverState have been written while the
 * bitmap is active; kept on the BDS's dirty_bitmaps list. */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;                   /* the underlying hierarchical bitmap */
    QLIST_ENTRY(BdrvDirtyBitmap) list; /* entry in bs->dirty_bitmaps */
};
57
1c9805a3
SH
58#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59
7d4b4ba5 60static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
61static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 63 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
64static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 66 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
67static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
775aa8b6
KW
73static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 75 BdrvRequestFlags flags);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 78 BdrvRequestFlags flags);
b2a61371
SH
79static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
d20d9b7c 83 BdrvRequestFlags flags,
b2a61371
SH
84 BlockDriverCompletionFunc *cb,
85 void *opaque,
8c5873d6 86 bool is_write);
b2a61371 87static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 88static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
dc364f4c
BC
94static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
8a22f02a
SH
97static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
#ifdef _WIN32
/* Return non-zero if filename begins with a DOS drive specifier ("c:"). */
static int is_windows_drive_prefix(const char *filename)
{
    int is_drive_letter = (filename[0] >= 'a' && filename[0] <= 'z') ||
                          (filename[0] >= 'A' && filename[0] <= 'Z');

    return is_drive_letter && filename[1] == ':';
}

/* Return 1 for bare drive names ("c:") and Windows device paths
 * ("\\.\..." or "//./..."), 0 otherwise. */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
#endif
122
0563e191 123/* throttling disk I/O limits */
cc0681c4
BC
124void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
98f90dba 126{
cc0681c4 127 int i;
98f90dba 128
cc0681c4 129 throttle_config(&bs->throttle_state, cfg);
98f90dba 130
cc0681c4
BC
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 133 }
cc0681c4
BC
134}
135
136/* this function drain all the throttled IOs */
137static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138{
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
142
143 bs->io_limits_enabled = false;
144
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
148 }
149 }
150
151 bs->io_limits_enabled = enabled;
98f90dba 152
cc0681c4 153 return drained;
98f90dba
ZYW
154}
155
cc0681c4 156void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 157{
cc0681c4 158 bs->io_limits_enabled = false;
0563e191 159
cc0681c4
BC
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
163}
164
cc0681c4 165static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 166{
cc0681c4
BC
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
169}
170
cc0681c4 171static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 172{
cc0681c4
BC
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
175}
176
cc0681c4
BC
/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    /* Must not already be enabled; callers pair this with
     * bdrv_io_limits_disable(). */
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
189
/* This function makes an IO wait if needed
 *
 * @bytes: the size of the IO in bytes
 * @is_write: is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this io must wait */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if must wait or any request of this type throttled queue the IO */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        /* blocks the coroutine until a timer callback or another request
         * wakes this queue up again */
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the IO will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);


    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
220
339064d5
KW
221size_t bdrv_opt_mem_align(BlockDriverState *bs)
222{
223 if (!bs || !bs->drv) {
224 /* 4k should be on the safe side */
225 return 4096;
226 }
227
228 return bs->bl.opt_mem_alignment;
229}
230
9e0b22f4
SH
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *sep;

#ifdef _WIN32
    /* Drive letters ("c:") look like protocol prefixes but are not. */
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    sep = path + strcspn(path, ":/\\");
#else
    sep = path + strcspn(path, ":/");
#endif

    /* A protocol prefix is a ':' that occurs before any path separator. */
    return sep[0] == ':';
}
248
83f64091 249int path_is_absolute(const char *path)
3b0d4f61 250{
21664424
FB
251#ifdef _WIN32
252 /* specific case for names like: "\\.\d:" */
f53f4da9 253 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
21664424 254 return 1;
f53f4da9
PB
255 }
256 return (*path == '/' || *path == '\\');
3b9f94e1 257#else
f53f4da9 258 return (*path == '/');
3b9f94e1 259#endif
3b0d4f61
FB
260}
261
83f64091
FB
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* Skip an optional "<protocol>:" prefix in base_path. */
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        /* Find the start of the last path component of base_path. */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            /* On Windows a backslash may be the last separator instead. */
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        /* Use whichever marker (protocol or last separator) is later. */
        if (p1 > p)
            p = p1;
        /* Copy the directory part of base_path, then append filename. */
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
305
dc5a1371
PB
/* Write the full path of bs's backing file into @dest (at most @sz bytes).
 * An empty backing file name or one with a protocol prefix is copied
 * verbatim; otherwise it is resolved relative to the image's filename. */
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}
314
5efa9d5a 315void bdrv_register(BlockDriver *bdrv)
ea2384d3 316{
8c5873d6
SH
317 /* Block drivers without coroutine functions need emulation */
318 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
319 bdrv->bdrv_co_readv = bdrv_co_readv_em;
320 bdrv->bdrv_co_writev = bdrv_co_writev_em;
321
f8c35c1d
SH
322 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
323 * the block driver lacks aio we need to emulate that too.
324 */
f9f05dc5
KW
325 if (!bdrv->bdrv_aio_readv) {
326 /* add AIO emulation layer */
327 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
328 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 329 }
83f64091 330 }
b2e12bc6 331
8a22f02a 332 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 333}
b338082b
FB
334
/* create a new block device (by default it is empty)
 *
 * Returns NULL and sets @errp if @device_name collides with an existing
 * device id or node name. The returned BDS has refcnt 1 and uses the
 * default (main loop) AioContext. */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;
    int i;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    /* Anonymous BDSes (empty name) are not entered in the global list. */
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}
371
d7d512f6
PB
/* Register @notify to be called when @bs is closed. */
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
376
ea2384d3
FB
377BlockDriver *bdrv_find_format(const char *format_name)
378{
379 BlockDriver *drv1;
8a22f02a
SH
380 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
381 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 382 return drv1;
8a22f02a 383 }
ea2384d3
FB
384 }
385 return NULL;
386}
387
b64ec4e4 388static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 389{
b64ec4e4
FZ
390 static const char *whitelist_rw[] = {
391 CONFIG_BDRV_RW_WHITELIST
392 };
393 static const char *whitelist_ro[] = {
394 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
395 };
396 const char **p;
397
b64ec4e4 398 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 399 return 1; /* no whitelist, anything goes */
b64ec4e4 400 }
eb852011 401
b64ec4e4 402 for (p = whitelist_rw; *p; p++) {
eb852011
MA
403 if (!strcmp(drv->format_name, *p)) {
404 return 1;
405 }
406 }
b64ec4e4
FZ
407 if (read_only) {
408 for (p = whitelist_ro; *p; p++) {
409 if (!strcmp(drv->format_name, *p)) {
410 return 1;
411 }
412 }
413 }
eb852011
MA
414 return 0;
415}
416
b64ec4e4
FZ
417BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
418 bool read_only)
eb852011
MA
419{
420 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 421 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
422}
423
5b7e1542
ZYW
/* State shared between bdrv_create() and its coroutine entry point. */
typedef struct CreateCo {
    BlockDriver *drv;   /* driver whose bdrv_create callback runs */
    char *filename;     /* owned copy of the image filename */
    QemuOpts *opts;     /* creation options (not owned) */
    int ret;            /* NOT_DONE while running, then the result */
    Error *err;         /* error detail propagated back to the caller */
} CreateCo;
431
/* Coroutine entry point for bdrv_create(): invokes the driver's
 * bdrv_create callback and stores the result in the shared CreateCo. */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    /* Writing ret last signals completion to the NOT_DONE poll loop. */
    cco->ret = ret;
}
446
0e7e1989 447int bdrv_create(BlockDriver *drv, const char* filename,
83d0521a 448 QemuOpts *opts, Error **errp)
ea2384d3 449{
5b7e1542
ZYW
450 int ret;
451
452 Coroutine *co;
453 CreateCo cco = {
454 .drv = drv,
455 .filename = g_strdup(filename),
83d0521a 456 .opts = opts,
5b7e1542 457 .ret = NOT_DONE,
cc84d90f 458 .err = NULL,
5b7e1542
ZYW
459 };
460
c282e1fd 461 if (!drv->bdrv_create) {
cc84d90f 462 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
463 ret = -ENOTSUP;
464 goto out;
5b7e1542
ZYW
465 }
466
467 if (qemu_in_coroutine()) {
468 /* Fast-path if already in coroutine context */
469 bdrv_create_co_entry(&cco);
470 } else {
471 co = qemu_coroutine_create(bdrv_create_co_entry);
472 qemu_coroutine_enter(co, &cco);
473 while (cco.ret == NOT_DONE) {
474 qemu_aio_wait();
475 }
476 }
477
478 ret = cco.ret;
cc84d90f 479 if (ret < 0) {
84d18f06 480 if (cco.err) {
cc84d90f
HR
481 error_propagate(errp, cco.err);
482 } else {
483 error_setg_errno(errp, -ret, "Could not create image");
484 }
485 }
0e7e1989 486
80168bff
LC
487out:
488 g_free(cco.filename);
5b7e1542 489 return ret;
ea2384d3
FB
490}
491
c282e1fd 492int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
84a12e66
CH
493{
494 BlockDriver *drv;
cc84d90f
HR
495 Error *local_err = NULL;
496 int ret;
84a12e66 497
98289620 498 drv = bdrv_find_protocol(filename, true);
84a12e66 499 if (drv == NULL) {
cc84d90f 500 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 501 return -ENOENT;
84a12e66
CH
502 }
503
c282e1fd 504 ret = bdrv_create(drv, filename, opts, &local_err);
84d18f06 505 if (local_err) {
cc84d90f
HR
506 error_propagate(errp, local_err);
507 }
508 return ret;
84a12e66
CH
509}
510
355ef4ac 511int bdrv_refresh_limits(BlockDriverState *bs)
d34682cd
KW
512{
513 BlockDriver *drv = bs->drv;
514
515 memset(&bs->bl, 0, sizeof(bs->bl));
516
466ad822
KW
517 if (!drv) {
518 return 0;
519 }
520
521 /* Take some limits from the children as a default */
522 if (bs->file) {
523 bdrv_refresh_limits(bs->file);
524 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
339064d5
KW
525 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
526 } else {
527 bs->bl.opt_mem_alignment = 512;
466ad822
KW
528 }
529
530 if (bs->backing_hd) {
531 bdrv_refresh_limits(bs->backing_hd);
532 bs->bl.opt_transfer_length =
533 MAX(bs->bl.opt_transfer_length,
534 bs->backing_hd->bl.opt_transfer_length);
339064d5
KW
535 bs->bl.opt_mem_alignment =
536 MAX(bs->bl.opt_mem_alignment,
537 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
538 }
539
540 /* Then let the driver override it */
541 if (drv->bdrv_refresh_limits) {
d34682cd
KW
542 return drv->bdrv_refresh_limits(bs);
543 }
544
545 return 0;
546}
547
eba25057
JM
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    if (GetTempPath(MAX_PATH, temp_dir) &&
        GetTempFileName(temp_dir, "qem", 0, filename)) {
        return 0;
    }
    return -GetLastError();
#else
    const char *tmpdir = getenv("TMPDIR");
    int fd;

    if (tmpdir == NULL) {
        tmpdir = "/var/tmp";
    }
    /* snprintf reports the length it would have needed, so a return
     * value >= size means the name did not fit. */
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* don't leave a half-created file behind on close failure */
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
fc01f7e7 583
84a12e66
CH
584/*
585 * Detect host devices. By convention, /dev/cdrom[N] is always
586 * recognized as a host CDROM.
587 */
588static BlockDriver *find_hdev_driver(const char *filename)
589{
590 int score_max = 0, score;
591 BlockDriver *drv = NULL, *d;
592
593 QLIST_FOREACH(d, &bdrv_drivers, list) {
594 if (d->bdrv_probe_device) {
595 score = d->bdrv_probe_device(filename);
596 if (score > score_max) {
597 score_max = score;
598 drv = d;
599 }
600 }
601 }
602
603 return drv;
604}
605
98289620
KW
/* Resolve the protocol driver for @filename. Host devices win over an
 * explicit protocol prefix (see XXX below); otherwise a "<protocol>:"
 * prefix selects the driver, and plain paths fall back to "file".
 * @allow_protocol_prefix: whether a prefix may be honoured at all. */
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    /* Extract the "<protocol>" part before the ':' (truncated to fit). */
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
647
f500a6d3 648static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 649 BlockDriver **pdrv, Error **errp)
f3a5d3f8 650{
f500a6d3 651 int score, score_max;
f3a5d3f8
CH
652 BlockDriver *drv1, *drv;
653 uint8_t buf[2048];
f500a6d3 654 int ret = 0;
f8ea0b00 655
08a00559 656 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 657 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
c98ac35d
SW
658 drv = bdrv_find_format("raw");
659 if (!drv) {
34b5d2c6 660 error_setg(errp, "Could not find raw image format");
c98ac35d
SW
661 ret = -ENOENT;
662 }
663 *pdrv = drv;
664 return ret;
1a396859 665 }
f8ea0b00 666
83f64091 667 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 668 if (ret < 0) {
34b5d2c6
HR
669 error_setg_errno(errp, -ret, "Could not read image for determining its "
670 "format");
c98ac35d
SW
671 *pdrv = NULL;
672 return ret;
83f64091
FB
673 }
674
ea2384d3 675 score_max = 0;
84a12e66 676 drv = NULL;
8a22f02a 677 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
678 if (drv1->bdrv_probe) {
679 score = drv1->bdrv_probe(buf, ret, filename);
680 if (score > score_max) {
681 score_max = score;
682 drv = drv1;
683 }
0849bf08 684 }
fc01f7e7 685 }
c98ac35d 686 if (!drv) {
34b5d2c6
HR
687 error_setg(errp, "Could not determine image format: No compatible "
688 "driver found");
c98ac35d
SW
689 ret = -ENOENT;
690 }
691 *pdrv = drv;
692 return ret;
ea2384d3
FB
693}
694
51762288
SH
/**
 * Set the current 'total_sectors' value
 *
 * @hint: fallback sector count used when the driver cannot report a length
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            /* negative length is the driver's errno-style error */
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
718
9e8f1835
PB
719/**
720 * Set open flags for a given discard mode
721 *
722 * Return 0 on success, -1 if the discard mode was invalid.
723 */
724int bdrv_parse_discard_flags(const char *mode, int *flags)
725{
726 *flags &= ~BDRV_O_UNMAP;
727
728 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
729 /* do nothing */
730 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
731 *flags |= BDRV_O_UNMAP;
732 } else {
733 return -1;
734 }
735
736 return 0;
737}
738
c3993cdc
SH
739/**
740 * Set open flags for a given cache mode
741 *
742 * Return 0 on success, -1 if the cache mode was invalid.
743 */
744int bdrv_parse_cache_flags(const char *mode, int *flags)
745{
746 *flags &= ~BDRV_O_CACHE_MASK;
747
748 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
749 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
750 } else if (!strcmp(mode, "directsync")) {
751 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
752 } else if (!strcmp(mode, "writeback")) {
753 *flags |= BDRV_O_CACHE_WB;
754 } else if (!strcmp(mode, "unsafe")) {
755 *flags |= BDRV_O_CACHE_WB;
756 *flags |= BDRV_O_NO_FLUSH;
757 } else if (!strcmp(mode, "writethrough")) {
758 /* this is the default */
759 } else {
760 return -1;
761 }
762
763 return 0;
764}
765
53fec9d3
SH
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}
775
/* Drop one copy-on-read reference; pairs with bdrv_enable_copy_on_read(). */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
781
b1e6fc08
KW
782/*
783 * Returns the flags that a temporary snapshot should get, based on the
784 * originally requested flags (the originally requested image will have flags
785 * like a backing file)
786 */
787static int bdrv_temp_snapshot_flags(int flags)
788{
789 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
790}
791
0b50cc88
KW
792/*
793 * Returns the flags that bs->file should get, based on the given flags for
794 * the parent BDS
795 */
796static int bdrv_inherited_flags(int flags)
797{
798 /* Enable protocol handling, disable format probing for bs->file */
799 flags |= BDRV_O_PROTOCOL;
800
801 /* Our block drivers take care to send flushes and respect unmap policy,
802 * so we can enable both unconditionally on lower layers. */
803 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
804
0b50cc88 805 /* Clear flags that only apply to the top layer */
5669b44d 806 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
807
808 return flags;
809}
810
317fc44e
KW
811/*
812 * Returns the flags that bs->backing_hd should get, based on the given flags
813 * for the parent BDS
814 */
815static int bdrv_backing_flags(int flags)
816{
817 /* backing files always opened read-only */
818 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
819
820 /* snapshot=on is handled on the top layer */
8bfea15d 821 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
822
823 return flags;
824}
825
7b272452
KW
826static int bdrv_open_flags(BlockDriverState *bs, int flags)
827{
828 int open_flags = flags | BDRV_O_CACHE_WB;
829
830 /*
831 * Clear flags that are internal to the block layer before opening the
832 * image.
833 */
834 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
835
836 /*
837 * Snapshots should be writable.
838 */
8bfea15d 839 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
840 open_flags |= BDRV_O_RDWR;
841 }
842
843 return open_flags;
844}
845
636ea370
KW
846static void bdrv_assign_node_name(BlockDriverState *bs,
847 const char *node_name,
848 Error **errp)
6913c0c2
BC
849{
850 if (!node_name) {
636ea370 851 return;
6913c0c2
BC
852 }
853
854 /* empty string node name is invalid */
855 if (node_name[0] == '\0') {
856 error_setg(errp, "Empty node name");
636ea370 857 return;
6913c0c2
BC
858 }
859
0c5e94ee
BC
860 /* takes care of avoiding namespaces collisions */
861 if (bdrv_find(node_name)) {
862 error_setg(errp, "node-name=%s is conflicting with a device id",
863 node_name);
636ea370 864 return;
0c5e94ee
BC
865 }
866
6913c0c2
BC
867 /* takes care of avoiding duplicates node names */
868 if (bdrv_find_node(node_name)) {
869 error_setg(errp, "Duplicate node name");
636ea370 870 return;
6913c0c2
BC
871 }
872
873 /* copy node name into the bs and insert it into the graph list */
874 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
875 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
876}
877
57915332
KW
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 *
 * On success the image is open and, unless drv is a protocol driver used
 * directly, bs->file points to @file. On failure bs is restored to a
 * driverless state and a negative errno is returned (detail in @errp).
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
                            QDict *options, int flags, BlockDriver *drv,
                            Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        /* distinguish "driver is read-only-only" from "not whitelisted" */
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    /* undo partial initialization so the BDS can be reused or freed */
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
1007
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 *
 * Only acts on protocol-level opens (BDRV_O_PROTOCOL); format-level opens
 * return 0 immediately. Ensures *options carries both "filename" (when
 * available) and "driver" entries. Returns 0 or a negative errno.
 */
static int bdrv_fill_options(QDict **options, const char *filename, int flags,
                             Error **errp)
{
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;
    BlockDriver *drv;

    if (!protocol) {
        return 0;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            /* only a legacy filename argument gets driver-specific parsing */
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                       "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (!drvname) {
        if (filename) {
            drv = bdrv_find_protocol(filename, parse_filename);
            if (!drv) {
                error_setg(errp, "Unknown protocol");
                return -EINVAL;
            }

            drvname = drv->format_name;
            qdict_put(*options, "driver", qstring_from_str(drvname));
        } else {
            error_setg(errp, "Must specify either driver or file");
            return -EINVAL;
        }
    }

    drv = bdrv_find_format(drvname);
    if (!drv) {
        error_setg(errp, "Unknown driver '%s'", drvname);
        return -ENOENT;
    }

    /* Driver-specific filename parsing */
    if (drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            /* the parsed options fully replace the filename string */
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
1078
1079/*
1080 * Opens a file using a protocol (file, host_device, nbd, ...)
1081 *
1082 * options is an indirect pointer to a QDict of options to pass to the block
1083 * drivers, or pointer to NULL for an empty set of options. If this function
1084 * takes ownership of the QDict reference, it will set *options to NULL;
1085 * otherwise, it will contain unused/unrecognized options after this function
1086 * returns. Then, the caller is responsible for freeing it. If it intends to
1087 * reuse the QDict, QINCREF() should be called beforehand.
1088 */
462f5bcf
KW
1089static int bdrv_file_open(BlockDriverState *bs, QDict **options, int flags,
1090 Error **errp)
f54120ff
KW
1091{
1092 BlockDriver *drv;
462f5bcf 1093 const char *filename;
f54120ff
KW
1094 const char *drvname;
1095 Error *local_err = NULL;
1096 int ret;
1097
f54120ff
KW
1098 filename = qdict_get_try_str(*options, "filename");
1099 drvname = qdict_get_str(*options, "driver");
1100
1101 drv = bdrv_find_format(drvname);
1102 assert(drv);
1103 qdict_del(*options, "driver");
1104
1105 /* Open the file */
505d7583 1106 if (!drv->bdrv_file_open) {
5acd9d81
HR
1107 ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
1108 *options = NULL;
505d7583 1109 } else {
5acd9d81 1110 ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
505d7583 1111 }
83f64091 1112 if (ret < 0) {
34b5d2c6 1113 error_propagate(errp, local_err);
707ff828
KW
1114 goto fail;
1115 }
1116
71d0770c 1117 bs->growable = 1;
83f64091 1118 return 0;
707ff828
KW
1119
1120fail:
707ff828 1121 return ret;
83f64091
FB
1122}
1123
/*
 * Installs @backing_hd as the backing file of @bs (or detaches the current
 * one when @backing_hd is NULL), keeping bs->backing_file/backing_format and
 * the op blocker bookkeeping in sync. Does not take or drop references to
 * @backing_hd; the caller manages refcounts.
 */
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        /* Replacing an existing backing file: lift the blockers we put on
         * the old one before letting go of it. */
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        /* First backing file: create the blocker Error used to mark the
         * backing BDS as in-use by this device. */
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bs->device_name);
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        /* Detached: the blocker is no longer needed */
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    /* Limits (alignment, max transfer, ...) may depend on the backing file */
    bdrv_refresh_limits(bs);
}
1154
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 *
 * Returns 0 on success (including the no-op cases), negative errno otherwise.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        /* Already open: drop the options reference we own and succeed */
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        /* Filename comes from the options; no legacy filename needed */
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* Neither an image-header backing file nor options: nothing to do */
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    backing_hd = bdrv_new("", errp);

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    /* bdrv_open() consumes the options reference */
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        /* Remember that opening failed so a reopen doesn't retry silently */
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    /* Transfers the blocker bookkeeping; backing_hd's reference is kept */
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
1216
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is false and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    /* Pull all "${bdref_key}." sub-options out of @options into their own
     * QDict (with the prefix stripped). */
    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        /* No image specified in any of the three possible ways */
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        /* bdrv_open() was not called, so drop the extracted options here */
        QDECREF(image_options);
        goto done;
    }

    /* bdrv_open() consumes the image_options reference */
    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    /* Remove the (string-valued) BlockdevRef itself in all cases */
    qdict_del(options, bdref_key);
    return ret;
}
1268
b1e6fc08 1269void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1270{
1271 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1272 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d
KW
1273 int64_t total_size;
1274 BlockDriver *bdrv_qcow2;
83d0521a 1275 QemuOpts *opts = NULL;
b998875d
KW
1276 QDict *snapshot_options;
1277 BlockDriverState *bs_snapshot;
1278 Error *local_err;
1279 int ret;
1280
1281 /* if snapshot, we create a temporary backing file and open it
1282 instead of opening 'filename' directly */
1283
1284 /* Get the required size from the image */
f187743a
KW
1285 total_size = bdrv_getlength(bs);
1286 if (total_size < 0) {
1287 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1288 goto out;
f187743a
KW
1289 }
1290 total_size &= BDRV_SECTOR_MASK;
b998875d
KW
1291
1292 /* Create the temporary image */
1ba4b6a5 1293 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1294 if (ret < 0) {
1295 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1296 goto out;
b998875d
KW
1297 }
1298
1299 bdrv_qcow2 = bdrv_find_format("qcow2");
c282e1fd
CL
1300 opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1301 &error_abort);
83d0521a 1302 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
c282e1fd 1303 ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
83d0521a 1304 qemu_opts_del(opts);
b998875d
KW
1305 if (ret < 0) {
1306 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1307 "'%s': %s", tmp_filename,
1308 error_get_pretty(local_err));
1309 error_free(local_err);
1ba4b6a5 1310 goto out;
b998875d
KW
1311 }
1312
1313 /* Prepare a new options QDict for the temporary file */
1314 snapshot_options = qdict_new();
1315 qdict_put(snapshot_options, "file.driver",
1316 qstring_from_str("file"));
1317 qdict_put(snapshot_options, "file.filename",
1318 qstring_from_str(tmp_filename));
1319
98522f63 1320 bs_snapshot = bdrv_new("", &error_abort);
b998875d
KW
1321
1322 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
b1e6fc08 1323 flags, bdrv_qcow2, &local_err);
b998875d
KW
1324 if (ret < 0) {
1325 error_propagate(errp, local_err);
1ba4b6a5 1326 goto out;
b998875d
KW
1327 }
1328
1329 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1330
1331out:
1332 g_free(tmp_filename);
b998875d
KW
1333}
1334
4993f7ea
HR
1335static QDict *parse_json_filename(const char *filename, Error **errp)
1336{
1337 QObject *options_obj;
1338 QDict *options;
1339 int ret;
1340
1341 ret = strstart(filename, "json:", &filename);
1342 assert(ret);
1343
1344 options_obj = qobject_from_json(filename);
1345 if (!options_obj) {
1346 error_setg(errp, "Could not parse the JSON options");
1347 return NULL;
1348 }
1349
1350 if (qobject_type(options_obj) != QTYPE_QDICT) {
1351 qobject_decref(options_obj);
1352 error_setg(errp, "Invalid JSON object given");
1353 return NULL;
1354 }
1355
1356 options = qobject_to_qdict(options_obj);
1357 qdict_flatten(options);
1358
1359 return options;
1360}
1361
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    /* Referencing an existing BDS: just take a reference and return it */
    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* "json:{...}" filenames are converted into options instead */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            ret = -EINVAL;
            goto fail;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(options, json_options, false);
        QDECREF(json_options);
        filename = NULL;
    }

    ret = bdrv_fill_options(&options, filename, flags, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Keep the full option set in bs->options; work on a shallow copy so the
     * recognized options can be consumed one by one below. */
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Protocol-level open is handled entirely by bdrv_file_open() */
    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            /* Partially opened: must go through the close path */
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }
    if (flags & BDRV_O_SNAPSHOT) {
        /* Remember the flags for the temporary overlay; open the image
         * itself with backing-file semantics (read-only). */
        snapshot_flags = bdrv_temp_snapshot_flags(flags);
        flags = bdrv_backing_flags(flags);
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_inherited_flags(flags),
                          true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        if (file) {
            /* Probe the format from the image content */
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* The format driver may have taken its own reference (or none at all);
     * drop ours unless it became bs->file. */
    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }


done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
1600
/* One element of a BlockReopenQueue: the staged reopen state for a single
 * BlockDriverState, plus whether its prepare phase already succeeded (so
 * bdrv_reopen_multiple() knows which entries need an abort on rollback). */
typedef struct BlockReopenQueueEntry {
    bool prepared;                               /* prepare() succeeded */
    BDRVReopenState state;                       /* staged flags/driver state */
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry; /* queue linkage */
} BlockReopenQueueEntry;
1606
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
 * already performed, or alternatively may be NULL a new BlockReopenQueue will
 * be created and initialized. This newly created BlockReopenQueue should be
 * passed back in for subsequent calls that are intended to be of the same
 * atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        /* Recurse so the protocol-level BDS is queued (and thus reopened)
         * before the format-level one that depends on it. */
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
1651
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previous
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandonded, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 * The queue and its entries are freed before returning, in all cases.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    /* No requests may be in flight while flags change underneath them */
    bdrv_drain_all();

    /* Phase 1: prepare every entry; stop at the first failure */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    /* On failure, roll back every entry that had been prepared */
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
1704
1705
1706/* Reopen a single BlockDriverState with the specified flags. */
1707int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1708{
1709 int ret = -1;
1710 Error *local_err = NULL;
1711 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1712
1713 ret = bdrv_reopen_multiple(queue, &local_err);
1714 if (local_err != NULL) {
1715 error_propagate(errp, local_err);
1716 }
1717 return ret;
1718}
1719
1720
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    /* Flush so no dirty data depends on the soon-to-change open state */
    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                /* Driver reported failure without details */
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
1792
/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    /* Limits may change with the new flags (e.g. alignment requirements) */
    bdrv_refresh_limits(reopen_state->bs);
}
1819
1820/*
1821 * Abort the reopen, and delete and free the staged changes in
1822 * reopen_state
1823 */
1824void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1825{
1826 BlockDriver *drv;
1827
1828 assert(reopen_state != NULL);
1829 drv = reopen_state->bs->drv;
1830 assert(drv != NULL);
1831
1832 if (drv->bdrv_reopen_abort) {
1833 drv->bdrv_reopen_abort(reopen_state);
1834 }
1835}
1836
1837
/*
 * Closes a BlockDriverState: cancels its job, drains all I/O, detaches the
 * backing file, invokes the driver's close callback and resets the BDS to a
 * pristine (unopened) state. The BDS itself is not freed.
 */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            /* Detach first so the blocker bookkeeping is undone before the
             * reference is dropped. */
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        /* Reset per-open state so the BDS can be reused by a later open */
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1883
/*
 * Closes every BlockDriverState on the global bdrv_states list, taking each
 * one's AioContext lock around the close.
 */
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}
1896
88266f5a
SH
1897/* Check if any requests are in-flight (including throttled requests) */
1898static bool bdrv_requests_pending(BlockDriverState *bs)
1899{
1900 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1901 return true;
1902 }
cc0681c4
BC
1903 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1904 return true;
1905 }
1906 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1907 return true;
1908 }
1909 if (bs->file && bdrv_requests_pending(bs->file)) {
1910 return true;
1911 }
1912 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1913 return true;
1914 }
1915 return false;
1916}
1917
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete. Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);
            bool bs_busy;

            aio_context_acquire(aio_context);
            /* Kick throttled requests so they can complete */
            bdrv_start_throttled_reqs(bs);
            bs_busy = bdrv_requests_pending(bs);
            /* Block in aio_poll() only while this BDS still has requests */
            bs_busy |= aio_poll(aio_context, bs_busy);
            aio_context_release(aio_context);

            busy |= bs_busy;
        }
    }
}
1953
dc364f4c
BC
1954/* make a BlockDriverState anonymous by removing from bdrv_state and
1955 * graph_bdrv_state list.
d22b2f41
RH
1956 Also, NULL terminate the device_name to prevent double remove */
1957void bdrv_make_anon(BlockDriverState *bs)
1958{
1959 if (bs->device_name[0] != '\0') {
dc364f4c 1960 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
d22b2f41
RH
1961 }
1962 bs->device_name[0] = '\0';
dc364f4c
BC
1963 if (bs->node_name[0] != '\0') {
1964 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1965 }
1966 bs->node_name[0] = '\0';
d22b2f41
RH
1967}
1968
e023b2e2
PB
1969static void bdrv_rebind(BlockDriverState *bs)
1970{
1971 if (bs->drv && bs->drv->bdrv_rebind) {
1972 bs->drv->bdrv_rebind(bs);
1973 }
1974}
1975
/*
 * Copies the fields that must stay attached to the guest device from
 * @bs_src to @bs_dest. Used by bdrv_swap() before/after exchanging the
 * full BDS contents.
 */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;
    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
8802d1fd 2022
4ddc07ca
PB
2023/*
2024 * Swap bs contents for two image chains while they are live,
2025 * while keeping required fields on the BlockDriverState that is
2026 * actually attached to a device.
2027 *
2028 * This will modify the BlockDriverState fields, and swap contents
2029 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2030 *
2031 * bs_new is required to be anonymous.
2032 *
2033 * This function does not create any image files.
2034 */
2035void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2036{
2037 BlockDriverState tmp;
f6801b83 2038
90ce8a06
BC
2039 /* The code needs to swap the node_name but simply swapping node_list won't
2040 * work so first remove the nodes from the graph list, do the swap then
2041 * insert them back if needed.
2042 */
2043 if (bs_new->node_name[0] != '\0') {
2044 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2045 }
2046 if (bs_old->node_name[0] != '\0') {
2047 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2048 }
2049
4ddc07ca
PB
2050 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2051 assert(bs_new->device_name[0] == '\0');
e4654d2d 2052 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
4ddc07ca
PB
2053 assert(bs_new->job == NULL);
2054 assert(bs_new->dev == NULL);
4ddc07ca 2055 assert(bs_new->io_limits_enabled == false);
cc0681c4 2056 assert(!throttle_have_timer(&bs_new->throttle_state));
8802d1fd 2057
4ddc07ca
PB
2058 tmp = *bs_new;
2059 *bs_new = *bs_old;
2060 *bs_old = tmp;
a9fc4408 2061
4ddc07ca
PB
2062 /* there are some fields that should not be swapped, move them back */
2063 bdrv_move_feature_fields(&tmp, bs_old);
2064 bdrv_move_feature_fields(bs_old, bs_new);
2065 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 2066
4ddc07ca
PB
2067 /* bs_new shouldn't be in bdrv_states even after the swap! */
2068 assert(bs_new->device_name[0] == '\0');
2069
2070 /* Check a few fields that should remain attached to the device */
2071 assert(bs_new->dev == NULL);
2072 assert(bs_new->job == NULL);
4ddc07ca 2073 assert(bs_new->io_limits_enabled == false);
cc0681c4 2074 assert(!throttle_have_timer(&bs_new->throttle_state));
e023b2e2 2075
90ce8a06
BC
2076 /* insert the nodes back into the graph node list if needed */
2077 if (bs_new->node_name[0] != '\0') {
2078 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2079 }
2080 if (bs_old->node_name[0] != '\0') {
2081 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2082 }
2083
e023b2e2 2084 bdrv_rebind(bs_new);
4ddc07ca
PB
2085 bdrv_rebind(bs_old);
2086}
2087
2088/*
2089 * Add new bs contents at the top of an image chain while the chain is
2090 * live, while keeping required fields on the top layer.
2091 *
2092 * This will modify the BlockDriverState fields, and swap contents
2093 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2094 *
2095 * bs_new is required to be anonymous.
2096 *
2097 * This function does not create any image files.
2098 */
2099void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2100{
2101 bdrv_swap(bs_new, bs_top);
2102
2103 /* The contents of 'tmp' will become bs_top, as we are
2104 * swapping bs_new and bs_top contents. */
8d24cce1 2105 bdrv_set_backing_hd(bs_top, bs_new);
8802d1fd
JC
2106}
2107
4f6fd349 2108static void bdrv_delete(BlockDriverState *bs)
b338082b 2109{
fa879d62 2110 assert(!bs->dev);
3e914655 2111 assert(!bs->job);
3718d8ab 2112 assert(bdrv_op_blocker_is_empty(bs));
4f6fd349 2113 assert(!bs->refcnt);
e4654d2d 2114 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 2115
e1b5c52e
SH
2116 bdrv_close(bs);
2117
1b7bdbc1 2118 /* remove from list, if necessary */
d22b2f41 2119 bdrv_make_anon(bs);
34c6f050 2120
7267c094 2121 g_free(bs);
fc01f7e7
FB
2122}
2123
fa879d62
MA
2124int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2125/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 2126{
fa879d62 2127 if (bs->dev) {
18846dee
MA
2128 return -EBUSY;
2129 }
fa879d62 2130 bs->dev = dev;
28a7282a 2131 bdrv_iostatus_reset(bs);
18846dee
MA
2132 return 0;
2133}
2134
fa879d62
MA
2135/* TODO qdevified devices don't use this, remove when devices are qdevified */
2136void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 2137{
fa879d62
MA
2138 if (bdrv_attach_dev(bs, dev) < 0) {
2139 abort();
2140 }
2141}
2142
2143void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2144/* TODO change to DeviceState *dev when all users are qdevified */
2145{
2146 assert(bs->dev == dev);
2147 bs->dev = NULL;
0e49de52
MA
2148 bs->dev_ops = NULL;
2149 bs->dev_opaque = NULL;
1b7fd729 2150 bs->guest_block_size = 512;
18846dee
MA
2151}
2152
fa879d62
MA
2153/* TODO change to return DeviceState * when all users are qdevified */
2154void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 2155{
fa879d62 2156 return bs->dev;
18846dee
MA
2157}
2158
0e49de52
MA
2159void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2160 void *opaque)
2161{
2162 bs->dev_ops = ops;
2163 bs->dev_opaque = opaque;
2164}
2165
7d4b4ba5 2166static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 2167{
145feb17 2168 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 2169 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 2170 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
2171 if (tray_was_closed) {
2172 /* tray open */
a5ee7bd4
WX
2173 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2174 true, &error_abort);
6f382ed2
LC
2175 }
2176 if (load) {
2177 /* tray close */
a5ee7bd4
WX
2178 qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2179 false, &error_abort);
6f382ed2 2180 }
145feb17
MA
2181 }
2182}
2183
2c6942fa
MA
2184bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2185{
2186 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2187}
2188
025ccaa7
PB
2189void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2190{
2191 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2192 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2193 }
2194}
2195
e4def80b
MA
2196bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2197{
2198 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2199 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2200 }
2201 return false;
2202}
2203
145feb17
MA
2204static void bdrv_dev_resize_cb(BlockDriverState *bs)
2205{
2206 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2207 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
2208 }
2209}
2210
f107639a
MA
2211bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2212{
2213 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2214 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2215 }
2216 return false;
2217}
2218
e97fc193
AL
2219/*
2220 * Run consistency checks on an image
2221 *
e076f338 2222 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2223 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2224 * check are stored in res.
e97fc193 2225 */
4534ff54 2226int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193
AL
2227{
2228 if (bs->drv->bdrv_check == NULL) {
2229 return -ENOTSUP;
2230 }
2231
e076f338 2232 memset(res, 0, sizeof(*res));
4534ff54 2233 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2234}
2235
8a426614
KW
2236#define COMMIT_BUF_SECTORS 2048
2237
33e3963e
FB
2238/* commit COW file into the raw image */
2239int bdrv_commit(BlockDriverState *bs)
2240{
19cb3738 2241 BlockDriver *drv = bs->drv;
72706ea4 2242 int64_t sector, total_sectors, length, backing_length;
8a426614 2243 int n, ro, open_flags;
0bce597d 2244 int ret = 0;
72706ea4 2245 uint8_t *buf = NULL;
c2cba3d9 2246 char filename[PATH_MAX];
33e3963e 2247
19cb3738
FB
2248 if (!drv)
2249 return -ENOMEDIUM;
4dca4b63
NS
2250
2251 if (!bs->backing_hd) {
2252 return -ENOTSUP;
33e3963e
FB
2253 }
2254
3718d8ab
FZ
2255 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2256 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2d3735d3
SH
2257 return -EBUSY;
2258 }
2259
4dca4b63 2260 ro = bs->backing_hd->read_only;
c2cba3d9
JM
2261 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2262 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
4dca4b63
NS
2263 open_flags = bs->backing_hd->open_flags;
2264
2265 if (ro) {
0bce597d
JC
2266 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2267 return -EACCES;
4dca4b63 2268 }
ea2384d3 2269 }
33e3963e 2270
72706ea4
JC
2271 length = bdrv_getlength(bs);
2272 if (length < 0) {
2273 ret = length;
2274 goto ro_cleanup;
2275 }
2276
2277 backing_length = bdrv_getlength(bs->backing_hd);
2278 if (backing_length < 0) {
2279 ret = backing_length;
2280 goto ro_cleanup;
2281 }
2282
2283 /* If our top snapshot is larger than the backing file image,
2284 * grow the backing file image if possible. If not possible,
2285 * we must return an error */
2286 if (length > backing_length) {
2287 ret = bdrv_truncate(bs->backing_hd, length);
2288 if (ret < 0) {
2289 goto ro_cleanup;
2290 }
2291 }
2292
2293 total_sectors = length >> BDRV_SECTOR_BITS;
7267c094 2294 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
2295
2296 for (sector = 0; sector < total_sectors; sector += n) {
d663640c
PB
2297 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2298 if (ret < 0) {
2299 goto ro_cleanup;
2300 }
2301 if (ret) {
dabfa6cc
KW
2302 ret = bdrv_read(bs, sector, buf, n);
2303 if (ret < 0) {
8a426614
KW
2304 goto ro_cleanup;
2305 }
2306
dabfa6cc
KW
2307 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2308 if (ret < 0) {
8a426614
KW
2309 goto ro_cleanup;
2310 }
ea2384d3 2311 }
33e3963e 2312 }
95389c86 2313
1d44952f
CH
2314 if (drv->bdrv_make_empty) {
2315 ret = drv->bdrv_make_empty(bs);
dabfa6cc
KW
2316 if (ret < 0) {
2317 goto ro_cleanup;
2318 }
1d44952f
CH
2319 bdrv_flush(bs);
2320 }
95389c86 2321
3f5075ae
CH
2322 /*
2323 * Make sure all data we wrote to the backing device is actually
2324 * stable on disk.
2325 */
dabfa6cc 2326 if (bs->backing_hd) {
3f5075ae 2327 bdrv_flush(bs->backing_hd);
dabfa6cc 2328 }
4dca4b63 2329
dabfa6cc 2330 ret = 0;
4dca4b63 2331ro_cleanup:
7267c094 2332 g_free(buf);
4dca4b63
NS
2333
2334 if (ro) {
0bce597d
JC
2335 /* ignoring error return here */
2336 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
2337 }
2338
1d44952f 2339 return ret;
33e3963e
FB
2340}
2341
e8877497 2342int bdrv_commit_all(void)
6ab4b5ab
MA
2343{
2344 BlockDriverState *bs;
2345
dc364f4c 2346 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
2347 AioContext *aio_context = bdrv_get_aio_context(bs);
2348
2349 aio_context_acquire(aio_context);
272d2d8e
JC
2350 if (bs->drv && bs->backing_hd) {
2351 int ret = bdrv_commit(bs);
2352 if (ret < 0) {
ed78cda3 2353 aio_context_release(aio_context);
272d2d8e
JC
2354 return ret;
2355 }
e8877497 2356 }
ed78cda3 2357 aio_context_release(aio_context);
6ab4b5ab 2358 }
e8877497 2359 return 0;
6ab4b5ab
MA
2360}
2361
dbffbdcf
SH
2362/**
2363 * Remove an active request from the tracked requests list
2364 *
2365 * This function should be called when a tracked request is completing.
2366 */
2367static void tracked_request_end(BdrvTrackedRequest *req)
2368{
2dbafdc0
KW
2369 if (req->serialising) {
2370 req->bs->serialising_in_flight--;
2371 }
2372
dbffbdcf 2373 QLIST_REMOVE(req, list);
f4658285 2374 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
2375}
2376
2377/**
2378 * Add an active request to the tracked requests list
2379 */
2380static void tracked_request_begin(BdrvTrackedRequest *req,
2381 BlockDriverState *bs,
793ed47a
KW
2382 int64_t offset,
2383 unsigned int bytes, bool is_write)
dbffbdcf
SH
2384{
2385 *req = (BdrvTrackedRequest){
2386 .bs = bs,
2dbafdc0
KW
2387 .offset = offset,
2388 .bytes = bytes,
2389 .is_write = is_write,
2390 .co = qemu_coroutine_self(),
2391 .serialising = false,
7327145f
KW
2392 .overlap_offset = offset,
2393 .overlap_bytes = bytes,
dbffbdcf
SH
2394 };
2395
f4658285
SH
2396 qemu_co_queue_init(&req->wait_queue);
2397
dbffbdcf
SH
2398 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2399}
2400
e96126ff 2401static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2dbafdc0 2402{
7327145f 2403 int64_t overlap_offset = req->offset & ~(align - 1);
e96126ff
KW
2404 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2405 - overlap_offset;
7327145f 2406
2dbafdc0
KW
2407 if (!req->serialising) {
2408 req->bs->serialising_in_flight++;
2409 req->serialising = true;
2410 }
7327145f
KW
2411
2412 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2413 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2dbafdc0
KW
2414}
2415
d83947ac
SH
2416/**
2417 * Round a region to cluster boundaries
2418 */
343bded4
PB
2419void bdrv_round_to_clusters(BlockDriverState *bs,
2420 int64_t sector_num, int nb_sectors,
2421 int64_t *cluster_sector_num,
2422 int *cluster_nb_sectors)
d83947ac
SH
2423{
2424 BlockDriverInfo bdi;
2425
2426 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2427 *cluster_sector_num = sector_num;
2428 *cluster_nb_sectors = nb_sectors;
2429 } else {
2430 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2431 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2432 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2433 nb_sectors, c);
2434 }
2435}
2436
7327145f 2437static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2438{
2439 BlockDriverInfo bdi;
7327145f 2440 int ret;
793ed47a 2441
7327145f
KW
2442 ret = bdrv_get_info(bs, &bdi);
2443 if (ret < 0 || bdi.cluster_size == 0) {
2444 return bs->request_alignment;
793ed47a 2445 } else {
7327145f 2446 return bdi.cluster_size;
793ed47a
KW
2447 }
2448}
2449
f4658285 2450static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2451 int64_t offset, unsigned int bytes)
2452{
d83947ac 2453 /* aaaa bbbb */
7327145f 2454 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2455 return false;
2456 }
2457 /* bbbb aaaa */
7327145f 2458 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2459 return false;
2460 }
2461 return true;
f4658285
SH
2462}
2463
28de2dcd 2464static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
f4658285 2465{
2dbafdc0 2466 BlockDriverState *bs = self->bs;
f4658285
SH
2467 BdrvTrackedRequest *req;
2468 bool retry;
28de2dcd 2469 bool waited = false;
f4658285 2470
2dbafdc0 2471 if (!bs->serialising_in_flight) {
28de2dcd 2472 return false;
2dbafdc0
KW
2473 }
2474
f4658285
SH
2475 do {
2476 retry = false;
2477 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2dbafdc0 2478 if (req == self || (!req->serialising && !self->serialising)) {
65afd211
KW
2479 continue;
2480 }
7327145f
KW
2481 if (tracked_request_overlaps(req, self->overlap_offset,
2482 self->overlap_bytes))
2483 {
5f8b6491
SH
2484 /* Hitting this means there was a reentrant request, for
2485 * example, a block driver issuing nested requests. This must
2486 * never happen since it means deadlock.
2487 */
2488 assert(qemu_coroutine_self() != req->co);
2489
6460440f
KW
2490 /* If the request is already (indirectly) waiting for us, or
2491 * will wait for us as soon as it wakes up, then just go on
2492 * (instead of producing a deadlock in the former case). */
2493 if (!req->waiting_for) {
2494 self->waiting_for = req;
2495 qemu_co_queue_wait(&req->wait_queue);
2496 self->waiting_for = NULL;
2497 retry = true;
28de2dcd 2498 waited = true;
6460440f
KW
2499 break;
2500 }
f4658285
SH
2501 }
2502 }
2503 } while (retry);
28de2dcd
KW
2504
2505 return waited;
f4658285
SH
2506}
2507
756e6736
KW
2508/*
2509 * Return values:
2510 * 0 - success
2511 * -EINVAL - backing format specified, but no file
2512 * -ENOSPC - can't update the backing file because no space is left in the
2513 * image file header
2514 * -ENOTSUP - format driver doesn't support changing the backing file
2515 */
2516int bdrv_change_backing_file(BlockDriverState *bs,
2517 const char *backing_file, const char *backing_fmt)
2518{
2519 BlockDriver *drv = bs->drv;
469ef350 2520 int ret;
756e6736 2521
5f377794
PB
2522 /* Backing file format doesn't make sense without a backing file */
2523 if (backing_fmt && !backing_file) {
2524 return -EINVAL;
2525 }
2526
756e6736 2527 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2528 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2529 } else {
469ef350 2530 ret = -ENOTSUP;
756e6736 2531 }
469ef350
PB
2532
2533 if (ret == 0) {
2534 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2535 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2536 }
2537 return ret;
756e6736
KW
2538}
2539
6ebdcee2
JC
2540/*
2541 * Finds the image layer in the chain that has 'bs' as its backing file.
2542 *
2543 * active is the current topmost image.
2544 *
2545 * Returns NULL if bs is not found in active's image chain,
2546 * or if active == bs.
2547 */
2548BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2549 BlockDriverState *bs)
2550{
2551 BlockDriverState *overlay = NULL;
2552 BlockDriverState *intermediate;
2553
2554 assert(active != NULL);
2555 assert(bs != NULL);
2556
2557 /* if bs is the same as active, then by definition it has no overlay
2558 */
2559 if (active == bs) {
2560 return NULL;
2561 }
2562
2563 intermediate = active;
2564 while (intermediate->backing_hd) {
2565 if (intermediate->backing_hd == bs) {
2566 overlay = intermediate;
2567 break;
2568 }
2569 intermediate = intermediate->backing_hd;
2570 }
2571
2572 return overlay;
2573}
2574
2575typedef struct BlkIntermediateStates {
2576 BlockDriverState *bs;
2577 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2578} BlkIntermediateStates;
2579
2580
2581/*
2582 * Drops images above 'base' up to and including 'top', and sets the image
2583 * above 'top' to have base as its backing file.
2584 *
2585 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2586 * information in 'bs' can be properly updated.
2587 *
2588 * E.g., this will convert the following chain:
2589 * bottom <- base <- intermediate <- top <- active
2590 *
2591 * to
2592 *
2593 * bottom <- base <- active
2594 *
2595 * It is allowed for bottom==base, in which case it converts:
2596 *
2597 * base <- intermediate <- top <- active
2598 *
2599 * to
2600 *
2601 * base <- active
2602 *
2603 * Error conditions:
2604 * if active == top, that is considered an error
2605 *
2606 */
2607int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2608 BlockDriverState *base)
2609{
2610 BlockDriverState *intermediate;
2611 BlockDriverState *base_bs = NULL;
2612 BlockDriverState *new_top_bs = NULL;
2613 BlkIntermediateStates *intermediate_state, *next;
2614 int ret = -EIO;
2615
2616 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2617 QSIMPLEQ_INIT(&states_to_delete);
2618
2619 if (!top->drv || !base->drv) {
2620 goto exit;
2621 }
2622
2623 new_top_bs = bdrv_find_overlay(active, top);
2624
2625 if (new_top_bs == NULL) {
2626 /* we could not find the image above 'top', this is an error */
2627 goto exit;
2628 }
2629
2630 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2631 * to do, no intermediate images */
2632 if (new_top_bs->backing_hd == base) {
2633 ret = 0;
2634 goto exit;
2635 }
2636
2637 intermediate = top;
2638
2639 /* now we will go down through the list, and add each BDS we find
2640 * into our deletion queue, until we hit the 'base'
2641 */
2642 while (intermediate) {
2643 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2644 intermediate_state->bs = intermediate;
2645 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2646
2647 if (intermediate->backing_hd == base) {
2648 base_bs = intermediate->backing_hd;
2649 break;
2650 }
2651 intermediate = intermediate->backing_hd;
2652 }
2653 if (base_bs == NULL) {
2654 /* something went wrong, we did not end at the base. safely
2655 * unravel everything, and exit with error */
2656 goto exit;
2657 }
2658
2659 /* success - we can delete the intermediate states, and link top->base */
2660 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2661 base_bs->drv ? base_bs->drv->format_name : "");
2662 if (ret) {
2663 goto exit;
2664 }
920beae1 2665 bdrv_set_backing_hd(new_top_bs, base_bs);
6ebdcee2
JC
2666
2667 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2668 /* so that bdrv_close() does not recursively close the chain */
920beae1 2669 bdrv_set_backing_hd(intermediate_state->bs, NULL);
4f6fd349 2670 bdrv_unref(intermediate_state->bs);
6ebdcee2
JC
2671 }
2672 ret = 0;
2673
2674exit:
2675 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2676 g_free(intermediate_state);
2677 }
2678 return ret;
2679}
2680
2681
71d0770c
AL
2682static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2683 size_t size)
2684{
2685 int64_t len;
2686
1dd3a447
KW
2687 if (size > INT_MAX) {
2688 return -EIO;
2689 }
2690
71d0770c
AL
2691 if (!bdrv_is_inserted(bs))
2692 return -ENOMEDIUM;
2693
2694 if (bs->growable)
2695 return 0;
2696
2697 len = bdrv_getlength(bs);
2698
fbb7b4e0
KW
2699 if (offset < 0)
2700 return -EIO;
2701
2702 if ((offset > len) || (len - offset < size))
71d0770c
AL
2703 return -EIO;
2704
2705 return 0;
2706}
2707
2708static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2709 int nb_sectors)
2710{
54db38a4 2711 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
8f4754ed
KW
2712 return -EIO;
2713 }
2714
eb5a3165
JS
2715 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2716 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
2717}
2718
1c9805a3
SH
2719typedef struct RwCo {
2720 BlockDriverState *bs;
775aa8b6 2721 int64_t offset;
1c9805a3
SH
2722 QEMUIOVector *qiov;
2723 bool is_write;
2724 int ret;
4105eaaa 2725 BdrvRequestFlags flags;
1c9805a3
SH
2726} RwCo;
2727
2728static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2729{
1c9805a3 2730 RwCo *rwco = opaque;
ea2384d3 2731
1c9805a3 2732 if (!rwco->is_write) {
775aa8b6
KW
2733 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2734 rwco->qiov->size, rwco->qiov,
4105eaaa 2735 rwco->flags);
775aa8b6
KW
2736 } else {
2737 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2738 rwco->qiov->size, rwco->qiov,
2739 rwco->flags);
1c9805a3
SH
2740 }
2741}
e7a8a783 2742
1c9805a3 2743/*
8d3b1a2d 2744 * Process a vectored synchronous request using coroutines
1c9805a3 2745 */
775aa8b6
KW
2746static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2747 QEMUIOVector *qiov, bool is_write,
2748 BdrvRequestFlags flags)
1c9805a3 2749{
1c9805a3
SH
2750 Coroutine *co;
2751 RwCo rwco = {
2752 .bs = bs,
775aa8b6 2753 .offset = offset,
8d3b1a2d 2754 .qiov = qiov,
1c9805a3
SH
2755 .is_write = is_write,
2756 .ret = NOT_DONE,
4105eaaa 2757 .flags = flags,
1c9805a3 2758 };
e7a8a783 2759
498e386c
ZYW
2760 /**
2761 * In sync call context, when the vcpu is blocked, this throttling timer
2762 * will not fire; so the I/O throttling function has to be disabled here
2763 * if it has been enabled.
2764 */
2765 if (bs->io_limits_enabled) {
2766 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2767 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2768 bdrv_io_limits_disable(bs);
2769 }
2770
1c9805a3
SH
2771 if (qemu_in_coroutine()) {
2772 /* Fast-path if already in coroutine context */
2773 bdrv_rw_co_entry(&rwco);
2774 } else {
2572b37a
SH
2775 AioContext *aio_context = bdrv_get_aio_context(bs);
2776
1c9805a3
SH
2777 co = qemu_coroutine_create(bdrv_rw_co_entry);
2778 qemu_coroutine_enter(co, &rwco);
2779 while (rwco.ret == NOT_DONE) {
2572b37a 2780 aio_poll(aio_context, true);
1c9805a3
SH
2781 }
2782 }
2783 return rwco.ret;
2784}
b338082b 2785
8d3b1a2d
KW
2786/*
2787 * Process a synchronous request using coroutines
2788 */
2789static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2790 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2791{
2792 QEMUIOVector qiov;
2793 struct iovec iov = {
2794 .iov_base = (void *)buf,
2795 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2796 };
2797
da15ee51
KW
2798 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2799 return -EINVAL;
2800 }
2801
8d3b1a2d 2802 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2803 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2804 &qiov, is_write, flags);
8d3b1a2d
KW
2805}
2806
1c9805a3
SH
2807/* return < 0 if error. See bdrv_write() for the return codes */
2808int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2809 uint8_t *buf, int nb_sectors)
2810{
4105eaaa 2811 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
fc01f7e7
FB
2812}
2813
07d27a44
MA
2814/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2815int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2816 uint8_t *buf, int nb_sectors)
2817{
2818 bool enabled;
2819 int ret;
2820
2821 enabled = bs->io_limits_enabled;
2822 bs->io_limits_enabled = false;
4e7395e8 2823 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2824 bs->io_limits_enabled = enabled;
2825 return ret;
2826}
2827
5fafdf24 2828/* Return < 0 if error. Important errors are:
19cb3738
FB
2829 -EIO generic I/O error (may happen for all errors)
2830 -ENOMEDIUM No media inserted.
2831 -EINVAL Invalid sector number or nb_sectors
2832 -EACCES Trying to write a read-only device
2833*/
5fafdf24 2834int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2835 const uint8_t *buf, int nb_sectors)
2836{
4105eaaa 2837 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
83f64091
FB
2838}
2839
aa7bfbff
PL
2840int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2841 int nb_sectors, BdrvRequestFlags flags)
4105eaaa
PL
2842{
2843 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
aa7bfbff 2844 BDRV_REQ_ZERO_WRITE | flags);
8d3b1a2d
KW
2845}
2846
d75cbb5e
PL
2847/*
2848 * Completely zero out a block device with the help of bdrv_write_zeroes.
2849 * The operation is sped up by checking the block status and only writing
2850 * zeroes to the device if they currently do not return zeroes. Optional
2851 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2852 *
2853 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2854 */
2855int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2856{
9ce10c0b 2857 int64_t target_size;
d75cbb5e
PL
2858 int64_t ret, nb_sectors, sector_num = 0;
2859 int n;
2860
9ce10c0b
KW
2861 target_size = bdrv_getlength(bs);
2862 if (target_size < 0) {
2863 return target_size;
2864 }
2865 target_size /= BDRV_SECTOR_SIZE;
2866
d75cbb5e
PL
2867 for (;;) {
2868 nb_sectors = target_size - sector_num;
2869 if (nb_sectors <= 0) {
2870 return 0;
2871 }
2872 if (nb_sectors > INT_MAX) {
2873 nb_sectors = INT_MAX;
2874 }
2875 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
3d94ce60
PL
2876 if (ret < 0) {
2877 error_report("error getting block status at sector %" PRId64 ": %s",
2878 sector_num, strerror(-ret));
2879 return ret;
2880 }
d75cbb5e
PL
2881 if (ret & BDRV_BLOCK_ZERO) {
2882 sector_num += n;
2883 continue;
2884 }
2885 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2886 if (ret < 0) {
2887 error_report("error writing zeroes at sector %" PRId64 ": %s",
2888 sector_num, strerror(-ret));
2889 return ret;
2890 }
2891 sector_num += n;
2892 }
2893}
2894
a3ef6571 2895int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2896{
a3ef6571
KW
2897 QEMUIOVector qiov;
2898 struct iovec iov = {
2899 .iov_base = (void *)buf,
2900 .iov_len = bytes,
2901 };
9a8c4cce 2902 int ret;
83f64091 2903
a3ef6571
KW
2904 if (bytes < 0) {
2905 return -EINVAL;
83f64091
FB
2906 }
2907
a3ef6571
KW
2908 qemu_iovec_init_external(&qiov, &iov, 1);
2909 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2910 if (ret < 0) {
2911 return ret;
83f64091 2912 }
a3ef6571
KW
2913
2914 return bytes;
83f64091
FB
2915}
2916
8d3b1a2d 2917int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2918{
9a8c4cce 2919 int ret;
83f64091 2920
8407d5d7
KW
2921 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2922 if (ret < 0) {
2923 return ret;
83f64091
FB
2924 }
2925
8d3b1a2d
KW
2926 return qiov->size;
2927}
2928
2929int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2930 const void *buf, int bytes)
8d3b1a2d
KW
2931{
2932 QEMUIOVector qiov;
2933 struct iovec iov = {
2934 .iov_base = (void *) buf,
8407d5d7 2935 .iov_len = bytes,
8d3b1a2d
KW
2936 };
2937
8407d5d7
KW
2938 if (bytes < 0) {
2939 return -EINVAL;
2940 }
2941
8d3b1a2d
KW
2942 qemu_iovec_init_external(&qiov, &iov, 1);
2943 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2944}
83f64091 2945
f08145fe
KW
2946/*
2947 * Writes to the file and ensures that no writes are reordered across this
2948 * request (acts as a barrier)
2949 *
2950 * Returns 0 on success, -errno in error cases.
2951 */
2952int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2953 const void *buf, int count)
2954{
2955 int ret;
2956
2957 ret = bdrv_pwrite(bs, offset, buf, count);
2958 if (ret < 0) {
2959 return ret;
2960 }
2961
f05fa4ad
PB
2962 /* No flush needed for cache modes that already do it */
2963 if (bs->enable_write_cache) {
f08145fe
KW
2964 bdrv_flush(bs);
2965 }
2966
2967 return 0;
2968}
2969
/*
 * Copy-on-read fallback: read a whole cluster from the backing chain and
 * write it back into this image, then copy the requested part into @qiov.
 * Must be called with the request already serialised against overlapping
 * writes (see bdrv_aligned_preadv()).
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    /* Read the full cluster through the driver (falls through to the
     * backing file for unallocated parts). */
    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* Prefer an efficient zero write when the cluster reads back as all
     * zeroes and the driver supports it. */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Hand back only the originally requested span of the cluster. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
3035
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    /* Caller (bdrv_co_do_preadv) has already padded to sector alignment. */
    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        /* Only go through the bounce-buffer path if at least part of the
         * range is still unallocated in this image. */
        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        /* Round the in-image part up to the request alignment so the driver
         * still sees an aligned request. */
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            /* NOTE(review): these locals shadow the byte-based parameters;
             * here 'offset' is in sectors relative to the request start. */
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}
3115
/*
 * Handle a read request in coroutine context.
 *
 * Pads the request to the device's request alignment (read into bounce
 * buffers for the unaligned head/tail), applies I/O throttling and
 * copy-on-read, and tracks the request for serialisation.
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
3189
1b0288ae
KW
3190static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3191 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3192 BdrvRequestFlags flags)
3193{
3194 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3195 return -EINVAL;
3196 }
3197
3198 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3199 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3200}
3201
c5fbe571 3202int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3203 int nb_sectors, QEMUIOVector *qiov)
3204{
c5fbe571 3205 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3206
470c0504
SH
3207 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3208}
3209
3210int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3211 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3212{
3213 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3214
3215 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3216 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3217}
3218
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

/*
 * Write zeroes to [sector_num, sector_num + nb_sectors), splitting the
 * range into driver-aligned, size-limited chunks.  Tries the driver's
 * efficient bdrv_co_write_zeroes first and falls back to writing an
 * explicit zeroed bounce buffer when the driver returns -ENOTSUP.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    /* Stop on the first error; 'ret' carries it out of the loop. */
    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector. */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    qemu_vfree(iov.iov_base);
    return ret;
}
3293
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 * Also runs before-write notifiers, optional zero detection, writethrough
 * flushing, and dirty-bitmap/size bookkeeping.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* A serialising request must not have to wait here: it was marked
     * before any overlapping request could have been started. */
    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    /* Optional zero detection: turn an all-zero payload into a zero write
     * (and possibly an unmap) when the driver supports it. */
    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    /* Writethrough cache modes flush after every successful write. */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    /* Growable BDSes may have been extended by this write. */
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
3353
/*
 * Handle a write request in coroutine context.
 *
 * Unaligned requests are handled with a read-modify-write cycle: the
 * unaligned head and/or tail are read into bounce buffers, merged with
 * the caller's qiov, and written back as one aligned request.
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        /* Serialise so no concurrent write can touch the head sector
         * between our read and the final write. */
        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        /* If we already padded the head, we were serialising before any
         * overlapping request could start, so we cannot have waited here. */
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append only the part of the tail sector past the request's end. */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3475
6601553e
KW
3476static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3477 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3478 BdrvRequestFlags flags)
3479{
3480 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3481 return -EINVAL;
3482 }
3483
3484 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3485 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3486}
3487
c5fbe571
SH
3488int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3489 int nb_sectors, QEMUIOVector *qiov)
3490{
3491 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3492
f08f2dda
SH
3493 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3494}
3495
3496int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3497 int64_t sector_num, int nb_sectors,
3498 BdrvRequestFlags flags)
f08f2dda 3499{
94d6ff21 3500 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3501
d32f35cb
PL
3502 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3503 flags &= ~BDRV_REQ_MAY_UNMAP;
3504 }
3505
f08f2dda 3506 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3507 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3508}
3509
83f64091
FB
3510/**
3511 * Truncate file to 'offset' bytes (needed only for file protocols)
3512 */
3513int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3514{
3515 BlockDriver *drv = bs->drv;
51762288 3516 int ret;
83f64091 3517 if (!drv)
19cb3738 3518 return -ENOMEDIUM;
83f64091
FB
3519 if (!drv->bdrv_truncate)
3520 return -ENOTSUP;
59f2689d
NS
3521 if (bs->read_only)
3522 return -EACCES;
3718d8ab 3523 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) {
8591675f 3524 return -EBUSY;
3718d8ab 3525 }
51762288
SH
3526 ret = drv->bdrv_truncate(bs, offset);
3527 if (ret == 0) {
3528 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 3529 bdrv_dev_resize_cb(bs);
51762288
SH
3530 }
3531 return ret;
83f64091
FB
3532}
3533
4a1d5e1f
FZ
3534/**
3535 * Length of a allocated file in bytes. Sparse files are counted by actual
3536 * allocated space. Return < 0 if error or unknown.
3537 */
3538int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3539{
3540 BlockDriver *drv = bs->drv;
3541 if (!drv) {
3542 return -ENOMEDIUM;
3543 }
3544 if (drv->bdrv_get_allocated_file_size) {
3545 return drv->bdrv_get_allocated_file_size(bs);
3546 }
3547 if (bs->file) {
3548 return bdrv_get_allocated_file_size(bs->file);
3549 }
3550 return -ENOTSUP;
3551}
3552
83f64091
FB
3553/**
3554 * Length of a file in bytes. Return < 0 if error or unknown.
3555 */
3556int64_t bdrv_getlength(BlockDriverState *bs)
3557{
3558 BlockDriver *drv = bs->drv;
3559 if (!drv)
19cb3738 3560 return -ENOMEDIUM;
51762288 3561
b94a2610
KW
3562 if (drv->has_variable_length) {
3563 int ret = refresh_total_sectors(bs, bs->total_sectors);
3564 if (ret < 0) {
3565 return ret;
46a4e4e6 3566 }
83f64091 3567 }
46a4e4e6 3568 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3569}
3570
19cb3738 3571/* return 0 as number of sectors if no device present or error */
96b8f136 3572void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3573{
19cb3738
FB
3574 int64_t length;
3575 length = bdrv_getlength(bs);
3576 if (length < 0)
3577 length = 0;
3578 else
6ea44308 3579 length = length >> BDRV_SECTOR_BITS;
19cb3738 3580 *nb_sectors_ptr = length;
fc01f7e7 3581}
cf98951b 3582
ff06f5f3
PB
3583void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3584 BlockdevOnError on_write_error)
abd7f68d
MA
3585{
3586 bs->on_read_error = on_read_error;
3587 bs->on_write_error = on_write_error;
3588}
3589
1ceee0d5 3590BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3591{
3592 return is_read ? bs->on_read_error : bs->on_write_error;
3593}
3594
3e1caa5f
PB
3595BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3596{
3597 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3598
3599 switch (on_err) {
3600 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3601 return (error == ENOSPC) ?
3602 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3603 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3604 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3605 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3606 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3607 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3608 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3609 default:
3610 abort();
3611 }
3612}
3613
/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                       bool is_read, int error)
{
    assert(error >= 0);

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* First set the iostatus, so that "info block" returns an iostatus
         * that matches the events raised so far (an additional error iostatus
         * is fine, but not a lost one).
         */
        bdrv_iostatus_set_err(bs, error);

        /* Then raise the request to stop the VM and the event.
         * qemu_system_vmstop_request_prepare has two effects.  First,
         * it ensures that the STOP event always comes after the
         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
         * can observe the STOP event and do a "cont" before the STOP
         * event is issued, the VM will not stop.  In this case, vm_start()
         * also ensures that the STOP/RESUME pair of events is emitted.
         */
        qemu_system_vmstop_request_prepare();
        qapi_event_send_block_io_error(bdrv_get_device_name(bs),
                                       is_read ? IO_OPERATION_TYPE_READ :
                                       IO_OPERATION_TYPE_WRITE,
                                       action, &error_abort);
        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
    } else {
        /* No VM stop: just emit the BLOCK_IO_ERROR event. */
        qapi_event_send_block_io_error(bdrv_get_device_name(bs),
                                       is_read ? IO_OPERATION_TYPE_READ :
                                       IO_OPERATION_TYPE_WRITE,
                                       action, &error_abort);
    }
}
3651
b338082b
FB
3652int bdrv_is_read_only(BlockDriverState *bs)
3653{
3654 return bs->read_only;
3655}
3656
985a03b0
TS
3657int bdrv_is_sg(BlockDriverState *bs)
3658{
3659 return bs->sg;
3660}
3661
e900a7b7
CH
3662int bdrv_enable_write_cache(BlockDriverState *bs)
3663{
3664 return bs->enable_write_cache;
3665}
3666
425b0148
PB
3667void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3668{
3669 bs->enable_write_cache = wce;
55b110f2
JC
3670
3671 /* so a reopen() will preserve wce */
3672 if (wce) {
3673 bs->open_flags |= BDRV_O_CACHE_WB;
3674 } else {
3675 bs->open_flags &= ~BDRV_O_CACHE_WB;
3676 }
425b0148
PB
3677}
3678
ea2384d3
FB
3679int bdrv_is_encrypted(BlockDriverState *bs)
3680{
3681 if (bs->backing_hd && bs->backing_hd->encrypted)
3682 return 1;
3683 return bs->encrypted;
3684}
3685
c0f4ce77
AL
3686int bdrv_key_required(BlockDriverState *bs)
3687{
3688 BlockDriverState *backing_hd = bs->backing_hd;
3689
3690 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3691 return 1;
3692 return (bs->encrypted && !bs->valid_key);
3693}
3694
/*
 * Supply the encryption key for @bs, recursing into an encrypted backing
 * file first.  On the first successful key set, fires the deferred media
 * change callback that was skipped at open time.
 */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        /* Only the backing file was encrypted: done. */
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
3720
f8d6bba1 3721const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3722{
f8d6bba1 3723 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3724}
3725
5fafdf24 3726void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3727 void *opaque)
3728{
3729 BlockDriver *drv;
e855e4fb
JC
3730 int count = 0;
3731 const char **formats = NULL;
ea2384d3 3732
8a22f02a 3733 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3734 if (drv->format_name) {
3735 bool found = false;
3736 int i = count;
3737 while (formats && i && !found) {
3738 found = !strcmp(formats[--i], drv->format_name);
3739 }
3740
3741 if (!found) {
3742 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3743 formats[count++] = drv->format_name;
3744 it(opaque, drv->format_name);
3745 }
3746 }
ea2384d3 3747 }
e855e4fb 3748 g_free(formats);
ea2384d3
FB
3749}
3750
dc364f4c 3751/* This function is to find block backend bs */
b338082b
FB
3752BlockDriverState *bdrv_find(const char *name)
3753{
3754 BlockDriverState *bs;
3755
dc364f4c 3756 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1b7bdbc1 3757 if (!strcmp(name, bs->device_name)) {
b338082b 3758 return bs;
1b7bdbc1 3759 }
b338082b
FB
3760 }
3761 return NULL;
3762}
3763
dc364f4c
BC
3764/* This function is to find a node in the bs graph */
3765BlockDriverState *bdrv_find_node(const char *node_name)
3766{
3767 BlockDriverState *bs;
3768
3769 assert(node_name);
3770
3771 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3772 if (!strcmp(node_name, bs->node_name)) {
3773 return bs;
3774 }
3775 }
3776 return NULL;
3777}
3778
c13163fb
BC
3779/* Put this QMP function here so it can access the static graph_bdrv_states. */
3780BlockDeviceInfoList *bdrv_named_nodes_list(void)
3781{
3782 BlockDeviceInfoList *list, *entry;
3783 BlockDriverState *bs;
3784
3785 list = NULL;
3786 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3787 entry = g_malloc0(sizeof(*entry));
3788 entry->value = bdrv_block_device_info(bs);
3789 entry->next = list;
3790 list = entry;
3791 }
3792
3793 return list;
3794}
3795
12d3ba82
BC
3796BlockDriverState *bdrv_lookup_bs(const char *device,
3797 const char *node_name,
3798 Error **errp)
3799{
3800 BlockDriverState *bs = NULL;
3801
12d3ba82
BC
3802 if (device) {
3803 bs = bdrv_find(device);
3804
dd67fa50
BC
3805 if (bs) {
3806 return bs;
12d3ba82 3807 }
12d3ba82
BC
3808 }
3809
dd67fa50
BC
3810 if (node_name) {
3811 bs = bdrv_find_node(node_name);
12d3ba82 3812
dd67fa50
BC
3813 if (bs) {
3814 return bs;
3815 }
12d3ba82
BC
3816 }
3817
dd67fa50
BC
3818 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3819 device ? device : "",
3820 node_name ? node_name : "");
3821 return NULL;
12d3ba82
BC
3822}
3823
2f399b0a
MA
3824BlockDriverState *bdrv_next(BlockDriverState *bs)
3825{
3826 if (!bs) {
3827 return QTAILQ_FIRST(&bdrv_states);
3828 }
dc364f4c 3829 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3830}
3831
51de9760 3832void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
3833{
3834 BlockDriverState *bs;
3835
dc364f4c 3836 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
51de9760 3837 it(opaque, bs);
81d0912d
FB
3838 }
3839}
3840
ea2384d3
FB
3841const char *bdrv_get_device_name(BlockDriverState *bs)
3842{
3843 return bs->device_name;
3844}
3845
c8433287
MA
3846int bdrv_get_flags(BlockDriverState *bs)
3847{
3848 return bs->open_flags;
3849}
3850
f0f0fdfe 3851int bdrv_flush_all(void)
c6ca28d6
AL
3852{
3853 BlockDriverState *bs;
f0f0fdfe 3854 int result = 0;
c6ca28d6 3855
dc364f4c 3856 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
3857 AioContext *aio_context = bdrv_get_aio_context(bs);
3858 int ret;
3859
3860 aio_context_acquire(aio_context);
3861 ret = bdrv_flush(bs);
f0f0fdfe
KW
3862 if (ret < 0 && !result) {
3863 result = ret;
3864 }
ed78cda3 3865 aio_context_release(aio_context);
1b7bdbc1 3866 }
f0f0fdfe
KW
3867
3868 return result;
c6ca28d6
AL
3869}
3870
3ac21627
PL
3871int bdrv_has_zero_init_1(BlockDriverState *bs)
3872{
3873 return 1;
3874}
3875
f2feebbd
KW
3876int bdrv_has_zero_init(BlockDriverState *bs)
3877{
3878 assert(bs->drv);
3879
11212d8f
PB
3880 /* If BS is a copy on write image, it is initialized to
3881 the contents of the base image, which may not be zeroes. */
3882 if (bs->backing_hd) {
3883 return 0;
3884 }
336c1c12
KW
3885 if (bs->drv->bdrv_has_zero_init) {
3886 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3887 }
3888
3ac21627
PL
3889 /* safe default */
3890 return 0;
f2feebbd
KW
3891}
3892
4ce78691
PL
3893bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3894{
3895 BlockDriverInfo bdi;
3896
3897 if (bs->backing_hd) {
3898 return false;
3899 }
3900
3901 if (bdrv_get_info(bs, &bdi) == 0) {
3902 return bdi.unallocated_blocks_are_zero;
3903 }
3904
3905 return false;
3906}
3907
3908bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3909{
3910 BlockDriverInfo bdi;
3911
3912 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3913 return false;
3914 }
3915
3916 if (bdrv_get_info(bs, &bdi) == 0) {
3917 return bdi.can_write_zeroes_with_unmap;
3918 }
3919
3920 return false;
3921}
3922
/* Argument bundle for running bdrv_co_get_block_status() in a coroutine. */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;        /* image being queried */
    BlockDriverState *base;      /* presumably the backing-chain base for
                                  * "allocated above base" queries — not used
                                  * by the plain status path; confirm */
    int64_t sector_num;          /* first sector of the query */
    int nb_sectors;              /* maximum number of sectors to examine */
    int *pnum;                   /* out: sectors sharing the same state */
    int64_t ret;                 /* BDRV_BLOCK_* bits or negative errno */
    bool done;                   /* set by the coroutine when finished */
} BdrvCoGetBlockStatusData;
376ae3f1 3932
/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t length;
    int64_t n;
    int64_t ret, ret2;

    length = bdrv_getlength(bs);
    if (length < 0) {
        return length;
    }

    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
        *pnum = 0;
        return 0;
    }

    /* Clamp nb_sectors to the end of the image. */
    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        /* Driver cannot report holes: treat everything as allocated data. */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    /* BDRV_BLOCK_RAW: the answer lives in bs->file at the given offset. */
    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    }

    /* Unallocated and not reported zero: check whether the image or the
     * (shorter) backing file guarantees zeroes. */
    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t length2 = bdrv_getlength(bs2);
            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            ret |= (ret2 & BDRV_BLOCK_ZERO);
        }
    }

    return ret;
}
4023
b6b8a333
PB
4024/* Coroutine wrapper for bdrv_get_block_status() */
4025static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4026{
b6b8a333 4027 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4028 BlockDriverState *bs = data->bs;
4029
b6b8a333
PB
4030 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4031 data->pnum);
060f51c9
SH
4032 data->done = true;
4033}
4034
/*
 * Synchronous wrapper around bdrv_co_get_block_status().
 *
 * See bdrv_co_get_block_status() for details.
 */
int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
{
    Coroutine *co;
    BdrvCoGetBlockStatusData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_get_block_status_co_entry(&data);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Spawn the coroutine and drive the BDS's AioContext until it
         * signals completion via data.done. */
        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
        qemu_coroutine_enter(co, &data);
        while (!data.done) {
            aio_poll(aio_context, true);
        }
    }
    return data.ret;
}
4066
b6b8a333
PB
4067int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4068 int nb_sectors, int *pnum)
4069{
4333bb71
PB
4070 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4071 if (ret < 0) {
4072 return ret;
4073 }
e88ae226 4074 return (ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4075}
4076
188a7bbf
PB
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 *  the specified sector) that are known to be in the same
 *  allocated/unallocated state.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t sector_num,
                            int nb_sectors, int *pnum)
{
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    /* Walk the backing chain from top towards base, stopping early as soon
     * as one layer reports the sector allocated. */
    intermediate = top;
    while (intermediate && intermediate != base) {
        int pnum_inter;
        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
                                &pnum_inter);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         *
         * [sector_num+x, nr_sectors] allocated.
         */
        /* Shrink the reported run so it never extends past a shorter
         * intermediate image (which would invalidate the answer). */
        if (n > pnum_inter &&
            (intermediate == top ||
             sector_num + pnum_inter < intermediate->total_sectors)) {
            n = pnum_inter;
        }

        intermediate = intermediate->backing_hd;
    }

    *pnum = n;
    return 0;
}
4127
045df330
AL
4128const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4129{
4130 if (bs->backing_hd && bs->backing_hd->encrypted)
4131 return bs->backing_file;
4132 else if (bs->encrypted)
4133 return bs->filename;
4134 else
4135 return NULL;
4136}
4137
/* Copy the backing filename recorded for @bs into @filename, truncating
 * to @filename_size (pstrcpy always NUL-terminates). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
4143
5fafdf24 4144int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4145 const uint8_t *buf, int nb_sectors)
4146{
4147 BlockDriver *drv = bs->drv;
4148 if (!drv)
19cb3738 4149 return -ENOMEDIUM;
faea38e7
FB
4150 if (!drv->bdrv_write_compressed)
4151 return -ENOTSUP;
fbb7b4e0
KW
4152 if (bdrv_check_request(bs, sector_num, nb_sectors))
4153 return -EIO;
a55eb92c 4154
e4654d2d 4155 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4156
faea38e7
FB
4157 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4158}
3b46e624 4159
faea38e7
FB
4160int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4161{
4162 BlockDriver *drv = bs->drv;
4163 if (!drv)
19cb3738 4164 return -ENOMEDIUM;
faea38e7
FB
4165 if (!drv->bdrv_get_info)
4166 return -ENOTSUP;
4167 memset(bdi, 0, sizeof(*bdi));
4168 return drv->bdrv_get_info(bs, bdi);
4169}
4170
eae041fe
HR
4171ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4172{
4173 BlockDriver *drv = bs->drv;
4174 if (drv && drv->bdrv_get_specific_info) {
4175 return drv->bdrv_get_specific_info(bs);
4176 }
4177 return NULL;
4178}
4179
45566e9c
CH
4180int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4181 int64_t pos, int size)
cf8074b3
KW
4182{
4183 QEMUIOVector qiov;
4184 struct iovec iov = {
4185 .iov_base = (void *) buf,
4186 .iov_len = size,
4187 };
4188
4189 qemu_iovec_init_external(&qiov, &iov, 1);
4190 return bdrv_writev_vmstate(bs, &qiov, pos);
4191}
4192
4193int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4194{
4195 BlockDriver *drv = bs->drv;
cf8074b3
KW
4196
4197 if (!drv) {
178e08a5 4198 return -ENOMEDIUM;
cf8074b3
KW
4199 } else if (drv->bdrv_save_vmstate) {
4200 return drv->bdrv_save_vmstate(bs, qiov, pos);
4201 } else if (bs->file) {
4202 return bdrv_writev_vmstate(bs->file, qiov, pos);
4203 }
4204
7cdb1f6d 4205 return -ENOTSUP;
178e08a5
AL
4206}
4207
45566e9c
CH
4208int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4209 int64_t pos, int size)
178e08a5
AL
4210{
4211 BlockDriver *drv = bs->drv;
4212 if (!drv)
4213 return -ENOMEDIUM;
7cdb1f6d
MK
4214 if (drv->bdrv_load_vmstate)
4215 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4216 if (bs->file)
4217 return bdrv_load_vmstate(bs->file, buf, pos, size);
4218 return -ENOTSUP;
178e08a5
AL
4219}
4220
8b9b0cc2
KW
4221void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4222{
bf736fe3 4223 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4224 return;
4225 }
4226
bf736fe3 4227 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4228}
4229
4230int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4231 const char *tag)
4232{
4233 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4234 bs = bs->file;
4235 }
4236
4237 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4238 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4239 }
4240
4241 return -ENOTSUP;
4242}
4243
4cc70e93
FZ
4244int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4245{
4246 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4247 bs = bs->file;
4248 }
4249
4250 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4251 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4252 }
4253
4254 return -ENOTSUP;
4255}
4256
41c695c7
KW
4257int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4258{
938789ea 4259 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4260 bs = bs->file;
4261 }
8b9b0cc2 4262
41c695c7
KW
4263 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4264 return bs->drv->bdrv_debug_resume(bs, tag);
4265 }
4266
4267 return -ENOTSUP;
4268}
4269
4270bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4271{
4272 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4273 bs = bs->file;
4274 }
4275
4276 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4277 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4278 }
4279
4280 return false;
8b9b0cc2
KW
4281}
4282
199630b6
BS
4283int bdrv_is_snapshot(BlockDriverState *bs)
4284{
4285 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4286}
4287
b1b1d783
JC
/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full     = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp      = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    /* Walk down the chain looking for the node whose backing file matches
     * the requested name; return that node's backing_hd. */
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            /* NOTE(review): realpath failure (e.g. dangling path) silently
             * skips this level rather than reporting an error. */
            if (!realpath(filename_tmp, filename_full)) {
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
4353
f198fd1c
BC
4354int bdrv_get_backing_file_depth(BlockDriverState *bs)
4355{
4356 if (!bs->drv) {
4357 return 0;
4358 }
4359
4360 if (!bs->backing_hd) {
4361 return 0;
4362 }
4363
4364 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4365}
4366
79fac568
JC
4367BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4368{
4369 BlockDriverState *curr_bs = NULL;
4370
4371 if (!bs) {
4372 return NULL;
4373 }
4374
4375 curr_bs = bs;
4376
4377 while (curr_bs->backing_hd) {
4378 curr_bs = curr_bs->backing_hd;
4379 }
4380 return curr_bs;
4381}
4382
ea2384d3 4383/**************************************************************/
83f64091 4384/* async I/Os */
ea2384d3 4385
/* Asynchronous vectored read: emulated on top of the coroutine path. */
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}
4395
f141eafe
AL
/* Asynchronous vectored write: emulated on top of the coroutine path. */
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}
4405
d5ef94d4
PB
/* Asynchronous write-zeroes: a write with a NULL qiov and the
 * BDRV_REQ_ZERO_WRITE flag set. */
BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
4416
40b4f539
KW
4417
/* Shared completion state for one bdrv_aio_multiwrite() batch. */
typedef struct MultiwriteCB {
    int error;          /* first error seen, 0 if none */
    int num_requests;   /* merged requests still in flight */
    int num_callbacks;  /* original (pre-merge) request count */
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;  /* merged qiov to destroy, or NULL */
    } callbacks[];
} MultiwriteCB;
4428
4429static void multiwrite_user_cb(MultiwriteCB *mcb)
4430{
4431 int i;
4432
4433 for (i = 0; i < mcb->num_callbacks; i++) {
4434 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4435 if (mcb->callbacks[i].free_qiov) {
4436 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4437 }
7267c094 4438 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4439 }
4440}
4441
4442static void multiwrite_cb(void *opaque, int ret)
4443{
4444 MultiwriteCB *mcb = opaque;
4445
6d519a5f
SH
4446 trace_multiwrite_cb(mcb, ret);
4447
cb6d3ca0 4448 if (ret < 0 && !mcb->error) {
40b4f539 4449 mcb->error = ret;
40b4f539
KW
4450 }
4451
4452 mcb->num_requests--;
4453 if (mcb->num_requests == 0) {
de189a1b 4454 multiwrite_user_cb(mcb);
7267c094 4455 g_free(mcb);
40b4f539
KW
4456 }
4457}
4458
4459static int multiwrite_req_compare(const void *a, const void *b)
4460{
77be4366
CH
4461 const BlockRequest *req1 = a, *req2 = b;
4462
4463 /*
4464 * Note that we can't simply subtract req2->sector from req1->sector
4465 * here as that could overflow the return value.
4466 */
4467 if (req1->sector > req2->sector) {
4468 return 1;
4469 } else if (req1->sector < req2->sector) {
4470 return -1;
4471 } else {
4472 return 0;
4473 }
40b4f539
KW
4474}
4475
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Don't merge if the combined iovec would exceed IOV_MAX entries
        // (+1 leaves headroom for one extra element).
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We shouldn't need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // The merged qiov is owned by the batch; freed in
            // multiwrite_user_cb() once the batch completes.
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
4535
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    /* num_requests is set before submitting so a fast completion cannot
     * tear down mcb while later requests are still being issued. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
4593
/* Cancel an in-flight AIO request via its implementation's cancel hook;
 * may block until the request has safely completed or been aborted. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}
4598
4599/**************************************************************/
4600/* async block device emulation */
4601
c16b5a2c
CH
/* AIOCB for AIO emulated on top of the driver's synchronous read/write. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;          /* completion bottom half */
    int ret;             /* result of the synchronous driver call */
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;     /* linear bounce buffer for the vectored request */
    int is_write;
} BlockDriverAIOCBSync;
4611
/* Cancel for the sync-emulated AIOCB: the I/O already ran synchronously,
 * so cancelling only means dropping the pending completion BH. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
4625
/* Completion BH for sync-emulated AIO: copy read data back into the
 * caller's qiov, free the bounce buffer, and fire the user callback. */
static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
beac80cd 4638
f141eafe
AL
/* Emulate AIO on top of the driver's synchronous bdrv_read/bdrv_write:
 * run the I/O immediately through a bounce buffer, then deliver the
 * completion asynchronously from a bottom half. */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    /* NOTE(review): qemu_blockalign aborts rather than returning NULL on
     * allocation failure, so the result is not checked here. */
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
4667
f141eafe
AL
/* AIO read emulated via the synchronous driver interface. */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

/* AIO write emulated via the synchronous driver interface. */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 4681
68485420
KW
4682
/* AIOCB for AIO implemented on top of the coroutine request path. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;   /* request parameters and result (req.error) */
    bool is_write;
    bool *done;         /* set by the completion BH when cancelling */
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;
4690
/* Cancel for coroutine-based AIOCBs: there is no way to abort the
 * coroutine, so poll the AioContext until its completion BH runs. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        aio_poll(aio_context, true);
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
4708
/* Completion BH shared by all coroutine-based AIO emulation: deliver the
 * user callback, signal a pending cancel, and release the AIOCB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    /* Wake up bdrv_aio_co_cancel_em() if it is waiting on us. */
    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
4722
b2a61371
SH
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    /* Completion is always delivered through a BH, never directly from
     * coroutine context. */
    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
4740
68485420
KW
/* Common AIO front-end for reads, writes and write-zeroes: package the
 * request into an AIOCB and hand it to the bdrv_co_do_rw coroutine. */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4766
07f07615 4767static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4768{
07f07615
PB
4769 BlockDriverAIOCBCoroutine *acb = opaque;
4770 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4771
07f07615 4772 acb->req.error = bdrv_co_flush(bs);
2572b37a 4773 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
b2e12bc6 4774 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
4775}
4776
07f07615 4777BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
4778 BlockDriverCompletionFunc *cb, void *opaque)
4779{
07f07615 4780 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4781
07f07615
PB
4782 Coroutine *co;
4783 BlockDriverAIOCBCoroutine *acb;
016f5cf6 4784
d7331bed 4785 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
d318aea9
KW
4786 acb->done = NULL;
4787
07f07615
PB
4788 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4789 qemu_coroutine_enter(co, acb);
016f5cf6 4790
016f5cf6
AG
4791 return &acb->common;
4792}
4793
4265d620
PB
4794static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4795{
4796 BlockDriverAIOCBCoroutine *acb = opaque;
4797 BlockDriverState *bs = acb->common.bs;
4798
4799 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2572b37a 4800 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4265d620
PB
4801 qemu_bh_schedule(acb->bh);
4802}
4803
/* Asynchronous discard of @nb_sectors starting at @sector_num, built on
 * the bdrv_co_discard() coroutine path. */
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4822
ea2384d3
FB
/* Register all built-in block drivers (module init hook). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

/* Like bdrv_init(), but restrict format probing/opening to the
 * compile-time driver whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
4833
/* Allocate and initialise an AIOCB of the size declared by @aiocb_info.
 * Released with qemu_aio_release(). */
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}
4846
/* Free an AIOCB obtained from qemu_aio_get(); the size comes from the
 * AIOCBInfo stored in the AIOCB itself. */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
19cb3738 4852
f9f05dc5
KW
/**************************************************************/
/* Coroutine block device emulation */

/* Rendezvous between an AIO completion and the coroutine waiting on it. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

/* AIO callback: record the result and re-enter the waiting coroutine. */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
4868
/* Emulate a coroutine read/write on top of the driver's AIO interface:
 * submit the AIO request and yield until bdrv_co_io_em_complete() wakes
 * us with the result. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
4894
/* Coroutine read emulated via the driver's AIO interface. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

/* Coroutine write emulated via the driver's AIO interface. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
4908
/* Coroutine entry point for the synchronous bdrv_flush() wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
4915
/* Flush @bs: first the format layer's internal caches to the OS, then
 * (unless cache=unsafe) to the disk, and finally the underlying
 * protocol layer.  Returns 0 on success or a negative errno. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* Nothing to do without a medium, and a read-only image cannot have
     * dirty data to flush. */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Emulate the coroutine flush on top of the AIO flush hook. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
4978
/* Drop any cached metadata/data for @bs (used e.g. after incoming
 * migration) and refresh the total sector count.  Errors are reported
 * through @errp. */
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    /* Prefer the format driver's hook; otherwise recurse into the
     * protocol layer. */
    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}
5004
/* Invalidate caches of every registered BlockDriverState, taking each
 * BDS's AioContext around the call; stops at the first error. */
void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_invalidate_cache(bs, &local_err);
        aio_context_release(aio_context);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}
5022
07789269
BC
5023void bdrv_clear_incoming_migration_all(void)
5024{
5025 BlockDriverState *bs;
5026
dc364f4c 5027 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5028 AioContext *aio_context = bdrv_get_aio_context(bs);
5029
5030 aio_context_acquire(aio_context);
07789269 5031 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
ed78cda3 5032 aio_context_release(aio_context);
07789269
BC
5033 }
5034}
5035
07f07615
PB
5036int bdrv_flush(BlockDriverState *bs)
5037{
5038 Coroutine *co;
5039 RwCo rwco = {
5040 .bs = bs,
5041 .ret = NOT_DONE,
e7a8a783 5042 };
e7a8a783 5043
07f07615
PB
5044 if (qemu_in_coroutine()) {
5045 /* Fast-path if already in coroutine context */
5046 bdrv_flush_co_entry(&rwco);
5047 } else {
2572b37a
SH
5048 AioContext *aio_context = bdrv_get_aio_context(bs);
5049
07f07615
PB
5050 co = qemu_coroutine_create(bdrv_flush_co_entry);
5051 qemu_coroutine_enter(co, &rwco);
5052 while (rwco.ret == NOT_DONE) {
2572b37a 5053 aio_poll(aio_context, true);
07f07615 5054 }
e7a8a783 5055 }
07f07615
PB
5056
5057 return rwco.ret;
e7a8a783
KW
5058}
5059
/* Bounce structure for bdrv_discard(): carries the request parameters into
 * the coroutine and the completion status back out. */
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;   /* first sector of the range to discard */
    int nb_sectors;       /* length of the range in sectors */
    int ret;              /* result; NOT_DONE while still in flight */
} DiscardCo;
4265d620
PB
5066static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5067{
775aa8b6 5068 DiscardCo *rwco = opaque;
4265d620
PB
5069
5070 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5071}
5072
6f14da52
PL
5073/* if no limit is specified in the BlockLimits use a default
5074 * of 32768 512-byte sectors (16 MiB) per request.
5075 */
5076#define MAX_DISCARD_DEFAULT 32768
5077
4265d620
PB
5078int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5079 int nb_sectors)
5080{
d51e9fe5
PB
5081 int max_discard;
5082
4265d620
PB
5083 if (!bs->drv) {
5084 return -ENOMEDIUM;
5085 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5086 return -EIO;
5087 } else if (bs->read_only) {
5088 return -EROFS;
df702c9b
PB
5089 }
5090
e4654d2d 5091 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5092
9e8f1835
PB
5093 /* Do nothing if disabled. */
5094 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5095 return 0;
5096 }
5097
d51e9fe5
PB
5098 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5099 return 0;
5100 }
6f14da52 5101
d51e9fe5
PB
5102 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5103 while (nb_sectors > 0) {
5104 int ret;
5105 int num = nb_sectors;
6f14da52 5106
d51e9fe5
PB
5107 /* align request */
5108 if (bs->bl.discard_alignment &&
5109 num >= bs->bl.discard_alignment &&
5110 sector_num % bs->bl.discard_alignment) {
5111 if (num > bs->bl.discard_alignment) {
5112 num = bs->bl.discard_alignment;
6f14da52 5113 }
d51e9fe5
PB
5114 num -= sector_num % bs->bl.discard_alignment;
5115 }
6f14da52 5116
d51e9fe5
PB
5117 /* limit request size */
5118 if (num > max_discard) {
5119 num = max_discard;
5120 }
6f14da52 5121
d51e9fe5 5122 if (bs->drv->bdrv_co_discard) {
6f14da52 5123 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5
PB
5124 } else {
5125 BlockDriverAIOCB *acb;
5126 CoroutineIOCompletion co = {
5127 .coroutine = qemu_coroutine_self(),
5128 };
5129
5130 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5131 bdrv_co_io_em_complete, &co);
5132 if (acb == NULL) {
5133 return -EIO;
5134 } else {
5135 qemu_coroutine_yield();
5136 ret = co.ret;
6f14da52 5137 }
6f14da52 5138 }
7ce21016 5139 if (ret && ret != -ENOTSUP) {
d51e9fe5 5140 return ret;
4265d620 5141 }
d51e9fe5
PB
5142
5143 sector_num += num;
5144 nb_sectors -= num;
4265d620 5145 }
d51e9fe5 5146 return 0;
4265d620
PB
5147}
5148
5149int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5150{
5151 Coroutine *co;
775aa8b6 5152 DiscardCo rwco = {
4265d620
PB
5153 .bs = bs,
5154 .sector_num = sector_num,
5155 .nb_sectors = nb_sectors,
5156 .ret = NOT_DONE,
5157 };
5158
5159 if (qemu_in_coroutine()) {
5160 /* Fast-path if already in coroutine context */
5161 bdrv_discard_co_entry(&rwco);
5162 } else {
2572b37a
SH
5163 AioContext *aio_context = bdrv_get_aio_context(bs);
5164
4265d620
PB
5165 co = qemu_coroutine_create(bdrv_discard_co_entry);
5166 qemu_coroutine_enter(co, &rwco);
5167 while (rwco.ret == NOT_DONE) {
2572b37a 5168 aio_poll(aio_context, true);
4265d620
PB
5169 }
5170 }
5171
5172 return rwco.ret;
5173}
5174
19cb3738
FB
5175/**************************************************************/
5176/* removable device support */
5177
5178/**
5179 * Return TRUE if the media is present
5180 */
5181int bdrv_is_inserted(BlockDriverState *bs)
5182{
5183 BlockDriver *drv = bs->drv;
a1aff5bf 5184
19cb3738
FB
5185 if (!drv)
5186 return 0;
5187 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5188 return 1;
5189 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5190}
5191
5192/**
8e49ca46
MA
5193 * Return whether the media changed since the last call to this
5194 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5195 */
5196int bdrv_media_changed(BlockDriverState *bs)
5197{
5198 BlockDriver *drv = bs->drv;
19cb3738 5199
8e49ca46
MA
5200 if (drv && drv->bdrv_media_changed) {
5201 return drv->bdrv_media_changed(bs);
5202 }
5203 return -ENOTSUP;
19cb3738
FB
5204}
5205
/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 * Devices with a non-empty device name also emit a DEVICE_TRAY_MOVED
 * QMP event so management software can track tray state.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    /* Delegated to the driver; no-op if the hook is absent. */
    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
                                          eject_flag, &error_abort);
    }
}
5222
/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually). Delegated to the driver hook; a no-op when the
 * driver does not implement it.
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
985a03b0
TS
5237
5238/* needed for generic scsi interface */
5239
5240int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5241{
5242 BlockDriver *drv = bs->drv;
5243
5244 if (drv && drv->bdrv_ioctl)
5245 return drv->bdrv_ioctl(bs, req, buf);
5246 return -ENOTSUP;
5247}
7d780669 5248
221f715d
AL
5249BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5250 unsigned long int req, void *buf,
5251 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 5252{
221f715d 5253 BlockDriver *drv = bs->drv;
7d780669 5254
221f715d
AL
5255 if (drv && drv->bdrv_aio_ioctl)
5256 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5257 return NULL;
7d780669 5258}
e268ca52 5259
/* Record the guest device's block size so later buffer alignment decisions
 * for this BDS can take it into account. */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
7cd1e32a 5264
/* Allocate @size bytes aligned to the optimal memory alignment for @bs
 * (suitable for O_DIRECT style I/O). Caller frees with qemu_vfree(). */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
7cd1e32a 5269
c53b1c51
SH
5270/*
5271 * Check if all memory in this vector is sector aligned.
5272 */
5273bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5274{
5275 int i;
339064d5 5276 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5277
5278 for (i = 0; i < qiov->niov; i++) {
339064d5 5279 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5280 return false;
1ff735bd 5281 }
339064d5 5282 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5283 return false;
c53b1c51
SH
5284 }
5285 }
5286
5287 return true;
5288}
5289
/* Create and attach a dirty bitmap to @bs that tracks writes at
 * @granularity bytes (must be a power of two). Returns the new bitmap,
 * or NULL (with @errp and errno set) if the device length is unknown. */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    /* Convert byte granularity to sectors; must stay non-zero, i.e. the
     * caller's granularity must be at least one sector. */
    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_getlength(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap_size >>= BDRV_SECTOR_BITS;
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    /* hbitmap granularity is expressed as a power-of-two exponent. */
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
5312
5313void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5314{
5315 BdrvDirtyBitmap *bm, *next;
5316 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5317 if (bm == bitmap) {
5318 QLIST_REMOVE(bitmap, list);
5319 hbitmap_free(bitmap->bitmap);
5320 g_free(bitmap);
5321 return;
a55eb92c 5322 }
7cd1e32a
LS
5323 }
5324}
5325
21b56835
FZ
5326BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5327{
5328 BdrvDirtyBitmap *bm;
5329 BlockDirtyInfoList *list = NULL;
5330 BlockDirtyInfoList **plist = &list;
5331
5332 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5333 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5334 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5335 info->count = bdrv_get_dirty_count(bs, bm);
5336 info->granularity =
5337 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5338 entry->value = info;
5339 *plist = entry;
5340 plist = &entry->next;
5341 }
5342
5343 return list;
5344}
5345
e4654d2d 5346int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5347{
e4654d2d
FZ
5348 if (bitmap) {
5349 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5350 } else {
5351 return 0;
5352 }
5353}
5354
/* Initialize @hbi to iterate over @bitmap starting from sector 0. */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5360
/* Mark [cur_sector, cur_sector + nr_sectors) dirty in every bitmap
 * attached to @bs. */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}
5369
/* Clear [cur_sector, cur_sector + nr_sectors) in every bitmap attached
 * to @bs (e.g. after the range was discarded). */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}
aaa0eb75 5377
/* Return the number of dirty sectors recorded in @bitmap. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
f88e1a42 5382
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}
5388
/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}
5399
/* One entry on a per-operation blocker list: while present, the operation
 * is forbidden and @reason explains why (reported to the user). */
struct BdrvOpBlocker {
    Error *reason;                  /* owned by whoever called bdrv_op_block() */
    QLIST_ENTRY(BdrvOpBlocker) list;
};
5404
5405bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5406{
5407 BdrvOpBlocker *blocker;
5408 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5409 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5410 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5411 if (errp) {
5412 error_setg(errp, "Device '%s' is busy: %s",
5413 bs->device_name, error_get_pretty(blocker->reason));
5414 }
5415 return true;
5416 }
5417 return false;
5418}
5419
5420void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5421{
5422 BdrvOpBlocker *blocker;
5423 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5424
5425 blocker = g_malloc0(sizeof(BdrvOpBlocker));
5426 blocker->reason = reason;
5427 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5428}
5429
5430void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5431{
5432 BdrvOpBlocker *blocker, *next;
5433 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5434 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5435 if (blocker->reason == reason) {
5436 QLIST_REMOVE(blocker, list);
5437 g_free(blocker);
5438 }
5439 }
5440}
5441
/* Block every operation type on @bs with the same @reason. */
void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_block(bs, i, reason);
    }
}
5449
/* Remove the blockers installed for @reason from every operation type. */
void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_unblock(bs, i, reason);
    }
}
5457
/* Return true iff no operation type on @bs currently has any blocker. */
bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
{
    int i;

    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
            return false;
        }
    }
    return true;
}
5469
/* Enable I/O status tracking for @bs and reset the status to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5475
/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}
5485
/* Disable I/O status tracking for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5490
/* Reset the I/O status to OK (if tracking is enabled) and propagate the
 * reset to any block job running on @bs. */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}
5500
/* Record error @error in the device's I/O status. Only the first error is
 * kept (the status is sticky until reset); ENOSPC is distinguished so
 * management can react to out-of-space specifically. */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
5509
/* Begin accounting an I/O of @bytes bytes of kind @type: stash the size,
 * type and a start timestamp in @cookie for bdrv_acct_done(). */
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}
5520
/* Finish accounting the I/O started with bdrv_acct_start(): accumulate
 * byte count, operation count and elapsed time into @bs's statistics. */
void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
5530
d92ada22
LC
5531void bdrv_img_create(const char *filename, const char *fmt,
5532 const char *base_filename, const char *base_fmt,
f382d43a
MR
5533 char *options, uint64_t img_size, int flags,
5534 Error **errp, bool quiet)
f88e1a42 5535{
83d0521a
CL
5536 QemuOptsList *create_opts = NULL;
5537 QemuOpts *opts = NULL;
5538 const char *backing_fmt, *backing_file;
5539 int64_t size;
f88e1a42 5540 BlockDriver *drv, *proto_drv;
96df67d1 5541 BlockDriver *backing_drv = NULL;
cc84d90f 5542 Error *local_err = NULL;
f88e1a42
JS
5543 int ret = 0;
5544
5545 /* Find driver and parse its options */
5546 drv = bdrv_find_format(fmt);
5547 if (!drv) {
71c79813 5548 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5549 return;
f88e1a42
JS
5550 }
5551
98289620 5552 proto_drv = bdrv_find_protocol(filename, true);
f88e1a42 5553 if (!proto_drv) {
71c79813 5554 error_setg(errp, "Unknown protocol '%s'", filename);
d92ada22 5555 return;
f88e1a42
JS
5556 }
5557
c282e1fd
CL
5558 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5559 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
f88e1a42
JS
5560
5561 /* Create parameter list with default values */
83d0521a
CL
5562 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5563 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
f88e1a42
JS
5564
5565 /* Parse -o options */
5566 if (options) {
83d0521a
CL
5567 if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5568 error_setg(errp, "Invalid options for file format '%s'", fmt);
f88e1a42
JS
5569 goto out;
5570 }
5571 }
5572
5573 if (base_filename) {
83d0521a 5574 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
71c79813
LC
5575 error_setg(errp, "Backing file not supported for file format '%s'",
5576 fmt);
f88e1a42
JS
5577 goto out;
5578 }
5579 }
5580
5581 if (base_fmt) {
83d0521a 5582 if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
71c79813
LC
5583 error_setg(errp, "Backing file format not supported for file "
5584 "format '%s'", fmt);
f88e1a42
JS
5585 goto out;
5586 }
5587 }
5588
83d0521a
CL
5589 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5590 if (backing_file) {
5591 if (!strcmp(filename, backing_file)) {
71c79813
LC
5592 error_setg(errp, "Error: Trying to create an image with the "
5593 "same filename as the backing file");
792da93a
JS
5594 goto out;
5595 }
5596 }
5597
83d0521a
CL
5598 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5599 if (backing_fmt) {
5600 backing_drv = bdrv_find_format(backing_fmt);
96df67d1 5601 if (!backing_drv) {
71c79813 5602 error_setg(errp, "Unknown backing file format '%s'",
83d0521a 5603 backing_fmt);
f88e1a42
JS
5604 goto out;
5605 }
5606 }
5607
5608 // The size for the image must always be specified, with one exception:
5609 // If we are using a backing file, we can obtain the size from there
83d0521a
CL
5610 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5611 if (size == -1) {
5612 if (backing_file) {
66f6b814 5613 BlockDriverState *bs;
f88e1a42 5614 uint64_t size;
63090dac
PB
5615 int back_flags;
5616
5617 /* backing files always opened read-only */
5618 back_flags =
5619 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5620
f67503e5 5621 bs = NULL;
83d0521a 5622 ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
cc84d90f 5623 backing_drv, &local_err);
f88e1a42 5624 if (ret < 0) {
cc84d90f 5625 error_setg_errno(errp, -ret, "Could not open '%s': %s",
83d0521a 5626 backing_file,
cc84d90f
HR
5627 error_get_pretty(local_err));
5628 error_free(local_err);
5629 local_err = NULL;
f88e1a42
JS
5630 goto out;
5631 }
5632 bdrv_get_geometry(bs, &size);
5633 size *= 512;
5634
83d0521a 5635 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
66f6b814
HR
5636
5637 bdrv_unref(bs);
f88e1a42 5638 } else {
71c79813 5639 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5640 goto out;
5641 }
5642 }
5643
f382d43a
MR
5644 if (!quiet) {
5645 printf("Formatting '%s', fmt=%s ", filename, fmt);
83d0521a 5646 qemu_opts_print(opts);
f382d43a
MR
5647 puts("");
5648 }
83d0521a 5649
c282e1fd 5650 ret = bdrv_create(drv, filename, opts, &local_err);
83d0521a 5651
cc84d90f
HR
5652 if (ret == -EFBIG) {
5653 /* This is generally a better message than whatever the driver would
5654 * deliver (especially because of the cluster_size_hint), since that
5655 * is most probably not much different from "image too large". */
5656 const char *cluster_size_hint = "";
83d0521a 5657 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
cc84d90f 5658 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5659 }
cc84d90f
HR
5660 error_setg(errp, "The image size is too large for file format '%s'"
5661 "%s", fmt, cluster_size_hint);
5662 error_free(local_err);
5663 local_err = NULL;
f88e1a42
JS
5664 }
5665
5666out:
83d0521a
CL
5667 qemu_opts_del(opts);
5668 qemu_opts_free(create_opts);
84d18f06 5669 if (local_err) {
cc84d90f
HR
5670 error_propagate(errp, local_err);
5671 }
f88e1a42 5672}
/* Return the AioContext this BDS is currently attached to. */
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}
5678
/* Detach @bs (and, recursively, its protocol and backing files) from its
 * current AioContext. Throttling timers and the driver hook are detached
 * before the context pointer is cleared; the order here is the mirror of
 * bdrv_attach_aio_context(). */
void bdrv_detach_aio_context(BlockDriverState *bs)
{
    if (!bs->drv) {
        return;
    }

    /* Throttle timers live in the old context; detach them first. */
    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}
5700
/* Attach @bs (and, recursively, its backing and protocol files) to
 * @new_context. Children are attached before the driver hook and the
 * throttling timers — the exact reverse of bdrv_detach_aio_context(). */
void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    /* Re-arm throttle timers in the new context last. */
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }
}
5723
/* Move @bs to @new_context after draining all in-flight requests. */
void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
/* Register @notifier to be invoked before every write to @bs (used e.g.
 * for copy-on-write style backup). */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
6f176b48 5743
c282e1fd 5744int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
6f176b48 5745{
c282e1fd 5746 if (!bs->drv->bdrv_amend_options) {
6f176b48
HR
5747 return -ENOTSUP;
5748 }
c282e1fd 5749 return bs->drv->bdrv_amend_options(bs, opts);
6f176b48 5750}
f6186f49 5751
b5042a36
BC
5752/* This function will be called by the bdrv_recurse_is_first_non_filter method
5753 * of block filter and by bdrv_is_first_non_filter.
5754 * It is used to test if the given bs is the candidate or recurse more in the
5755 * node graph.
212a5a8f 5756 */
b5042a36 5757bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 5758 BlockDriverState *candidate)
f6186f49 5759{
b5042a36
BC
5760 /* return false if basic checks fails */
5761 if (!bs || !bs->drv) {
212a5a8f 5762 return false;
f6186f49
BC
5763 }
5764
b5042a36
BC
5765 /* the code reached a non block filter driver -> check if the bs is
5766 * the same as the candidate. It's the recursion termination condition.
5767 */
5768 if (!bs->drv->is_filter) {
5769 return bs == candidate;
212a5a8f 5770 }
b5042a36 5771 /* Down this path the driver is a block filter driver */
212a5a8f 5772
b5042a36
BC
5773 /* If the block filter recursion method is defined use it to recurse down
5774 * the node graph.
5775 */
5776 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 5777 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5778 }
5779
b5042a36
BC
5780 /* the driver is a block filter but don't allow to recurse -> return false
5781 */
5782 return false;
f6186f49
BC
5783}
5784
212a5a8f
BC
5785/* This function checks if the candidate is the first non filter bs down it's
5786 * bs chain. Since we don't have pointers to parents it explore all bs chains
5787 * from the top. Some filters can choose not to pass down the recursion.
5788 */
5789bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5790{
212a5a8f
BC
5791 BlockDriverState *bs;
5792
5793 /* walk down the bs forest recursively */
5794 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5795 bool perm;
5796
b5042a36 5797 /* try to recurse in this top level bs */
e6dc8a1f 5798 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5799
5800 /* candidate is the first non filter */
5801 if (perm) {
5802 return true;
5803 }
5804 }
5805
5806 return false;
f6186f49 5807}